Update app.py
app.py (CHANGED)
@@ -21,6 +21,71 @@ EMOTION_LABELS = {
     'LABEL_2': 'Neutral'
 }

+def chunk_text(text, max_length=512):
+    """Split text into chunks of maximum token length."""
+    tokens = bert_tokenizer.encode(text, add_special_tokens=False)
+    chunks = []
+
+    for i in range(0, len(tokens), max_length - 2):  # -2 to account for [CLS] and [SEP] tokens
+        chunk = tokens[i:i + max_length - 2]
+        # Add special tokens
+        chunk = [bert_tokenizer.cls_token_id] + chunk + [bert_tokenizer.sep_token_id]
+        chunks.append(chunk)
+
+    return chunks
+
+def get_embedding_for_text(text):
+    """Get embedding for a single text."""
+    chunks = chunk_text(text)
+    chunk_embeddings = []
+
+    for chunk in chunks:
+        # Convert to tensor and add batch dimension
+        input_ids = torch.tensor([chunk]).to(bert_model.device)
+        attention_mask = torch.ones_like(input_ids)
+
+        with torch.no_grad():
+            outputs = bert_model(input_ids, attention_mask=attention_mask)
+
+        # Get [CLS] token embedding for this chunk
+        chunk_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+        chunk_embeddings.append(chunk_embedding[0])
+
+    # Average embeddings from all chunks
+    if chunk_embeddings:
+        return np.mean(chunk_embeddings, axis=0)
+    return np.zeros(bert_model.config.hidden_size)  # fallback
+
+def generate_embeddings(texts):
+    """Generate embeddings for a list of texts."""
+    embeddings = []
+
+    for text in texts:
+        try:
+            embedding = get_embedding_for_text(text)
+            embeddings.append(embedding)
+        except Exception as e:
+            st.warning(f"Error processing text: {str(e)}")
+            # Add zero embedding as fallback
+            embeddings.append(np.zeros(bert_model.config.hidden_size))
+
+    return np.array(embeddings)
+
+def classify_emotion(text):
+    """Classify emotion for a single text."""
+    try:
+        chunks = chunk_text(text)
+        if not chunks:
+            return "unknown"
+
+        # Use first chunk for classification (decoded without special tokens)
+        chunk_str = bert_tokenizer.decode(chunks[0], skip_special_tokens=True)
+        result = emotion_classifier(chunk_str)[0]
+        return result['label']
+    except Exception as e:
+        st.warning(f"Error in emotion classification: {str(e)}")
+        return "unknown"
+
 def format_topics(topic_model, topic_counts):
     """Convert topic numbers to readable labels."""
     formatted_topics = []
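Taken together, the helpers added above turn each document into a single vector by averaging the [CLS] embeddings of its 512-token chunks. Below is a minimal, illustrative sketch (not part of the commit) of how such precomputed embeddings are typically handed to BERTopic so it skips its own embedding step; `poems` is a hypothetical list of poem strings, and the BERTopic parameters here are placeholders for the ones configured further down in app.py.

# Illustrative sketch only, not from this commit.
# Assumes `poems` is a list of poem strings and that bert_model,
# bert_tokenizer, and generate_embeddings() from the hunk above are in scope.
from bertopic import BERTopic

embeddings = generate_embeddings(poems)      # shape: (n_docs, hidden_size)
topic_model = BERTopic(verbose=False)        # placeholder parameters
topics, probs = topic_model.fit_transform(poems, embeddings=embeddings)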
@@ -50,10 +115,26 @@ def format_emotions(emotion_counts):
         })
     return formatted_emotions

-# [Previous functions remain the same until process_and_summarize]
-
 def process_and_summarize(uploaded_file, top_n=50):
-    #
+    # Determine the file type
+    if uploaded_file.name.endswith(".csv"):
+        df = pd.read_csv(uploaded_file)
+    elif uploaded_file.name.endswith(".xlsx"):
+        df = pd.read_excel(uploaded_file)
+    else:
+        st.error("Unsupported file format.")
+        return None, None
+
+    # Validate required columns
+    required_columns = ['country', 'poem']
+    missing_columns = [col for col in required_columns if col not in df.columns]
+    if missing_columns:
+        st.error(f"Missing columns: {', '.join(missing_columns)}")
+        return None, None
+
+    # Parse and preprocess the file
+    df['country'] = df['country'].str.strip()
+    df = df.dropna(subset=['country', 'poem'])

     # Initialize BERTopic with specific parameters
     topic_model = BERTopic(
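The new validation in process_and_summarize expects an uploaded .csv or .xlsx file with at least a `country` and a `poem` column. A small, made-up example of a file that would pass the checks above:

# Hypothetical sample input; the file name and rows are invented for illustration.
import pandas as pd

sample = pd.DataFrame({
    "country": ["Lebanon ", "Egypt"],                         # stray whitespace is stripped by the app
    "poem": ["First poem text ...", "Second poem text ..."],
})
sample.to_csv("poems_sample.csv", index=False)                # upload this file in the app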
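For context, a hedged sketch of how a Streamlit front end might feed an upload into the updated function; the widget label is invented, and the two return values are inferred only from the `return None, None` error paths in this commit, not from code shown here.

# Illustrative wiring only, not from this commit.
import streamlit as st

uploaded_file = st.file_uploader("Upload poems (.csv or .xlsx)", type=["csv", "xlsx"])
if uploaded_file is not None:
    summaries, topic_model = process_and_summarize(uploaded_file, top_n=50)
    if summaries is not None:
        st.write(summaries)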