Spaces:

kambris
/

SoLProject

Runtime error

App Files Files Community

kambris committed on Nov 24, 2024

Commit

b88eade

verified ·

1 Parent(s): 89175c6

Update app.py

Browse files

Files changed (1) hide show

app.py +131 -93

app.py CHANGED Viewed

@@ -17,96 +17,113 @@ st.set_page_config(
 @st.cache_resource
 def load_models():
     """Load and cache the models to prevent reloading"""
-    bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
     bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
     emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
-    emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
-    return bert_tokenizer, bert_model, emotion_classifier
-# Load models
-try:
-    bert_tokenizer, bert_model, emotion_classifier = load_models()
-    st.success("Models loaded successfully!")
-except Exception as e:
-    st.error(f"Error loading models: {str(e)}")
-    st.stop()
-# Define emotion labels mapping
-EMOTION_LABELS = {
-    'LABEL_0': 'Negative',
-    'LABEL_1': 'Positive',
-    'LABEL_2': 'Neutral'
-}
-def chunk_long_text(text, tokenizer, max_length=512):
-    """Split text into chunks respecting token limit."""
-    tokens = tokenizer.encode(text, add_special_tokens=False)
     chunks = []
-    text_chunks = []
-    for i in range(0, len(tokens), max_length-2):
-        chunk = tokens[i:i + max_length-2]
-        full_chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]
-        chunks.append(full_chunk)
-        text_chunks.append(tokenizer.decode(chunk))
-    return chunks, text_chunks
-def get_embedding_for_text(text, tokenizer, model):
-    """Get embedding for a text, handling long sequences."""
-    _, text_chunks = chunk_long_text(text, tokenizer)
-    chunk_embeddings = []
-    for chunk in text_chunks:
-        inputs = tokenizer(chunk,
-                          return_tensors="pt",
-                          padding=True,
-                          truncation=True,
-                          max_length=512)
-        inputs = {k: v.to(model.device) for k, v in inputs.items()}
-        with torch.no_grad():
-            outputs = model(**inputs)
-        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
-        chunk_embeddings.append(embedding[0])
-    if chunk_embeddings:
-        return np.mean(chunk_embeddings, axis=0)
-    return np.zeros(model.config.hidden_size)
-def generate_embeddings(texts, tokenizer, model):
-    """Generate embeddings for a list of texts."""
-    embeddings = []
-    for text in texts:
-        try:
-            embedding = get_embedding_for_text(text, tokenizer, model)
-            embeddings.append(embedding)
-        except Exception as e:
-            st.warning(f"Error processing text: {str(e)}")
-            embeddings.append(np.zeros(model.config.hidden_size))
-    return np.array(embeddings)
-def classify_emotion(text, tokenizer, classifier):
-    """Classify emotion for a text using majority voting."""
     try:
-        _, text_chunks = chunk_long_text(text, tokenizer)
-        chunk_emotions = []
-        for chunk in text_chunks:
-            result = classifier(chunk, max_length=512, truncation=True)[0]
-            chunk_emotions.append(result['label'])
-        if chunk_emotions:
-            final_emotion = Counter(chunk_emotions).most_common(1)[0][0]
             return final_emotion
-        return "unknown"
     except Exception as e:
         st.warning(f"Error in emotion classification: {str(e)}")
-        return "unknown"
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
@@ -116,7 +133,7 @@ def format_topics(topic_model, topic_counts):
             topic_label = "Miscellaneous"
         else:
             words = topic_model.get_topic(topic_num)
-            topic_label = " | ".join([word for word, _ in words[:3]])
         formatted_topics.append({
             'topic': topic_label,
@@ -126,6 +143,13 @@ def format_topics(topic_model, topic_counts):
 def format_emotions(emotion_counts):
     """Format emotions for display."""
     formatted_emotions = []
     for label, count in emotion_counts:
         emotion = EMOTION_LABELS.get(label, label)
@@ -139,11 +163,13 @@ def process_and_summarize(df, top_n=50):
     """Process the data and generate summaries."""
     summaries = []
-    # Initialize BERTopic
     topic_model = BERTopic(
-        language="arabic",
         calculate_probabilities=True,
-        min_topic_size=5,
         verbose=True
     )
@@ -153,20 +179,24 @@ def process_and_summarize(df, top_n=50):
         progress_bar = st.progress(0, text=progress_text)
         texts = group['poem'].dropna().tolist()
-        batch_size = 10
         all_emotions = []
-        # Generate embeddings
-        embeddings = generate_embeddings(texts, bert_tokenizer, bert_model)
-        progress_bar.progress(0.33, text="Generating embeddings...")
-        # Process emotions
-        for i in range(0, len(texts), batch_size):
-            batch_texts = texts[i:i + batch_size]
-            batch_emotions = [classify_emotion(text, bert_tokenizer, emotion_classifier)
-                            for text in batch_texts]
-            all_emotions.extend(batch_emotions)
-        progress_bar.progress(0.66, text="Classifying emotions...")
         try:
             # Fit topic model
@@ -183,12 +213,21 @@ def process_and_summarize(df, top_n=50):
                 'top_emotions': top_emotions
             })
             progress_bar.progress(1.0, text="Processing complete!")
         except Exception as e:
             st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
     return summaries, topic_model
 # Main app interface
 st.title("📚 Arabic Poem Analysis")
 st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
@@ -251,7 +290,7 @@ if uploaded_file is not None:
                                 topic_name = "Miscellaneous"
                             else:
                                 words = topic_model.get_topic(row['Topic'])
-                                topic_name = " | ".join([word for word, _ in words[:3]])
                             st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
     except Exception as e:
@@ -265,5 +304,4 @@ else:
         'country': ['Egypt', 'Saudi Arabia'],
         'poem': ['قصيدة مصرية', 'قصيدة سعودية']
     })
-    st.dataframe(example_df)

 @st.cache_resource
 def load_models():
     """Load the embedding tokenizer/model and the sentiment pipeline once.

     ``st.cache_resource`` keeps the returned objects alive across Streamlit
     reruns so the checkpoints are not reloaded on every interaction.

     Returns:
         A ``(tokenizer, bert_model, emotion_classifier)`` tuple where
         ``tokenizer`` and ``bert_model`` both come from the
         ``aubmindlab/bert-base-arabertv2`` checkpoint and
         ``emotion_classifier`` is a "sentiment-analysis" pipeline built on
         ``CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment`` with its own
         tokenizer.
     """
     # The returned tokenizer is the one used together with bert_model to
     # build embeddings, so it must come from the same checkpoint: token ids
     # produced by another model's vocabulary would index unrelated rows of
     # AraBERT's embedding table.
     tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
     bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")

     # The sentiment model keeps its own matching tokenizer inside the pipeline.
     emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
     emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
     emotion_classifier = pipeline(
         "sentiment-analysis",
         model=emotion_model,
         tokenizer=emotion_tokenizer,
         # Callers index the result as result[0] -> list of per-label dicts,
         # which is the shape this flag produces; keep it in sync with them.
         return_all_scores=True
     )
     return tokenizer, bert_model, emotion_classifier
def split_text(text, max_length=512):
    """Split text into chunks of at most ``max_length`` whitespace-separated words.

    The limit is counted in words, not in model tokens. A subword tokenizer
    normally emits more tokens than there are words, so callers that pass the
    chunks to a model with a fixed context window must still enable
    truncation themselves.

    Args:
        text: The string to split. Any run of whitespace (spaces, tabs,
            newlines) is treated as one separator and is not preserved.
        max_length: Maximum number of words per chunk. Values below 1 are
            treated as 1 so that every word still lands in a chunk.

    Returns:
        A list of chunks in original order, each made of words joined by a
        single space. Empty or whitespace-only text yields an empty list.
    """
    words = text.split()
    step = max(1, max_length)
    return [' '.join(words[start:start + step]) for start in range(0, len(words), step)]
def classify_emotion(text, classifier):
    """Classify a text by averaging per-label scores over its chunks.

    The text is split with ``split_text``, each chunk is scored by
    ``classifier``, and the label with the highest mean score (every chunk
    weighted equally) is returned.

    Args:
        text: The text to classify.
        classifier: Callable that, for one string, returns a list whose first
            element is a list of ``{'label': ..., 'score': ...}`` dicts. It
            must accept ``truncation`` and ``max_length`` keyword arguments.

    Returns:
        The winning label string, or ``"LABEL_2"`` when the text produces no
        chunks or when any error occurs (the error is shown via
        ``st.warning``).
    """
    # NOTE(review): the "LABEL_2" fallback is only meaningful if the model
    # emits LABEL_n style ids; confirm against the checkpoint's id2label.
    try:
        label_totals = {}
        chunk_count = 0
        for chunk in split_text(text):
            # Chunks are bounded by word count, which can still exceed the
            # model's token window, so let the tokenizer truncate instead of
            # failing inside the model.
            scores = classifier(chunk, truncation=True, max_length=512)[0]
            chunk_count += 1
            for entry in scores:
                label = entry['label']
                label_totals[label] = label_totals.get(label, 0) + entry['score']
        if not chunk_count:
            return "LABEL_2"  # Default to neutral if no valid results
        # Every total would be divided by the same chunk_count, so the label
        # with the highest total is also the one with the highest average.
        return max(label_totals, key=label_totals.get)
    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"  # Default to neutral
def get_embedding_for_text(text, tokenizer, model):
    """Return one embedding for a text, averaged over its chunks.

    The text is split with ``split_text``. Each chunk is tokenized (truncated
    to 512 tokens), run through ``model`` without gradient tracking, and
    represented by the hidden state at sequence position 0 of
    ``last_hidden_state``. Chunk vectors are combined with a weighted mean
    whose weights are the chunks' word counts.

    Args:
        text: The text to embed.
        tokenizer: Tokenizer callable returning PyTorch tensors; it should
            belong to the same checkpoint as ``model``.
        model: Model exposing ``device``, ``config.hidden_size`` and an
            output with ``last_hidden_state``.

    Returns:
        A 1-D NumPy array. If no chunk could be embedded (empty text or every
        chunk failed) a zero vector of length ``model.config.hidden_size`` is
        returned. Per-chunk failures are reported via ``st.warning`` and the
        chunk is skipped.
    """
    chunk_embeddings = []
    chunk_weights = []
    for chunk in split_text(text):
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            # Record the weight together with the embedding so the two lists
            # stay aligned even when some chunks fail and are skipped.
            chunk_embeddings.append(embedding[0])
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue
    if chunk_embeddings:
        # np.average normalizes the weights itself, so raw word counts suffice.
        return np.average(chunk_embeddings, axis=0, weights=chunk_weights)
    return np.zeros(model.config.hidden_size)
 def format_topics(topic_model, topic_counts):
     """Format topics for display."""
             topic_label = "Miscellaneous"
         else:
             words = topic_model.get_topic(topic_num)
+            topic_label = " | ".join([word for word, _ in words[:5]])  # Show top 5 words per topic
         formatted_topics.append({
             'topic': topic_label,
 def format_emotions(emotion_counts):
     """Format emotions for display."""
+    # Define emotion labels mapping
+    EMOTION_LABELS = {
+        'LABEL_0': 'Negative',
+        'LABEL_1': 'Positive',
+        'LABEL_2': 'Neutral'
+    }
     formatted_emotions = []
     for label, count in emotion_counts:
         emotion = EMOTION_LABELS.get(label, label)
     """Process the data and generate summaries."""
     summaries = []
+    # Initialize BERTopic with Arabic-specific settings
     topic_model = BERTopic(
+        language="multilingual",
         calculate_probabilities=True,
+        min_topic_size=2,  # Allow smaller topic groups
+        n_gram_range=(1, 3),  # Include up to trigrams
+        top_n_words=15,  # Show more words per topic
         verbose=True
     )
         progress_bar = st.progress(0, text=progress_text)
         texts = group['poem'].dropna().tolist()
         all_emotions = []
+        # Generate embeddings with progress tracking
+        embeddings = []
+        for i, text in enumerate(texts):
+            embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
+            embeddings.append(embedding)
+            progress = (i + 1) / len(texts) * 0.4
+            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
+        embeddings = np.array(embeddings)
+        # Process emotions with progress tracking
+        for i, text in enumerate(texts):
+            emotion = classify_emotion(text, emotion_classifier)
+            all_emotions.append(emotion)
+            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
+            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
         try:
             # Fit topic model
                 'top_emotions': top_emotions
             })
             progress_bar.progress(1.0, text="Processing complete!")
         except Exception as e:
             st.warning(f"Could not generate topics for {country}: {str(e)}")
             continue
     return summaries, topic_model
+# Load models
+try:
+    bert_tokenizer, bert_model, emotion_classifier = load_models()
+    st.success("Models loaded successfully!")
+except Exception as e:
+    st.error(f"Error loading models: {str(e)}")
+    st.stop()
 # Main app interface
 st.title("📚 Arabic Poem Analysis")
 st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
                                 topic_name = "Miscellaneous"
                             else:
                                 words = topic_model.get_topic(row['Topic'])
+                                topic_name = " | ".join([word for word, _ in words[:5]])
                             st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
     except Exception as e:
         'country': ['Egypt', 'Saudi Arabia'],
         'poem': ['قصيدة مصرية', 'قصيدة سعودية']
     })
+    st.dataframe(example_df)