Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,96 +17,113 @@ st.set_page_config(
|
|
| 17 |
@st.cache_resource
|
| 18 |
def load_models():
|
| 19 |
"""Load and cache the models to prevent reloading"""
|
| 20 |
-
|
|
|
|
| 21 |
bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
|
| 22 |
emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
st.error(f"Error loading models: {str(e)}")
|
| 32 |
-
st.stop()
|
| 33 |
-
|
| 34 |
-
# Define emotion labels mapping
|
| 35 |
-
EMOTION_LABELS = {
|
| 36 |
-
'LABEL_0': 'Negative',
|
| 37 |
-
'LABEL_1': 'Positive',
|
| 38 |
-
'LABEL_2': 'Neutral'
|
| 39 |
-
}
|
| 40 |
|
| 41 |
-
def
|
| 42 |
-
"""Split text into chunks
|
| 43 |
-
|
| 44 |
chunks = []
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
for i in range(0, len(tokens), max_length-2):
|
| 48 |
-
chunk = tokens[i:i + max_length-2]
|
| 49 |
-
full_chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]
|
| 50 |
-
chunks.append(full_chunk)
|
| 51 |
-
text_chunks.append(tokenizer.decode(chunk))
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
padding=True,
|
| 64 |
-
truncation=True,
|
| 65 |
-
max_length=512)
|
| 66 |
-
inputs = {k: v.to(model.device) for k, v in inputs.items()}
|
| 67 |
-
|
| 68 |
-
with torch.no_grad():
|
| 69 |
-
outputs = model(**inputs)
|
| 70 |
-
|
| 71 |
-
embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
|
| 72 |
-
chunk_embeddings.append(embedding[0])
|
| 73 |
-
|
| 74 |
-
if chunk_embeddings:
|
| 75 |
-
return np.mean(chunk_embeddings, axis=0)
|
| 76 |
-
return np.zeros(model.config.hidden_size)
|
| 77 |
-
|
| 78 |
-
def generate_embeddings(texts, tokenizer, model):
|
| 79 |
-
"""Generate embeddings for a list of texts."""
|
| 80 |
-
embeddings = []
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
embedding = get_embedding_for_text(text, tokenizer, model)
|
| 85 |
-
embeddings.append(embedding)
|
| 86 |
-
except Exception as e:
|
| 87 |
-
st.warning(f"Error processing text: {str(e)}")
|
| 88 |
-
embeddings.append(np.zeros(model.config.hidden_size))
|
| 89 |
|
| 90 |
-
return
|
| 91 |
|
| 92 |
-
def classify_emotion(text,
|
| 93 |
-
"""Classify emotion for
|
| 94 |
try:
|
| 95 |
-
|
| 96 |
-
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
return final_emotion
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
except Exception as e:
|
| 108 |
st.warning(f"Error in emotion classification: {str(e)}")
|
| 109 |
-
return "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
def format_topics(topic_model, topic_counts):
|
| 112 |
"""Format topics for display."""
|
|
@@ -116,7 +133,7 @@ def format_topics(topic_model, topic_counts):
|
|
| 116 |
topic_label = "Miscellaneous"
|
| 117 |
else:
|
| 118 |
words = topic_model.get_topic(topic_num)
|
| 119 |
-
topic_label = " | ".join([word for word, _ in words[:
|
| 120 |
|
| 121 |
formatted_topics.append({
|
| 122 |
'topic': topic_label,
|
|
@@ -126,6 +143,13 @@ def format_topics(topic_model, topic_counts):
|
|
| 126 |
|
| 127 |
def format_emotions(emotion_counts):
|
| 128 |
"""Format emotions for display."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
formatted_emotions = []
|
| 130 |
for label, count in emotion_counts:
|
| 131 |
emotion = EMOTION_LABELS.get(label, label)
|
|
@@ -139,11 +163,13 @@ def process_and_summarize(df, top_n=50):
|
|
| 139 |
"""Process the data and generate summaries."""
|
| 140 |
summaries = []
|
| 141 |
|
| 142 |
-
# Initialize BERTopic
|
| 143 |
topic_model = BERTopic(
|
| 144 |
-
language="
|
| 145 |
calculate_probabilities=True,
|
| 146 |
-
min_topic_size=
|
|
|
|
|
|
|
| 147 |
verbose=True
|
| 148 |
)
|
| 149 |
|
|
@@ -153,20 +179,24 @@ def process_and_summarize(df, top_n=50):
|
|
| 153 |
progress_bar = st.progress(0, text=progress_text)
|
| 154 |
|
| 155 |
texts = group['poem'].dropna().tolist()
|
| 156 |
-
batch_size = 10
|
| 157 |
all_emotions = []
|
| 158 |
|
| 159 |
-
# Generate embeddings
|
| 160 |
-
embeddings =
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
# Process emotions
|
| 164 |
-
for i in
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
progress_bar.progress(0.66, text="Classifying emotions...")
|
| 170 |
|
| 171 |
try:
|
| 172 |
# Fit topic model
|
|
@@ -183,12 +213,21 @@ def process_and_summarize(df, top_n=50):
|
|
| 183 |
'top_emotions': top_emotions
|
| 184 |
})
|
| 185 |
progress_bar.progress(1.0, text="Processing complete!")
|
|
|
|
| 186 |
except Exception as e:
|
| 187 |
st.warning(f"Could not generate topics for {country}: {str(e)}")
|
| 188 |
continue
|
| 189 |
|
| 190 |
return summaries, topic_model
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
# Main app interface
|
| 193 |
st.title("📚 Arabic Poem Analysis")
|
| 194 |
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
|
|
@@ -251,7 +290,7 @@ if uploaded_file is not None:
|
|
| 251 |
topic_name = "Miscellaneous"
|
| 252 |
else:
|
| 253 |
words = topic_model.get_topic(row['Topic'])
|
| 254 |
-
topic_name = " | ".join([word for word, _ in words[:
|
| 255 |
st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
|
| 256 |
|
| 257 |
except Exception as e:
|
|
@@ -265,5 +304,4 @@ else:
|
|
| 265 |
'country': ['Egypt', 'Saudi Arabia'],
|
| 266 |
'poem': ['قصيدة مصرية', 'قصيدة سعودية']
|
| 267 |
})
|
| 268 |
-
st.dataframe(example_df)
|
| 269 |
-
|
|
|
|
| 17 |
@st.cache_resource
def load_models():
    """Load the embedding tokenizer/model and the sentiment pipeline.

    Wrapped in ``st.cache_resource`` so the loaded objects are cached and
    not reloaded.

    Returns:
        tuple: ``(tokenizer, bert_model, emotion_classifier)``.
        ``tokenizer`` and ``bert_model`` come from the same checkpoint and
        are meant to be used together to produce embeddings;
        ``emotion_classifier`` is a ``sentiment-analysis`` pipeline built
        from the sentiment checkpoint and its own tokenizer.
    """
    embedding_checkpoint = "aubmindlab/bert-base-arabertv2"
    emotion_checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment"

    # The returned tokenizer is the one callers pair with ``bert_model``, so
    # it must share that model's vocabulary. Loading it from the sentiment
    # checkpoint would produce token ids that index the wrong embedding rows.
    tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)
    bert_model = AutoModel.from_pretrained(embedding_checkpoint)

    # The sentiment model keeps its own matching tokenizer inside the pipeline.
    emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)
    emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        # Kept as-is: the consumer of this pipeline indexes ``result[0]`` to
        # get the per-label score list, which depends on this output shape.
        return_all_scores=True,
    )
    return tokenizer, bert_model, emotion_classifier
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
+
def split_text(text, max_length=512, length_fn=None):
    """Split text into space-joined chunks without breaking words.

    Words are taken from ``text.split()`` and packed greedily: a chunk is
    closed as soon as adding the next word would push its accumulated
    length above ``max_length``.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_length: Maximum accumulated length per chunk.
        length_fn: Optional callable mapping a word to its length cost
            (for example the number of sub-word tokens a tokenizer produces
            for it). When omitted every word costs 1, i.e. ``max_length``
            is a *word* count, not a model token count.

    Returns:
        list[str]: The chunks in original order. A single word whose cost
        exceeds ``max_length`` is emitted as its own chunk rather than
        dropped or split.
    """
    if not text:
        return []

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_length = 1 if length_fn is None else length_fn(word)

        # Close the running chunk only if it already holds something;
        # otherwise an oversized word would emit an empty chunk first.
        if current_chunk and current_length + word_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(word)
        current_length += word_length

    # Flush whatever is left after the last word.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
|
| 55 |
|
| 56 |
+
def classify_emotion(text, classifier):
    """Classify the emotion of a complete text by averaging chunk scores.

    The text is split with ``split_text``; each chunk is scored by
    ``classifier`` and the per-label scores are averaged over the chunks.

    Args:
        text: The text to classify.
        classifier: Callable invoked as
            ``classifier(chunk, truncation=True, max_length=512)`` whose
            result's first element is an iterable of dicts with ``'label'``
            and ``'score'`` keys.

    Returns:
        str: The label with the highest average score, or ``"LABEL_2"``
        when there is nothing to score or classification fails (a Streamlit
        warning is shown in the failure case).
    """
    try:
        label_totals = {}
        chunk_count = 0

        for chunk in split_text(text):
            # split_text bounds chunks by word count, which can still exceed
            # the model's token limit; ask the tokenizer to clip the input
            # instead of letting the forward pass raise on long chunks.
            result = classifier(chunk, truncation=True, max_length=512)
            for entry in result[0]:  # scores for all labels
                label = entry['label']
                label_totals[label] = label_totals.get(label, 0) + entry['score']
            chunk_count += 1

        if not label_totals:
            return "LABEL_2"  # Default to neutral if no valid results

        # Average per label, then pick the best-scoring label.
        avg_scores = {label: total / chunk_count for label, total in label_totals.items()}
        return max(avg_scores.items(), key=lambda item: item[1])[0]

    except Exception as e:
        # Best-effort: one bad text must not abort the whole run.
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"  # Default to neutral
|
| 94 |
+
|
| 95 |
+
def get_embedding_for_text(text, tokenizer, model):
    """Compute one embedding vector for a complete text.

    The text is split with ``split_text``; each chunk is encoded and the
    first-position vector of ``last_hidden_state`` (presumably the [CLS]
    token for this tokenizer -- confirm) is used as the chunk embedding.
    Chunk embeddings are combined with a weighted average, weighting each
    chunk by its word count.

    Args:
        text: The text to embed.
        tokenizer: Callable tokenizer producing ``return_tensors="pt"``
            inputs for ``model``.
        model: Model exposing ``device``, ``config.hidden_size`` and
            returning an object with ``last_hidden_state``.

    Returns:
        numpy.ndarray: The weighted-average embedding, or a zero vector of
        length ``model.config.hidden_size`` when no chunk could be embedded.
    """
    chunk_embeddings = []
    chunk_weights = []

    for chunk in split_text(text):
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            # Record the weight together with the embedding so the two lists
            # stay aligned even when some chunks fail and are skipped below.
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue

    if not chunk_embeddings:
        return np.zeros(model.config.hidden_size)

    # Use weighted average based on chunk length
    weights = np.array(chunk_weights)
    weights = weights / weights.sum()
    return np.average(chunk_embeddings, axis=0, weights=weights)
|
| 127 |
|
| 128 |
def format_topics(topic_model, topic_counts):
|
| 129 |
"""Format topics for display."""
|
|
|
|
| 133 |
topic_label = "Miscellaneous"
|
| 134 |
else:
|
| 135 |
words = topic_model.get_topic(topic_num)
|
| 136 |
+
topic_label = " | ".join([word for word, _ in words[:5]]) # Show top 5 words per topic
|
| 137 |
|
| 138 |
formatted_topics.append({
|
| 139 |
'topic': topic_label,
|
|
|
|
| 143 |
|
| 144 |
def format_emotions(emotion_counts):
|
| 145 |
"""Format emotions for display."""
|
| 146 |
+
# Define emotion labels mapping
|
| 147 |
+
EMOTION_LABELS = {
|
| 148 |
+
'LABEL_0': 'Negative',
|
| 149 |
+
'LABEL_1': 'Positive',
|
| 150 |
+
'LABEL_2': 'Neutral'
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
formatted_emotions = []
|
| 154 |
for label, count in emotion_counts:
|
| 155 |
emotion = EMOTION_LABELS.get(label, label)
|
|
|
|
| 163 |
"""Process the data and generate summaries."""
|
| 164 |
summaries = []
|
| 165 |
|
| 166 |
+
# Initialize BERTopic with Arabic-specific settings
|
| 167 |
topic_model = BERTopic(
|
| 168 |
+
language="multilingual",
|
| 169 |
calculate_probabilities=True,
|
| 170 |
+
min_topic_size=2, # Allow smaller topic groups
|
| 171 |
+
n_gram_range=(1, 3), # Include up to trigrams
|
| 172 |
+
top_n_words=15, # Show more words per topic
|
| 173 |
verbose=True
|
| 174 |
)
|
| 175 |
|
|
|
|
| 179 |
progress_bar = st.progress(0, text=progress_text)
|
| 180 |
|
| 181 |
texts = group['poem'].dropna().tolist()
|
|
|
|
| 182 |
all_emotions = []
|
| 183 |
|
| 184 |
+
# Generate embeddings with progress tracking
|
| 185 |
+
embeddings = []
|
| 186 |
+
for i, text in enumerate(texts):
|
| 187 |
+
embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
|
| 188 |
+
embeddings.append(embedding)
|
| 189 |
+
progress = (i + 1) / len(texts) * 0.4
|
| 190 |
+
progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
|
| 191 |
+
|
| 192 |
+
embeddings = np.array(embeddings)
|
| 193 |
|
| 194 |
+
# Process emotions with progress tracking
|
| 195 |
+
for i, text in enumerate(texts):
|
| 196 |
+
emotion = classify_emotion(text, emotion_classifier)
|
| 197 |
+
all_emotions.append(emotion)
|
| 198 |
+
progress = 0.4 + ((i + 1) / len(texts) * 0.3)
|
| 199 |
+
progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
|
|
|
|
| 200 |
|
| 201 |
try:
|
| 202 |
# Fit topic model
|
|
|
|
| 213 |
'top_emotions': top_emotions
|
| 214 |
})
|
| 215 |
progress_bar.progress(1.0, text="Processing complete!")
|
| 216 |
+
|
| 217 |
except Exception as e:
|
| 218 |
st.warning(f"Could not generate topics for {country}: {str(e)}")
|
| 219 |
continue
|
| 220 |
|
| 221 |
return summaries, topic_model
|
| 222 |
|
| 223 |
+
# Load models
|
| 224 |
+
try:
|
| 225 |
+
bert_tokenizer, bert_model, emotion_classifier = load_models()
|
| 226 |
+
st.success("Models loaded successfully!")
|
| 227 |
+
except Exception as e:
|
| 228 |
+
st.error(f"Error loading models: {str(e)}")
|
| 229 |
+
st.stop()
|
| 230 |
+
|
| 231 |
# Main app interface
|
| 232 |
st.title("📚 Arabic Poem Analysis")
|
| 233 |
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
|
|
|
|
| 290 |
topic_name = "Miscellaneous"
|
| 291 |
else:
|
| 292 |
words = topic_model.get_topic(row['Topic'])
|
| 293 |
+
topic_name = " | ".join([word for word, _ in words[:5]])
|
| 294 |
st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
|
| 295 |
|
| 296 |
except Exception as e:
|
|
|
|
| 304 |
'country': ['Egypt', 'Saudi Arabia'],
|
| 305 |
'poem': ['قصيدة مصرية', 'قصيدة سعودية']
|
| 306 |
})
|
| 307 |
+
st.dataframe(example_df)
|
|
|