Spaces:
Update app.py
app.py
CHANGED
@@ -13,8 +13,8 @@ st.set_page_config(
     page_icon="📚",
     layout="wide"
 )
-@st.cache_resource
 
+@st.cache_resource
 def load_models():
     """Load and cache the models to prevent reloading"""
     tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
@@ -55,20 +55,21 @@ def split_text(text, max_length=512):
 def clean_arabic_text(text):
     # Add Arabic stop words
     ARABIC_STOP_WORDS = {
+        'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
+        'أو', 'و', 'ف', 'ل', 'ب', 'ك', 'كي', 'ان', 'هذا', 'هذه', 'ذلك',
+        'تلك', 'هؤلاء', 'هم', 'هن', 'هو', 'هي', 'نحن', 'انت', 'انتم',
+        'كان', 'كانت', 'يكون', 'تكون', 'اي', 'كل', 'بعض', 'غير', 'حول',
+        'عند', 'قد', 'لقد', 'لم', 'لن', 'لو', 'ما', 'ماذا', 'متى', 'كيف',
+        'اين', 'لماذا', 'الذي', 'التي', 'الذين', 'اللاتي', 'اللواتي',
+        'الان', 'بين', 'فوق', 'تحت', 'امام', 'خلف', 'حين', 'قبل', 'بعد',
+        'ي', 'أن', 'هل', 'لك', 'كم', 'له', 'لي', 'من', 'هو', 'هي', 'قوة',
+        'كما', 'لها', 'منذ', 'وقد', 'ولا', 'نفس', 'ولم', 'حيث', 'هناك',
+        'جدا', 'ذات', 'ضمن', 'انه', 'لدى', 'عليه', 'مثل', 'لكن', 'عند',
+        'أما', 'هذي', 'لأن', 'فيه', 'وكان', 'لدي', 'وقال', 'وهو', 'وهي',
+        'لكي', 'تلك', 'فهم', 'وفي', 'ولن', 'ولو', 'ولقد', 'ومن', 'لهذا',
+        'انه', 'ضمن', 'انها', 'جميع', 'الذي', 'قبل', 'بعد', 'حول', 'ايضا',
+        'لازم', 'حاجة', 'على', 'يجب', 'صار', 'صارت', 'تحت', 'ضد'
+    }
     """Clean Arabic text by removing stop words and normalizing."""
     words = text.split()
     cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
@@ -270,142 +271,142 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
             continue
 
     return summaries, topic_model
-
-# Load models
-try:
-    bert_tokenizer, bert_model, emotion_classifier = load_models()
-    st.success("Models loaded successfully!")
-except Exception as e:
-    st.error(f"Error loading models: {str(e)}")
-    st.stop()
 
+# Load models
+try:
+    bert_tokenizer, bert_model, emotion_classifier = load_models()
+    st.success("Models loaded successfully!")
+except Exception as e:
+    st.error(f"Error loading models: {str(e)}")
+    st.stop()
 
+# Main app interface
+st.title("📚 Arabic Poem Analysis")
+st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")
 
+# File upload
+uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
+
+if uploaded_file is not None:
+    try:
+        # Read the file
+        if uploaded_file.name.endswith('.csv'):
+            df = pd.read_csv(uploaded_file)
+        else:
+            df = pd.read_excel(uploaded_file)
+
+        # Validate columns
+        required_columns = ['country', 'poem']
+        if not all(col in df.columns for col in required_columns):
+            st.error("File must contain 'country' and 'poem' columns.")
+            st.stop()
+
+        # Clean data
+        df['country'] = df['country'].str.strip()
+        df = df.dropna(subset=['country', 'poem'])
+
+        # Add topic modeling controls
+        st.subheader("Topic Modeling Settings")
+        col1, col2 = st.columns(2)
+
+        with col1:
+            topic_strategy = st.radio(
+                "Topic Number Strategy",
+                ["Auto", "Manual"],
+                help="Choose whether to let the model determine the optimal number of topics or set it manually"
+            )
 
+            if topic_strategy == "Manual":
+                n_documents = len(df)
+                max_topics = max(2, min(50, n_documents // 20))
 
+                n_topics = st.slider(
+                    "Number of Topics",
+                    min_value=2,
+                    max_value=max_topics,
+                    value=min(20, max_topics),
+                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
+                )
 
+                st.info(f"""
+                💡 For your dataset of {n_documents:,} documents:
+                - Minimum topics: 2
+                - Maximum topics: {max_topics}
+                - Recommended range: {max(2, max_topics//5)}-{max_topics//2}
+                """)
+
+        with col2:
+            top_n = st.number_input(
+                "Number of top topics/emotions to display:",
+                min_value=1,
+                max_value=100,
+                value=10
+            )
+
+            min_topic_size = st.slider(
+                "Minimum Topic Size",
+                min_value=10,
+                max_value=100,
+                value=30,
+                help="Minimum number of documents required to form a topic"
+            )
+
+        if st.button("Process Data"):
+            with st.spinner("Processing your data..."):
+                summaries, topic_model = process_and_summarize(
+                    df,
+                    bert_tokenizer,
+                    bert_model,
+                    emotion_classifier,
+                    top_n=top_n,
+                    topic_strategy=topic_strategy,
+                    n_topics=n_topics if topic_strategy == "Manual" else None,
+                    min_topic_size=min_topic_size
+                )
 
+                if summaries:
+                    st.success("Analysis complete!")
+
+                    # Display results in tabs
+                    tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
+
+                    with tab1:
+                        for summary in summaries:
+                            with st.expander(f"📖 {summary['country']} ({summary['total_poems']} poems)"):
+                                col1, col2 = st.columns(2)
+
+                                with col1:
+                                    st.subheader("Top Topics")
+                                    for topic in summary['top_topics']:
+                                        st.write(f"• {topic['topic']}: {topic['count']} poems")
+
+                                with col2:
+                                    st.subheader("Emotions")
+                                    for emotion in summary['top_emotions']:
+                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
+
+                    with tab2:
+                        st.subheader("Global Topic Distribution")
+                        topic_info = topic_model.get_topic_info()
+                        for _, row in topic_info.iterrows():
+                            if row['Topic'] == -1:
+                                topic_name = "Miscellaneous"
+                            else:
+                                words = topic_model.get_topic(row['Topic'])
+                                topic_name = " | ".join([word for word, _ in words[:5]])
+                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
+
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
 
+else:
+    st.info("👆 Upload a file to get started!")
 
+    # Example format
+    st.write("### Expected File Format:")
+    example_df = pd.DataFrame({
+        'country': ['Egypt', 'Palestine'],
+        'poem': ['قصيدة مصرية', 'قصيدة فلسطينية']
+    })
+    st.dataframe(example_df)
+