import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
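
# Assumed runtime dependencies (e.g. for a requirements.txt), inferred from the
# imports above: streamlit, pandas, transformers, torch, numpy, scikit-learn,
# bertopic, wordcloud, matplotlib, plus openpyxl so pandas can read .xlsx uploads.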

# Arabic stop words used for text cleaning and topic representation
ARABIC_STOP_WORDS = {
    'ูู', 'ู ู', 'ุฅูู', 'ุนูู', 'ุนู', 'ู ุน', 'ุฎูุงู', 'ุญุชู', 'ุฅุฐุง', 'ุซู ',
    'ุฃู', 'ู', 'ู', 'ู', 'ุจ', 'ู', 'ูู', 'ุงู', 'ูุฐุง', 'ูุฐู', 'ุฐูู',
    'ุชูู', 'ูุคูุงุก', 'ูู ', 'ูู', 'ูู', 'ูู', 'ูุญู', 'ุงูุช', 'ุงูุชู ',
    'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ุงู', 'ูู', 'ุจุนุถ', 'ุบูุฑ', 'ุญูู',
    'ุนูุฏ', 'ูุฏ', 'ููุฏ', 'ูู ', 'ูู', 'ูู', 'ู ุง', 'ู ุงุฐุง', 'ู ุชู', 'ููู',
    'ุงูู', 'ูู ุงุฐุง', 'ุงูุฐู', 'ุงูุชู', 'ุงูุฐูู', 'ุงููุงุชู', 'ุงูููุงุชู',
    'ุงูุงู', 'ุจูู', 'ููู', 'ุชุญุช', 'ุงู ุงู ', 'ุฎูู', 'ุญูู', 'ูุจู', 'ุจุนุฏ',
    'ู', 'ุฃู', 'ูู', 'ูู', 'ูู ', 'ูู', 'ูู', 'ู ู', 'ูู', 'ูู', 'ููุฉ',
    'ูู ุง', 'ููุง', 'ู ูุฐ', 'ููุฏ', 'ููุง', 'ููุณ', 'ููู ', 'ุญูุซ', 'ููุงู',
    'ุฌุฏุง', 'ุฐุงุช', 'ุถู ู', 'ุงูู', 'ูุฏู', 'ุนููู', 'ู ุซู', 'ููู', 'ุนูุฏ',
    'ุฃู ุง', 'ูุฐู', 'ูุฃู', 'ููู', 'ููุงู', 'ูุฏู', 'ููุงู', 'ููู', 'ููู',
    'ููู', 'ุชูู', 'ููู ', 'ููู', 'ููู', 'ููู', 'ูููุฏ', 'ูู ู', 'ููุฐุง',
    'ุงูู', 'ุถู ู', 'ุงููุง', 'ุฌู ูุน', 'ุงูุฐู', 'ูุจู', 'ุจุนุฏ', 'ุญูู', 'ุงูุถุง',
    'ูุงุฒู ', 'ุญุงุฌุฉ', 'ุนูู', 'ูุฌุจ', 'ุตุงุฑ', 'ุตุงุฑุช', 'ุชุญุช', 'ุถุฏ'
}

# Configure page
st.set_page_config(
    page_title="Arabic Poem Analysis",
    page_icon="📚",
    layout="wide"
)

@st.cache_resource
def load_models():
    """Load and cache the models to prevent reloading."""
    # The embedding tokenizer must match the embedding model (AraBERT),
    # not the CAMeLBERT sentiment model.
    tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
    bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
    emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    emotion_classifier = pipeline(
        "sentiment-analysis",
        model=emotion_model,
        tokenizer=emotion_tokenizer,
        return_all_scores=True  # deprecated in recent transformers; top_k=None is the modern equivalent
    )
    return tokenizer, bert_model, emotion_classifier

def split_text(text, max_length=512):
    """Split text into chunks of at most max_length words, preserving word boundaries.

    The word count serves as a rough proxy for the token count; the tokenizer
    applies truncation later as a safety net.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        # Each whitespace-delimited word counts as one unit. (The original
        # len(word.split()) was always 1, so a plain word count was the intent.)
        if current_length + 1 > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = 1
        else:
            current_chunk.append(word)
            current_length += 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
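
# Example: with max_length=512, a 700-word poem is split into chunks of 512 and
# 188 words, while a poem of 512 words or fewer comes back as a single chunk.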

def create_arabic_wordcloud(text, title):
    # NOTE: font_path must point to a font with Arabic glyph support (e.g. an
    # installed Amiri or Noto Naskh Arabic file); 'arial' is a placeholder and
    # fails on systems without a resolvable Arial file. Correctly shaped,
    # right-to-left output may additionally require arabic_reshaper and python-bidi.
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        font_path='arial',
        max_words=200
    ).generate(text)
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20)
    return fig

def clean_arabic_text(text):
    """Clean Arabic text by removing stop words and single-character tokens.

    Matching is exact, so orthographic variants of a stop word are not removed.
    """
    words = text.split()
    cleaned_words = [word for word in words if word not in ARABIC_STOP_WORDS and len(word) > 1]
    return ' '.join(cleaned_words)

def classify_emotion(text, classifier):
    """Classify the dominant sentiment of a complete text with proper token handling."""
    try:
        # Chunk by actual token counts so no chunk exceeds the model's 512-token limit.
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        for word in words:
            word_tokens = len(classifier.tokenizer.encode(word))
            if current_length + word_tokens > 512:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_tokens
            else:
                current_chunk.append(word)
                current_length += word_tokens
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        if not chunks:
            chunks = [text]

        all_scores = []
        for chunk in chunks:
            try:
                # With return_all_scores=True, each result is a list of
                # {label, score} dicts covering every class.
                result = classifier(chunk, truncation=True, max_length=512)
                all_scores.append(result[0])
            except Exception as chunk_error:
                st.warning(f"Skipping chunk due to error: {str(chunk_error)}")
                continue

        if all_scores:
            # Average each label's score across chunks and return the top label.
            label_scores = {}
            count = len(all_scores)
            for scores in all_scores:
                for score in scores:
                    label_scores[score['label']] = label_scores.get(score['label'], 0) + score['score']
            avg_scores = {label: total / count for label, total in label_scores.items()}
            return max(avg_scores.items(), key=lambda x: x[1])[0]
        return "LABEL_2"  # default to neutral when no chunk could be scored
    except Exception as e:
        st.warning(f"Error in emotion classification: {str(e)}")
        return "LABEL_2"

def get_embedding_for_text(text, tokenizer, model):
    """Get a single embedding for a complete text by averaging chunk embeddings."""
    chunks = split_text(text)
    chunk_embeddings = []
    chunk_weights = []
    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            # Use the [CLS] token's final hidden state as the chunk embedding.
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            chunk_embeddings.append(embedding[0])
            # Track weights alongside successes so a failed chunk cannot
            # desynchronize the weighted average below.
            chunk_weights.append(len(chunk.split()))
        except Exception as e:
            st.warning(f"Error processing chunk: {str(e)}")
            continue
    if chunk_embeddings:
        # Weight each chunk by its word count so longer chunks contribute more.
        weights = np.array(chunk_weights, dtype=float)
        weights = weights / weights.sum()
        return np.average(chunk_embeddings, axis=0, weights=weights)
    return np.zeros(model.config.hidden_size)
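
# Design note: the function above keeps only the [CLS] vector. Mean-pooling the
# last hidden state over non-padding tokens is a common alternative that could
# be swapped in without touching the rest of the pipeline.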

def format_topics(topic_model, topic_counts):
    """Format topics for display."""
    formatted_topics = []
    for topic_num, count in topic_counts:
        if topic_num == -1:
            topic_label = "Miscellaneous"
        else:
            words = topic_model.get_topic(topic_num)
            topic_label = " | ".join([word for word, _ in words[:5]])
        formatted_topics.append({
            'topic': topic_label,
            'count': count
        })
    return formatted_topics

def format_emotions(emotion_counts):
    """Format emotions for display."""
    # Assumes the sentiment head uses the generic LABEL_n convention; if the
    # model emits human-readable labels instead, .get() falls back to the raw label.
    EMOTION_LABELS = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Positive',
        'LABEL_2': 'Neutral'
    }
    formatted_emotions = []
    for label, count in emotion_counts:
        emotion = EMOTION_LABELS.get(label, label)
        formatted_emotions.append({
            'emotion': emotion,
            'count': count
        })
    return formatted_emotions

def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50,
                          topic_strategy="Auto", n_topics=None, min_topic_size=10):
    """Process the data and generate per-country summaries with flexible topic configuration."""
    summaries = []
    # Apply the Arabic stop-word list inside BERTopic's topic-representation step.
    vectorizer = CountVectorizer(stop_words=list(ARABIC_STOP_WORDS))
    topic_model_params = {
        "language": "arabic",
        "calculate_probabilities": True,
        "min_topic_size": min_topic_size,  # was hardcoded to 1, silently ignoring the UI slider
        "n_gram_range": (1, 2),
        "top_n_words": 15,
        "verbose": True,
        "vectorizer_model": vectorizer
        # "min_samples" is an HDBSCAN argument, not a BERTopic one; passing it to
        # BERTopic() raises a TypeError, so it has been dropped.
    }
    st.write(f"Total documents: {len(df)}")
    st.write(f"Topic strategy: {topic_strategy}")
    st.write(f"Min topic size: {min_topic_size}")
    if topic_strategy == "Manual":
        topic_model_params["nr_topics"] = n_topics
    else:
        topic_model_params["nr_topics"] = "auto"
    topic_model = BERTopic(**topic_model_params)
    # NOTE: the same BERTopic instance is refit for every country, so the model
    # returned at the end reflects the last group processed.
    for country, group in df.groupby('country'):
        progress_text = f"Processing poems for {country}..."
        progress_bar = st.progress(0, text=progress_text)
        texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
        all_emotions = []
        embeddings = []
        for i, text in enumerate(texts):
            embedding = get_embedding_for_text(text, bert_tokenizer, bert_model)
            embeddings.append(embedding)
            progress = (i + 1) / len(texts) * 0.4
            progress_bar.progress(progress, text=f"Generated embeddings for {i+1}/{len(texts)} poems...")
        embeddings = np.array(embeddings)
        for i, text in enumerate(texts):
            emotion = classify_emotion(text, emotion_classifier)
            all_emotions.append(emotion)
            progress = 0.4 + ((i + 1) / len(texts) * 0.3)
            progress_bar.progress(progress, text=f"Classified emotions for {i+1}/{len(texts)} poems...")
        try:
            topics, probs = topic_model.fit_transform(texts, embeddings)
            st.write(f"Number of unique topics: {len(set(topics))}")
            st.write(f"Topic distribution: {Counter(topics)}")
            topic_counts = Counter(topics)
            topic_counts.pop(-1, None)  # exclude the outlier topic from the summary
            top_topics = format_topics(topic_model, topic_counts.most_common(top_n))
            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
            progress_bar.progress(1.0, text="Processing complete!")
        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {str(e)}")
            continue
    return summaries, topic_model
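
# Minimal usage sketch outside Streamlit (assumes a hypothetical poems.csv with
# the 'country' and 'poem' columns expected by the app):
#
#     df = pd.read_csv("poems.csv").dropna(subset=["country", "poem"])
#     summaries, topic_model = process_and_summarize(
#         df, bert_tokenizer, bert_model, emotion_classifier, top_n=10
#     )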

# Load models
try:
    bert_tokenizer, bert_model, emotion_classifier = load_models()
    st.success("Models loaded successfully!")
except Exception as e:
    st.error(f"Error loading models: {str(e)}")
    st.stop()

# Main app interface
st.title("📚 Arabic Poem Analysis")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

# File upload
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        # Read the file
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        # Validate columns
        required_columns = ['country', 'poem']
        if not all(col in df.columns for col in required_columns):
            st.error("File must contain 'country' and 'poem' columns.")
            st.stop()

        # Clean data
        df['country'] = df['country'].str.strip()
        df = df.dropna(subset=['country', 'poem'])

        # Topic modeling controls
        st.subheader("Topic Modeling Settings")
        col1, col2 = st.columns(2)
        with col1:
            topic_strategy = st.radio(
                "Topic Number Strategy",
                ["Auto", "Manual"],
                help="Choose whether to let the model determine the optimal number of topics or set it manually"
            )
            if topic_strategy == "Manual":
                n_documents = len(df)
                min_topics = 5
                # Cap the slider by dataset size so the "based on dataset size"
                # help text is actually true (previously a fixed 500).
                max_topics = min(500, max(min_topics + 1, n_documents // 2))
                default_topics = min(20, max_topics)
                n_topics = st.slider(
                    "Number of Topics",
                    min_value=min_topics,
                    max_value=max_topics,
                    value=default_topics,
                    help=f"Select the desired number of topics (max {max_topics} based on dataset size)"
                )
                st.info(f"""
                💡 For your dataset of {n_documents:,} documents:
                - Available topic range: {min_topics}-{max_topics}
                - Recommended range: {max_topics//10}-{max_topics//3} for optimal coherence
                """)
        with col2:
            top_n = st.number_input(
                "Number of top topics/emotions to display:",
                min_value=1,
                max_value=100,
                value=10
            )
            min_topic_size = st.slider(
                "Minimum Topic Size",
                min_value=10,
                max_value=100,
                value=30,
                help="Minimum number of documents required to form a topic"
            )

        if st.button("Process Data"):
            with st.spinner("Processing your data..."):
                summaries, topic_model = process_and_summarize(
                    df,
                    bert_tokenizer,
                    bert_model,
                    emotion_classifier,
                    top_n=top_n,
                    topic_strategy=topic_strategy,
                    n_topics=n_topics if topic_strategy == "Manual" else None,
                    min_topic_size=min_topic_size
                )
                if summaries:
                    st.success("Analysis complete!")
                    # Display results in tabs
                    tab1, tab2 = st.tabs(["Country Summaries", "Global Topics"])
                    with tab1:
                        for summary in summaries:
                            with st.expander(f"📚 {summary['country']} ({summary['total_poems']} poems)"):
                                col1, col2 = st.columns(2)
                                with col1:
                                    st.subheader("Top Topics")
                                    for topic in summary['top_topics']:
                                        st.write(f"• {topic['topic']}: {topic['count']} poems")
                                with col2:
                                    st.subheader("Top Emotions")
                                    for emotion in summary['top_emotions']:
                                        st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
                                st.subheader("Word Cloud Visualization")
                                country_poems = df[df['country'] == summary['country']]['poem']
                                # Strip stop words first so they do not dominate the cloud.
                                combined_text = clean_arabic_text(' '.join(country_poems))
                                wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
                                st.pyplot(wordcloud_fig)
                    with tab2:
                        st.subheader("Global Topic Distribution")
                        # Reflects the most recently fitted country model
                        # (see the note in process_and_summarize).
                        topic_info = topic_model.get_topic_info()
                        for _, row in topic_info.iterrows():
                            if row['Topic'] == -1:
                                topic_name = "Miscellaneous"
                            else:
                                words = topic_model.get_topic(row['Topic'])
                                topic_name = " | ".join([word for word, _ in words[:5]])
                            st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
else:
    st.info("👆 Upload a file to get started!")

    # Example format
    st.write("### Expected File Format:")
    example_df = pd.DataFrame({
        'country': ['Egypt', 'Palestine'],
        'poem': ['ูุตูุฏุฉ ู ุตุฑูุฉ', 'ูุตูุฏุฉ ููุณุทูููุฉ']
    })
    st.dataframe(example_df)
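
# A matching input file would look like this (UTF-8 CSV; poem text illustrative):
#
#     country,poem
#     Egypt,"<Arabic poem text>"
#     Palestine,"<Arabic poem text>"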