Spaces:

algorithmicsuperintelligence
/

prompt-optimizer

Running

App Files Files Community

codelion committed 25 days ago

Commit

5e57630

verified ·

1 Parent(s): 19d1d68

Upload app.py

Browse files

Files changed (1) hide show

app.py +55 -28

app.py CHANGED Viewed

@@ -71,13 +71,18 @@ def validate_dataset(dataset_name: str, split: str, input_field: str, target_fie
         try:
             dataset = load_dataset(dataset_name, split=split, streaming=True)
         except ValueError as e:
-            # If it fails with config error, try with "main" config (common for datasets like gsm8k)
             if "config" in str(e).lower() or "Config name is missing" in str(e):
-                print(f"Dataset requires config, trying with 'main' config...")
                 try:
-                    dataset = load_dataset(dataset_name, "main", split=split, streaming=True)
                 except:
-                    # If "main" doesn't work, raise the original error
                     raise e
             else:
                 raise
@@ -161,9 +166,13 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
         try:
             dataset = load_dataset(dataset_name, split=split, streaming=False)
         except ValueError as e:
-            # If it fails with config error, try with "main" config (common for datasets like gsm8k)
             if "config" in str(e).lower() or "Config name is missing" in str(e):
-                dataset = load_dataset(dataset_name, "main", split=split, streaming=False)
             else:
                 raise
@@ -213,7 +222,7 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
                         {"role": "system", "content": "You are a helpful assistant."},
                         {"role": "user", "content": formatted_prompt}
                     ],
-                    temperature=0.1,
                     max_tokens=500,
                 )
@@ -266,11 +275,18 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
                     target_lower = target_answer.lower()
                     pred_lower = pred_answer.lower()
-                    # Sentiment mappings
                     if target_lower in ["1", "positive", "pos"]:
-                        is_correct = any(word in pred_lower for word in ["positive", "good", "great", "1"])
                     elif target_lower in ["0", "negative", "neg"]:
-                        is_correct = any(word in pred_lower for word in ["negative", "bad", "poor", "0"])
                 if is_correct:
                     correct += 1
@@ -542,9 +558,13 @@ def evaluate(prompt: str) -> dict:
         try:
             dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
         except ValueError as e:
-            # If it fails with config error, try with "main" config (common for datasets like gsm8k)
             if "config" in str(e).lower() or "Config name is missing" in str(e):
-                dataset = load_dataset("{dataset_name}", "main", split="{split}", streaming=False)
             else:
                 raise
@@ -581,7 +601,7 @@ def evaluate(prompt: str) -> dict:
                             {{"role": "system", "content": "You are a helpful assistant."}},
                             {{"role": "user", "content": formatted_prompt}}
                         ],
-                        temperature=0.1,
                         max_tokens=500,
                     )
@@ -636,11 +656,18 @@ def evaluate(prompt: str) -> dict:
                         target_lower = target_answer.lower()
                         pred_lower = pred_answer.lower()
-                        # Sentiment mappings
                         if target_lower in ["1", "positive", "pos"]:
-                            is_correct = any(word in pred_lower for word in ["positive", "good", "great", "1"])
                         elif target_lower in ["0", "negative", "neg"]:
-                            is_correct = any(word in pred_lower for word in ["negative", "bad", "poor", "0"])
                     if is_correct:
                         correct += 1
@@ -1062,12 +1089,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-    2. Default dataset is **GSM8K** (grade school math problems) - challenging for showing improvement!
-    3. Specify the dataset split and field names (or use other datasets like `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
-    7. Compare initial vs. evolved performance - expect 20-40% improvement on GSM8K!
     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
@@ -1085,34 +1112,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
-                value="gsm8k",
-                placeholder="e.g., gsm8k, stanfordnlp/imdb, openai/gsm8k",
-                info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'gsm8k' → 'gsm8k:main')"
             )
             dataset_split = gr.Textbox(
                 label="Dataset Split",
-                value="train",
                 placeholder="e.g., train, test, validation"
             )
             input_field = gr.Textbox(
                 label="Input Field Name",
-                value="question",
-                placeholder="e.g., question, text, context",
                 info="The field containing inputs to process"
             )
             target_field = gr.Textbox(
                 label="Target Field Name",
-                value="answer",
-                placeholder="e.g., answer, label, target",
                 info="The field containing expected outputs"
             )
             initial_prompt = gr.TextArea(
                 label="Initial Prompt",
-                value="{input}\n\nAnswer:",
                 lines=6,
                 info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
             )

         try:
             dataset = load_dataset(dataset_name, split=split, streaming=True)
         except ValueError as e:
+            # If it fails with config error, try common configs
             if "config" in str(e).lower() or "Config name is missing" in str(e):
+                # Try common configs based on dataset name
+                default_config = "main"
+                if dataset_name.lower() == "glue":
+                    default_config = "sst2"
+                print(f"Dataset requires config, trying with '{default_config}' config...")
                 try:
+                    dataset = load_dataset(dataset_name, default_config, split=split, streaming=True)
                 except:
+                    # If default config doesn't work, raise the original error
                     raise e
             else:
                 raise
         try:
             dataset = load_dataset(dataset_name, split=split, streaming=False)
         except ValueError as e:
+            # If it fails with config error, try common configs
             if "config" in str(e).lower() or "Config name is missing" in str(e):
+                # Try common configs based on dataset name
+                default_config = "main"
+                if dataset_name.lower() == "glue":
+                    default_config = "sst2"
+                dataset = load_dataset(dataset_name, default_config, split=split, streaming=False)
             else:
                 raise
                         {"role": "system", "content": "You are a helpful assistant."},
                         {"role": "user", "content": formatted_prompt}
                     ],
+                    temperature=0.0,
                     max_tokens=500,
                 )
                     target_lower = target_answer.lower()
                     pred_lower = pred_answer.lower()
+                    # Sentiment mappings with expanded synonyms
+                    positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
+                                     "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
+                                     "praise", "favorable", "approve"]
+                    negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
+                                     "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
+                                     "critique", "condemn", "sarcasm"]
                     if target_lower in ["1", "positive", "pos"]:
+                        is_correct = any(word in pred_lower for word in positive_words)
                     elif target_lower in ["0", "negative", "neg"]:
+                        is_correct = any(word in pred_lower for word in negative_words)
                 if is_correct:
                     correct += 1
         try:
             dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
         except ValueError as e:
+            # If it fails with config error, try common configs
             if "config" in str(e).lower() or "Config name is missing" in str(e):
+                # Try common configs based on dataset name
+                default_config = "main"
+                if "{dataset_name}".lower() == "glue":
+                    default_config = "sst2"
+                dataset = load_dataset("{dataset_name}", default_config, split="{split}", streaming=False)
             else:
                 raise
                             {{"role": "system", "content": "You are a helpful assistant."}},
                             {{"role": "user", "content": formatted_prompt}}
                         ],
+                        temperature=0.0,
                         max_tokens=500,
                     )
                         target_lower = target_answer.lower()
                         pred_lower = pred_answer.lower()
+                        # Sentiment mappings with expanded synonyms
+                        positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
+                                         "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
+                                         "praise", "favorable", "approve"]
+                        negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
+                                         "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
+                                         "critique", "condemn", "sarcasm"]
                         if target_lower in ["1", "positive", "pos"]:
+                            is_correct = any(word in pred_lower for word in positive_words)
                         elif target_lower in ["0", "negative", "neg"]:
+                            is_correct = any(word in pred_lower for word in negative_words)
                     if is_correct:
                         correct += 1
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
+    2. Default dataset is **SST-2** (binary sentiment classification) - perfect for showing prompt improvement!
+    3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
+    7. Compare initial vs. evolved performance - expect improvement from ~15% to 70%+ on SST-2!
     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
+                value="glue",
+                placeholder="e.g., glue, gsm8k, stanfordnlp/imdb",
+                info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'glue' → 'glue:sst2')"
             )
             dataset_split = gr.Textbox(
                 label="Dataset Split",
+                value="validation",
                 placeholder="e.g., train, test, validation"
             )
             input_field = gr.Textbox(
                 label="Input Field Name",
+                value="sentence",
+                placeholder="e.g., sentence, text, question",
                 info="The field containing inputs to process"
             )
             target_field = gr.Textbox(
                 label="Target Field Name",
+                value="label",
+                placeholder="e.g., label, answer, target",
                 info="The field containing expected outputs"
             )
             initial_prompt = gr.TextArea(
                 label="Initial Prompt",
+                value="{input}\n\nIs this review positive or negative? Answer with just 'positive' or 'negative':",
                 lines=6,
                 info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
             )