Upload app.py
app.py (CHANGED)
@@ -540,11 +540,11 @@ def evaluate(prompt: str) -> dict:
     """
     Evaluate a prompt using 2-stage cascading evaluation to save API calls.
 
-    Stage 1: Evaluate with
+    Stage 1: Evaluate with 50 samples
     - If accuracy >= 0.5, proceed to Stage 2
-    - If accuracy < 0.5, return early (no point wasting
+    - If accuracy < 0.5, return early (no point wasting 200 more samples)
 
-    Stage 2: Evaluate with
+    Stage 2: Evaluate with 200 more samples (total 250)
     - Combine results for final score
 
     Returns dict with combined_score (0-1), accuracy, correct, and total.
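
For orientation, the cascade this docstring describes can be summarized in a short, self-contained sketch. This is an illustration only, not the code in app.py: the `score_samples` callable and the `"stage2_complete"` label are assumed names.

    # Hypothetical sketch of the 2-stage cascade; score_samples stands in for
    # whatever actually calls the model on a batch and returns (correct, total).
    def cascading_evaluate(score_samples, dataset, stage1_size=50, stage2_size=200, threshold=0.5):
        # Stage 1: score a small slice first.
        n1 = min(stage1_size, len(dataset))
        correct1, total1 = score_samples(dataset[:n1])
        stage1_score = correct1 / total1 if total1 else 0.0

        # Early exit: a clearly weak prompt is not worth 200 more API calls.
        if stage1_score < threshold:
            return {"combined_score": stage1_score, "accuracy": stage1_score,
                    "correct": correct1, "total": total1, "stage": "stage1_early_exit"}

        # Stage 2: score up to 200 more samples and pool the counts.
        n2 = min(stage2_size, max(0, len(dataset) - n1))
        correct2, total2 = score_samples(dataset[n1:n1 + n2])
        correct, total = correct1 + correct2, total1 + total2
        combined = correct / total if total else 0.0
        return {"combined_score": combined, "accuracy": combined,
                "correct": correct, "total": total, "stage": "stage2_complete"}
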
@@ -679,8 +679,8 @@ def evaluate(prompt: str) -> dict:
 
         return correct, total
 
-    # STAGE 1: Evaluate with
-    stage1_size =
+    # STAGE 1: Evaluate with 50 samples first
+    stage1_size = 50
     stage1_samples_count = min(stage1_size, len(dataset))
 
     if len(dataset) > stage1_samples_count:
@@ -697,7 +697,7 @@ def evaluate(prompt: str) -> dict:
 
     # Early exit if Stage 1 score is below threshold
     if stage1_score < 0.5:
-        print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved
+        print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 200 API calls)")
        return {{
            "combined_score": stage1_score,
            "accuracy": stage1_score,
@@ -706,9 +706,9 @@ def evaluate(prompt: str) -> dict:
            "stage": "stage1_early_exit"
        }}
 
-    # STAGE 2: Continue with
-    print(f"[Stage 2/2] Score >= 0.5 - proceeding with
-    stage2_size =
+    # STAGE 2: Continue with 200 more samples
+    print(f"[Stage 2/2] Score >= 0.5 - proceeding with 200 more samples...")
+    stage2_size = 200
     stage2_samples_count = min(stage2_size, max(0, len(dataset) - stage1_samples_count))
 
     if stage2_samples_count > 0:
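
A quick usage example of the hypothetical sketch above, with a dummy scorer in place of real API calls (all numbers invented):

    # Dummy scorer: pretend the prompt answers 60% of samples correctly.
    def fake_score_samples(samples):
        return int(len(samples) * 0.6), len(samples)

    result = cascading_evaluate(fake_score_samples, dataset=list(range(300)))
    print(result["stage"], result["combined_score"])  # clears Stage 1, scores 0.6 over 250 samples
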
@@ -1089,12 +1089,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-    2. Default dataset is **
-    3. Specify the dataset split and field names (or use other datasets like `
+    2. Default dataset is **GSM8K** (grade school math) - great for showing prompt improvement!
+    3. Specify the dataset split and field names (or use other datasets like `glue`, `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
-    7. Compare initial vs. evolved performance -
+    7. Compare initial vs. evolved performance - uses 50 samples for stage 1, 200 for stage 2!
 
     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
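
The `{input}` placeholder mentioned in step 1 behaves like an ordinary Python format field; a tiny illustration with the default initial prompt and a GSM8K-style question (not the app's actual substitution code):

    # Fill the prompt template with one dataset example (illustration only).
    prompt_template = "{input}\n\nAnswer:"
    example = {"question": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total?"}
    print(prompt_template.format(input=example["question"]))
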
@@ -1112,34 +1112,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
         dataset_name = gr.Textbox(
             label="HuggingFace Dataset (Full Name)",
-            value="
-            placeholder="e.g.,
+            value="gsm8k",
+            placeholder="e.g., gsm8k, glue, stanfordnlp/imdb",
             info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'glue' → 'glue:sst2')"
         )
 
         dataset_split = gr.Textbox(
             label="Dataset Split",
-            value="
+            value="train",
             placeholder="e.g., train, test, validation"
         )
 
         input_field = gr.Textbox(
             label="Input Field Name",
-            value="
-            placeholder="e.g., sentence, text
+            value="question",
+            placeholder="e.g., question, sentence, text",
             info="The field containing inputs to process"
         )
 
         target_field = gr.Textbox(
             label="Target Field Name",
-            value="
-            placeholder="e.g.,
+            value="answer",
+            placeholder="e.g., answer, label, target",
             info="The field containing expected outputs"
         )
 
         initial_prompt = gr.TextArea(
             label="Initial Prompt",
-            value="{input}\n\
+            value="{input}\n\nAnswer:",
             lines=6,
             info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
         )
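
To sanity-check these defaults outside the app, the same dataset, split, and fields can be loaded directly with the `datasets` library. A minimal sketch, assuming GSM8K's "main" config (which the app's config auto-detection is meant to supply):

    from datasets import load_dataset

    # Load the default dataset/split and peek at the default input/target fields.
    ds = load_dataset("gsm8k", "main", split="train")
    row = ds[0]
    print(row["question"][:80])  # input field
    print(row["answer"][:80])    # target field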
|