Upload app.py
app.py
CHANGED
@@ -857,7 +857,7 @@ Your improved prompt here
 
 def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
                     model: str, input_field: str, target_field: str,
-                    progress=gr.Progress()) -> Tuple[str, str, str
+                    progress=gr.Progress()) -> Tuple[str, str, str]:
     """Run OpenEvolve to optimize the prompt."""
 
     progress(0, desc="Validating inputs...")
@@ -868,7 +868,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     )
 
     if not is_valid:
-        return f"## Validation Failed\n\n{validation_message}", "", ""
+        return f"## Validation Failed\n\n{validation_message}", "", ""
 
     progress(0.05, desc=f"Validation passed: {validation_message}")
 
@@ -890,19 +890,19 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         progress(0.15, desc="Creating configuration...")
         config_path = create_config_file(model, work_dir)
 
-        # Run initial evaluation (using
+        # Run initial evaluation (using 200 samples for accurate baseline)
         # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-        progress(0.2, desc="Running initial evaluation on
+        progress(0.2, desc="Running initial evaluation on 200 samples...")
         initial_eval = evaluate_prompt(
-            initial_prompt, dataset_name, dataset_split,
+            initial_prompt, dataset_name, dataset_split, 200,
             model, input_field, target_field
         )
 
         if "error" in initial_eval:
-            return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", ""
+            return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", ""
 
         if initial_eval["total"] == 0:
-            return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", ""
+            return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", ""
 
         # Save the indices for final evaluation (ensures fair comparison)
         eval_indices = initial_eval.get("indices", [])
@@ -973,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         # Parse evolution history for visualization
         evolution_viz = parse_evolution_history(output_dir)
 
-        progress(0.85, desc="Evaluating best evolved prompt on
+        progress(0.85, desc="Evaluating best evolved prompt on 200 samples...")
 
         # Get the best prompt (OpenEvolve saves to output_dir/best/best_program.txt)
         best_prompt_path = os.path.join(output_dir, "best", "best_program.txt")
@@ -991,7 +991,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         # Evaluate best prompt on THE SAME samples as initial eval (fair comparison)
         final_eval = evaluate_prompt(
-            best_prompt, dataset_name, dataset_split,
+            best_prompt, dataset_name, dataset_split, 200,
             model, input_field, target_field,
             fixed_indices=eval_indices # Use same samples as initial eval!
         )
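For readers following the fixed-indices change above: a minimal sketch of the sampling pattern it relies on, where the baseline run records which rows it scored so the final run can reuse exactly the same rows. The helper name `evaluate_on_fixed_samples` and the `score_fn` callback are illustrative assumptions, not the Space's actual `evaluate_prompt` implementation.

```python
import random
from typing import Callable, Optional

def evaluate_on_fixed_samples(dataset: list, num_samples: int,
                              score_fn: Callable[[dict], bool],
                              fixed_indices: Optional[list] = None) -> dict:
    """Score a prompt on a reproducible subset of the dataset.

    If fixed_indices is given, reuse those rows; otherwise sample
    num_samples rows and return their indices so a later call can
    evaluate on exactly the same data (fair before/after comparison).
    """
    if fixed_indices is None:
        fixed_indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))
    correct = sum(1 for i in fixed_indices if score_fn(dataset[i]))
    total = len(fixed_indices)
    return {
        "accuracy": 100.0 * correct / total if total else 0.0,
        "total": total,
        "indices": fixed_indices,  # save these for the final evaluation
    }

# Usage mirroring the diff: run the baseline, keep the indices,
# then evaluate the evolved prompt on the very same samples.
if __name__ == "__main__":
    data = [{"x": i, "y": i % 2} for i in range(1000)]
    baseline = evaluate_on_fixed_samples(data, 200, lambda row: row["y"] == 0)
    final = evaluate_on_fixed_samples(data, 200, lambda row: row["y"] == 0,
                                      fixed_indices=baseline["indices"])
    assert baseline["indices"] == final["indices"]
```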
@@ -1023,8 +1023,8 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
 - **Model**: {model}
-- **Initial/Final Eval**:
-- **Evolution Eval**: Staged (
+- **Initial/Final Eval**: 200 samples each (same samples for fair comparison)
+- **Evolution Eval**: Staged (50 → 200 if score ≥ 0.5)
 - **Iterations**: 10
 
 ### Results
@@ -1037,39 +1037,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         progress(1.0, desc="Complete!")
 
-
-        all_prompts = []
-
-        # Add initial prompt
-        initial_score = initial_eval['accuracy'] / 100.0 # Convert to 0-1 scale
-        all_prompts.append({
-            "prompt": initial_prompt,
-            "score": initial_score,
-            "label": "Initial Prompt",
-            "iteration": 0
-        })
-
-        # Add evolved prompts (only programs that were "best" at some point)
-        # Pass initial_score so we only show programs that BEAT the initial prompt
-        prompt_history = collect_prompt_history(output_dir, initial_score=initial_score)
-        for i, p in enumerate(prompt_history):
-            # Skip if it's the same as initial (shouldn't happen, but just in case)
-            if i == 0 and p.get("iteration", -1) == 0:
-                continue
-
-            all_prompts.append({
-                "prompt": p["prompt"],
-                "score": p.get("score", 0.0),
-                "label": f"Best at Iteration {p.get('iteration', i+1)}",
-                "iteration": p.get("iteration", i+1)
-            })
-
-        return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts)
+        return summary, initial_results, final_results
 
     except Exception as e:
-
-        error_prompts = [{"prompt": initial_prompt, "score": 0.0, "label": "Initial Prompt"}]
-        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", "", error_prompts, 0, 1
+        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, ""
 
     finally:
         # Don't clean up - keep prompts for browsing
@@ -1094,7 +1065,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 4. Choose a free model from OpenRouter
 5. Click "Optimize Prompt" - the system will validate everything first!
 6. Watch the evolution progress in real-time
-7. Compare initial vs.
+7. Compare initial vs. best prompt side-by-side with full 200-sample accuracy!
 
 **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
 """)
@@ -1157,39 +1128,15 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         with gr.Column():
             summary = gr.Markdown("Click 'Validate & Optimize Prompt' to start optimization...", visible=True)
 
-
-        with gr.Column():
-            initial_results = gr.Markdown("### Initial Results\nWill appear here after validation...", visible=True)
-        with gr.Column():
-            final_results = gr.Markdown("### Final Results\nWill appear here after optimization...", visible=True)
-
-    with gr.Row():
-        with gr.Column():
-            evolution_progress = gr.Markdown("### Evolution Progress\nEvolution progress will appear here during optimization...", visible=True)
-
-    # Prompt History Browser
+    # Side-by-side comparison: Initial vs Best Prompt
     gr.Markdown("---")
-    gr.Markdown("##
-    gr.Markdown("Browse through the progression of **best** prompts found during evolution. Only shows prompts that improved the score (no duplicates or intermediate programs).")
+    gr.Markdown("## 🔍 Prompt Comparison: Initial vs Best")
 
     with gr.Row():
-        with gr.Column(
-
-
-
-            interactive=False,
-            placeholder="Prompts will appear here after optimization completes...",
-            show_label=False
-        )
-        with gr.Column(scale=2):
-            prompt_counter = gr.Markdown("**Prompt**: -/-")
-            prev_btn = gr.Button("⬅️ Previous", size="sm")
-            next_btn = gr.Button("Next ➡️", size="sm")
-            gr.Markdown("**Prompt Types:**\n- First = Initial\n- Middle = Intermediate\n- Last = Final Best")
-
-    # Hidden state to store prompt history and current index
-    prompt_history_state = gr.State([])
-    current_prompt_index = gr.State(0)
+        with gr.Column():
+            initial_results = gr.Markdown("### Initial Prompt\nWill appear here after validation...", visible=True)
+        with gr.Column():
+            final_results = gr.Markdown("### Best Prompt\nWill appear here after optimization...", visible=True)
 
     # Documentation section - in collapsible accordion
     gr.Markdown("---")
@@ -1222,9 +1169,10 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
    - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
 4. **Run & Monitor**:
    - All inputs are validated before starting
-   -
+   - Initial and final evaluations use 200 samples each for accurate comparison
+   - Evolution uses staged evaluation (50 samples first, then 200 more if promising)
   - Saves API calls by early-stopping poor prompts (< 50% accuracy)
-   -
+   - Compare initial vs best prompt side-by-side
 
 ### About OpenEvolve:
 OpenEvolve is an open-source evolutionary optimization framework. Learn more at:
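The staged evaluation referenced in the summary and bullets above (a cheap 50-sample pass, with the 200-sample pass only for prompts scoring at least 0.5) can be sketched as a two-stage cascade. The function name and whether the second stage reuses or extends the first batch are assumptions for illustration, not OpenEvolve's actual evaluator.

```python
from typing import Callable, Sequence

def staged_accuracy(samples: Sequence[dict],
                    is_correct: Callable[[dict], bool],
                    stage1_size: int = 50,
                    stage2_size: int = 200,
                    threshold: float = 0.5) -> float:
    """Two-stage cascade: score a small batch first and only spend
    API calls on the larger batch if the prompt looks promising."""
    stage1 = samples[:stage1_size]
    acc = sum(is_correct(s) for s in stage1) / max(len(stage1), 1)
    if acc < threshold:
        return acc  # early stop: weak prompts never reach the big batch
    stage2 = samples[:stage2_size]
    return sum(is_correct(s) for s in stage2) / max(len(stage2), 1)

# Example: a prompt scoring below the threshold costs only 50 calls.
if __name__ == "__main__":
    data = [{"label": i % 2} for i in range(400)]
    print(staged_accuracy(data, lambda s: s["label"] == 0))
```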
@@ -1232,82 +1180,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 - [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme)
 """)
 
-    # Navigation functions for prompt browser
-    def show_previous_prompt(prompts, current_idx):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-", 0
-        new_idx = max(0, current_idx - 1)
-        prompt_obj = prompts[new_idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text, new_idx
-
-    def show_next_prompt(prompts, current_idx):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-", 0
-        new_idx = min(len(prompts) - 1, current_idx + 1)
-        prompt_obj = prompts[new_idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text, new_idx
-
-    def update_prompt_display(prompts, idx, total):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-"
-        idx = min(idx, len(prompts) - 1)
-        prompt_obj = prompts[idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text
-
     # Wire up the optimize button
-
+    optimize_btn.click(
         fn=optimize_prompt,
         inputs=[initial_prompt, dataset_name, dataset_split, model,
                 input_field, target_field],
-        outputs=[summary, initial_results,
-                 prompt_history_state, current_prompt_index, gr.State()] # dummy for total
-    )
-
-    # Update prompt display when optimization completes
-    optimize_result.then(
-        fn=update_prompt_display,
-        inputs=[prompt_history_state, current_prompt_index, gr.State()],
-        outputs=[prompt_display, prompt_counter]
-    )
-
-    # Wire up navigation buttons
-    prev_btn.click(
-        fn=show_previous_prompt,
-        inputs=[prompt_history_state, current_prompt_index],
-        outputs=[prompt_display, prompt_counter, current_prompt_index]
-    )
-
-    next_btn.click(
-        fn=show_next_prompt,
-        inputs=[prompt_history_state, current_prompt_index],
-        outputs=[prompt_display, prompt_counter, current_prompt_index]
+        outputs=[summary, initial_results, final_results]
     )
 
 if __name__ == "__main__":
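The rewired click handler above returns exactly three values for the three Markdown outputs. As a reminder of the Gradio pattern involved (this is a standalone toy under assumed component names, not the Space's code), the values returned by the callback map positionally onto the components listed in `outputs`, which is why trimming the outputs list also required trimming the handler's return values.

```python
import gradio as gr

def fake_optimize(prompt: str):
    # Return exactly three values: one per component in `outputs` below.
    summary = f"## Summary\nReceived prompt: {prompt!r}"
    initial = "### Initial Prompt\n(baseline results would go here)"
    final = "### Best Prompt\n(evolved results would go here)"
    return summary, initial, final

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    run_btn = gr.Button("Optimize")
    summary_md = gr.Markdown()
    with gr.Row():
        initial_md = gr.Markdown()
        final_md = gr.Markdown()
    # The returned tuple is assigned to these components in order.
    run_btn.click(fn=fake_optimize, inputs=[prompt_box],
                  outputs=[summary_md, initial_md, final_md])

if __name__ == "__main__":
    demo.launch()
```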