codelion committed
Commit 96e88bc · verified · 1 Parent(s): 12e9ab7

Upload app.py

Files changed (1)
  1. app.py +25 -147
app.py CHANGED
@@ -857,7 +857,7 @@ Your improved prompt here
 
 def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
                     model: str, input_field: str, target_field: str,
-                    progress=gr.Progress()) -> Tuple[str, str, str, str, List[str], int, int]:
+                    progress=gr.Progress()) -> Tuple[str, str, str]:
     """Run OpenEvolve to optimize the prompt."""
 
     progress(0, desc="Validating inputs...")
@@ -868,7 +868,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     )
 
     if not is_valid:
-        return f"## Validation Failed\n\n{validation_message}", "", "", "", [], 0, 0
+        return f"## Validation Failed\n\n{validation_message}", "", ""
 
     progress(0.05, desc=f"Validation passed: {validation_message}")
 
@@ -890,19 +890,19 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     progress(0.15, desc="Creating configuration...")
     config_path = create_config_file(model, work_dir)
 
-    # Run initial evaluation (using 20 samples to save API calls)
+    # Run initial evaluation (using 200 samples for accurate baseline)
     # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-    progress(0.2, desc="Running initial evaluation on 20 samples...")
+    progress(0.2, desc="Running initial evaluation on 200 samples...")
     initial_eval = evaluate_prompt(
-        initial_prompt, dataset_name, dataset_split, 20,
+        initial_prompt, dataset_name, dataset_split, 200,
         model, input_field, target_field
     )
 
     if "error" in initial_eval:
-        return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", "", "", [initial_prompt], 0, 1
+        return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", ""
 
     if initial_eval["total"] == 0:
-        return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", "", "", [initial_prompt], 0, 1
+        return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", ""
 
     # Save the indices for final evaluation (ensures fair comparison)
     eval_indices = initial_eval.get("indices", [])
@@ -973,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         # Parse evolution history for visualization
         evolution_viz = parse_evolution_history(output_dir)
 
-        progress(0.85, desc="Evaluating best evolved prompt on 20 samples...")
+        progress(0.85, desc="Evaluating best evolved prompt on 200 samples...")
 
         # Get the best prompt (OpenEvolve saves to output_dir/best/best_program.txt)
         best_prompt_path = os.path.join(output_dir, "best", "best_program.txt")
@@ -991,7 +991,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         # Evaluate best prompt on THE SAME samples as initial eval (fair comparison)
         final_eval = evaluate_prompt(
-            best_prompt, dataset_name, dataset_split, 20,
+            best_prompt, dataset_name, dataset_split, 200,
            model, input_field, target_field,
            fixed_indices=eval_indices  # Use same samples as initial eval!
        )
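
The two hunks above carry the fair-comparison logic: the initial evaluation draws its 200-sample subset once and returns the chosen indices, and the final evaluation re-scores the evolved prompt on exactly those rows via `fixed_indices`. The real `evaluate_prompt` is not shown in this diff; the following is a minimal sketch of the pattern it relies on, assuming a list-like dataset and a hypothetical per-sample `score_fn`:

```python
import random
from typing import Callable, Dict, List, Optional, Sequence

def evaluate_on_samples(
    prompt: str,
    dataset: Sequence[dict],
    score_fn: Callable[[str, dict], bool],   # hypothetical per-sample scorer
    num_samples: int = 200,
    fixed_indices: Optional[List[int]] = None,
) -> Dict:
    """Score a prompt on a sampled subset; pass fixed_indices to repeat the exact subset."""
    if fixed_indices is None:
        # First (baseline) call: draw the subset once and remember which rows were used.
        indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))
    else:
        # Second (final) call: re-score on exactly the same rows as the baseline.
        indices = list(fixed_indices)
    correct = sum(bool(score_fn(prompt, dataset[i])) for i in indices)
    return {
        "correct": correct,
        "total": len(indices),
        "accuracy": 100.0 * correct / len(indices) if indices else 0.0,
        "indices": indices,   # caller saves this and passes it back for the final eval
    }
```

Reusing the saved indices turns the initial-vs-final accuracy delta into a paired measurement, so the comparison cannot be skewed by drawing a different random subset the second time around.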
@@ -1023,8 +1023,8 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
  ### Summary
  - **Dataset**: {dataset_name} ({dataset_split} split)
  - **Model**: {model}
- - **Initial/Final Eval**: 20 samples each
- - **Evolution Eval**: Staged (20→100 if score ≥ 0.5)
+ - **Initial/Final Eval**: 200 samples each (same samples for fair comparison)
+ - **Evolution Eval**: Staged (50→200 if score ≥ 0.5)
  - **Iterations**: 10
 
  ### Results
@@ -1037,39 +1037,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         progress(1.0, desc="Complete!")
 
-        # Collect only the "best" prompts - ones that improved the score during evolution
-        all_prompts = []
-
-        # Add initial prompt
-        initial_score = initial_eval['accuracy'] / 100.0  # Convert to 0-1 scale
-        all_prompts.append({
-            "prompt": initial_prompt,
-            "score": initial_score,
-            "label": "Initial Prompt",
-            "iteration": 0
-        })
-
-        # Add evolved prompts (only programs that were "best" at some point)
-        # Pass initial_score so we only show programs that BEAT the initial prompt
-        prompt_history = collect_prompt_history(output_dir, initial_score=initial_score)
-        for i, p in enumerate(prompt_history):
-            # Skip if it's the same as initial (shouldn't happen, but just in case)
-            if i == 0 and p.get("iteration", -1) == 0:
-                continue
-
-            all_prompts.append({
-                "prompt": p["prompt"],
-                "score": p.get("score", 0.0),
-                "label": f"Best at Iteration {p.get('iteration', i+1)}",
-                "iteration": p.get("iteration", i+1)
-            })
-
-        return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts)
+        return summary, initial_results, final_results
 
     except Exception as e:
-        # Return error with initial prompt in dict format
-        error_prompts = [{"prompt": initial_prompt, "score": 0.0, "label": "Initial Prompt"}]
-        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", "", error_prompts, 0, 1
+        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, ""
 
     finally:
         # Don't clean up - keep prompts for browsing
@@ -1094,7 +1065,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
  4. Choose a free model from OpenRouter
  5. Click "Optimize Prompt" - the system will validate everything first!
  6. Watch the evolution progress in real-time
- 7. Compare initial vs. evolved performance - uses 50 samples for stage 1, 200 for stage 2!
+ 7. Compare initial vs. best prompt side-by-side with full 200-sample accuracy!
 
  **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
  """)
@@ -1157,39 +1128,15 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         with gr.Column():
             summary = gr.Markdown("Click 'Validate & Optimize Prompt' to start optimization...", visible=True)
 
-    with gr.Row():
-        with gr.Column():
-            initial_results = gr.Markdown("### Initial Results\nWill appear here after validation...", visible=True)
-        with gr.Column():
-            final_results = gr.Markdown("### Final Results\nWill appear here after optimization...", visible=True)
-
-    with gr.Row():
-        with gr.Column():
-            evolution_progress = gr.Markdown("### Evolution Progress\nEvolution progress will appear here during optimization...", visible=True)
-
-    # Prompt History Browser
+    # Side-by-side comparison: Initial vs Best Prompt
     gr.Markdown("---")
-    gr.Markdown("## 📜 Prompt History Browser")
-    gr.Markdown("Browse through the progression of **best** prompts found during evolution. Only shows prompts that improved the score (no duplicates or intermediate programs).")
+    gr.Markdown("## 🔍 Prompt Comparison: Initial vs Best")
 
     with gr.Row():
-        with gr.Column(scale=8):
-            prompt_display = gr.TextArea(
-                label="",
-                lines=10,
-                interactive=False,
-                placeholder="Prompts will appear here after optimization completes...",
-                show_label=False
-            )
-        with gr.Column(scale=2):
-            prompt_counter = gr.Markdown("**Prompt**: -/-")
-            prev_btn = gr.Button("⬅️ Previous", size="sm")
-            next_btn = gr.Button("Next ➡️", size="sm")
-            gr.Markdown("**Prompt Types:**\n- First = Initial\n- Middle = Intermediate\n- Last = Final Best")
-
-    # Hidden state to store prompt history and current index
-    prompt_history_state = gr.State([])
-    current_prompt_index = gr.State(0)
+        with gr.Column():
+            initial_results = gr.Markdown("### Initial Prompt\nWill appear here after validation...", visible=True)
+        with gr.Column():
+            final_results = gr.Markdown("### Best Prompt\nWill appear here after optimization...", visible=True)
 
     # Documentation section - in collapsible accordion
     gr.Markdown("---")
  gr.Markdown("---")
@@ -1222,9 +1169,10 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
1222
  - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
1223
  4. **Run & Monitor**:
1224
  - All inputs are validated before starting
1225
- - Evolution uses staged evaluation (20 samples first, then 80 more if promising)
 
1226
  - Saves API calls by early-stopping poor prompts (< 50% accuracy)
1227
- - Watch evolution progress visualization in real-time
1228
 
1229
  ### About OpenEvolve:
1230
  OpenEvolve is an open-source evolutionary optimization framework. Learn more at:
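
The staged evaluation described in the bullets above (50 samples first, then 200 more only for promising prompts) is a two-stage early-stopping gate on API spend. The evaluator used during evolution is not part of this diff; the sketch below shows the general shape of such a gate, with `eval_fn` standing in for whatever function scores a prompt on N samples (a hypothetical name, not from app.py):

```python
from typing import Callable, Dict

def staged_score(
    prompt: str,
    eval_fn: Callable[[str, int], Dict],   # hypothetical: (prompt, n_samples) -> {"accuracy": ...}
    stage1_size: int = 50,
    stage2_size: int = 200,
    threshold: float = 0.5,
) -> float:
    """Two-stage gate: spend the large sample budget only on promising prompts."""
    stage1 = eval_fn(prompt, stage1_size)
    score = stage1["accuracy"] / 100.0
    if score < threshold:
        # Early stop: below 50% accuracy on the cheap stage, skip the expensive stage.
        return score
    stage2 = eval_fn(prompt, stage2_size)
    return stage2["accuracy"] / 100.0
```

A prompt that falls below the 0.5 threshold on the cheap first stage never reaches the expensive second stage, which is where the API savings come from.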
@@ -1232,82 +1180,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
  - [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme)
  """)
 
-    # Navigation functions for prompt browser
-    def show_previous_prompt(prompts, current_idx):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-", 0
-        new_idx = max(0, current_idx - 1)
-        prompt_obj = prompts[new_idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text, new_idx
-
-    def show_next_prompt(prompts, current_idx):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-", 0
-        new_idx = min(len(prompts) - 1, current_idx + 1)
-        prompt_obj = prompts[new_idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text, new_idx
-
-    def update_prompt_display(prompts, idx, total):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-"
-        idx = min(idx, len(prompts) - 1)
-        prompt_obj = prompts[idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text
-
     # Wire up the optimize button
-    optimize_result = optimize_btn.click(
+    optimize_btn.click(
         fn=optimize_prompt,
         inputs=[initial_prompt, dataset_name, dataset_split, model,
                 input_field, target_field],
-        outputs=[summary, initial_results, evolution_progress, final_results,
-                 prompt_history_state, current_prompt_index, gr.State()]  # dummy for total
-    )
-
-    # Update prompt display when optimization completes
-    optimize_result.then(
-        fn=update_prompt_display,
-        inputs=[prompt_history_state, current_prompt_index, gr.State()],
-        outputs=[prompt_display, prompt_counter]
-    )
-
-    # Wire up navigation buttons
-    prev_btn.click(
-        fn=show_previous_prompt,
-        inputs=[prompt_history_state, current_prompt_index],
-        outputs=[prompt_display, prompt_counter, current_prompt_index]
-    )
-
-    next_btn.click(
-        fn=show_next_prompt,
-        inputs=[prompt_history_state, current_prompt_index],
-        outputs=[prompt_display, prompt_counter, current_prompt_index]
+        outputs=[summary, initial_results, final_results]
     )
 
 if __name__ == "__main__":
 
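Taken together, the hunks above reduce `optimize_prompt` to three string outputs (summary, initial results, final results) and feed them straight into the summary pane and the two comparison columns. The sketch below shows that contract in isolation, with a stub in place of the real optimization so it runs without OpenEvolve, a dataset, or an API key; the stub name and placeholder text are illustrative, not copied from app.py:

```python
import gradio as gr
from typing import Tuple

def optimize_prompt_stub(initial_prompt: str) -> Tuple[str, str, str]:
    """Stand-in with the same output shape as optimize_prompt: (summary, initial, final)."""
    summary_md = "## Optimization Complete\n\n- **Initial/Final Eval**: 200 samples each"
    initial_md = f"### Initial Prompt\n\n{initial_prompt}\n\n(accuracy on the shared samples goes here)"
    final_md = "### Best Prompt\n\n(evolved prompt and its accuracy go here)"
    return summary_md, initial_md, final_md

with gr.Blocks(title="Prompt Comparison Sketch") as demo:
    prompt_in = gr.Textbox(label="Initial Prompt", lines=4)
    optimize_btn = gr.Button("Optimize Prompt")
    summary = gr.Markdown("Click 'Optimize Prompt' to start...")

    gr.Markdown("## 🔍 Prompt Comparison: Initial vs Best")
    with gr.Row():
        with gr.Column():
            initial_results = gr.Markdown("### Initial Prompt\nWill appear here...")
        with gr.Column():
            final_results = gr.Markdown("### Best Prompt\nWill appear here...")

    # Three returned strings map one-to-one onto the three output components.
    optimize_btn.click(
        fn=optimize_prompt_stub,
        inputs=[prompt_in],
        outputs=[summary, initial_results, final_results],
    )

if __name__ == "__main__":
    demo.launch()
```

Because the click handler's return arity now matches the `outputs` list exactly, the prompt-history state, the navigation callbacks, and the dummy `gr.State()` output from the old wiring are no longer needed.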