Upload app.py
app.py
CHANGED
@@ -857,7 +857,7 @@ Your improved prompt here
 
 def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
                     model: str, input_field: str, target_field: str,
-                    progress=gr.Progress()) -> Tuple[str, str, str
+                    progress=gr.Progress()) -> Tuple[str, str, str]:
     """Run OpenEvolve to optimize the prompt."""
 
     progress(0, desc="Validating inputs...")
@@ -868,7 +868,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     )
 
     if not is_valid:
-        return f"## Validation Failed\n\n{validation_message}", "", ""
+        return f"## Validation Failed\n\n{validation_message}", "", ""
 
     progress(0.05, desc=f"Validation passed: {validation_message}")
 
@@ -890,19 +890,19 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         progress(0.15, desc="Creating configuration...")
         config_path = create_config_file(model, work_dir)
 
-        # Run initial evaluation (using
+        # Run initial evaluation (using 200 samples for accurate baseline)
         # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-        progress(0.2, desc="Running initial evaluation on
+        progress(0.2, desc="Running initial evaluation on 200 samples...")
         initial_eval = evaluate_prompt(
-            initial_prompt, dataset_name, dataset_split,
+            initial_prompt, dataset_name, dataset_split, 200,
             model, input_field, target_field
         )
 
         if "error" in initial_eval:
-            return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", ""
+            return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", ""
 
         if initial_eval["total"] == 0:
-            return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", ""
+            return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", ""
 
         # Save the indices for final evaluation (ensures fair comparison)
         eval_indices = initial_eval.get("indices", [])
@@ -973,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         # Parse evolution history for visualization
         evolution_viz = parse_evolution_history(output_dir)
 
-        progress(0.85, desc="Evaluating best evolved prompt on
+        progress(0.85, desc="Evaluating best evolved prompt on 200 samples...")
 
         # Get the best prompt (OpenEvolve saves to output_dir/best/best_program.txt)
         best_prompt_path = os.path.join(output_dir, "best", "best_program.txt")
@@ -991,7 +991,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         # Evaluate best prompt on THE SAME samples as initial eval (fair comparison)
         final_eval = evaluate_prompt(
-            best_prompt, dataset_name, dataset_split,
+            best_prompt, dataset_name, dataset_split, 200,
             model, input_field, target_field,
             fixed_indices=eval_indices # Use same samples as initial eval!
         )
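For readers following the fixed-indices change above: a minimal sketch of the sampling pattern it relies on, where the baseline run records which rows it scored so the final run can reuse exactly the same rows. The helper name `evaluate_on_fixed_samples` and the `score_fn` callback are illustrative assumptions, not the Space's actual `evaluate_prompt` implementation.

```python
import random
from typing import Callable, Optional

def evaluate_on_fixed_samples(dataset: list, num_samples: int,
                              score_fn: Callable[[dict], bool],
                              fixed_indices: Optional[list] = None) -> dict:
    """Score a prompt on a reproducible subset of the dataset.

    If fixed_indices is given, reuse those rows; otherwise sample
    num_samples rows and return their indices so a later call can
    evaluate on exactly the same data (fair before/after comparison).
    """
    if fixed_indices is None:
        fixed_indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))
    correct = sum(1 for i in fixed_indices if score_fn(dataset[i]))
    total = len(fixed_indices)
    return {
        "accuracy": 100.0 * correct / total if total else 0.0,
        "total": total,
        "indices": fixed_indices,  # save these for the final evaluation
    }

# Usage mirroring the diff: run the baseline, keep the indices,
# then evaluate the evolved prompt on the very same samples.
if __name__ == "__main__":
    data = [{"x": i, "y": i % 2} for i in range(1000)]
    baseline = evaluate_on_fixed_samples(data, 200, lambda row: row["y"] == 0)
    final = evaluate_on_fixed_samples(data, 200, lambda row: row["y"] == 0,
                                      fixed_indices=baseline["indices"])
    assert baseline["indices"] == final["indices"]
```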
@@ -1023,8 +1023,8 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
 - **Model**: {model}
-- **Initial/Final Eval**:
-- **Evolution Eval**: Staged (
+- **Initial/Final Eval**: 200 samples each (same samples for fair comparison)
+- **Evolution Eval**: Staged (50 → 200 if score ≥ 0.5)
 - **Iterations**: 10
 
 ### Results
@@ -1037,39 +1037,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         progress(1.0, desc="Complete!")
 
-
-        all_prompts = []
-
-        # Add initial prompt
-        initial_score = initial_eval['accuracy'] / 100.0 # Convert to 0-1 scale
-        all_prompts.append({
-            "prompt": initial_prompt,
-            "score": initial_score,
-            "label": "Initial Prompt",
-            "iteration": 0
-        })
-
-        # Add evolved prompts (only programs that were "best" at some point)
-        # Pass initial_score so we only show programs that BEAT the initial prompt
-        prompt_history = collect_prompt_history(output_dir, initial_score=initial_score)
-        for i, p in enumerate(prompt_history):
-            # Skip if it's the same as initial (shouldn't happen, but just in case)
-            if i == 0 and p.get("iteration", -1) == 0:
-                continue
-
-            all_prompts.append({
-                "prompt": p["prompt"],
-                "score": p.get("score", 0.0),
-                "label": f"Best at Iteration {p.get('iteration', i+1)}",
-                "iteration": p.get("iteration", i+1)
-            })
-
-        return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts)
+        return summary, initial_results, final_results
 
     except Exception as e:
-
-        error_prompts = [{"prompt": initial_prompt, "score": 0.0, "label": "Initial Prompt"}]
-        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", "", error_prompts, 0, 1
+        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, ""
 
     finally:
         # Don't clean up - keep prompts for browsing
@@ -1094,7 +1065,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 4. Choose a free model from OpenRouter
 5. Click "Optimize Prompt" - the system will validate everything first!
 6. Watch the evolution progress in real-time
-7. Compare initial vs.
+7. Compare initial vs. best prompt side-by-side with full 200-sample accuracy!
 
 **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
 """)
@@ -1157,39 +1128,15 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         with gr.Column():
             summary = gr.Markdown("Click 'Validate & Optimize Prompt' to start optimization...", visible=True)
 
-
-        with gr.Column():
-            initial_results = gr.Markdown("### Initial Results\nWill appear here after validation...", visible=True)
-        with gr.Column():
-            final_results = gr.Markdown("### Final Results\nWill appear here after optimization...", visible=True)
-
-    with gr.Row():
-        with gr.Column():
-            evolution_progress = gr.Markdown("### Evolution Progress\nEvolution progress will appear here during optimization...", visible=True)
-
-    # Prompt History Browser
+    # Side-by-side comparison: Initial vs Best Prompt
     gr.Markdown("---")
-    gr.Markdown("##
-    gr.Markdown("Browse through the progression of **best** prompts found during evolution. Only shows prompts that improved the score (no duplicates or intermediate programs).")
+    gr.Markdown("## 🔍 Prompt Comparison: Initial vs Best")
 
     with gr.Row():
-        with gr.Column(
-
-
-
-            interactive=False,
-            placeholder="Prompts will appear here after optimization completes...",
-            show_label=False
-        )
-        with gr.Column(scale=2):
-            prompt_counter = gr.Markdown("**Prompt**: -/-")
-            prev_btn = gr.Button("⬅️ Previous", size="sm")
-            next_btn = gr.Button("Next ➡️", size="sm")
-            gr.Markdown("**Prompt Types:**\n- First = Initial\n- Middle = Intermediate\n- Last = Final Best")
-
-    # Hidden state to store prompt history and current index
-    prompt_history_state = gr.State([])
-    current_prompt_index = gr.State(0)
+        with gr.Column():
+            initial_results = gr.Markdown("### Initial Prompt\nWill appear here after validation...", visible=True)
+        with gr.Column():
+            final_results = gr.Markdown("### Best Prompt\nWill appear here after optimization...", visible=True)
 
     # Documentation section - in collapsible accordion
     gr.Markdown("---")
@@ -1222,9 +1169,10 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
    - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
 4. **Run & Monitor**:
    - All inputs are validated before starting
-   -
+   - Initial and final evaluations use 200 samples each for accurate comparison
+   - Evolution uses staged evaluation (50 samples first, then 200 more if promising)
   - Saves API calls by early-stopping poor prompts (< 50% accuracy)
-   -
+   - Compare initial vs best prompt side-by-side
 
 ### About OpenEvolve:
 OpenEvolve is an open-source evolutionary optimization framework. Learn more at:
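The staged evaluation referenced in the summary and bullets above (a cheap 50-sample pass, with the 200-sample pass only for prompts scoring at least 0.5) can be sketched as a two-stage cascade. The function name and whether the second stage reuses or extends the first batch are assumptions for illustration, not OpenEvolve's actual evaluator.

```python
from typing import Callable, Sequence

def staged_accuracy(samples: Sequence[dict],
                    is_correct: Callable[[dict], bool],
                    stage1_size: int = 50,
                    stage2_size: int = 200,
                    threshold: float = 0.5) -> float:
    """Two-stage cascade: score a small batch first and only spend
    API calls on the larger batch if the prompt looks promising."""
    stage1 = samples[:stage1_size]
    acc = sum(is_correct(s) for s in stage1) / max(len(stage1), 1)
    if acc < threshold:
        return acc  # early stop: weak prompts never reach the big batch
    stage2 = samples[:stage2_size]
    return sum(is_correct(s) for s in stage2) / max(len(stage2), 1)

# Example: a prompt scoring below the threshold costs only 50 calls.
if __name__ == "__main__":
    data = [{"label": i % 2} for i in range(400)]
    print(staged_accuracy(data, lambda s: s["label"] == 0))
```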
@@ -1232,82 +1180,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 - [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme)
 """)
 
-    # Navigation functions for prompt browser
-    def show_previous_prompt(prompts, current_idx):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-", 0
-        new_idx = max(0, current_idx - 1)
-        prompt_obj = prompts[new_idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text, new_idx
-
-    def show_next_prompt(prompts, current_idx):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-", 0
-        new_idx = min(len(prompts) - 1, current_idx + 1)
-        prompt_obj = prompts[new_idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text, new_idx
-
-    def update_prompt_display(prompts, idx, total):
-        if not prompts or len(prompts) == 0:
-            return "", "**Prompt**: -/-"
-        idx = min(idx, len(prompts) - 1)
-        prompt_obj = prompts[idx]
-        # Handle both old string format and new dict format
-        if isinstance(prompt_obj, dict):
-            prompt_text = prompt_obj["prompt"]
-            score = prompt_obj.get("score", 0.0)
-            label = prompt_obj.get("label", "")
-            counter_text = f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
-        else:
-            prompt_text = prompt_obj
-            counter_text = f"**Prompt**: {idx + 1}/{len(prompts)}"
-        return prompt_text, counter_text
-
     # Wire up the optimize button
-
+    optimize_btn.click(
         fn=optimize_prompt,
         inputs=[initial_prompt, dataset_name, dataset_split, model,
                 input_field, target_field],
-        outputs=[summary, initial_results,
-                 prompt_history_state, current_prompt_index, gr.State()] # dummy for total
-    )
-
-    # Update prompt display when optimization completes
-    optimize_result.then(
-        fn=update_prompt_display,
-        inputs=[prompt_history_state, current_prompt_index, gr.State()],
-        outputs=[prompt_display, prompt_counter]
-    )
-
-    # Wire up navigation buttons
-    prev_btn.click(
-        fn=show_previous_prompt,
-        inputs=[prompt_history_state, current_prompt_index],
-        outputs=[prompt_display, prompt_counter, current_prompt_index]
-    )
-
-    next_btn.click(
-        fn=show_next_prompt,
-        inputs=[prompt_history_state, current_prompt_index],
-        outputs=[prompt_display, prompt_counter, current_prompt_index]
+        outputs=[summary, initial_results, final_results]
     )
 
 if __name__ == "__main__":
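The rewired click handler above returns exactly three values for the three Markdown outputs. As a reminder of the Gradio pattern involved (this is a standalone toy under assumed component names, not the Space's code), the values returned by the callback map positionally onto the components listed in `outputs`, which is why trimming the outputs list also required trimming the handler's return values.

```python
import gradio as gr

def fake_optimize(prompt: str):
    # Return exactly three values: one per component in `outputs` below.
    summary = f"## Summary\nReceived prompt: {prompt!r}"
    initial = "### Initial Prompt\n(baseline results would go here)"
    final = "### Best Prompt\n(evolved results would go here)"
    return summary, initial, final

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    run_btn = gr.Button("Optimize")
    summary_md = gr.Markdown()
    with gr.Row():
        initial_md = gr.Markdown()
        final_md = gr.Markdown()
    # The returned tuple is assigned to these components in order.
    run_btn.click(fn=fake_optimize, inputs=[prompt_box],
                  outputs=[summary_md, initial_md, final_md])

if __name__ == "__main__":
    demo.launch()
```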