Upload app.py
app.py CHANGED

@@ -249,12 +249,14 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
 
 def collect_prompt_history(output_dir: str) -> List[Dict]:
     """
-    Collect
+    Collect unique, high-quality prompts discovered during evolution.
+    Only returns prompts that are better than previous ones (no duplicates).
 
     Returns a list of dicts with: {prompt, score, iteration, id}
     """
     try:
         prompts = []
+        seen_prompts = set() # Track unique prompts
 
         # OpenEvolve saves programs in checkpoint directories as JSON files
         # Structure: output_dir/checkpoints/checkpoint_{iteration}/programs/{program_id}.json
@@ -280,24 +282,37 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
                     program_data = json.load(f)
 
                 # Extract the code (prompt) from the program data
-                prompt_content = program_data.get("code", "")
+                prompt_content = program_data.get("code", "").strip()
                 prog_id = program_data.get("id", os.path.basename(pfile).replace(".json", ""))
                 iteration = program_data.get("iteration_found", 0)
                 metrics = program_data.get("metrics", {})
 
+                # Get combined score for comparison
+                combined_score = metrics.get("combined_score", 0.0)
+
+                # Create a normalized version for duplicate detection (ignore whitespace differences)
+                normalized_prompt = " ".join(prompt_content.split())
+
+                # Skip duplicates
+                if normalized_prompt in seen_prompts:
+                    continue
+
+                seen_prompts.add(normalized_prompt)
+
                 prompts.append({
                     "prompt": prompt_content,
                     "id": prog_id,
                     "file": pfile,
                     "iteration": iteration,
-                    "metrics": metrics
+                    "metrics": metrics,
+                    "score": combined_score
                 })
             except Exception as e:
                 print(f"Error reading program file {pfile}: {e}")
                 continue
 
-        # Sort by
-        prompts.sort(key=lambda x: x.get("
+        # Sort by score (descending) to show best prompts first
+        prompts.sort(key=lambda x: x.get("score", 0.0), reverse=True)
 
         return prompts
     except Exception as e:
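The change above keeps only one copy of each prompt (whitespace-insensitive) and returns the list best-first. A minimal standalone sketch of that dedupe-and-rank idea, with a hypothetical helper name and made-up sample data:

```python
# Minimal sketch of the dedupe-and-rank idea used in collect_prompt_history.
# The function name and sample entries below are illustrative, not from app.py.
from typing import Dict, List


def dedupe_and_rank(entries: List[Dict]) -> List[Dict]:
    seen = set()
    ranked = []
    for entry in entries:
        text = entry.get("code", "").strip()
        normalized = " ".join(text.split())  # collapse whitespace so reformatted copies match
        if not normalized or normalized in seen:
            continue
        seen.add(normalized)
        ranked.append({
            "prompt": text,
            "score": entry.get("metrics", {}).get("combined_score", 0.0),
        })
    ranked.sort(key=lambda x: x["score"], reverse=True)  # best prompt first
    return ranked


if __name__ == "__main__":
    sample = [
        {"code": "Summarize: {input}", "metrics": {"combined_score": 0.62}},
        {"code": "Summarize:   {input}", "metrics": {"combined_score": 0.60}},  # whitespace-only duplicate
        {"code": "Summarize the text in one sentence: {input}", "metrics": {"combined_score": 0.71}},
    ]
    print([p["score"] for p in dedupe_and_rank(sample)])  # [0.71, 0.62]
```

As in the function above, the first occurrence of a duplicate wins, so checkpoint ordering decides which copy's metrics are kept.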
@@ -568,6 +583,58 @@ def evaluate(prompt: str) -> dict:
 
 def create_config_file(model: str, work_dir: str):
     """Create a config.yaml file for OpenEvolve."""
+
+    # Create custom templates directory for prompt optimization
+    templates_dir = os.path.join(work_dir, "templates")
+    os.makedirs(templates_dir, exist_ok=True)
+
+    # Create custom system template for PROMPT optimization (not code)
+    system_template = """You are an expert prompt engineer tasked with iteratively improving prompts for language models.
+Your job is to analyze the current prompt and suggest improvements based on performance feedback.
+Focus on making the prompt clearer, more specific, and more effective at achieving its goal.
+Consider:
+- Clarity and specificity of instructions
+- Examples and demonstrations that guide the model
+- Formatting that makes the prompt easier to follow
+- Edge cases and error handling in the instructions
+"""
+
+    with open(os.path.join(templates_dir, "system_message.txt"), "w") as f:
+        f.write(system_template)
+
+    # Create custom user template for prompt rewriting
+    user_template = """# Current Prompt Performance
+- Current metrics: {metrics}
+- Areas for improvement: {improvement_areas}
+
+{artifacts}
+
+# Prompt Evolution History
+{evolution_history}
+
+# Current Prompt
+```text
+{current_program}
+```
+
+# Task
+Rewrite the prompt above to improve its performance on the specified metrics.
+Provide a complete new version of the prompt that:
+1. Maintains the same input/output format (keep placeholders like {{input}}, {{text}}, etc.)
+2. Improves clarity and effectiveness
+3. Adds helpful examples or instructions if beneficial
+4. Is more likely to get correct results
+
+Output ONLY the new prompt text between ```text markers:
+
+```text
+Your improved prompt here
+```
+"""
+
+    with open(os.path.join(templates_dir, "full_rewrite_user.txt"), "w") as f:
+        f.write(user_template)
+
     config = {
         "llm": {
             "primary_model": model,
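The `full_rewrite_user.txt` template mixes single-braced fields such as {current_program}, which the framework fills in, with double-braced placeholders such as {{input}} that should reach the LLM as a literal `{input}`. A small sketch of why the doubling matters, assuming the template is rendered with Python `str.format`-style substitution (an assumption suggested by the brace convention, not confirmed by this diff):

```python
# Trimmed, illustrative template: only the brace behavior is the point here.
template = (
    "# Current Prompt\n"
    "{current_program}\n"
    "\n"
    "Keep placeholders like {{input}} and {{text}} unchanged."
)

# '{current_program}' is substituted; '{{input}}' and '{{text}}' render as '{input}' and '{text}'.
# A single-braced '{input}' in the template would instead raise KeyError during format().
print(template.format(current_program="Summarize: {input}"))
# # Current Prompt
# Summarize: {input}
#
# Keep placeholders like {input} and {text} unchanged.
```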
@@ -579,6 +646,9 @@ def create_config_file(model: str, work_dir: str):
         "diff_based_evolution": False, # Use full rewrite mode for prompts (not diff/patch mode)
         "language": "text", # CRITICAL: Optimize text/prompts, not Python code!
         "max_code_length": 40000, # Allow long prompts (default 10000 is too short)
+        "prompt": {
+            "template_dir": templates_dir, # Use our custom prompt engineering templates
+        },
         "evolution": {
             "population_size": 10,
             "num_islands": 1,
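For reference, the config assembled by `create_config_file` nests the new `prompt.template_dir` entry alongside the existing keys shown above. An illustrative sketch limited to the keys visible in this diff; the model name and paths are placeholders, the nesting is inferred from the indentation in the hunk, and the real function may build and write the file differently:

```python
# Hedged sketch of the resulting config.yaml structure; not the actual create_config_file body.
import os
import yaml  # PyYAML

work_dir = "/tmp/openevolve_work"                 # hypothetical working directory
templates_dir = os.path.join(work_dir, "templates")
os.makedirs(templates_dir, exist_ok=True)

config = {
    "llm": {"primary_model": "gpt-4o-mini"},            # placeholder model name
    "diff_based_evolution": False,                       # full-rewrite mode for prompts
    "language": "text",                                  # optimize text, not Python code
    "max_code_length": 40000,                            # allow long prompts
    "prompt": {"template_dir": templates_dir},           # point OpenEvolve at the custom templates
    "evolution": {"population_size": 10, "num_islands": 1},
}

with open(os.path.join(work_dir, "config.yaml"), "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)
```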
@@ -781,20 +851,31 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         progress(1.0, desc="Complete!")
 
-        # Collect all discovered prompts for browsing
-        all_prompts = [
-        prompt_history = collect_prompt_history(output_dir)
-        for p in prompt_history:
-            all_prompts.append(p["prompt"])
+        # Collect all unique discovered prompts for browsing (sorted by score, best first)
+        all_prompts = []
 
-        #
-
-
+        # Add initial prompt
+        all_prompts.append({
+            "prompt": initial_prompt,
+            "score": initial_eval['accuracy'] / 100.0, # Convert to 0-1 scale
+            "label": "Initial Prompt"
+        })
+
+        # Add evolved prompts (already unique and sorted by score)
+        prompt_history = collect_prompt_history(output_dir)
+        for i, p in enumerate(prompt_history):
+            all_prompts.append({
+                "prompt": p["prompt"],
+                "score": p.get("score", 0.0),
+                "label": f"Evolved #{i+1}"
+            })
 
         return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts)
 
     except Exception as e:
-
+        # Return error with initial prompt in dict format
+        error_prompts = [{"prompt": initial_prompt, "score": 0.0, "label": "Initial Prompt"}]
+        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", "", error_prompts, 0, 1
 
     finally:
         # Don't clean up - keep prompts for browsing
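One detail worth noting: the two score sources end up on the same 0-1 scale. The initial evaluation reports accuracy as a percentage (hence the `/ 100.0`), while the `combined_score` read from the checkpoints is treated as already lying in 0-1, and the prompt browser later formats both with `{score:.2%}`. A tiny illustration with made-up numbers:

```python
# Illustrative values only; shows why the percentage-to-fraction conversion is needed
# for the scores to sort and display consistently.
initial_eval = {"accuracy": 62.5}                  # percent, from the initial evaluation
initial_score = initial_eval["accuracy"] / 100.0   # 0.625, now comparable
evolved_score = 0.71                               # combined_score, assumed already 0-1

for label, score in [("Initial Prompt", initial_score), ("Evolved #1", evolved_score)]:
    print(f"**{label}** | Score: {score:.2%}")
# **Initial Prompt** | Score: 62.50%
# **Evolved #1** | Score: 71.00%
```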
@@ -962,40 +1043,49 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-", 0
         new_idx = max(0, current_idx - 1)
-
-
-
-
-
+        prompt_obj = prompts[new_idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-
-
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text, new_idx
 
     def show_next_prompt(prompts, current_idx):
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-", 0
         new_idx = min(len(prompts) - 1, current_idx + 1)
-
-
-
-
-
+        prompt_obj = prompts[new_idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-
-
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text, new_idx
 
     def update_prompt_display(prompts, idx, total):
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-"
         idx = min(idx, len(prompts) - 1)
-
-
-
-
-
+        prompt_obj = prompts[idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-
-
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text
 
     # Wire up the optimize button
     optimize_result = optimize_btn.click(
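The three navigation handlers above repeat the same dict-vs-string formatting. A possible follow-up refactor (not part of this change) would move it into one shared helper; the helper name and the `show_prev_prompt` handler name below are hypothetical, since the first handler's `def` line is outside this hunk:

```python
# Sketch of a shared formatter for the prompt browser; mirrors the logic shown in the hunk above.
def _format_prompt_entry(prompts, idx):
    """Return (prompt_text, counter_text) for prompts[idx], accepting dicts or plain strings."""
    prompt_obj = prompts[idx]
    if isinstance(prompt_obj, dict):
        label = prompt_obj.get("label", "")
        score = prompt_obj.get("score", 0.0)
        return prompt_obj["prompt"], f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
    return prompt_obj, f"**Prompt**: {idx + 1}/{len(prompts)}"


def show_prev_prompt(prompts, current_idx):
    if not prompts:
        return "", "**Prompt**: -/-", 0
    new_idx = max(0, current_idx - 1)
    text, counter = _format_prompt_entry(prompts, new_idx)
    return text, counter, new_idx
```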