codelion committed (verified)
Commit eb3dc19 · 1 Parent(s): d2e98f9

Upload app.py

Files changed (1):
  1. app.py +125 -35
app.py CHANGED
@@ -249,12 +249,14 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
 
 def collect_prompt_history(output_dir: str) -> List[Dict]:
     """
-    Collect all prompts discovered during evolution with their scores.
+    Collect unique, high-quality prompts discovered during evolution.
+    Only returns prompts that are better than previous ones (no duplicates).
 
     Returns a list of dicts with: {prompt, score, iteration, id}
     """
     try:
         prompts = []
+        seen_prompts = set()  # Track unique prompts
 
         # OpenEvolve saves programs in checkpoint directories as JSON files
         # Structure: output_dir/checkpoints/checkpoint_{iteration}/programs/{program_id}.json
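The comment above pins down the on-disk layout that `collect_prompt_history` walks. For reference, a minimal standalone sketch of scanning that layout; the helper name `iter_program_files` and its error handling are illustrative, not the app's actual code:

```python
# Illustrative sketch only (not part of app.py): walk the checkpoint layout
# described above and yield parsed program records.
import glob
import json
import os

def iter_program_files(output_dir: str):
    """Yield (path, data) for every program JSON under output_dir/checkpoints/."""
    pattern = os.path.join(output_dir, "checkpoints", "checkpoint_*", "programs", "*.json")
    for pfile in sorted(glob.glob(pattern)):
        try:
            with open(pfile) as f:
                yield pfile, json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            # Mirror the app's behaviour of skipping unreadable files
            print(f"Skipping unreadable program file {pfile}: {e}")
```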
@@ -280,24 +282,37 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
                     program_data = json.load(f)
 
                     # Extract the code (prompt) from the program data
-                    prompt_content = program_data.get("code", "")
+                    prompt_content = program_data.get("code", "").strip()
                     prog_id = program_data.get("id", os.path.basename(pfile).replace(".json", ""))
                     iteration = program_data.get("iteration_found", 0)
                     metrics = program_data.get("metrics", {})
 
+                    # Get combined score for comparison
+                    combined_score = metrics.get("combined_score", 0.0)
+
+                    # Create a normalized version for duplicate detection (ignore whitespace differences)
+                    normalized_prompt = " ".join(prompt_content.split())
+
+                    # Skip duplicates
+                    if normalized_prompt in seen_prompts:
+                        continue
+
+                    seen_prompts.add(normalized_prompt)
+
                     prompts.append({
                         "prompt": prompt_content,
                         "id": prog_id,
                         "file": pfile,
                         "iteration": iteration,
-                        "metrics": metrics
+                        "metrics": metrics,
+                        "score": combined_score
                     })
                 except Exception as e:
                     print(f"Error reading program file {pfile}: {e}")
                     continue
 
-        # Sort by iteration
-        prompts.sort(key=lambda x: x.get("iteration", 0))
+        # Sort by score (descending) to show best prompts first
+        prompts.sort(key=lambda x: x.get("score", 0.0), reverse=True)
 
         return prompts
     except Exception as e:
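Taken together with the previous hunk, `collect_prompt_history` now returns unique prompts sorted best-first instead of every prompt in iteration order. A self-contained sketch of that dedupe-and-sort step, using invented sample records:

```python
# Standalone illustration of the dedupe-and-sort logic above; the sample
# records are made up and only mimic the {code, metrics} shape.
records = [
    {"code": "Summarize the text.",   "metrics": {"combined_score": 0.42}},
    {"code": "Summarize  the text. ", "metrics": {"combined_score": 0.42}},  # whitespace-only duplicate
    {"code": "Summarize the text in one sentence.", "metrics": {"combined_score": 0.61}},
]

seen, prompts = set(), []
for rec in records:
    prompt_content = rec.get("code", "").strip()
    normalized = " ".join(prompt_content.split())   # collapse whitespace for duplicate detection
    if normalized in seen:
        continue
    seen.add(normalized)
    prompts.append({"prompt": prompt_content, "score": rec["metrics"].get("combined_score", 0.0)})

prompts.sort(key=lambda x: x.get("score", 0.0), reverse=True)  # best score first
print([p["score"] for p in prompts])  # [0.61, 0.42]
```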
@@ -568,6 +583,58 @@ def evaluate(prompt: str) -> dict:
 
 def create_config_file(model: str, work_dir: str):
     """Create a config.yaml file for OpenEvolve."""
+
+    # Create custom templates directory for prompt optimization
+    templates_dir = os.path.join(work_dir, "templates")
+    os.makedirs(templates_dir, exist_ok=True)
+
+    # Create custom system template for PROMPT optimization (not code)
+    system_template = """You are an expert prompt engineer tasked with iteratively improving prompts for language models.
+Your job is to analyze the current prompt and suggest improvements based on performance feedback.
+Focus on making the prompt clearer, more specific, and more effective at achieving its goal.
+Consider:
+- Clarity and specificity of instructions
+- Examples and demonstrations that guide the model
+- Formatting that makes the prompt easier to follow
+- Edge cases and error handling in the instructions
+"""
+
+    with open(os.path.join(templates_dir, "system_message.txt"), "w") as f:
+        f.write(system_template)
+
+    # Create custom user template for prompt rewriting
+    user_template = """# Current Prompt Performance
+- Current metrics: {metrics}
+- Areas for improvement: {improvement_areas}
+
+{artifacts}
+
+# Prompt Evolution History
+{evolution_history}
+
+# Current Prompt
+```text
+{current_program}
+```
+
+# Task
+Rewrite the prompt above to improve its performance on the specified metrics.
+Provide a complete new version of the prompt that:
+1. Maintains the same input/output format (keep placeholders like {{input}}, {{text}}, etc.)
+2. Improves clarity and effectiveness
+3. Adds helpful examples or instructions if beneficial
+4. Is more likely to get correct results
+
+Output ONLY the new prompt text between ```text markers:
+
+```text
+Your improved prompt here
+```
+"""
+
+    with open(os.path.join(templates_dir, "full_rewrite_user.txt"), "w") as f:
+        f.write(user_template)
+
     config = {
         "llm": {
             "primary_model": model,
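The user template above mixes single-brace fields ({metrics}, {current_program}, ...) that the evolution framework fills in with double-braced placeholders ({{input}}, {{text}}) that should survive into the evolved prompt. Assuming a `str.format`-style substitution (an assumption, not something this diff shows), the escaping works out as follows:

```python
# Assumption: placeholders are filled with str.format-style substitution.
# Single-brace fields are replaced; double braces collapse to literal single braces.
template = (
    "# Current Prompt\n"
    "{current_program}\n\n"
    "Keep placeholders like {{input}} intact."
)
rendered = template.format(current_program="Summarize {input} in one sentence.")
print(rendered)
# # Current Prompt
# Summarize {input} in one sentence.
#
# Keep placeholders like {input} intact.
```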
@@ -579,6 +646,9 @@ def create_config_file(model: str, work_dir: str):
         "diff_based_evolution": False, # Use full rewrite mode for prompts (not diff/patch mode)
         "language": "text", # CRITICAL: Optimize text/prompts, not Python code!
         "max_code_length": 40000, # Allow long prompts (default 10000 is too short)
+        "prompt": {
+            "template_dir": templates_dir, # Use our custom prompt engineering templates
+        },
         "evolution": {
             "population_size": 10,
             "num_islands": 1,
@@ -781,20 +851,31 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         progress(1.0, desc="Complete!")
 
-        # Collect all discovered prompts for browsing
-        all_prompts = [initial_prompt] # Start with initial prompt
-        prompt_history = collect_prompt_history(output_dir)
-        for p in prompt_history:
-            all_prompts.append(p["prompt"])
+        # Collect all unique discovered prompts for browsing (sorted by score, best first)
+        all_prompts = []
 
-        # Ensure we have the best prompt at the end
-        if best_prompt not in all_prompts:
-            all_prompts.append(best_prompt)
+        # Add initial prompt
+        all_prompts.append({
+            "prompt": initial_prompt,
+            "score": initial_eval['accuracy'] / 100.0, # Convert to 0-1 scale
+            "label": "Initial Prompt"
+        })
+
+        # Add evolved prompts (already unique and sorted by score)
+        prompt_history = collect_prompt_history(output_dir)
+        for i, p in enumerate(prompt_history):
+            all_prompts.append({
+                "prompt": p["prompt"],
+                "score": p.get("score", 0.0),
+                "label": f"Evolved #{i+1}"
+            })
 
         return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts)
 
     except Exception as e:
-        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", "", [initial_prompt], 0, 1
+        # Return error with initial prompt in dict format
+        error_prompts = [{"prompt": initial_prompt, "score": 0.0, "label": "Initial Prompt"}]
+        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", "", error_prompts, 0, 1
 
     finally:
         # Don't clean up - keep prompts for browsing
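After this hunk the prompt-browser state is a list of dicts rather than bare strings, with scores on a 0-1 scale (the initial accuracy percentage is divided by 100). A small sketch of the resulting structure, using invented numbers:

```python
# Invented example of the all_prompts structure produced above.
initial_eval = {"accuracy": 55.0}           # percentage, as reported by the initial evaluation
prompt_history = [                          # what collect_prompt_history would return (already sorted)
    {"prompt": "Improved prompt A", "score": 0.72},
    {"prompt": "Improved prompt B", "score": 0.61},
]

all_prompts = [{"prompt": "Original prompt", "score": initial_eval["accuracy"] / 100.0, "label": "Initial Prompt"}]
all_prompts += [
    {"prompt": p["prompt"], "score": p.get("score", 0.0), "label": f"Evolved #{i + 1}"}
    for i, p in enumerate(prompt_history)
]

for entry in all_prompts:
    print(f'{entry["label"]}: {entry["score"]:.2%}')
# Initial Prompt: 55.00%
# Evolved #1: 72.00%
# Evolved #2: 61.00%
```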
@@ -962,40 +1043,49 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-", 0
         new_idx = max(0, current_idx - 1)
-        counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
-        if new_idx == 0:
-            counter_text += " (Initial)"
-        elif new_idx == len(prompts) - 1:
-            counter_text += " (Final Best)"
+        prompt_obj = prompts[new_idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-            counter_text += " (Intermediate)"
-        return prompts[new_idx], counter_text, new_idx
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text, new_idx
 
     def show_next_prompt(prompts, current_idx):
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-", 0
         new_idx = min(len(prompts) - 1, current_idx + 1)
-        counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
-        if new_idx == 0:
-            counter_text += " (Initial)"
-        elif new_idx == len(prompts) - 1:
-            counter_text += " (Final Best)"
+        prompt_obj = prompts[new_idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-            counter_text += " (Intermediate)"
-        return prompts[new_idx], counter_text, new_idx
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text, new_idx
 
     def update_prompt_display(prompts, idx, total):
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-"
         idx = min(idx, len(prompts) - 1)
-        counter_text = f"**Prompt**: {idx + 1}/{len(prompts)}"
-        if idx == 0:
-            counter_text += " (Initial)"
-        elif idx == len(prompts) - 1:
-            counter_text += " (Final Best)"
+        prompt_obj = prompts[idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-            counter_text += " (Intermediate)"
-        return prompts[idx], counter_text
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text
 
     # Wire up the optimize button
     optimize_result = optimize_btn.click(
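All three navigation callbacks above repeat the same dict-or-string branch. A compact standalone equivalent (the helper name `format_prompt_entry` is illustrative, not from app.py), showing the `{score:.2%}` counter formatting:

```python
# Illustrative helper (not in app.py) mirroring the dict-or-string handling above.
def format_prompt_entry(prompts, idx):
    """Return (prompt_text, counter_text) for the entry at idx."""
    prompt_obj = prompts[idx]
    if isinstance(prompt_obj, dict):           # new format: {prompt, score, label}
        label = prompt_obj.get("label", "")
        score = prompt_obj.get("score", 0.0)
        return prompt_obj["prompt"], f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
    # old format: plain string
    return prompt_obj, f"**Prompt**: {idx + 1}/{len(prompts)}"

print(format_prompt_entry([{"prompt": "Hi", "score": 0.55, "label": "Initial Prompt"}], 0))
# ('Hi', '**Initial Prompt** (1/1) | Score: 55.00%')
print(format_prompt_entry(["Hi"], 0))
# ('Hi', '**Prompt**: 1/1')
```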