Upload app.py
app.py CHANGED

@@ -249,12 +249,14 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
 
 def collect_prompt_history(output_dir: str) -> List[Dict]:
     """
-    Collect
+    Collect unique, high-quality prompts discovered during evolution.
+    Only returns prompts that are better than previous ones (no duplicates).
 
     Returns a list of dicts with: {prompt, score, iteration, id}
     """
     try:
         prompts = []
+        seen_prompts = set() # Track unique prompts
 
         # OpenEvolve saves programs in checkpoint directories as JSON files
         # Structure: output_dir/checkpoints/checkpoint_{iteration}/programs/{program_id}.json
@@ -280,24 +282,37 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
                     program_data = json.load(f)
 
                 # Extract the code (prompt) from the program data
-                prompt_content = program_data.get("code", "")
+                prompt_content = program_data.get("code", "").strip()
                 prog_id = program_data.get("id", os.path.basename(pfile).replace(".json", ""))
                 iteration = program_data.get("iteration_found", 0)
                 metrics = program_data.get("metrics", {})
 
+                # Get combined score for comparison
+                combined_score = metrics.get("combined_score", 0.0)
+
+                # Create a normalized version for duplicate detection (ignore whitespace differences)
+                normalized_prompt = " ".join(prompt_content.split())
+
+                # Skip duplicates
+                if normalized_prompt in seen_prompts:
+                    continue
+
+                seen_prompts.add(normalized_prompt)
+
                 prompts.append({
                     "prompt": prompt_content,
                     "id": prog_id,
                     "file": pfile,
                     "iteration": iteration,
-                    "metrics": metrics
+                    "metrics": metrics,
+                    "score": combined_score
                 })
             except Exception as e:
                 print(f"Error reading program file {pfile}: {e}")
                 continue
 
-        # Sort by
-        prompts.sort(key=lambda x: x.get("
+        # Sort by score (descending) to show best prompts first
+        prompts.sort(key=lambda x: x.get("score", 0.0), reverse=True)
 
         return prompts
     except Exception as e:
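The change above keeps only one copy of each prompt (whitespace-insensitive) and returns the list best-first. A minimal standalone sketch of that dedupe-and-rank idea, with a hypothetical helper name and made-up sample data:

```python
# Minimal sketch of the dedupe-and-rank idea used in collect_prompt_history.
# The function name and sample entries below are illustrative, not from app.py.
from typing import Dict, List


def dedupe_and_rank(entries: List[Dict]) -> List[Dict]:
    seen = set()
    ranked = []
    for entry in entries:
        text = entry.get("code", "").strip()
        normalized = " ".join(text.split())  # collapse whitespace so reformatted copies match
        if not normalized or normalized in seen:
            continue
        seen.add(normalized)
        ranked.append({
            "prompt": text,
            "score": entry.get("metrics", {}).get("combined_score", 0.0),
        })
    ranked.sort(key=lambda x: x["score"], reverse=True)  # best prompt first
    return ranked


if __name__ == "__main__":
    sample = [
        {"code": "Summarize: {input}", "metrics": {"combined_score": 0.62}},
        {"code": "Summarize:   {input}", "metrics": {"combined_score": 0.60}},  # whitespace-only duplicate
        {"code": "Summarize the text in one sentence: {input}", "metrics": {"combined_score": 0.71}},
    ]
    print([p["score"] for p in dedupe_and_rank(sample)])  # [0.71, 0.62]
```

As in the function above, the first occurrence of a duplicate wins, so checkpoint ordering decides which copy's metrics are kept.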
@@ -568,6 +583,58 @@ def evaluate(prompt: str) -> dict:
 
 def create_config_file(model: str, work_dir: str):
     """Create a config.yaml file for OpenEvolve."""
+
+    # Create custom templates directory for prompt optimization
+    templates_dir = os.path.join(work_dir, "templates")
+    os.makedirs(templates_dir, exist_ok=True)
+
+    # Create custom system template for PROMPT optimization (not code)
+    system_template = """You are an expert prompt engineer tasked with iteratively improving prompts for language models.
+Your job is to analyze the current prompt and suggest improvements based on performance feedback.
+Focus on making the prompt clearer, more specific, and more effective at achieving its goal.
+Consider:
+- Clarity and specificity of instructions
+- Examples and demonstrations that guide the model
+- Formatting that makes the prompt easier to follow
+- Edge cases and error handling in the instructions
+"""
+
+    with open(os.path.join(templates_dir, "system_message.txt"), "w") as f:
+        f.write(system_template)
+
+    # Create custom user template for prompt rewriting
+    user_template = """# Current Prompt Performance
+- Current metrics: {metrics}
+- Areas for improvement: {improvement_areas}
+
+{artifacts}
+
+# Prompt Evolution History
+{evolution_history}
+
+# Current Prompt
+```text
+{current_program}
+```
+
+# Task
+Rewrite the prompt above to improve its performance on the specified metrics.
+Provide a complete new version of the prompt that:
+1. Maintains the same input/output format (keep placeholders like {{input}}, {{text}}, etc.)
+2. Improves clarity and effectiveness
+3. Adds helpful examples or instructions if beneficial
+4. Is more likely to get correct results
+
+Output ONLY the new prompt text between ```text markers:
+
+```text
+Your improved prompt here
+```
+"""
+
+    with open(os.path.join(templates_dir, "full_rewrite_user.txt"), "w") as f:
+        f.write(user_template)
+
     config = {
         "llm": {
             "primary_model": model,
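The `full_rewrite_user.txt` template mixes single-braced fields such as {current_program}, which the framework fills in, with double-braced placeholders such as {{input}} that should reach the LLM as a literal `{input}`. A small sketch of why the doubling matters, assuming the template is rendered with Python `str.format`-style substitution (an assumption suggested by the brace convention, not confirmed by this diff):

```python
# Trimmed, illustrative template: only the brace behavior is the point here.
template = (
    "# Current Prompt\n"
    "{current_program}\n"
    "\n"
    "Keep placeholders like {{input}} and {{text}} unchanged."
)

# '{current_program}' is substituted; '{{input}}' and '{{text}}' render as '{input}' and '{text}'.
# A single-braced '{input}' in the template would instead raise KeyError during format().
print(template.format(current_program="Summarize: {input}"))
# # Current Prompt
# Summarize: {input}
#
# Keep placeholders like {input} and {text} unchanged.
```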
@@ -579,6 +646,9 @@ def create_config_file(model: str, work_dir: str):
         "diff_based_evolution": False, # Use full rewrite mode for prompts (not diff/patch mode)
         "language": "text", # CRITICAL: Optimize text/prompts, not Python code!
         "max_code_length": 40000, # Allow long prompts (default 10000 is too short)
+        "prompt": {
+            "template_dir": templates_dir, # Use our custom prompt engineering templates
+        },
         "evolution": {
             "population_size": 10,
             "num_islands": 1,
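For reference, the config assembled by `create_config_file` nests the new `prompt.template_dir` entry alongside the existing keys shown above. An illustrative sketch limited to the keys visible in this diff; the model name and paths are placeholders, the nesting is inferred from the indentation in the hunk, and the real function may build and write the file differently:

```python
# Hedged sketch of the resulting config.yaml structure; not the actual create_config_file body.
import os
import yaml  # PyYAML

work_dir = "/tmp/openevolve_work"                 # hypothetical working directory
templates_dir = os.path.join(work_dir, "templates")
os.makedirs(templates_dir, exist_ok=True)

config = {
    "llm": {"primary_model": "gpt-4o-mini"},            # placeholder model name
    "diff_based_evolution": False,                       # full-rewrite mode for prompts
    "language": "text",                                  # optimize text, not Python code
    "max_code_length": 40000,                            # allow long prompts
    "prompt": {"template_dir": templates_dir},           # point OpenEvolve at the custom templates
    "evolution": {"population_size": 10, "num_islands": 1},
}

with open(os.path.join(work_dir, "config.yaml"), "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)
```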
@@ -781,20 +851,31 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
         progress(1.0, desc="Complete!")
 
-        # Collect all discovered prompts for browsing
-        all_prompts = [
-        prompt_history = collect_prompt_history(output_dir)
-        for p in prompt_history:
-            all_prompts.append(p["prompt"])
+        # Collect all unique discovered prompts for browsing (sorted by score, best first)
+        all_prompts = []
 
-        #
-
-
+        # Add initial prompt
+        all_prompts.append({
+            "prompt": initial_prompt,
+            "score": initial_eval['accuracy'] / 100.0, # Convert to 0-1 scale
+            "label": "Initial Prompt"
+        })
+
+        # Add evolved prompts (already unique and sorted by score)
+        prompt_history = collect_prompt_history(output_dir)
+        for i, p in enumerate(prompt_history):
+            all_prompts.append({
+                "prompt": p["prompt"],
+                "score": p.get("score", 0.0),
+                "label": f"Evolved #{i+1}"
+            })
 
         return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts)
 
     except Exception as e:
-
+        # Return error with initial prompt in dict format
+        error_prompts = [{"prompt": initial_prompt, "score": 0.0, "label": "Initial Prompt"}]
+        return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", "", error_prompts, 0, 1
 
     finally:
         # Don't clean up - keep prompts for browsing
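One detail worth noting: the two score sources end up on the same 0-1 scale. The initial evaluation reports accuracy as a percentage (hence the `/ 100.0`), while the `combined_score` read from the checkpoints is treated as already lying in 0-1, and the prompt browser later formats both with `{score:.2%}`. A tiny illustration with made-up numbers:

```python
# Illustrative values only; shows why the percentage-to-fraction conversion is needed
# for the scores to sort and display consistently.
initial_eval = {"accuracy": 62.5}                  # percent, from the initial evaluation
initial_score = initial_eval["accuracy"] / 100.0   # 0.625, now comparable
evolved_score = 0.71                               # combined_score, assumed already 0-1

for label, score in [("Initial Prompt", initial_score), ("Evolved #1", evolved_score)]:
    print(f"**{label}** | Score: {score:.2%}")
# **Initial Prompt** | Score: 62.50%
# **Evolved #1** | Score: 71.00%
```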
@@ -962,40 +1043,49 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-", 0
         new_idx = max(0, current_idx - 1)
-
-
-
-
-
+        prompt_obj = prompts[new_idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-
-
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text, new_idx
 
     def show_next_prompt(prompts, current_idx):
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-", 0
         new_idx = min(len(prompts) - 1, current_idx + 1)
-
-
-
-
-
+        prompt_obj = prompts[new_idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-
-
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text, new_idx
 
     def update_prompt_display(prompts, idx, total):
         if not prompts or len(prompts) == 0:
             return "", "**Prompt**: -/-"
         idx = min(idx, len(prompts) - 1)
-
-
-
-
-
+        prompt_obj = prompts[idx]
+        # Handle both old string format and new dict format
+        if isinstance(prompt_obj, dict):
+            prompt_text = prompt_obj["prompt"]
+            score = prompt_obj.get("score", 0.0)
+            label = prompt_obj.get("label", "")
+            counter_text = f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
         else:
-
-
+            prompt_text = prompt_obj
+            counter_text = f"**Prompt**: {idx + 1}/{len(prompts)}"
+        return prompt_text, counter_text
 
     # Wire up the optimize button
     optimize_result = optimize_btn.click(
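The three navigation handlers above repeat the same dict-vs-string formatting. A possible follow-up refactor (not part of this change) would move it into one shared helper; the helper name and the `show_prev_prompt` handler name below are hypothetical, since the first handler's `def` line is outside this hunk:

```python
# Sketch of a shared formatter for the prompt browser; mirrors the logic shown in the hunk above.
def _format_prompt_entry(prompts, idx):
    """Return (prompt_text, counter_text) for prompts[idx], accepting dicts or plain strings."""
    prompt_obj = prompts[idx]
    if isinstance(prompt_obj, dict):
        label = prompt_obj.get("label", "")
        score = prompt_obj.get("score", 0.0)
        return prompt_obj["prompt"], f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}"
    return prompt_obj, f"**Prompt**: {idx + 1}/{len(prompts)}"


def show_prev_prompt(prompts, current_idx):
    if not prompts:
        return "", "**Prompt**: -/-", 0
    new_idx = max(0, current_idx - 1)
    text, counter = _format_prompt_entry(prompts, new_idx)
    return text, counter, new_idx
```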