Upload app.py
app.py
CHANGED
@@ -247,10 +247,14 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
     }
 
 
-def collect_prompt_history(output_dir: str) -> List[Dict]:
+def collect_prompt_history(output_dir: str, initial_score: float = 0.0) -> List[Dict]:
     """
     Collect only the prompts that were "best" at some point during evolution.
-    Returns
+    Returns only programs that improved upon the initial score (deduplicated).
+
+    Args:
+        output_dir: Directory containing checkpoint data
+        initial_score: Score of the initial prompt (baseline to beat)
 
     Returns a list of dicts with: {prompt, score, iteration, id}
     """
@@ -307,12 +311,18 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
     all_programs.sort(key=lambda x: x.get("iteration", 0))
 
     # Filter to keep only programs that improved the best score
+    # Start from the initial score as the baseline
     best_programs = []
-    current_best_score =
+    current_best_score = initial_score
 
     for program in all_programs:
         prompt_content = program["prompt"]
         score = program["score"]
+        iteration = program["iteration"]
+
+        # Skip iteration 0 (that's the initial prompt, already added separately)
+        if iteration == 0:
+            continue
 
         # Create a normalized version for duplicate detection (ignore whitespace differences)
         normalized_prompt = " ".join(prompt_content.split())
@@ -325,8 +335,9 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
         if score > current_best_score:
             seen_prompts.add(normalized_prompt)
             best_programs.append(program)
+            improvement = score - current_best_score
+            print(f"  ✓ Best program at iteration {iteration}: score={score:.2%} (improved by +{improvement:.2%})")
             current_best_score = score
-            print(f"  Best program at iteration {program['iteration']}: score={score:.2%}")
 
     return best_programs
 
@@ -870,16 +881,17 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     all_prompts = []
 
     # Add initial prompt
+    initial_score = initial_eval['accuracy'] / 100.0  # Convert to 0-1 scale
     all_prompts.append({
         "prompt": initial_prompt,
-        "score":
+        "score": initial_score,
         "label": "Initial Prompt",
         "iteration": 0
     })
 
     # Add evolved prompts (only programs that were "best" at some point)
-    #
-    prompt_history = collect_prompt_history(output_dir)
+    # Pass initial_score so we only show programs that BEAT the initial prompt
+    prompt_history = collect_prompt_history(output_dir, initial_score=initial_score)
     for i, p in enumerate(prompt_history):
         # Skip if it's the same as initial (shouldn't happen, but just in case)
         if i == 0 and p.get("iteration", -1) == 0: