codelion committed
Commit 741a123 · verified · 1 Parent(s): e35db16

Upload app.py

Files changed (1)
  1. app.py +19 -7
app.py CHANGED
@@ -247,10 +247,14 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
     }
 
 
-def collect_prompt_history(output_dir: str) -> List[Dict]:
+def collect_prompt_history(output_dir: str, initial_score: float = 0.0) -> List[Dict]:
     """
     Collect only the prompts that were "best" at some point during evolution.
-    Returns: initial prompt + any program that improved the best score (deduplicated).
+    Returns only programs that improved upon the initial score (deduplicated).
+
+    Args:
+        output_dir: Directory containing checkpoint data
+        initial_score: Score of the initial prompt (baseline to beat)
 
     Returns a list of dicts with: {prompt, score, iteration, id}
     """
@@ -307,12 +311,18 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
     all_programs.sort(key=lambda x: x.get("iteration", 0))
 
     # Filter to keep only programs that improved the best score
+    # Start from the initial score as the baseline
     best_programs = []
-    current_best_score = -float('inf')
+    current_best_score = initial_score
 
     for program in all_programs:
         prompt_content = program["prompt"]
         score = program["score"]
+        iteration = program["iteration"]
+
+        # Skip iteration 0 (that's the initial prompt, already added separately)
+        if iteration == 0:
+            continue
 
         # Create a normalized version for duplicate detection (ignore whitespace differences)
         normalized_prompt = " ".join(prompt_content.split())
@@ -325,8 +335,9 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
         if score > current_best_score:
             seen_prompts.add(normalized_prompt)
             best_programs.append(program)
+            improvement = score - current_best_score
+            print(f" ✓ Best program at iteration {iteration}: score={score:.2%} (improved by +{improvement:.2%})")
             current_best_score = score
-            print(f" Best program at iteration {program['iteration']}: score={score:.2%}")
 
     return best_programs
 
@@ -870,16 +881,17 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     all_prompts = []
 
     # Add initial prompt
+    initial_score = initial_eval['accuracy'] / 100.0  # Convert to 0-1 scale
     all_prompts.append({
         "prompt": initial_prompt,
-        "score": initial_eval['accuracy'] / 100.0,  # Convert to 0-1 scale
+        "score": initial_score,
         "label": "Initial Prompt",
         "iteration": 0
     })
 
     # Add evolved prompts (only programs that were "best" at some point)
-    # These are already filtered to show progression: initial → better → best
-    prompt_history = collect_prompt_history(output_dir)
+    # Pass initial_score so we only show programs that BEAT the initial prompt
+    prompt_history = collect_prompt_history(output_dir, initial_score=initial_score)
    for i, p in enumerate(prompt_history):
         # Skip if it's the same as initial (shouldn't happen, but just in case)
         if i == 0 and p.get("iteration", -1) == 0:
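
For context, here is a minimal, self-contained sketch of the filtering rule this commit changes. The function name filter_best_programs and the sample data are illustrative, not taken from app.py: the point is that the running best score now starts at the initial prompt's score rather than -float('inf'), and the iteration-0 program is skipped, so only evolved prompts that actually beat the baseline are kept.

# Hypothetical sketch of the new selection rule (not the actual app.py code).
from typing import Dict, List


def filter_best_programs(programs: List[Dict], initial_score: float = 0.0) -> List[Dict]:
    # Process programs in evolution order
    programs = sorted(programs, key=lambda x: x.get("iteration", 0))
    best_programs: List[Dict] = []
    seen_prompts = set()
    current_best_score = initial_score  # baseline to beat (was -float('inf') before this commit)

    for program in programs:
        if program["iteration"] == 0:
            continue  # the initial prompt is added separately by the caller
        normalized = " ".join(program["prompt"].split())
        if normalized in seen_prompts:
            continue  # drop whitespace-only duplicates
        if program["score"] > current_best_score:
            seen_prompts.add(normalized)
            best_programs.append(program)
            current_best_score = program["score"]
    return best_programs


# Hypothetical sample data: the initial prompt scores 0.40.
programs = [
    {"prompt": "Solve the problem.", "score": 0.40, "iteration": 0, "id": "init"},
    {"prompt": "Think step by step.", "score": 0.35, "iteration": 3, "id": "a"},
    {"prompt": "Think step by step, then answer.", "score": 0.52, "iteration": 7, "id": "b"},
    {"prompt": "Think  step by step, then answer.", "score": 0.55, "iteration": 9, "id": "c"},
]

# Only 'b' survives: 'a' never beats the 0.40 baseline, and 'c' is a
# whitespace-only duplicate of 'b'. Before this commit the baseline was
# -float('inf'), so 'a' (worse than the initial prompt) would also have been shown.
print([p["id"] for p in filter_best_programs(programs, initial_score=0.40)])  # ['b']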