Upload app.py
app.py
CHANGED
@@ -247,10 +247,14 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
     }
 
 
-def collect_prompt_history(output_dir: str) -> List[Dict]:
+def collect_prompt_history(output_dir: str, initial_score: float = 0.0) -> List[Dict]:
     """
     Collect only the prompts that were "best" at some point during evolution.
-    Returns
+    Returns only programs that improved upon the initial score (deduplicated).
+
+    Args:
+        output_dir: Directory containing checkpoint data
+        initial_score: Score of the initial prompt (baseline to beat)
 
     Returns a list of dicts with: {prompt, score, iteration, id}
     """
@@ -307,12 +311,18 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
     all_programs.sort(key=lambda x: x.get("iteration", 0))
 
     # Filter to keep only programs that improved the best score
+    # Start from the initial score as the baseline
     best_programs = []
-    current_best_score =
+    current_best_score = initial_score
 
     for program in all_programs:
         prompt_content = program["prompt"]
         score = program["score"]
+        iteration = program["iteration"]
+
+        # Skip iteration 0 (that's the initial prompt, already added separately)
+        if iteration == 0:
+            continue
 
         # Create a normalized version for duplicate detection (ignore whitespace differences)
         normalized_prompt = " ".join(prompt_content.split())
@@ -325,8 +335,9 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
         if score > current_best_score:
             seen_prompts.add(normalized_prompt)
             best_programs.append(program)
+            improvement = score - current_best_score
+            print(f"  ✓ Best program at iteration {iteration}: score={score:.2%} (improved by +{improvement:.2%})")
             current_best_score = score
-            print(f"  Best program at iteration {program['iteration']}: score={score:.2%}")
 
     return best_programs
 
@@ -870,16 +881,17 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     all_prompts = []
 
     # Add initial prompt
+    initial_score = initial_eval['accuracy'] / 100.0  # Convert to 0-1 scale
     all_prompts.append({
         "prompt": initial_prompt,
-        "score":
+        "score": initial_score,
         "label": "Initial Prompt",
         "iteration": 0
     })
 
     # Add evolved prompts (only programs that were "best" at some point)
-    #
-    prompt_history = collect_prompt_history(output_dir)
+    # Pass initial_score so we only show programs that BEAT the initial prompt
+    prompt_history = collect_prompt_history(output_dir, initial_score=initial_score)
     for i, p in enumerate(prompt_history):
         # Skip if it's the same as initial (shouldn't happen, but just in case)
         if i == 0 and p.get("iteration", -1) == 0: