Upload app.py
app.py
CHANGED
@@ -516,10 +516,22 @@ def evaluate(prompt: str) -> dict:
     """
     Evaluate a prompt using 50 fixed samples - SAME as initial and final evaluation.
 
+    OpenEvolve passes a file path, so we need to read the prompt from the file.
     Using the same 50 samples ensures evolution optimizes for the exact test set.
     Includes early stopping and rate limit handling.
     """
     try:
+        # CRITICAL: OpenEvolve passes a FILE PATH, not the prompt text!
+        # Check if prompt is a file path and read it
+        if os.path.exists(prompt):
+            with open(prompt, 'r') as f:
+                prompt_text = f.read()
+            # Strip EVOLVE-BLOCK markers if present
+            prompt_text = prompt_text.replace("# EVOLVE-BLOCK-START", "").replace("# EVOLVE-BLOCK-END", "").strip()
+        else:
+            # If not a file path, use as-is (for backward compatibility)
+            prompt_text = prompt
+
         # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
         random.seed(42)
 
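Note: the path-or-text branch above is easy to exercise in isolation. Below is a minimal sketch of the same logic as a standalone helper, using only the standard library; `load_prompt` is a hypothetical name, not something defined in app.py:

```python
import os
import tempfile

# Hypothetical helper mirroring the logic in evaluate(): accept either a
# file path (what OpenEvolve passes) or raw prompt text, and strip the
# EVOLVE-BLOCK markers either way.
def load_prompt(prompt: str) -> str:
    if os.path.exists(prompt):
        with open(prompt, "r") as f:
            text = f.read()
    else:
        # Backward compatibility: treat the argument as the prompt itself.
        text = prompt
    return text.replace("# EVOLVE-BLOCK-START", "").replace("# EVOLVE-BLOCK-END", "").strip()

# Round-trip check: write a marked-up prompt to disk and load it back.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write("# EVOLVE-BLOCK-START\nClassify the sentiment of: {input}\n# EVOLVE-BLOCK-END\n")
    path = tmp.name
assert load_prompt(path) == "Classify the sentiment of: {input}"
assert load_prompt("plain text prompt") == "plain text prompt"
os.unlink(path)
```

One caveat of this pattern: a short prompt that happens to match an existing file name would be silently read from disk, so the backward-compatibility branch is a convenience, not a guarantee.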
@@ -569,8 +581,8 @@ def evaluate(prompt: str) -> dict:
             if isinstance(target, dict):
                 target = str(target)
 
-            # Format the prompt
-            formatted_prompt = prompt.replace("{input}", str(input_text))
+            # Format the prompt (use prompt_text that we read from file)
+            formatted_prompt = prompt_text.replace("{input}", str(input_text))
 
             # Call the model with retry logic for transient failures
             max_retries = 3
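Note: substituting the placeholder with str.replace rather than str.format matters here, because an evolved prompt can legally contain other literal braces that str.format would try to interpret. A small standalone illustration (the template text is made up):

```python
template = 'Return JSON like {"label": ...}. Classify: {input}'

# str.format chokes on the literal {"label": ...} braces:
try:
    template.format(input="great movie!")
except (KeyError, ValueError) as e:
    print("format() fails:", repr(e))

# Plain substring replacement only touches the {input} placeholder:
print(template.replace("{input}", "great movie!"))
```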
@@ -646,8 +658,8 @@ def evaluate(prompt: str) -> dict:
 
         print(f"Final: {correct}/{total} = {accuracy:.2%}")
 
-        # DEBUG: Log the prompt being evaluated and its score
-        prompt_preview = prompt[:80].replace('\n', ' ') if len(prompt) > 80 else prompt.replace('\n', ' ')
+        # DEBUG: Log the prompt being evaluated and its score (use prompt_text, not file path)
+        prompt_preview = prompt_text[:80].replace('\n', ' ') if len(prompt_text) > 80 else prompt_text.replace('\n', ' ')
         print(f"[EVAL DEBUG] Prompt: '{prompt_preview}...' → Score: {accuracy:.2%}")
 
         return {
@@ -826,10 +838,14 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     work_dir = tempfile.mkdtemp(prefix="openevolve_")
 
     try:
-        # Save initial prompt
+        # Save initial prompt with EVOLVE-BLOCK markers for OpenEvolve
+        # These markers tell OpenEvolve which part to optimize
        initial_prompt_path = os.path.join(work_dir, "initial_prompt.txt")
         with open(initial_prompt_path, "w") as f:
+            # Wrap prompt in evolve markers so OpenEvolve knows what to optimize
+            f.write("# EVOLVE-BLOCK-START\n")
             f.write(initial_prompt)
+            f.write("\n# EVOLVE-BLOCK-END\n")
 
         # Create evaluator
         progress(0.1, desc="Creating evaluator...")
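Note: the marker wrapping written here must round-trip with the stripping done in evaluate(). A quick sketch of that invariant, assuming the exact marker strings used in this diff:

```python
START, END = "# EVOLVE-BLOCK-START", "# EVOLVE-BLOCK-END"

def wrap(prompt: str) -> str:
    # Same framing that optimize_prompt() writes to initial_prompt.txt.
    return f"{START}\n{prompt}\n{END}\n"

def strip_markers(text: str) -> str:
    # Same stripping that evaluate() applies after reading the file.
    return text.replace(START, "").replace(END, "").strip()

prompt = "Summarize the following article: {input}"
assert strip_markers(wrap(prompt)) == prompt
```

The final .strip() also discards any intentional leading or trailing whitespace in the original prompt; acceptable here, but worth remembering.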
@@ -929,16 +945,22 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         best_prompt_path = os.path.join(output_dir, "best", "best_program.txt")
         if os.path.exists(best_prompt_path):
             with open(best_prompt_path, "r") as f:
-                best_prompt = f.read()
+                best_prompt_raw = f.read()
+                # Strip EVOLVE-BLOCK markers that we added
+                best_prompt = best_prompt_raw.replace("# EVOLVE-BLOCK-START", "").replace("# EVOLVE-BLOCK-END", "").strip()
             print(f"\n[SELECTION] OpenEvolve selected best prompt from: {best_prompt_path}")
+            print(f"[SELECTION] Raw prompt length: {len(best_prompt_raw)} chars")
             print(f"[SELECTION] Best prompt: '{best_prompt[:100].replace(chr(10), ' ')}...'")
         else:
             # Fallback: try without the "best" subdirectory
             best_prompt_path_alt = os.path.join(output_dir, "best_program.txt")
             if os.path.exists(best_prompt_path_alt):
                 with open(best_prompt_path_alt, "r") as f:
-                    best_prompt = f.read()
+                    best_prompt_raw = f.read()
+                    # Strip EVOLVE-BLOCK markers
+                    best_prompt = best_prompt_raw.replace("# EVOLVE-BLOCK-START", "").replace("# EVOLVE-BLOCK-END", "").strip()
                 print(f"\n[SELECTION] OpenEvolve selected best prompt from: {best_prompt_path_alt}")
+                print(f"[SELECTION] Raw prompt length: {len(best_prompt_raw)} chars")
                 print(f"[SELECTION] Best prompt: '{best_prompt[:100].replace(chr(10), ' ')}...'")
             else:
                 best_prompt = initial_prompt
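Note: the two-location lookup above, with its final fallback to the initial prompt, could equally be a loop over candidate paths, which keeps the marker stripping in one place. A sketch under the same assumptions about the output layout; `resolve_best_prompt` is a hypothetical name:

```python
import os

def resolve_best_prompt(output_dir: str, initial_prompt: str) -> str:
    # Prefer output_dir/best/best_program.txt, then output_dir/best_program.txt,
    # and fall back to the unmodified initial prompt if neither exists.
    candidates = [
        os.path.join(output_dir, "best", "best_program.txt"),
        os.path.join(output_dir, "best_program.txt"),
    ]
    for path in candidates:
        if os.path.exists(path):
            with open(path, "r") as f:
                raw = f.read()
            return raw.replace("# EVOLVE-BLOCK-START", "").replace("# EVOLVE-BLOCK-END", "").strip()
    return initial_prompt
```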