Upload app.py
Browse files
app.py
CHANGED
|
@@ -504,7 +504,7 @@ def parse_evolution_history(output_dir: str) -> str:
|
|
| 504 |
|
| 505 |
def create_evaluator_file(dataset_name: str, split: str, model: str,
|
| 506 |
input_field: str, target_field: str, work_dir: str):
|
| 507 |
-
"""Create an evaluator.py file for OpenEvolve that uses
|
| 508 |
evaluator_code = f'''
|
| 509 |
import os
|
| 510 |
import random
|
|
@@ -514,9 +514,9 @@ from openai import OpenAI
|
|
| 514 |
|
| 515 |
def evaluate(prompt: str) -> dict:
|
| 516 |
"""
|
| 517 |
-
Evaluate a prompt using
|
| 518 |
|
| 519 |
-
|
| 520 |
Includes early stopping and rate limit handling.
|
| 521 |
"""
|
| 522 |
try:
|
|
@@ -535,8 +535,8 @@ def evaluate(prompt: str) -> dict:
|
|
| 535 |
else:
|
| 536 |
raise
|
| 537 |
|
| 538 |
-
# Sample
|
| 539 |
-
num_samples =
|
| 540 |
if len(dataset) > num_samples:
|
| 541 |
# Use SAME sampling logic as initial/final eval
|
| 542 |
indices = random.sample(range(len(dataset)), num_samples)
|
|
@@ -973,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
|
|
| 973 |
- **Model**: {model}
|
| 974 |
- **Initial Eval**: 50 samples
|
| 975 |
- **Final Eval**: 50 samples (same samples for fair comparison)
|
| 976 |
-
- **Evolution**:
|
| 977 |
- **Iterations**: 10
|
| 978 |
|
| 979 |
### Results
|
|
|
|
| 504 |
|
| 505 |
def create_evaluator_file(dataset_name: str, split: str, model: str,
|
| 506 |
input_field: str, target_field: str, work_dir: str):
|
| 507 |
+
"""Create an evaluator.py file for OpenEvolve that uses the same 50 samples as the initial/final eval."""
|
| 508 |
evaluator_code = f'''
|
| 509 |
import os
|
| 510 |
import random
|
|
|
|
| 514 |
|
| 515 |
def evaluate(prompt: str) -> dict:
|
| 516 |
"""
|
| 517 |
+
Evaluate a prompt using 50 fixed samples - SAME as initial and final evaluation.
|
| 518 |
|
| 519 |
+
Using the same 50 samples ensures evolution optimizes for the exact test set.
|
| 520 |
Includes early stopping and rate limit handling.
|
| 521 |
"""
|
| 522 |
try:
|
|
|
|
| 535 |
else:
|
| 536 |
raise
|
| 537 |
|
| 538 |
+
# Draw 50 samples with seed 42 - the SAME subset as the initial/final evaluation, for consistency!
|
| 539 |
+
num_samples = 50
|
| 540 |
if len(dataset) > num_samples:
|
| 541 |
# Use SAME sampling logic as initial/final eval
|
| 542 |
indices = random.sample(range(len(dataset)), num_samples)
|
|
|
|
| 973 |
- **Model**: {model}
|
| 974 |
- **Initial Eval**: 50 samples
|
| 975 |
- **Final Eval**: 50 samples (same samples for fair comparison)
|
| 976 |
+
- **Evolution**: 50 samples per variant (SAME samples as initial/final for consistency!)
|
| 977 |
- **Iterations**: 10
|
| 978 |
|
| 979 |
### Results
|