Upload app.py
app.py
CHANGED
@@ -529,7 +529,7 @@ def parse_evolution_history(output_dir: str) -> str:
 
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                           input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve
+    """Create an evaluator.py file for OpenEvolve that uses fixed 200 samples."""
     evaluator_code = f'''
 import os
 import random
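The generated source is accumulated in the `evaluator_code` f-string; how app.py writes it to disk is outside this diff, so the following is only a sketch under assumed names (OpenEvolve evaluators are conventionally saved as evaluator.py in the working directory):

# Sketch only: a hypothetical helper for persisting the generated template.
# The real write step in app.py may use a different path or filename.
import os

def write_evaluator(work_dir: str, evaluator_code: str) -> str:
    path = os.path.join(work_dir, "evaluator.py")  # assumed filename
    with open(path, "w") as f:
        f.write(evaluator_code)
    return path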
@@ -538,29 +538,20 @@ from openai import OpenAI
 
 def evaluate(prompt: str) -> dict:
     """
-    Evaluate a prompt using
-
-    - If accuracy < 0.5, return early (no point wasting 200 more samples)
-
-    Stage 2: Evaluate with 200 more samples (total 250)
-    - Combine results for final score
-
-    Returns dict with combined_score (0-1), accuracy, correct, and total.
+    Evaluate a prompt using 200 fixed samples (same as initial/final eval).
+
+    This ensures evolution optimizes for the SAME test set we measure on.
+    No staging - always evaluates all 200 samples for consistency.
     """
     try:
         # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
         random.seed(42)
 
         # Load dataset
-        # Try loading with just dataset name first
         try:
             dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
         except ValueError as e:
-            # If it fails with config error, try common configs
             if "config" in str(e).lower() or "Config name is missing" in str(e):
-                # Try common configs based on dataset name
                 default_config = "main"
                 if "{dataset_name}".lower() == "glue":
                     default_config = "sst2"
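Per the new docstring, evaluate() always scores the same 200 examples and returns combined_score, accuracy, correct, and total. A hedged smoke test of the generated module might look like this (the file location, the seed prompt, and the presence of OPENAI_API_KEY are assumptions, not part of the diff):

# Hypothetical smoke test: load the generated evaluator and score the seed prompt.
import importlib.util

spec = importlib.util.spec_from_file_location("evaluator", "work_dir/evaluator.py")
evaluator = importlib.util.module_from_spec(spec)
spec.loader.exec_module(evaluator)

result = evaluator.evaluate("Answer the question step by step.\n\n{input}")
print(result["combined_score"], result["correct"], result["total"])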
@@ -568,6 +559,16 @@ def evaluate(prompt: str) -> dict:
             else:
                 raise
 
+        # Sample 200 samples with seed 42 (SAME as initial/final eval)
+        num_samples = 200
+        if len(dataset) > num_samples:
+            # Use SAME sampling logic as initial/final eval
+            indices = random.sample(range(len(dataset)), num_samples)
+            samples = [dataset[i] for i in indices]
+        else:
+            indices = list(range(min(num_samples, len(dataset))))
+            samples = list(dataset)[:num_samples]
+
         # Initialize OpenAI client
         api_key = os.environ.get("OPENAI_API_KEY")
         client = OpenAI(
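The added block leans on random.seed(42) making random.sample deterministic, which is what keeps evolution, initial, and final evaluations on exactly the same 200 rows. A minimal self-contained check of that property (the dataset size here is an arbitrary example):

# Reseeding before sampling reproduces identical indices on every call,
# so each invocation of evaluate() scores the same subset of the dataset.
import random

def fixed_indices(dataset_len: int, num_samples: int = 200) -> list:
    random.seed(42)
    if dataset_len > num_samples:
        return random.sample(range(dataset_len), num_samples)
    return list(range(dataset_len))

assert fixed_indices(10000) == fixed_indices(10000)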
@@ -575,172 +576,116 @@ def evaluate(prompt: str) -> dict:
             api_key=api_key,
         )
 
-        correct = correct_so_far
-        total = total_so_far
-
-            try:
-                # Get input and target
-                input_text = sample.get("{input_field}", "")
-                if isinstance(input_text, dict):
-                    input_text = str(input_text)
-
-                target = sample.get("{target_field}", "")
-                if isinstance(target, dict):
-                    target = str(target)
-
-                # Format the prompt
-                formatted_prompt = prompt.replace("{{input}}", str(input_text))
-
-                # Call the model
-                response = client.chat.completions.create(
-                    model="{model}",
-                    messages=[
-                        {{"role": "system", "content": "You are a helpful assistant."}},
-                        {{"role": "user", "content": formatted_prompt}}
-                    ],
-                    temperature=0.0,
-                    max_tokens=500,
-                )
-
-                prediction = response.choices[0].message.content.strip()
-
-                # Smart evaluation - handle both math and text answers
-                target_str = str(target).strip()
-                pred_str = prediction.strip()
-
-                def extract_answer(text):
-                    """Extract answer from text - handles GSM8K format and general text"""
-                    import re
-
-                    # GSM8K format: "#### NUMBER" at the end
-                    if "####" in text:
-                        parts = text.split("####")
-                        if len(parts) > 1:
-                            answer_part = parts[-1].strip()
-                            # Remove comma separators (1,000 -> 1000)
-                            answer_part = answer_part.replace(',', '')
-                            return answer_part
-
-                    # Try to extract last number from free-form text
-                    numbers = re.findall(r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?', text)
-                    if numbers:
-                        # Return the last number found (usually the final answer)
-                        return numbers[-1].replace(',', '')
-
-                    return text
-
-                def is_mathematically_equal(str1, str2):
-                    """Check if two strings represent the same mathematical value"""
-                    try:
-                        # Try to convert both to floats and compare
-                        num1 = float(str1.replace(',', ''))
-                        num2 = float(str2.replace(',', ''))
-                        # Use small epsilon for float comparison
-                        return abs(num1 - num2) < 1e-6
-                    except (ValueError, AttributeError):
-                        # If conversion fails, do string comparison
-                        return str1.lower().strip() == str2.lower().strip()
-
-                # Extract answers
-                target_answer = extract_answer(target_str)
-                pred_answer = extract_answer(pred_str)
-
-                # Check if answers match mathematically or textually
-                is_correct = is_mathematically_equal(target_answer, pred_answer)
-
-                # Fallback: check for semantic equivalents for sentiment analysis
-                if not is_correct:
-                    target_lower = target_answer.lower()
-                    pred_lower = pred_answer.lower()
-
-                    # Sentiment mappings with expanded synonyms
-                    positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
-                                      "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
-                                      "praise", "favorable", "approve"]
-                    negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
-                                      "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
-                                      "critique", "condemn", "sarcasm"]
-
-                    if target_lower in ["1", "positive", "pos"]:
-                        is_correct = any(word in pred_lower for word in positive_words)
-                    elif target_lower in ["0", "negative", "neg"]:
-                        is_correct = any(word in pred_lower for word in negative_words)
-
-                if is_correct:
-                    correct += 1
-                total += 1
-
-        stage1_samples_count = min(stage1_size, len(dataset))
-
+        correct = 0
+        total = 0
+
+        print(f"Evaluating on {{len(samples)}} samples...")
+
+        for idx, sample in enumerate(samples):
+            try:
+                # Get input and target
+                input_text = sample.get("{input_field}", "")
+                if isinstance(input_text, dict):
+                    input_text = str(input_text)
+
+                target = sample.get("{target_field}", "")
+                if isinstance(target, dict):
+                    target = str(target)
+
+                # Format the prompt
+                formatted_prompt = prompt.replace("{{input}}", str(input_text))
+
+                # Call the model
+                response = client.chat.completions.create(
+                    model="{model}",
+                    messages=[
+                        {{"role": "system", "content": "You are a helpful assistant."}},
+                        {{"role": "user", "content": formatted_prompt}}
+                    ],
+                    temperature=0.0,
+                    max_tokens=500,
+                )
+
+                prediction = response.choices[0].message.content.strip()
+
+                # Smart evaluation - handle both math and text answers
+                target_str = str(target).strip()
+                pred_str = prediction.strip()
+
+                def extract_answer(text):
+                    """Extract answer from text - handles GSM8K format and general text"""
+                    import re
+
+                    # GSM8K format: "#### NUMBER" at the end
+                    if "####" in text:
+                        parts = text.split("####")
+                        if len(parts) > 1:
+                            answer_part = parts[-1].strip()
+                            answer_part = answer_part.replace(',', '')
+                            return answer_part
+
+                    # Try to extract last number from free-form text
+                    numbers = re.findall(r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?', text)
+                    if numbers:
+                        return numbers[-1].replace(',', '')
+
+                    return text
+
+                def is_mathematically_equal(str1, str2):
+                    """Check if two strings represent the same mathematical value"""
+                    try:
+                        num1 = float(str1.replace(',', ''))
+                        num2 = float(str2.replace(',', ''))
+                        return abs(num1 - num2) < 1e-6
+                    except (ValueError, AttributeError):
+                        return str1.lower().strip() == str2.lower().strip()
+
+                # Extract answers
+                target_answer = extract_answer(target_str)
+                pred_answer = extract_answer(pred_str)
+
+                # Check if answers match mathematically or textually
+                is_correct = is_mathematically_equal(target_answer, pred_answer)
+
+                # Fallback: check for semantic equivalents for sentiment analysis
+                if not is_correct:
+                    target_lower = target_answer.lower()
+                    pred_lower = pred_answer.lower()
+
+                    # Sentiment mappings with expanded synonyms
+                    positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
+                                      "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
+                                      "praise", "favorable", "approve"]
+                    negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
+                                      "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
+                                      "critique", "condemn", "sarcasm"]
+
+                    if target_lower in ["1", "positive", "pos"]:
+                        is_correct = any(word in pred_lower for word in positive_words)
+                    elif target_lower in ["0", "negative", "neg"]:
+                        is_correct = any(word in pred_lower for word in negative_words)
+
+                if is_correct:
+                    correct += 1
+                total += 1
+
+                if (idx + 1) % 50 == 0:
+                    print(f"  Progress: {{idx + 1}}/{{len(samples)}} - Current accuracy: {{correct/total:.2%}}")
+
+            except Exception as e:
+                print(f"Error evaluating sample {{idx+1}}: {{e}}")
+                continue
+
+        accuracy = (correct / total) if total > 0 else 0.0
+
+        print(f"Final: {{correct}}/{{total}} = {{accuracy:.2%}}")
+
+        return {{
+            "combined_score": accuracy,
+            "accuracy": accuracy,
+            "correct": correct,
+            "total": total
+        }}
 
     except Exception as e:
         print(f"Error in evaluation: {{e}}")
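The grading helpers added in this hunk first extract a final answer (handling the GSM8K "#### NUMBER" convention and thousands separators) and then compare numerically with a small epsilon, falling back to string and sentiment matching. A standalone illustration of that matching logic, rewritten outside the f-string so braces are single:

# Illustration of the answer-matching behaviour introduced above.
import re

def extract_answer(text):
    # GSM8K-style answers follow "####"; strip thousands separators.
    if "####" in text:
        return text.split("####")[-1].strip().replace(",", "")
    numbers = re.findall(r"-?\d+(?:,\d{3})*(?:\.\d+)?", text)
    return numbers[-1].replace(",", "") if numbers else text

def is_mathematically_equal(str1, str2):
    try:
        return abs(float(str1.replace(",", "")) - float(str2.replace(",", ""))) < 1e-6
    except (ValueError, AttributeError):
        return str1.lower().strip() == str2.lower().strip()

target = "She sold 48 + 24 = 72 clips. #### 72"
prediction = "Adding April and May gives 72 clips, so the answer is 72."
assert is_mathematically_equal(extract_answer(target), extract_answer(prediction))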
@@ -1028,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 - **Model**: {model}
 - **Initial Eval**: 200 samples
 - **Final Eval**: 200 samples (same samples for fair comparison)
-- **Evolution**:
+- **Evolution**: 200 samples per variant (same samples as initial/final)
 - **Iterations**: 10
 
 ### Results
@@ -1176,7 +1121,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 - **Evaluation strategy**:
   - Initial evaluation: 200 samples
   - Final evaluation: Same 200 samples (fair comparison)
-  - Evolution:
+  - Evolution: Each variant tested on same 200 samples (ensures optimization aligns with test set)
   - Compare initial vs best prompt side-by-side with identical test sets
 
 ### About OpenEvolve: