codelion committed
Commit f5714df · verified · 1 Parent(s): 424a3e8

Upload app.py

Files changed (1):
  1. app.py +121 -176
app.py CHANGED
@@ -529,7 +529,7 @@ def parse_evolution_history(output_dir: str) -> str:
 
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                           input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
+    """Create an evaluator.py file for OpenEvolve that uses fixed 200 samples."""
     evaluator_code = f'''
 import os
 import random
@@ -538,29 +538,20 @@ from openai import OpenAI
 
 def evaluate(prompt: str) -> dict:
     """
-    Evaluate a prompt using 2-stage cascading evaluation to save API calls.
+    Evaluate a prompt using 200 fixed samples (same as initial/final eval).
 
-    Stage 1: Evaluate with 50 samples
-    - If accuracy >= 0.5, proceed to Stage 2
-    - If accuracy < 0.5, return early (no point wasting 200 more samples)
-
-    Stage 2: Evaluate with 200 more samples (total 250)
-    - Combine results for final score
-
-    Returns dict with combined_score (0-1), accuracy, correct, and total.
+    This ensures evolution optimizes for the SAME test set we measure on.
+    No staging - always evaluates all 200 samples for consistency.
     """
     try:
         # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
         random.seed(42)
 
         # Load dataset
-        # Try loading with just dataset name first
         try:
             dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
         except ValueError as e:
-            # If it fails with config error, try common configs
             if "config" in str(e).lower() or "Config name is missing" in str(e):
-                # Try common configs based on dataset name
                 default_config = "main"
                 if "{dataset_name}".lower() == "glue":
                     default_config = "sst2"
@@ -568,6 +559,16 @@ def evaluate(prompt: str) -> dict:
             else:
                 raise
 
+        # Sample 200 samples with seed 42 (SAME as initial/final eval)
+        num_samples = 200
+        if len(dataset) > num_samples:
+            # Use SAME sampling logic as initial/final eval
+            indices = random.sample(range(len(dataset)), num_samples)
+            samples = [dataset[i] for i in indices]
+        else:
+            indices = list(range(min(num_samples, len(dataset))))
+            samples = list(dataset)[:num_samples]
+
         # Initialize OpenAI client
         api_key = os.environ.get("OPENAI_API_KEY")
         client = OpenAI(
@@ -575,172 +576,116 @@ def evaluate(prompt: str) -> dict:
             api_key=api_key,
         )
 
-        def evaluate_samples(samples, correct_so_far=0, total_so_far=0):
-            """Helper function to evaluate a batch of samples."""
-            correct = correct_so_far
-            total = total_so_far
-
-            for sample in samples:
-                try:
-                    # Get input and target
-                    input_text = sample.get("{input_field}", "")
-                    if isinstance(input_text, dict):
-                        input_text = str(input_text)
-
-                    target = sample.get("{target_field}", "")
-                    if isinstance(target, dict):
-                        target = str(target)
-
-                    # Format the prompt
-                    formatted_prompt = prompt.replace("{{input}}", str(input_text))
-
-                    # Call the model
-                    response = client.chat.completions.create(
-                        model="{model}",
-                        messages=[
-                            {{"role": "system", "content": "You are a helpful assistant."}},
-                            {{"role": "user", "content": formatted_prompt}}
-                        ],
-                        temperature=0.0,
-                        max_tokens=500,
-                    )
-
-                    prediction = response.choices[0].message.content.strip()
-
-                    # Smart evaluation - handle both math and text answers
-                    target_str = str(target).strip()
-                    pred_str = prediction.strip()
-
-                    def extract_answer(text):
-                        """Extract answer from text - handles GSM8K format and general text"""
-                        import re
-
-                        # GSM8K format: "#### NUMBER" at the end
-                        if "####" in text:
-                            parts = text.split("####")
-                            if len(parts) > 1:
-                                answer_part = parts[-1].strip()
-                                # Remove comma separators (1,000 -> 1000)
-                                answer_part = answer_part.replace(',', '')
-                                return answer_part
-
-                        # Try to extract last number from free-form text
-                        numbers = re.findall(r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?', text)
-                        if numbers:
-                            # Return the last number found (usually the final answer)
-                            return numbers[-1].replace(',', '')
-
-                        return text
-
-                    def is_mathematically_equal(str1, str2):
-                        """Check if two strings represent the same mathematical value"""
-                        try:
-                            # Try to convert both to floats and compare
-                            num1 = float(str1.replace(',', ''))
-                            num2 = float(str2.replace(',', ''))
-                            # Use small epsilon for float comparison
-                            return abs(num1 - num2) < 1e-6
-                        except (ValueError, AttributeError):
-                            # If conversion fails, do string comparison
-                            return str1.lower().strip() == str2.lower().strip()
-
-                    # Extract answers
-                    target_answer = extract_answer(target_str)
-                    pred_answer = extract_answer(pred_str)
-
-                    # Check if answers match mathematically or textually
-                    is_correct = is_mathematically_equal(target_answer, pred_answer)
-
-                    # Fallback: check for semantic equivalents for sentiment analysis
-                    if not is_correct:
-                        target_lower = target_answer.lower()
-                        pred_lower = pred_answer.lower()
-
-                        # Sentiment mappings with expanded synonyms
-                        positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
-                                          "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
-                                          "praise", "favorable", "approve"]
-                        negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
-                                          "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
-                                          "critique", "condemn", "sarcasm"]
-
-                        if target_lower in ["1", "positive", "pos"]:
-                            is_correct = any(word in pred_lower for word in positive_words)
-                        elif target_lower in ["0", "negative", "neg"]:
-                            is_correct = any(word in pred_lower for word in negative_words)
-
-                    if is_correct:
-                        correct += 1
-                    total += 1
-
-                except Exception as e:
-                    print(f"Error evaluating sample: {{e}}")
-                    continue
-
-            return correct, total
-
-        # STAGE 1: Evaluate with 50 samples first
-        stage1_size = 50
-        stage1_samples_count = min(stage1_size, len(dataset))
-
-        if len(dataset) > stage1_samples_count:
-            stage1_indices = random.sample(range(len(dataset)), stage1_samples_count)
-            stage1_samples = [dataset[i] for i in stage1_indices]
-        else:
-            stage1_samples = list(dataset)[:stage1_samples_count]
-
-        print(f"[Stage 1/2] Evaluating with {{len(stage1_samples)}} samples...")
-        correct, total = evaluate_samples(stage1_samples)
-        stage1_score = (correct / total) if total > 0 else 0.0
-
-        print(f"[Stage 1/2] Score: {{stage1_score:.3f}} ({{correct}}/{{total}})")
-
-        # Early exit if Stage 1 score is below threshold
-        if stage1_score < 0.5:
-            print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 200 API calls)")
-            return {{
-                "combined_score": stage1_score,
-                "accuracy": stage1_score,
-                "correct": correct,
-                "total": total,
-                "stage": "stage1_early_exit"
-            }}
-
-        # STAGE 2: Continue with 200 more samples
-        print(f"[Stage 2/2] Score >= 0.5 - proceeding with 200 more samples...")
-        stage2_size = 200
-        stage2_samples_count = min(stage2_size, max(0, len(dataset) - stage1_samples_count))
-
-        if stage2_samples_count > 0:
-            # Get different samples from Stage 1
-            remaining_indices = list(set(range(len(dataset))) - set(stage1_indices if 'stage1_indices' in locals() else []))
-
-            if len(remaining_indices) >= stage2_samples_count:
-                stage2_indices = random.sample(remaining_indices, stage2_samples_count)
-                stage2_samples = [dataset[i] for i in stage2_indices]
-            else:
-                stage2_samples = [dataset[i] for i in remaining_indices[:stage2_samples_count]]
-
-            correct, total = evaluate_samples(stage2_samples, correct, total)
-            final_score = (correct / total) if total > 0 else stage1_score
-
-            print(f"[Stage 2/2] Final score: {{final_score:.3f}} ({{correct}}/{{total}})")
-            return {{
-                "combined_score": final_score,
-                "accuracy": final_score,
-                "correct": correct,
-                "total": total,
-                "stage": "stage2_complete"
-            }}
-        else:
-            print(f"[Stage 2/2] Not enough samples in dataset for Stage 2")
-            return {{
-                "combined_score": stage1_score,
-                "accuracy": stage1_score,
-                "correct": correct,
-                "total": total,
-                "stage": "stage1_complete"
-            }}
+        correct = 0
+        total = 0
+
+        print(f"Evaluating on {{len(samples)}} samples...")
+
+        for idx, sample in enumerate(samples):
+            try:
+                # Get input and target
+                input_text = sample.get("{input_field}", "")
+                if isinstance(input_text, dict):
+                    input_text = str(input_text)
+
+                target = sample.get("{target_field}", "")
+                if isinstance(target, dict):
+                    target = str(target)
+
+                # Format the prompt
+                formatted_prompt = prompt.replace("{{input}}", str(input_text))
+
+                # Call the model
+                response = client.chat.completions.create(
+                    model="{model}",
+                    messages=[
+                        {{"role": "system", "content": "You are a helpful assistant."}},
+                        {{"role": "user", "content": formatted_prompt}}
+                    ],
+                    temperature=0.0,
+                    max_tokens=500,
+                )
+
+                prediction = response.choices[0].message.content.strip()
+
+                # Smart evaluation - handle both math and text answers
+                target_str = str(target).strip()
+                pred_str = prediction.strip()
+
+                def extract_answer(text):
+                    """Extract answer from text - handles GSM8K format and general text"""
+                    import re
+
+                    # GSM8K format: "#### NUMBER" at the end
+                    if "####" in text:
+                        parts = text.split("####")
+                        if len(parts) > 1:
+                            answer_part = parts[-1].strip()
+                            answer_part = answer_part.replace(',', '')
+                            return answer_part
+
+                    # Try to extract last number from free-form text
+                    numbers = re.findall(r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?', text)
+                    if numbers:
+                        return numbers[-1].replace(',', '')
+
+                    return text
+
+                def is_mathematically_equal(str1, str2):
+                    """Check if two strings represent the same mathematical value"""
+                    try:
+                        num1 = float(str1.replace(',', ''))
+                        num2 = float(str2.replace(',', ''))
+                        return abs(num1 - num2) < 1e-6
+                    except (ValueError, AttributeError):
+                        return str1.lower().strip() == str2.lower().strip()
+
+                # Extract answers
+                target_answer = extract_answer(target_str)
+                pred_answer = extract_answer(pred_str)
+
+                # Check if answers match mathematically or textually
+                is_correct = is_mathematically_equal(target_answer, pred_answer)
+
+                # Fallback: check for semantic equivalents for sentiment analysis
+                if not is_correct:
+                    target_lower = target_answer.lower()
+                    pred_lower = pred_answer.lower()
+
+                    # Sentiment mappings with expanded synonyms
+                    positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
+                                      "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
+                                      "praise", "favorable", "approve"]
+                    negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
+                                      "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
+                                      "critique", "condemn", "sarcasm"]
+
+                    if target_lower in ["1", "positive", "pos"]:
+                        is_correct = any(word in pred_lower for word in positive_words)
+                    elif target_lower in ["0", "negative", "neg"]:
+                        is_correct = any(word in pred_lower for word in negative_words)
+
+                if is_correct:
+                    correct += 1
+                total += 1
+
+                if (idx + 1) % 50 == 0:
+                    print(f" Progress: {{idx + 1}}/{{len(samples)}} - Current accuracy: {{correct/total:.2%}}")
+
+            except Exception as e:
+                print(f"Error evaluating sample {{idx+1}}: {{e}}")
+                continue
+
+        accuracy = (correct / total) if total > 0 else 0.0
+
+        print(f"Final: {{correct}}/{{total}} = {{accuracy:.2%}}")
+
+        return {{
+            "combined_score": accuracy,
+            "accuracy": accuracy,
+            "correct": correct,
+            "total": total
+        }}
 
     except Exception as e:
         print(f"Error in evaluation: {{e}}")
@@ -1028,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     - **Model**: {model}
     - **Initial Eval**: 200 samples
     - **Final Eval**: 200 samples (same samples for fair comparison)
-    - **Evolution**: Staged (50 → 200 if score ≥ 0.5)
+    - **Evolution**: 200 samples per variant (same samples as initial/final)
     - **Iterations**: 10
 
     ### Results
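The configuration block above replaces the staged evaluator (50 samples first, 200 more only when the first pass scores at least 0.5) with a single fixed 200-sample pass per candidate. A rough, illustrative sketch of what that means for the per-candidate API-call budget, assuming one chat-completion call per evaluated sample (this helper is not part of app.py):

```python
def staged_calls(stage1_accuracy: float, stage1_size: int = 50,
                 stage2_size: int = 200, threshold: float = 0.5) -> int:
    """Old scheme: 50 samples first, 200 more only if Stage 1 clears the 0.5 threshold."""
    return stage1_size if stage1_accuracy < threshold else stage1_size + stage2_size


def fixed_calls(num_samples: int = 200) -> int:
    """New scheme: every candidate is scored on the same 200 samples."""
    return num_samples


print(staged_calls(0.3))  # 50  -> weak candidates exited early under the old scheme
print(staged_calls(0.7))  # 250 -> strong candidates cost 50 + 200 calls
print(fixed_calls())      # 200 -> every candidate now costs the same
```

The new scheme gives up the early-exit savings in exchange for scoring every variant on exactly the data the initial and final evaluations use.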
@@ -1176,7 +1121,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
     - **Evaluation strategy**:
       - Initial evaluation: 200 samples
       - Final evaluation: Same 200 samples (fair comparison)
-      - Evolution: Staged (50 → 200 if score ≥ 0.5 to save API calls)
+      - Evolution: Each variant tested on same 200 samples (ensures optimization aligns with test set)
     - Compare initial vs best prompt side-by-side with identical test sets
 
  ### About OpenEvolve:
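The "same 200 samples" guarantee in the new evaluator rests on re-seeding Python's random module before sampling: with a fixed seed and the same dataset length, random.sample returns the identical index list on every call, so the initial evaluation, every evolution candidate, and the final evaluation all see the same examples. A minimal standalone sketch of that property (hypothetical dataset size, not part of app.py):

```python
import random


def sample_indices(dataset_len: int, num_samples: int = 200, seed: int = 42) -> list:
    """Mimic the evaluator's sampling: fix the seed, then draw once with random.sample."""
    random.seed(seed)
    if dataset_len > num_samples:
        return random.sample(range(dataset_len), num_samples)
    return list(range(min(num_samples, dataset_len)))


# Two independent "evaluations" that follow the same recipe pick identical indices.
first_run = sample_indices(5000)   # 5000 is a hypothetical dataset size
second_run = sample_indices(5000)
assert first_run == second_run
```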