Upload app.py
app.py
CHANGED
@@ -529,7 +529,7 @@ def parse_evolution_history(output_dir: str) -> str:
 
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                           input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve
+    """Create an evaluator.py file for OpenEvolve that uses fixed 200 samples."""
     evaluator_code = f'''
 import os
 import random
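The generated source is accumulated in the `evaluator_code` f-string; how app.py writes it to disk is outside this diff, so the following is only a sketch under assumed names (OpenEvolve evaluators are conventionally saved as evaluator.py in the working directory):

# Sketch only: a hypothetical helper for persisting the generated template.
# The real write step in app.py may use a different path or filename.
import os

def write_evaluator(work_dir: str, evaluator_code: str) -> str:
    path = os.path.join(work_dir, "evaluator.py")  # assumed filename
    with open(path, "w") as f:
        f.write(evaluator_code)
    return path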
@@ -538,29 +538,20 @@ from openai import OpenAI
 
 def evaluate(prompt: str) -> dict:
     """
-    Evaluate a prompt using
-
-    - If accuracy < 0.5, return early (no point wasting 200 more samples)
-
-    Stage 2: Evaluate with 200 more samples (total 250)
-    - Combine results for final score
-
-    Returns dict with combined_score (0-1), accuracy, correct, and total.
+    Evaluate a prompt using 200 fixed samples (same as initial/final eval).
+
+    This ensures evolution optimizes for the SAME test set we measure on.
+    No staging - always evaluates all 200 samples for consistency.
     """
     try:
         # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
         random.seed(42)
 
         # Load dataset
-        # Try loading with just dataset name first
         try:
             dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
         except ValueError as e:
-            # If it fails with config error, try common configs
             if "config" in str(e).lower() or "Config name is missing" in str(e):
-                # Try common configs based on dataset name
                 default_config = "main"
                 if "{dataset_name}".lower() == "glue":
                     default_config = "sst2"
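Per the new docstring, evaluate() always scores the same 200 examples and returns combined_score, accuracy, correct, and total. A hedged smoke test of the generated module might look like this (the file location, the seed prompt, and the presence of OPENAI_API_KEY are assumptions, not part of the diff):

# Hypothetical smoke test: load the generated evaluator and score the seed prompt.
import importlib.util

spec = importlib.util.spec_from_file_location("evaluator", "work_dir/evaluator.py")
evaluator = importlib.util.module_from_spec(spec)
spec.loader.exec_module(evaluator)

result = evaluator.evaluate("Answer the question step by step.\n\n{input}")
print(result["combined_score"], result["correct"], result["total"])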
@@ -568,6 +559,16 @@ def evaluate(prompt: str) -> dict:
             else:
                 raise
 
+        # Sample 200 samples with seed 42 (SAME as initial/final eval)
+        num_samples = 200
+        if len(dataset) > num_samples:
+            # Use SAME sampling logic as initial/final eval
+            indices = random.sample(range(len(dataset)), num_samples)
+            samples = [dataset[i] for i in indices]
+        else:
+            indices = list(range(min(num_samples, len(dataset))))
+            samples = list(dataset)[:num_samples]
+
         # Initialize OpenAI client
         api_key = os.environ.get("OPENAI_API_KEY")
         client = OpenAI(
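The added block leans on random.seed(42) making random.sample deterministic, which is what keeps evolution, initial, and final evaluations on exactly the same 200 rows. A minimal self-contained check of that property (the dataset size here is an arbitrary example):

# Reseeding before sampling reproduces identical indices on every call,
# so each invocation of evaluate() scores the same subset of the dataset.
import random

def fixed_indices(dataset_len: int, num_samples: int = 200) -> list:
    random.seed(42)
    if dataset_len > num_samples:
        return random.sample(range(dataset_len), num_samples)
    return list(range(dataset_len))

assert fixed_indices(10000) == fixed_indices(10000)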
@@ -575,172 +576,116 @@ def evaluate(prompt: str) -> dict:
             api_key=api_key,
         )
 
-        correct = correct_so_far
-        total = total_so_far
-
-            try:
-                # Get input and target
-                input_text = sample.get("{input_field}", "")
-                if isinstance(input_text, dict):
-                    input_text = str(input_text)
-
-                target = sample.get("{target_field}", "")
-                if isinstance(target, dict):
-                    target = str(target)
-
-                # Format the prompt
-                formatted_prompt = prompt.replace("{{input}}", str(input_text))
-
-                # Call the model
-                response = client.chat.completions.create(
-                    model="{model}",
-                    messages=[
-                        {{"role": "system", "content": "You are a helpful assistant."}},
-                        {{"role": "user", "content": formatted_prompt}}
-                    ],
-                    temperature=0.0,
-                    max_tokens=500,
-                )
-
-                prediction = response.choices[0].message.content.strip()
-
-                # Smart evaluation - handle both math and text answers
-                target_str = str(target).strip()
-                pred_str = prediction.strip()
-
-                def extract_answer(text):
-                    """Extract answer from text - handles GSM8K format and general text"""
-                    import re
-
-                    # GSM8K format: "#### NUMBER" at the end
-                    if "####" in text:
-                        parts = text.split("####")
-                        if len(parts) > 1:
-                            answer_part = parts[-1].strip()
-                            # Remove comma separators (1,000 -> 1000)
-                            answer_part = answer_part.replace(',', '')
-                            return answer_part
-
-                    # Try to extract last number from free-form text
-                    numbers = re.findall(r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?', text)
-                    if numbers:
-                        # Return the last number found (usually the final answer)
-                        return numbers[-1].replace(',', '')
-
-                    return text
-
-                def is_mathematically_equal(str1, str2):
-                    """Check if two strings represent the same mathematical value"""
-                    try:
-                        # Try to convert both to floats and compare
-                        num1 = float(str1.replace(',', ''))
-                        num2 = float(str2.replace(',', ''))
-                        # Use small epsilon for float comparison
-                        return abs(num1 - num2) < 1e-6
-                    except (ValueError, AttributeError):
-                        # If conversion fails, do string comparison
-                        return str1.lower().strip() == str2.lower().strip()
-
-                # Extract answers
-                target_answer = extract_answer(target_str)
-                pred_answer = extract_answer(pred_str)
-
-                # Check if answers match mathematically or textually
-                is_correct = is_mathematically_equal(target_answer, pred_answer)
-
-                # Fallback: check for semantic equivalents for sentiment analysis
-                if not is_correct:
-                    target_lower = target_answer.lower()
-                    pred_lower = pred_answer.lower()
-
-                    # Sentiment mappings with expanded synonyms
-                    positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
-                                      "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
-                                      "praise", "favorable", "approve"]
-                    negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
-                                      "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
-                                      "critique", "condemn", "sarcasm"]
-
-                    if target_lower in ["1", "positive", "pos"]:
-                        is_correct = any(word in pred_lower for word in positive_words)
-                    elif target_lower in ["0", "negative", "neg"]:
-                        is_correct = any(word in pred_lower for word in negative_words)
-
-                if is_correct:
-                    correct += 1
-                total += 1
-
-        stage1_samples_count = min(stage1_size, len(dataset))
-
+        correct = 0
+        total = 0
+
+        print(f"Evaluating on {{len(samples)}} samples...")
+
+        for idx, sample in enumerate(samples):
+            try:
+                # Get input and target
+                input_text = sample.get("{input_field}", "")
+                if isinstance(input_text, dict):
+                    input_text = str(input_text)
+
+                target = sample.get("{target_field}", "")
+                if isinstance(target, dict):
+                    target = str(target)
+
+                # Format the prompt
+                formatted_prompt = prompt.replace("{{input}}", str(input_text))
+
+                # Call the model
+                response = client.chat.completions.create(
+                    model="{model}",
+                    messages=[
+                        {{"role": "system", "content": "You are a helpful assistant."}},
+                        {{"role": "user", "content": formatted_prompt}}
+                    ],
+                    temperature=0.0,
+                    max_tokens=500,
+                )
+
+                prediction = response.choices[0].message.content.strip()
+
+                # Smart evaluation - handle both math and text answers
+                target_str = str(target).strip()
+                pred_str = prediction.strip()
+
+                def extract_answer(text):
+                    """Extract answer from text - handles GSM8K format and general text"""
+                    import re
+
+                    # GSM8K format: "#### NUMBER" at the end
+                    if "####" in text:
+                        parts = text.split("####")
+                        if len(parts) > 1:
+                            answer_part = parts[-1].strip()
+                            answer_part = answer_part.replace(',', '')
+                            return answer_part
+
+                    # Try to extract last number from free-form text
+                    numbers = re.findall(r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?', text)
+                    if numbers:
+                        return numbers[-1].replace(',', '')
+
+                    return text
+
+                def is_mathematically_equal(str1, str2):
+                    """Check if two strings represent the same mathematical value"""
+                    try:
+                        num1 = float(str1.replace(',', ''))
+                        num2 = float(str2.replace(',', ''))
+                        return abs(num1 - num2) < 1e-6
+                    except (ValueError, AttributeError):
+                        return str1.lower().strip() == str2.lower().strip()
+
+                # Extract answers
+                target_answer = extract_answer(target_str)
+                pred_answer = extract_answer(pred_str)
+
+                # Check if answers match mathematically or textually
+                is_correct = is_mathematically_equal(target_answer, pred_answer)
+
+                # Fallback: check for semantic equivalents for sentiment analysis
+                if not is_correct:
+                    target_lower = target_answer.lower()
+                    pred_lower = pred_answer.lower()
+
+                    # Sentiment mappings with expanded synonyms
+                    positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
+                                      "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
+                                      "praise", "favorable", "approve"]
+                    negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
+                                      "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
+                                      "critique", "condemn", "sarcasm"]
+
+                    if target_lower in ["1", "positive", "pos"]:
+                        is_correct = any(word in pred_lower for word in positive_words)
+                    elif target_lower in ["0", "negative", "neg"]:
+                        is_correct = any(word in pred_lower for word in negative_words)
+
+                if is_correct:
+                    correct += 1
+                total += 1
+
+                if (idx + 1) % 50 == 0:
+                    print(f"  Progress: {{idx + 1}}/{{len(samples)}} - Current accuracy: {{correct/total:.2%}}")
+
+            except Exception as e:
+                print(f"Error evaluating sample {{idx+1}}: {{e}}")
+                continue
+
+        accuracy = (correct / total) if total > 0 else 0.0
+
+        print(f"Final: {{correct}}/{{total}} = {{accuracy:.2%}}")
+
+        return {{
+            "combined_score": accuracy,
+            "accuracy": accuracy,
+            "correct": correct,
+            "total": total
+        }}
 
     except Exception as e:
         print(f"Error in evaluation: {{e}}")
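The grading helpers added in this hunk first extract a final answer (handling the GSM8K "#### NUMBER" convention and thousands separators) and then compare numerically with a small epsilon, falling back to string and sentiment matching. A standalone illustration of that matching logic, rewritten outside the f-string so braces are single:

# Illustration of the answer-matching behaviour introduced above.
import re

def extract_answer(text):
    # GSM8K-style answers follow "####"; strip thousands separators.
    if "####" in text:
        return text.split("####")[-1].strip().replace(",", "")
    numbers = re.findall(r"-?\d+(?:,\d{3})*(?:\.\d+)?", text)
    return numbers[-1].replace(",", "") if numbers else text

def is_mathematically_equal(str1, str2):
    try:
        return abs(float(str1.replace(",", "")) - float(str2.replace(",", ""))) < 1e-6
    except (ValueError, AttributeError):
        return str1.lower().strip() == str2.lower().strip()

target = "She sold 48 + 24 = 72 clips. #### 72"
prediction = "Adding April and May gives 72 clips, so the answer is 72."
assert is_mathematically_equal(extract_answer(target), extract_answer(prediction))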
@@ -1028,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 - **Model**: {model}
 - **Initial Eval**: 200 samples
 - **Final Eval**: 200 samples (same samples for fair comparison)
-- **Evolution**:
+- **Evolution**: 200 samples per variant (same samples as initial/final)
 - **Iterations**: 10
 
 ### Results
@@ -1176,7 +1121,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 - **Evaluation strategy**:
   - Initial evaluation: 200 samples
   - Final evaluation: Same 200 samples (fair comparison)
-  - Evolution:
+  - Evolution: Each variant tested on same 200 samples (ensures optimization aligns with test set)
   - Compare initial vs best prompt side-by-side with identical test sets
 
 ### About OpenEvolve: