Spaces:

algorithmicsuperintelligence
/

prompt-optimizer

Running

App Files Files Community

codelion committed 25 days ago

Commit

ea59941

verified ·

1 Parent(s): 2f781ff

Upload app.py

Browse files

Files changed (1) hide show

app.py +105 -61

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import os
 import yaml
 import json
 import random
 from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
 from openai import OpenAI
 from openevolve import run_evolution
@@ -204,37 +205,58 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
                 prediction = response.choices[0].message.content.strip()
-                # Smart evaluation - handle both exact match and semantic match
-                target_str = str(target).lower().strip()
-                pred_lower = prediction.lower()
-                # Extract numeric answer from GSM8K format (e.g., "#### 42")
-                def extract_numeric_answer(text):
-                    """Extract final numeric answer from GSM8K format or general text"""
-                    # Try GSM8K format first: #### NUMBER
-                    match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
-                    if match:
-                        return match.group(1).replace(',', '').strip()
-                    # Otherwise try to find any number in the text
-                    match = re.search(r'(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
-                    if match:
-                        return match.group(1).replace(',', '').strip()
-                    return text.strip()
-                # Extract numeric answers for comparison
-                target_numeric = extract_numeric_answer(str(target))
-                pred_numeric = extract_numeric_answer(prediction)
-                # Check exact match first
-                is_correct = target_str in pred_lower or pred_numeric == target_numeric
-                # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
                 if not is_correct:
-                    # Common sentiment mappings
-                    if target_str in ["1", "positive", "pos"]:
-                        is_correct = any(word in pred_lower for word in ["positive", "good", "great"])
-                    elif target_str in ["0", "negative", "neg"]:
-                        is_correct = any(word in pred_lower for word in ["negative", "bad", "poor"])
                 if is_correct:
                     correct += 1
@@ -479,7 +501,6 @@ def create_evaluator_file(dataset_name: str, split: str, model: str,
     """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
     evaluator_code = f'''
 import os
-import re
 import random
 from datasets import load_dataset
 from openai import OpenAI
@@ -548,37 +569,60 @@ def evaluate(prompt: str) -> dict:
                     prediction = response.choices[0].message.content.strip()
-                    # Smart evaluation - handle both exact match and semantic match
-                    target_str = str(target).lower().strip()
-                    pred_lower = prediction.lower()
-                    # Extract numeric answer from GSM8K format (e.g., "#### 42")
-                    def extract_numeric_answer(text):
-                        """Extract final numeric answer from GSM8K format or general text"""
-                        # Try GSM8K format first: #### NUMBER
-                        match = re.search(r'####\\s*(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
-                        if match:
-                            return match.group(1).replace(',', '').strip()
-                        # Otherwise try to find any number in the text
-                        match = re.search(r'(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
-                        if match:
-                            return match.group(1).replace(',', '').strip()
-                        return text.strip()
-                    # Extract numeric answers for comparison
-                    target_numeric = extract_numeric_answer(str(target))
-                    pred_numeric = extract_numeric_answer(prediction)
-                    # Check exact match first
-                    is_correct = target_str in pred_lower or pred_numeric == target_numeric
-                    # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
                     if not is_correct:
-                        # Common sentiment mappings
-                        if target_str in ["1", "positive", "pos"]:
-                            is_correct = any(word in pred_lower for word in ["positive", "good", "great"])
-                        elif target_str in ["0", "negative", "neg"]:
-                            is_correct = any(word in pred_lower for word in ["negative", "bad", "poor"])
                     if is_correct:
                         correct += 1

 import yaml
 import json
 import random
+import re
 from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
 from openai import OpenAI
 from openevolve import run_evolution
                 prediction = response.choices[0].message.content.strip()
+                # Smart evaluation - handle both math and text answers
+                target_str = str(target).strip()
+                pred_str = prediction.strip()
+                def extract_answer(text):
+                    """Pull the final answer out of a reference or model response.
+                    Handles the GSM8K convention ("... #### 1,234") as well as
+                    free-form text, where the last number is taken as the answer.
+                    Thousands separators are removed from whatever is returned.
+                    Text containing no number at all is returned unchanged.
+                    """
+                    number_pattern = r'-?\d+(?:,\d{3})*(?:\.\d+)?'
+                    if "####" in text:
+                        # Only the text after the LAST marker holds the answer.
+                        answer_part = text.split("####")[-1].strip()
+                        # Responses often append units or prose ("#### 42 dollars",
+                        # "#### $18."); keep just the number so it still parses.
+                        match = re.search(number_pattern, answer_part)
+                        if match:
+                            return match.group(0).replace(',', '')
+                        # Non-numeric answer after the marker: return it as text.
+                        return answer_part.replace(',', '')
+                    # Free-form text: the last number is usually the final answer.
+                    numbers = re.findall(number_pattern, text)
+                    if numbers:
+                        return numbers[-1].replace(',', '')
+                    return text
+                def is_mathematically_equal(str1, str2):
+                    """Return True when both answers denote the same value.
+                    Values that parse as numbers (commas ignored) are compared
+                    with an absolute tolerance of 1e-6; anything else is compared
+                    as case-insensitive, whitespace-trimmed text.
+                    """
+                    # Coerce first so non-string labels (e.g. an int class id) are
+                    # compared instead of raising from inside the text fallback.
+                    left, right = str(str1), str(str2)
+                    try:
+                        num1 = float(left.replace(',', ''))
+                        num2 = float(right.replace(',', ''))
+                    except ValueError:
+                        # At least one side is not numeric: compare as text.
+                        return left.lower().strip() == right.lower().strip()
+                    # The equality test covers matching infinities, whose
+                    # difference is NaN and would fail the tolerance check.
+                    return num1 == num2 or abs(num1 - num2) < 1e-6
+                # Extract answers
+                target_answer = extract_answer(target_str)
+                pred_answer = extract_answer(pred_str)
+                # Check if answers match mathematically or textually
+                is_correct = is_mathematically_equal(target_answer, pred_answer)
+                # Fallback: accept sentiment words/labels that mean the same class
                 if not is_correct:
+                    target_lower = target_answer.lower()
+                    pred_lower = pred_answer.lower()
+                    # Sentiment mappings. The digit labels must match as a whole
+                    # value, not as a substring: "1" is contained in "10" and "21",
+                    # and "0" in "10", which used to count as correct.
+                    if target_lower in ["1", "positive", "pos"]:
+                        is_correct = is_mathematically_equal(pred_lower, "1") or any(
+                            word in pred_lower for word in ["positive", "good", "great"])
+                    elif target_lower in ["0", "negative", "neg"]:
+                        is_correct = is_mathematically_equal(pred_lower, "0") or any(
+                            word in pred_lower for word in ["negative", "bad", "poor"])
                 if is_correct:
                     correct += 1
     """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
     evaluator_code = f'''
 import os
 import random
 from datasets import load_dataset
 from openai import OpenAI
                     prediction = response.choices[0].message.content.strip()
+                    # Smart evaluation - handle both math and text answers
+                    target_str = str(target).strip()
+                    pred_str = prediction.strip()
+                    def extract_answer(text):
+                        """Pull the final answer out of a reference or model response.
+                        Handles the GSM8K convention ("... #### 1,234") as well as
+                        free-form text, where the last number is taken as the answer.
+                        Thousands separators are removed from whatever is returned.
+                        Text containing no number at all is returned unchanged.
+                        """
+                        import re
+                        number_pattern = r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?'
+                        if "####" in text:
+                            # Only the text after the LAST marker holds the answer.
+                            answer_part = text.split("####")[-1].strip()
+                            # Responses often append units or prose ("#### 42 dollars",
+                            # "#### $18."); keep just the number so it still parses.
+                            match = re.search(number_pattern, answer_part)
+                            if match:
+                                return match.group(0).replace(',', '')
+                            # Non-numeric answer after the marker: return it as text.
+                            return answer_part.replace(',', '')
+                        # Free-form text: the last number is usually the final answer.
+                        numbers = re.findall(number_pattern, text)
+                        if numbers:
+                            return numbers[-1].replace(',', '')
+                        return text
+                    def is_mathematically_equal(str1, str2):
+                        """Return True when both answers denote the same value.
+                        Values that parse as numbers (commas ignored) are compared
+                        with an absolute tolerance of 1e-6; anything else is compared
+                        as case-insensitive, whitespace-trimmed text.
+                        """
+                        # Coerce first so non-string labels (e.g. an int class id) are
+                        # compared instead of raising from inside the text fallback.
+                        left, right = str(str1), str(str2)
+                        try:
+                            num1 = float(left.replace(',', ''))
+                            num2 = float(right.replace(',', ''))
+                        except ValueError:
+                            # At least one side is not numeric: compare as text.
+                            return left.lower().strip() == right.lower().strip()
+                        # The equality test covers matching infinities, whose
+                        # difference is NaN and would fail the tolerance check.
+                        return num1 == num2 or abs(num1 - num2) < 1e-6
+                    # Extract answers
+                    target_answer = extract_answer(target_str)
+                    pred_answer = extract_answer(pred_str)
+                    # Check if answers match mathematically or textually
+                    is_correct = is_mathematically_equal(target_answer, pred_answer)
+                    # Fallback: accept sentiment words/labels that mean the same class
                     if not is_correct:
+                        target_lower = target_answer.lower()
+                        pred_lower = pred_answer.lower()
+                        # Sentiment mappings. The digit labels must match as a whole
+                        # value, not as a substring: "1" is contained in "10" and "21",
+                        # and "0" in "10", which used to count as correct.
+                        if target_lower in ["1", "positive", "pos"]:
+                            is_correct = is_mathematically_equal(pred_lower, "1") or any(
+                                word in pred_lower for word in ["positive", "good", "great"])
+                        elif target_lower in ["0", "negative", "neg"]:
+                            is_correct = is_mathematically_equal(pred_lower, "0") or any(
+                                word in pred_lower for word in ["negative", "bad", "poor"])
                     if is_correct:
                         correct += 1