Upload app.py
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import os
|
|
| 3 |
import yaml
|
| 4 |
import json
|
| 5 |
import random
|
|
|
|
| 6 |
from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
|
| 7 |
from openai import OpenAI
|
| 8 |
from openevolve import run_evolution
|
|
@@ -204,37 +205,58 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
|
|
| 204 |
|
| 205 |
prediction = response.choices[0].message.content.strip()
|
| 206 |
|
| 207 |
-
# Smart evaluation - handle both
|
| 208 |
-
target_str = str(target).
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
if not is_correct:
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
if is_correct:
|
| 240 |
correct += 1
|
|
@@ -479,7 +501,6 @@ def create_evaluator_file(dataset_name: str, split: str, model: str,
|
|
| 479 |
"""Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
|
| 480 |
evaluator_code = f'''
|
| 481 |
import os
|
| 482 |
-
import re
|
| 483 |
import random
|
| 484 |
from datasets import load_dataset
|
| 485 |
from openai import OpenAI
|
|
@@ -548,37 +569,60 @@ def evaluate(prompt: str) -> dict:
|
|
| 548 |
|
| 549 |
prediction = response.choices[0].message.content.strip()
|
| 550 |
|
| 551 |
-
# Smart evaluation - handle both
|
| 552 |
-
target_str = str(target).
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
if
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
if not is_correct:
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
|
|
|
|
|
|
|
|
|
| 582 |
|
| 583 |
if is_correct:
|
| 584 |
correct += 1
|
|
|
|
| 3 |
import yaml
|
| 4 |
import json
|
| 5 |
import random
|
| 6 |
+
import re
|
| 7 |
from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
|
| 8 |
from openai import OpenAI
|
| 9 |
from openevolve import run_evolution
|
|
|
|
| 205 |
|
| 206 |
prediction = response.choices[0].message.content.strip()
|
| 207 |
|
| 208 |
+
# Smart evaluation - handle both math and text answers
|
| 209 |
+
target_str = str(target).strip()
|
| 210 |
+
pred_str = prediction.strip()
|
| 211 |
+
|
| 212 |
+
def extract_answer(text):
    """Extract the final answer from *text*.

    Handles the GSM8K convention ("#### NUMBER" at the end) as well as
    free-form text.

    Args:
        text: Model output or reference answer, as a string.

    Returns:
        The answer as a string. Thousands separators are removed from
        numeric answers. When there is no "####" marker and no number in
        *text*, *text* is returned unchanged.
    """
    # GSM8K format: "#### NUMBER" at the end
    if "####" in text:
        # Remove comma separators (1,000 -> 1000)
        answer_part = text.split("####")[-1].strip().replace(',', '')
        try:
            float(answer_part)
        except ValueError:
            # Decorated answers such as "$1000" or "18 dollars" would never
            # compare equal to the bare number, so pull the number out.
            match = re.search(r'-?\d+(?:\.\d+)?', answer_part)
            if match:
                return match.group(0)
        # Already numeric, or a purely textual answer: keep it as written.
        return answer_part

    # Try to extract last number from free-form text
    numbers = re.findall(r'-?\d+(?:,\d{3})*(?:\.\d+)?', text)
    if numbers:
        # Return the last number found (usually the final answer)
        return numbers[-1].replace(',', '')

    return text
|
| 230 |
+
|
| 231 |
+
def is_mathematically_equal(str1, str2):
    """Check if two values represent the same mathematical value.

    Both arguments are coerced to ``str`` first, so non-string inputs
    (e.g. an integer label) are compared instead of raising.

    Args:
        str1: First answer.
        str2: Second answer.

    Returns:
        True when both parse as finite numbers that differ by less than
        1e-6, or otherwise when they are equal as case-insensitive,
        whitespace-stripped text.
    """
    import math

    text1 = str(str1)
    text2 = str(str2)

    try:
        # Try to convert both to floats and compare
        num1 = float(text1.replace(',', ''))
        num2 = float(text2.replace(',', ''))
    except ValueError:
        num1 = num2 = None

    if num1 is not None and math.isfinite(num1) and math.isfinite(num2):
        # Use small epsilon for float comparison
        return abs(num1 - num2) < 1e-6

    # Conversion failed, or a value such as "nan"/"inf" parsed as a
    # non-finite float (where the epsilon test is always False): compare
    # as text instead.
    return text1.lower().strip() == text2.lower().strip()
|
| 242 |
+
|
| 243 |
+
# Extract answers
|
| 244 |
+
target_answer = extract_answer(target_str)
|
| 245 |
+
pred_answer = extract_answer(pred_str)
|
| 246 |
+
|
| 247 |
+
# Check if answers match mathematically or textually
|
| 248 |
+
is_correct = is_mathematically_equal(target_answer, pred_answer)
|
| 249 |
+
|
| 250 |
+
# Fallback: check for semantic equivalents for sentiment analysis
|
| 251 |
if not is_correct:
|
| 252 |
+
target_lower = target_answer.lower()
|
| 253 |
+
pred_lower = pred_answer.lower()
|
| 254 |
+
|
| 255 |
+
# Sentiment mappings
|
| 256 |
+
if target_lower in ["1", "positive", "pos"]:
|
| 257 |
+
is_correct = any(word in pred_lower for word in ["positive", "good", "great", "1"])
|
| 258 |
+
elif target_lower in ["0", "negative", "neg"]:
|
| 259 |
+
is_correct = any(word in pred_lower for word in ["negative", "bad", "poor", "0"])
|
| 260 |
|
| 261 |
if is_correct:
|
| 262 |
correct += 1
|
|
|
|
| 501 |
"""Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
|
| 502 |
evaluator_code = f'''
|
| 503 |
import os
|
|
|
|
| 504 |
import random
|
| 505 |
from datasets import load_dataset
|
| 506 |
from openai import OpenAI
|
|
|
|
| 569 |
|
| 570 |
prediction = response.choices[0].message.content.strip()
|
| 571 |
|
| 572 |
+
# Smart evaluation - handle both math and text answers
|
| 573 |
+
target_str = str(target).strip()
|
| 574 |
+
pred_str = prediction.strip()
|
| 575 |
+
|
| 576 |
+
def extract_answer(text):
    """Extract the final answer from *text*.

    Handles the GSM8K convention ("#### NUMBER" at the end) as well as
    free-form text.

    Args:
        text: Model output or reference answer, as a string.

    Returns:
        The answer as a string. Thousands separators are removed from
        numeric answers. When there is no "####" marker and no number in
        *text*, *text* is returned unchanged.
    """
    import re

    # NOTE: this code lives inside an f-string template, so the patterns
    # below use character classes instead of backslash escapes and
    # counted repetition; they read the same before and after rendering.
    # They match ASCII digits only.

    # GSM8K format: "#### NUMBER" at the end
    if "####" in text:
        # Remove comma separators (1,000 -> 1000)
        answer_part = text.split("####")[-1].strip().replace(',', '')
        try:
            float(answer_part)
        except ValueError:
            # Decorated answers such as "$1000" or "18 dollars" would never
            # compare equal to the bare number, so pull the number out.
            match = re.search('-?[0-9]+(?:[.][0-9]+)?', answer_part)
            if match:
                return match.group(0)
        # Already numeric, or a purely textual answer: keep it as written.
        return answer_part

    # Try to extract last number from free-form text
    numbers = re.findall('-?[0-9]+(?:,[0-9][0-9][0-9])*(?:[.][0-9]+)?', text)
    if numbers:
        # Return the last number found (usually the final answer)
        return numbers[-1].replace(',', '')

    return text
|
| 596 |
+
|
| 597 |
+
def is_mathematically_equal(str1, str2):
    """Check if two values represent the same mathematical value.

    Both arguments are coerced to ``str`` first, so non-string inputs
    (e.g. an integer label) are compared instead of raising.

    Args:
        str1: First answer.
        str2: Second answer.

    Returns:
        True when both parse as finite numbers that differ by less than
        1e-6, or otherwise when they are equal as case-insensitive,
        whitespace-stripped text.
    """
    import math

    text1 = str(str1)
    text2 = str(str2)

    try:
        # Try to convert both to floats and compare
        num1 = float(text1.replace(',', ''))
        num2 = float(text2.replace(',', ''))
    except ValueError:
        num1 = num2 = None

    if num1 is not None and math.isfinite(num1) and math.isfinite(num2):
        # Use small epsilon for float comparison
        return abs(num1 - num2) < 1e-6

    # Conversion failed, or a value such as "nan"/"inf" parsed as a
    # non-finite float (where the epsilon test is always False): compare
    # as text instead.
    return text1.lower().strip() == text2.lower().strip()
|
| 608 |
+
|
| 609 |
+
# Extract answers
|
| 610 |
+
target_answer = extract_answer(target_str)
|
| 611 |
+
pred_answer = extract_answer(pred_str)
|
| 612 |
+
|
| 613 |
+
# Check if answers match mathematically or textually
|
| 614 |
+
is_correct = is_mathematically_equal(target_answer, pred_answer)
|
| 615 |
+
|
| 616 |
+
# Fallback: check for semantic equivalents for sentiment analysis
|
| 617 |
if not is_correct:
|
| 618 |
+
target_lower = target_answer.lower()
|
| 619 |
+
pred_lower = pred_answer.lower()
|
| 620 |
+
|
| 621 |
+
# Sentiment mappings
|
| 622 |
+
if target_lower in ["1", "positive", "pos"]:
|
| 623 |
+
is_correct = any(word in pred_lower for word in ["positive", "good", "great", "1"])
|
| 624 |
+
elif target_lower in ["0", "negative", "neg"]:
|
| 625 |
+
is_correct = any(word in pred_lower for word in ["negative", "bad", "poor", "0"])
|
| 626 |
|
| 627 |
if is_correct:
|
| 628 |
correct += 1
|