codelion committed on
Commit
ea59941
·
verified ·
1 Parent(s): 2f781ff

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -61
app.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import yaml
4
  import json
5
  import random
 
6
  from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
7
  from openai import OpenAI
8
  from openevolve import run_evolution
@@ -204,37 +205,58 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
204
 
205
  prediction = response.choices[0].message.content.strip()
206
 
207
- # Smart evaluation - handle both exact match and semantic match
208
- target_str = str(target).lower().strip()
209
- pred_lower = prediction.lower()
210
-
211
- # Extract numeric answer from GSM8K format (e.g., "#### 42")
212
- def extract_numeric_answer(text):
213
- """Extract final numeric answer from GSM8K format or general text"""
214
- # Try GSM8K format first: #### NUMBER
215
- match = re.search(r'####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
216
- if match:
217
- return match.group(1).replace(',', '').strip()
218
- # Otherwise try to find any number in the text
219
- match = re.search(r'(-?\d+(?:,\d{3})*(?:\.\d+)?)', text)
220
- if match:
221
- return match.group(1).replace(',', '').strip()
222
- return text.strip()
223
-
224
- # Extract numeric answers for comparison
225
- target_numeric = extract_numeric_answer(str(target))
226
- pred_numeric = extract_numeric_answer(prediction)
227
-
228
- # Check exact match first
229
- is_correct = target_str in pred_lower or pred_numeric == target_numeric
230
-
231
- # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  if not is_correct:
233
- # Common sentiment mappings
234
- if target_str in ["1", "positive", "pos"]:
235
- is_correct = any(word in pred_lower for word in ["positive", "good", "great"])
236
- elif target_str in ["0", "negative", "neg"]:
237
- is_correct = any(word in pred_lower for word in ["negative", "bad", "poor"])
 
 
 
238
 
239
  if is_correct:
240
  correct += 1
@@ -479,7 +501,6 @@ def create_evaluator_file(dataset_name: str, split: str, model: str,
479
  """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
480
  evaluator_code = f'''
481
  import os
482
- import re
483
  import random
484
  from datasets import load_dataset
485
  from openai import OpenAI
@@ -548,37 +569,60 @@ def evaluate(prompt: str) -> dict:
548
 
549
  prediction = response.choices[0].message.content.strip()
550
 
551
- # Smart evaluation - handle both exact match and semantic match
552
- target_str = str(target).lower().strip()
553
- pred_lower = prediction.lower()
554
-
555
- # Extract numeric answer from GSM8K format (e.g., "#### 42")
556
- def extract_numeric_answer(text):
557
- """Extract final numeric answer from GSM8K format or general text"""
558
- # Try GSM8K format first: #### NUMBER
559
- match = re.search(r'####\\s*(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
560
- if match:
561
- return match.group(1).replace(',', '').strip()
562
- # Otherwise try to find any number in the text
563
- match = re.search(r'(-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?)', text)
564
- if match:
565
- return match.group(1).replace(',', '').strip()
566
- return text.strip()
567
-
568
- # Extract numeric answers for comparison
569
- target_numeric = extract_numeric_answer(str(target))
570
- pred_numeric = extract_numeric_answer(prediction)
571
-
572
- # Check exact match first
573
- is_correct = target_str in pred_lower or pred_numeric == target_numeric
574
-
575
- # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  if not is_correct:
577
- # Common sentiment mappings
578
- if target_str in ["1", "positive", "pos"]:
579
- is_correct = any(word in pred_lower for word in ["positive", "good", "great"])
580
- elif target_str in ["0", "negative", "neg"]:
581
- is_correct = any(word in pred_lower for word in ["negative", "bad", "poor"])
 
 
 
582
 
583
  if is_correct:
584
  correct += 1
 
3
  import yaml
4
  import json
5
  import random
6
+ import re
7
  from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
8
  from openai import OpenAI
9
  from openevolve import run_evolution
 
205
 
206
  prediction = response.choices[0].message.content.strip()
207
 
208
+ # Smart evaluation - handle both math and text answers
209
+ target_str = str(target).strip()
210
+ pred_str = prediction.strip()
211
+
212
+ def extract_answer(text):
213
+ """Extract answer from text - handles GSM8K format and general text"""
214
+ # GSM8K format: "#### NUMBER" at the end
215
+ if "####" in text:
216
+ parts = text.split("####")
217
+ if len(parts) > 1:
218
+ answer_part = parts[-1].strip()
219
+ # Remove comma separators (1,000 -> 1000)
220
+ answer_part = answer_part.replace(',', '')
221
+ return answer_part
222
+
223
+ # Try to extract last number from free-form text
224
+ numbers = re.findall(r'-?\d+(?:,\d{3})*(?:\.\d+)?', text)
225
+ if numbers:
226
+ # Return the last number found (usually the final answer)
227
+ return numbers[-1].replace(',', '')
228
+
229
+ return text
230
+
231
+ def is_mathematically_equal(str1, str2):
232
+ """Check if two strings represent the same mathematical value"""
233
+ try:
234
+ # Try to convert both to floats and compare
235
+ num1 = float(str1.replace(',', ''))
236
+ num2 = float(str2.replace(',', ''))
237
+ # Use small epsilon for float comparison
238
+ return abs(num1 - num2) < 1e-6
239
+ except (ValueError, AttributeError):
240
+ # If conversion fails, do string comparison
241
+ return str1.lower().strip() == str2.lower().strip()
242
+
243
+ # Extract answers
244
+ target_answer = extract_answer(target_str)
245
+ pred_answer = extract_answer(pred_str)
246
+
247
+ # Check if answers match mathematically or textually
248
+ is_correct = is_mathematically_equal(target_answer, pred_answer)
249
+
250
+ # Fallback: check for semantic equivalents for sentiment analysis
251
  if not is_correct:
252
+ target_lower = target_answer.lower()
253
+ pred_lower = pred_answer.lower()
254
+
255
+ # Sentiment mappings
256
+ if target_lower in ["1", "positive", "pos"]:
257
+ is_correct = any(word in pred_lower for word in ["positive", "good", "great", "1"])
258
+ elif target_lower in ["0", "negative", "neg"]:
259
+ is_correct = any(word in pred_lower for word in ["negative", "bad", "poor", "0"])
260
 
261
  if is_correct:
262
  correct += 1
 
501
  """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
502
  evaluator_code = f'''
503
  import os
 
504
  import random
505
  from datasets import load_dataset
506
  from openai import OpenAI
 
569
 
570
  prediction = response.choices[0].message.content.strip()
571
 
572
+ # Smart evaluation - handle both math and text answers
573
+ target_str = str(target).strip()
574
+ pred_str = prediction.strip()
575
+
576
+ def extract_answer(text):
577
+ """Extract answer from text - handles GSM8K format and general text"""
578
+ import re
579
+
580
+ # GSM8K format: "#### NUMBER" at the end
581
+ if "####" in text:
582
+ parts = text.split("####")
583
+ if len(parts) > 1:
584
+ answer_part = parts[-1].strip()
585
+ # Remove comma separators (1,000 -> 1000)
586
+ answer_part = answer_part.replace(',', '')
587
+ return answer_part
588
+
589
+ # Try to extract last number from free-form text
590
+ numbers = re.findall(r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?', text)
591
+ if numbers:
592
+ # Return the last number found (usually the final answer)
593
+ return numbers[-1].replace(',', '')
594
+
595
+ return text
596
+
597
+ def is_mathematically_equal(str1, str2):
598
+ """Check if two strings represent the same mathematical value"""
599
+ try:
600
+ # Try to convert both to floats and compare
601
+ num1 = float(str1.replace(',', ''))
602
+ num2 = float(str2.replace(',', ''))
603
+ # Use small epsilon for float comparison
604
+ return abs(num1 - num2) < 1e-6
605
+ except (ValueError, AttributeError):
606
+ # If conversion fails, do string comparison
607
+ return str1.lower().strip() == str2.lower().strip()
608
+
609
+ # Extract answers
610
+ target_answer = extract_answer(target_str)
611
+ pred_answer = extract_answer(pred_str)
612
+
613
+ # Check if answers match mathematically or textually
614
+ is_correct = is_mathematically_equal(target_answer, pred_answer)
615
+
616
+ # Fallback: check for semantic equivalents for sentiment analysis
617
  if not is_correct:
618
+ target_lower = target_answer.lower()
619
+ pred_lower = pred_answer.lower()
620
+
621
+ # Sentiment mappings
622
+ if target_lower in ["1", "positive", "pos"]:
623
+ is_correct = any(word in pred_lower for word in ["positive", "good", "great", "1"])
624
+ elif target_lower in ["0", "negative", "neg"]:
625
+ is_correct = any(word in pred_lower for word in ["negative", "bad", "poor", "0"])
626
 
627
  if is_correct:
628
  correct += 1