Upload app.py
Browse files
app.py
CHANGED
|
@@ -226,30 +226,22 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
|
|
| 226 |
max_tokens=500,
|
| 227 |
)
|
| 228 |
|
| 229 |
-
prediction = response.choices[0].message.content.strip()
|
| 230 |
|
| 231 |
# IMDB labels: 0 = negative, 1 = positive
|
| 232 |
true_label = int(target) # 0 or 1
|
| 233 |
|
| 234 |
-
#
|
| 235 |
-
|
|
|
|
| 236 |
|
| 237 |
-
#
|
| 238 |
-
|
| 239 |
-
("this is positive" in pred_start) or \
|
| 240 |
-
("sentiment: positive" in pred_start)
|
| 241 |
-
|
| 242 |
-
has_negative = ("negative" in pred_start and "sentiment" in pred_start) or \
|
| 243 |
-
("this is negative" in pred_start) or \
|
| 244 |
-
("sentiment: negative" in pred_start)
|
| 245 |
-
|
| 246 |
-
# Prediction must be unambiguous
|
| 247 |
-
if has_positive and not has_negative:
|
| 248 |
predicted_label = 1
|
| 249 |
-
elif
|
| 250 |
predicted_label = 0
|
| 251 |
else:
|
| 252 |
-
#
|
| 253 |
predicted_label = -1
|
| 254 |
|
| 255 |
is_correct = (predicted_label == true_label)
|
|
@@ -572,30 +564,22 @@ def evaluate(prompt: str) -> dict:
|
|
| 572 |
max_tokens=500,
|
| 573 |
)
|
| 574 |
|
| 575 |
-
prediction = response.choices[0].message.content.strip()
|
| 576 |
|
| 577 |
# IMDB labels: 0 = negative, 1 = positive
|
| 578 |
true_label = int(target) # 0 or 1
|
| 579 |
|
| 580 |
-
#
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
# Look for clear positive/negative indicators
|
| 584 |
-
has_positive = ("positive" in pred_start and "sentiment" in pred_start) or \
|
| 585 |
-
("this is positive" in pred_start) or \
|
| 586 |
-
("sentiment: positive" in pred_start)
|
| 587 |
-
|
| 588 |
-
has_negative = ("negative" in pred_start and "sentiment" in pred_start) or \
|
| 589 |
-
("this is negative" in pred_start) or \
|
| 590 |
-
("sentiment: negative" in pred_start)
|
| 591 |
|
| 592 |
-
#
|
| 593 |
-
if
|
| 594 |
predicted_label = 1
|
| 595 |
-
elif
|
| 596 |
predicted_label = 0
|
| 597 |
else:
|
| 598 |
-
#
|
| 599 |
predicted_label = -1
|
| 600 |
|
| 601 |
is_correct = (predicted_label == true_label)
|
|
|
|
| 226 |
max_tokens=500,
|
| 227 |
)
|
| 228 |
|
| 229 |
+
prediction = response.choices[0].message.content.strip()
|
| 230 |
|
| 231 |
# IMDB labels: 0 = negative, 1 = positive
|
| 232 |
true_label = int(target) # 0 or 1
|
| 233 |
|
| 234 |
+
# STRICT FORMAT REQUIREMENT: Must start with "Sentiment: positive" or "Sentiment: negative" (matched case-insensitively; the space after the colon is optional)
|
| 235 |
+
# This teaches evolution to add proper format instructions
|
| 236 |
+
pred_lower = prediction.lower()
|
| 237 |
|
| 238 |
+
# Check if the stripped, lowercased response starts with the expected prefix (with or without a single space after the colon)
|
| 239 |
+
if pred_lower.startswith("sentiment: positive") or pred_lower.startswith("sentiment:positive"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
predicted_label = 1
|
| 241 |
+
elif pred_lower.startswith("sentiment: negative") or pred_lower.startswith("sentiment:negative"):
|
| 242 |
predicted_label = 0
|
| 243 |
else:
|
| 244 |
+
# Wrong format = incorrect (even if sentiment is mentioned elsewhere)
|
| 245 |
predicted_label = -1
|
| 246 |
|
| 247 |
is_correct = (predicted_label == true_label)
|
|
|
|
| 564 |
max_tokens=500,
|
| 565 |
)
|
| 566 |
|
| 567 |
+
prediction = response.choices[0].message.content.strip()
|
| 568 |
|
| 569 |
# IMDB labels: 0 = negative, 1 = positive
|
| 570 |
true_label = int(target) # 0 or 1
|
| 571 |
|
| 572 |
+
# STRICT FORMAT REQUIREMENT: Must start with "Sentiment: positive" or "Sentiment: negative" (matched case-insensitively; the space after the colon is optional)
|
| 573 |
+
# This teaches evolution to add proper format instructions
|
| 574 |
+
pred_lower = prediction.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
|
| 576 |
+
# Check if the stripped, lowercased response starts with the expected prefix (with or without a single space after the colon)
|
| 577 |
+
if pred_lower.startswith("sentiment: positive") or pred_lower.startswith("sentiment:positive"):
|
| 578 |
predicted_label = 1
|
| 579 |
+
elif pred_lower.startswith("sentiment: negative") or pred_lower.startswith("sentiment:negative"):
|
| 580 |
predicted_label = 0
|
| 581 |
else:
|
| 582 |
+
# Wrong format = incorrect (even if sentiment is mentioned elsewhere)
|
| 583 |
predicted_label = -1
|
| 584 |
|
| 585 |
is_correct = (predicted_label == true_label)
|