Upload app.py
Browse files
app.py
CHANGED
|
@@ -71,13 +71,18 @@ def validate_dataset(dataset_name: str, split: str, input_field: str, target_fie
|
|
| 71 |
try:
|
| 72 |
dataset = load_dataset(dataset_name, split=split, streaming=True)
|
| 73 |
except ValueError as e:
|
| 74 |
-
# If it fails with config error, try
|
| 75 |
if "config" in str(e).lower() or "Config name is missing" in str(e):
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
try:
|
| 78 |
-
dataset = load_dataset(dataset_name,
|
| 79 |
except:
|
| 80 |
-
# If
|
| 81 |
raise e
|
| 82 |
else:
|
| 83 |
raise
|
|
@@ -161,9 +166,13 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
|
|
| 161 |
try:
|
| 162 |
dataset = load_dataset(dataset_name, split=split, streaming=False)
|
| 163 |
except ValueError as e:
|
| 164 |
-
# If it fails with config error, try
|
| 165 |
if "config" in str(e).lower() or "Config name is missing" in str(e):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
else:
|
| 168 |
raise
|
| 169 |
|
|
@@ -213,7 +222,7 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
|
|
| 213 |
{"role": "system", "content": "You are a helpful assistant."},
|
| 214 |
{"role": "user", "content": formatted_prompt}
|
| 215 |
],
|
| 216 |
-
temperature=0.
|
| 217 |
max_tokens=500,
|
| 218 |
)
|
| 219 |
|
|
@@ -266,11 +275,18 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
|
|
| 266 |
target_lower = target_answer.lower()
|
| 267 |
pred_lower = pred_answer.lower()
|
| 268 |
|
| 269 |
-
# Sentiment mappings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
if target_lower in ["1", "positive", "pos"]:
|
| 271 |
-
is_correct = any(word in pred_lower for word in
|
| 272 |
elif target_lower in ["0", "negative", "neg"]:
|
| 273 |
-
is_correct = any(word in pred_lower for word in
|
| 274 |
|
| 275 |
if is_correct:
|
| 276 |
correct += 1
|
|
@@ -542,9 +558,13 @@ def evaluate(prompt: str) -> dict:
|
|
| 542 |
try:
|
| 543 |
dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
|
| 544 |
except ValueError as e:
|
| 545 |
-
# If it fails with config error, try
|
| 546 |
if "config" in str(e).lower() or "Config name is missing" in str(e):
|
| 547 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
else:
|
| 549 |
raise
|
| 550 |
|
|
@@ -581,7 +601,7 @@ def evaluate(prompt: str) -> dict:
|
|
| 581 |
{{"role": "system", "content": "You are a helpful assistant."}},
|
| 582 |
{{"role": "user", "content": formatted_prompt}}
|
| 583 |
],
|
| 584 |
-
temperature=0.
|
| 585 |
max_tokens=500,
|
| 586 |
)
|
| 587 |
|
|
@@ -636,11 +656,18 @@ def evaluate(prompt: str) -> dict:
|
|
| 636 |
target_lower = target_answer.lower()
|
| 637 |
pred_lower = pred_answer.lower()
|
| 638 |
|
| 639 |
-
# Sentiment mappings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
if target_lower in ["1", "positive", "pos"]:
|
| 641 |
-
is_correct = any(word in pred_lower for word in
|
| 642 |
elif target_lower in ["0", "negative", "neg"]:
|
| 643 |
-
is_correct = any(word in pred_lower for word in
|
| 644 |
|
| 645 |
if is_correct:
|
| 646 |
correct += 1
|
|
@@ -1062,12 +1089,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
|
|
| 1062 |
|
| 1063 |
## How it works:
|
| 1064 |
1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
|
| 1065 |
-
2. Default dataset is **
|
| 1066 |
-
3. Specify the dataset split and field names (or use other datasets like `stanfordnlp/imdb`)
|
| 1067 |
4. Choose a free model from OpenRouter
|
| 1068 |
5. Click "Optimize Prompt" - the system will validate everything first!
|
| 1069 |
6. Watch the evolution progress in real-time
|
| 1070 |
-
7. Compare initial vs. evolved performance - expect
|
| 1071 |
|
| 1072 |
**Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
|
| 1073 |
""")
|
|
@@ -1085,34 +1112,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
|
|
| 1085 |
|
| 1086 |
dataset_name = gr.Textbox(
|
| 1087 |
label="HuggingFace Dataset (Full Name)",
|
| 1088 |
-
value="
|
| 1089 |
-
placeholder="e.g., gsm8k, stanfordnlp/imdb
|
| 1090 |
-
info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., '
|
| 1091 |
)
|
| 1092 |
|
| 1093 |
dataset_split = gr.Textbox(
|
| 1094 |
label="Dataset Split",
|
| 1095 |
-
value="
|
| 1096 |
placeholder="e.g., train, test, validation"
|
| 1097 |
)
|
| 1098 |
|
| 1099 |
input_field = gr.Textbox(
|
| 1100 |
label="Input Field Name",
|
| 1101 |
-
value="
|
| 1102 |
-
placeholder="e.g.,
|
| 1103 |
info="The field containing inputs to process"
|
| 1104 |
)
|
| 1105 |
|
| 1106 |
target_field = gr.Textbox(
|
| 1107 |
label="Target Field Name",
|
| 1108 |
-
value="
|
| 1109 |
-
placeholder="e.g.,
|
| 1110 |
info="The field containing expected outputs"
|
| 1111 |
)
|
| 1112 |
|
| 1113 |
initial_prompt = gr.TextArea(
|
| 1114 |
label="Initial Prompt",
|
| 1115 |
-
value="{input}\n\
|
| 1116 |
lines=6,
|
| 1117 |
info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
|
| 1118 |
)
|
|
|
|
| 71 |
try:
|
| 72 |
dataset = load_dataset(dataset_name, split=split, streaming=True)
|
| 73 |
except ValueError as e:
|
| 74 |
+
# If it fails with config error, try common configs
|
| 75 |
if "config" in str(e).lower() or "Config name is missing" in str(e):
|
| 76 |
+
# Try common configs based on dataset name
|
| 77 |
+
default_config = "main"
|
| 78 |
+
if dataset_name.lower() == "glue":
|
| 79 |
+
default_config = "sst2"
|
| 80 |
+
|
| 81 |
+
print(f"Dataset requires config, trying with '{default_config}' config...")
|
| 82 |
try:
|
| 83 |
+
dataset = load_dataset(dataset_name, default_config, split=split, streaming=True)
|
| 84 |
except:
|
| 85 |
+
# If default config doesn't work, raise the original error
|
| 86 |
raise e
|
| 87 |
else:
|
| 88 |
raise
|
|
|
|
| 166 |
try:
|
| 167 |
dataset = load_dataset(dataset_name, split=split, streaming=False)
|
| 168 |
except ValueError as e:
|
| 169 |
+
# If it fails with config error, try common configs
|
| 170 |
if "config" in str(e).lower() or "Config name is missing" in str(e):
|
| 171 |
+
# Try common configs based on dataset name
|
| 172 |
+
default_config = "main"
|
| 173 |
+
if dataset_name.lower() == "glue":
|
| 174 |
+
default_config = "sst2"
|
| 175 |
+
dataset = load_dataset(dataset_name, default_config, split=split, streaming=False)
|
| 176 |
else:
|
| 177 |
raise
|
| 178 |
|
|
|
|
| 222 |
{"role": "system", "content": "You are a helpful assistant."},
|
| 223 |
{"role": "user", "content": formatted_prompt}
|
| 224 |
],
|
| 225 |
+
temperature=0.0,
|
| 226 |
max_tokens=500,
|
| 227 |
)
|
| 228 |
|
|
|
|
| 275 |
target_lower = target_answer.lower()
|
| 276 |
pred_lower = pred_answer.lower()
|
| 277 |
|
| 278 |
+
# Sentiment mappings with expanded synonyms
|
| 279 |
+
positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
|
| 280 |
+
"amazing", "love", "best", "1", "pos", "admiration", "appreciation",
|
| 281 |
+
"praise", "favorable", "approve"]
|
| 282 |
+
negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
|
| 283 |
+
"0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
|
| 284 |
+
"critique", "condemn", "sarcasm"]
|
| 285 |
+
|
| 286 |
if target_lower in ["1", "positive", "pos"]:
|
| 287 |
+
is_correct = any(word in pred_lower for word in positive_words)
|
| 288 |
elif target_lower in ["0", "negative", "neg"]:
|
| 289 |
+
is_correct = any(word in pred_lower for word in negative_words)
|
| 290 |
|
| 291 |
if is_correct:
|
| 292 |
correct += 1
|
|
|
|
| 558 |
try:
|
| 559 |
dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
|
| 560 |
except ValueError as e:
|
| 561 |
+
# If it fails with config error, try common configs
|
| 562 |
if "config" in str(e).lower() or "Config name is missing" in str(e):
|
| 563 |
+
# Try common configs based on dataset name
|
| 564 |
+
default_config = "main"
|
| 565 |
+
if "{dataset_name}".lower() == "glue":
|
| 566 |
+
default_config = "sst2"
|
| 567 |
+
dataset = load_dataset("{dataset_name}", default_config, split="{split}", streaming=False)
|
| 568 |
else:
|
| 569 |
raise
|
| 570 |
|
|
|
|
| 601 |
{{"role": "system", "content": "You are a helpful assistant."}},
|
| 602 |
{{"role": "user", "content": formatted_prompt}}
|
| 603 |
],
|
| 604 |
+
temperature=0.0,
|
| 605 |
max_tokens=500,
|
| 606 |
)
|
| 607 |
|
|
|
|
| 656 |
target_lower = target_answer.lower()
|
| 657 |
pred_lower = pred_answer.lower()
|
| 658 |
|
| 659 |
+
# Sentiment mappings with expanded synonyms
|
| 660 |
+
positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
|
| 661 |
+
"amazing", "love", "best", "1", "pos", "admiration", "appreciation",
|
| 662 |
+
"praise", "favorable", "approve"]
|
| 663 |
+
negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
|
| 664 |
+
"0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
|
| 665 |
+
"critique", "condemn", "sarcasm"]
|
| 666 |
+
|
| 667 |
if target_lower in ["1", "positive", "pos"]:
|
| 668 |
+
is_correct = any(word in pred_lower for word in positive_words)
|
| 669 |
elif target_lower in ["0", "negative", "neg"]:
|
| 670 |
+
is_correct = any(word in pred_lower for word in negative_words)
|
| 671 |
|
| 672 |
if is_correct:
|
| 673 |
correct += 1
|
|
|
|
| 1089 |
|
| 1090 |
## How it works:
|
| 1091 |
1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
|
| 1092 |
+
2. Default dataset is **SST-2** (binary sentiment classification) - perfect for showing prompt improvement!
|
| 1093 |
+
3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `stanfordnlp/imdb`)
|
| 1094 |
4. Choose a free model from OpenRouter
|
| 1095 |
5. Click "Optimize Prompt" - the system will validate everything first!
|
| 1096 |
6. Watch the evolution progress in real-time
|
| 1097 |
+
7. Compare initial vs. evolved performance - expect improvement from ~15% to 70%+ on SST-2!
|
| 1098 |
|
| 1099 |
**Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
|
| 1100 |
""")
|
|
|
|
| 1112 |
|
| 1113 |
dataset_name = gr.Textbox(
|
| 1114 |
label="HuggingFace Dataset (Full Name)",
|
| 1115 |
+
value="glue",
|
| 1116 |
+
placeholder="e.g., glue, gsm8k, stanfordnlp/imdb",
|
| 1117 |
+
info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'glue' → 'glue:sst2')"
|
| 1118 |
)
|
| 1119 |
|
| 1120 |
dataset_split = gr.Textbox(
|
| 1121 |
label="Dataset Split",
|
| 1122 |
+
value="validation",
|
| 1123 |
placeholder="e.g., train, test, validation"
|
| 1124 |
)
|
| 1125 |
|
| 1126 |
input_field = gr.Textbox(
|
| 1127 |
label="Input Field Name",
|
| 1128 |
+
value="sentence",
|
| 1129 |
+
placeholder="e.g., sentence, text, question",
|
| 1130 |
info="The field containing inputs to process"
|
| 1131 |
)
|
| 1132 |
|
| 1133 |
target_field = gr.Textbox(
|
| 1134 |
label="Target Field Name",
|
| 1135 |
+
value="label",
|
| 1136 |
+
placeholder="e.g., label, answer, target",
|
| 1137 |
info="The field containing expected outputs"
|
| 1138 |
)
|
| 1139 |
|
| 1140 |
initial_prompt = gr.TextArea(
|
| 1141 |
label="Initial Prompt",
|
| 1142 |
+
value="{input}\n\nIs this review positive or negative? Answer with just 'positive' or 'negative':",
|
| 1143 |
lines=6,
|
| 1144 |
info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
|
| 1145 |
)
|