codelion committed on
Commit 5e57630 · verified · 1 Parent(s): 19d1d68

Upload app.py

Files changed (1):
  1. app.py +55 -28
app.py CHANGED
@@ -71,13 +71,18 @@ def validate_dataset(dataset_name: str, split: str, input_field: str, target_fie
     try:
         dataset = load_dataset(dataset_name, split=split, streaming=True)
     except ValueError as e:
-        # If it fails with config error, try with "main" config (common for datasets like gsm8k)
+        # If it fails with config error, try common configs
         if "config" in str(e).lower() or "Config name is missing" in str(e):
-            print(f"Dataset requires config, trying with 'main' config...")
+            # Try common configs based on dataset name
+            default_config = "main"
+            if dataset_name.lower() == "glue":
+                default_config = "sst2"
+
+            print(f"Dataset requires config, trying with '{default_config}' config...")
             try:
-                dataset = load_dataset(dataset_name, "main", split=split, streaming=True)
+                dataset = load_dataset(dataset_name, default_config, split=split, streaming=True)
             except:
-                # If "main" doesn't work, raise the original error
+                # If default config doesn't work, raise the original error
                 raise e
         else:
             raise
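For reference, a more general fallback than hard-coding "main"/"sst2" would be to ask the Hub which configs the dataset actually has and retry with the first one. That is not what this commit does; the sketch below is an assumption-labeled alternative, using the `datasets` library and the made-up helper name `load_with_config_fallback`:

```python
# Sketch only: auto-detect a config instead of guessing one (not app.py code).
from datasets import get_dataset_config_names, load_dataset

def load_with_config_fallback(dataset_name: str, split: str, streaming: bool = True):
    try:
        return load_dataset(dataset_name, split=split, streaming=streaming)
    except ValueError:
        # Ask the Hub which configs exist and retry with the first one.
        configs = get_dataset_config_names(dataset_name)
        if not configs:
            raise
        return load_dataset(dataset_name, configs[0], split=split, streaming=streaming)
```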
@@ -161,9 +166,13 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
     try:
         dataset = load_dataset(dataset_name, split=split, streaming=False)
     except ValueError as e:
-        # If it fails with config error, try with "main" config (common for datasets like gsm8k)
+        # If it fails with config error, try common configs
         if "config" in str(e).lower() or "Config name is missing" in str(e):
-            dataset = load_dataset(dataset_name, "main", split=split, streaming=False)
+            # Try common configs based on dataset name
+            default_config = "main"
+            if dataset_name.lower() == "glue":
+                default_config = "sst2"
+            dataset = load_dataset(dataset_name, default_config, split=split, streaming=False)
         else:
             raise

@@ -213,7 +222,7 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": formatted_prompt}
             ],
-            temperature=0.1,
+            temperature=0.0,
             max_tokens=500,
         )

@@ -266,11 +275,18 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
         target_lower = target_answer.lower()
         pred_lower = pred_answer.lower()

-        # Sentiment mappings
+        # Sentiment mappings with expanded synonyms
+        positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
+                          "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
+                          "praise", "favorable", "approve"]
+        negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
+                          "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
+                          "critique", "condemn", "sarcasm"]
+
         if target_lower in ["1", "positive", "pos"]:
-            is_correct = any(word in pred_lower for word in ["positive", "good", "great", "1"])
+            is_correct = any(word in pred_lower for word in positive_words)
         elif target_lower in ["0", "negative", "neg"]:
-            is_correct = any(word in pred_lower for word in ["negative", "bad", "poor", "0"])
+            is_correct = any(word in pred_lower for word in negative_words)

         if is_correct:
             correct += 1
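The scoring change above is still plain substring matching. Factored into a standalone helper it is easy to unit-test; the sketch below uses illustrative names (`sentiment_matches`, trimmed word lists), not identifiers from app.py:

```python
# Sketch of the keyword-matching check used above; illustrative names, trimmed word lists.
POSITIVE_WORDS = ["positive", "good", "great", "excellent", "amazing", "love", "best", "1", "pos"]
NEGATIVE_WORDS = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate", "0", "neg"]

def sentiment_matches(target: str, prediction: str) -> bool:
    target_lower, pred_lower = target.lower(), prediction.lower()
    if target_lower in ["1", "positive", "pos"]:
        return any(word in pred_lower for word in POSITIVE_WORDS)
    if target_lower in ["0", "negative", "neg"]:
        return any(word in pred_lower for word in NEGATIVE_WORDS)
    return False

print(sentiment_matches("1", "Overall a positive review."))   # True
print(sentiment_matches("0", "I'd rate it 10/10, loved it"))  # True: bare "0" substring over-matches
```

Note the second call: a bare "0" or "1" in the word lists matches digits inside strings like "10/10", so whole-token matching (e.g. on `pred_lower.split()`) would be stricter.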
@@ -542,9 +558,13 @@ def evaluate(prompt: str) -> dict:
     try:
         dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
     except ValueError as e:
-        # If it fails with config error, try with "main" config (common for datasets like gsm8k)
+        # If it fails with config error, try common configs
         if "config" in str(e).lower() or "Config name is missing" in str(e):
-            dataset = load_dataset("{dataset_name}", "main", split="{split}", streaming=False)
+            # Try common configs based on dataset name
+            default_config = "main"
+            if "{dataset_name}".lower() == "glue":
+                default_config = "sst2"
+            dataset = load_dataset("{dataset_name}", default_config, split="{split}", streaming=False)
         else:
             raise

@@ -581,7 +601,7 @@ def evaluate(prompt: str) -> dict:
                 {{"role": "system", "content": "You are a helpful assistant."}},
                 {{"role": "user", "content": formatted_prompt}}
             ],
-            temperature=0.1,
+            temperature=0.0,
             max_tokens=500,
         )

@@ -636,11 +656,18 @@ def evaluate(prompt: str) -> dict:
         target_lower = target_answer.lower()
         pred_lower = pred_answer.lower()

-        # Sentiment mappings
+        # Sentiment mappings with expanded synonyms
+        positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic",
+                          "amazing", "love", "best", "1", "pos", "admiration", "appreciation",
+                          "praise", "favorable", "approve"]
+        negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate",
+                          "0", "neg", "criticism", "disdain", "disapproval", "unfavorable",
+                          "critique", "condemn", "sarcasm"]
+
         if target_lower in ["1", "positive", "pos"]:
-            is_correct = any(word in pred_lower for word in ["positive", "good", "great", "1"])
+            is_correct = any(word in pred_lower for word in positive_words)
         elif target_lower in ["0", "negative", "neg"]:
-            is_correct = any(word in pred_lower for word in ["negative", "bad", "poor", "0"])
+            is_correct = any(word in pred_lower for word in negative_words)

         if is_correct:
             correct += 1
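A note on the `evaluate()` hunks above: the literal `"{dataset_name}"` / `"{split}"` placeholders and the doubled braces around the chat-message dicts suggest this code lives inside a template string that app.py fills in with `str.format` before emitting the evaluator. That is an inference from the diff, not something the commit states; a minimal sketch of the pattern, with `EVALUATOR_TEMPLATE` and `render_evaluator` as made-up names:

```python
# Sketch (assumption): how {dataset_name}/{split} placeholders and doubled braces
# behave under str.format. Names here are illustrative, not from app.py.
EVALUATOR_TEMPLATE = '''
dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
messages = [
    {{"role": "system", "content": "You are a helpful assistant."}},
    {{"role": "user", "content": formatted_prompt}},
]
'''

def render_evaluator(dataset_name: str, split: str) -> str:
    # {dataset_name} and {split} are substituted; {{...}} collapses to literal braces.
    return EVALUATOR_TEMPLATE.format(dataset_name=dataset_name, split=split)

print(render_evaluator("glue", "validation"))
```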
@@ -1062,12 +1089,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d

     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-    2. Default dataset is **GSM8K** (grade school math problems) - challenging for showing improvement!
-    3. Specify the dataset split and field names (or use other datasets like `stanfordnlp/imdb`)
+    2. Default dataset is **SST-2** (binary sentiment classification) - perfect for showing prompt improvement!
+    3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
-    7. Compare initial vs. evolved performance - expect 20-40% improvement on GSM8K!
+    7. Compare initial vs. evolved performance - expect improvement from ~15% to 70%+ on SST-2!

     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
@@ -1085,34 +1112,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d

             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
-                value="gsm8k",
-                placeholder="e.g., gsm8k, stanfordnlp/imdb, openai/gsm8k",
-                info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'gsm8k' → 'gsm8k:main')"
+                value="glue",
+                placeholder="e.g., glue, gsm8k, stanfordnlp/imdb",
+                info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'glue' → 'glue:sst2')"
             )

             dataset_split = gr.Textbox(
                 label="Dataset Split",
-                value="train",
+                value="validation",
                 placeholder="e.g., train, test, validation"
             )

             input_field = gr.Textbox(
                 label="Input Field Name",
-                value="question",
-                placeholder="e.g., question, text, context",
+                value="sentence",
+                placeholder="e.g., sentence, text, question",
                 info="The field containing inputs to process"
             )

             target_field = gr.Textbox(
                 label="Target Field Name",
-                value="answer",
-                placeholder="e.g., answer, label, target",
+                value="label",
+                placeholder="e.g., label, answer, target",
                 info="The field containing expected outputs"
             )

             initial_prompt = gr.TextArea(
                 label="Initial Prompt",
-                value="{input}\n\nAnswer:",
+                value="{input}\n\nIs this review positive or negative? Answer with just 'positive' or 'negative':",
                 lines=6,
                 info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
             )
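The new UI defaults (`glue` with the `sst2` config, `validation` split, `sentence`/`label` fields) can be sanity-checked locally; a short sketch assuming the `datasets` library and network access:

```python
# Quick check of the new defaults: GLUE/SST-2 validation split exposes "sentence" and "label".
from datasets import load_dataset

ds = load_dataset("glue", "sst2", split="validation")
example = ds[0]
print(example["sentence"])  # the input field the app formats into {input}
print(example["label"])     # 1 = positive, 0 = negative
```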
 