codelion committed on
Commit 12e9ab7 · verified · 1 Parent(s): 5e57630

Upload app.py

Files changed (1): app.py (+20 -20)
app.py CHANGED
@@ -540,11 +540,11 @@ def evaluate(prompt: str) -> dict:
     """
     Evaluate a prompt using 2-stage cascading evaluation to save API calls.
 
-    Stage 1: Evaluate with 20 samples
+    Stage 1: Evaluate with 50 samples
     - If accuracy >= 0.5, proceed to Stage 2
-    - If accuracy < 0.5, return early (no point wasting 80 more samples)
+    - If accuracy < 0.5, return early (no point wasting 200 more samples)
 
-    Stage 2: Evaluate with 80 more samples (total 100)
+    Stage 2: Evaluate with 200 more samples (total 250)
     - Combine results for final score
 
     Returns dict with combined_score (0-1), accuracy, correct, and total.
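
The docstring spells out the cascade; as a reading aid, here is a minimal self-contained sketch of the control flow it describes (the `score_fn` callback, the returned key names, and the index bookkeeping are illustrative assumptions, not the app's actual internals):

```python
import random

def cascade_evaluate(score_fn, dataset, stage1_size=50, stage2_size=200, threshold=0.5):
    """Sketch of 2-stage cascading evaluation: cheap screen first, full run after.

    score_fn(example) -> bool stands in for one prompt + API call per example.
    """
    indices = list(range(len(dataset)))
    random.shuffle(indices)

    # Stage 1: score the prompt on a small random sample
    stage1 = indices[:min(stage1_size, len(dataset))]
    correct = sum(score_fn(dataset[i]) for i in stage1)
    if correct / len(stage1) < threshold:
        # Early exit: a prompt this weak is not worth 200 more API calls
        return {"combined_score": correct / len(stage1), "stage": "stage1_early_exit"}

    # Stage 2: spend the remaining budget only on promising prompts
    stage2 = indices[len(stage1):len(stage1) + stage2_size]
    correct += sum(score_fn(dataset[i]) for i in stage2)
    total = len(stage1) + len(stage2)
    return {"combined_score": correct / total, "stage": "stage2_complete"}
```

The payoff is expected cost: a prompt that fails the screen costs 50 API calls instead of 250, while promising prompts still get the full 250-sample estimate.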
@@ -679,8 +679,8 @@ def evaluate(prompt: str) -> dict:
 
         return correct, total
 
-    # STAGE 1: Evaluate with 20 samples first
-    stage1_size = 20
+    # STAGE 1: Evaluate with 50 samples first
+    stage1_size = 50
     stage1_samples_count = min(stage1_size, len(dataset))
 
     if len(dataset) > stage1_samples_count:
@@ -697,7 +697,7 @@ def evaluate(prompt: str) -> dict:
 
     # Early exit if Stage 1 score is below threshold
     if stage1_score < 0.5:
-        print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 80 API calls)")
+        print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 200 API calls)")
         return {{
             "combined_score": stage1_score,
             "accuracy": stage1_score,
@@ -706,9 +706,9 @@ def evaluate(prompt: str) -> dict:
             "stage": "stage1_early_exit"
         }}
 
-    # STAGE 2: Continue with 80 more samples
-    print(f"[Stage 2/2] Score >= 0.5 - proceeding with 80 more samples...")
-    stage2_size = 80
+    # STAGE 2: Continue with 200 more samples
+    print(f"[Stage 2/2] Score >= 0.5 - proceeding with 200 more samples...")
+    stage2_size = 200
     stage2_samples_count = min(stage2_size, max(0, len(dataset) - stage1_samples_count))
 
     if stage2_samples_count > 0:
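
"Combine results for final score", per the docstring, amounts to pooled accuracy over all 250 samples; a worked example with hypothetical counts:

```python
# Hypothetical results under the new 50/200 split in this commit
stage1_correct, stage1_total = 31, 50     # 0.62 >= 0.5, so Stage 2 runs
stage2_correct, stage2_total = 118, 200
combined = (stage1_correct + stage2_correct) / (stage1_total + stage2_total)
print(combined)  # 149 / 250 = 0.596
```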
@@ -1089,12 +1089,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-    2. Default dataset is **SST-2** (binary sentiment classification) - perfect for showing prompt improvement!
-    3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `stanfordnlp/imdb`)
+    2. Default dataset is **GSM8K** (grade school math) - great for showing prompt improvement!
+    3. Specify the dataset split and field names (or use other datasets like `glue`, `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
-    7. Compare initial vs. evolved performance - expect improvement from ~15% to 70%+ on SST-2!
+    7. Compare initial vs. evolved performance - uses 50 samples for stage 1, 200 for stage 2!
 
     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
@@ -1112,34 +1112,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
         dataset_name = gr.Textbox(
             label="HuggingFace Dataset (Full Name)",
-            value="glue",
-            placeholder="e.g., glue, gsm8k, stanfordnlp/imdb",
+            value="gsm8k",
+            placeholder="e.g., gsm8k, glue, stanfordnlp/imdb",
             info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'glue' → 'glue:sst2')"
         )
 
         dataset_split = gr.Textbox(
             label="Dataset Split",
-            value="validation",
+            value="train",
             placeholder="e.g., train, test, validation"
         )
 
         input_field = gr.Textbox(
             label="Input Field Name",
-            value="sentence",
-            placeholder="e.g., sentence, text, question",
+            value="question",
+            placeholder="e.g., question, sentence, text",
             info="The field containing inputs to process"
         )
 
         target_field = gr.Textbox(
             label="Target Field Name",
-            value="label",
-            placeholder="e.g., label, answer, target",
+            value="answer",
+            placeholder="e.g., answer, label, target",
             info="The field containing expected outputs"
         )
 
         initial_prompt = gr.TextArea(
             label="Initial Prompt",
-            value="{input}\n\nIs this review positive or negative? Answer with just 'positive' or 'negative':",
+            value="{input}\n\nAnswer:",
             lines=6,
             info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
         )
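
The four textbox fields map directly onto a `datasets.load_dataset` call; a sketch of how the new defaults would be consumed, assuming the standard `datasets` API (`gsm8k` requires a config, and `main` is its usual one — presumably what the app's auto-detection resolves to):

```python
from datasets import load_dataset

# New defaults from this commit: gsm8k / train / question / answer
ds = load_dataset("gsm8k", "main", split="train")
inputs = [row["question"] for row in ds.select(range(5))]
targets = [row["answer"] for row in ds.select(range(5))]
```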
 