Upload app.py
app.py (CHANGED)
@@ -540,11 +540,11 @@ def evaluate(prompt: str) -> dict:
     """
     Evaluate a prompt using 2-stage cascading evaluation to save API calls.
 
-    Stage 1: Evaluate with
+    Stage 1: Evaluate with 50 samples
     - If accuracy >= 0.5, proceed to Stage 2
-    - If accuracy < 0.5, return early (no point wasting
+    - If accuracy < 0.5, return early (no point wasting 200 more samples)
 
-    Stage 2: Evaluate with
+    Stage 2: Evaluate with 200 more samples (total 250)
     - Combine results for final score
 
     Returns dict with combined_score (0-1), accuracy, correct, and total.
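
For orientation, the cascade this docstring describes can be summarized in a short, self-contained sketch. This is an illustration only, not the code in app.py: the `score_samples` callable and the `"stage2_complete"` label are assumed names.

    # Hypothetical sketch of the 2-stage cascade; score_samples stands in for
    # whatever actually calls the model on a batch and returns (correct, total).
    def cascading_evaluate(score_samples, dataset, stage1_size=50, stage2_size=200, threshold=0.5):
        # Stage 1: score a small slice first.
        n1 = min(stage1_size, len(dataset))
        correct1, total1 = score_samples(dataset[:n1])
        stage1_score = correct1 / total1 if total1 else 0.0

        # Early exit: a clearly weak prompt is not worth 200 more API calls.
        if stage1_score < threshold:
            return {"combined_score": stage1_score, "accuracy": stage1_score,
                    "correct": correct1, "total": total1, "stage": "stage1_early_exit"}

        # Stage 2: score up to 200 more samples and pool the counts.
        n2 = min(stage2_size, max(0, len(dataset) - n1))
        correct2, total2 = score_samples(dataset[n1:n1 + n2])
        correct, total = correct1 + correct2, total1 + total2
        combined = correct / total if total else 0.0
        return {"combined_score": combined, "accuracy": combined,
                "correct": correct, "total": total, "stage": "stage2_complete"}
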
@@ -679,8 +679,8 @@ def evaluate(prompt: str) -> dict:
 
         return correct, total
 
-    # STAGE 1: Evaluate with
-    stage1_size =
+    # STAGE 1: Evaluate with 50 samples first
+    stage1_size = 50
     stage1_samples_count = min(stage1_size, len(dataset))
 
     if len(dataset) > stage1_samples_count:
@@ -697,7 +697,7 @@ def evaluate(prompt: str) -> dict:
 
     # Early exit if Stage 1 score is below threshold
     if stage1_score < 0.5:
-        print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved
+        print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 200 API calls)")
        return {{
            "combined_score": stage1_score,
            "accuracy": stage1_score,
@@ -706,9 +706,9 @@ def evaluate(prompt: str) -> dict:
            "stage": "stage1_early_exit"
        }}
 
-    # STAGE 2: Continue with
-    print(f"[Stage 2/2] Score >= 0.5 - proceeding with
-    stage2_size =
+    # STAGE 2: Continue with 200 more samples
+    print(f"[Stage 2/2] Score >= 0.5 - proceeding with 200 more samples...")
+    stage2_size = 200
     stage2_samples_count = min(stage2_size, max(0, len(dataset) - stage1_samples_count))
 
     if stage2_samples_count > 0:
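
A quick usage example of the hypothetical sketch above, with a dummy scorer in place of real API calls (all numbers invented):

    # Dummy scorer: pretend the prompt answers 60% of samples correctly.
    def fake_score_samples(samples):
        return int(len(samples) * 0.6), len(samples)

    result = cascading_evaluate(fake_score_samples, dataset=list(range(300)))
    print(result["stage"], result["combined_score"])  # clears Stage 1, scores 0.6 over 250 samples
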
@@ -1089,12 +1089,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-    2. Default dataset is **
-    3. Specify the dataset split and field names (or use other datasets like `
+    2. Default dataset is **GSM8K** (grade school math) - great for showing prompt improvement!
+    3. Specify the dataset split and field names (or use other datasets like `glue`, `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
-    7. Compare initial vs. evolved performance -
+    7. Compare initial vs. evolved performance - uses 50 samples for stage 1, 200 for stage 2!
 
     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
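
The `{input}` placeholder mentioned in step 1 behaves like an ordinary Python format field; a tiny illustration with the default initial prompt and a GSM8K-style question (not the app's actual substitution code):

    # Fill the prompt template with one dataset example (illustration only).
    prompt_template = "{input}\n\nAnswer:"
    example = {"question": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total?"}
    print(prompt_template.format(input=example["question"]))
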
@@ -1112,34 +1112,34 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
         dataset_name = gr.Textbox(
             label="HuggingFace Dataset (Full Name)",
-            value="
-            placeholder="e.g.,
+            value="gsm8k",
+            placeholder="e.g., gsm8k, glue, stanfordnlp/imdb",
             info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'glue' → 'glue:sst2')"
         )
 
         dataset_split = gr.Textbox(
             label="Dataset Split",
-            value="
+            value="train",
             placeholder="e.g., train, test, validation"
         )
 
         input_field = gr.Textbox(
             label="Input Field Name",
-            value="
-            placeholder="e.g., sentence, text
+            value="question",
+            placeholder="e.g., question, sentence, text",
             info="The field containing inputs to process"
         )
 
         target_field = gr.Textbox(
             label="Target Field Name",
-            value="
-            placeholder="e.g.,
+            value="answer",
+            placeholder="e.g., answer, label, target",
             info="The field containing expected outputs"
         )
 
         initial_prompt = gr.TextArea(
             label="Initial Prompt",
-            value="{input}\n\
+            value="{input}\n\nAnswer:",
             lines=6,
             info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
         )
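
To sanity-check these defaults outside the app, the same dataset, split, and fields can be loaded directly with the `datasets` library. A minimal sketch, assuming GSM8K's "main" config (which the app's config auto-detection is meant to supply):

    from datasets import load_dataset

    # Load the default dataset/split and peek at the default input/target fields.
    ds = load_dataset("gsm8k", "main", split="train")
    row = ds[0]
    print(row["question"][:80])  # input field
    print(row["answer"][:80])    # target field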
|