Upload app.py
app.py
CHANGED
@@ -13,15 +13,10 @@ import shutil
 import requests
 import glob
 
-#
-#
-FREE_MODELS = [
-    "
-    "meta-llama/llama-3.3-70b-instruct:free",  # 70B - Advanced reasoning
-    "google/gemma-3-27b-it:free",  # 27B - Strong instruction-tuned
-    "mistralai/mistral-small-3.1-24b-instruct:free",  # 24B - Efficient and capable
-    "deepseek/deepseek-r1:free",  # 671B (37B active) - Top-tier but heavily rate-limited
-    "meta-llama/llama-3.2-3b-instruct",  # 3B - PAID but very cheap fallback when free models hit rate limits
+# Model for OpenRouter
+# Using paid llama-3.2-3b-instruct since free tier models have unreliable rate limits
+MODELS = [
+    "meta-llama/llama-3.2-3b-instruct",  # 3B - Reliable, fast, and very cheap ($0.04/$0.04 per 1M tokens)
 ]
 
 
@@ -215,19 +210,34 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
             # Format the prompt with the input
             formatted_prompt = prompt.replace("{input}", str(input_text))
 
-            # Call the model
-
-
-
-
-
-
-
-
-
+            # Call the model with retry logic for transient failures
+            max_retries = 3
+            import time
+            for retry in range(max_retries):
+                try:
+                    response = client.chat.completions.create(
+                        model=model,
+                        messages=[
+                            {"role": "system", "content": "You are a helpful assistant."},
+                            {"role": "user", "content": formatted_prompt}
+                        ],
+                        temperature=0.0,
+                        max_tokens=500,
+                    )
+                    break  # Success, exit retry loop
+                except Exception as api_error:
+                    if retry < max_retries - 1:
+                        wait_time = (retry + 1) * 2  # Back off 2s, 4s, 6s between retries
+                        print(f" API error on sample {idx+1}, retrying in {wait_time}s...")
+                        time.sleep(wait_time)
+                    else:
+                        raise  # Final retry failed, propagate error
 
             prediction = response.choices[0].message.content.strip()
 
+            # Small delay to avoid rate limiting
+            time.sleep(0.1)
+
             # IMDB labels: 0 = negative, 1 = positive
             true_label = int(target)  # 0 or 1
 
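The same call-with-retry pattern added above is repeated inside the generated evaluator further down in this diff. As a standalone reference, here is a minimal sketch of that pattern factored into a helper; the name `call_with_retry` is hypothetical (the commit inlines the loop instead), and it assumes the OpenRouter key is exposed as `OPENAI_API_KEY`, matching the setup instructions added later in this diff.

```python
import time
from openai import OpenAI

# Assumes OPENAI_API_KEY holds an OpenRouter key, as the space's setup notes describe.
client = OpenAI(base_url="https://openrouter.ai/api/v1")

def call_with_retry(prompt: str, model: str, max_retries: int = 3) -> str:
    """Send one chat completion, retrying transient failures with a 2s/4s/6s backoff."""
    for retry in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0,
                max_tokens=500,
            )
            return response.choices[0].message.content.strip()
        except Exception:
            if retry < max_retries - 1:
                time.sleep((retry + 1) * 2)  # wait 2s, then 4s, then 6s
            else:
                raise  # final attempt failed, propagate the error
```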
@@ -494,19 +504,20 @@ def parse_evolution_history(output_dir: str) -> str:
 
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                           input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve that uses
+    """Create an evaluator.py file for OpenEvolve that uses 75 samples for better signal."""
     evaluator_code = f'''
 import os
 import random
+import time
 from datasets import load_dataset
 from openai import OpenAI
 
 def evaluate(prompt: str) -> dict:
     """
-    Evaluate a prompt using
+    Evaluate a prompt using 75 fixed samples for a stronger evolution signal.
 
-
-
+    Using 75 samples balances signal strength (vs 50) against API rate limits (vs 150).
+    Includes early stopping and rate-limit handling.
     """
     try:
         # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
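`create_evaluator_file` renders the evaluator as an f-string template, which is why the generated code in this diff uses doubled braces (`{{...}}`) for literal braces while `{model}` is interpolated. A toy sketch of that mechanism; the helper name, the stub scoring logic, and the returned `"accuracy"` key are illustrative assumptions, not the real template.

```python
import os

def write_evaluator_sketch(work_dir: str, model: str) -> str:
    # Doubled braces ({{ }}) survive as literal braces in the generated file;
    # single braces ({model}) are filled in by the f-string.
    evaluator_code = f'''
def evaluate(prompt: str) -> dict:
    """Toy evaluator: a real one would score the prompt on 75 fixed samples."""
    print("Would call {model} with the candidate prompt")
    return {{"accuracy": 0.0}}
'''
    path = os.path.join(work_dir, "evaluator.py")
    with open(path, "w") as f:
        f.write(evaluator_code)
    return path

# Example: write_evaluator_sketch("/tmp", "meta-llama/llama-3.2-3b-instruct")
```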
@@ -524,8 +535,8 @@ def evaluate(prompt: str) -> dict:
         else:
             raise
 
-        # Sample
-        num_samples =
+        # Sample 75 samples with seed 42 for good signal without excessive API calls
+        num_samples = 75
         if len(dataset) > num_samples:
             # Use SAME sampling logic as initial/final eval
             indices = random.sample(range(len(dataset)), num_samples)
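The fixed seed called out in the comment (42) is what keeps every prompt variant scored on the identical 75 examples, so accuracy changes are comparable across evolution steps. A small sketch of that sampling step, using the space's default `stanfordnlp/imdb` dataset; the `split="test"` choice and the `dataset.select(indices)` call are assumptions, since the surrounding template code is not part of this hunk.

```python
import random
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/imdb", split="test")  # split is an assumption

random.seed(42)                 # fixed seed -> identical indices on every evaluation run
num_samples = 75
indices = random.sample(range(len(dataset)), num_samples)
samples = dataset.select(indices)

print(f"Evaluating on {len(samples)} samples...")
```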
@@ -543,6 +554,7 @@ def evaluate(prompt: str) -> dict:
 
         correct = 0
         total = 0
+        errors = 0
 
         print(f"Evaluating on {{len(samples)}} samples...")
 
@@ -560,16 +572,27 @@ def evaluate(prompt: str) -> dict:
                 # Format the prompt
                 formatted_prompt = prompt.replace("{{input}}", str(input_text))
 
-                # Call the model
-
-
-
-
-
-
-
-
-
+                # Call the model with retry logic for transient failures
+                max_retries = 3
+                for retry in range(max_retries):
+                    try:
+                        response = client.chat.completions.create(
+                            model="{model}",
+                            messages=[
+                                {{"role": "system", "content": "You are a helpful assistant."}},
+                                {{"role": "user", "content": formatted_prompt}}
+                            ],
+                            temperature=0.0,
+                            max_tokens=500,
+                        )
+                        break  # Success, exit retry loop
+                    except Exception as api_error:
+                        if retry < max_retries - 1:
+                            wait_time = (retry + 1) * 2  # Back off 2s, 4s, 6s between retries
+                            print(f" API error on sample {{idx+1}}, retrying in {{wait_time}}s...")
+                            time.sleep(wait_time)
+                        else:
+                            raise  # Final retry failed, propagate error
 
                 prediction = response.choices[0].message.content.strip()
 
@@ -602,11 +625,21 @@ def evaluate(prompt: str) -> dict:
                     correct += 1
                 total += 1
 
-
+                # Small delay to avoid rate limiting
+                time.sleep(0.1)
+
+                if (idx + 1) % 25 == 0:
                     print(f" Progress: {{idx + 1}}/{{len(samples)}} - Current accuracy: {{correct/total:.2%}}")
 
             except Exception as e:
+                errors += 1
                 print(f"Error evaluating sample {{idx+1}}: {{e}}")
+
+                # Early stopping: if more than 40% of samples fail, abort
+                if errors > len(samples) * 0.4:
+                    print(f"Too many errors ({{errors}}/{{idx+1}}), stopping evaluation early")
+                    break
+
                 continue
 
         accuracy = (correct / total) if total > 0 else 0.0
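The new `errors` counter gives the sample loop an error budget: with 75 samples, evaluation aborts once failures exceed 40% of the sample count, i.e. more than 30 failed calls. A self-contained toy of just that control flow; `score_sample` is a hypothetical stand-in for the real model call and answer check.

```python
import random

def score_sample(sample: int) -> bool:
    """Stand-in for one model call plus answer check; fails randomly to exercise the error path."""
    if random.random() < 0.2:
        raise RuntimeError("simulated API failure")
    return sample % 2 == 0

samples = list(range(75))   # stand-in for the 75 evaluation samples
correct, total, errors = 0, 0, 0

for idx, sample in enumerate(samples):
    try:
        correct += int(score_sample(sample))
        total += 1
    except Exception as e:
        errors += 1
        print(f"Error evaluating sample {idx+1}: {e}")
        # Abort once failures exceed 40% of the full sample count (0.4 * 75 = 30).
        if errors > len(samples) * 0.4:
            print(f"Too many errors ({errors}/{idx+1}), stopping evaluation early")
            break
        continue

accuracy = (correct / total) if total > 0 else 0.0
print(f"accuracy={accuracy:.2%}, errors={errors}")
```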
@@ -726,7 +759,7 @@ Your improved prompt here
         "llm": {
             "primary_model": model,
             "api_base": "https://openrouter.ai/api/v1",  # Use OpenRouter endpoint
-            "temperature":
+            "temperature": 0.8,  # Balanced temperature for diverse but reasonable variations
         },
         "max_iterations": 5,
         "checkpoint_interval": 1,  # Save checkpoints every iteration to preserve prompt history
@@ -738,11 +771,11 @@ Your improved prompt here
             "template_dir": templates_dir,  # Use our custom prompt engineering templates
         },
         "evolution": {
-            "population_size":
+            "population_size": 12,  # Moderate population for good exploration without excessive API calls
             "num_islands": 1,  # Single island for simpler evolution
-            "elite_ratio": 0.
-            "explore_ratio": 0.
-            "exploit_ratio": 0.
+            "elite_ratio": 0.15,  # Keep top 15% (1-2 best prompts)
+            "explore_ratio": 0.35,  # Balanced exploration
+            "exploit_ratio": 0.50,  # Balanced exploitation
         },
         "database": {
             "log_prompts": True,  # Save prompts used to generate each program
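Taken together, the LLM and evolution settings touched by this commit amount to roughly the dictionary below. Only keys visible in this diff are shown; `primary_model` is written out as the value of `MODELS[0]`, and keys the app sets elsewhere (such as `prompt.template_dir`) are omitted.

```python
# Rough shape of the OpenEvolve config assembled by the app (keys visible in this diff only).
config = {
    "llm": {
        "primary_model": "meta-llama/llama-3.2-3b-instruct",  # MODELS[0]
        "api_base": "https://openrouter.ai/api/v1",           # OpenRouter endpoint
        "temperature": 0.8,                                    # diverse but reasonable prompt variations
    },
    "max_iterations": 5,
    "checkpoint_interval": 1,      # checkpoint every iteration to keep prompt history
    "evolution": {
        "population_size": 12,     # moderate population, limits API calls
        "num_islands": 1,          # single island for simpler evolution
        "elite_ratio": 0.15,       # keep the top 15% of prompts
        "explore_ratio": 0.35,
        "exploit_ratio": 0.50,     # ratios sum to 1.0 with elite_ratio
    },
    "database": {
        "log_prompts": True,       # save the prompts used to generate each program
    },
}
```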
@@ -940,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 - **Model**: {model}
 - **Initial Eval**: 50 samples
 - **Final Eval**: 50 samples (same samples for fair comparison)
-- **Evolution**:
+- **Evolution**: 75 samples per variant (balanced signal vs API limits)
 - **Iterations**: 5
 
 ### Results
@@ -974,29 +1007,28 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
     This space uses [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) to iteratively improve prompts
     by testing them on real datasets and evolving better versions.
 
+    ## 🔑 Setup (Required)
+    **To use this space:**
+    1. Click "⋮" (menu) → "Duplicate Space" to create your own copy
+    2. In your duplicated space, go to Settings → Variables & Secrets
+    3. Add your OpenRouter API key as `OPENAI_API_KEY`
+    4. Get a free API key at [openrouter.ai](https://openrouter.ai/)
+
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
     2. Default dataset is **IMDB** (movie review sentiment classification) - great for showing prompt improvement!
     3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `dair-ai/emotion`)
-    4.
-    5.
-    6.
-    7. Compare initial vs. best prompt side-by-side (both evaluated on same 200 samples)!
+    4. Click "Optimize Prompt" - the system will validate everything first!
+    5. Watch the evolution progress in real-time
+    6. Compare initial vs. best prompt side-by-side (both evaluated on same 50 samples)!
 
-    **
+    **Model**: Using `meta-llama/llama-3.2-3b-instruct` (reliable, very cheap at ~$0.04 per 1M tokens)
     """)
 
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Configuration")
 
-            model = gr.Dropdown(
-                choices=FREE_MODELS,
-                value=FREE_MODELS[0],
-                label="Select Model",
-                info="Choose from 5 curated free models on OpenRouter (24B to 671B parameters)"
-            )
-
             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
                 value="stanfordnlp/imdb",
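Step 1 of "How it works" depends on the `{input}` placeholder, which the evaluation code fills via `prompt.replace("{input}", ...)`. A short example of what a user-entered prompt and its formatted result look like; the prompt wording and the review text are illustrative only.

```python
initial_prompt = (
    "Decide whether the following movie review is positive or negative.\n\n"
    "Review: {input}\n\n"
    "Answer with exactly one word: positive or negative."
)

# This mirrors what the evaluator does for each dataset row.
formatted_prompt = initial_prompt.replace("{input}", "A wonderful little film with a huge heart.")
print(formatted_prompt)
```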
@@ -1097,10 +1129,19 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
     - [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme)
     """)
 
-    # Wire up the optimize button
+    # Wire up the optimize button with hardcoded model
+    def optimize_with_fixed_model(initial_prompt, dataset_name, dataset_split,
+                                  input_field, target_field, progress=gr.Progress()):
+        """Wrapper to use fixed model instead of dropdown"""
+        return optimize_prompt(
+            initial_prompt, dataset_name, dataset_split,
+            MODELS[0],  # Use fixed llama-3.2-3b model
+            input_field, target_field, progress
+        )
+
     optimize_btn.click(
-        fn=
-        inputs=[initial_prompt, dataset_name, dataset_split,
+        fn=optimize_with_fixed_model,
+        inputs=[initial_prompt, dataset_name, dataset_split,
                 input_field, target_field],
         outputs=[summary, initial_results, final_results]
     )