import gradio as gr
import os
import yaml
import random
from datasets import load_dataset
from openai import OpenAI
from openevolve import run_evolution
from typing import Dict, Tuple
import tempfile
import shutil
# Free models from OpenRouter (as of 2025)
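# NOTE: OpenRouter rotates its free tier over time, so these IDs may go stale;
# check https://openrouter.ai/models for the currently available ":free" variants.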
FREE_MODELS = [
"google/gemini-2.0-flash-001:free",
"google/gemini-flash-1.5-8b:free",
"meta-llama/llama-3.2-3b-instruct:free",
"meta-llama/llama-3.2-1b-instruct:free",
"microsoft/phi-3-mini-128k-instruct:free",
"microsoft/phi-3-medium-128k-instruct:free",
"qwen/qwen-2-7b-instruct:free",
"mistralai/mistral-7b-instruct:free",
]
# Popular HuggingFace datasets for different tasks
SAMPLE_DATASETS = {
"Question Answering": [
"hotpot_qa",
"squad",
"trivia_qa",
],
"Sentiment Analysis": [
"imdb",
"yelp_review_full",
"emotion",
],
"Text Classification": [
"ag_news",
"dbpedia_14",
"SetFit/sst5",
],
"Math Reasoning": [
"gsm8k",
"math_qa",
],
}
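# NOTE: SAMPLE_DATASETS is currently informational only -- the dataset textbox in the
# UI accepts free-form input and the example table shown in the interface is hard-coded.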
def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
api_key: str, model: str, input_field: str, target_field: str) -> Dict:
"""Evaluate a prompt on a dataset using the selected model."""
try:
# Load dataset
dataset = load_dataset(dataset_name, split=split, streaming=False)
# Sample random examples
if len(dataset) > num_samples:
indices = random.sample(range(len(dataset)), num_samples)
samples = [dataset[i] for i in indices]
else:
samples = list(dataset)[:num_samples]
# Initialize OpenAI client with OpenRouter
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=api_key,
)
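        # OpenRouter exposes an OpenAI-compatible endpoint, so the stock OpenAI SDK
        # client works here once base_url points at openrouter.ai.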
correct = 0
total = 0
results = []
for sample in samples:
try:
# Get input and target
input_text = sample.get(input_field, "")
if isinstance(input_text, dict):
input_text = str(input_text)
target = sample.get(target_field, "")
if isinstance(target, dict):
target = str(target)
# Format the prompt with the input
formatted_prompt = prompt.replace("{input}", str(input_text))
# Call the model
response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": formatted_prompt}
],
temperature=0.1,
max_tokens=500,
)
prediction = response.choices[0].message.content.strip()
                # Lenient scoring: count as correct if the target string appears anywhere
                # in the prediction (works best with short text targets, not numeric class IDs)
is_correct = str(target).lower().strip() in prediction.lower()
if is_correct:
correct += 1
total += 1
results.append({
"input": str(input_text)[:100] + "...",
"target": str(target),
"prediction": prediction[:100] + "...",
"correct": is_correct
})
except Exception as e:
print(f"Error evaluating sample: {e}")
continue
accuracy = (correct / total * 100) if total > 0 else 0
return {
"accuracy": accuracy,
"correct": correct,
"total": total,
"results": results
}
except Exception as e:
return {
"error": str(e),
"accuracy": 0,
"correct": 0,
"total": 0,
"results": []
}
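# Example call (hypothetical key/values), scoring a prompt on 10 IMDB samples:
#   evaluate_prompt(
#       "Classify the sentiment of:\n\n{input}\n\nAnswer positive or negative:",
#       dataset_name="imdb", split="test", num_samples=10,
#       api_key="sk-or-v1-...", model=FREE_MODELS[0],
#       input_field="text", target_field="label",
#   )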
def create_evaluator_file(dataset_name: str, split: str, model: str,
input_field: str, target_field: str, work_dir: str):
"""Create an evaluator.py file for OpenEvolve."""
evaluator_code = f'''
import os
import random
from datasets import load_dataset
from openai import OpenAI
def evaluate(prompt: str) -> float:
"""Evaluate a prompt and return a score between 0 and 1."""
try:
# Load dataset
dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
        # Sample up to 100 random examples
num_samples = min(100, len(dataset))
if len(dataset) > num_samples:
indices = random.sample(range(len(dataset)), num_samples)
samples = [dataset[i] for i in indices]
else:
samples = list(dataset)[:num_samples]
# Initialize OpenAI client
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=api_key,
)
correct = 0
total = 0
for sample in samples:
try:
# Get input and target
input_text = sample.get("{input_field}", "")
if isinstance(input_text, dict):
input_text = str(input_text)
target = sample.get("{target_field}", "")
if isinstance(target, dict):
target = str(target)
# Format the prompt
formatted_prompt = prompt.replace("{{input}}", str(input_text))
# Call the model
response = client.chat.completions.create(
model="{model}",
messages=[
{{"role": "system", "content": "You are a helpful assistant."}},
{{"role": "user", "content": formatted_prompt}}
],
temperature=0.1,
max_tokens=500,
)
prediction = response.choices[0].message.content.strip()
                # Lenient scoring: correct if the target appears anywhere in the prediction
is_correct = str(target).lower().strip() in prediction.lower()
if is_correct:
correct += 1
total += 1
except Exception as e:
print(f"Error evaluating sample: {{e}}")
continue
# Return score between 0 and 1
return (correct / total) if total > 0 else 0.0
except Exception as e:
print(f"Error in evaluation: {{e}}")
return 0.0
'''
evaluator_path = os.path.join(work_dir, "evaluator.py")
with open(evaluator_path, "w") as f:
f.write(evaluator_code)
return evaluator_path
def create_config_file(model: str, work_dir: str):
"""Create a config.yaml file for OpenEvolve."""
config = {
"llm": {
"api_base": "https://openrouter.ai/api/v1",
"model": model,
"temperature": 0.7,
"max_tokens": 4096,
},
"evolution": {
"max_iterations": 10,
"population_size": 10,
"num_islands": 1,
"elite_ratio": 0.1,
"explore_ratio": 0.3,
"exploit_ratio": 0.6,
},
"evaluation": {
"timeout": 1800,
}
}
config_path = os.path.join(work_dir, "config.yaml")
with open(config_path, "w") as f:
yaml.dump(config, f)
return config_path
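# For reference, the generated config.yaml looks roughly like this (yaml.dump sorts
# keys alphabetically by default):
#
#   evolution:
#     max_iterations: 10
#     population_size: 10
#     ...
#   llm:
#     api_base: https://openrouter.ai/api/v1
#     model: <selected model>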
def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
model: str, api_key: str, input_field: str, target_field: str,
progress=gr.Progress()) -> Tuple[str, str, str]:
"""Run OpenEvolve to optimize the prompt."""
if not api_key:
return "Error: OpenAI API Key is required", "", ""
    # Export the key as OPENAI_API_KEY so the generated evaluator.py can read it
os.environ["OPENAI_API_KEY"] = api_key
progress(0, desc="Setting up...")
# Create temporary working directory
work_dir = tempfile.mkdtemp(prefix="openevolve_")
try:
# Save initial prompt
initial_prompt_path = os.path.join(work_dir, "initial_prompt.txt")
with open(initial_prompt_path, "w") as f:
f.write(initial_prompt)
# Create evaluator
progress(0.1, desc="Creating evaluator...")
evaluator_path = create_evaluator_file(dataset_name, dataset_split, model,
input_field, target_field, work_dir)
# Create config
progress(0.2, desc="Creating configuration...")
config_path = create_config_file(model, work_dir)
# Run initial evaluation
progress(0.3, desc="Running initial evaluation...")
initial_eval = evaluate_prompt(
initial_prompt, dataset_name, dataset_split, 100,
api_key, model, input_field, target_field
)
initial_results = f"""
### Initial Prompt Evaluation
**Prompt:**
```
{initial_prompt}
```
**Results:**
- Accuracy: {initial_eval['accuracy']:.2f}%
- Correct: {initial_eval['correct']}/{initial_eval['total']}
**Sample Results:**
"""
for i, result in enumerate(initial_eval['results'][:5], 1):
initial_results += f"\n{i}. Input: {result['input']}\n"
initial_results += f" Target: {result['target']}\n"
initial_results += f" Prediction: {result['prediction']}\n"
initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
# Run OpenEvolve
progress(0.4, desc="Running OpenEvolve (this may take several minutes)...")
output_dir = os.path.join(work_dir, "output")
os.makedirs(output_dir, exist_ok=True)
try:
# Run evolution
result = run_evolution(
initial_program_path=initial_prompt_path,
evaluator_path=evaluator_path,
config_path=config_path,
output_dir=output_dir,
verbose=True
)
progress(0.8, desc="Evaluating best prompt...")
# Get the best prompt
best_prompt_path = os.path.join(output_dir, "best_program.txt")
if os.path.exists(best_prompt_path):
with open(best_prompt_path, "r") as f:
best_prompt = f.read()
else:
best_prompt = initial_prompt
# Evaluate best prompt
final_eval = evaluate_prompt(
best_prompt, dataset_name, dataset_split, 100,
api_key, model, input_field, target_field
)
final_results = f"""
### Evolved Prompt Evaluation
**Prompt:**
```
{best_prompt}
```
**Results:**
- Accuracy: {final_eval['accuracy']:.2f}%
- Correct: {final_eval['correct']}/{final_eval['total']}
- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f} percentage points
**Sample Results:**
"""
for i, result in enumerate(final_eval['results'][:5], 1):
final_results += f"\n{i}. Input: {result['input']}\n"
final_results += f" Target: {result['target']}\n"
final_results += f" Prediction: {result['prediction']}\n"
final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
summary = f"""
## Optimization Complete!
### Summary
- Initial Accuracy: {initial_eval['accuracy']:.2f}%
- Final Accuracy: {final_eval['accuracy']:.2f}%
- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f} percentage points
- Dataset: {dataset_name}
- Model: {model}
- Samples Evaluated: 100
- Iterations: 10
"""
progress(1.0, desc="Complete!")
return summary, initial_results, final_results
except Exception as e:
return f"Error during evolution: {str(e)}", initial_results, ""
finally:
# Clean up
try:
shutil.rmtree(work_dir)
        except Exception:
pass
# Create Gradio interface
with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# 🧬 OpenEvolve Prompt Optimizer
Automatically evolve and optimize your prompts using evolutionary algorithms!
This space uses [OpenEvolve](https://github.com/codelion/openevolve) to iteratively improve prompts
by testing them on real datasets and evolving better versions.
## How it works:
1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
2. Select a HuggingFace dataset to test on
3. Choose a free model from OpenRouter
4. Click "Optimize Prompt" to evolve better versions
5. Compare initial vs. evolved performance!
""")
with gr.Row():
with gr.Column():
gr.Markdown("### Configuration")
api_key = gr.Textbox(
label="OpenAI API Key (for OpenRouter)",
type="password",
placeholder="sk-or-v1-...",
info="Get your free key at https://openrouter.ai/keys"
)
model = gr.Dropdown(
choices=FREE_MODELS,
value=FREE_MODELS[0],
label="Select Model",
info="Free models available on OpenRouter"
)
dataset_name = gr.Textbox(
label="HuggingFace Dataset",
value="imdb",
placeholder="e.g., imdb, hotpot_qa, gsm8k",
info="Any dataset from HuggingFace Hub"
)
dataset_split = gr.Textbox(
label="Dataset Split",
value="test",
placeholder="e.g., train, test, validation"
)
input_field = gr.Textbox(
label="Input Field Name",
value="text",
placeholder="e.g., text, question, context",
info="The field containing inputs to process"
)
target_field = gr.Textbox(
label="Target Field Name",
value="label",
placeholder="e.g., label, answer, target",
info="The field containing expected outputs"
)
initial_prompt = gr.TextArea(
label="Initial Prompt",
value="Analyze the sentiment of the following text and classify it as positive or negative:\n\n{input}\n\nClassification:",
lines=6,
info="Use {input} as placeholder for dataset inputs"
)
optimize_btn = gr.Button("🚀 Optimize Prompt", variant="primary", size="lg")
with gr.Row():
with gr.Column():
summary = gr.Markdown(label="Summary")
with gr.Row():
with gr.Column():
initial_results = gr.Markdown(label="Initial Results")
with gr.Column():
final_results = gr.Markdown(label="Evolved Results")
gr.Markdown("""
### Example Datasets & Fields:
| Dataset | Split | Input Field | Target Field | Task |
|---------|-------|-------------|--------------|------|
| imdb | test | text | label | Sentiment Analysis |
| hotpot_qa | validation | question | answer | Question Answering |
| emotion | test | text | label | Emotion Classification |
| gsm8k | test | question | answer | Math Reasoning |
| ag_news | test | text | label | News Classification |
### Notes:
- Evolution runs for 10 iterations with 1 island
- Each evaluation uses 100 random samples from the dataset
- The process may take 5-15 minutes depending on the dataset and model
- Make sure your API key has sufficient credits for the requests
""")
optimize_btn.click(
fn=optimize_prompt,
inputs=[initial_prompt, dataset_name, dataset_split, model, api_key,
input_field, target_field],
outputs=[summary, initial_results, final_results]
)
if __name__ == "__main__":
demo.launch()