|
|
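"""Gradio app that uses OpenEvolve to evolve and optimize prompts against a HuggingFace dataset, scoring candidates with OpenRouter-hosted models."""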
import gradio as gr |
|
|
import os |
|
|
import yaml |
|
|
import json |
|
|
import random |
|
|
import re |
|
|
from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names |
|
|
from openai import OpenAI |
|
|
from openevolve import run_evolution |
|
|
from typing import Dict, List, Tuple, Optional |
|
|
import tempfile |
|
|
import shutil |
|
|
import requests |
|
|
import glob |
|
|
|
|
|
|
|
|
|
|
|
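# Curated OpenRouter model IDs offered in the UI dropdown; the ":free" suffix selects
# OpenRouter's free tier where available.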
FREE_MODELS = [ |
|
|
"qwen/qwen-2.5-72b-instruct:free", |
|
|
"meta-llama/llama-3.3-70b-instruct:free", |
|
|
"google/gemma-3-27b-it:free", |
|
|
"mistralai/mistral-small-3.1-24b-instruct:free", |
|
|
"deepseek/deepseek-r1:free", |
|
|
"meta-llama/llama-3.2-3b-instruct", |
|
|
] |
|
|
|
|
|
|
|
|
def validate_dataset(dataset_name: str, split: str, input_field: str, target_field: str) -> Tuple[bool, str]: |
|
|
""" |
|
|
Validate that the dataset exists and has the required fields. |
|
|
|
|
|
Returns: |
|
|
Tuple of (is_valid, error_message) |
|
|
""" |
|
|
try: |
|
|
|
|
|
if not dataset_name or dataset_name.strip() == "": |
|
|
return False, "❌ Dataset name cannot be empty" |
|
|
|
|
|
dataset_name = dataset_name.strip() |
|
|
|
|
|
|
|
|
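        # Optionally authenticate with HF_TOKEN (for private/gated datasets), then ask the
        # HF Hub API whether the dataset exists before attempting a full load.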
hf_token = os.environ.get("HF_TOKEN", None) |
|
|
headers = {} |
|
|
if hf_token: |
|
|
headers["Authorization"] = f"Bearer {hf_token}" |
|
|
|
|
|
|
|
|
api_url = f"https://huggingface.co/api/datasets/{dataset_name}" |
|
|
response = requests.get(api_url, headers=headers, timeout=10) |
|
|
|
|
|
if response.status_code == 404: |
|
|
return False, f"❌ Dataset '{dataset_name}' not found on HuggingFace Hub. Please use the full dataset name (e.g., 'stanfordnlp/imdb' or 'gsm8k')" |
|
|
elif response.status_code != 200: |
|
|
|
|
|
print(f"Warning: Could not verify dataset via API (status {response.status_code}), attempting to load...") |
|
|
|
|
|
|
|
|
print(f"Loading dataset {dataset_name} with split {split}...") |
|
|
|
|
|
|
|
|
try: |
|
|
available_splits = get_dataset_split_names(dataset_name) |
|
|
if split not in available_splits: |
|
|
return False, f"❌ Split '{split}' not found. Available splits: {', '.join(available_splits)}" |
|
|
except Exception as e: |
|
|
print(f"Could not get split names: {e}. Will try to load anyway...") |
|
|
|
|
|
|
|
|
|
|
|
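        # Stream the dataset so validation only needs the first example to inspect its fields.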
try: |
|
|
dataset = load_dataset(dataset_name, split=split, streaming=True) |
|
|
except ValueError as e: |
|
|
|
|
|
if "config" in str(e).lower() or "Config name is missing" in str(e): |
|
|
|
|
|
default_config = "main" |
|
|
if dataset_name.lower() == "glue": |
|
|
default_config = "sst2" |
|
|
|
|
|
print(f"Dataset requires config, trying with '{default_config}' config...") |
|
|
try: |
|
|
dataset = load_dataset(dataset_name, default_config, split=split, streaming=True) |
|
|
                except Exception:
|
|
|
|
|
raise e |
|
|
else: |
|
|
raise |
|
|
|
|
|
|
|
|
first_example = next(iter(dataset)) |
|
|
available_fields = list(first_example.keys()) |
|
|
|
|
|
|
|
|
if input_field not in available_fields: |
|
|
return False, f"❌ Input field '{input_field}' not found. Available fields: {', '.join(available_fields)}" |
|
|
|
|
|
|
|
|
if target_field not in available_fields: |
|
|
return False, f"❌ Target field '{target_field}' not found. Available fields: {', '.join(available_fields)}" |
|
|
|
|
|
|
|
|
return True, f"✅ Dataset validated successfully! Fields '{input_field}' and '{target_field}' found." |
|
|
|
|
|
except Exception as e: |
|
|
error_msg = str(e) |
|
|
if "404" in error_msg or "not found" in error_msg.lower(): |
|
|
return False, f"❌ Dataset '{dataset_name}' not found. Please check the dataset name (use format: org/dataset-name)" |
|
|
return False, f"❌ Error validating dataset: {error_msg}" |
|
|
|
|
|
|
|
|
def validate_inputs(dataset_name: str, split: str, input_field: str, target_field: str, |
|
|
initial_prompt: str) -> Tuple[bool, str]: |
|
|
""" |
|
|
Validate all inputs before starting optimization. |
|
|
|
|
|
Returns: |
|
|
Tuple of (is_valid, message) |
|
|
""" |
|
|
|
|
|
api_key = os.environ.get("OPENAI_API_KEY") |
|
|
if not api_key: |
|
|
return False, "❌ OPENAI_API_KEY environment variable not set. Please set it in the Space secrets." |
|
|
|
|
|
|
|
|
if "{input}" not in initial_prompt: |
|
|
return False, "❌ Prompt must contain '{input}' placeholder for dataset inputs" |
|
|
|
|
|
|
|
|
dataset_name = dataset_name.strip() |
|
|
if not dataset_name: |
|
|
return False, "❌ Dataset name cannot be empty" |
|
|
|
|
|
|
|
|
is_valid, message = validate_dataset(dataset_name, split, input_field, target_field) |
|
|
if not is_valid: |
|
|
return False, message |
|
|
|
|
|
return True, message |
|
|
|
|
|
|
|
|
def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int, |
|
|
model: str, input_field: str, target_field: str, |
|
|
fixed_indices: List[int] = None) -> Dict: |
|
|
""" |
|
|
Evaluate a prompt on a dataset using the selected model. |
|
|
|
|
|
Args: |
|
|
fixed_indices: Optional list of dataset indices to use. If provided, |
|
|
ensures we evaluate on the SAME samples every time. |
|
|
""" |
|
|
try: |
|
|
|
|
|
api_key = os.environ.get("OPENAI_API_KEY") |
|
|
if not api_key: |
|
|
return { |
|
|
"error": "OPENAI_API_KEY not set in environment", |
|
|
"accuracy": 0, |
|
|
"correct": 0, |
|
|
"total": 0, |
|
|
"results": [] |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
dataset = load_dataset(dataset_name, split=split, streaming=False) |
|
|
except ValueError as e: |
|
|
|
|
|
if "config" in str(e).lower() or "Config name is missing" in str(e): |
|
|
|
|
|
default_config = "main" |
|
|
if dataset_name.lower() == "glue": |
|
|
default_config = "sst2" |
|
|
dataset = load_dataset(dataset_name, default_config, split=split, streaming=False) |
|
|
else: |
|
|
raise |
|
|
|
|
|
|
|
|
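        # Use caller-supplied indices when given so before/after runs score the exact same samples;
        # otherwise draw a seeded random sample for reproducibility.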
if fixed_indices is not None: |
|
|
|
|
|
indices = fixed_indices |
|
|
samples = [dataset[i] for i in indices] |
|
|
elif len(dataset) > num_samples: |
|
|
|
|
|
random.seed(42) |
|
|
indices = random.sample(range(len(dataset)), num_samples) |
|
|
samples = [dataset[i] for i in indices] |
|
|
else: |
|
|
indices = list(range(min(num_samples, len(dataset)))) |
|
|
samples = list(dataset)[:num_samples] |
|
|
|
|
|
|
|
|
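        # All inference goes through OpenRouter's OpenAI-compatible endpoint,
        # so OPENAI_API_KEY is expected to hold an OpenRouter key.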
client = OpenAI( |
|
|
base_url="https://openrouter.ai/api/v1", |
|
|
api_key=api_key, |
|
|
) |
|
|
|
|
|
correct = 0 |
|
|
total = 0 |
|
|
results = [] |
|
|
errors = [] |
|
|
|
|
|
for idx, sample in enumerate(samples): |
|
|
try: |
|
|
|
|
|
input_text = sample.get(input_field, "") |
|
|
if isinstance(input_text, dict): |
|
|
input_text = str(input_text) |
|
|
|
|
|
target = sample.get(target_field, "") |
|
|
if isinstance(target, dict): |
|
|
target = str(target) |
|
|
|
|
|
|
|
|
formatted_prompt = prompt.replace("{input}", str(input_text)) |
|
|
|
|
|
|
|
|
response = client.chat.completions.create( |
|
|
model=model, |
|
|
messages=[ |
|
|
{"role": "system", "content": "You are a helpful assistant."}, |
|
|
{"role": "user", "content": formatted_prompt} |
|
|
], |
|
|
temperature=0.0, |
|
|
max_tokens=500, |
|
|
) |
|
|
|
|
|
prediction = response.choices[0].message.content.strip() |
|
|
|
|
|
|
|
|
target_str = str(target).strip() |
|
|
pred_str = prediction.strip() |
|
|
|
|
|
def extract_answer(text): |
|
|
"""Extract answer from text - handles GSM8K format and general text""" |
|
|
|
|
|
if "####" in text: |
|
|
parts = text.split("####") |
|
|
if len(parts) > 1: |
|
|
answer_part = parts[-1].strip() |
|
|
|
|
|
answer_part = answer_part.replace(',', '') |
|
|
return answer_part |
|
|
|
|
|
|
|
|
numbers = re.findall(r'-?\d+(?:,\d{3})*(?:\.\d+)?', text) |
|
|
if numbers: |
|
|
|
|
|
return numbers[-1].replace(',', '') |
|
|
|
|
|
return text |
|
|
|
|
|
def is_mathematically_equal(str1, str2): |
|
|
"""Check if two strings represent the same mathematical value""" |
|
|
try: |
|
|
|
|
|
num1 = float(str1.replace(',', '')) |
|
|
num2 = float(str2.replace(',', '')) |
|
|
|
|
|
return abs(num1 - num2) < 1e-6 |
|
|
except (ValueError, AttributeError): |
|
|
|
|
|
return str1.lower().strip() == str2.lower().strip() |
|
|
|
|
|
|
|
|
target_answer = extract_answer(target_str) |
|
|
pred_answer = extract_answer(pred_str) |
|
|
|
|
|
|
|
|
is_correct = is_mathematically_equal(target_answer, pred_answer) |
|
|
|
|
|
|
|
|
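                # Fallback for classification-style datasets: map binary/sentiment labels to
                # synonym lists and accept any match in the prediction.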
if not is_correct: |
|
|
target_lower = target_answer.lower() |
|
|
pred_lower = pred_answer.lower() |
|
|
|
|
|
|
|
|
positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic", |
|
|
"amazing", "love", "best", "1", "pos", "admiration", "appreciation", |
|
|
"praise", "favorable", "approve"] |
|
|
negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate", |
|
|
"0", "neg", "criticism", "disdain", "disapproval", "unfavorable", |
|
|
"critique", "condemn", "sarcasm"] |
|
|
|
|
|
if target_lower in ["1", "positive", "pos"]: |
|
|
is_correct = any(word in pred_lower for word in positive_words) |
|
|
elif target_lower in ["0", "negative", "neg"]: |
|
|
is_correct = any(word in pred_lower for word in negative_words) |
|
|
|
|
|
if is_correct: |
|
|
correct += 1 |
|
|
total += 1 |
|
|
|
|
|
results.append({ |
|
|
"input": str(input_text)[:100] + "..." if len(str(input_text)) > 100 else str(input_text), |
|
|
"target": str(target), |
|
|
"prediction": prediction[:100] + "..." if len(prediction) > 100 else prediction, |
|
|
"correct": is_correct |
|
|
}) |
|
|
|
|
|
except Exception as e: |
|
|
error_msg = f"Sample {idx+1}: {str(e)}" |
|
|
print(f"Error evaluating sample {idx+1}: {e}") |
|
|
errors.append(error_msg) |
|
|
|
|
|
if len(errors) > len(samples) // 2: |
|
|
print(f"Too many errors ({len(errors)} out of {len(samples)}), stopping evaluation") |
|
|
break |
|
|
continue |
|
|
|
|
|
accuracy = (correct / total * 100) if total > 0 else 0 |
|
|
|
|
|
result_dict = { |
|
|
"accuracy": accuracy, |
|
|
"correct": correct, |
|
|
"total": total, |
|
|
"results": results, |
|
|
"indices": indices |
|
|
} |
|
|
|
|
|
|
|
|
if errors: |
|
|
result_dict["errors"] = errors |
|
|
if total == 0: |
|
|
|
|
|
result_dict["error"] = f"All {len(samples)} samples failed to evaluate. First few errors:\n" + "\n".join(errors[:3]) |
|
|
|
|
|
return result_dict |
|
|
|
|
|
except Exception as e: |
|
|
return { |
|
|
"error": str(e), |
|
|
"accuracy": 0, |
|
|
"correct": 0, |
|
|
"total": 0, |
|
|
"results": [] |
|
|
} |
|
|
|
|
|
|
|
|
def collect_prompt_history(output_dir: str, initial_score: float = 0.0) -> List[Dict]: |
|
|
""" |
|
|
Collect only the prompts that were "best" at some point during evolution. |
|
|
Returns only programs that improved upon the initial score (deduplicated). |
|
|
|
|
|
Args: |
|
|
output_dir: Directory containing checkpoint data |
|
|
initial_score: Score of the initial prompt (baseline to beat) |
|
|
|
|
|
Returns a list of dicts with: {prompt, score, iteration, id} |
|
|
""" |
|
|
try: |
|
|
all_programs = [] |
|
|
seen_prompts = set() |
|
|
|
|
|
|
|
|
|
|
|
checkpoints_dir = os.path.join(output_dir, "checkpoints") |
|
|
|
|
|
if not os.path.exists(checkpoints_dir): |
|
|
return [] |
|
|
|
|
|
|
|
|
checkpoint_dirs = sorted(glob.glob(os.path.join(checkpoints_dir, "checkpoint_*"))) |
|
|
|
|
|
|
|
|
for checkpoint_dir in checkpoint_dirs: |
|
|
programs_dir = os.path.join(checkpoint_dir, "programs") |
|
|
if not os.path.exists(programs_dir): |
|
|
continue |
|
|
|
|
|
|
|
|
program_files = glob.glob(os.path.join(programs_dir, "*.json")) |
|
|
|
|
|
for pfile in program_files: |
|
|
try: |
|
|
with open(pfile, 'r') as f: |
|
|
program_data = json.load(f) |
|
|
|
|
|
|
|
|
prompt_content = program_data.get("code", "").strip() |
|
|
prog_id = program_data.get("id", os.path.basename(pfile).replace(".json", "")) |
|
|
iteration = program_data.get("iteration_found", 0) |
|
|
metrics = program_data.get("metrics", {}) |
|
|
|
|
|
|
|
|
combined_score = metrics.get("combined_score", 0.0) |
|
|
|
|
|
all_programs.append({ |
|
|
"prompt": prompt_content, |
|
|
"id": prog_id, |
|
|
"file": pfile, |
|
|
"iteration": iteration, |
|
|
"metrics": metrics, |
|
|
"score": combined_score |
|
|
}) |
|
|
except Exception as e: |
|
|
print(f"Error reading program file {pfile}: {e}") |
|
|
continue |
|
|
|
|
|
|
|
|
all_programs.sort(key=lambda x: x.get("iteration", 0)) |
|
|
|
|
|
|
|
|
|
|
|
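        # Walk programs in iteration order and keep only those that strictly beat the running
        # best score, i.e. the improvement trajectory relative to the initial prompt.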
best_programs = [] |
|
|
current_best_score = initial_score |
|
|
|
|
|
for program in all_programs: |
|
|
prompt_content = program["prompt"] |
|
|
score = program["score"] |
|
|
iteration = program["iteration"] |
|
|
|
|
|
|
|
|
if iteration == 0: |
|
|
continue |
|
|
|
|
|
|
|
|
normalized_prompt = " ".join(prompt_content.split()) |
|
|
|
|
|
|
|
|
if normalized_prompt in seen_prompts: |
|
|
continue |
|
|
|
|
|
|
|
|
if score > current_best_score: |
|
|
seen_prompts.add(normalized_prompt) |
|
|
best_programs.append(program) |
|
|
improvement = score - current_best_score |
|
|
print(f" ✓ Best program at iteration {iteration}: score={score:.2%} (improved by +{improvement:.2%})") |
|
|
current_best_score = score |
|
|
|
|
|
return best_programs |
|
|
|
|
|
except Exception as e: |
|
|
print(f"Error collecting prompt history: {e}") |
|
|
return [] |
|
|
|
|
|
|
|
|
def parse_evolution_history(output_dir: str) -> str: |
|
|
""" |
|
|
Parse evolution history from OpenEvolve output directory. |
|
|
|
|
|
Returns a markdown string with visualization of the evolution process. |
|
|
""" |
|
|
try: |
|
|
evolution_viz = "## 🧬 Evolution Progress\n\n" |
|
|
|
|
|
|
|
|
generation_files = sorted(glob.glob(os.path.join(output_dir, "generation_*.txt"))) |
|
|
log_file = os.path.join(output_dir, "evolution.log") |
|
|
|
|
|
|
|
|
if generation_files: |
|
|
evolution_viz += "### Generation-by-Generation Progress\n\n" |
|
|
for gen_file in generation_files: |
|
|
gen_num = os.path.basename(gen_file).replace("generation_", "").replace(".txt", "") |
|
|
try: |
|
|
with open(gen_file, 'r') as f: |
|
|
content = f.read() |
|
|
evolution_viz += f"**Generation {gen_num}:**\n```\n{content[:200]}{'...' if len(content) > 200 else ''}\n```\n\n" |
|
|
                except Exception:
                    pass
|
|
|
|
|
|
|
|
elif os.path.exists(log_file): |
|
|
evolution_viz += "### Evolution Log\n\n" |
|
|
try: |
|
|
with open(log_file, 'r') as f: |
|
|
log_content = f.read() |
|
|
evolution_viz += f"```\n{log_content[-1000:]}\n```\n\n" |
|
|
            except Exception:
                pass
|
|
|
|
|
|
|
|
scores_file = os.path.join(output_dir, "scores.json") |
|
|
if os.path.exists(scores_file): |
|
|
try: |
|
|
with open(scores_file, 'r') as f: |
|
|
scores = json.load(f) |
|
|
|
|
|
evolution_viz += "### Score Progression\n\n" |
|
|
evolution_viz += "| Generation | Best Score | Avg Score | Population |\n" |
|
|
evolution_viz += "|------------|-----------|-----------|------------|\n" |
|
|
|
|
|
for gen in scores: |
|
|
evolution_viz += f"| {gen['generation']} | {gen['best']:.3f} | {gen['avg']:.3f} | {gen['population']} |\n" |
|
|
|
|
|
evolution_viz += "\n" |
|
|
            except Exception:
                pass
|
|
|
|
|
|
|
|
program_files = sorted(glob.glob(os.path.join(output_dir, "program_*.txt"))) |
|
|
if program_files: |
|
|
evolution_viz += f"### Explored Variants\n\n" |
|
|
evolution_viz += f"OpenEvolve explored {len(program_files)} different prompt variants during evolution.\n\n" |
|
|
|
|
|
|
|
|
if len(program_files) > 3: |
|
|
sample_files = [program_files[0], program_files[len(program_files)//2], program_files[-2]] |
|
|
evolution_viz += "**Sample Intermediate Prompts:**\n\n" |
|
|
for idx, pfile in enumerate(sample_files, 1): |
|
|
try: |
|
|
with open(pfile, 'r') as f: |
|
|
prompt_content = f.read() |
|
|
evolution_viz += f"**Variant {idx}:**\n```\n{prompt_content[:150]}{'...' if len(prompt_content) > 150 else ''}\n```\n\n" |
|
|
                    except Exception:
                        pass
|
|
|
|
|
|
|
|
if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file): |
|
|
evolution_viz += "### Evolution Complete\n\n" |
|
|
evolution_viz += "OpenEvolve ran 10 iterations of evolutionary optimization using:\n" |
|
|
evolution_viz += "- **Population Size**: 10 prompts per generation\n" |
|
|
evolution_viz += "- **Selection Strategy**: 10% elite, 30% explore, 60% exploit\n" |
|
|
evolution_viz += "- **Islands**: 1 population with mutation and crossover\n" |
|
|
evolution_viz += "- **Evaluation**: 100 samples per prompt variant\n\n" |
|
|
|
|
|
|
|
|
all_files = os.listdir(output_dir) |
|
|
evolution_viz += f"Generated {len(all_files)} files during evolution process.\n\n" |
|
|
|
|
|
return evolution_viz |
|
|
|
|
|
except Exception as e: |
|
|
return f"## 🧬 Evolution Progress\n\nEvolution completed successfully. Unable to parse detailed history: {str(e)}\n\n" |
|
|
|
|
|
|
|
|
def create_evaluator_file(dataset_name: str, split: str, model: str, |
|
|
input_field: str, target_field: str, work_dir: str): |
|
|
"""Create an evaluator.py file for OpenEvolve with staged/cascading evaluation.""" |
|
|
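    # The evaluator module below is generated from this f-string: dataset, split, model and field
    # names are baked in, and doubled braces ({{ }}) escape braces that must appear literally
    # in the emitted code.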
evaluator_code = f''' |
|
|
import os |
|
|
import random |
|
|
from datasets import load_dataset |
|
|
from openai import OpenAI |
|
|
|
|
|
def evaluate(prompt: str) -> dict: |
|
|
""" |
|
|
Evaluate a prompt using 2-stage cascading evaluation to save API calls. |
|
|
|
|
|
Stage 1: Evaluate with 50 samples |
|
|
- If accuracy >= 0.5, proceed to Stage 2 |
|
|
- If accuracy < 0.5, return early (no point wasting 200 more samples) |
|
|
|
|
|
Stage 2: Evaluate with 200 more samples (total 250) |
|
|
- Combine results for final score |
|
|
|
|
|
Returns dict with combined_score (0-1), accuracy, correct, and total. |
|
|
""" |
|
|
try: |
|
|
# IMPORTANT: Use fixed seed for consistent sampling across all evaluations |
|
|
random.seed(42) |
|
|
|
|
|
# Load dataset |
|
|
# Try loading with just dataset name first |
|
|
try: |
|
|
dataset = load_dataset("{dataset_name}", split="{split}", streaming=False) |
|
|
except ValueError as e: |
|
|
# If it fails with config error, try common configs |
|
|
if "config" in str(e).lower() or "Config name is missing" in str(e): |
|
|
# Try common configs based on dataset name |
|
|
default_config = "main" |
|
|
if "{dataset_name}".lower() == "glue": |
|
|
default_config = "sst2" |
|
|
dataset = load_dataset("{dataset_name}", default_config, split="{split}", streaming=False) |
|
|
else: |
|
|
raise |
|
|
|
|
|
# Initialize OpenAI client |
|
|
api_key = os.environ.get("OPENAI_API_KEY") |
|
|
client = OpenAI( |
|
|
base_url="https://openrouter.ai/api/v1", |
|
|
api_key=api_key, |
|
|
) |
|
|
|
|
|
def evaluate_samples(samples, correct_so_far=0, total_so_far=0): |
|
|
"""Helper function to evaluate a batch of samples.""" |
|
|
correct = correct_so_far |
|
|
total = total_so_far |
|
|
|
|
|
for sample in samples: |
|
|
try: |
|
|
# Get input and target |
|
|
input_text = sample.get("{input_field}", "") |
|
|
if isinstance(input_text, dict): |
|
|
input_text = str(input_text) |
|
|
|
|
|
target = sample.get("{target_field}", "") |
|
|
if isinstance(target, dict): |
|
|
target = str(target) |
|
|
|
|
|
# Format the prompt |
|
|
formatted_prompt = prompt.replace("{{input}}", str(input_text)) |
|
|
|
|
|
# Call the model |
|
|
response = client.chat.completions.create( |
|
|
model="{model}", |
|
|
messages=[ |
|
|
{{"role": "system", "content": "You are a helpful assistant."}}, |
|
|
{{"role": "user", "content": formatted_prompt}} |
|
|
], |
|
|
temperature=0.0, |
|
|
max_tokens=500, |
|
|
) |
|
|
|
|
|
prediction = response.choices[0].message.content.strip() |
|
|
|
|
|
# Smart evaluation - handle both math and text answers |
|
|
target_str = str(target).strip() |
|
|
pred_str = prediction.strip() |
|
|
|
|
|
def extract_answer(text): |
|
|
"""Extract answer from text - handles GSM8K format and general text""" |
|
|
import re |
|
|
|
|
|
# GSM8K format: "#### NUMBER" at the end |
|
|
if "####" in text: |
|
|
parts = text.split("####") |
|
|
if len(parts) > 1: |
|
|
answer_part = parts[-1].strip() |
|
|
# Remove comma separators (1,000 -> 1000) |
|
|
answer_part = answer_part.replace(',', '') |
|
|
return answer_part |
|
|
|
|
|
# Try to extract last number from free-form text |
|
|
numbers = re.findall(r'-?\\d+(?:,\\d{{3}})*(?:\\.\\d+)?', text) |
|
|
if numbers: |
|
|
# Return the last number found (usually the final answer) |
|
|
return numbers[-1].replace(',', '') |
|
|
|
|
|
return text |
|
|
|
|
|
def is_mathematically_equal(str1, str2): |
|
|
"""Check if two strings represent the same mathematical value""" |
|
|
try: |
|
|
# Try to convert both to floats and compare |
|
|
num1 = float(str1.replace(',', '')) |
|
|
num2 = float(str2.replace(',', '')) |
|
|
# Use small epsilon for float comparison |
|
|
return abs(num1 - num2) < 1e-6 |
|
|
except (ValueError, AttributeError): |
|
|
# If conversion fails, do string comparison |
|
|
return str1.lower().strip() == str2.lower().strip() |
|
|
|
|
|
# Extract answers |
|
|
target_answer = extract_answer(target_str) |
|
|
pred_answer = extract_answer(pred_str) |
|
|
|
|
|
# Check if answers match mathematically or textually |
|
|
is_correct = is_mathematically_equal(target_answer, pred_answer) |
|
|
|
|
|
# Fallback: check for semantic equivalents for sentiment analysis |
|
|
if not is_correct: |
|
|
target_lower = target_answer.lower() |
|
|
pred_lower = pred_answer.lower() |
|
|
|
|
|
# Sentiment mappings with expanded synonyms |
|
|
positive_words = ["positive", "good", "great", "excellent", "wonderful", "fantastic", |
|
|
"amazing", "love", "best", "1", "pos", "admiration", "appreciation", |
|
|
"praise", "favorable", "approve"] |
|
|
negative_words = ["negative", "bad", "poor", "terrible", "awful", "worst", "hate", |
|
|
"0", "neg", "criticism", "disdain", "disapproval", "unfavorable", |
|
|
"critique", "condemn", "sarcasm"] |
|
|
|
|
|
if target_lower in ["1", "positive", "pos"]: |
|
|
is_correct = any(word in pred_lower for word in positive_words) |
|
|
elif target_lower in ["0", "negative", "neg"]: |
|
|
is_correct = any(word in pred_lower for word in negative_words) |
|
|
|
|
|
if is_correct: |
|
|
correct += 1 |
|
|
total += 1 |
|
|
|
|
|
except Exception as e: |
|
|
print(f"Error evaluating sample: {{e}}") |
|
|
continue |
|
|
|
|
|
return correct, total |
|
|
|
|
|
# STAGE 1: Evaluate with 50 samples first |
|
|
stage1_size = 50 |
|
|
stage1_samples_count = min(stage1_size, len(dataset)) |
|
|
|
|
|
if len(dataset) > stage1_samples_count: |
|
|
stage1_indices = random.sample(range(len(dataset)), stage1_samples_count) |
|
|
stage1_samples = [dataset[i] for i in stage1_indices] |
|
|
else: |
|
|
stage1_samples = list(dataset)[:stage1_samples_count] |
|
|
|
|
|
print(f"[Stage 1/2] Evaluating with {{len(stage1_samples)}} samples...") |
|
|
correct, total = evaluate_samples(stage1_samples) |
|
|
stage1_score = (correct / total) if total > 0 else 0.0 |
|
|
|
|
|
print(f"[Stage 1/2] Score: {{stage1_score:.3f}} ({{correct}}/{{total}})") |
|
|
|
|
|
# Early exit if Stage 1 score is below threshold |
|
|
if stage1_score < 0.5: |
|
|
print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 200 API calls)") |
|
|
return {{ |
|
|
"combined_score": stage1_score, |
|
|
"accuracy": stage1_score, |
|
|
"correct": correct, |
|
|
"total": total, |
|
|
"stage": "stage1_early_exit" |
|
|
}} |
|
|
|
|
|
# STAGE 2: Continue with 200 more samples |
|
|
print(f"[Stage 2/2] Score >= 0.5 - proceeding with 200 more samples...") |
|
|
stage2_size = 200 |
|
|
stage2_samples_count = min(stage2_size, max(0, len(dataset) - stage1_samples_count)) |
|
|
|
|
|
if stage2_samples_count > 0: |
|
|
# Get different samples from Stage 1 |
|
|
remaining_indices = list(set(range(len(dataset))) - set(stage1_indices if 'stage1_indices' in locals() else [])) |
|
|
|
|
|
if len(remaining_indices) >= stage2_samples_count: |
|
|
stage2_indices = random.sample(remaining_indices, stage2_samples_count) |
|
|
stage2_samples = [dataset[i] for i in stage2_indices] |
|
|
else: |
|
|
stage2_samples = [dataset[i] for i in remaining_indices[:stage2_samples_count]] |
|
|
|
|
|
correct, total = evaluate_samples(stage2_samples, correct, total) |
|
|
final_score = (correct / total) if total > 0 else stage1_score |
|
|
|
|
|
print(f"[Stage 2/2] Final score: {{final_score:.3f}} ({{correct}}/{{total}})") |
|
|
return {{ |
|
|
"combined_score": final_score, |
|
|
"accuracy": final_score, |
|
|
"correct": correct, |
|
|
"total": total, |
|
|
"stage": "stage2_complete" |
|
|
}} |
|
|
else: |
|
|
print(f"[Stage 2/2] Not enough samples in dataset for Stage 2") |
|
|
return {{ |
|
|
"combined_score": stage1_score, |
|
|
"accuracy": stage1_score, |
|
|
"correct": correct, |
|
|
"total": total, |
|
|
"stage": "stage1_complete" |
|
|
}} |
|
|
|
|
|
except Exception as e: |
|
|
print(f"Error in evaluation: {{e}}") |
|
|
return {{ |
|
|
"combined_score": 0.0, |
|
|
"accuracy": 0.0, |
|
|
"correct": 0, |
|
|
"total": 0, |
|
|
"error": str(e) |
|
|
}} |
|
|
''' |
|
|
|
|
|
evaluator_path = os.path.join(work_dir, "evaluator.py") |
|
|
with open(evaluator_path, "w") as f: |
|
|
f.write(evaluator_code) |
|
|
|
|
|
return evaluator_path |
|
|
|
|
|
|
|
|
def create_config_file(model: str, work_dir: str): |
|
|
"""Create a config.yaml file for OpenEvolve.""" |
|
|
|
|
|
|
|
|
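    # Custom prompt templates that tell OpenEvolve's LLM how to rewrite the prompt under evolution.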
templates_dir = os.path.join(work_dir, "templates") |
|
|
os.makedirs(templates_dir, exist_ok=True) |
|
|
|
|
|
|
|
|
system_template = """You are an expert prompt engineer tasked with iteratively improving prompts for language models. |
|
|
Your job is to analyze the current prompt and suggest improvements based on performance feedback. |
|
|
Focus on making the prompt clearer, more specific, and more effective at achieving its goal. |
|
|
Consider: |
|
|
- Clarity and specificity of instructions |
|
|
- Examples and demonstrations that guide the model |
|
|
- Formatting that makes the prompt easier to follow |
|
|
- Edge cases and error handling in the instructions |
|
|
""" |
|
|
|
|
|
with open(os.path.join(templates_dir, "system_message.txt"), "w") as f: |
|
|
f.write(system_template) |
|
|
|
|
|
|
|
|
user_template = """# Current Prompt Performance |
|
|
- Current metrics: {metrics} |
|
|
- Areas for improvement: {improvement_areas} |
|
|
|
|
|
{artifacts} |
|
|
|
|
|
# Prompt Evolution History |
|
|
{evolution_history} |
|
|
|
|
|
# Current Prompt |
|
|
```text |
|
|
{current_program} |
|
|
``` |
|
|
|
|
|
# Task |
|
|
Rewrite the prompt above to improve its performance on the specified metrics. |
|
|
Provide a complete new version of the prompt that: |
|
|
1. Maintains the same input/output format (keep placeholders like {{input}}, {{text}}, etc.) |
|
|
2. Improves clarity and effectiveness |
|
|
3. Adds helpful examples or instructions if beneficial |
|
|
4. Is more likely to get correct results |
|
|
|
|
|
Output ONLY the new prompt text between ```text markers: |
|
|
|
|
|
```text |
|
|
Your improved prompt here |
|
|
``` |
|
|
""" |
|
|
|
|
|
with open(os.path.join(templates_dir, "full_rewrite_user.txt"), "w") as f: |
|
|
f.write(user_template) |
|
|
|
|
|
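    # OpenEvolve settings: 10 iterations of full-prompt rewrites (diff_based_evolution=False) on a
    # single island. cascade_evaluation stays False because the generated evaluator implements its
    # own two-stage cascade.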
config = { |
|
|
"llm": { |
|
|
"primary_model": model, |
|
|
"api_base": "https://openrouter.ai/api/v1", |
|
|
"temperature": 0.7, |
|
|
}, |
|
|
"max_iterations": 10, |
|
|
"checkpoint_interval": 2, |
|
|
"diff_based_evolution": False, |
|
|
"language": "text", |
|
|
"max_code_length": 40000, |
|
|
"num_islands": 1, |
|
|
"prompt": { |
|
|
"template_dir": templates_dir, |
|
|
}, |
|
|
"evolution": { |
|
|
"population_size": 10, |
|
|
"num_islands": 1, |
|
|
"elite_ratio": 0.1, |
|
|
"explore_ratio": 0.3, |
|
|
"exploit_ratio": 0.6, |
|
|
}, |
|
|
"database": { |
|
|
"log_prompts": True, |
|
|
"num_islands": 1, |
|
|
}, |
|
|
"evaluator": { |
|
|
"timeout": 3600, |
|
|
"cascade_evaluation": False, |
|
|
"parallel_evaluations": 1, |
|
|
"distributed": False, |
|
|
} |
|
|
} |
|
|
|
|
|
config_path = os.path.join(work_dir, "config.yaml") |
|
|
with open(config_path, "w") as f: |
|
|
yaml.dump(config, f) |
|
|
|
|
|
return config_path |
|
|
|
|
|
|
|
|
def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str, |
|
|
model: str, input_field: str, target_field: str, |
|
|
progress=gr.Progress()) -> Tuple[str, str, str, str, List[str], int, int]: |
|
|
"""Run OpenEvolve to optimize the prompt.""" |
|
|
|
|
|
progress(0, desc="Validating inputs...") |
|
|
|
|
|
|
|
|
is_valid, validation_message = validate_inputs( |
|
|
dataset_name, dataset_split, input_field, target_field, initial_prompt |
|
|
) |
|
|
|
|
|
if not is_valid: |
|
|
return f"## Validation Failed\n\n{validation_message}", "", "", "", [], 0, 0 |
|
|
|
|
|
progress(0.05, desc=f"Validation passed: {validation_message}") |
|
|
|
|
|
|
|
|
work_dir = tempfile.mkdtemp(prefix="openevolve_") |
|
|
|
|
|
try: |
|
|
|
|
|
initial_prompt_path = os.path.join(work_dir, "initial_prompt.txt") |
|
|
with open(initial_prompt_path, "w") as f: |
|
|
f.write(initial_prompt) |
|
|
|
|
|
|
|
|
progress(0.1, desc="Creating evaluator...") |
|
|
evaluator_path = create_evaluator_file(dataset_name, dataset_split, model, |
|
|
input_field, target_field, work_dir) |
|
|
|
|
|
|
|
|
progress(0.15, desc="Creating configuration...") |
|
|
config_path = create_config_file(model, work_dir) |
|
|
|
|
|
|
|
|
|
|
|
progress(0.2, desc="Running initial evaluation on 20 samples...") |
|
|
initial_eval = evaluate_prompt( |
|
|
initial_prompt, dataset_name, dataset_split, 20, |
|
|
model, input_field, target_field |
|
|
) |
|
|
|
|
|
if "error" in initial_eval: |
|
|
return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", "", "", [initial_prompt], 0, 1 |
|
|
|
|
|
if initial_eval["total"] == 0: |
|
|
return f"## Error\n\n❌ Initial evaluation failed: No samples could be evaluated. This usually means:\n- API key is invalid or has no credits\n- Model is unavailable or rate-limited\n- Dataset fields are incorrect\n- Network connectivity issues\n\nPlease check your configuration and try again.", "", "", "", [initial_prompt], 0, 1 |
|
|
|
|
|
|
|
|
eval_indices = initial_eval.get("indices", []) |
|
|
|
|
|
initial_results = f""" |
|
|
### Initial Prompt Evaluation |
|
|
|
|
|
**Prompt:** |
|
|
``` |
|
|
{initial_prompt} |
|
|
``` |
|
|
|
|
|
**Results:** |
|
|
- Accuracy: {initial_eval['accuracy']:.2f}% |
|
|
- Correct: {initial_eval['correct']}/{initial_eval['total']} |
|
|
|
|
|
**Sample Results:** |
|
|
""" |
|
|
for i, result in enumerate(initial_eval['results'][:5], 1): |
|
|
initial_results += f"\n{i}. Input: {result['input']}\n" |
|
|
initial_results += f" Target: {result['target']}\n" |
|
|
initial_results += f" Prediction: {result['prediction']}\n" |
|
|
initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n" |
|
|
|
|
|
|
|
|
progress(0.3, desc="Starting OpenEvolve optimization (10 iterations with staged evaluation)...") |
|
|
|
|
|
output_dir = os.path.join(work_dir, "output") |
|
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
try: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
            os.environ['OPENEVOLVE_NO_PARALLEL'] = '1'
|
|
|
|
|
|
|
|
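            # run_evolution may try to install signal handlers; Gradio runs this callback in a
            # worker thread, where signal.signal() raises, so route such calls through a shim
            # that only registers handlers on the main thread.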
import signal |
|
|
import threading |
|
|
|
|
|
original_signal = signal.signal |
|
|
|
|
|
def safe_signal(signum, handler): |
|
|
"""Only set signal handlers in main thread""" |
|
|
if threading.current_thread() is threading.main_thread(): |
|
|
return original_signal(signum, handler) |
|
|
else: |
|
|
|
|
|
return signal.SIG_DFL |
|
|
|
|
|
signal.signal = safe_signal |
|
|
|
|
|
|
|
|
result = run_evolution( |
|
|
initial_program=initial_prompt_path, |
|
|
evaluator=evaluator_path, |
|
|
config=config_path, |
|
|
output_dir=output_dir |
|
|
) |
|
|
|
|
|
|
|
|
signal.signal = original_signal |
|
|
|
|
|
progress(0.80, desc="Parsing evolution history...") |
|
|
|
|
|
|
|
|
evolution_viz = parse_evolution_history(output_dir) |
|
|
|
|
|
progress(0.85, desc="Evaluating best evolved prompt on 20 samples...") |
|
|
|
|
|
|
|
|
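            # OpenEvolve writes the winning program under output/best/; fall back to the flat
            # layout, and finally to the original prompt if nothing was produced.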
best_prompt_path = os.path.join(output_dir, "best", "best_program.txt") |
|
|
if os.path.exists(best_prompt_path): |
|
|
with open(best_prompt_path, "r") as f: |
|
|
best_prompt = f.read() |
|
|
else: |
|
|
|
|
|
best_prompt_path_alt = os.path.join(output_dir, "best_program.txt") |
|
|
if os.path.exists(best_prompt_path_alt): |
|
|
with open(best_prompt_path_alt, "r") as f: |
|
|
best_prompt = f.read() |
|
|
else: |
|
|
best_prompt = initial_prompt |
|
|
|
|
|
|
|
|
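            # Re-score the evolved prompt on the same 20 sample indices as the initial run so the
            # reported improvement is an apples-to-apples comparison.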
final_eval = evaluate_prompt( |
|
|
best_prompt, dataset_name, dataset_split, 20, |
|
|
model, input_field, target_field, |
|
|
fixed_indices=eval_indices |
|
|
) |
|
|
|
|
|
final_results = f""" |
|
|
### Evolved Prompt Evaluation |
|
|
|
|
|
**Prompt:** |
|
|
``` |
|
|
{best_prompt} |
|
|
``` |
|
|
|
|
|
**Results:** |
|
|
- Accuracy: {final_eval['accuracy']:.2f}% |
|
|
- Correct: {final_eval['correct']}/{final_eval['total']} |
|
|
- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}% |
|
|
|
|
|
**Sample Results:** |
|
|
""" |
|
|
for i, result in enumerate(final_eval['results'][:5], 1): |
|
|
final_results += f"\n{i}. Input: {result['input']}\n" |
|
|
final_results += f" Target: {result['target']}\n" |
|
|
final_results += f" Prediction: {result['prediction']}\n" |
|
|
final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n" |
|
|
|
|
|
summary = f""" |
|
|
## 🎉 Optimization Complete! |
|
|
|
|
|
### Summary |
|
|
- **Dataset**: {dataset_name} ({dataset_split} split) |
|
|
- **Model**: {model} |
|
|
- **Initial/Final Eval**: 20 samples each |
|
|
- **Evolution Eval**: Staged (50 → 250 samples if score ≥ 0.5)
|
|
- **Iterations**: 10 |
|
|
|
|
|
### Results |
|
|
- **Initial Accuracy**: {initial_eval['accuracy']:.2f}% |
|
|
- **Final Accuracy**: {final_eval['accuracy']:.2f}% |
|
|
- **Improvement**: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}% |
|
|
|
|
|
{validation_message} |
|
|
""" |
|
|
|
|
|
progress(1.0, desc="Complete!") |
|
|
|
|
|
|
|
|
all_prompts = [] |
|
|
|
|
|
|
|
|
initial_score = initial_eval['accuracy'] / 100.0 |
|
|
all_prompts.append({ |
|
|
"prompt": initial_prompt, |
|
|
"score": initial_score, |
|
|
"label": "Initial Prompt", |
|
|
"iteration": 0 |
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
prompt_history = collect_prompt_history(output_dir, initial_score=initial_score) |
|
|
for i, p in enumerate(prompt_history): |
|
|
|
|
|
if i == 0 and p.get("iteration", -1) == 0: |
|
|
continue |
|
|
|
|
|
all_prompts.append({ |
|
|
"prompt": p["prompt"], |
|
|
"score": p.get("score", 0.0), |
|
|
"label": f"Best at Iteration {p.get('iteration', i+1)}", |
|
|
"iteration": p.get("iteration", i+1) |
|
|
}) |
|
|
|
|
|
return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts) |
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
error_prompts = [{"prompt": initial_prompt, "score": 0.0, "label": "Initial Prompt"}] |
|
|
return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", "", error_prompts, 0, 1 |
|
|
|
|
|
finally: |
|
|
|
|
|
|
|
|
pass |
|
|
|
|
|
|
|
|
|
|
|
with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as demo: |
|
|
gr.Markdown(""" |
|
|
# 🧬 OpenEvolve Prompt Optimizer |
|
|
|
|
|
Automatically evolve and optimize your prompts using evolutionary algorithms! |
|
|
|
|
|
This space uses [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) to iteratively improve prompts |
|
|
by testing them on real datasets and evolving better versions. |
|
|
|
|
|
## How it works: |
|
|
1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs) |
|
|
2. Default dataset is **GSM8K** (grade school math) - great for showing prompt improvement! |
|
|
3. Specify the dataset split and field names (or use other datasets like `glue`, `stanfordnlp/imdb`) |
|
|
4. Choose a free model from OpenRouter |
|
|
5. Click "Optimize Prompt" - the system will validate everything first! |
|
|
6. Watch the evolution progress in real-time |
|
|
7. Compare initial vs. evolved performance - uses 50 samples for stage 1, 200 for stage 2! |
|
|
|
|
|
**Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets) |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
gr.Markdown("### Configuration") |
|
|
|
|
|
model = gr.Dropdown( |
|
|
choices=FREE_MODELS, |
|
|
value=FREE_MODELS[0], |
|
|
label="Select Model", |
|
|
info="Choose from 5 curated free models on OpenRouter (24B to 671B parameters)" |
|
|
) |
|
|
|
|
|
dataset_name = gr.Textbox( |
|
|
label="HuggingFace Dataset (Full Name)", |
|
|
value="gsm8k", |
|
|
placeholder="e.g., gsm8k, glue, stanfordnlp/imdb", |
|
|
info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'glue' → 'glue:sst2')" |
|
|
) |
|
|
|
|
|
dataset_split = gr.Textbox( |
|
|
label="Dataset Split", |
|
|
value="train", |
|
|
placeholder="e.g., train, test, validation" |
|
|
) |
|
|
|
|
|
input_field = gr.Textbox( |
|
|
label="Input Field Name", |
|
|
value="question", |
|
|
placeholder="e.g., question, sentence, text", |
|
|
info="The field containing inputs to process" |
|
|
) |
|
|
|
|
|
target_field = gr.Textbox( |
|
|
label="Target Field Name", |
|
|
value="answer", |
|
|
placeholder="e.g., answer, label, target", |
|
|
info="The field containing expected outputs" |
|
|
) |
|
|
|
|
|
initial_prompt = gr.TextArea( |
|
|
label="Initial Prompt", |
|
|
value="{input}\n\nAnswer:", |
|
|
lines=6, |
|
|
info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!" |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
optimize_btn = gr.Button("🚀 Validate & Optimize Prompt", variant="primary", size="lg") |
|
|
|
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("## 📊 Results") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
summary = gr.Markdown("Click 'Validate & Optimize Prompt' to start optimization...", visible=True) |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
initial_results = gr.Markdown("### Initial Results\nWill appear here after validation...", visible=True) |
|
|
with gr.Column(): |
|
|
final_results = gr.Markdown("### Final Results\nWill appear here after optimization...", visible=True) |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
evolution_progress = gr.Markdown("### Evolution Progress\nEvolution progress will appear here during optimization...", visible=True) |
|
|
|
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("## 📜 Prompt History Browser") |
|
|
gr.Markdown("Browse through the progression of **best** prompts found during evolution. Only shows prompts that improved the score (no duplicates or intermediate programs).") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=8): |
|
|
prompt_display = gr.TextArea( |
|
|
label="", |
|
|
lines=10, |
|
|
interactive=False, |
|
|
placeholder="Prompts will appear here after optimization completes...", |
|
|
show_label=False |
|
|
) |
|
|
with gr.Column(scale=2): |
|
|
prompt_counter = gr.Markdown("**Prompt**: -/-") |
|
|
prev_btn = gr.Button("⬅️ Previous", size="sm") |
|
|
next_btn = gr.Button("Next ➡️", size="sm") |
|
|
gr.Markdown("**Prompt Types:**\n- First = Initial\n- Middle = Intermediate\n- Last = Final Best") |
|
|
|
|
|
|
|
|
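    # Session state backing the history browser: the list of best prompts and the index of the
    # prompt currently displayed.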
prompt_history_state = gr.State([]) |
|
|
current_prompt_index = gr.State(0) |
|
|
|
|
|
|
|
|
gr.Markdown("---") |
|
|
with gr.Accordion("📚 Documentation & Examples", open=False): |
|
|
gr.Markdown(""" |
|
|
### Example Datasets & Fields: |
|
|
|
|
|
| Dataset | Split | Input Field | Target Field | Task | |
|
|
|---------|-------|-------------|--------------|------| |
|
|
| stanfordnlp/imdb | test | text | label | Sentiment Analysis | |
|
|
| rajpurkar/squad | validation | question | answers | Question Answering | |
|
|
| dair-ai/emotion | test | text | label | Emotion Classification | |
|
|
| openai/gsm8k | test | question | answer | Math Reasoning | |
|
|
| fancyzhx/ag_news | test | text | label | News Classification | |
|
|
|
|
|
### About This Demo Space: |
|
|
|
|
|
**This is a demonstration space** showcasing OpenEvolve's prompt optimization capabilities. |
|
|
The interface shows you how the system works, but **you'll need to set up your own instance to run optimizations**. |
|
|
|
|
|
### How to Run This Yourself: |
|
|
|
|
|
1. **Clone this Space**: Click "⋮" (three dots) at top-right → "Duplicate this Space" |
|
|
2. **Set Environment Variables** in your cloned Space's settings: |
|
|
- `OPENAI_API_KEY`: Your OpenRouter API key (get free key at [openrouter.ai/keys](https://openrouter.ai/keys)) |
|
|
- `HF_TOKEN`: (Optional) HuggingFace token for private datasets |
|
|
3. **Configure Your Optimization**: |
|
|
- Dataset: Use full name format (e.g., `stanfordnlp/imdb` or `openai/gsm8k`) |
|
|
- Fields: Specify exact field names from the dataset schema |
|
|
   - Model: Choose from the curated free models (larger models = better results but slower/rate-limited)
|
|
4. **Run & Monitor**: |
|
|
- All inputs are validated before starting |
|
|
   - Evolution uses staged evaluation (50 samples first, then 200 more if promising)
|
|
- Saves API calls by early-stopping poor prompts (< 50% accuracy) |
|
|
- Watch evolution progress visualization in real-time |
|
|
|
|
|
### About OpenEvolve: |
|
|
OpenEvolve is an open-source evolutionary optimization framework. Learn more at: |
|
|
- [GitHub Repository](https://github.com/algorithmicsuperintelligence/openevolve) |
|
|
- [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme) |
|
|
""") |
|
|
|
|
|
|
|
|
def show_previous_prompt(prompts, current_idx): |
|
|
if not prompts or len(prompts) == 0: |
|
|
return "", "**Prompt**: -/-", 0 |
|
|
new_idx = max(0, current_idx - 1) |
|
|
prompt_obj = prompts[new_idx] |
|
|
|
|
|
if isinstance(prompt_obj, dict): |
|
|
prompt_text = prompt_obj["prompt"] |
|
|
score = prompt_obj.get("score", 0.0) |
|
|
label = prompt_obj.get("label", "") |
|
|
counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}" |
|
|
else: |
|
|
prompt_text = prompt_obj |
|
|
counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}" |
|
|
return prompt_text, counter_text, new_idx |
|
|
|
|
|
def show_next_prompt(prompts, current_idx): |
|
|
if not prompts or len(prompts) == 0: |
|
|
return "", "**Prompt**: -/-", 0 |
|
|
new_idx = min(len(prompts) - 1, current_idx + 1) |
|
|
prompt_obj = prompts[new_idx] |
|
|
|
|
|
if isinstance(prompt_obj, dict): |
|
|
prompt_text = prompt_obj["prompt"] |
|
|
score = prompt_obj.get("score", 0.0) |
|
|
label = prompt_obj.get("label", "") |
|
|
counter_text = f"**{label}** ({new_idx + 1}/{len(prompts)}) | Score: {score:.2%}" |
|
|
else: |
|
|
prompt_text = prompt_obj |
|
|
counter_text = f"**Prompt**: {new_idx + 1}/{len(prompts)}" |
|
|
return prompt_text, counter_text, new_idx |
|
|
|
|
|
def update_prompt_display(prompts, idx, total): |
|
|
if not prompts or len(prompts) == 0: |
|
|
return "", "**Prompt**: -/-" |
|
|
idx = min(idx, len(prompts) - 1) |
|
|
prompt_obj = prompts[idx] |
|
|
|
|
|
if isinstance(prompt_obj, dict): |
|
|
prompt_text = prompt_obj["prompt"] |
|
|
score = prompt_obj.get("score", 0.0) |
|
|
label = prompt_obj.get("label", "") |
|
|
counter_text = f"**{label}** ({idx + 1}/{len(prompts)}) | Score: {score:.2%}" |
|
|
else: |
|
|
prompt_text = prompt_obj |
|
|
counter_text = f"**Prompt**: {idx + 1}/{len(prompts)}" |
|
|
return prompt_text, counter_text |
|
|
|
|
|
|
|
|
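    # Wire up the optimize button: the click runs the full optimization, then a chained .then()
    # refreshes the prompt history browser with the collected prompts.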
optimize_result = optimize_btn.click( |
|
|
fn=optimize_prompt, |
|
|
inputs=[initial_prompt, dataset_name, dataset_split, model, |
|
|
input_field, target_field], |
|
|
outputs=[summary, initial_results, evolution_progress, final_results, |
|
|
prompt_history_state, current_prompt_index, gr.State()] |
|
|
) |
|
|
|
|
|
|
|
|
optimize_result.then( |
|
|
fn=update_prompt_display, |
|
|
inputs=[prompt_history_state, current_prompt_index, gr.State()], |
|
|
outputs=[prompt_display, prompt_counter] |
|
|
) |
|
|
|
|
|
|
|
|
prev_btn.click( |
|
|
fn=show_previous_prompt, |
|
|
inputs=[prompt_history_state, current_prompt_index], |
|
|
outputs=[prompt_display, prompt_counter, current_prompt_index] |
|
|
) |
|
|
|
|
|
next_btn.click( |
|
|
fn=show_next_prompt, |
|
|
inputs=[prompt_history_state, current_prompt_index], |
|
|
outputs=[prompt_display, prompt_counter, current_prompt_index] |
|
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
|
demo.launch() |
|
|
|