# Hugging Face Space: algorithmicsuperintelligence/prompt-optimizer (status: Running)
# File size: 15,959 bytes
# Revision: 5d7f7a8

import gradio as gr
import os
import yaml
import json
import random
from datasets import load_dataset
from openai import OpenAI
from openevolve import run_evolution
from typing import Dict, List, Tuple
import tempfile
import shutil

# Free models from OpenRouter (as of 2025).
# These identifiers populate the "Select Model" dropdown in the UI; the first
# entry is the dropdown's default value. The list is static, so it may drift
# from what OpenRouter actually offers — TODO confirm periodically.
FREE_MODELS = [
    "google/gemini-2.0-flash-001:free",
    "google/gemini-flash-1.5-8b:free",
    "meta-llama/llama-3.2-3b-instruct:free",
    "meta-llama/llama-3.2-1b-instruct:free",
    "microsoft/phi-3-mini-128k-instruct:free",
    "microsoft/phi-3-medium-128k-instruct:free",
    "qwen/qwen-2-7b-instruct:free",
    "mistralai/mistral-7b-instruct:free",
]

# Popular HuggingFace datasets for different tasks, keyed by task name.
# NOTE(review): this mapping is not referenced anywhere in the visible part of
# this file; the UI takes the dataset name from a free-text box instead.
SAMPLE_DATASETS = {
    "Question Answering": [
        "hotpot_qa",
        "squad",
        "trivia_qa",
    ],
    "Sentiment Analysis": [
        "imdb",
        "yelp_review_full",
        "emotion",
    ],
    "Text Classification": [
        "ag_news",
        "dbpedia_14",
        "SetFit/sst5",
    ],
    "Math Reasoning": [
        "gsm8k",
        "math_qa",
    ],
}


def _preview(text: str, limit: int = 100) -> str:
    """Return *text* cut to *limit* characters, adding "..." only when cut."""
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


def _label_decoder(dataset, target_field: str):
    """Return the ``int2str`` callable of the target column, or ``None``.

    Class-label columns in HuggingFace datasets store integers; their feature
    object exposes ``int2str`` to map an integer back to the label name.
    Any dataset object without such metadata simply yields ``None``.
    """
    features = getattr(dataset, "features", None)
    try:
        feature = features[target_field]
    except (KeyError, TypeError):
        return None
    return getattr(feature, "int2str", None)


def _answer_candidates(target, label_name=None) -> List[str]:
    """Return the normalized strings that count as a correct answer.

    Empty strings are dropped: ``"" in prediction`` is always true, so an
    empty target would otherwise mark every prediction as correct.
    """
    raw = [str(target)]
    if label_name is not None:
        raw.append(str(label_name))
    normalized = (text.lower().strip() for text in raw)
    return [text for text in normalized if text]


def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
                    api_key: str, model: str, input_field: str, target_field: str) -> Dict:
    """Evaluate a prompt on a dataset using the selected model.

    A random subset of the dataset is sent to the model through OpenRouter and
    each reply is scored by case-insensitive substring containment of the
    expected answer.

    Args:
        prompt: Prompt template; every ``{input}`` is replaced by the sample input.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Dataset split passed to ``load_dataset``.
        num_samples: Maximum number of samples to evaluate.
        api_key: OpenRouter API key.
        model: Model identifier sent with each chat completion request.
        input_field: Name of the sample field holding the model input.
        target_field: Name of the sample field holding the expected answer.

    Returns:
        A dict with ``accuracy`` (percentage, 0-100), ``correct``, ``total`` and
        ``results`` (one entry per scored sample). When the evaluation cannot
        run at all (dataset loading fails, a field is missing, ...), the dict
        additionally carries ``error`` and all counters are zero. Samples whose
        API call raises are logged and left out of ``total``.
    """
    try:
        # Load dataset
        dataset = load_dataset(dataset_name, split=split, streaming=False)

        # Sample random examples
        if len(dataset) > num_samples:
            indices = random.sample(range(len(dataset)), num_samples)
            samples = [dataset[i] for i in indices]
        else:
            samples = list(dataset)

        # A mistyped field name used to be silently read as "", which made the
        # empty target match every prediction (100% accuracy). Fail loudly.
        if samples:
            missing = [name for name in (input_field, target_field)
                       if name not in samples[0]]
            if missing:
                raise ValueError(
                    f"Field(s) {missing} not found in dataset; "
                    f"available fields: {sorted(samples[0])}"
                )

        # Integer class labels (e.g. 0/1) rarely appear verbatim in a model's
        # free-text answer, so the label's name is accepted as well.
        decode_label = _label_decoder(dataset, target_field)

        # Initialize OpenAI client with OpenRouter
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        correct = 0
        total = 0
        results = []

        for sample in samples:
            try:
                # Get input and target
                input_text = str(sample.get(input_field, ""))
                target = sample.get(target_field, "")

                label_name = None
                if (decode_label is not None and isinstance(target, int)
                        and not isinstance(target, bool)):
                    try:
                        label_name = decode_label(target)
                    except (ValueError, TypeError, IndexError):
                        # e.g. -1 used for unlabeled rows
                        label_name = None

                # Format the prompt with the input
                formatted_prompt = prompt.replace("{input}", input_text)

                # Call the model
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": formatted_prompt}
                    ],
                    temperature=0.1,
                    max_tokens=500,
                )

                # The API may return no text; score that as a wrong answer
                # instead of dropping the sample from the denominator.
                content = response.choices[0].message.content
                prediction = (content or "").strip()

                # Case-insensitive substring match against the accepted answers
                haystack = prediction.lower()
                is_correct = any(
                    candidate in haystack
                    for candidate in _answer_candidates(target, label_name)
                )
                if is_correct:
                    correct += 1
                total += 1

                if label_name is None:
                    shown_target = str(target)
                else:
                    shown_target = f"{target} ({label_name})"

                results.append({
                    "input": _preview(input_text),
                    "target": shown_target,
                    "prediction": _preview(prediction),
                    "correct": is_correct
                })

            except Exception as e:
                print(f"Error evaluating sample: {e}")
                continue

        accuracy = (correct / total * 100) if total > 0 else 0

        return {
            "accuracy": accuracy,
            "correct": correct,
            "total": total,
            "results": results
        }

    except Exception as e:
        return {
            "error": str(e),
            "accuracy": 0,
            "correct": 0,
            "total": 0,
            "results": []
        }


# Body of the generated evaluator. Kept as a plain (non-f) string so that the
# braces of the generated code need no escaping; all run-specific values are
# injected through the module-level constants written in front of it.
_EVALUATOR_BODY = '''

def evaluate(prompt: str) -> float:
    """Evaluate a prompt and return a score between 0 and 1."""
    try:
        # Load dataset
        dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT, streaming=False)

        # Sample up to 100 random examples
        num_samples = min(100, len(dataset))
        if len(dataset) > num_samples:
            indices = random.sample(range(len(dataset)), num_samples)
            samples = [dataset[i] for i in indices]
        else:
            samples = list(dataset)

        # Class-label columns store integers; also accept the label name
        features = getattr(dataset, "features", None)
        try:
            decode_label = getattr(features[TARGET_FIELD], "int2str", None)
        except (KeyError, TypeError):
            decode_label = None

        # Initialize OpenAI client
        api_key = os.environ.get("OPENAI_API_KEY")
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        correct = 0
        total = 0

        for sample in samples:
            try:
                # Get input and target
                input_text = str(sample.get(INPUT_FIELD, ""))
                target = sample.get(TARGET_FIELD, "")

                candidates = [str(target)]
                if (decode_label is not None and isinstance(target, int)
                        and not isinstance(target, bool)):
                    try:
                        candidates.append(str(decode_label(target)))
                    except (ValueError, TypeError, IndexError):
                        pass
                # Drop empty candidates: "" is a substring of everything
                candidates = [c.lower().strip() for c in candidates]
                candidates = [c for c in candidates if c]

                # Format the prompt
                formatted_prompt = prompt.replace("{input}", input_text)

                # Call the model
                response = client.chat.completions.create(
                    model=MODEL,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": formatted_prompt}
                    ],
                    temperature=0.1,
                    max_tokens=500,
                )

                # A reply without text counts as a wrong answer
                content = response.choices[0].message.content
                prediction = (content or "").strip().lower()

                # Simple evaluation: case-insensitive substring match
                if any(c in prediction for c in candidates):
                    correct += 1
                total += 1

            except Exception as e:
                print(f"Error evaluating sample: {e}")
                continue

        # Return score between 0 and 1
        return (correct / total) if total > 0 else 0.0

    except Exception as e:
        print(f"Error in evaluation: {e}")
        return 0.0
'''


def create_evaluator_file(dataset_name: str, split: str, model: str,
                         input_field: str, target_field: str, work_dir: str):
    """Create an evaluator.py file for OpenEvolve.

    The generated module defines ``evaluate(prompt) -> float``, which scores a
    prompt on up to 100 random samples of the dataset and reads the API key
    from the ``OPENAI_API_KEY`` environment variable.

    All run-specific values come from user input, so they are embedded with
    ``repr()`` as Python literals. Pasting them raw between quotes would let a
    quote or backslash in a name break the generated file or inject code.

    NOTE(review): confirm against the installed OpenEvolve version whether
    ``evaluate`` receives the prompt text or a path to the candidate file, and
    whether a bare float is an accepted return value.

    Args:
        dataset_name: Dataset identifier the evaluator passes to ``load_dataset``.
        split: Dataset split the evaluator loads.
        model: Model identifier the evaluator sends with each request.
        input_field: Sample field holding the model input.
        target_field: Sample field holding the expected answer.
        work_dir: Existing directory in which ``evaluator.py`` is written.

    Returns:
        The path of the written ``evaluator.py``.
    """
    header = (
        "\n"
        "import os\n"
        "import random\n"
        "from datasets import load_dataset\n"
        "from openai import OpenAI\n"
        "\n"
        f"DATASET_NAME = {dataset_name!r}\n"
        f"DATASET_SPLIT = {split!r}\n"
        f"MODEL = {model!r}\n"
        f"INPUT_FIELD = {input_field!r}\n"
        f"TARGET_FIELD = {target_field!r}\n"
    )
    evaluator_code = header + _EVALUATOR_BODY

    evaluator_path = os.path.join(work_dir, "evaluator.py")
    # Python source files are UTF-8 by default; do not depend on the locale.
    with open(evaluator_path, "w", encoding="utf-8") as f:
        f.write(evaluator_code)

    return evaluator_path


def create_config_file(model: str, work_dir: str):
    """Write the OpenEvolve ``config.yaml`` into *work_dir*.

    The configuration has three sections: ``llm`` (OpenRouter endpoint, the
    given *model* and sampling settings), ``evolution`` (search settings) and
    ``evaluation`` (timeout).

    Args:
        model: Model identifier stored under ``llm.model``.
        work_dir: Existing directory that receives ``config.yaml``.

    Returns:
        The path of the written ``config.yaml``.
    """
    llm_section = dict(
        api_base="https://openrouter.ai/api/v1",
        model=model,
        temperature=0.7,
        max_tokens=4096,
    )
    evolution_section = dict(
        max_iterations=10,
        population_size=10,
        num_islands=1,
        elite_ratio=0.1,
        explore_ratio=0.3,
        exploit_ratio=0.6,
    )
    evaluation_section = dict(timeout=1800)

    settings = dict(
        llm=llm_section,
        evolution=evolution_section,
        evaluation=evaluation_section,
    )

    destination = os.path.join(work_dir, "config.yaml")
    with open(destination, "w") as handle:
        yaml.dump(settings, handle)
    return destination


def _format_evaluation_report(title: str, prompt: str, evaluation: Dict,
                              improvement=None) -> str:
    """Render one evaluation (prompt, scores, first five samples) as Markdown."""
    lines = [
        "",
        f"### {title}",
        "",
        "**Prompt:**",
        "```",
        prompt,
        "```",
        "",
        "**Results:**",
        f"- Accuracy: {evaluation['accuracy']:.2f}%",
        f"- Correct: {evaluation['correct']}/{evaluation['total']}",
    ]
    if improvement is not None:
        lines.append(f"- Improvement: {improvement:.2f}%")
    lines += ["", "**Sample Results:**", ""]
    report = "\n".join(lines)

    for i, sample in enumerate(evaluation['results'][:5], 1):
        verdict = "✓ Correct" if sample['correct'] else "✗ Incorrect"
        report += f"\n{i}. Input: {sample['input']}\n"
        report += f"   Target: {sample['target']}\n"
        report += f"   Prediction: {sample['prediction']}\n"
        report += f"   {verdict}\n"
    return report


def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
                   model: str, api_key: str, input_field: str, target_field: str,
                   progress=gr.Progress()) -> Tuple[str, str, str]:
    """Run OpenEvolve to optimize the prompt.

    Evaluates the initial prompt, runs the evolution in a temporary working
    directory, evaluates the best evolved prompt and renders the comparison.

    Args:
        initial_prompt: Prompt template to start from (``{input}`` placeholder).
        dataset_name: Dataset identifier used for evaluation.
        dataset_split: Dataset split used for evaluation.
        model: Model identifier used both for evaluation and in the config.
        api_key: OpenRouter API key.
        input_field: Sample field holding the model input.
        target_field: Sample field holding the expected answer.
        progress: Gradio progress tracker.

    Returns:
        ``(summary, initial_results, final_results)`` as Markdown strings. On
        failure the first element carries an error message and the remaining
        elements hold whatever was produced before the failure.
    """
    api_key = (api_key or "").strip()
    if not api_key:
        return "Error: OpenAI API Key is required", "", ""

    num_samples = 100

    # The generated evaluator reads the key from the environment. The
    # environment is process-global, so remember the previous value and put it
    # back afterwards instead of leaving this user's key behind.
    # NOTE(review): still not safe if several optimizations run concurrently
    # in one process — confirm the queue concurrency of the deployment.
    previous_key = os.environ.get("OPENAI_API_KEY")
    os.environ["OPENAI_API_KEY"] = api_key

    work_dir = None
    try:
        progress(0, desc="Setting up...")

        # Create temporary working directory
        work_dir = tempfile.mkdtemp(prefix="openevolve_")

        # Save initial prompt
        initial_prompt_path = os.path.join(work_dir, "initial_prompt.txt")
        with open(initial_prompt_path, "w", encoding="utf-8") as f:
            f.write(initial_prompt)

        # Create evaluator
        progress(0.1, desc="Creating evaluator...")
        evaluator_path = create_evaluator_file(dataset_name, dataset_split, model,
                                               input_field, target_field, work_dir)

        # Create config
        progress(0.2, desc="Creating configuration...")
        config_path = create_config_file(model, work_dir)

        # Run initial evaluation
        progress(0.3, desc="Running initial evaluation...")
        initial_eval = evaluate_prompt(
            initial_prompt, dataset_name, dataset_split, num_samples,
            api_key, model, input_field, target_field
        )

        # Stop early when the baseline could not be measured: evolving against
        # a broken dataset/model/key setup only burns minutes and credits.
        if initial_eval.get("error"):
            return f"Error during initial evaluation: {initial_eval['error']}", "", ""
        if initial_eval["total"] == 0:
            return ("Error during initial evaluation: no samples could be evaluated "
                    "(check the API key, model, dataset and split)"), "", ""

        initial_results = _format_evaluation_report(
            "Initial Prompt Evaluation", initial_prompt, initial_eval
        )

        # Run OpenEvolve
        progress(0.4, desc="Running OpenEvolve (this may take several minutes)...")

        output_dir = os.path.join(work_dir, "output")
        os.makedirs(output_dir, exist_ok=True)

        try:
            # Run evolution
            run_evolution(
                initial_program_path=initial_prompt_path,
                evaluator_path=evaluator_path,
                config_path=config_path,
                output_dir=output_dir,
                verbose=True
            )

            progress(0.8, desc="Evaluating best prompt...")

            # Get the best prompt
            # NOTE(review): assumes OpenEvolve writes best_program.txt directly
            # into output_dir — confirm; otherwise the initial prompt is reused.
            best_prompt_path = os.path.join(output_dir, "best_program.txt")
            if os.path.exists(best_prompt_path):
                with open(best_prompt_path, "r", encoding="utf-8") as f:
                    best_prompt = f.read()
            else:
                best_prompt = initial_prompt

            # Evaluate best prompt
            final_eval = evaluate_prompt(
                best_prompt, dataset_name, dataset_split, num_samples,
                api_key, model, input_field, target_field
            )
            if final_eval.get("error"):
                return (f"Error during final evaluation: {final_eval['error']}",
                        initial_results, "")

            improvement = final_eval['accuracy'] - initial_eval['accuracy']
            final_results = _format_evaluation_report(
                "Evolved Prompt Evaluation", best_prompt, final_eval,
                improvement=improvement
            )

            # "Iterations" mirrors max_iterations written by create_config_file
            summary = f"""
## Optimization Complete!

### Summary
- Initial Accuracy: {initial_eval['accuracy']:.2f}%
- Final Accuracy: {final_eval['accuracy']:.2f}%
- Improvement: {improvement:.2f}%
- Dataset: {dataset_name}
- Model: {model}
- Samples Evaluated: {num_samples}
- Iterations: 10
"""

            progress(1.0, desc="Complete!")

            return summary, initial_results, final_results

        except Exception as e:
            return f"Error during evolution: {str(e)}", initial_results, ""

    finally:
        # Restore the environment exactly as it was before this run
        if previous_key is None:
            os.environ.pop("OPENAI_API_KEY", None)
        else:
            os.environ["OPENAI_API_KEY"] = previous_key

        # Clean up (best effort: a leftover temp dir must not mask the result)
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)


# Create Gradio interface.
# The layout is declarative: components appear on the page in the order they
# are created inside the nested Blocks/Row/Column contexts below.
with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as demo:
    # Introductory text shown at the top of the page.
    gr.Markdown("""
    # 🧬 OpenEvolve Prompt Optimizer

    Automatically evolve and optimize your prompts using evolutionary algorithms!

    This space uses [OpenEvolve](https://github.com/codelion/openevolve) to iteratively improve prompts
    by testing them on real datasets and evolving better versions.

    ## How it works:
    1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
    2. Select a HuggingFace dataset to test on
    3. Choose a free model from OpenRouter
    4. Click "Optimize Prompt" to evolve better versions
    5. Compare initial vs. evolved performance!
    """)

    # Input controls. Their values are handed to optimize_prompt in the order
    # given by the `inputs` list of the click handler at the end of this block.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Configuration")

            # The key is sent to OpenRouter's OpenAI-compatible endpoint.
            api_key = gr.Textbox(
                label="OpenAI API Key (for OpenRouter)",
                type="password",
                placeholder="sk-or-v1-...",
                info="Get your free key at https://openrouter.ai/keys"
            )

            # Choices come from FREE_MODELS; its first entry is preselected.
            model = gr.Dropdown(
                choices=FREE_MODELS,
                value=FREE_MODELS[0],
                label="Select Model",
                info="Free models available on OpenRouter"
            )

            dataset_name = gr.Textbox(
                label="HuggingFace Dataset",
                value="imdb",
                placeholder="e.g., imdb, hotpot_qa, gsm8k",
                info="Any dataset from HuggingFace Hub"
            )

            dataset_split = gr.Textbox(
                label="Dataset Split",
                value="test",
                placeholder="e.g., train, test, validation"
            )

            input_field = gr.Textbox(
                label="Input Field Name",
                value="text",
                placeholder="e.g., text, question, context",
                info="The field containing inputs to process"
            )

            target_field = gr.Textbox(
                label="Target Field Name",
                value="label",
                placeholder="e.g., label, answer, target",
                info="The field containing expected outputs"
            )

            # Default prompt matches the default dataset/fields above.
            initial_prompt = gr.TextArea(
                label="Initial Prompt",
                value="Analyze the sentiment of the following text and classify it as positive or negative:\n\n{input}\n\nClassification:",
                lines=6,
                info="Use {input} as placeholder for dataset inputs"
            )

            optimize_btn = gr.Button("🚀 Optimize Prompt", variant="primary", size="lg")

    # Output area: the summary on its own row ...
    with gr.Row():
        with gr.Column():
            summary = gr.Markdown(label="Summary")

    # ... followed by the initial and evolved reports side by side.
    with gr.Row():
        with gr.Column():
            initial_results = gr.Markdown(label="Initial Results")
        with gr.Column():
            final_results = gr.Markdown(label="Evolved Results")

    # Static help text: example dataset settings and run characteristics.
    gr.Markdown("""
    ### Example Datasets & Fields:

    | Dataset | Split | Input Field | Target Field | Task |
    |---------|-------|-------------|--------------|------|
    | imdb | test | text | label | Sentiment Analysis |
    | hotpot_qa | validation | question | answer | Question Answering |
    | emotion | test | text | label | Emotion Classification |
    | gsm8k | test | question | answer | Math Reasoning |
    | ag_news | test | text | label | News Classification |

    ### Notes:
    - Evolution runs for 10 iterations with 1 island
    - Each evaluation uses 100 random samples from the dataset
    - The process may take 5-15 minutes depending on the dataset and model
    - Make sure your API key has sufficient credits for the requests
    """)

    # Wire the button: the `inputs` order must match optimize_prompt's
    # positional parameters, and `outputs` its three returned strings.
    optimize_btn.click(
        fn=optimize_prompt,
        inputs=[initial_prompt, dataset_name, dataset_split, model, api_key,
                input_field, target_field],
        outputs=[summary, initial_results, final_results]
    )

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()