codelion committed
Commit 5d7f7a8 · verified · 1 Parent(s): 6bab33e

Upload app.py

Files changed (1)
  1. app.py +497 -0
app.py ADDED
@@ -0,0 +1,497 @@
import gradio as gr
import os
import yaml
import json
import random
from datasets import load_dataset
from openai import OpenAI
from openevolve import run_evolution
from typing import Dict, List, Tuple
import tempfile
import shutil

# Free models from OpenRouter (as of 2025)
FREE_MODELS = [
    "google/gemini-2.0-flash-001:free",
    "google/gemini-flash-1.5-8b:free",
    "meta-llama/llama-3.2-3b-instruct:free",
    "meta-llama/llama-3.2-1b-instruct:free",
    "microsoft/phi-3-mini-128k-instruct:free",
    "microsoft/phi-3-medium-128k-instruct:free",
    "qwen/qwen-2-7b-instruct:free",
    "mistralai/mistral-7b-instruct:free",
]

# Popular HuggingFace datasets for different tasks
SAMPLE_DATASETS = {
    "Question Answering": [
        "hotpot_qa",
        "squad",
        "trivia_qa",
    ],
    "Sentiment Analysis": [
        "imdb",
        "yelp_review_full",
        "emotion",
    ],
    "Text Classification": [
        "ag_news",
        "dbpedia_14",
        "SetFit/sst5",
    ],
    "Math Reasoning": [
        "gsm8k",
        "math_qa",
    ],
}

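# Note: prompts here are plain strings carrying a literal "{input}" placeholder that
# evaluate_prompt() below fills via str.replace. Illustrative (hypothetical) example:
#   "Classify the sentiment of this review:\n\n{input}\n\nAnswer:"
#       .replace("{input}", "A wonderful little film.")
#   -> "Classify the sentiment of this review:\n\nA wonderful little film.\n\nAnswer:"
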

def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
                    api_key: str, model: str, input_field: str, target_field: str) -> Dict:
    """Evaluate a prompt on a dataset using the selected model."""
    try:
        # Load dataset
        dataset = load_dataset(dataset_name, split=split, streaming=False)

        # Sample random examples
        if len(dataset) > num_samples:
            indices = random.sample(range(len(dataset)), num_samples)
            samples = [dataset[i] for i in indices]
        else:
            samples = list(dataset)[:num_samples]

        # Initialize OpenAI client with OpenRouter
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        correct = 0
        total = 0
        results = []

        for sample in samples:
            try:
                # Get input and target
                input_text = sample.get(input_field, "")
                if isinstance(input_text, dict):
                    input_text = str(input_text)

                target = sample.get(target_field, "")
                if isinstance(target, dict):
                    target = str(target)

                # Format the prompt with the input
                formatted_prompt = prompt.replace("{input}", str(input_text))

                # Call the model
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": formatted_prompt}
                    ],
                    temperature=0.1,
                    max_tokens=500,
                )

                prediction = response.choices[0].message.content.strip()

                # Simple exact match evaluation
                is_correct = str(target).lower().strip() in prediction.lower()
                if is_correct:
                    correct += 1
                total += 1

                results.append({
                    "input": str(input_text)[:100] + "...",
                    "target": str(target),
                    "prediction": prediction[:100] + "...",
                    "correct": is_correct
                })

            except Exception as e:
                print(f"Error evaluating sample: {e}")
                continue

        accuracy = (correct / total * 100) if total > 0 else 0

        return {
            "accuracy": accuracy,
            "correct": correct,
            "total": total,
            "results": results
        }

    except Exception as e:
        return {
            "error": str(e),
            "accuracy": 0,
            "correct": 0,
            "total": 0,
            "results": []
        }

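# Scoring note: the check above is substring containment, not strict exact match.
# Hypothetical example: target "positive" counts as correct if the model replies
# "The review is Positive." For datasets whose labels are class ids (e.g. imdb's
# 0/1 integers), the digit itself must appear in the reply for a sample to score.
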

def create_evaluator_file(dataset_name: str, split: str, model: str,
                          input_field: str, target_field: str, work_dir: str):
    """Create an evaluator.py file for OpenEvolve."""
    evaluator_code = f'''
import os
import random
from datasets import load_dataset
from openai import OpenAI

def evaluate(prompt: str) -> float:
    """Evaluate a prompt and return a score between 0 and 1."""
    try:
        # Load dataset
        dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)

        # Sample 100 random examples
        num_samples = min(100, len(dataset))
        if len(dataset) > num_samples:
            indices = random.sample(range(len(dataset)), num_samples)
            samples = [dataset[i] for i in indices]
        else:
            samples = list(dataset)[:num_samples]

        # Initialize OpenAI client
        api_key = os.environ.get("OPENAI_API_KEY")
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        correct = 0
        total = 0

        for sample in samples:
            try:
                # Get input and target
                input_text = sample.get("{input_field}", "")
                if isinstance(input_text, dict):
                    input_text = str(input_text)

                target = sample.get("{target_field}", "")
                if isinstance(target, dict):
                    target = str(target)

                # Format the prompt
                formatted_prompt = prompt.replace("{{input}}", str(input_text))

                # Call the model
                response = client.chat.completions.create(
                    model="{model}",
                    messages=[
                        {{"role": "system", "content": "You are a helpful assistant."}},
                        {{"role": "user", "content": formatted_prompt}}
                    ],
                    temperature=0.1,
                    max_tokens=500,
                )

                prediction = response.choices[0].message.content.strip()

                # Simple evaluation
                is_correct = str(target).lower().strip() in prediction.lower()
                if is_correct:
                    correct += 1
                total += 1

            except Exception as e:
                print(f"Error evaluating sample: {{e}}")
                continue

        # Return score between 0 and 1
        return (correct / total) if total > 0 else 0.0

    except Exception as e:
        print(f"Error in evaluation: {{e}}")
        return 0.0
'''

    evaluator_path = os.path.join(work_dir, "evaluator.py")
    with open(evaluator_path, "w") as f:
        f.write(evaluator_code)

    return evaluator_path

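# Sketch of how the template above renders (hypothetical arguments): single braces
# are substituted by the f-string, doubled braces survive as literals. E.g. with
# dataset_name="imdb", split="test", input_field="text", target_field="label",
# the generated evaluator.py contains lines such as:
#   dataset = load_dataset("imdb", split="test", streaming=False)
#   formatted_prompt = prompt.replace("{input}", str(input_text))
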

def create_config_file(model: str, work_dir: str):
    """Create a config.yaml file for OpenEvolve."""
    config = {
        "llm": {
            "api_base": "https://openrouter.ai/api/v1",
            "model": model,
            "temperature": 0.7,
            "max_tokens": 4096,
        },
        "evolution": {
            "max_iterations": 10,
            "population_size": 10,
            "num_islands": 1,
            "elite_ratio": 0.1,
            "explore_ratio": 0.3,
            "exploit_ratio": 0.6,
        },
        "evaluation": {
            "timeout": 1800,
        }
    }

    config_path = os.path.join(work_dir, "config.yaml")
    with open(config_path, "w") as f:
        yaml.dump(config, f)

    return config_path

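# For reference, yaml.dump on the dict above writes a config.yaml roughly like the
# snippet below (key order may differ; that OpenEvolve accepts this exact schema is
# an assumption of this app, not verified here):
#   llm:
#     api_base: https://openrouter.ai/api/v1
#     model: <selected model>
#     temperature: 0.7
#     max_tokens: 4096
#   evolution:
#     max_iterations: 10
#     population_size: 10
#     num_islands: 1
#     elite_ratio: 0.1
#     explore_ratio: 0.3
#     exploit_ratio: 0.6
#   evaluation:
#     timeout: 1800
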

def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
                    model: str, api_key: str, input_field: str, target_field: str,
                    progress=gr.Progress()) -> Tuple[str, str, str]:
    """Run OpenEvolve to optimize the prompt."""

    if not api_key:
        return "Error: OpenAI API Key is required", "", ""

    # Set API key as environment variable
    os.environ["OPENAI_API_KEY"] = api_key

    progress(0, desc="Setting up...")

    # Create temporary working directory
    work_dir = tempfile.mkdtemp(prefix="openevolve_")

    try:
        # Save initial prompt
        initial_prompt_path = os.path.join(work_dir, "initial_prompt.txt")
        with open(initial_prompt_path, "w") as f:
            f.write(initial_prompt)

        # Create evaluator
        progress(0.1, desc="Creating evaluator...")
        evaluator_path = create_evaluator_file(dataset_name, dataset_split, model,
                                               input_field, target_field, work_dir)

        # Create config
        progress(0.2, desc="Creating configuration...")
        config_path = create_config_file(model, work_dir)

        # Run initial evaluation
        progress(0.3, desc="Running initial evaluation...")
        initial_eval = evaluate_prompt(
            initial_prompt, dataset_name, dataset_split, 100,
            api_key, model, input_field, target_field
        )

        initial_results = f"""
### Initial Prompt Evaluation

**Prompt:**
```
{initial_prompt}
```

**Results:**
- Accuracy: {initial_eval['accuracy']:.2f}%
- Correct: {initial_eval['correct']}/{initial_eval['total']}

**Sample Results:**
"""
        for i, result in enumerate(initial_eval['results'][:5], 1):
            initial_results += f"\n{i}. Input: {result['input']}\n"
            initial_results += f" Target: {result['target']}\n"
            initial_results += f" Prediction: {result['prediction']}\n"
            initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"

        # Run OpenEvolve
        progress(0.4, desc="Running OpenEvolve (this may take several minutes)...")

        output_dir = os.path.join(work_dir, "output")
        os.makedirs(output_dir, exist_ok=True)

        try:
            # Run evolution
            result = run_evolution(
                initial_program_path=initial_prompt_path,
                evaluator_path=evaluator_path,
                config_path=config_path,
                output_dir=output_dir,
                verbose=True
            )

            progress(0.8, desc="Evaluating best prompt...")

            # Get the best prompt
            best_prompt_path = os.path.join(output_dir, "best_program.txt")
            if os.path.exists(best_prompt_path):
                with open(best_prompt_path, "r") as f:
                    best_prompt = f.read()
            else:
                best_prompt = initial_prompt

            # Evaluate best prompt
            final_eval = evaluate_prompt(
                best_prompt, dataset_name, dataset_split, 100,
                api_key, model, input_field, target_field
            )

            final_results = f"""
### Evolved Prompt Evaluation

**Prompt:**
```
{best_prompt}
```

**Results:**
- Accuracy: {final_eval['accuracy']:.2f}%
- Correct: {final_eval['correct']}/{final_eval['total']}
- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%

**Sample Results:**
"""
            for i, result in enumerate(final_eval['results'][:5], 1):
                final_results += f"\n{i}. Input: {result['input']}\n"
                final_results += f" Target: {result['target']}\n"
                final_results += f" Prediction: {result['prediction']}\n"
                final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"

            summary = f"""
## Optimization Complete!

### Summary
- Initial Accuracy: {initial_eval['accuracy']:.2f}%
- Final Accuracy: {final_eval['accuracy']:.2f}%
- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%
- Dataset: {dataset_name}
- Model: {model}
- Samples Evaluated: 100
- Iterations: 10
"""

            progress(1.0, desc="Complete!")

            return summary, initial_results, final_results

        except Exception as e:
            return f"Error during evolution: {str(e)}", initial_results, ""

    finally:
        # Clean up
        try:
            shutil.rmtree(work_dir)
        except:
            pass


# Create Gradio interface
with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧬 OpenEvolve Prompt Optimizer

    Automatically evolve and optimize your prompts using evolutionary algorithms!

    This space uses [OpenEvolve](https://github.com/codelion/openevolve) to iteratively improve prompts
    by testing them on real datasets and evolving better versions.

    ## How it works:
    1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
    2. Select a HuggingFace dataset to test on
    3. Choose a free model from OpenRouter
    4. Click "Optimize Prompt" to evolve better versions
    5. Compare initial vs. evolved performance!
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Configuration")

            api_key = gr.Textbox(
                label="OpenAI API Key (for OpenRouter)",
                type="password",
                placeholder="sk-or-v1-...",
                info="Get your free key at https://openrouter.ai/keys"
            )

            model = gr.Dropdown(
                choices=FREE_MODELS,
                value=FREE_MODELS[0],
                label="Select Model",
                info="Free models available on OpenRouter"
            )

            dataset_name = gr.Textbox(
                label="HuggingFace Dataset",
                value="imdb",
                placeholder="e.g., imdb, hotpot_qa, gsm8k",
                info="Any dataset from HuggingFace Hub"
            )

            dataset_split = gr.Textbox(
                label="Dataset Split",
                value="test",
                placeholder="e.g., train, test, validation"
            )

            input_field = gr.Textbox(
                label="Input Field Name",
                value="text",
                placeholder="e.g., text, question, context",
                info="The field containing inputs to process"
            )

            target_field = gr.Textbox(
                label="Target Field Name",
                value="label",
                placeholder="e.g., label, answer, target",
                info="The field containing expected outputs"
            )

            initial_prompt = gr.TextArea(
                label="Initial Prompt",
                value="Analyze the sentiment of the following text and classify it as positive or negative:\n\n{input}\n\nClassification:",
                lines=6,
                info="Use {input} as placeholder for dataset inputs"
            )

            optimize_btn = gr.Button("🚀 Optimize Prompt", variant="primary", size="lg")

    with gr.Row():
        with gr.Column():
            summary = gr.Markdown(label="Summary")

    with gr.Row():
        with gr.Column():
            initial_results = gr.Markdown(label="Initial Results")
        with gr.Column():
            final_results = gr.Markdown(label="Evolved Results")

    gr.Markdown("""
    ### Example Datasets & Fields:

    | Dataset | Split | Input Field | Target Field | Task |
    |---------|-------|-------------|--------------|------|
    | imdb | test | text | label | Sentiment Analysis |
    | hotpot_qa | validation | question | answer | Question Answering |
    | emotion | test | text | label | Emotion Classification |
    | gsm8k | test | question | answer | Math Reasoning |
    | ag_news | test | text | label | News Classification |

    ### Notes:
    - Evolution runs for 10 iterations with 1 island
    - Each evaluation uses 100 random samples from the dataset
    - The process may take 5-15 minutes depending on the dataset and model
    - Make sure your API key has sufficient credits for the requests
    """)

    optimize_btn.click(
        fn=optimize_prompt,
        inputs=[initial_prompt, dataset_name, dataset_split, model, api_key,
                input_field, target_field],
        outputs=[summary, initial_results, final_results]
    )

if __name__ == "__main__":
    demo.launch()