Upload app.py
app.py
CHANGED
@@ -13,15 +13,10 @@ import shutil
 import requests
 import glob
 
-#
-#
-FREE_MODELS = [
-    "
-    "meta-llama/llama-3.3-70b-instruct:free",  # 70B - Advanced reasoning
-    "google/gemma-3-27b-it:free",  # 27B - Strong instruction-tuned
-    "mistralai/mistral-small-3.1-24b-instruct:free",  # 24B - Efficient and capable
-    "deepseek/deepseek-r1:free",  # 671B (37B active) - Top-tier but heavily rate-limited
-    "meta-llama/llama-3.2-3b-instruct",  # 3B - PAID but very cheap fallback when free models hit rate limits
+# Model for OpenRouter
+# Using paid llama-3.2-3b-instruct since free tier models have unreliable rate limits
+MODELS = [
+    "meta-llama/llama-3.2-3b-instruct",  # 3B - Reliable, fast, and very cheap ($0.04/$0.04 per 1M tokens)
 ]
 
 
@@ -215,19 +210,34 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
             # Format the prompt with the input
             formatted_prompt = prompt.replace("{input}", str(input_text))
 
-            # Call the model
-
-
-
-
-
-
-
-
-
+            # Call the model with retry logic for transient failures
+            max_retries = 3
+            import time
+            for retry in range(max_retries):
+                try:
+                    response = client.chat.completions.create(
+                        model=model,
+                        messages=[
+                            {"role": "system", "content": "You are a helpful assistant."},
+                            {"role": "user", "content": formatted_prompt}
+                        ],
+                        temperature=0.0,
+                        max_tokens=500,
+                    )
+                    break  # Success, exit retry loop
+                except Exception as api_error:
+                    if retry < max_retries - 1:
+                        wait_time = (retry + 1) * 2  # Back off 2s, 4s, 6s between retries
+                        print(f" API error on sample {idx+1}, retrying in {wait_time}s...")
+                        time.sleep(wait_time)
+                    else:
+                        raise  # Final retry failed, propagate error
 
             prediction = response.choices[0].message.content.strip()
 
+            # Small delay to avoid rate limiting
+            time.sleep(0.1)
+
             # IMDB labels: 0 = negative, 1 = positive
             true_label = int(target)  # 0 or 1
 
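The same call-with-retry pattern added above is repeated inside the generated evaluator further down in this diff. As a standalone reference, here is a minimal sketch of that pattern factored into a helper; the name `call_with_retry` is hypothetical (the commit inlines the loop instead), and it assumes the OpenRouter key is exposed as `OPENAI_API_KEY`, matching the setup instructions added later in this diff.

```python
import time
from openai import OpenAI

# Assumes OPENAI_API_KEY holds an OpenRouter key, as the space's setup notes describe.
client = OpenAI(base_url="https://openrouter.ai/api/v1")

def call_with_retry(prompt: str, model: str, max_retries: int = 3) -> str:
    """Send one chat completion, retrying transient failures with a 2s/4s/6s backoff."""
    for retry in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0,
                max_tokens=500,
            )
            return response.choices[0].message.content.strip()
        except Exception:
            if retry < max_retries - 1:
                time.sleep((retry + 1) * 2)  # wait 2s, then 4s, then 6s
            else:
                raise  # final attempt failed, propagate the error
```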
@@ -494,19 +504,20 @@ def parse_evolution_history(output_dir: str) -> str:
 
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                           input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve that uses
+    """Create an evaluator.py file for OpenEvolve that uses 75 samples for better signal."""
     evaluator_code = f'''
 import os
 import random
+import time
 from datasets import load_dataset
 from openai import OpenAI
 
 def evaluate(prompt: str) -> dict:
     """
-    Evaluate a prompt using
+    Evaluate a prompt using 75 fixed samples for a stronger evolution signal.
 
-
-
+    Using 75 samples balances signal strength (vs 50) against API rate limits (vs 150).
+    Includes early stopping and rate-limit handling.
     """
     try:
         # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
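`create_evaluator_file` renders the evaluator as an f-string template, which is why the generated code in this diff uses doubled braces (`{{...}}`) for literal braces while `{model}` is interpolated. A toy sketch of that mechanism; the helper name, the stub scoring logic, and the returned `"accuracy"` key are illustrative assumptions, not the real template.

```python
import os

def write_evaluator_sketch(work_dir: str, model: str) -> str:
    # Doubled braces ({{ }}) survive as literal braces in the generated file;
    # single braces ({model}) are filled in by the f-string.
    evaluator_code = f'''
def evaluate(prompt: str) -> dict:
    """Toy evaluator: a real one would score the prompt on 75 fixed samples."""
    print("Would call {model} with the candidate prompt")
    return {{"accuracy": 0.0}}
'''
    path = os.path.join(work_dir, "evaluator.py")
    with open(path, "w") as f:
        f.write(evaluator_code)
    return path

# Example: write_evaluator_sketch("/tmp", "meta-llama/llama-3.2-3b-instruct")
```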
@@ -524,8 +535,8 @@ def evaluate(prompt: str) -> dict:
         else:
             raise
 
-        # Sample
-        num_samples =
+        # Sample 75 samples with seed 42 for good signal without excessive API calls
+        num_samples = 75
         if len(dataset) > num_samples:
             # Use SAME sampling logic as initial/final eval
             indices = random.sample(range(len(dataset)), num_samples)
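The fixed seed called out in the comment (42) is what keeps every prompt variant scored on the identical 75 examples, so accuracy changes are comparable across evolution steps. A small sketch of that sampling step, using the space's default `stanfordnlp/imdb` dataset; the `split="test"` choice and the `dataset.select(indices)` call are assumptions, since the surrounding template code is not part of this hunk.

```python
import random
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/imdb", split="test")  # split is an assumption

random.seed(42)                 # fixed seed -> identical indices on every evaluation run
num_samples = 75
indices = random.sample(range(len(dataset)), num_samples)
samples = dataset.select(indices)

print(f"Evaluating on {len(samples)} samples...")
```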
@@ -543,6 +554,7 @@ def evaluate(prompt: str) -> dict:
 
         correct = 0
         total = 0
+        errors = 0
 
         print(f"Evaluating on {{len(samples)}} samples...")
 
@@ -560,16 +572,27 @@ def evaluate(prompt: str) -> dict:
                 # Format the prompt
                 formatted_prompt = prompt.replace("{{input}}", str(input_text))
 
-                # Call the model
-
-
-
-
-
-
-
-
-
+                # Call the model with retry logic for transient failures
+                max_retries = 3
+                for retry in range(max_retries):
+                    try:
+                        response = client.chat.completions.create(
+                            model="{model}",
+                            messages=[
+                                {{"role": "system", "content": "You are a helpful assistant."}},
+                                {{"role": "user", "content": formatted_prompt}}
+                            ],
+                            temperature=0.0,
+                            max_tokens=500,
+                        )
+                        break  # Success, exit retry loop
+                    except Exception as api_error:
+                        if retry < max_retries - 1:
+                            wait_time = (retry + 1) * 2  # Back off 2s, 4s, 6s between retries
+                            print(f" API error on sample {{idx+1}}, retrying in {{wait_time}}s...")
+                            time.sleep(wait_time)
+                        else:
+                            raise  # Final retry failed, propagate error
 
                 prediction = response.choices[0].message.content.strip()
 
@@ -602,11 +625,21 @@ def evaluate(prompt: str) -> dict:
                     correct += 1
                 total += 1
 
-
+                # Small delay to avoid rate limiting
+                time.sleep(0.1)
+
+                if (idx + 1) % 25 == 0:
                     print(f" Progress: {{idx + 1}}/{{len(samples)}} - Current accuracy: {{correct/total:.2%}}")
 
             except Exception as e:
+                errors += 1
                 print(f"Error evaluating sample {{idx+1}}: {{e}}")
+
+                # Early stopping: if more than 40% of samples fail, abort
+                if errors > len(samples) * 0.4:
+                    print(f"Too many errors ({{errors}}/{{idx+1}}), stopping evaluation early")
+                    break
+
                 continue
 
         accuracy = (correct / total) if total > 0 else 0.0
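The new `errors` counter gives the sample loop an error budget: with 75 samples, evaluation aborts once failures exceed 40% of the sample count, i.e. more than 30 failed calls. A self-contained toy of just that control flow; `score_sample` is a hypothetical stand-in for the real model call and answer check.

```python
import random

def score_sample(sample: int) -> bool:
    """Stand-in for one model call plus answer check; fails randomly to exercise the error path."""
    if random.random() < 0.2:
        raise RuntimeError("simulated API failure")
    return sample % 2 == 0

samples = list(range(75))   # stand-in for the 75 evaluation samples
correct, total, errors = 0, 0, 0

for idx, sample in enumerate(samples):
    try:
        correct += int(score_sample(sample))
        total += 1
    except Exception as e:
        errors += 1
        print(f"Error evaluating sample {idx+1}: {e}")
        # Abort once failures exceed 40% of the full sample count (0.4 * 75 = 30).
        if errors > len(samples) * 0.4:
            print(f"Too many errors ({errors}/{idx+1}), stopping evaluation early")
            break
        continue

accuracy = (correct / total) if total > 0 else 0.0
print(f"accuracy={accuracy:.2%}, errors={errors}")
```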
@@ -726,7 +759,7 @@ Your improved prompt here
         "llm": {
             "primary_model": model,
             "api_base": "https://openrouter.ai/api/v1",  # Use OpenRouter endpoint
-            "temperature":
+            "temperature": 0.8,  # Balanced temperature for diverse but reasonable variations
         },
         "max_iterations": 5,
         "checkpoint_interval": 1,  # Save checkpoints every iteration to preserve prompt history
@@ -738,11 +771,11 @@ Your improved prompt here
             "template_dir": templates_dir,  # Use our custom prompt engineering templates
         },
         "evolution": {
-            "population_size":
+            "population_size": 12,  # Moderate population for good exploration without excessive API calls
             "num_islands": 1,  # Single island for simpler evolution
-            "elite_ratio": 0.
-            "explore_ratio": 0.
-            "exploit_ratio": 0.
+            "elite_ratio": 0.15,  # Keep top 15% (1-2 best prompts)
+            "explore_ratio": 0.35,  # Balanced exploration
+            "exploit_ratio": 0.50,  # Balanced exploitation
         },
         "database": {
             "log_prompts": True,  # Save prompts used to generate each program
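Taken together, the LLM and evolution settings touched by this commit amount to roughly the dictionary below. Only keys visible in this diff are shown; `primary_model` is written out as the value of `MODELS[0]`, and keys the app sets elsewhere (such as `prompt.template_dir`) are omitted.

```python
# Rough shape of the OpenEvolve config assembled by the app (keys visible in this diff only).
config = {
    "llm": {
        "primary_model": "meta-llama/llama-3.2-3b-instruct",  # MODELS[0]
        "api_base": "https://openrouter.ai/api/v1",           # OpenRouter endpoint
        "temperature": 0.8,                                    # diverse but reasonable prompt variations
    },
    "max_iterations": 5,
    "checkpoint_interval": 1,      # checkpoint every iteration to keep prompt history
    "evolution": {
        "population_size": 12,     # moderate population, limits API calls
        "num_islands": 1,          # single island for simpler evolution
        "elite_ratio": 0.15,       # keep the top 15% of prompts
        "explore_ratio": 0.35,
        "exploit_ratio": 0.50,     # ratios sum to 1.0 with elite_ratio
    },
    "database": {
        "log_prompts": True,       # save the prompts used to generate each program
    },
}
```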
@@ -940,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 - **Model**: {model}
 - **Initial Eval**: 50 samples
 - **Final Eval**: 50 samples (same samples for fair comparison)
-- **Evolution**:
+- **Evolution**: 75 samples per variant (balanced signal vs API limits)
 - **Iterations**: 5
 
 ### Results
@@ -974,29 +1007,28 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
     This space uses [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) to iteratively improve prompts
     by testing them on real datasets and evolving better versions.
 
+    ## 🔑 Setup (Required)
+    **To use this space:**
+    1. Click "⋮" (menu) → "Duplicate Space" to create your own copy
+    2. In your duplicated space, go to Settings → Variables & Secrets
+    3. Add your OpenRouter API key as `OPENAI_API_KEY`
+    4. Get a free API key at [openrouter.ai](https://openrouter.ai/)
+
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
     2. Default dataset is **IMDB** (movie review sentiment classification) - great for showing prompt improvement!
     3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `dair-ai/emotion`)
-    4.
-    5.
-    6.
-    7. Compare initial vs. best prompt side-by-side (both evaluated on same 200 samples)!
+    4. Click "Optimize Prompt" - the system will validate everything first!
+    5. Watch the evolution progress in real-time
+    6. Compare initial vs. best prompt side-by-side (both evaluated on same 50 samples)!
 
-    **
+    **Model**: Using `meta-llama/llama-3.2-3b-instruct` (reliable, very cheap at ~$0.04 per 1M tokens)
     """)
 
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Configuration")
 
-            model = gr.Dropdown(
-                choices=FREE_MODELS,
-                value=FREE_MODELS[0],
-                label="Select Model",
-                info="Choose from 5 curated free models on OpenRouter (24B to 671B parameters)"
-            )
-
             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
                 value="stanfordnlp/imdb",
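Step 1 of "How it works" depends on the `{input}` placeholder, which the evaluation code fills via `prompt.replace("{input}", ...)`. A short example of what a user-entered prompt and its formatted result look like; the prompt wording and the review text are illustrative only.

```python
initial_prompt = (
    "Decide whether the following movie review is positive or negative.\n\n"
    "Review: {input}\n\n"
    "Answer with exactly one word: positive or negative."
)

# This mirrors what the evaluator does for each dataset row.
formatted_prompt = initial_prompt.replace("{input}", "A wonderful little film with a huge heart.")
print(formatted_prompt)
```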
@@ -1097,10 +1129,19 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
     - [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme)
     """)
 
-    # Wire up the optimize button
+    # Wire up the optimize button with hardcoded model
+    def optimize_with_fixed_model(initial_prompt, dataset_name, dataset_split,
+                                  input_field, target_field, progress=gr.Progress()):
+        """Wrapper to use fixed model instead of dropdown"""
+        return optimize_prompt(
+            initial_prompt, dataset_name, dataset_split,
+            MODELS[0],  # Use fixed llama-3.2-3b model
+            input_field, target_field, progress
+        )
+
     optimize_btn.click(
-        fn=
-        inputs=[initial_prompt, dataset_name, dataset_split,
+        fn=optimize_with_fixed_model,
+        inputs=[initial_prompt, dataset_name, dataset_split,
                 input_field, target_field],
         outputs=[summary, initial_results, final_results]
     )