codelion committed
Commit 653da52 · verified · 1 Parent(s): 2792008

Upload app.py

Files changed (1)
  1. app.py +98 -57
app.py CHANGED
@@ -13,15 +13,10 @@ import shutil
  import requests
  import glob

- # Free models from OpenRouter - Curated selection (verified as of 2025)
- # IMPORTANT: The :free suffix is REQUIRED to use the free tier. Without it, requests are charged!
- FREE_MODELS = [
-     "qwen/qwen-2.5-72b-instruct:free",  # 72B - Strong in coding/math/multilingual (default - better rate limits)
-     "meta-llama/llama-3.3-70b-instruct:free",  # 70B - Advanced reasoning
-     "google/gemma-3-27b-it:free",  # 27B - Strong instruction-tuned
-     "mistralai/mistral-small-3.1-24b-instruct:free",  # 24B - Efficient and capable
-     "deepseek/deepseek-r1:free",  # 671B (37B active) - Top-tier but heavily rate-limited
-     "meta-llama/llama-3.2-3b-instruct",  # 3B - PAID but very cheap fallback when free models hit rate limits
+ # Model for OpenRouter
+ # Using paid llama-3.2-3b-instruct since free tier models have unreliable rate limits
+ MODELS = [
+     "meta-llama/llama-3.2-3b-instruct",  # 3B - Reliable, fast, and very cheap ($0.04/$0.04 per 1M tokens)
  ]

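For reference, the single entry in MODELS is consumed through the OpenAI-compatible client pointed at OpenRouter, using the same `api_base` and `OPENAI_API_KEY` conventions that appear later in this diff. A minimal sketch of that call path, not part of the commit:

```python
import os
from openai import OpenAI

# OpenRouter exposes an OpenAI-compatible API, so the stock client works
# once base_url points at it; the key is assumed to be in OPENAI_API_KEY.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENAI_API_KEY"],
)

response = client.chat.completions.create(
    model="meta-llama/llama-3.2-3b-instruct",  # MODELS[0] in the hunk above
    messages=[{"role": "user", "content": "Classify the sentiment: great movie!"}],
    temperature=0.0,
    max_tokens=10,
)
print(response.choices[0].message.content)
```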
 
@@ -215,19 +210,34 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
              # Format the prompt with the input
              formatted_prompt = prompt.replace("{input}", str(input_text))

-             # Call the model
-             response = client.chat.completions.create(
-                 model=model,
-                 messages=[
-                     {"role": "system", "content": "You are a helpful assistant."},
-                     {"role": "user", "content": formatted_prompt}
-                 ],
-                 temperature=0.0,
-                 max_tokens=500,
-             )
+             # Call the model with retry logic for transient failures
+             max_retries = 3
+             import time
+             for retry in range(max_retries):
+                 try:
+                     response = client.chat.completions.create(
+                         model=model,
+                         messages=[
+                             {"role": "system", "content": "You are a helpful assistant."},
+                             {"role": "user", "content": formatted_prompt}
+                         ],
+                         temperature=0.0,
+                         max_tokens=500,
+                     )
+                     break  # Success, exit retry loop
+                 except Exception as api_error:
+                     if retry < max_retries - 1:
+                         wait_time = (retry + 1) * 2  # Linear backoff: 2s, 4s, 6s
+                         print(f"  API error on sample {idx+1}, retrying in {wait_time}s...")
+                         time.sleep(wait_time)
+                     else:
+                         raise  # Final retry failed, propagate error

              prediction = response.choices[0].message.content.strip()

+             # Small delay to avoid rate limiting
+             time.sleep(0.1)
+
              # IMDB labels: 0 = negative, 1 = positive
              true_label = int(target)  # 0 or 1

@@ -494,19 +504,20 @@ def parse_evolution_history(output_dir: str) -> str:

  def create_evaluator_file(dataset_name: str, split: str, model: str,
                            input_field: str, target_field: str, work_dir: str):
-     """Create an evaluator.py file for OpenEvolve that uses 150 samples for better signal."""
+     """Create an evaluator.py file for OpenEvolve that uses 75 samples for better signal."""
      evaluator_code = f'''
  import os
  import random
+ import time
  from datasets import load_dataset
  from openai import OpenAI

  def evaluate(prompt: str) -> dict:
      """
-     Evaluate a prompt using 150 fixed samples for stronger evolution signal.
+     Evaluate a prompt using 75 fixed samples for stronger evolution signal.

-     Using more samples (150 vs 50) gives evolution better signal to distinguish
-     good prompts from bad ones. Final comparison still uses the same 50 samples.
+     75 samples balances signal strength (vs 50) with API rate limits (vs 150).
+     Includes early stopping and rate limit handling.
      """
      try:
          # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
@@ -524,8 +535,8 @@ def evaluate(prompt: str) -> dict:
              else:
                  raise

-         # Sample 150 samples with seed 42 for stronger signal during evolution
-         num_samples = 150
+         # Sample 75 samples with seed 42 for good signal without excessive API calls
+         num_samples = 75
          if len(dataset) > num_samples:
              # Use SAME sampling logic as initial/final eval
              indices = random.sample(range(len(dataset)), num_samples)
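The "fixed seed" comment above matters here: `random.sample` is only reproducible if the RNG is seeded first, and the evaluator relies on every prompt variant being scored on the same rows. A small sketch of that property, using a local `random.Random(42)` rather than the global seed the app sets; the helper name is illustrative:

```python
import random

def sample_indices(dataset_len: int, num_samples: int = 75, seed: int = 42) -> list:
    """Deterministic choice of dataset indices: same seed -> same indices."""
    rng = random.Random(seed)  # local RNG so global random state is untouched
    if dataset_len <= num_samples:
        return list(range(dataset_len))
    return rng.sample(range(dataset_len), num_samples)

# Every call agrees, so each evolved prompt is evaluated on identical samples.
assert sample_indices(25000) == sample_indices(25000)
```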
@@ -543,6 +554,7 @@ def evaluate(prompt: str) -> dict:

          correct = 0
          total = 0
+         errors = 0

          print(f"Evaluating on {{len(samples)}} samples...")

@@ -560,16 +572,27 @@ def evaluate(prompt: str) -> dict:
                  # Format the prompt
                  formatted_prompt = prompt.replace("{{input}}", str(input_text))

-                 # Call the model
-                 response = client.chat.completions.create(
-                     model="{model}",
-                     messages=[
-                         {{"role": "system", "content": "You are a helpful assistant."}},
-                         {{"role": "user", "content": formatted_prompt}}
-                     ],
-                     temperature=0.0,
-                     max_tokens=500,
-                 )
+                 # Call the model with retry logic for transient failures
+                 max_retries = 3
+                 for retry in range(max_retries):
+                     try:
+                         response = client.chat.completions.create(
+                             model="{model}",
+                             messages=[
+                                 {{"role": "system", "content": "You are a helpful assistant."}},
+                                 {{"role": "user", "content": formatted_prompt}}
+                             ],
+                             temperature=0.0,
+                             max_tokens=500,
+                         )
+                         break  # Success, exit retry loop
+                     except Exception as api_error:
+                         if retry < max_retries - 1:
+                             wait_time = (retry + 1) * 2  # Linear backoff: 2s, 4s, 6s
+                             print(f"  API error on sample {{idx+1}}, retrying in {{wait_time}}s...")
+                             time.sleep(wait_time)
+                         else:
+                             raise  # Final retry failed, propagate error

                  prediction = response.choices[0].message.content.strip()

@@ -602,11 +625,21 @@ def evaluate(prompt: str) -> dict:
                      correct += 1
                  total += 1

-                 if (idx + 1) % 50 == 0:
+                 # Small delay to avoid rate limiting
+                 time.sleep(0.1)
+
+                 if (idx + 1) % 25 == 0:
                      print(f"  Progress: {{idx + 1}}/{{len(samples)}} - Current accuracy: {{correct/total:.2%}}")

              except Exception as e:
+                 errors += 1
                  print(f"Error evaluating sample {{idx+1}}: {{e}}")
+
+                 # Early stopping: if more than 40% of samples fail, abort
+                 if errors > len(samples) * 0.4:
+                     print(f"Too many errors ({{errors}}/{{idx+1}}), stopping evaluation early")
+                     break
+
                  continue

      accuracy = (correct / total) if total > 0 else 0.0
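Once `create_evaluator_file` has written `evaluator.py` into the working directory, the module can be smoke-tested before OpenEvolve drives it. A minimal sketch, assuming the file has already been generated and `OPENAI_API_KEY` is set; the path and prompt are placeholders:

```python
import importlib.util

# Load the generated evaluator module from disk.
spec = importlib.util.spec_from_file_location("evaluator", "work_dir/evaluator.py")
evaluator = importlib.util.module_from_spec(spec)
spec.loader.exec_module(evaluator)

# evaluate() takes a candidate prompt and returns a dict of metrics;
# the hunks above show it computing accuracy over 75 fixed samples.
print(evaluator.evaluate("Classify the sentiment of this review: {input}"))
```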
@@ -726,7 +759,7 @@ Your improved prompt here
          "llm": {
              "primary_model": model,
              "api_base": "https://openrouter.ai/api/v1",  # Use OpenRouter endpoint
-             "temperature": 1.0,  # Higher temperature for more diverse prompt variations
+             "temperature": 0.8,  # Balanced temperature for diverse but reasonable variations
          },
          "max_iterations": 5,
          "checkpoint_interval": 1,  # Save checkpoints every iteration to preserve prompt history
@@ -738,11 +771,11 @@ Your improved prompt here
              "template_dir": templates_dir,  # Use our custom prompt engineering templates
          },
          "evolution": {
-             "population_size": 15,  # Increased from 10 for more exploration
+             "population_size": 12,  # Moderate population for good exploration without excessive API calls
              "num_islands": 1,  # Single island for simpler evolution
-             "elite_ratio": 0.1,  # Keep top 10% (1-2 best prompts)
-             "explore_ratio": 0.4,  # Increased exploration (was 0.3)
-             "exploit_ratio": 0.5,  # Reduced exploitation (was 0.6)
+             "elite_ratio": 0.15,  # Keep top 15% (1-2 best prompts)
+             "explore_ratio": 0.35,  # Balanced exploration
+             "exploit_ratio": 0.50,  # Balanced exploitation
          },
          "database": {
              "log_prompts": True,  # Save prompts used to generate each program
@@ -940,7 +973,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
  - **Model**: {model}
  - **Initial Eval**: 50 samples
  - **Final Eval**: 50 samples (same samples for fair comparison)
- - **Evolution**: 150 samples per variant (more data for stronger signal)
+ - **Evolution**: 75 samples per variant (balanced signal vs API limits)
  - **Iterations**: 5

  ### Results
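Pulling together the settings the config hunks above touch, the configuration handed to OpenEvolve now looks roughly like this (a sketch reconstructed from the diff; keys the diff does not show are omitted):

```python
config = {
    "llm": {
        "primary_model": model,  # "meta-llama/llama-3.2-3b-instruct"
        "api_base": "https://openrouter.ai/api/v1",
        "temperature": 0.8,      # was 1.0
    },
    "max_iterations": 5,
    "checkpoint_interval": 1,
    "evolution": {
        "population_size": 12,   # was 15
        "num_islands": 1,
        "elite_ratio": 0.15,     # was 0.1
        "explore_ratio": 0.35,   # was 0.4
        "exploit_ratio": 0.50,   # was 0.5
    },
    "database": {
        "log_prompts": True,
    },
}
```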
@@ -974,29 +1007,28 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
      This space uses [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) to iteratively improve prompts
      by testing them on real datasets and evolving better versions.

+     ## 🔑 Setup (Required)
+     **To use this space:**
+     1. Click "⋮" (menu) → "Duplicate Space" to create your own copy
+     2. In your duplicated space, go to Settings → Variables & Secrets
+     3. Add your OpenRouter API key as `OPENAI_API_KEY`
+     4. Get a free API key at [openrouter.ai](https://openrouter.ai/)
+
      ## How it works:
      1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
      2. Default dataset is **IMDB** (movie review sentiment classification) - great for showing prompt improvement!
      3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `dair-ai/emotion`)
-     4. Choose a free model from OpenRouter
-     5. Click "Optimize Prompt" - the system will validate everything first!
-     6. Watch the evolution progress in real-time
-     7. Compare initial vs. best prompt side-by-side (both evaluated on same 200 samples)!
+     4. Click "Optimize Prompt" - the system will validate everything first!
+     5. Watch the evolution progress in real-time
+     6. Compare initial vs. best prompt side-by-side (both evaluated on same 50 samples)!

-     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
+     **Model**: Using `meta-llama/llama-3.2-3b-instruct` (reliable, very cheap at ~$0.04 per 1M tokens)
      """)

      with gr.Row():
          with gr.Column():
              gr.Markdown("### Configuration")

-             model = gr.Dropdown(
-                 choices=FREE_MODELS,
-                 value=FREE_MODELS[0],
-                 label="Select Model",
-                 info="Choose from 5 curated free models on OpenRouter (24B to 671B parameters)"
-             )
-
              dataset_name = gr.Textbox(
                  label="HuggingFace Dataset (Full Name)",
                  value="stanfordnlp/imdb",
@@ -1097,10 +1129,19 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
      - [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme)
      """)

-     # Wire up the optimize button
+     # Wire up the optimize button with hardcoded model
+     def optimize_with_fixed_model(initial_prompt, dataset_name, dataset_split,
+                                   input_field, target_field, progress=gr.Progress()):
+         """Wrapper to use fixed model instead of dropdown"""
+         return optimize_prompt(
+             initial_prompt, dataset_name, dataset_split,
+             MODELS[0],  # Use fixed llama-3.2-3b model
+             input_field, target_field, progress
+         )
+
      optimize_btn.click(
-         fn=optimize_prompt,
-         inputs=[initial_prompt, dataset_name, dataset_split, model,
+         fn=optimize_with_fixed_model,
+         inputs=[initial_prompt, dataset_name, dataset_split,
                  input_field, target_field],
          outputs=[summary, initial_results, final_results]
      )
 