codelion committed
Commit 5d7f7a8 · verified · 1 Parent(s): 6bab33e

Upload app.py

Files changed (1)
  1. app.py +497 -0
app.py ADDED
@@ -0,0 +1,497 @@
import gradio as gr
import os
import yaml
import json
import random
from datasets import load_dataset
from openai import OpenAI
from openevolve import run_evolution
from typing import Dict, List, Tuple
import tempfile
import shutil

# Free models from OpenRouter (as of 2025)
FREE_MODELS = [
    "google/gemini-2.0-flash-001:free",
    "google/gemini-flash-1.5-8b:free",
    "meta-llama/llama-3.2-3b-instruct:free",
    "meta-llama/llama-3.2-1b-instruct:free",
    "microsoft/phi-3-mini-128k-instruct:free",
    "microsoft/phi-3-medium-128k-instruct:free",
    "qwen/qwen-2-7b-instruct:free",
    "mistralai/mistral-7b-instruct:free",
]

# Popular HuggingFace datasets for different tasks
SAMPLE_DATASETS = {
    "Question Answering": [
        "hotpot_qa",
        "squad",
        "trivia_qa",
    ],
    "Sentiment Analysis": [
        "imdb",
        "yelp_review_full",
        "emotion",
    ],
    "Text Classification": [
        "ag_news",
        "dbpedia_14",
        "SetFit/sst5",
    ],
    "Math Reasoning": [
        "gsm8k",
        "math_qa",
    ],
}

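# Note: prompts here are plain strings carrying a literal "{input}" placeholder that
# evaluate_prompt() below fills via str.replace. Illustrative (hypothetical) example:
#   "Classify the sentiment of this review:\n\n{input}\n\nAnswer:"
#       .replace("{input}", "A wonderful little film.")
#   -> "Classify the sentiment of this review:\n\nA wonderful little film.\n\nAnswer:"
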

def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
                    api_key: str, model: str, input_field: str, target_field: str) -> Dict:
    """Evaluate a prompt on a dataset using the selected model."""
    try:
        # Load dataset
        dataset = load_dataset(dataset_name, split=split, streaming=False)

        # Sample random examples
        if len(dataset) > num_samples:
            indices = random.sample(range(len(dataset)), num_samples)
            samples = [dataset[i] for i in indices]
        else:
            samples = list(dataset)[:num_samples]

        # Initialize OpenAI client with OpenRouter
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        correct = 0
        total = 0
        results = []

        for sample in samples:
            try:
                # Get input and target
                input_text = sample.get(input_field, "")
                if isinstance(input_text, dict):
                    input_text = str(input_text)

                target = sample.get(target_field, "")
                if isinstance(target, dict):
                    target = str(target)

                # Format the prompt with the input
                formatted_prompt = prompt.replace("{input}", str(input_text))

                # Call the model
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": formatted_prompt}
                    ],
                    temperature=0.1,
                    max_tokens=500,
                )

                prediction = response.choices[0].message.content.strip()

                # Simple exact match evaluation
                is_correct = str(target).lower().strip() in prediction.lower()
                if is_correct:
                    correct += 1
                total += 1

                results.append({
                    "input": str(input_text)[:100] + "...",
                    "target": str(target),
                    "prediction": prediction[:100] + "...",
                    "correct": is_correct
                })

            except Exception as e:
                print(f"Error evaluating sample: {e}")
                continue

        accuracy = (correct / total * 100) if total > 0 else 0

        return {
            "accuracy": accuracy,
            "correct": correct,
            "total": total,
            "results": results
        }

    except Exception as e:
        return {
            "error": str(e),
            "accuracy": 0,
            "correct": 0,
            "total": 0,
            "results": []
        }

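# Scoring note: the check above is substring containment, not strict exact match.
# Hypothetical example: target "positive" counts as correct if the model replies
# "The review is Positive." For datasets whose labels are class ids (e.g. imdb's
# 0/1 integers), the digit itself must appear in the reply for a sample to score.
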

def create_evaluator_file(dataset_name: str, split: str, model: str,
                          input_field: str, target_field: str, work_dir: str):
    """Create an evaluator.py file for OpenEvolve."""
    evaluator_code = f'''
import os
import random
from datasets import load_dataset
from openai import OpenAI

def evaluate(prompt: str) -> float:
    """Evaluate a prompt and return a score between 0 and 1."""
    try:
        # Load dataset
        dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)

        # Sample 100 random examples
        num_samples = min(100, len(dataset))
        if len(dataset) > num_samples:
            indices = random.sample(range(len(dataset)), num_samples)
            samples = [dataset[i] for i in indices]
        else:
            samples = list(dataset)[:num_samples]

        # Initialize OpenAI client
        api_key = os.environ.get("OPENAI_API_KEY")
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        correct = 0
        total = 0

        for sample in samples:
            try:
                # Get input and target
                input_text = sample.get("{input_field}", "")
                if isinstance(input_text, dict):
                    input_text = str(input_text)

                target = sample.get("{target_field}", "")
                if isinstance(target, dict):
                    target = str(target)

                # Format the prompt
                formatted_prompt = prompt.replace("{{input}}", str(input_text))

                # Call the model
                response = client.chat.completions.create(
                    model="{model}",
                    messages=[
                        {{"role": "system", "content": "You are a helpful assistant."}},
                        {{"role": "user", "content": formatted_prompt}}
                    ],
                    temperature=0.1,
                    max_tokens=500,
                )

                prediction = response.choices[0].message.content.strip()

                # Simple evaluation
                is_correct = str(target).lower().strip() in prediction.lower()
                if is_correct:
                    correct += 1
                total += 1

            except Exception as e:
                print(f"Error evaluating sample: {{e}}")
                continue

        # Return score between 0 and 1
        return (correct / total) if total > 0 else 0.0

    except Exception as e:
        print(f"Error in evaluation: {{e}}")
        return 0.0
'''

    evaluator_path = os.path.join(work_dir, "evaluator.py")
    with open(evaluator_path, "w") as f:
        f.write(evaluator_code)

    return evaluator_path

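# Sketch of how the template above renders (hypothetical arguments): single braces
# are substituted by the f-string, doubled braces survive as literals. E.g. with
# dataset_name="imdb", split="test", input_field="text", target_field="label",
# the generated evaluator.py contains lines such as:
#   dataset = load_dataset("imdb", split="test", streaming=False)
#   formatted_prompt = prompt.replace("{input}", str(input_text))
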

def create_config_file(model: str, work_dir: str):
    """Create a config.yaml file for OpenEvolve."""
    config = {
        "llm": {
            "api_base": "https://openrouter.ai/api/v1",
            "model": model,
            "temperature": 0.7,
            "max_tokens": 4096,
        },
        "evolution": {
            "max_iterations": 10,
            "population_size": 10,
            "num_islands": 1,
            "elite_ratio": 0.1,
            "explore_ratio": 0.3,
            "exploit_ratio": 0.6,
        },
        "evaluation": {
            "timeout": 1800,
        }
    }

    config_path = os.path.join(work_dir, "config.yaml")
    with open(config_path, "w") as f:
        yaml.dump(config, f)

    return config_path

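# For reference, yaml.dump on the dict above writes a config.yaml roughly like the
# snippet below (key order may differ; that OpenEvolve accepts this exact schema is
# an assumption of this app, not verified here):
#   llm:
#     api_base: https://openrouter.ai/api/v1
#     model: <selected model>
#     temperature: 0.7
#     max_tokens: 4096
#   evolution:
#     max_iterations: 10
#     population_size: 10
#     num_islands: 1
#     elite_ratio: 0.1
#     explore_ratio: 0.3
#     exploit_ratio: 0.6
#   evaluation:
#     timeout: 1800
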

def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
                    model: str, api_key: str, input_field: str, target_field: str,
                    progress=gr.Progress()) -> Tuple[str, str, str]:
    """Run OpenEvolve to optimize the prompt."""

    if not api_key:
        return "Error: OpenAI API Key is required", "", ""

    # Set API key as environment variable
    os.environ["OPENAI_API_KEY"] = api_key

    progress(0, desc="Setting up...")

    # Create temporary working directory
    work_dir = tempfile.mkdtemp(prefix="openevolve_")

    try:
        # Save initial prompt
        initial_prompt_path = os.path.join(work_dir, "initial_prompt.txt")
        with open(initial_prompt_path, "w") as f:
            f.write(initial_prompt)

        # Create evaluator
        progress(0.1, desc="Creating evaluator...")
        evaluator_path = create_evaluator_file(dataset_name, dataset_split, model,
                                               input_field, target_field, work_dir)

        # Create config
        progress(0.2, desc="Creating configuration...")
        config_path = create_config_file(model, work_dir)

        # Run initial evaluation
        progress(0.3, desc="Running initial evaluation...")
        initial_eval = evaluate_prompt(
            initial_prompt, dataset_name, dataset_split, 100,
            api_key, model, input_field, target_field
        )

        initial_results = f"""
### Initial Prompt Evaluation

**Prompt:**
```
{initial_prompt}
```

**Results:**
- Accuracy: {initial_eval['accuracy']:.2f}%
- Correct: {initial_eval['correct']}/{initial_eval['total']}

**Sample Results:**
"""
        for i, result in enumerate(initial_eval['results'][:5], 1):
            initial_results += f"\n{i}. Input: {result['input']}\n"
            initial_results += f" Target: {result['target']}\n"
            initial_results += f" Prediction: {result['prediction']}\n"
            initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"

        # Run OpenEvolve
        progress(0.4, desc="Running OpenEvolve (this may take several minutes)...")

        output_dir = os.path.join(work_dir, "output")
        os.makedirs(output_dir, exist_ok=True)

        try:
            # Run evolution
            result = run_evolution(
                initial_program_path=initial_prompt_path,
                evaluator_path=evaluator_path,
                config_path=config_path,
                output_dir=output_dir,
                verbose=True
            )

            progress(0.8, desc="Evaluating best prompt...")

            # Get the best prompt
            best_prompt_path = os.path.join(output_dir, "best_program.txt")
            if os.path.exists(best_prompt_path):
                with open(best_prompt_path, "r") as f:
                    best_prompt = f.read()
            else:
                best_prompt = initial_prompt

            # Evaluate best prompt
            final_eval = evaluate_prompt(
                best_prompt, dataset_name, dataset_split, 100,
                api_key, model, input_field, target_field
            )

            final_results = f"""
### Evolved Prompt Evaluation

**Prompt:**
```
{best_prompt}
```

**Results:**
- Accuracy: {final_eval['accuracy']:.2f}%
- Correct: {final_eval['correct']}/{final_eval['total']}
- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%

**Sample Results:**
"""
            for i, result in enumerate(final_eval['results'][:5], 1):
                final_results += f"\n{i}. Input: {result['input']}\n"
                final_results += f" Target: {result['target']}\n"
                final_results += f" Prediction: {result['prediction']}\n"
                final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"

            summary = f"""
## Optimization Complete!

### Summary
- Initial Accuracy: {initial_eval['accuracy']:.2f}%
- Final Accuracy: {final_eval['accuracy']:.2f}%
- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%
- Dataset: {dataset_name}
- Model: {model}
- Samples Evaluated: 100
- Iterations: 10
"""

            progress(1.0, desc="Complete!")

            return summary, initial_results, final_results

        except Exception as e:
            return f"Error during evolution: {str(e)}", initial_results, ""

    finally:
        # Clean up
        try:
            shutil.rmtree(work_dir)
        except:
            pass


# Create Gradio interface
with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧬 OpenEvolve Prompt Optimizer

    Automatically evolve and optimize your prompts using evolutionary algorithms!

    This space uses [OpenEvolve](https://github.com/codelion/openevolve) to iteratively improve prompts
    by testing them on real datasets and evolving better versions.

    ## How it works:
    1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
    2. Select a HuggingFace dataset to test on
    3. Choose a free model from OpenRouter
    4. Click "Optimize Prompt" to evolve better versions
    5. Compare initial vs. evolved performance!
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Configuration")

            api_key = gr.Textbox(
                label="OpenAI API Key (for OpenRouter)",
                type="password",
                placeholder="sk-or-v1-...",
                info="Get your free key at https://openrouter.ai/keys"
            )

            model = gr.Dropdown(
                choices=FREE_MODELS,
                value=FREE_MODELS[0],
                label="Select Model",
                info="Free models available on OpenRouter"
            )

            dataset_name = gr.Textbox(
                label="HuggingFace Dataset",
                value="imdb",
                placeholder="e.g., imdb, hotpot_qa, gsm8k",
                info="Any dataset from HuggingFace Hub"
            )

            dataset_split = gr.Textbox(
                label="Dataset Split",
                value="test",
                placeholder="e.g., train, test, validation"
            )

            input_field = gr.Textbox(
                label="Input Field Name",
                value="text",
                placeholder="e.g., text, question, context",
                info="The field containing inputs to process"
            )

            target_field = gr.Textbox(
                label="Target Field Name",
                value="label",
                placeholder="e.g., label, answer, target",
                info="The field containing expected outputs"
            )

            initial_prompt = gr.TextArea(
                label="Initial Prompt",
                value="Analyze the sentiment of the following text and classify it as positive or negative:\n\n{input}\n\nClassification:",
                lines=6,
                info="Use {input} as placeholder for dataset inputs"
            )

            optimize_btn = gr.Button("🚀 Optimize Prompt", variant="primary", size="lg")

    with gr.Row():
        with gr.Column():
            summary = gr.Markdown(label="Summary")

    with gr.Row():
        with gr.Column():
            initial_results = gr.Markdown(label="Initial Results")
        with gr.Column():
            final_results = gr.Markdown(label="Evolved Results")

    gr.Markdown("""
    ### Example Datasets & Fields:

    | Dataset | Split | Input Field | Target Field | Task |
    |---------|-------|-------------|--------------|------|
    | imdb | test | text | label | Sentiment Analysis |
    | hotpot_qa | validation | question | answer | Question Answering |
    | emotion | test | text | label | Emotion Classification |
    | gsm8k | test | question | answer | Math Reasoning |
    | ag_news | test | text | label | News Classification |

    ### Notes:
    - Evolution runs for 10 iterations with 1 island
    - Each evaluation uses 100 random samples from the dataset
    - The process may take 5-15 minutes depending on the dataset and model
    - Make sure your API key has sufficient credits for the requests
    """)

    optimize_btn.click(
        fn=optimize_prompt,
        inputs=[initial_prompt, dataset_name, dataset_split, model, api_key,
                input_field, target_field],
        outputs=[summary, initial_results, final_results]
    )

if __name__ == "__main__":
    demo.launch()