Upload app.py
Browse files
app.py
CHANGED
|
@@ -734,22 +734,27 @@ Bad patterns to avoid:
|
|
| 734 |
```
|
| 735 |
|
| 736 |
# Task
|
| 737 |
-
Rewrite the prompt
|
| 738 |
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
- For classification: ask directly for the label, don't ask for explanations
|
| 745 |
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
4. Will cause the model to output the label word in its response
|
| 751 |
|
| 752 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
|
| 754 |
```text
|
| 755 |
Your improved prompt here
|
|
@@ -761,11 +766,11 @@ Your improved prompt here
|
|
| 761 |
|
| 762 |
config = {
|
| 763 |
"llm": {
|
| 764 |
-
"primary_model": model
|
| 765 |
"api_base": "https://openrouter.ai/api/v1", # Use OpenRouter endpoint
|
| 766 |
-
"temperature": 1.
|
| 767 |
},
|
| 768 |
-
"max_iterations":
|
| 769 |
"checkpoint_interval": 1, # Save checkpoints every iteration to preserve prompt history
|
| 770 |
"diff_based_evolution": False, # Use full rewrite mode for prompts (not diff/patch mode)
|
| 771 |
"language": "text", # CRITICAL: Optimize text/prompts, not Python code!
|
|
@@ -775,11 +780,11 @@ Your improved prompt here
|
|
| 775 |
"template_dir": templates_dir, # Use our custom prompt engineering templates
|
| 776 |
},
|
| 777 |
"evolution": {
|
| 778 |
-
"population_size":
|
| 779 |
"num_islands": 1, # Single island for simpler evolution
|
| 780 |
-
"elite_ratio": 0.
|
| 781 |
-
"explore_ratio": 0.
|
| 782 |
-
"exploit_ratio": 0.5, #
|
| 783 |
},
|
| 784 |
"database": {
|
| 785 |
"log_prompts": True, # Save prompts used to generate each program
|
|
@@ -979,11 +984,12 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
|
|
| 979 |
|
| 980 |
### Summary
|
| 981 |
- **Dataset**: {dataset_name} ({dataset_split} split)
|
| 982 |
-
- **Model**: {model}
|
|
|
|
| 983 |
- **Initial Eval**: 50 samples
|
| 984 |
- **Final Eval**: 50 samples (same samples for fair comparison)
|
| 985 |
-
- **Evolution**: 50 samples per variant (SAME samples as initial/final
|
| 986 |
-
- **Iterations**: 10
|
| 987 |
|
| 988 |
### Results
|
| 989 |
- **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})
|
|
|
|
| 734 |
```
|
| 735 |
|
| 736 |
# Task
|
| 737 |
+
Rewrite the prompt to MAXIMIZE accuracy on sentiment classification.
|
| 738 |
|
| 739 |
+
CRITICAL REQUIREMENTS (these DIRECTLY affect score):
|
| 740 |
+
1. β MUST include word "sentiment" β model response will contain "sentiment" keyword
|
| 741 |
+
2. β MUST use pattern "[Action] sentiment: {{input}}" β triggers correct response format
|
| 742 |
+
3. β MUST be SHORT (under 35 chars) β prevents verbose/conversational responses
|
| 743 |
+
4. β MUST keep {{input}} placeholder EXACTLY as-is
|
|
|
|
| 744 |
|
| 745 |
+
PROVEN WORKING PATTERNS (use these!):
|
| 746 |
+
- "Classify sentiment: {{input}}" β BEST (scores ~90%)
|
| 747 |
+
- "Determine sentiment: {{input}}" β Also works well (~85%)
|
| 748 |
+
- "Sentiment of: {{input}}" β Good (~80%)
|
|
|
|
| 749 |
|
| 750 |
+
PATTERNS THAT FAIL (avoid!):
|
| 751 |
+
- β "What is the sentiment?" - question format, no {{input}}
|
| 752 |
+
- β "Review: {{input}}" - missing "sentiment" keyword
|
| 753 |
+
- β "Please analyze the sentiment..." - too long, word "please"
|
| 754 |
+
|
| 755 |
+
Generate a SHORT, DIRECT prompt using the working pattern above.
|
| 756 |
+
|
| 757 |
+
Output ONLY the new prompt between ```text markers:
|
| 758 |
|
| 759 |
```text
|
| 760 |
Your improved prompt here
|
|
|
|
| 766 |
|
| 767 |
config = {
|
| 768 |
"llm": {
|
| 769 |
+
"primary_model": "meta-llama/llama-3.1-8b-instruct", # Use STRONGER model for prompt generation
|
| 770 |
"api_base": "https://openrouter.ai/api/v1", # Use OpenRouter endpoint
|
| 771 |
+
"temperature": 1.2, # Even higher temperature for more creative variations
|
| 772 |
},
|
| 773 |
+
"max_iterations": 5, # Fewer iterations (each is expensive)
|
| 774 |
"checkpoint_interval": 1, # Save checkpoints every iteration to preserve prompt history
|
| 775 |
"diff_based_evolution": False, # Use full rewrite mode for prompts (not diff/patch mode)
|
| 776 |
"language": "text", # CRITICAL: Optimize text/prompts, not Python code!
|
|
|
|
| 780 |
"template_dir": templates_dir, # Use our custom prompt engineering templates
|
| 781 |
},
|
| 782 |
"evolution": {
|
| 783 |
+
"population_size": 15, # Larger population = more variants per generation
|
| 784 |
"num_islands": 1, # Single island for simpler evolution
|
| 785 |
+
"elite_ratio": 0.4, # Keep top 40% (6 best prompts)
|
| 786 |
+
"explore_ratio": 0.1, # Minimal random exploration (only 1-2 prompts)
|
| 787 |
+
"exploit_ratio": 0.5, # 50% exploitation of best prompts
|
| 788 |
},
|
| 789 |
"database": {
|
| 790 |
"log_prompts": True, # Save prompts used to generate each program
|
|
|
|
| 984 |
|
| 985 |
### Summary
|
| 986 |
- **Dataset**: {dataset_name} ({dataset_split} split)
|
| 987 |
+
- **Evaluation Model**: {model}
|
| 988 |
+
- **Evolution Model**: meta-llama/llama-3.1-8b-instruct (larger model for better prompt generation)
|
| 989 |
- **Initial Eval**: 50 samples
|
| 990 |
- **Final Eval**: 50 samples (same samples for fair comparison)
|
| 991 |
+
- **Evolution**: 50 samples per variant (SAME samples as initial/final!)
|
| 992 |
+
- **Iterations**: 5 (population: 15, elite: 40%, explore: 10%, exploit: 50%)
|
| 993 |
|
| 994 |
### Results
|
| 995 |
- **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})
|