Commit
·
c6a30f0
1
Parent(s):
2c4519d
Delete script.py
Browse files
script.py
DELETED
|
@@ -1,80 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import random
|
| 3 |
-
import re
|
| 4 |
-
from tqdm import tqdm
|
| 5 |
-
from glob import glob
|
| 6 |
-
|
| 7 |
-
# Function to check for special content and return appropriate system content
|
| 8 |
-
# Compiled once at import time so the patterns are not rebuilt/looked up on
# every call (this function runs once per conversation).
# Whole-word match on a fixed keyword list (presumably meant to spot code-like
# replies; note that "for", "if", "while" and "else" also occur in plain English).
_KEYWORD_RE = re.compile(r'\b(?:int|float|char|struct|for|while|if|else)\b')
# A run of non-asterisk characters wrapped in single asterisks, e.g. *smiles*.
_ASTERISK_SPAN_RE = re.compile(r"\*[^*]+\*")


def get_system_content(assistant_content: str) -> str:
    """Pick the system prompt text for a conversation from an assistant reply.

    The checks are applied in order and the first hit wins:

    1. the reply contains one of the keywords in ``_KEYWORD_RE``
       -> ``"you are a genius!"``
    2. the reply contains an ``*...*`` span -> ``"lets tell a story"``
    3. otherwise -> ``"start like "`` followed by the first three
       whitespace-separated words of the reply (fewer if the reply is shorter;
       an empty reply yields ``"start like "``).

    Args:
        assistant_content: Text of the assistant turn to inspect.

    Returns:
        The system prompt string.
    """
    if _KEYWORD_RE.search(assistant_content):
        return "you are a genius!"
    if _ASTERISK_SPAN_RE.search(assistant_content):
        return "lets tell a story"
    # Fall back to echoing the first three words of the assistant's turn.
    first_three_words = ' '.join(assistant_content.split()[:3])
    return f"start like {first_three_words}"
|
| 17 |
-
|
| 18 |
-
# Function to add a System role to the conversation
|
| 19 |
-
def add_system_role(conversation, total_turns):
    """Make the conversation start with a turn whose ``"from"`` is ``"system"``.

    The list is modified in place and also returned.

    * ``total_turns`` even: a new ``{"from": "system", "value": ...}`` turn is
      inserted at index 0. Its value comes from ``get_system_content`` applied
      to ``conversation[1]["value"]`` (the code treats index 1 as the first
      assistant turn).
    * ``total_turns`` odd: the existing first turn is relabelled by setting its
      ``"from"`` to ``"system"``; its ``"value"`` is left untouched.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.
        total_turns: Turn count used to pick the branch (the caller in this
            file passes ``len(conversation)``).

    Returns:
        The same ``conversation`` list, now one turn longer when
        ``total_turns`` was even.

    Raises:
        IndexError: If the turn needed by the selected branch does not exist
            (index 1 for even counts, index 0 for odd counts).
    """
    if total_turns % 2 == 0:  # If even, add a new System turn
        # Only this branch needs the assistant reply. Reading it here rather
        # than up front keeps single-turn (odd) conversations from failing
        # with IndexError on an index they never use.
        assistant_content = conversation[1]["value"]
        system_content = get_system_content(assistant_content)
        # Insert the new System turn at the beginning
        conversation.insert(0, {"from": "system", "value": system_content})
    else:  # If odd, convert the first turn to System
        conversation[0]["from"] = "system"
    return conversation
|
| 29 |
-
|
| 30 |
-
# Function to reformat a single conversation
|
| 31 |
-
def reformat_conversation(conversation):
    """Convert one raw conversation into a list of role-tagged turn dicts.

    ``add_system_role`` is applied first (it modifies ``conversation`` in
    place). Every turn is then mapped to a dict with three keys:

    * ``"content"``  - the turn's ``"value"``
    * ``"do_train"`` - ``True`` or ``False``, drawn at random per turn
    * ``"role"``     - ``"System"`` when the turn's ``"from"`` is ``"system"``;
      otherwise chosen by position: odd index -> ``"User"``,
      even index -> ``"Assistant"``
    """

    def _convert(position, turn):
        # Role is resolved before the dict is built, then the random flag is drawn.
        if turn["from"] == "system":
            role = "System"
        elif position % 2 == 1:
            role = "User"
        else:
            role = "Assistant"
        return {
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": role,
        }

    # Guarantee a leading system turn before assigning roles.
    with_system = add_system_role(conversation, len(conversation))
    return [_convert(position, turn) for position, turn in enumerate(with_system)]
|
| 44 |
-
|
| 45 |
-
# Function to load all .jsonl files, reformat them, and ensure odd number of turns
|
| 46 |
-
def load_and_reformat_conversations():
    """Load every ``*.jsonl`` file in the current directory and reformat it.

    Each non-blank line is parsed as a JSON object whose ``"conversations"``
    entry is passed to ``reformat_conversation``. The results are wrapped as
    ``{"conversation": [...]}``, collected across all files and shuffled.

    NOTE: the glob matches *every* ``.jsonl`` file in the working directory,
    including a ``combined_conversations.jsonl`` left over from an earlier run
    of this script. Such lines have no ``"conversations"`` key and raise
    ``KeyError``; remove or move old output before re-running.

    Returns:
        A shuffled list of ``{"conversation": <list of turn dicts>}`` items.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"conversations"`` key.
    """
    all_conversations = []
    # Iterate over all .jsonl files in the current directory with a progress bar
    for file_name in tqdm(glob("*.jsonl"), desc="Processing files"):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, which can mis-decode non-ASCII text.
        with open(file_name, 'r', encoding='utf-8') as file:
            # Process each line in the current file with a progress bar
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                # Blank lines (e.g. a trailing newline at end of file) are not
                # records; json.loads would raise on them.
                if not line.strip():
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})
    # Shuffle the combined list of all conversations
    random.shuffle(all_conversations)
    return all_conversations
|
| 63 |
-
|
| 64 |
-
# Run the pipeline: load, reformat and shuffle every conversation
reformatted_conversations = load_and_reformat_conversations()

# Sanity check: no reformatted conversation may have an even number of turns
odd_turns_check = not any(
    len(entry["conversation"]) % 2 == 0 for entry in reformatted_conversations
)
if not odd_turns_check:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write one JSON object per line to a new .jsonl file
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo) + '\n')

# Bare expression naming the output file: echoed when evaluated interactively
# (e.g. as the last line of a notebook cell), no effect when run as a script
output_file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|