Alignment-Lab-AI
/

lfs-enable-largefiles

Model card Files and versions

xet

Community

Alignment-Lab-AI committed on Nov 8, 2023

Commit

c6a30f0

1 Parent(s): 2c4519d

Delete script.py

Browse files

Files changed (1) hide show

script.py +0 -80

script.py DELETED Viewed

@@ -1,80 +0,0 @@
-import json
-import random
-import re
-from tqdm import tqdm
-from glob import glob
-# Function to check for special content and return appropriate system content
-def get_system_content(assistant_content):
-    if re.search(r'\b(?:int|float|char|struct|for|while|if|else)\b', assistant_content):
-        return "you are a genius!"
-    elif re.search(r"\*[^*]+\*", assistant_content):
-        return "lets tell a story"
-    else:
-        # Get the first three words from the assistant's turn
-        first_three_words = ' '.join(assistant_content.split()[:3])
-        return f"start like {first_three_words}"
-# Function to add a System role to the conversation
-def add_system_role(conversation, total_turns):
-    # Check for special content in the first assistant turn
-    assistant_content = conversation[1]["value"]
-    if total_turns % 2 == 0:  # If even, add a new System turn
-        system_content = get_system_content(assistant_content)
-        # Insert the new System turn at the beginning
-        conversation.insert(0, {"from": "system", "value": system_content})
-    else:  # If odd, convert the first user turn to System
-        conversation[0]["from"] = "system"
-    return conversation
-# Function to reformat a single conversation
-def reformat_conversation(conversation):
-    reformatted_convo = []
-    # First, handle the System role for the conversation
-    conversation = add_system_role(conversation, len(conversation))
-    # Next, assign roles and randomize do_train
-    for i, turn in enumerate(conversation):
-        role = "System" if turn["from"] == "system" else ("User" if i % 2 == 1 else "Assistant")
-        reformatted_convo.append({
-            "content": turn["value"],
-            "do_train": random.choice([True, False]),
-            "role": role
-        })
-    return reformatted_convo
-# Function to load all .jsonl files, reformat them, and ensure odd number of turns
-def load_and_reformat_conversations():
-    all_conversations = []
-    even_conversations_count = 0  # Counter for conversations with even number of turns
-    # Iterate over all .jsonl files in the current directory with a progress bar
-    for file_name in tqdm(glob("*.jsonl"), desc="Processing files"):
-        with open(file_name, 'r') as file:
-            # Process each line in the current file with a progress bar
-            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
-                # Load the original conversation
-                data = json.loads(line)
-                # Reformat the conversation
-                reformatted_convo = reformat_conversation(data['conversations'])
-                # Add to the list of all conversations
-                all_conversations.append({"conversation": reformatted_convo})
-    # Shuffle the combined list of all conversations
-    random.shuffle(all_conversations)
-    return all_conversations
-# Execute the reformatting function and save the result
-reformatted_conversations = load_and_reformat_conversations()
-# Check that all conversations have an odd number of turns after reformatting
-odd_turns_check = all(len(convo["conversation"]) % 2 != 0 for convo in reformatted_conversations)
-if not odd_turns_check:
-    raise ValueError("Some conversations have an even number of turns after reformatting.")
-# Save to a new .jsonl file
-output_file = 'combined_conversations.jsonl'
-with open(output_file, 'w') as outfile:
-    for convo in reformatted_conversations:
-        json.dump(convo, outfile)
-        outfile.write('\n')
-# Return the name of the output file
-output_file