complete gpt-4o-mini training
- datasets/mac/openai-training.jsonl +0 -0
- llm_toolkit/eval_openai.py +16 -10
- llm_toolkit/translation_utils.py +85 -7
- logs/{l40-1gpu-rpp.txt → l40-1gpu-rpp-1.txt} +0 -0
- logs/l40-4gpu-1.txt +0 -0
- logs/l40-4gpu.txt +0 -3
- logs/openai-gpt-4o-mini-fine-tuned.txt +151 -0
- logs/openai-training-sample.jsonl +3 -0
- notebooks/00b_Data Analysis_Few_Shots.ipynb +2 -2
- notebooks/00c_Data Analysis_Fine_Tuned.ipynb +0 -0
- notebooks/00d_Data Analysis_Fine_Tuned_RPP.ipynb +0 -0
- notebooks/02_Fine_Tune_OpenAI.ipynb +0 -0
- requirements.txt +1 -0
- results/mac-results_few_shots_metrics.csv +2 -2
- results/mac-results_few_shots_openai.csv +2 -2
- results/mac-results_fine_tuned_metrics.csv +2 -2
- scripts/eval-4gpu.sh +2 -4
datasets/mac/openai-training.jsonl
ADDED
The diff for this file is too large to render. See raw diff.
llm_toolkit/eval_openai.py
CHANGED

@@ -29,7 +29,7 @@ print(
 )
 
 
-def on_num_shots_step_completed(model_name, dataset, predictions):
+def on_num_shots_step_completed(model_name, dataset, predictions, results_path):
     save_results(
         model_name,
         results_path,
@@ -44,8 +44,10 @@ def on_num_shots_step_completed(model_name, dataset, predictions):
 def evaluate_model_with_num_shots(
     model_name,
     data_path,
+    results_path=None,
     range_num_shots=[0, 1, 3, 5, 10, 50],
     max_new_tokens=2048,
+    result_column_name=None,
 ):
     print(f"Evaluating model: {model_name}")
 
@@ -56,20 +58,24 @@ def evaluate_model_with_num_shots(
         print(f"*** Evaluating with num_shots: {num_shots}")
 
         predictions = eval_openai(num_shots, datasets, max_new_tokens=max_new_tokens)
-        model_name_with_shorts =
+        model_name_with_shorts = (
+            result_column_name
+            if result_column_name
+            else f"{model_name}/shots-{num_shots:02d}"
+        )
 
         try:
             on_num_shots_step_completed(
-                model_name_with_shorts,
-                datasets["test"],
-                predictions,
+                model_name_with_shorts, datasets["test"], predictions, results_path
             )
         except Exception as e:
             print(e)
 
 
-
-
-
-
-
+if __name__ == "__main__":
+    evaluate_model_with_num_shots(
+        model_name,
+        data_path,
+        results_path=results_path,
+        max_new_tokens=max_new_tokens,
+    )
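
Note: for context, a minimal driver sketch showing how the refactored entry point might be invoked. The import path llm_toolkit.eval_openai and the environment-variable names below are assumptions for illustration, not part of this commit.

# Hypothetical driver mirroring the new evaluate_model_with_num_shots signature.
# MODEL_NAME / DATA_PATH / RESULTS_PATH are assumed names, not taken from the repo.
import os

from llm_toolkit.eval_openai import evaluate_model_with_num_shots

model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
data_path = os.getenv("DATA_PATH", "datasets/mac/mac.tsv")
results_path = os.getenv("RESULTS_PATH", "results/mac-results_few_shots_openai.csv")

evaluate_model_with_num_shots(
    model_name,
    data_path,
    results_path=results_path,
    max_new_tokens=2048,
    # result_column_name overrides the default "<model>/shots-NN" column label,
    # e.g. to store fine-tuned runs under "gpt-4o-mini/epochs-01".
    result_column_name=None,
)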
llm_toolkit/translation_utils.py
CHANGED

@@ -18,6 +18,7 @@ bleu = evaluate.load("bleu")
 rouge = evaluate.load("rouge")
 meteor = evaluate.load("meteor")
 accuracy = evaluate.load("accuracy")
+sacrebleu = evaluate.load("sacrebleu")
 
 
 def extract_answer(text, debug=False):
@@ -54,6 +55,10 @@ def calc_metrics(references, predictions, debug=False):
         "meteor"
     ]
 
+    results["sacrebleu"] = sacrebleu.compute(
+        predictions=predictions, references=references
+    )
+
     results["bleu_scores"] = bleu.compute(
         predictions=predictions, references=references, max_order=4
     )
@@ -108,7 +113,7 @@ def get_few_shot_prompt(dataset, num_shots=5):
     return translation_prompt
 
 
-def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
+def load_translation_dataset(data_path, tokenizer=None, num_shots=0, for_openai=False):
     train_data_file = data_path.replace(".tsv", "-train.tsv")
     test_data_file = data_path.replace(".tsv", "-test.tsv")
 
@@ -138,7 +143,7 @@ def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
         delimiter="\t",
     )
 
-    if tokenizer:
+    if tokenizer or for_openai:
         translation_prompt = get_few_shot_prompt(datasets["train"], num_shots)
 
         def formatting_prompts_func(examples):
@@ -164,11 +169,23 @@ def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
             prompt = translation_prompt.format(input=input)
             messages[-1] = {"role": "user", "content": prompt}
 
-            prompt = tokenizer.apply_chat_template(
-                messages, tokenize=False, add_generation_prompt=True
-            )
-            prompts.append(prompt)
-            texts.append(prompt + output + tokenizer.eos_token)
+            if for_openai:
+                prompts.append(messages.copy())
+                text = messages.copy()
+                text.append(
+                    {
+                        "role": "assistant",
+                        "content": output,
+                    }
+                )
+                texts.append(text)
+            else:
+                prompt = tokenizer.apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True
+                )
+                prompts.append(prompt)
+                texts.append(prompt + output + tokenizer.eos_token)
+
         return {"text": texts, "prompt": prompts}
 
     datasets = datasets.map(
@@ -216,6 +233,11 @@ def detect_repetition_scores(row, col, debug=False):
     )
 
 
+def contains_chinese(text):
+    chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]")
+    return 1 if chinese_char_pattern.search(text) else 0
+
+
 def get_metrics(df, max_output_tokens=2048, variant="rpp"):
     metrics_df = pd.DataFrame(df.columns.T)[2:]
     metrics_df.rename(columns={0: "model"}, inplace=True)
@@ -235,12 +257,14 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
     tokenizers = {model: load_tokenizer(model) for model in models}
 
     meteor = []
+    spbleu = []
     bleu_1 = []
     rouge_l = []
     ews_score = []
     repetition_score = []
     total_repetitions = []
     num_max_output_tokens = []
+    num_incomplete_translations = []
     columns = df.columns[2:]
 
     df[
@@ -256,6 +280,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
         print(f"{col}: {metrics}")
 
         meteor.append(metrics["meteor"])
+        spbleu.append(metrics["sacrebleu"]["score"])
         bleu_1.append(metrics["bleu_scores"]["bleu"])
         rouge_l.append(metrics["rouge_scores"]["rougeL"])
 
@@ -273,6 +298,10 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
             lambda x: len(tokenizers[model](x)["input_ids"])
         )
 
+        new_col = f"contains_chinese-{col}"
+        df[new_col] = df[col].apply(contains_chinese)
+        num_incomplete_translations.append(df[new_col].sum())
+
         new_col = f"output_tokens-{col}"
         df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
 
@@ -281,6 +310,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
     )
 
     metrics_df["meteor"] = meteor
+    metrics_df["spbleu"] = spbleu
     metrics_df["bleu_1"] = bleu_1
     metrics_df["rouge_l"] = rouge_l
     metrics_df["ews_score"] = ews_score
@@ -290,6 +320,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
         lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
     )
 
+    metrics_df["num_incomplete_translations"] = num_incomplete_translations
     metrics_df["num_max_output_tokens"] = num_max_output_tokens
 
     if variant != "rpp":
@@ -328,6 +359,12 @@ def analyze_translation_results(df, col, max_new_tokens=300, repetition_threshold
     )
     print_row_details(df2, range(len(df2)))
 
+    contains_chinese = f"contains_chinese-{col}"
+    df3 = df[df[contains_chinese] > 0][["chinese", "english", col, contains_chinese]]
+
+    print(f"\n*** Found {len(df3)} rows with incomplete translations for {col}")
+    print_row_details(df3, range(len(df3)))
+
 
 def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
     plt.figure(figsize=figsize)
@@ -604,3 +641,44 @@ def load_alpaca_data(data_path):
     df_alpaca.to_json(alpaca_data_path, orient="records", lines=False, indent=2)
 
     return df_alpaca
+
+
+def load_openai_training_data(
+    data_path, openai_data_path="datasets/mac/openai-training.jsonl"
+):
+    if os.path.exists(openai_data_path):
+        print("loading existing data from:", openai_data_path)
+        data = pd.read_json(openai_data_path, orient="records", lines=True)
+        return data
+
+    datasets = load_translation_dataset(data_path)
+    prompt_template = get_few_shot_prompt(datasets["train"], num_shots=0)
+
+    df_train = datasets["train"].to_pandas()
+    messages = []
+
+    for i, row in df_train.iterrows():
+        messages.append(
+            [
+                {
+                    "role": "system",
+                    "content": system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": prompt_template.format(input=row["chinese"]),
+                },
+                {
+                    "role": "assistant",
+                    "content": row["english"],
+                },
+            ]
+        )
+
+    df_openai = pd.DataFrame(
+        {
+            "messages": messages,
+        }
+    )
+    df_openai.to_json(openai_data_path, orient="records", lines=True)
+    return df_openai
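
Note: a small self-contained sketch (toy data, assumed for illustration) of the two new signals added above — the sacreBLEU score that feeds the spbleu column and the contains_chinese flag used to count incomplete translations.

# Toy demo of the new metric signals; the example sentences are made up.
import re

import evaluate  # Hugging Face `evaluate`, as used in translation_utils.py

sacrebleu = evaluate.load("sacrebleu")


def contains_chinese(text):
    # Same idea as the helper added above: flag outputs that still contain
    # CJK characters, i.e. translations left partially untranslated.
    return 1 if re.search(r"[\u4e00-\u9fff]", text) else 0


predictions = ["Old Geng picked up his shotgun and 扣动了扳机."]
references = [["Old Geng picked up his shotgun, squinted, and pulled the trigger."]]

result = sacrebleu.compute(predictions=predictions, references=references)
print(result["score"])                   # corpus-level sacreBLEU, stored as "spbleu"
print(contains_chinese(predictions[0]))  # 1 -> counted as an incomplete translation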
logs/{l40-1gpu-rpp.txt → l40-1gpu-rpp-1.txt}
RENAMED
File without changes.
logs/l40-4gpu-1.txt
ADDED
The diff for this file is too large to render. See raw diff.
logs/l40-4gpu.txt
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:289a8bbbf208650bc4a0cc3b86578f8a7db73ef68bbefa3c55c3eedf94a38ed0
-size 878270
logs/openai-gpt-4o-mini-fine-tuned.txt
ADDED

@@ -0,0 +1,151 @@
+Qwen/Qwen2-7B-Instruct None False datasets/mac/mac.tsv results/mac-results.csv False 300
+loading env vars from: /Users/inflaton/code/engd/papers/rapget-translation/.env
+workding dir: /Users/inflaton/code/engd/papers/rapget-translation
+Python 3.11.9
+Name: torch
+Version: 2.4.0
+Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
+Home-page: https://pytorch.org/
+Author: PyTorch Team
+Author-email: [email protected]
+License: BSD-3
+Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages
+Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions
+Required-by: accelerate, peft, torchaudio, torchvision, trl
+---
+Name: transformers
+Version: 4.43.3
+Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
+Home-page: https://github.com/huggingface/transformers
+Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
+Author-email: [email protected]
+License: Apache 2.0 License
+Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages
+Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
+Required-by: llamafactory, peft, trl
+CPU times: user 8.97 ms, sys: 13.7 ms, total: 22.7 ms
+Wall time: 1.91 s
+MPS is available
+loading existing data from: logs/openai-training-sample.jsonl
+messages
+0 [{'role': 'system', 'content': 'Marv is a fact...
+1 [{'role': 'system', 'content': 'Marv is a fact...
+2 [{'role': 'system', 'content': 'Marv is a fact...
+FileObject(id='file-IokPHn4YWcniXL4wGnK4xVmn', bytes=3413094, created_at=1723269681, filename='openai-training.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
+FineTuningJob(id='ftjob-TcCo4KtDd3Gp5cnOVky2Rxhh', created_at=1723270136, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=6, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-RXHVnD8cqPvqTPdXgZ5rQdl3', result_files=[], seed=1046194933, status='validating_files', trained_tokens=None, training_file='file-IokPHn4YWcniXL4wGnK4xVmn', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
+FineTuningJob(id='ftjob-TcCo4KtDd3Gp5cnOVky2Rxhh', created_at=1723270136, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:mastercard::9uaCEFTs', finished_at=1723272532, hyperparameters=Hyperparameters(n_epochs=6, batch_size=18, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-RXHVnD8cqPvqTPdXgZ5rQdl3', result_files=['file-aCppW0GWhhytwe4yKwymNUZl'], seed=1046194933, status='succeeded', trained_tokens=3640956, training_file='file-IokPHn4YWcniXL4wGnK4xVmn', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ufuULvy
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [16:48<00:00, 1.12it/s]
+gpt-4o-mini/epochs-01 metrics: {'meteor': 0.3785370331806402, 'sacrebleu': {'score': 12.052844230027103, 'counts': [12818, 4623, 2153, 1081], 'totals': [29097, 27964, 26850, 25740], 'precisions': [44.05265147609719, 16.53196967529681, 8.018621973929237, 4.1996891996892], 'bp': 0.9631327655852462, 'sys_len': 29097, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.12052844230027103, 'precisions': [0.44052651476097193, 0.1653196967529681, 0.08018621973929237, 0.041996891996891994], 'brevity_penalty': 0.9631327655852462, 'length_ratio': 0.9637959589267969, 'translation_length': 29097, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4244007719128182, 'rouge2': 0.17601540674784633, 'rougeL': 0.3693615986543504, 'rougeLsum': 0.3696442718692141}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ug0Gt3w
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [17:56<00:00, 1.05it/s]
+gpt-4o-mini/epochs-02 metrics: {'meteor': 0.3785921332515917, 'sacrebleu': {'score': 12.033706874864837, 'counts': [12801, 4628, 2150, 1076], 'totals': [29076, 27943, 26830, 25722], 'precisions': [44.02600082542303, 16.562287513867517, 8.013417815877748, 4.183189487598165], 'bp': 0.9624112877781842, 'sys_len': 29076, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.12033706874864836, 'precisions': [0.4402600082542303, 0.16562287513867516, 0.08013417815877749, 0.04183189487598165], 'brevity_penalty': 0.9624112877781842, 'length_ratio': 0.9631003643590593, 'translation_length': 29076, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4235104923203792, 'rouge2': 0.1758318317686482, 'rougeL': 0.36922125683186846, 'rougeLsum': 0.3693808162149962}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ug5PhpZ
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [17:02<00:00, 1.11it/s]
+gpt-4o-mini/epochs-03 metrics: {'meteor': 0.37736228106121694, 'sacrebleu': {'score': 11.933111335430906, 'counts': [12779, 4601, 2124, 1061], 'totals': [29096, 27963, 26848, 25737], 'precisions': [43.920126477866376, 16.453885491542394, 7.911203814064362, 4.122469596301046], 'bp': 0.9630984208616785, 'sys_len': 29096, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11933111335430906, 'precisions': [0.4392012647786637, 0.16453885491542394, 0.07911203814064362, 0.041224695963010455], 'brevity_penalty': 0.9630984208616785, 'length_ratio': 0.9637628353759523, 'translation_length': 29096, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4235319934194407, 'rouge2': 0.17493309683581332, 'rougeL': 0.3685697120399035, 'rougeLsum': 0.3689298428303013}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ugPThQI
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [18:35<00:00, 1.02it/s]
+gpt-4o-mini/epochs-04 metrics: {'meteor': 0.37818535038887346, 'sacrebleu': {'score': 11.933285526593995, 'counts': [12797, 4601, 2121, 1061], 'totals': [29110, 27977, 26861, 25749], 'precisions': [43.960838199931295, 16.445651785395146, 7.896206395889952, 4.120548370810517], 'bp': 0.9635791436286372, 'sys_len': 29110, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11933285526593994, 'precisions': [0.43960838199931296, 0.16445651785395146, 0.07896206395889951, 0.041205483708105166], 'brevity_penalty': 0.9635791436286371, 'length_ratio': 0.9642265650877774, 'translation_length': 29110, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.42372801674771476, 'rouge2': 0.17487358435014705, 'rougeL': 0.36931437347367646, 'rougeLsum': 0.36934766241132383}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ugVLmcB
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [15:47<00:00, 1.20it/s]
+gpt-4o-mini/epochs-05 metrics: {'meteor': 0.3790673551140706, 'sacrebleu': {'score': 11.955698498650582, 'counts': [12808, 4609, 2126, 1064], 'totals': [29209, 28076, 26959, 25846], 'precisions': [43.849498442260945, 16.416156147599374, 7.88604918580066, 4.116691170780778], 'bp': 0.9669721941455759, 'sys_len': 29209, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11955698498650584, 'precisions': [0.4384949844226095, 0.16416156147599373, 0.0788604918580066, 0.041166911707807785], 'brevity_penalty': 0.9669721941455759, 'length_ratio': 0.9675057966213978, 'translation_length': 29209, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.42476082012412075, 'rouge2': 0.17559955520032905, 'rougeL': 0.3700113513462385, 'rougeLsum': 0.37012014201963733}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9uaCEFTs
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [15:43<00:00, 1.20it/s]
+gpt-4o-mini/epochs-06 metrics: {'meteor': 0.3792226866395673, 'sacrebleu': {'score': 11.982811850915233, 'counts': [12810, 4617, 2137, 1066], 'totals': [29116, 27983, 26868, 25757], 'precisions': [43.996428080780326, 16.499303148340065, 7.95369956825964, 4.138680746981403], 'bp': 0.9637850995333245, 'sys_len': 29116, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11982811850915229, 'precisions': [0.43996428080780325, 0.16499303148340064, 0.0795369956825964, 0.04138680746981403], 'brevity_penalty': 0.9637850995333245, 'length_ratio': 0.9644253063928453, 'translation_length': 29116, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4251187202203103, 'rouge2': 0.17553224521896635, 'rougeL': 0.37003282393672954, 'rougeLsum': 0.370114181474168}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
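
Note: the FileObject and FineTuningJob entries above come from the OpenAI fine-tuning API (openai>=1.x Python SDK). Below is a minimal sketch of the calls that would produce such output; the training-file path and n_epochs=6 match the log, but the snippet is illustrative rather than the notebook's exact code.

# Sketch of the OpenAI fine-tuning flow reflected in the log above (assumed, not verbatim).
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Upload the chat-formatted JSONL produced by load_openai_training_data()
training_file = client.files.create(
    file=open("datasets/mac/openai-training.jsonl", "rb"),
    purpose="fine-tune",
)

# Launch a gpt-4o-mini fine-tuning job for 6 epochs
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={"n_epochs": 6},
)

# Re-fetch the job until status is "succeeded" and fine_tuned_model is populated
# (e.g. "ft:gpt-4o-mini-2024-07-18:..."), which is then passed to the evaluator.
job = client.fine_tuning.jobs.retrieve(job.id)
print(job.status, job.fine_tuned_model)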
logs/openai-training-sample.jsonl
ADDED

@@ -0,0 +1,3 @@
+{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
+{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
+{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
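
Note: each line above is one training example in OpenAI's chat fine-tuning format (a messages array ending with the assistant turn). A quick sanity check along these lines (an assumed helper, not part of the repository) can catch malformed records before upload.

# Assumed sanity check for a chat-format fine-tuning JSONL file (not in the repo).
import json

with open("logs/openai-training-sample.jsonl") as f:
    for line_no, line in enumerate(f, start=1):
        example = json.loads(line)
        roles = [m["role"] for m in example["messages"]]
        # Each example should end with the assistant turn the model is trained to produce.
        assert roles[-1] == "assistant", f"line {line_no}: last role is {roles[-1]}"
        assert roles[0] in ("system", "user"), f"line {line_no}: unexpected first role"
print("all examples look like valid chat-format training records")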
notebooks/00b_Data Analysis_Few_Shots.ipynb
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4d7af735cc4e9c4395e8bba07f2d212edc159f0f8f8cf83281f25f1ea8c2d717
+size 3332561
notebooks/00c_Data Analysis_Fine_Tuned.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.

notebooks/00d_Data Analysis_Fine_Tuned_RPP.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.

notebooks/02_Fine_Tune_OpenAI.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
CHANGED

@@ -18,3 +18,4 @@ sentencepiece==0.2.0
 einops==0.8.0
 accelerate==0.32.0
 peft==0.11.1
+sacrebleu==2.4.2
results/mac-results_few_shots_metrics.csv
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:945e48d5773ce3a870e793e410c79148bd34c1b427c7bcd8e9e5ec140e574fa7
+size 9379

results/mac-results_few_shots_openai.csv
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:eea324569b30d1696a51853cbfc5f7b992a569f464cae0db7a88a38c8024578a
+size 2782816

results/mac-results_fine_tuned_metrics.csv
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bbf8e7661be99195444d1c6985179880efdc065fc1856c5fe5a78de14906c064
+size 8321
scripts/eval-4gpu.sh
CHANGED

@@ -16,11 +16,9 @@ grep MemTotal /proc/meminfo
 #pip install -r requirements.txt
 
 export BATCH_SIZE=1
-# export START_REPETITION_PENALTY=1.06
-export START_NUM_SHOTS=50
 
 #./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
 
-./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
+# ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
 
-
+./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-280