complete gpt-4o-mini training
- datasets/mac/openai-training.jsonl +0 -0
- llm_toolkit/eval_openai.py +16 -10
- llm_toolkit/translation_utils.py +85 -7
- logs/{l40-1gpu-rpp.txt → l40-1gpu-rpp-1.txt} +0 -0
- logs/l40-4gpu-1.txt +0 -0
- logs/l40-4gpu.txt +0 -3
- logs/openai-gpt-4o-mini-fine-tuned.txt +151 -0
- logs/openai-training-sample.jsonl +3 -0
- notebooks/00b_Data Analysis_Few_Shots.ipynb +2 -2
- notebooks/00c_Data Analysis_Fine_Tuned.ipynb +0 -0
- notebooks/00d_Data Analysis_Fine_Tuned_RPP.ipynb +0 -0
- notebooks/02_Fine_Tune_OpenAI.ipynb +0 -0
- requirements.txt +1 -0
- results/mac-results_few_shots_metrics.csv +2 -2
- results/mac-results_few_shots_openai.csv +2 -2
- results/mac-results_fine_tuned_metrics.csv +2 -2
- scripts/eval-4gpu.sh +2 -4
datasets/mac/openai-training.jsonl
ADDED
The diff for this file is too large to render. See raw diff.
llm_toolkit/eval_openai.py
CHANGED

@@ -29,7 +29,7 @@ print(
 )
 
 
-def on_num_shots_step_completed(model_name, dataset, predictions):
+def on_num_shots_step_completed(model_name, dataset, predictions, results_path):
     save_results(
         model_name,
         results_path,
@@ -44,8 +44,10 @@ def on_num_shots_step_completed(model_name, dataset, predictions):
 def evaluate_model_with_num_shots(
     model_name,
     data_path,
+    results_path=None,
     range_num_shots=[0, 1, 3, 5, 10, 50],
     max_new_tokens=2048,
+    result_column_name=None,
 ):
     print(f"Evaluating model: {model_name}")
 
@@ -56,20 +58,24 @@ def evaluate_model_with_num_shots(
         print(f"*** Evaluating with num_shots: {num_shots}")
 
         predictions = eval_openai(num_shots, datasets, max_new_tokens=max_new_tokens)
-        model_name_with_shorts =
+        model_name_with_shorts = (
+            result_column_name
+            if result_column_name
+            else f"{model_name}/shots-{num_shots:02d}"
+        )
 
         try:
             on_num_shots_step_completed(
-                model_name_with_shorts,
-                datasets["test"],
-                predictions,
+                model_name_with_shorts, datasets["test"], predictions, results_path
             )
         except Exception as e:
             print(e)
 
 
-
-
-
-
-
+if __name__ == "__main__":
+    evaluate_model_with_num_shots(
+        model_name,
+        data_path,
+        results_path=results_path,
+        max_new_tokens=max_new_tokens,
+    )
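
Note: for context, a minimal driver sketch showing how the refactored entry point might be invoked. The import path llm_toolkit.eval_openai and the environment-variable names below are assumptions for illustration, not part of this commit.

# Hypothetical driver mirroring the new evaluate_model_with_num_shots signature.
# MODEL_NAME / DATA_PATH / RESULTS_PATH are assumed names, not taken from the repo.
import os

from llm_toolkit.eval_openai import evaluate_model_with_num_shots

model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
data_path = os.getenv("DATA_PATH", "datasets/mac/mac.tsv")
results_path = os.getenv("RESULTS_PATH", "results/mac-results_few_shots_openai.csv")

evaluate_model_with_num_shots(
    model_name,
    data_path,
    results_path=results_path,
    max_new_tokens=2048,
    # result_column_name overrides the default "<model>/shots-NN" column label,
    # e.g. to store fine-tuned runs under "gpt-4o-mini/epochs-01".
    result_column_name=None,
)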
llm_toolkit/translation_utils.py
CHANGED

@@ -18,6 +18,7 @@ bleu = evaluate.load("bleu")
 rouge = evaluate.load("rouge")
 meteor = evaluate.load("meteor")
 accuracy = evaluate.load("accuracy")
+sacrebleu = evaluate.load("sacrebleu")
 
 
 def extract_answer(text, debug=False):
@@ -54,6 +55,10 @@ def calc_metrics(references, predictions, debug=False):
         "meteor"
     ]
 
+    results["sacrebleu"] = sacrebleu.compute(
+        predictions=predictions, references=references
+    )
+
     results["bleu_scores"] = bleu.compute(
         predictions=predictions, references=references, max_order=4
     )
@@ -108,7 +113,7 @@ def get_few_shot_prompt(dataset, num_shots=5):
     return translation_prompt
 
 
-def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
+def load_translation_dataset(data_path, tokenizer=None, num_shots=0, for_openai=False):
     train_data_file = data_path.replace(".tsv", "-train.tsv")
     test_data_file = data_path.replace(".tsv", "-test.tsv")
 
@@ -138,7 +143,7 @@ def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
         delimiter="\t",
     )
 
-    if tokenizer:
+    if tokenizer or for_openai:
         translation_prompt = get_few_shot_prompt(datasets["train"], num_shots)
 
         def formatting_prompts_func(examples):
@@ -164,11 +169,23 @@ def load_translation_dataset(data_path, tokenizer=None, num_shots=0):
             prompt = translation_prompt.format(input=input)
             messages[-1] = {"role": "user", "content": prompt}
 
-            prompt = tokenizer.apply_chat_template(
-                messages, tokenize=False, add_generation_prompt=True
-            )
-            prompts.append(prompt)
-            texts.append(prompt + output + tokenizer.eos_token)
+            if for_openai:
+                prompts.append(messages.copy())
+                text = messages.copy()
+                text.append(
+                    {
+                        "role": "assistant",
+                        "content": output,
+                    }
+                )
+                texts.append(text)
+            else:
+                prompt = tokenizer.apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True
+                )
+                prompts.append(prompt)
+                texts.append(prompt + output + tokenizer.eos_token)
+
         return {"text": texts, "prompt": prompts}
 
     datasets = datasets.map(
@@ -216,6 +233,11 @@ def detect_repetition_scores(row, col, debug=False):
     )
 
 
+def contains_chinese(text):
+    chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]")
+    return 1 if chinese_char_pattern.search(text) else 0
+
+
 def get_metrics(df, max_output_tokens=2048, variant="rpp"):
     metrics_df = pd.DataFrame(df.columns.T)[2:]
     metrics_df.rename(columns={0: "model"}, inplace=True)
@@ -235,12 +257,14 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
     tokenizers = {model: load_tokenizer(model) for model in models}
 
     meteor = []
+    spbleu = []
     bleu_1 = []
     rouge_l = []
     ews_score = []
     repetition_score = []
     total_repetitions = []
     num_max_output_tokens = []
+    num_incomplete_translations = []
     columns = df.columns[2:]
 
     df[
@@ -256,6 +280,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
         print(f"{col}: {metrics}")
 
         meteor.append(metrics["meteor"])
+        spbleu.append(metrics["sacrebleu"]["score"])
         bleu_1.append(metrics["bleu_scores"]["bleu"])
         rouge_l.append(metrics["rouge_scores"]["rougeL"])
 
@@ -273,6 +298,10 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
             lambda x: len(tokenizers[model](x)["input_ids"])
         )
 
+        new_col = f"contains_chinese-{col}"
+        df[new_col] = df[col].apply(contains_chinese)
+        num_incomplete_translations.append(df[new_col].sum())
+
         new_col = f"output_tokens-{col}"
         df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
 
@@ -281,6 +310,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
     )
 
     metrics_df["meteor"] = meteor
+    metrics_df["spbleu"] = spbleu
     metrics_df["bleu_1"] = bleu_1
     metrics_df["rouge_l"] = rouge_l
     metrics_df["ews_score"] = ews_score
@@ -290,6 +320,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
         lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
     )
 
+    metrics_df["num_incomplete_translations"] = num_incomplete_translations
     metrics_df["num_max_output_tokens"] = num_max_output_tokens
 
     if variant != "rpp":
@@ -328,6 +359,12 @@ def analyze_translation_results(df, col, max_new_tokens=300, repetition_threshold
     )
     print_row_details(df2, range(len(df2)))
 
+    contains_chinese = f"contains_chinese-{col}"
+    df3 = df[df[contains_chinese] > 0][["chinese", "english", col, contains_chinese]]
+
+    print(f"\n*** Found {len(df3)} rows with incomplete translations for {col}")
+    print_row_details(df3, range(len(df3)))
+
 
 def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
     plt.figure(figsize=figsize)
@@ -604,3 +641,44 @@ def load_alpaca_data(data_path):
     df_alpaca.to_json(alpaca_data_path, orient="records", lines=False, indent=2)
 
     return df_alpaca
+
+
+def load_openai_training_data(
+    data_path, openai_data_path="datasets/mac/openai-training.jsonl"
+):
+    if os.path.exists(openai_data_path):
+        print("loading existing data from:", openai_data_path)
+        data = pd.read_json(openai_data_path, orient="records", lines=True)
+        return data
+
+    datasets = load_translation_dataset(data_path)
+    prompt_template = get_few_shot_prompt(datasets["train"], num_shots=0)
+
+    df_train = datasets["train"].to_pandas()
+    messages = []
+
+    for i, row in df_train.iterrows():
+        messages.append(
+            [
+                {
+                    "role": "system",
+                    "content": system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": prompt_template.format(input=row["chinese"]),
+                },
+                {
+                    "role": "assistant",
+                    "content": row["english"],
+                },
+            ]
+        )
+
+    df_openai = pd.DataFrame(
+        {
+            "messages": messages,
+        }
+    )
+    df_openai.to_json(openai_data_path, orient="records", lines=True)
+    return df_openai
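
Note: a small self-contained sketch (toy data, assumed for illustration) of the two new signals added above — the sacreBLEU score that feeds the spbleu column and the contains_chinese flag used to count incomplete translations.

# Toy demo of the new metric signals; the example sentences are made up.
import re

import evaluate  # Hugging Face `evaluate`, as used in translation_utils.py

sacrebleu = evaluate.load("sacrebleu")


def contains_chinese(text):
    # Same idea as the helper added above: flag outputs that still contain
    # CJK characters, i.e. translations left partially untranslated.
    return 1 if re.search(r"[\u4e00-\u9fff]", text) else 0


predictions = ["Old Geng picked up his shotgun and 扣动了扳机."]
references = [["Old Geng picked up his shotgun, squinted, and pulled the trigger."]]

result = sacrebleu.compute(predictions=predictions, references=references)
print(result["score"])                   # corpus-level sacreBLEU, stored as "spbleu"
print(contains_chinese(predictions[0]))  # 1 -> counted as an incomplete translation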
logs/{l40-1gpu-rpp.txt → l40-1gpu-rpp-1.txt}
RENAMED
File without changes.
logs/l40-4gpu-1.txt
ADDED
The diff for this file is too large to render. See raw diff.
logs/l40-4gpu.txt
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:289a8bbbf208650bc4a0cc3b86578f8a7db73ef68bbefa3c55c3eedf94a38ed0
-size 878270
logs/openai-gpt-4o-mini-fine-tuned.txt
ADDED

@@ -0,0 +1,151 @@
+Qwen/Qwen2-7B-Instruct None False datasets/mac/mac.tsv results/mac-results.csv False 300
+loading env vars from: /Users/inflaton/code/engd/papers/rapget-translation/.env
+workding dir: /Users/inflaton/code/engd/papers/rapget-translation
+Python 3.11.9
+Name: torch
+Version: 2.4.0
+Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
+Home-page: https://pytorch.org/
+Author: PyTorch Team
+Author-email: [email protected]
+License: BSD-3
+Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages
+Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions
+Required-by: accelerate, peft, torchaudio, torchvision, trl
+---
+Name: transformers
+Version: 4.43.3
+Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
+Home-page: https://github.com/huggingface/transformers
+Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
+Author-email: [email protected]
+License: Apache 2.0 License
+Location: /Users/inflaton/anaconda3/envs/rapget/lib/python3.11/site-packages
+Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
+Required-by: llamafactory, peft, trl
+CPU times: user 8.97 ms, sys: 13.7 ms, total: 22.7 ms
+Wall time: 1.91 s
+MPS is available
+loading existing data from: logs/openai-training-sample.jsonl
+messages
+0 [{'role': 'system', 'content': 'Marv is a fact...
+1 [{'role': 'system', 'content': 'Marv is a fact...
+2 [{'role': 'system', 'content': 'Marv is a fact...
+FileObject(id='file-IokPHn4YWcniXL4wGnK4xVmn', bytes=3413094, created_at=1723269681, filename='openai-training.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
+FineTuningJob(id='ftjob-TcCo4KtDd3Gp5cnOVky2Rxhh', created_at=1723270136, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=6, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-RXHVnD8cqPvqTPdXgZ5rQdl3', result_files=[], seed=1046194933, status='validating_files', trained_tokens=None, training_file='file-IokPHn4YWcniXL4wGnK4xVmn', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
+FineTuningJob(id='ftjob-TcCo4KtDd3Gp5cnOVky2Rxhh', created_at=1723270136, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:mastercard::9uaCEFTs', finished_at=1723272532, hyperparameters=Hyperparameters(n_epochs=6, batch_size=18, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-RXHVnD8cqPvqTPdXgZ5rQdl3', result_files=['file-aCppW0GWhhytwe4yKwymNUZl'], seed=1046194933, status='succeeded', trained_tokens=3640956, training_file='file-IokPHn4YWcniXL4wGnK4xVmn', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ufuULvy
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [16:48<00:00, 1.12it/s]
+gpt-4o-mini/epochs-01 metrics: {'meteor': 0.3785370331806402, 'sacrebleu': {'score': 12.052844230027103, 'counts': [12818, 4623, 2153, 1081], 'totals': [29097, 27964, 26850, 25740], 'precisions': [44.05265147609719, 16.53196967529681, 8.018621973929237, 4.1996891996892], 'bp': 0.9631327655852462, 'sys_len': 29097, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.12052844230027103, 'precisions': [0.44052651476097193, 0.1653196967529681, 0.08018621973929237, 0.041996891996891994], 'brevity_penalty': 0.9631327655852462, 'length_ratio': 0.9637959589267969, 'translation_length': 29097, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4244007719128182, 'rouge2': 0.17601540674784633, 'rougeL': 0.3693615986543504, 'rougeLsum': 0.3696442718692141}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ug0Gt3w
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [17:56<00:00, 1.05it/s]
+gpt-4o-mini/epochs-02 metrics: {'meteor': 0.3785921332515917, 'sacrebleu': {'score': 12.033706874864837, 'counts': [12801, 4628, 2150, 1076], 'totals': [29076, 27943, 26830, 25722], 'precisions': [44.02600082542303, 16.562287513867517, 8.013417815877748, 4.183189487598165], 'bp': 0.9624112877781842, 'sys_len': 29076, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.12033706874864836, 'precisions': [0.4402600082542303, 0.16562287513867516, 0.08013417815877749, 0.04183189487598165], 'brevity_penalty': 0.9624112877781842, 'length_ratio': 0.9631003643590593, 'translation_length': 29076, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4235104923203792, 'rouge2': 0.1758318317686482, 'rougeL': 0.36922125683186846, 'rougeLsum': 0.3693808162149962}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ug5PhpZ
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [17:02<00:00, 1.11it/s]
+gpt-4o-mini/epochs-03 metrics: {'meteor': 0.37736228106121694, 'sacrebleu': {'score': 11.933111335430906, 'counts': [12779, 4601, 2124, 1061], 'totals': [29096, 27963, 26848, 25737], 'precisions': [43.920126477866376, 16.453885491542394, 7.911203814064362, 4.122469596301046], 'bp': 0.9630984208616785, 'sys_len': 29096, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11933111335430906, 'precisions': [0.4392012647786637, 0.16453885491542394, 0.07911203814064362, 0.041224695963010455], 'brevity_penalty': 0.9630984208616785, 'length_ratio': 0.9637628353759523, 'translation_length': 29096, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4235319934194407, 'rouge2': 0.17493309683581332, 'rougeL': 0.3685697120399035, 'rougeLsum': 0.3689298428303013}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ugPThQI
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [18:35<00:00, 1.02it/s]
+gpt-4o-mini/epochs-04 metrics: {'meteor': 0.37818535038887346, 'sacrebleu': {'score': 11.933285526593995, 'counts': [12797, 4601, 2121, 1061], 'totals': [29110, 27977, 26861, 25749], 'precisions': [43.960838199931295, 16.445651785395146, 7.896206395889952, 4.120548370810517], 'bp': 0.9635791436286372, 'sys_len': 29110, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11933285526593994, 'precisions': [0.43960838199931296, 0.16445651785395146, 0.07896206395889951, 0.041205483708105166], 'brevity_penalty': 0.9635791436286371, 'length_ratio': 0.9642265650877774, 'translation_length': 29110, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.42372801674771476, 'rouge2': 0.17487358435014705, 'rougeL': 0.36931437347367646, 'rougeLsum': 0.36934766241132383}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9ugVLmcB
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [15:47<00:00, 1.20it/s]
+gpt-4o-mini/epochs-05 metrics: {'meteor': 0.3790673551140706, 'sacrebleu': {'score': 11.955698498650582, 'counts': [12808, 4609, 2126, 1064], 'totals': [29209, 28076, 26959, 25846], 'precisions': [43.849498442260945, 16.416156147599374, 7.88604918580066, 4.116691170780778], 'bp': 0.9669721941455759, 'sys_len': 29209, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11955698498650584, 'precisions': [0.4384949844226095, 0.16416156147599373, 0.0788604918580066, 0.041166911707807785], 'brevity_penalty': 0.9669721941455759, 'length_ratio': 0.9675057966213978, 'translation_length': 29209, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.42476082012412075, 'rouge2': 0.17559955520032905, 'rougeL': 0.3700113513462385, 'rougeLsum': 0.37012014201963733}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
+Evaluating model: ft:gpt-4o-mini-2024-07-18:mastercard::9uaCEFTs
+loading train/test data files
+DatasetDict({
+    train: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 4528
+    })
+    test: Dataset({
+        features: ['chinese', 'english'],
+        num_rows: 1133
+    })
+})
+--------------------------------------------------
+chinese: 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。
+--------------------------------------------------
+english: Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.
+*** Evaluating with num_shots: 0
+100%|██████████| 1133/1133 [15:43<00:00, 1.20it/s]
+gpt-4o-mini/epochs-06 metrics: {'meteor': 0.3792226866395673, 'sacrebleu': {'score': 11.982811850915233, 'counts': [12810, 4617, 2137, 1066], 'totals': [29116, 27983, 26868, 25757], 'precisions': [43.996428080780326, 16.499303148340065, 7.95369956825964, 4.138680746981403], 'bp': 0.9637850995333245, 'sys_len': 29116, 'ref_len': 30190}, 'bleu_scores': {'bleu': 0.11982811850915229, 'precisions': [0.43996428080780325, 0.16499303148340064, 0.0795369956825964, 0.04138680746981403], 'brevity_penalty': 0.9637850995333245, 'length_ratio': 0.9644253063928453, 'translation_length': 29116, 'reference_length': 30190}, 'rouge_scores': {'rouge1': 0.4251187202203103, 'rouge2': 0.17553224521896635, 'rougeL': 0.37003282393672954, 'rougeLsum': 0.370114181474168}, 'accuracy': 0.00088261253309797, 'correct_ids': [77]}
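
Note: the FileObject and FineTuningJob entries above come from the OpenAI fine-tuning API (openai>=1.x Python SDK). Below is a minimal sketch of the calls that would produce such output; the training-file path and n_epochs=6 match the log, but the snippet is illustrative rather than the notebook's exact code.

# Sketch of the OpenAI fine-tuning flow reflected in the log above (assumed, not verbatim).
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Upload the chat-formatted JSONL produced by load_openai_training_data()
training_file = client.files.create(
    file=open("datasets/mac/openai-training.jsonl", "rb"),
    purpose="fine-tune",
)

# Launch a gpt-4o-mini fine-tuning job for 6 epochs
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={"n_epochs": 6},
)

# Re-fetch the job until status is "succeeded" and fine_tuned_model is populated
# (e.g. "ft:gpt-4o-mini-2024-07-18:..."), which is then passed to the evaluator.
job = client.fine_tuning.jobs.retrieve(job.id)
print(job.status, job.fine_tuned_model)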
logs/openai-training-sample.jsonl
ADDED

@@ -0,0 +1,3 @@
+{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
+{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
+{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
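
Note: each line above is one training example in OpenAI's chat fine-tuning format (a messages array ending with the assistant turn). A quick sanity check along these lines (an assumed helper, not part of the repository) can catch malformed records before upload.

# Assumed sanity check for a chat-format fine-tuning JSONL file (not in the repo).
import json

with open("logs/openai-training-sample.jsonl") as f:
    for line_no, line in enumerate(f, start=1):
        example = json.loads(line)
        roles = [m["role"] for m in example["messages"]]
        # Each example should end with the assistant turn the model is trained to produce.
        assert roles[-1] == "assistant", f"line {line_no}: last role is {roles[-1]}"
        assert roles[0] in ("system", "user"), f"line {line_no}: unexpected first role"
print("all examples look like valid chat-format training records")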
notebooks/00b_Data Analysis_Few_Shots.ipynb
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:4d7af735cc4e9c4395e8bba07f2d212edc159f0f8f8cf83281f25f1ea8c2d717
+size 3332561
notebooks/00c_Data Analysis_Fine_Tuned.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.

notebooks/00d_Data Analysis_Fine_Tuned_RPP.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.

notebooks/02_Fine_Tune_OpenAI.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
CHANGED

@@ -18,3 +18,4 @@ sentencepiece==0.2.0
 einops==0.8.0
 accelerate==0.32.0
 peft==0.11.1
+sacrebleu==2.4.2
results/mac-results_few_shots_metrics.csv
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:945e48d5773ce3a870e793e410c79148bd34c1b427c7bcd8e9e5ec140e574fa7
+size 9379

results/mac-results_few_shots_openai.csv
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:eea324569b30d1696a51853cbfc5f7b992a569f464cae0db7a88a38c8024578a
+size 2782816

results/mac-results_fine_tuned_metrics.csv
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bbf8e7661be99195444d1c6985179880efdc065fc1856c5fe5a78de14906c064
+size 8321
scripts/eval-4gpu.sh
CHANGED

@@ -16,11 +16,9 @@ grep MemTotal /proc/meminfo
 #pip install -r requirements.txt
 
 export BATCH_SIZE=1
-# export START_REPETITION_PENALTY=1.06
-export START_NUM_SHOTS=50
 
 #./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
 
-./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
+# ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
 
-
+./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-280