Igor Evdokimov committed on
Commit ·
d632b3b
1
Parent(s): 68b94a3
- basics
Browse files- config.json +13 -0
- prepare.py +16 -0
- processed_ds/dataset_dict.json +1 -0
- processed_ds/test/data-00000-of-00001.arrow +3 -0
- processed_ds/test/dataset_info.json +58 -0
- processed_ds/test/state.json +13 -0
- processed_ds/train/data-00000-of-00001.arrow +3 -0
- processed_ds/train/dataset_info.json +58 -0
- processed_ds/train/state.json +13 -0
- train.py +55 -0
- trainingSet.json +0 -0
config.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "qwen2_5",
|
| 3 |
+
"adapter_config": {
|
| 4 |
+
"base_model": "Alexis-Az/Qwen-2.5-Coder-7B-4bit-CSharp-Alpaca-Code-ORPO-LoRA",
|
| 5 |
+
"adapter_type": "lora",
|
| 6 |
+
"r": 8,
|
| 7 |
+
"lora_alpha": 32,
|
| 8 |
+
"target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
|
| 9 |
+
"lora_dropout": 0.1
|
| 10 |
+
},
|
| 11 |
+
"tags": ["peft", "lora", "adapter-only"],
|
| 12 |
+
"license": "apache-2.0"
|
| 13 |
+
}
|
prepare.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import login
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def prepare(example):
    """Map one raw training example to ``input_text`` / ``target_text``.

    Fix: the dataset produced by this repo's trainingSet.json has columns
    "instruction", "input", "output" (and "code") — see the committed
    processed_ds/*/dataset_info.json — not "prompt"/"completion".  The
    original ``example.get("prompt", "")`` therefore yielded empty strings
    for every row.  We keep the old keys working (backward compatible) and
    fall back to the real columns when they are absent.

    Returns:
        dict with keys "input_text" (prompt side) and "target_text"
        (completion side), both strings (possibly empty).
    """
    # Prefer the legacy "prompt" key; otherwise build the prompt from
    # the instruction plus the optional free-form "input" field.
    inp = example.get("prompt") or example.get("instruction", "")
    extra = example.get("input", "")
    if extra:
        inp = f"{inp}\n{extra}" if inp else extra
    tgt = example.get("completion") or example.get("output", "")
    return {"input_text": inp, "target_text": tgt}
|
| 10 |
+
|
| 11 |
+
# Load the raw JSON training set; the "json" builder infers the columns
# from the file's keys and split="train" yields a single Dataset.
ds = load_dataset("json", data_files="trainingSet.json", split="train")
# Add input_text / target_text columns for the trainer.
ds = ds.map(prepare)
# Hold out 2% of the rows as the "test" split for evaluation.
ds = ds.train_test_split(test_size=0.02)
# Persist the DatasetDict so train.py can load_from_disk() it.
ds.save_to_disk("processed_ds")

print("Saved processed_ds")
|
processed_ds/dataset_dict.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"splits": ["train", "test"]}
|
processed_ds/test/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5673f1d5a1ccd0c5bcb67d6a89accca2d25160c0d655f81b3ef1e8a516d67af4
|
| 3 |
+
size 4056
|
processed_ds/test/dataset_info.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "json",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "json",
|
| 6 |
+
"dataset_size": 118782,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {
|
| 9 |
+
"D:/Work/BotticelliBotsMakerAssistant/trainingSet.json": {
|
| 10 |
+
"num_bytes": 128816,
|
| 11 |
+
"checksum": null
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"download_size": 128816,
|
| 15 |
+
"features": {
|
| 16 |
+
"input": {
|
| 17 |
+
"dtype": "string",
|
| 18 |
+
"_type": "Value"
|
| 19 |
+
},
|
| 20 |
+
"output": {
|
| 21 |
+
"dtype": "string",
|
| 22 |
+
"_type": "Value"
|
| 23 |
+
},
|
| 24 |
+
"code": {
|
| 25 |
+
"dtype": "string",
|
| 26 |
+
"_type": "Value"
|
| 27 |
+
},
|
| 28 |
+
"instruction": {
|
| 29 |
+
"dtype": "string",
|
| 30 |
+
"_type": "Value"
|
| 31 |
+
},
|
| 32 |
+
"input_text": {
|
| 33 |
+
"dtype": "string",
|
| 34 |
+
"_type": "Value"
|
| 35 |
+
},
|
| 36 |
+
"target_text": {
|
| 37 |
+
"dtype": "string",
|
| 38 |
+
"_type": "Value"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"homepage": "",
|
| 42 |
+
"license": "",
|
| 43 |
+
"size_in_bytes": 247598,
|
| 44 |
+
"splits": {
|
| 45 |
+
"train": {
|
| 46 |
+
"name": "train",
|
| 47 |
+
"num_bytes": 118782,
|
| 48 |
+
"num_examples": 296,
|
| 49 |
+
"dataset_name": "json"
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
"version": {
|
| 53 |
+
"version_str": "0.0.0",
|
| 54 |
+
"major": 0,
|
| 55 |
+
"minor": 0,
|
| 56 |
+
"patch": 0
|
| 57 |
+
}
|
| 58 |
+
}
|
processed_ds/test/state.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "fc9f3d6416289cea",
|
| 8 |
+
"_format_columns": null,
|
| 9 |
+
"_format_kwargs": {},
|
| 10 |
+
"_format_type": null,
|
| 11 |
+
"_output_all_columns": false,
|
| 12 |
+
"_split": "train"
|
| 13 |
+
}
|
processed_ds/train/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c0a3a365ffd004cab804705a1c0bc4910ed57b98280bd5e1c04176dcd6f0f8c
|
| 3 |
+
size 119688
|
processed_ds/train/dataset_info.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "json",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "json",
|
| 6 |
+
"dataset_size": 118782,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {
|
| 9 |
+
"D:/Work/BotticelliBotsMakerAssistant/trainingSet.json": {
|
| 10 |
+
"num_bytes": 128816,
|
| 11 |
+
"checksum": null
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"download_size": 128816,
|
| 15 |
+
"features": {
|
| 16 |
+
"input": {
|
| 17 |
+
"dtype": "string",
|
| 18 |
+
"_type": "Value"
|
| 19 |
+
},
|
| 20 |
+
"output": {
|
| 21 |
+
"dtype": "string",
|
| 22 |
+
"_type": "Value"
|
| 23 |
+
},
|
| 24 |
+
"code": {
|
| 25 |
+
"dtype": "string",
|
| 26 |
+
"_type": "Value"
|
| 27 |
+
},
|
| 28 |
+
"instruction": {
|
| 29 |
+
"dtype": "string",
|
| 30 |
+
"_type": "Value"
|
| 31 |
+
},
|
| 32 |
+
"input_text": {
|
| 33 |
+
"dtype": "string",
|
| 34 |
+
"_type": "Value"
|
| 35 |
+
},
|
| 36 |
+
"target_text": {
|
| 37 |
+
"dtype": "string",
|
| 38 |
+
"_type": "Value"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"homepage": "",
|
| 42 |
+
"license": "",
|
| 43 |
+
"size_in_bytes": 247598,
|
| 44 |
+
"splits": {
|
| 45 |
+
"train": {
|
| 46 |
+
"name": "train",
|
| 47 |
+
"num_bytes": 118782,
|
| 48 |
+
"num_examples": 296,
|
| 49 |
+
"dataset_name": "json"
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
"version": {
|
| 53 |
+
"version_str": "0.0.0",
|
| 54 |
+
"major": 0,
|
| 55 |
+
"minor": 0,
|
| 56 |
+
"patch": 0
|
| 57 |
+
}
|
| 58 |
+
}
|
processed_ds/train/state.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "1497a0576b514d71",
|
| 8 |
+
"_format_columns": null,
|
| 9 |
+
"_format_kwargs": {},
|
| 10 |
+
"_format_type": null,
|
| 11 |
+
"_output_all_columns": false,
|
| 12 |
+
"_split": "train"
|
| 13 |
+
}
|
train.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from datasets import load_from_disk
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
| 4 |
+
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
model_name = "BotticelliBots/BotticelliBotsMakerAssistant"  # your base_model
out_dir = "out_peft"

# load dataset (the DatasetDict written by prepare.py: "train" and "test" splits)
ds = load_from_disk("processed_ds")
train_ds = ds["train"]
eval_ds = ds["test"]

# tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# NOTE(review): `load_in_4bit=True` is deprecated in recent transformers in
# favour of quantization_config=BitsAndBytesConfig(load_in_4bit=True) —
# confirm against the installed version.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)

# prepare and apply LoRA: cast/freeze the quantized base for k-bit training,
# then attach low-rank adapters to the attention projections only.
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["q_proj","v_proj","k_proj","o_proj"],
    lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
|
| 26 |
+
|
| 27 |
+
# tokenization fn
|
| 28 |
+
def tokenize_fn(batch):
    """Tokenize prompt+target pairs for causal-LM fine-tuning.

    The prompt and target are joined with the EOS token, then tokenized
    with fixed-length padding/truncation (max_length=1024).

    Fix: the original set ``labels = input_ids.copy()``, which computes
    loss on every padding token and teaches the model to emit padding.
    Padding positions (attention_mask == 0) are now masked with -100,
    the ignore_index used by transformers' cross-entropy loss.
    """
    inputs = [a + tokenizer.eos_token + b
              for a, b in zip(batch["input_text"], batch["target_text"])]
    out = tokenizer(inputs, truncation=True, padding="max_length", max_length=1024)
    # Mask by attention_mask rather than pad_token_id so this stays correct
    # even when the tokenizer reuses EOS as its pad token.
    out["labels"] = [
        [tok if keep == 1 else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(out["input_ids"], out["attention_mask"])
    ]
    return out
|
| 33 |
+
|
| 34 |
+
# Tokenize both splits; drop the raw text columns so only model inputs remain.
train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
eval_ds = eval_ds.map(tokenize_fn, batched=True, remove_columns=eval_ds.column_names)

training_args = TrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8 on one device
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_total_limit=2,             # keep only the two most recent checkpoints
    optim="paged_adamw_8bit"        # bitsandbytes paged optimizer for k-bit training
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_ds, eval_dataset=eval_ds)
trainer.train()

# save PEFT weights (small) — only the adapter, not the full base model
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)
print("Saved PEFT to", out_dir)
|
trainingSet.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|