Igor Evdokimov committed on
Commit
d632b3b
·
1 Parent(s): 68b94a3
config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "qwen2_5",
3
+ "adapter_config": {
4
+ "base_model": "Alexis-Az/Qwen-2.5-Coder-7B-4bit-CSharp-Alpaca-Code-ORPO-LoRA",
5
+ "adapter_type": "lora",
6
+ "r": 8,
7
+ "lora_alpha": 32,
8
+ "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
9
+ "lora_dropout": 0.1
10
+ },
11
+ "tags": ["peft", "lora", "adapter-only"],
12
+ "license": "apache-2.0"
13
+ }
prepare.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import login
2
+ from datasets import load_dataset
3
+
4
+
5
def prepare(example):
    """Map one raw jsonl record to an input_text/target_text pair.

    The original implementation hard-coded the "prompt"/"completion" keys,
    but the dataset schema captured in processed_ds/*/dataset_info.json
    only contains "instruction", "input", "output" and "code" columns —
    so every mapped field came out as an empty string.  Fall back through
    the field names actually present while staying compatible with
    prompt/completion-style records.
    """
    # Prefer prompt/completion when present, otherwise use the
    # instruction-tuning column names the training set actually has.
    inp = example.get("prompt") or example.get("instruction") or ""
    tgt = example.get("completion") or example.get("output") or ""
    return {"input_text": inp, "target_text": tgt}
10
+
11
# Build the processed dataset from the raw jsonl training set and
# persist it to disk for train.py to consume.
ds = load_dataset("json", data_files="trainingSet.json", split="train")
ds = ds.map(prepare)
# Fixed seed so the train/test split — and therefore the saved
# processed_ds — is reproducible across runs; the original split
# was unseeded and produced a different split every time.
ds = ds.train_test_split(test_size=0.02, seed=42)
ds.save_to_disk("processed_ds")

print("Saved processed_ds")
processed_ds/dataset_dict.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"splits": ["train", "test"]}
processed_ds/test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5673f1d5a1ccd0c5bcb67d6a89accca2d25160c0d655f81b3ef1e8a516d67af4
3
+ size 4056
processed_ds/test/dataset_info.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "json",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "json",
6
+ "dataset_size": 118782,
7
+ "description": "",
8
+ "download_checksums": {
9
+ "D:/Work/BotticelliBotsMakerAssistant/trainingSet.json": {
10
+ "num_bytes": 128816,
11
+ "checksum": null
12
+ }
13
+ },
14
+ "download_size": 128816,
15
+ "features": {
16
+ "input": {
17
+ "dtype": "string",
18
+ "_type": "Value"
19
+ },
20
+ "output": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ },
24
+ "code": {
25
+ "dtype": "string",
26
+ "_type": "Value"
27
+ },
28
+ "instruction": {
29
+ "dtype": "string",
30
+ "_type": "Value"
31
+ },
32
+ "input_text": {
33
+ "dtype": "string",
34
+ "_type": "Value"
35
+ },
36
+ "target_text": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ }
40
+ },
41
+ "homepage": "",
42
+ "license": "",
43
+ "size_in_bytes": 247598,
44
+ "splits": {
45
+ "train": {
46
+ "name": "train",
47
+ "num_bytes": 118782,
48
+ "num_examples": 296,
49
+ "dataset_name": "json"
50
+ }
51
+ },
52
+ "version": {
53
+ "version_str": "0.0.0",
54
+ "major": 0,
55
+ "minor": 0,
56
+ "patch": 0
57
+ }
58
+ }
processed_ds/test/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "fc9f3d6416289cea",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "train"
13
+ }
processed_ds/train/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c0a3a365ffd004cab804705a1c0bc4910ed57b98280bd5e1c04176dcd6f0f8c
3
+ size 119688
processed_ds/train/dataset_info.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "json",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "json",
6
+ "dataset_size": 118782,
7
+ "description": "",
8
+ "download_checksums": {
9
+ "D:/Work/BotticelliBotsMakerAssistant/trainingSet.json": {
10
+ "num_bytes": 128816,
11
+ "checksum": null
12
+ }
13
+ },
14
+ "download_size": 128816,
15
+ "features": {
16
+ "input": {
17
+ "dtype": "string",
18
+ "_type": "Value"
19
+ },
20
+ "output": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ },
24
+ "code": {
25
+ "dtype": "string",
26
+ "_type": "Value"
27
+ },
28
+ "instruction": {
29
+ "dtype": "string",
30
+ "_type": "Value"
31
+ },
32
+ "input_text": {
33
+ "dtype": "string",
34
+ "_type": "Value"
35
+ },
36
+ "target_text": {
37
+ "dtype": "string",
38
+ "_type": "Value"
39
+ }
40
+ },
41
+ "homepage": "",
42
+ "license": "",
43
+ "size_in_bytes": 247598,
44
+ "splits": {
45
+ "train": {
46
+ "name": "train",
47
+ "num_bytes": 118782,
48
+ "num_examples": 296,
49
+ "dataset_name": "json"
50
+ }
51
+ },
52
+ "version": {
53
+ "version_str": "0.0.0",
54
+ "major": 0,
55
+ "minor": 0,
56
+ "patch": 0
57
+ }
58
+ }
processed_ds/train/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "1497a0576b514d71",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "train"
13
+ }
train.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datasets import load_from_disk
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
4
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
5
+ import torch
6
+
7
# Base model to fine-tune (your base_model) and the adapter output directory.
model_name = "BotticelliBots/BotticelliBotsMakerAssistant"
out_dir = "out_peft"

# Load the pre-processed splits produced by prepare.py.
ds = load_from_disk("processed_ds")
train_ds, eval_ds = ds["train"], ds["test"]
14
+
15
+ # tokenizer & model
16
# Tokenizer and 4-bit quantized base model.
# NOTE(review): `load_in_4bit=True` is the legacy flag — newer transformers
# versions prefer passing a BitsAndBytesConfig via `quantization_config`;
# confirm against the pinned transformers version before changing.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", load_in_4bit=True
)

# Make the quantized model trainable, then attach LoRA adapters to the
# attention projection matrices.
model = prepare_model_for_kbit_training(model)
peft_cfg = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_cfg)
26
+
27
+ # tokenization fn
28
def tokenize_fn(batch):
    """Tokenize prompt+target pairs for causal-LM fine-tuning.

    Concatenates input_text + EOS + target_text, tokenizes to a fixed
    length, and builds labels.  Padding positions are masked with -100
    (the ignore index of the cross-entropy loss) — the original copied
    input_ids verbatim, which made the model also learn to predict
    padding tokens.
    """
    texts = [a + tokenizer.eos_token + b
             for a, b in zip(batch["input_text"], batch["target_text"])]
    out = tokenizer(texts, truncation=True, padding="max_length", max_length=1024)
    # -100 on padded positions so they are ignored by the loss.
    out["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(out["input_ids"], out["attention_mask"])
    ]
    return out
33
+
34
# Tokenize both splits, dropping the raw text columns so the Trainer
# only sees input_ids / attention_mask / labels.
train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
eval_ds = eval_ds.map(tokenize_fn, batched=True, remove_columns=eval_ds.column_names)

training_args = TrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,   # effective batch size of 8
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_total_limit=2,              # keep only the two newest checkpoints
    optim="paged_adamw_8bit",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)
trainer.train()
# The original script built eval_ds but never evaluated it (the Trainer
# default is no evaluation); run one final evaluation so the held-out
# loss is actually reported.
eval_metrics = trainer.evaluate()
print("Eval metrics:", eval_metrics)

# Save only the small PEFT adapter weights plus the tokenizer files.
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)
print("Saved PEFT to", out_dir)
trainingSet.json ADDED
The diff for this file is too large to render. See raw diff