Igor Evdokimov committed on
Commit ·
d632b3b
1
Parent(s): 68b94a3
- basics
Browse files- config.json +13 -0
- prepare.py +16 -0
- processed_ds/dataset_dict.json +1 -0
- processed_ds/test/data-00000-of-00001.arrow +3 -0
- processed_ds/test/dataset_info.json +58 -0
- processed_ds/test/state.json +13 -0
- processed_ds/train/data-00000-of-00001.arrow +3 -0
- processed_ds/train/dataset_info.json +58 -0
- processed_ds/train/state.json +13 -0
- train.py +55 -0
- trainingSet.json +0 -0
config.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "qwen2_5",
|
| 3 |
+
"adapter_config": {
|
| 4 |
+
"base_model": "Alexis-Az/Qwen-2.5-Coder-7B-4bit-CSharp-Alpaca-Code-ORPO-LoRA",
|
| 5 |
+
"adapter_type": "lora",
|
| 6 |
+
"r": 8,
|
| 7 |
+
"lora_alpha": 32,
|
| 8 |
+
"target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
|
| 9 |
+
"lora_dropout": 0.1
|
| 10 |
+
},
|
| 11 |
+
"tags": ["peft", "lora", "adapter-only"],
|
| 12 |
+
"license": "apache-2.0"
|
| 13 |
+
}
|
prepare.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import login
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def prepare(example):
    """Map one raw training example to ``input_text`` / ``target_text``.

    Fix: the dataset produced by this repo's trainingSet.json has columns
    "instruction", "input", "output" (and "code") — see the committed
    processed_ds/*/dataset_info.json — not "prompt"/"completion".  The
    original ``example.get("prompt", "")`` therefore yielded empty strings
    for every row.  We keep the old keys working (backward compatible) and
    fall back to the real columns when they are absent.

    Returns:
        dict with keys "input_text" (prompt side) and "target_text"
        (completion side), both strings (possibly empty).
    """
    # Prefer the legacy "prompt" key; otherwise build the prompt from
    # the instruction plus the optional free-form "input" field.
    inp = example.get("prompt") or example.get("instruction", "")
    extra = example.get("input", "")
    if extra:
        inp = f"{inp}\n{extra}" if inp else extra
    tgt = example.get("completion") or example.get("output", "")
    return {"input_text": inp, "target_text": tgt}
|
| 10 |
+
|
| 11 |
+
# Load the raw JSON training set; the "json" builder infers the columns
# from the file's keys and split="train" yields a single Dataset.
ds = load_dataset("json", data_files="trainingSet.json", split="train")
# Add input_text / target_text columns for the trainer.
ds = ds.map(prepare)
# Hold out 2% of the rows as the "test" split for evaluation.
ds = ds.train_test_split(test_size=0.02)
# Persist the DatasetDict so train.py can load_from_disk() it.
ds.save_to_disk("processed_ds")

print("Saved processed_ds")
|
processed_ds/dataset_dict.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"splits": ["train", "test"]}
|
processed_ds/test/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5673f1d5a1ccd0c5bcb67d6a89accca2d25160c0d655f81b3ef1e8a516d67af4
|
| 3 |
+
size 4056
|
processed_ds/test/dataset_info.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "json",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "json",
|
| 6 |
+
"dataset_size": 118782,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {
|
| 9 |
+
"D:/Work/BotticelliBotsMakerAssistant/trainingSet.json": {
|
| 10 |
+
"num_bytes": 128816,
|
| 11 |
+
"checksum": null
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"download_size": 128816,
|
| 15 |
+
"features": {
|
| 16 |
+
"input": {
|
| 17 |
+
"dtype": "string",
|
| 18 |
+
"_type": "Value"
|
| 19 |
+
},
|
| 20 |
+
"output": {
|
| 21 |
+
"dtype": "string",
|
| 22 |
+
"_type": "Value"
|
| 23 |
+
},
|
| 24 |
+
"code": {
|
| 25 |
+
"dtype": "string",
|
| 26 |
+
"_type": "Value"
|
| 27 |
+
},
|
| 28 |
+
"instruction": {
|
| 29 |
+
"dtype": "string",
|
| 30 |
+
"_type": "Value"
|
| 31 |
+
},
|
| 32 |
+
"input_text": {
|
| 33 |
+
"dtype": "string",
|
| 34 |
+
"_type": "Value"
|
| 35 |
+
},
|
| 36 |
+
"target_text": {
|
| 37 |
+
"dtype": "string",
|
| 38 |
+
"_type": "Value"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"homepage": "",
|
| 42 |
+
"license": "",
|
| 43 |
+
"size_in_bytes": 247598,
|
| 44 |
+
"splits": {
|
| 45 |
+
"train": {
|
| 46 |
+
"name": "train",
|
| 47 |
+
"num_bytes": 118782,
|
| 48 |
+
"num_examples": 296,
|
| 49 |
+
"dataset_name": "json"
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
"version": {
|
| 53 |
+
"version_str": "0.0.0",
|
| 54 |
+
"major": 0,
|
| 55 |
+
"minor": 0,
|
| 56 |
+
"patch": 0
|
| 57 |
+
}
|
| 58 |
+
}
|
processed_ds/test/state.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "fc9f3d6416289cea",
|
| 8 |
+
"_format_columns": null,
|
| 9 |
+
"_format_kwargs": {},
|
| 10 |
+
"_format_type": null,
|
| 11 |
+
"_output_all_columns": false,
|
| 12 |
+
"_split": "train"
|
| 13 |
+
}
|
processed_ds/train/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c0a3a365ffd004cab804705a1c0bc4910ed57b98280bd5e1c04176dcd6f0f8c
|
| 3 |
+
size 119688
|
processed_ds/train/dataset_info.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "json",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "json",
|
| 6 |
+
"dataset_size": 118782,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {
|
| 9 |
+
"D:/Work/BotticelliBotsMakerAssistant/trainingSet.json": {
|
| 10 |
+
"num_bytes": 128816,
|
| 11 |
+
"checksum": null
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"download_size": 128816,
|
| 15 |
+
"features": {
|
| 16 |
+
"input": {
|
| 17 |
+
"dtype": "string",
|
| 18 |
+
"_type": "Value"
|
| 19 |
+
},
|
| 20 |
+
"output": {
|
| 21 |
+
"dtype": "string",
|
| 22 |
+
"_type": "Value"
|
| 23 |
+
},
|
| 24 |
+
"code": {
|
| 25 |
+
"dtype": "string",
|
| 26 |
+
"_type": "Value"
|
| 27 |
+
},
|
| 28 |
+
"instruction": {
|
| 29 |
+
"dtype": "string",
|
| 30 |
+
"_type": "Value"
|
| 31 |
+
},
|
| 32 |
+
"input_text": {
|
| 33 |
+
"dtype": "string",
|
| 34 |
+
"_type": "Value"
|
| 35 |
+
},
|
| 36 |
+
"target_text": {
|
| 37 |
+
"dtype": "string",
|
| 38 |
+
"_type": "Value"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"homepage": "",
|
| 42 |
+
"license": "",
|
| 43 |
+
"size_in_bytes": 247598,
|
| 44 |
+
"splits": {
|
| 45 |
+
"train": {
|
| 46 |
+
"name": "train",
|
| 47 |
+
"num_bytes": 118782,
|
| 48 |
+
"num_examples": 296,
|
| 49 |
+
"dataset_name": "json"
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
"version": {
|
| 53 |
+
"version_str": "0.0.0",
|
| 54 |
+
"major": 0,
|
| 55 |
+
"minor": 0,
|
| 56 |
+
"patch": 0
|
| 57 |
+
}
|
| 58 |
+
}
|
processed_ds/train/state.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "1497a0576b514d71",
|
| 8 |
+
"_format_columns": null,
|
| 9 |
+
"_format_kwargs": {},
|
| 10 |
+
"_format_type": null,
|
| 11 |
+
"_output_all_columns": false,
|
| 12 |
+
"_split": "train"
|
| 13 |
+
}
|
train.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from datasets import load_from_disk
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
| 4 |
+
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
model_name = "BotticelliBots/BotticelliBotsMakerAssistant"  # your base_model
out_dir = "out_peft"

# load dataset (the DatasetDict written by prepare.py: "train" and "test" splits)
ds = load_from_disk("processed_ds")
train_ds = ds["train"]
eval_ds = ds["test"]

# tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# NOTE(review): `load_in_4bit=True` is deprecated in recent transformers in
# favour of quantization_config=BitsAndBytesConfig(load_in_4bit=True) —
# confirm against the installed version.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)

# prepare and apply LoRA: cast/freeze the quantized base for k-bit training,
# then attach low-rank adapters to the attention projections only.
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["q_proj","v_proj","k_proj","o_proj"],
    lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
|
| 26 |
+
|
| 27 |
+
# tokenization fn
|
| 28 |
+
def tokenize_fn(batch):
    """Tokenize prompt+target pairs for causal-LM fine-tuning.

    The prompt and target are joined with the EOS token, then tokenized
    with fixed-length padding/truncation (max_length=1024).

    Fix: the original set ``labels = input_ids.copy()``, which computes
    loss on every padding token and teaches the model to emit padding.
    Padding positions (attention_mask == 0) are now masked with -100,
    the ignore_index used by transformers' cross-entropy loss.
    """
    inputs = [a + tokenizer.eos_token + b
              for a, b in zip(batch["input_text"], batch["target_text"])]
    out = tokenizer(inputs, truncation=True, padding="max_length", max_length=1024)
    # Mask by attention_mask rather than pad_token_id so this stays correct
    # even when the tokenizer reuses EOS as its pad token.
    out["labels"] = [
        [tok if keep == 1 else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(out["input_ids"], out["attention_mask"])
    ]
    return out
|
| 33 |
+
|
| 34 |
+
# Tokenize both splits; drop the raw text columns so only model inputs remain.
train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
eval_ds = eval_ds.map(tokenize_fn, batched=True, remove_columns=eval_ds.column_names)

training_args = TrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8 on one device
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_total_limit=2,             # keep only the two most recent checkpoints
    optim="paged_adamw_8bit"        # bitsandbytes paged optimizer for k-bit training
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_ds, eval_dataset=eval_ds)
trainer.train()

# save PEFT weights (small) — only the adapter, not the full base model
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)
print("Saved PEFT to", out_dir)
|
trainingSet.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|