{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {},
   "source": [
    "# Fine tuning notebook\n",
    "In this notebook we load and visualize a dataset of fashion products and fine-tune a vision-language model (VLM) on it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from datasets import load_dataset\n",
    "from PIL import Image\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from modules.data_processing import create_training_example"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2",
   "metadata": {},
   "source": [
    "## Understand the data and split into train and test\n",
    "1. Shape of dataset\n",
    "2. Distribution / balance of categories\n",
    "3. Train-test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the fashion products dataset and flatten its train split into a DataFrame\n",
    "ds = load_dataset(\"ceyda/fashion-products-small\")\n",
    "df = ds[\"train\"].to_pandas()\n",
    "print(f\"Shape of dataset: {df.shape}\")\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {},
   "outputs": [],
   "source": [
    "### For expediency we will randomly sample at most 10,000 total rows\n",
    "sample_size = 10000\n",
    "# Cap n at the number of available rows: DataFrame.sample raises ValueError when\n",
    "# n exceeds the frame length (e.g. if this cell is re-run after the filtering step).\n",
    "df = df.sample(n=min(sample_size, len(df)), random_state=42)\n",
    "print(f\"Shape of dataset after sampling: {df.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_category_distribution_by_percent(df, col):\n",
    "    count_df = df.groupby(col)[\"id\"].count().reset_index(name=\"count\")\n",
    "    _denominator = df.shape[0]\n",
    "    count_df.loc[:, \"percent\"] = (count_df[\"count\"] / _denominator) * 100\n",
    "    return count_df.sort_values(by=\"percent\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Share of each masterCategory; kept in m_cat because the filtering cell further down reads it\n",
    "m_cat = get_category_distribution_by_percent(df, col=\"masterCategory\")\n",
    "m_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same breakdown for the subCategory column\n",
    "get_category_distribution_by_percent(df, col=\"subCategory\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same breakdown for the gender column\n",
    "get_category_distribution_by_percent(df, col=\"gender\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9",
   "metadata": {},
   "source": [
    "As seen above, the dataset is imbalanced, especially across masterCategory. Let's filter out any masterCategory that accounts for less than 2% of the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# masterCategory values that make up less than 2% of the rows summarized in m_cat\n",
    "rare_mask = m_cat[\"percent\"] < 2\n",
    "cat_less_than_2_percent = m_cat.loc[rare_mask, \"masterCategory\"].values\n",
    "\n",
    "print(f\"Starting with {df.shape}\")\n",
    "# Keep only the rows whose masterCategory is not in the rare set\n",
    "keep_rows = ~df[\"masterCategory\"].isin(cat_less_than_2_percent)\n",
    "df = df.loc[keep_rows]\n",
    "print(f\"Finished with {df.shape}\")\n",
    "\n",
    "print(f\"masterCategories are now: {df.masterCategory.unique()}\")\n",
    "print(f\"subCategories are now: {df.subCategory.unique()}\")\n",
    "print(f\"genders are now: {df.gender.unique()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 15% of the filtered rows as a test set; the fixed seed keeps the split repeatable\n",
    "df_train, df_test = train_test_split(\n",
    "    df,\n",
    "    test_size=0.15,\n",
    "    random_state=42,\n",
    ")\n",
    "print(f\"Train shape: {df_train.shape}\")\n",
    "print(f\"Test shape: {df_test.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Save datasets in /data folder\n",
    "# Create the output directory first: to_csv fails if the parent folder is missing\n",
    "data_dir = Path(\"../data\")\n",
    "data_dir.mkdir(parents=True, exist_ok=True)\n",
    "df_train.to_csv(data_dir / \"train.csv\", index=False)\n",
    "df_test.to_csv(data_dir / \"test.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13",
   "metadata": {},
   "source": [
    "## Fine tuning\n",
    "\n",
    "1. Upload dataset to Fireworks\n",
    "2. Fine tune model on dataset\n",
    "3. Create deployment with fine tuned model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {},
   "outputs": [],
   "source": [
    "import base64\n",
    "from io import BytesIO\n",
    "\n",
    "def pil_to_base64(pil_image):\n",
    "    \"\"\"Convert PIL Image to base64 string\"\"\"\n",
    "    buffered = BytesIO()\n",
    "    pil_image.save(buffered, format=\"PNG\")\n",
    "    img_str = base64.b64encode(buffered.getvalue()).decode()\n",
    "    return f\"data:image/png;base64,{img_str}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use positional access: df_train keeps the original row labels after sampling and\n",
    "# splitting, so label 0 is not guaranteed to exist (df_train['image'][0] can raise KeyError).\n",
    "first_row = df_train.iloc[0]\n",
    "img_bytes = first_row['image']['bytes']\n",
    "img = Image.open(io.BytesIO(img_bytes))\n",
    "plt.imshow(img)\n",
    "plt.axis('off')\n",
    "# Read the title from the same row as the image so the caption matches the picture\n",
    "plt.title(first_row.get('productDisplayName', 'Product'))\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16",
   "metadata": {},
   "source": [
    "### 1. Convert dataset to Fireworks JSONL as specified in [the docs](https://fireworks.ai/docs/fine-tuning/fine-tuning-vlm#supervised-fine-tuning-for-vlms-sft)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Creating training examples...\")\n",
    "# One training example per row of the training split\n",
    "training_data = [create_training_example(row) for _idx, row in df_train.iterrows()]\n",
    "\n",
    "# Serialize lazily: one JSON object per line (JSON Lines)\n",
    "jsonl_lines = (json.dumps(example) + \"\\n\" for example in training_data)\n",
    "with open(\"../data/fashion_catalog_train.jsonl\", \"w\") as f:\n",
    "    f.writelines(jsonl_lines)\n",
    "print(f\"Finished creating training examples {len(training_data)} / {df_train.shape[0]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18",
   "metadata": {},
   "source": [
    "**Note: make sure you have firectl installed. If you do not, please install it from [here](https://docs.fireworks.ai/tools-sdks/firectl/firectl).**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the JSONL file written above as the dataset fashion-catalog-train under the pyroworks account\n",
    "! firectl -a pyroworks create dataset fashion-catalog-train ../data/fashion_catalog_train.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check dataset was correctly uploaded (uses the same firectl CLI as the upload command)\n",
    "! firectl -a pyroworks get dataset fashion-catalog-train"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21",
   "metadata": {},
   "source": [
    "### 2. Run fine tuning job\n",
    "\n",
    "Parameter Guide\n",
    "| Parameter | Description | Recommended Value |\n",
    "|-----------|-------------|-------------------|\n",
    "| `--base-model` | Base model to fine-tune | `qwen2p5-vl-72b-instruct` |\n",
    "| `--dataset` | Your uploaded dataset ID | From step 1 (dataset upload) |\n",
    "| `--output-model` | Name for your fine-tuned model | `Qwen2.5-72b-fashion-catalog` |\n",
    "| `--epochs` | Training iterations | `3` (start small) |\n",
    "| `--learning-rate` | Learning rate | `0.0001` |\n",
    "| `--turbo` | Faster training | Always include |\n",
    "| `--early-stop` | Prevent overfitting | Always include |\n",
    "| `--eval-auto-carveout` | Auto validation split | Always include |"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22",
   "metadata": {},
   "source": [
    "##### Fine tune Qwen 2.5 vl 32B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Supervised fine-tuning job for the 32B base model; --eval-auto-carveout is passed here too,\n",
    "# matching the parameter guide (\"Always include\") and the other fine-tuning commands.\n",
    "! firectl -a pyroworks create sftj --base-model accounts/fireworks/models/qwen2p5-vl-32b-instruct --dataset accounts/pyroworks/datasets/fashion-catalog-train --output-model qwen-32b-fashion-catalog --display-name \"Qwen2.5-32b-fashion-catalog\" --epochs 3 --learning-rate 0.0001 --early-stop --eval-auto-carveout"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Check status of job\n",
    "# NOTE(review): the job ID below is hard-coded; presumably it is the ID reported when the\n",
    "# 32B job was created. Replace it with your own job ID when re-running. TODO confirm.\n",
    "! firectl -a pyroworks get sftj j588i1qm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25",
   "metadata": {},
   "source": [
    "##### Fine tune Qwen 2.5 vl 72B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a fine-tuning job (sftj) for the Qwen 2.5 VL 72B base model on the fashion-catalog-train dataset\n",
    "! firectl -a pyroworks create sftj --base-model accounts/fireworks/models/qwen2p5-vl-72b-instruct --dataset accounts/pyroworks/datasets/fashion-catalog-train --output-model qwen-72b-fashion-catalog --display-name \"Qwen2.5-72b-fashion-catalog\" --epochs 3 --learning-rate 0.0001 --early-stop --eval-auto-carveout"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Check status of job\n",
    "# NOTE(review): the job ID below is hard-coded; presumably it is the ID reported when the\n",
    "# 72B job was created. Replace it with your own job ID when re-running. TODO confirm.\n",
    "! firectl -a pyroworks get sftj bew0pztj"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28",
   "metadata": {},
   "source": [
    "##### Fine tune Qwen 3 vl 8B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a fine-tuning job (sftj) for the Qwen 3 VL 8B base model on the fashion-catalog-train dataset\n",
    "! firectl -a pyroworks create sftj --base-model accounts/fireworks/models/qwen3-vl-8b-instruct --dataset accounts/pyroworks/datasets/fashion-catalog-train --output-model qwen3-8b-fashion-catalog --display-name \"Qwen3-8B-fashion-catalog\" --epochs 3 --learning-rate 0.0001 --early-stop --eval-auto-carveout"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30",
   "metadata": {},
   "source": [
    "##### Fine tune Qwen 3 VL 32B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a fine-tuning job (sftj) for the Qwen 3 VL 32B base model on the fashion-catalog-train dataset\n",
    "! firectl -a pyroworks create sftj --base-model accounts/fireworks/models/qwen3-vl-32b-instruct --dataset accounts/pyroworks/datasets/fashion-catalog-train --output-model qwen3-32b-fashion-catalog --display-name \"Qwen3-32B-fashion-catalog\" --epochs 3 --learning-rate 0.0001 --early-stop --eval-auto-carveout"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}