# PIPELINE DEFINITION
# Name: qlora-pdf-fine-tuning
# Description: Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects training-data
#              bucket. Pushes the adapter to Gitea and logs metrics to MLflow.
# Inputs:
#    aws_access_key_id: str [Default: '']
#    aws_secret_access_key: str [Default: '']
#    base_model: str [Default: 'meta-llama/Llama-3.1-70B-Instruct']
#    batch_size: int [Default: 2.0]
#    chunk_overlap: int [Default: 64.0]
#    chunk_size: int [Default: 512.0]
#    gitea_owner: str [Default: 'daviestechlabs']
#    gitea_password: str [Default: '']
#    gitea_repo: str [Default: 'qlora-adapters']
#    gitea_url: str [Default: 'http://gitea-http.gitea.svc.cluster.local:3000']
#    gitea_username: str [Default: '']
#    gradient_accumulation_steps: int [Default: 8.0]
#    learning_rate: float [Default: 0.0002]
#    lora_alpha: int [Default: 16.0]
#    lora_dropout: float [Default: 0.05]
#    lora_r: int [Default: 64.0]
#    max_seq_length: int [Default: 2048.0]
#    mlflow_tracking_uri: str [Default: 'http://mlflow.mlflow.svc.cluster.local:80']
#    num_epochs: int [Default: 3.0]
#    s3_bucket: str [Default: 'training-data']
#    s3_endpoint: str [Default: 'candlekeep.lab.daviestechlabs.io']
#    s3_prefix: str [Default: '']
#
# NOTE(review): this is a compiled KFP v2 IR artifact (schemaVersion 2.1.0). The
# authoritative source is the Python DSL it was compiled from; prefer editing that
# and recompiling over hand-editing this file.
components:
  comp-evaluate-adapter:
    executorLabel: exec-evaluate-adapter
    inputDefinitions:
      parameters:
        adapter_path:
          parameterType: STRING
        base_model:
          parameterType: STRING
    outputDefinitions:
      parameters:
        passed:
          parameterType: BOOLEAN
        report:
          parameterType: STRING
  comp-fetch-pdfs-from-s3:
    executorLabel: exec-fetch-pdfs-from-s3
    inputDefinitions:
      parameters:
        aws_access_key_id:
          parameterType: STRING
        aws_secret_access_key:
          parameterType: STRING
        s3_bucket:
          parameterType: STRING
        s3_endpoint:
          parameterType: STRING
        s3_prefix:
          parameterType: STRING
    outputDefinitions:
      parameters:
        num_files:
          parameterType: NUMBER_INTEGER
        pdf_dir:
          parameterType: STRING
  comp-log-training-metrics:
    executorLabel: exec-log-training-metrics
    inputDefinitions:
      parameters:
        base_model:
          parameterType: STRING
        eval_loss:
          parameterType: NUMBER_DOUBLE
        experiment_name:
          defaultValue: qlora-pdf-training
          isOptional: true
          parameterType: STRING
        learning_rate:
          parameterType: NUMBER_DOUBLE
        lora_alpha:
          parameterType: NUMBER_INTEGER
        lora_r:
          parameterType: NUMBER_INTEGER
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        num_epochs:
          parameterType: NUMBER_INTEGER
        num_pdfs:
          parameterType: NUMBER_INTEGER
        num_train:
          parameterType: NUMBER_INTEGER
        num_val:
          parameterType: NUMBER_INTEGER
        repo_url:
          parameterType: STRING
        train_loss:
          parameterType: NUMBER_DOUBLE
  comp-prepare-training-data:
    executorLabel: exec-prepare-training-data
    inputDefinitions:
      parameters:
        chunk_overlap:
          defaultValue: 64.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        chunk_size:
          defaultValue: 512.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        max_seq_length:
          defaultValue: 2048.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        pdf_dir:
          parameterType: STRING
    outputDefinitions:
      parameters:
        dataset_path:
          parameterType: STRING
        num_train:
          parameterType: NUMBER_INTEGER
        num_val:
          parameterType: NUMBER_INTEGER
  comp-push-adapter-to-gitea:
    executorLabel: exec-push-adapter-to-gitea
    inputDefinitions:
      parameters:
        adapter_path:
          parameterType: STRING
        branch:
          defaultValue: main
          isOptional: true
          parameterType: STRING
        commit_message:
          defaultValue: 'feat: add QLoRA adapter from PDF training pipeline'
          isOptional: true
          parameterType: STRING
        gitea_owner:
          parameterType: STRING
        gitea_password:
          parameterType: STRING
        gitea_repo:
          parameterType: STRING
        gitea_url:
          parameterType: STRING
        gitea_username:
          parameterType: STRING
    outputDefinitions:
      parameters:
        files_pushed:
          parameterType: NUMBER_INTEGER
        repo_url:
          parameterType: STRING
  comp-train-qlora:
    executorLabel: exec-train-qlora
    inputDefinitions:
      parameters:
        base_model:
          parameterType: STRING
        batch_size:
          defaultValue: 2.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        dataset_path:
          parameterType: STRING
        gradient_accumulation_steps:
          defaultValue: 8.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        learning_rate:
          defaultValue: 0.0002
          isOptional: true
          parameterType: NUMBER_DOUBLE
        lora_alpha:
          defaultValue: 16.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        lora_dropout:
          defaultValue: 0.05
          isOptional: true
          parameterType: NUMBER_DOUBLE
        lora_r:
          defaultValue: 64.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        max_seq_length:
          defaultValue: 2048.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        num_epochs:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        adapter_path:
          parameterType: STRING
        eval_loss:
          parameterType: NUMBER_DOUBLE
        train_loss:
          parameterType: NUMBER_DOUBLE
deploymentSpec:
  executors:
    exec-evaluate-adapter:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - evaluate_adapter
        command:
          - sh
          - -c
          # NOTE(review): torch/transformers/peft/bitsandbytes are unpinned here —
          # resolved versions can drift between runs; consider pinning as for kfp.
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'torch' 'transformers' 'peft' 'bitsandbytes' 'accelerate' 'scipy' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"

            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def evaluate_adapter(
                adapter_path: str,
                base_model: str,
            ) -> NamedTuple("EvalOutput", [("report", str), ("passed", bool)]):
                """Load the QLoRA adapter and run a few sanity-check prompts."""
                import torch
                from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
                from peft import PeftModel

                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                )

                print(f"Loading base model {base_model} …")
                model = AutoModelForCausalLM.from_pretrained(
                    base_model,
                    quantization_config=bnb_config,
                    device_map="auto",
                    trust_remote_code=True,
                    torch_dtype=torch.bfloat16,
                )
                tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

                print(f"Loading adapter from {adapter_path} …")
                model = PeftModel.from_pretrained(model, adapter_path)
                model.eval()

                test_prompts = [
                    "Summarise the key points from the training material.",
                    "What are the main topics covered in the source documents?",
                    "Explain the most important concept from the training data.",
                ]

                lines = []
                for prompt in test_prompts:
                    messages = [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt},
                    ]
                    input_text = tokenizer.apply_chat_template(
                        messages, tokenize=False, add_generation_prompt=True
                    )
                    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
                    with torch.no_grad():
                        out = model.generate(**inputs, max_new_tokens=128, temperature=0.7, do_sample=True)
                    response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
                    lines.append(f"Q: {prompt}\nA: {response}\n")
                    print(lines[-1])

                report = "\n".join(lines)
                # Simple heuristic: did the model produce non-empty responses?
                passed = all(len(l.split("A:")[1].strip()) > 10 for l in lines)
                print(f"Evaluation passed: {passed}")

                from collections import namedtuple

                return namedtuple("EvalOutput", ["report", "passed"])(
                    report=report, passed=passed
                )
        image: python:3.13-slim
        resources:
          accelerator:
            resourceCount: '1'
            resourceType: gpu
    exec-fetch-pdfs-from-s3:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - fetch_pdfs_from_s3
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'boto3' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"

            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def fetch_pdfs_from_s3(
                s3_endpoint: str,
                s3_bucket: str,
                s3_prefix: str,
                aws_access_key_id: str,
                aws_secret_access_key: str,
            ) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]):
                """Download all PDFs from a Quobjects S3 bucket."""
                import os
                import boto3
                from botocore.client import Config

                out_dir = "/tmp/pdfs"
                os.makedirs(out_dir, exist_ok=True)

                client = boto3.client(
                    "s3",
                    endpoint_url=f"http://{s3_endpoint}",
                    aws_access_key_id=aws_access_key_id,
                    aws_secret_access_key=aws_secret_access_key,
                    region_name="us-east-1",
                    config=Config(signature_version="s3v4"),
                )

                paginator = client.get_paginator("list_objects_v2")
                count = 0
                for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix):
                    for obj in page.get("Contents", []):
                        key = obj["Key"]
                        if key.lower().endswith(".pdf"):
                            local_path = os.path.join(out_dir, os.path.basename(key))
                            print(f"Downloading: {key} → {local_path}")
                            client.download_file(s3_bucket, key, local_path)
                            count += 1

                print(f"Downloaded {count} PDFs to {out_dir}")
                from collections import namedtuple

                return namedtuple("PDFOutput", ["pdf_dir", "num_files"])(
                    pdf_dir=out_dir, num_files=count
                )
        image: python:3.13-slim
    exec-log-training-metrics:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - log_training_metrics
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'mlflow==2.22.0' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"

            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def log_training_metrics(
                base_model: str,
                train_loss: float,
                eval_loss: float,
                num_train: int,
                num_val: int,
                num_pdfs: int,
                lora_r: int,
                lora_alpha: int,
                learning_rate: float,
                num_epochs: int,
                repo_url: str,
                mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
                experiment_name: str = "qlora-pdf-training",
            ):
                """Log the full training run to MLflow."""
                import mlflow

                mlflow.set_tracking_uri(mlflow_tracking_uri)
                mlflow.set_experiment(experiment_name)

                with mlflow.start_run(run_name=f"qlora-{base_model.split('/')[-1]}"):
                    mlflow.log_params(
                        {
                            "base_model": base_model,
                            "lora_r": lora_r,
                            "lora_alpha": lora_alpha,
                            "learning_rate": learning_rate,
                            "num_epochs": num_epochs,
                            "num_pdfs": num_pdfs,
                            "data_source": "quobjects/training-data",
                        }
                    )
                    mlflow.log_metrics(
                        {
                            "train_loss": train_loss,
                            "eval_loss": eval_loss,
                            "train_samples": float(num_train),
                            "val_samples": float(num_val),
                        }
                    )
                    mlflow.set_tag("adapter_repo", repo_url)
        image: python:3.13-slim
    exec-prepare-training-data:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - prepare_training_data
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'pymupdf' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"

            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def prepare_training_data(
                pdf_dir: str,
                max_seq_length: int = 2048,
                chunk_size: int = 512,
                chunk_overlap: int = 64,
            ) -> NamedTuple("DataOutput", [("dataset_path", str), ("num_train", int), ("num_val", int)]):
                """Extract text from PDFs, chunk it, and format as instruction-tuning pairs."""
                import json
                import os
                import fitz  # PyMuPDF

                out_dir = "/tmp/training_data"
                os.makedirs(out_dir, exist_ok=True)

                # 1. Extract text from all PDFs
                all_chunks: list[dict] = []
                for fname in sorted(os.listdir(pdf_dir)):
                    if not fname.lower().endswith(".pdf"):
                        continue
                    path = os.path.join(pdf_dir, fname)
                    print(f"Extracting: {fname}")
                    try:
                        doc = fitz.open(path)
                        full_text = ""
                        for page in doc:
                            full_text += page.get_text() + "\n"
                        doc.close()
                    except Exception as e:
                        print(f"  SKIP ({e})")
                        continue

                    # 2. Chunk text with overlap
                    # NOTE(review): step is chunk_size - chunk_overlap; range() raises
                    # ValueError if chunk_overlap >= chunk_size — validate in the DSL.
                    words = full_text.split()
                    for i in range(0, len(words), chunk_size - chunk_overlap):
                        chunk_words = words[i : i + chunk_size]
                        if len(chunk_words) < 50:
                            continue  # skip tiny trailing chunks
                        chunk_text = " ".join(chunk_words)
                        all_chunks.append({"text": chunk_text, "source": fname})

                print(f"Total chunks: {len(all_chunks)}")
                if not all_chunks:
                    raise ValueError("No text extracted from PDFs — check your bucket")

                # 3. Format as Llama 3 chat training pairs
                # We create self-supervised pairs: model learns to continue/explain the content
                samples = []
                for chunk in all_chunks:
                    text = chunk["text"]
                    source = chunk["source"]
                    # Split chunk roughly in half for input/output
                    words = text.split()
                    mid = len(words) // 2
                    context = " ".join(words[:mid])
                    continuation = " ".join(words[mid:])

                    samples.append(
                        {
                            "messages": [
                                {
                                    "role": "system",
                                    "content": (
                                        "You are a knowledgeable assistant. "
                                        "Continue the information accurately and coherently."
                                    ),
                                },
                                {
                                    "role": "user",
                                    "content": f"Continue the following passage from {source}:\n\n{context}",
                                },
                                {"role": "assistant", "content": continuation},
                            ]
                        }
                    )

                # 4. Train/val split (90/10)
                import random

                random.seed(42)
                random.shuffle(samples)
                split = int(len(samples) * 0.9)
                train = samples[:split]
                val = samples[split:]

                train_path = os.path.join(out_dir, "train.json")
                val_path = os.path.join(out_dir, "val.json")
                with open(train_path, "w") as f:
                    json.dump(train, f)
                with open(val_path, "w") as f:
                    json.dump(val, f)

                print(f"Train: {len(train)} samples, Val: {len(val)} samples")
                from collections import namedtuple

                return namedtuple("DataOutput", ["dataset_path", "num_train", "num_val"])(
                    dataset_path=out_dir, num_train=len(train), num_val=len(val)
                )
        image: python:3.13-slim
    exec-push-adapter-to-gitea:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - push_adapter_to_gitea
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'requests' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"

            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def push_adapter_to_gitea(
                adapter_path: str,
                gitea_url: str,
                gitea_owner: str,
                gitea_repo: str,
                gitea_username: str,
                gitea_password: str,
                branch: str = "main",
                commit_message: str = "feat: add QLoRA adapter from PDF training pipeline",
            ) -> NamedTuple("PushOutput", [("repo_url", str), ("files_pushed", int)]):
                """Push the QLoRA adapter files to a Gitea repository via the API."""
                import base64
                import json
                import os
                import requests

                api_base = f"{gitea_url}/api/v1"
                auth = (gitea_username, gitea_password)
                repo_api = f"{api_base}/repos/{gitea_owner}/{gitea_repo}"

                # Check if repo exists, create if not
                resp = requests.get(repo_api, auth=auth, timeout=30)
                if resp.status_code == 404:
                    print(f"Creating repo {gitea_owner}/{gitea_repo} …")
                    create_resp = requests.post(
                        f"{api_base}/orgs/{gitea_owner}/repos"
                        if gitea_owner != gitea_username
                        else f"{api_base}/user/repos",
                        auth=auth,
                        json={
                            "name": gitea_repo,
                            "description": "QLoRA adapters trained from PDF documents",
                            "private": False,
                            "auto_init": True,
                        },
                        timeout=30,
                    )
                    create_resp.raise_for_status()
                    print(f"Created: {create_resp.json().get('html_url')}")

                # Collect all adapter files
                files_to_push = []
                for root, dirs, files in os.walk(adapter_path):
                    for fname in files:
                        fpath = os.path.join(root, fname)
                        rel_path = os.path.relpath(fpath, adapter_path)
                        with open(fpath, "rb") as f:
                            content = base64.b64encode(f.read()).decode("utf-8")
                        files_to_push.append({"path": rel_path, "content": content})

                print(f"Pushing {len(files_to_push)} files to {gitea_owner}/{gitea_repo}")

                # Push each file via Gitea contents API
                pushed = 0
                for item in files_to_push:
                    file_api = f"{repo_api}/contents/{item['path']}"

                    # Check if file already exists (need SHA for update)
                    existing = requests.get(file_api, auth=auth, params={"ref": branch}, timeout=30)
                    payload = {
                        "message": commit_message,
                        "content": item["content"],
                        "branch": branch,
                    }
                    if existing.status_code == 200:
                        payload["sha"] = existing.json()["sha"]
                        resp = requests.put(file_api, auth=auth, json=payload, timeout=60)
                    else:
                        resp = requests.post(file_api, auth=auth, json=payload, timeout=60)

                    if resp.status_code in (200, 201):
                        pushed += 1
                        print(f"  ✓ {item['path']}")
                    else:
                        print(f"  ✗ {item['path']}: {resp.status_code} {resp.text[:200]}")

                repo_url = f"{gitea_url}/{gitea_owner}/{gitea_repo}"
                print(f"Pushed {pushed}/{len(files_to_push)} files to {repo_url}")

                from collections import namedtuple

                return namedtuple("PushOutput", ["repo_url", "files_pushed"])(
                    repo_url=repo_url, files_pushed=pushed
                )
        image: python:3.13-slim
    exec-train-qlora:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - train_qlora
        command:
          - sh
          - -c
          # NOTE(review): 'trl' is unpinned, but the code below uses the legacy
          # SFTTrainer(tokenizer=…, max_seq_length=…, dataset_text_field=…, packing=…)
          # constructor; TRL >= 0.12 moved these to SFTConfig. Pin a compatible trl
          # version (and transformers) in the DSL to keep this executor reproducible.
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'torch' 'transformers' 'peft' 'datasets' 'accelerate' 'bitsandbytes' 'scipy' 'trl' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"

            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def train_qlora(
                dataset_path: str,
                base_model: str,
                learning_rate: float = 2e-4,
                num_epochs: int = 3,
                batch_size: int = 2,
                gradient_accumulation_steps: int = 8,
                max_seq_length: int = 2048,
                lora_r: int = 64,
                lora_alpha: int = 16,
                lora_dropout: float = 0.05,
            ) -> NamedTuple(
                "TrainOutput",
                [("adapter_path", str), ("train_loss", float), ("eval_loss", float)],
            ):
                """QLoRA fine-tune Llama 3.1 70B with 4-bit NF4 quantization."""
                import json
                import os

                import torch
                from datasets import Dataset
                from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
                from transformers import (
                    AutoModelForCausalLM,
                    AutoTokenizer,
                    BitsAndBytesConfig,
                    TrainingArguments,
                )
                from trl import SFTTrainer

                output_dir = "/tmp/qlora_output"
                os.makedirs(output_dir, exist_ok=True)

                # ── Load data ───────────────────────────────────────────
                with open(os.path.join(dataset_path, "train.json")) as f:
                    train_data = json.load(f)
                with open(os.path.join(dataset_path, "val.json")) as f:
                    val_data = json.load(f)

                print(f"Loaded {len(train_data)} train / {len(val_data)} val samples")

                # ── Tokenizer ───────────────────────────────────────────
                print(f"Loading tokenizer: {base_model}")
                tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
                if tokenizer.pad_token is None:
                    tokenizer.pad_token = tokenizer.eos_token
                tokenizer.padding_side = "right"

                # ── Format with chat template ───────────────────────────
                def format_chat(sample):
                    return {"text": tokenizer.apply_chat_template(
                        sample["messages"], tokenize=False, add_generation_prompt=False
                    )}

                train_ds = Dataset.from_list(train_data).map(format_chat)
                val_ds = Dataset.from_list(val_data).map(format_chat)

                # ── 4-bit quantisation ──────────────────────────────────
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                )

                print(f"Loading model: {base_model} (4-bit NF4)")
                model = AutoModelForCausalLM.from_pretrained(
                    base_model,
                    quantization_config=bnb_config,
                    device_map="auto",
                    trust_remote_code=True,
                    torch_dtype=torch.bfloat16,
                )
                model = prepare_model_for_kbit_training(model)

                # ── LoRA config ─────────────────────────────────────────
                lora_config = LoraConfig(
                    r=lora_r,
                    lora_alpha=lora_alpha,
                    target_modules=[
                        "q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",
                    ],
                    lora_dropout=lora_dropout,
                    bias="none",
                    task_type="CAUSAL_LM",
                )

                model = get_peft_model(model, lora_config)
                model.print_trainable_parameters()

                # ── Training args ───────────────────────────────────────
                training_args = TrainingArguments(
                    output_dir=os.path.join(output_dir, "checkpoints"),
                    num_train_epochs=num_epochs,
                    per_device_train_batch_size=batch_size,
                    per_device_eval_batch_size=batch_size,
                    gradient_accumulation_steps=gradient_accumulation_steps,
                    learning_rate=learning_rate,
                    bf16=True,
                    logging_steps=5,
                    eval_strategy="steps",
                    eval_steps=50,
                    save_strategy="steps",
                    save_steps=100,
                    save_total_limit=2,
                    load_best_model_at_end=True,
                    metric_for_best_model="eval_loss",
                    report_to="none",
                    warmup_ratio=0.03,
                    lr_scheduler_type="cosine",
                    optim="paged_adamw_8bit",
                    max_grad_norm=0.3,
                    group_by_length=True,
                )

                # ── SFTTrainer ──────────────────────────────────────────
                trainer = SFTTrainer(
                    model=model,
                    args=training_args,
                    train_dataset=train_ds,
                    eval_dataset=val_ds,
                    tokenizer=tokenizer,
                    max_seq_length=max_seq_length,
                    dataset_text_field="text",
                    packing=True,  # pack short samples for efficiency
                )

                print("Starting QLoRA training …")
                result = trainer.train()
                train_loss = result.training_loss

                eval_result = trainer.evaluate()
                eval_loss = eval_result.get("eval_loss", 0.0)

                print(f"Train loss: {train_loss:.4f}, Eval loss: {eval_loss:.4f}")

                # ── Save adapter ────────────────────────────────────────
                adapter_path = os.path.join(output_dir, "adapter")
                model.save_pretrained(adapter_path)
                tokenizer.save_pretrained(adapter_path)

                metadata = {
                    "base_model": base_model,
                    "lora_r": lora_r,
                    "lora_alpha": lora_alpha,
                    "lora_dropout": lora_dropout,
                    "learning_rate": learning_rate,
                    "num_epochs": num_epochs,
                    "batch_size": batch_size,
                    "gradient_accumulation_steps": gradient_accumulation_steps,
                    "max_seq_length": max_seq_length,
                    "train_samples": len(train_data),
                    "val_samples": len(val_data),
                    "train_loss": train_loss,
                    "eval_loss": eval_loss,
                }
                with open(os.path.join(adapter_path, "training_metadata.json"), "w") as f:
                    json.dump(metadata, f, indent=2)

                print(f"Adapter saved to {adapter_path}")

                from collections import namedtuple

                return namedtuple("TrainOutput", ["adapter_path", "train_loss", "eval_loss"])(
                    adapter_path=adapter_path,
                    train_loss=train_loss,
                    eval_loss=eval_loss,
                )
        image: python:3.13-slim
        resources:
          accelerator:
            resourceCount: '1'
            resourceType: gpu
pipelineInfo:
  description: Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects training-data
    bucket. Pushes the adapter to Gitea and logs metrics to MLflow.
  name: qlora-pdf-fine-tuning
root:
  dag:
    tasks:
      evaluate-adapter:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-evaluate-adapter
        dependentTasks:
          - train-qlora
        inputs:
          parameters:
            adapter_path:
              taskOutputParameter:
                outputParameterKey: adapter_path
                producerTask: train-qlora
            base_model:
              componentInputParameter: base_model
        taskInfo:
          name: evaluate-adapter
      fetch-pdfs-from-s3:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-fetch-pdfs-from-s3
        inputs:
          parameters:
            aws_access_key_id:
              componentInputParameter: aws_access_key_id
            aws_secret_access_key:
              componentInputParameter: aws_secret_access_key
            s3_bucket:
              componentInputParameter: s3_bucket
            s3_endpoint:
              componentInputParameter: s3_endpoint
            s3_prefix:
              componentInputParameter: s3_prefix
        taskInfo:
          name: fetch-pdfs-from-s3
      log-training-metrics:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-log-training-metrics
        dependentTasks:
          - fetch-pdfs-from-s3
          - prepare-training-data
          - push-adapter-to-gitea
          - train-qlora
        inputs:
          parameters:
            base_model:
              componentInputParameter: base_model
            eval_loss:
              taskOutputParameter:
                outputParameterKey: eval_loss
                producerTask: train-qlora
            learning_rate:
              componentInputParameter: learning_rate
            lora_alpha:
              componentInputParameter: lora_alpha
            lora_r:
              componentInputParameter: lora_r
            mlflow_tracking_uri:
              componentInputParameter: mlflow_tracking_uri
            num_epochs:
              componentInputParameter: num_epochs
            num_pdfs:
              taskOutputParameter:
                outputParameterKey: num_files
                producerTask: fetch-pdfs-from-s3
            num_train:
              taskOutputParameter:
                outputParameterKey: num_train
                producerTask: prepare-training-data
            num_val:
              taskOutputParameter:
                outputParameterKey: num_val
                producerTask: prepare-training-data
            repo_url:
              taskOutputParameter:
                outputParameterKey: repo_url
                producerTask: push-adapter-to-gitea
            train_loss:
              taskOutputParameter:
                outputParameterKey: train_loss
                producerTask: train-qlora
        taskInfo:
          name: log-training-metrics
      prepare-training-data:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-prepare-training-data
        dependentTasks:
          - fetch-pdfs-from-s3
        inputs:
          parameters:
            chunk_overlap:
              componentInputParameter: chunk_overlap
            chunk_size:
              componentInputParameter: chunk_size
            max_seq_length:
              componentInputParameter: max_seq_length
            pdf_dir:
              taskOutputParameter:
                outputParameterKey: pdf_dir
                producerTask: fetch-pdfs-from-s3
        taskInfo:
          name: prepare-training-data
      push-adapter-to-gitea:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-push-adapter-to-gitea
        dependentTasks:
          - train-qlora
        inputs:
          parameters:
            adapter_path:
              taskOutputParameter:
                outputParameterKey: adapter_path
                producerTask: train-qlora
            gitea_owner:
              componentInputParameter: gitea_owner
            gitea_password:
              componentInputParameter: gitea_password
            gitea_repo:
              componentInputParameter: gitea_repo
            gitea_url:
              componentInputParameter: gitea_url
            gitea_username:
              componentInputParameter: gitea_username
        taskInfo:
          name: push-adapter-to-gitea
      train-qlora:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-train-qlora
        dependentTasks:
          - prepare-training-data
        inputs:
          parameters:
            base_model:
              componentInputParameter: base_model
            batch_size:
              componentInputParameter: batch_size
            dataset_path:
              taskOutputParameter:
                outputParameterKey: dataset_path
                producerTask: prepare-training-data
            gradient_accumulation_steps:
              componentInputParameter: gradient_accumulation_steps
            learning_rate:
              componentInputParameter: learning_rate
            lora_alpha:
              componentInputParameter: lora_alpha
            lora_dropout:
              componentInputParameter: lora_dropout
            lora_r:
              componentInputParameter: lora_r
            max_seq_length:
              componentInputParameter: max_seq_length
            num_epochs:
              componentInputParameter: num_epochs
        taskInfo:
          name: train-qlora
  inputDefinitions:
    parameters:
      aws_access_key_id:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
      aws_secret_access_key:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
      base_model:
        defaultValue: meta-llama/Llama-3.1-70B-Instruct
        isOptional: true
        parameterType: STRING
      batch_size:
        defaultValue: 2.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      chunk_overlap:
        defaultValue: 64.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      chunk_size:
        defaultValue: 512.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      gitea_owner:
        defaultValue: daviestechlabs
        isOptional: true
        parameterType: STRING
      gitea_password:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
      gitea_repo:
        defaultValue: qlora-adapters
        isOptional: true
        parameterType: STRING
      gitea_url:
        defaultValue: http://gitea-http.gitea.svc.cluster.local:3000
        isOptional: true
        parameterType: STRING
      gitea_username:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
      gradient_accumulation_steps:
        defaultValue: 8.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      learning_rate:
        defaultValue: 0.0002
        isOptional: true
        parameterType: NUMBER_DOUBLE
      lora_alpha:
        defaultValue: 16.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      lora_dropout:
        defaultValue: 0.05
        isOptional: true
        parameterType: NUMBER_DOUBLE
      lora_r:
        defaultValue: 64.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      max_seq_length:
        defaultValue: 2048.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      mlflow_tracking_uri:
        defaultValue: http://mlflow.mlflow.svc.cluster.local:80
        isOptional: true
        parameterType: STRING
      num_epochs:
        defaultValue: 3.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      s3_bucket:
        defaultValue: training-data
        isOptional: true
        parameterType: STRING
      s3_endpoint:
        defaultValue: candlekeep.lab.daviestechlabs.io
        isOptional: true
        parameterType: STRING
      s3_prefix:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1