kubeflow/qlora_pdf_pipeline.yaml

# PIPELINE DEFINITION
# Name: qlora-pdf-fine-tuning
# Description: Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects training-data bucket. Pushes the adapter to Gitea and logs metrics to MLflow.
# Inputs:
#    aws_access_key_id: str [Default: '']
#    aws_secret_access_key: str [Default: '']
#    base_model: str [Default: 'meta-llama/Llama-3.1-70B-Instruct']
#    batch_size: int [Default: 2.0]
#    chunk_overlap: int [Default: 64.0]
#    chunk_size: int [Default: 512.0]
#    gitea_owner: str [Default: 'daviestechlabs']
#    gitea_password: str [Default: '']
#    gitea_repo: str [Default: 'qlora-adapters']
#    gitea_url: str [Default: 'http://gitea-http.gitea.svc.cluster.local:3000']
#    gitea_username: str [Default: '']
#    gradient_accumulation_steps: int [Default: 8.0]
#    learning_rate: float [Default: 0.0002]
#    lora_alpha: int [Default: 16.0]
#    lora_dropout: float [Default: 0.05]
#    lora_r: int [Default: 64.0]
#    max_seq_length: int [Default: 2048.0]
#    mlflow_tracking_uri: str [Default: 'http://mlflow.mlflow.svc.cluster.local:80']
#    num_epochs: int [Default: 3.0]
#    s3_bucket: str [Default: 'training-data']
#    s3_endpoint: str [Default: 'candlekeep.lab.daviestechlabs.io']
#    s3_prefix: str [Default: '']
components:
  comp-evaluate-adapter:
    executorLabel: exec-evaluate-adapter
    inputDefinitions:
      parameters:
        adapter_path:
          parameterType: STRING
        base_model:
          parameterType: STRING
    outputDefinitions:
      parameters:
        passed:
          parameterType: BOOLEAN
        report:
          parameterType: STRING
  comp-fetch-pdfs-from-s3:
    executorLabel: exec-fetch-pdfs-from-s3
    inputDefinitions:
      parameters:
        aws_access_key_id:
          parameterType: STRING
        aws_secret_access_key:
          parameterType: STRING
        s3_bucket:
          parameterType: STRING
        s3_endpoint:
          parameterType: STRING
        s3_prefix:
          parameterType: STRING
    outputDefinitions:
      parameters:
        num_files:
          parameterType: NUMBER_INTEGER
        pdf_dir:
          parameterType: STRING
  comp-log-training-metrics:
    executorLabel: exec-log-training-metrics
    inputDefinitions:
      parameters:
        base_model:
          parameterType: STRING
        eval_loss:
          parameterType: NUMBER_DOUBLE
        experiment_name:
          defaultValue: qlora-pdf-training
          isOptional: true
          parameterType: STRING
        learning_rate:
          parameterType: NUMBER_DOUBLE
        lora_alpha:
          parameterType: NUMBER_INTEGER
        lora_r:
          parameterType: NUMBER_INTEGER
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        num_epochs:
          parameterType: NUMBER_INTEGER
        num_pdfs:
          parameterType: NUMBER_INTEGER
        num_train:
          parameterType: NUMBER_INTEGER
        num_val:
          parameterType: NUMBER_INTEGER
        repo_url:
          parameterType: STRING
        train_loss:
          parameterType: NUMBER_DOUBLE
  comp-prepare-training-data:
    executorLabel: exec-prepare-training-data
    inputDefinitions:
      parameters:
        chunk_overlap:
          defaultValue: 64.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        chunk_size:
          defaultValue: 512.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        max_seq_length:
          defaultValue: 2048.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        pdf_dir:
          parameterType: STRING
    outputDefinitions:
      parameters:
        dataset_path:
          parameterType: STRING
        num_train:
          parameterType: NUMBER_INTEGER
        num_val:
          parameterType: NUMBER_INTEGER
  comp-push-adapter-to-gitea:
    executorLabel: exec-push-adapter-to-gitea
    inputDefinitions:
      parameters:
        adapter_path:
          parameterType: STRING
        branch:
          defaultValue: main
          isOptional: true
          parameterType: STRING
        commit_message:
          defaultValue: 'feat: add QLoRA adapter from PDF training pipeline'
          isOptional: true
          parameterType: STRING
        gitea_owner:
          parameterType: STRING
        gitea_password:
          parameterType: STRING
        gitea_repo:
          parameterType: STRING
        gitea_url:
          parameterType: STRING
        gitea_username:
          parameterType: STRING
    outputDefinitions:
      parameters:
        files_pushed:
          parameterType: NUMBER_INTEGER
        repo_url:
          parameterType: STRING
  comp-train-qlora:
    executorLabel: exec-train-qlora
    inputDefinitions:
      parameters:
        base_model:
          parameterType: STRING
        batch_size:
          defaultValue: 2.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        dataset_path:
          parameterType: STRING
        gradient_accumulation_steps:
          defaultValue: 8.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        learning_rate:
          defaultValue: 0.0002
          isOptional: true
          parameterType: NUMBER_DOUBLE
        lora_alpha:
          defaultValue: 16.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        lora_dropout:
          defaultValue: 0.05
          isOptional: true
          parameterType: NUMBER_DOUBLE
        lora_r:
          defaultValue: 64.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        max_seq_length:
          defaultValue: 2048.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        num_epochs:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        adapter_path:
          parameterType: STRING
        eval_loss:
          parameterType: NUMBER_DOUBLE
        train_loss:
          parameterType: NUMBER_DOUBLE
deploymentSpec:
  executors:
    exec-evaluate-adapter:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - evaluate_adapter
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'torch' 'transformers'\
          \ 'peft' 'bitsandbytes' 'accelerate' 'scipy' && \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef evaluate_adapter(\n    adapter_path: str,\n    base_model: str,\n\
          ) -> NamedTuple(\"EvalOutput\", [(\"report\", str), (\"passed\", bool)]):\n\
          \    \"\"\"Load the QLoRA adapter and run a few sanity-check prompts.\"\"\
          \"\n    import torch\n    from transformers import AutoModelForCausalLM,\
          \ AutoTokenizer, BitsAndBytesConfig\n    from peft import PeftModel\n\n\
          \    bnb_config = BitsAndBytesConfig(\n        load_in_4bit=True,\n    \
          \    bnb_4bit_quant_type=\"nf4\",\n        bnb_4bit_compute_dtype=torch.bfloat16,\n\
          \        bnb_4bit_use_double_quant=True,\n    )\n\n    print(f\"Loading\
          \ base model {base_model} \u2026\")\n    model = AutoModelForCausalLM.from_pretrained(\n\
          \        base_model,\n        quantization_config=bnb_config,\n        device_map=\"\
          auto\",\n        trust_remote_code=True,\n        torch_dtype=torch.bfloat16,\n\
          \    )\n    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)\n\
          \n    print(f\"Loading adapter from {adapter_path} \u2026\")\n    model\
          \ = PeftModel.from_pretrained(model, adapter_path)\n    model.eval()\n\n\
          \    test_prompts = [\n        \"Summarise the key points from the training\
          \ material.\",\n        \"What are the main topics covered in the source\
          \ documents?\",\n        \"Explain the most important concept from the training\
          \ data.\",\n    ]\n\n    lines = []\n    for prompt in test_prompts:\n \
          \       messages = [\n            {\"role\": \"system\", \"content\": \"\
          You are a helpful assistant.\"},\n            {\"role\": \"user\", \"content\"\
          : prompt},\n        ]\n        input_text = tokenizer.apply_chat_template(\n\
          \            messages, tokenize=False, add_generation_prompt=True\n    \
          \    )\n        inputs = tokenizer(input_text, return_tensors=\"pt\").to(model.device)\n\
          \        with torch.no_grad():\n            out = model.generate(**inputs,\
          \ max_new_tokens=128, temperature=0.7, do_sample=True)\n        response\
          \ = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n\
          \        lines.append(f\"Q: {prompt}\\nA: {response}\\n\")\n        print(lines[-1])\n\
          \n    report = \"\\n\".join(lines)\n    # Simple heuristic: did the model\
          \ produce non-empty responses?\n    passed = all(len(l.split(\"A:\")[1].strip())\
          \ > 10 for l in lines)\n    print(f\"Evaluation passed: {passed}\")\n\n\
          \    from collections import namedtuple\n\n    return namedtuple(\"EvalOutput\"\
          , [\"report\", \"passed\"])(\n        report=report, passed=passed\n   \
          \ )\n\n"
        image: python:3.13-slim
        resources:
          accelerator:
            resourceCount: '1'
            resourceType: gpu
    exec-fetch-pdfs-from-s3:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - fetch_pdfs_from_s3
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'boto3' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef fetch_pdfs_from_s3(\n    s3_endpoint: str,\n    s3_bucket: str,\n\
          \    s3_prefix: str,\n    aws_access_key_id: str,\n    aws_secret_access_key:\
          \ str,\n) -> NamedTuple(\"PDFOutput\", [(\"pdf_dir\", str), (\"num_files\"\
          , int)]):\n    \"\"\"Download all PDFs from a Quobjects S3 bucket.\"\"\"\
          \n    import os\n    import boto3\n    from botocore.client import Config\n\
          \n    out_dir = \"/tmp/pdfs\"\n    os.makedirs(out_dir, exist_ok=True)\n\
          \n    client = boto3.client(\n        \"s3\",\n        endpoint_url=f\"\
          http://{s3_endpoint}\",\n        aws_access_key_id=aws_access_key_id,\n\
          \        aws_secret_access_key=aws_secret_access_key,\n        region_name=\"\
          us-east-1\",\n        config=Config(signature_version=\"s3v4\"),\n    )\n\
          \n    paginator = client.get_paginator(\"list_objects_v2\")\n    count =\
          \ 0\n    for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix):\n\
          \        for obj in page.get(\"Contents\", []):\n            key = obj[\"\
          Key\"]\n            if key.lower().endswith(\".pdf\"):\n               \
          \ local_path = os.path.join(out_dir, os.path.basename(key))\n          \
          \      print(f\"Downloading: {key} \u2192 {local_path}\")\n            \
          \    client.download_file(s3_bucket, key, local_path)\n                count\
          \ += 1\n\n    print(f\"Downloaded {count} PDFs to {out_dir}\")\n    from\
          \ collections import namedtuple\n\n    return namedtuple(\"PDFOutput\",\
          \ [\"pdf_dir\", \"num_files\"])(\n        pdf_dir=out_dir, num_files=count\n\
          \    )\n\n"
        image: python:3.13-slim
    exec-log-training-metrics:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - log_training_metrics
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'mlflow==2.22.0'\
          \ && \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef log_training_metrics(\n    base_model: str,\n    train_loss:\
          \ float,\n    eval_loss: float,\n    num_train: int,\n    num_val: int,\n\
          \    num_pdfs: int,\n    lora_r: int,\n    lora_alpha: int,\n    learning_rate:\
          \ float,\n    num_epochs: int,\n    repo_url: str,\n    mlflow_tracking_uri:\
          \ str = \"http://mlflow.mlflow.svc.cluster.local:80\",\n    experiment_name:\
          \ str = \"qlora-pdf-training\",\n):\n    \"\"\"Log the full training run\
          \ to MLflow.\"\"\"\n    import mlflow\n\n    mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
          \    mlflow.set_experiment(experiment_name)\n\n    with mlflow.start_run(run_name=f\"\
          qlora-{base_model.split('/')[-1]}\"):\n        mlflow.log_params(\n    \
          \        {\n                \"base_model\": base_model,\n              \
          \  \"lora_r\": lora_r,\n                \"lora_alpha\": lora_alpha,\n  \
          \              \"learning_rate\": learning_rate,\n                \"num_epochs\"\
          : num_epochs,\n                \"num_pdfs\": num_pdfs,\n               \
          \ \"data_source\": \"quobjects/training-data\",\n            }\n       \
          \ )\n        mlflow.log_metrics(\n            {\n                \"train_loss\"\
          : train_loss,\n                \"eval_loss\": eval_loss,\n             \
          \   \"train_samples\": float(num_train),\n                \"val_samples\"\
          : float(num_val),\n            }\n        )\n        mlflow.set_tag(\"adapter_repo\"\
          , repo_url)\n\n"
        image: python:3.13-slim
    exec-prepare-training-data:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - prepare_training_data
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'pymupdf' &&\
          \ \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef prepare_training_data(\n    pdf_dir: str,\n    max_seq_length:\
          \ int = 2048,\n    chunk_size: int = 512,\n    chunk_overlap: int = 64,\n\
          ) -> NamedTuple(\"DataOutput\", [(\"dataset_path\", str), (\"num_train\"\
          , int), (\"num_val\", int)]):\n    \"\"\"Extract text from PDFs, chunk it,\
          \ and format as instruction-tuning pairs.\"\"\"\n    import json\n    import\
          \ os\n    import fitz  # PyMuPDF\n\n    out_dir = \"/tmp/training_data\"\
          \n    os.makedirs(out_dir, exist_ok=True)\n\n    # 1. Extract text from\
          \ all PDFs\n    all_chunks: list[dict] = []\n    for fname in sorted(os.listdir(pdf_dir)):\n\
          \        if not fname.lower().endswith(\".pdf\"):\n            continue\n\
          \        path = os.path.join(pdf_dir, fname)\n        print(f\"Extracting:\
          \ {fname}\")\n        try:\n            doc = fitz.open(path)\n        \
          \    full_text = \"\"\n            for page in doc:\n                full_text\
          \ += page.get_text() + \"\\n\"\n            doc.close()\n        except\
          \ Exception as e:\n            print(f\"  SKIP ({e})\")\n            continue\n\
          \n        # 2. Chunk text with overlap\n        words = full_text.split()\n\
          \        for i in range(0, len(words), chunk_size - chunk_overlap):\n  \
          \          chunk_words = words[i : i + chunk_size]\n            if len(chunk_words)\
          \ < 50:\n                continue  # skip tiny trailing chunks\n       \
          \     chunk_text = \" \".join(chunk_words)\n            all_chunks.append({\"\
          text\": chunk_text, \"source\": fname})\n\n    print(f\"Total chunks: {len(all_chunks)}\"\
          )\n    if not all_chunks:\n        raise ValueError(\"No text extracted\
          \ from PDFs \u2014 check your bucket\")\n\n    # 3. Format as Llama 3 chat\
          \ training pairs\n    #    We create self-supervised pairs: model learns\
          \ to continue/explain the content\n    samples = []\n    for chunk in all_chunks:\n\
          \        text = chunk[\"text\"]\n        source = chunk[\"source\"]\n  \
          \      # Split chunk roughly in half for input/output\n        words = text.split()\n\
          \        mid = len(words) // 2\n        context = \" \".join(words[:mid])\n\
          \        continuation = \" \".join(words[mid:])\n\n        samples.append(\n\
          \            {\n                \"messages\": [\n                    {\n\
          \                        \"role\": \"system\",\n                       \
          \ \"content\": (\n                            \"You are a knowledgeable\
          \ assistant. \"\n                            \"Continue the information\
          \ accurately and coherently.\"\n                        ),\n           \
          \         },\n                    {\n                        \"role\": \"\
          user\",\n                        \"content\": f\"Continue the following\
          \ passage from {source}:\\n\\n{context}\",\n                    },\n   \
          \                 {\"role\": \"assistant\", \"content\": continuation},\n\
          \                ]\n            }\n        )\n\n    # 4. Train/val split\
          \ (90/10)\n    import random\n\n    random.seed(42)\n    random.shuffle(samples)\n\
          \    split = int(len(samples) * 0.9)\n    train = samples[:split]\n    val\
          \ = samples[split:]\n\n    train_path = os.path.join(out_dir, \"train.json\"\
          )\n    val_path = os.path.join(out_dir, \"val.json\")\n    with open(train_path,\
          \ \"w\") as f:\n        json.dump(train, f)\n    with open(val_path, \"\
          w\") as f:\n        json.dump(val, f)\n\n    print(f\"Train: {len(train)}\
          \ samples, Val: {len(val)} samples\")\n    from collections import namedtuple\n\
          \n    return namedtuple(\"DataOutput\", [\"dataset_path\", \"num_train\"\
          , \"num_val\"])(\n        dataset_path=out_dir, num_train=len(train), num_val=len(val)\n\
          \    )\n\n"
        image: python:3.13-slim
    exec-push-adapter-to-gitea:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - push_adapter_to_gitea
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'requests' &&\
          \ \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef push_adapter_to_gitea(\n    adapter_path: str,\n    gitea_url:\
          \ str,\n    gitea_owner: str,\n    gitea_repo: str,\n    gitea_username:\
          \ str,\n    gitea_password: str,\n    branch: str = \"main\",\n    commit_message:\
          \ str = \"feat: add QLoRA adapter from PDF training pipeline\",\n) -> NamedTuple(\"\
          PushOutput\", [(\"repo_url\", str), (\"files_pushed\", int)]):\n    \"\"\
          \"Push the QLoRA adapter files to a Gitea repository via the API.\"\"\"\n\
          \    import base64\n    import json\n    import os\n    import requests\n\
          \n    api_base = f\"{gitea_url}/api/v1\"\n    auth = (gitea_username, gitea_password)\n\
          \    repo_api = f\"{api_base}/repos/{gitea_owner}/{gitea_repo}\"\n\n   \
          \ # Check if repo exists, create if not\n    resp = requests.get(repo_api,\
          \ auth=auth, timeout=30)\n    if resp.status_code == 404:\n        print(f\"\
          Creating repo {gitea_owner}/{gitea_repo} \u2026\")\n        create_resp\
          \ = requests.post(\n            f\"{api_base}/orgs/{gitea_owner}/repos\"\
          \n            if gitea_owner != gitea_username\n            else f\"{api_base}/user/repos\"\
          ,\n            auth=auth,\n            json={\n                \"name\"\
          : gitea_repo,\n                \"description\": \"QLoRA adapters trained\
          \ from PDF documents\",\n                \"private\": False,\n         \
          \       \"auto_init\": True,\n            },\n            timeout=30,\n\
          \        )\n        create_resp.raise_for_status()\n        print(f\"Created:\
          \ {create_resp.json().get('html_url')}\")\n\n    # Collect all adapter files\n\
          \    files_to_push = []\n    for root, dirs, files in os.walk(adapter_path):\n\
          \        for fname in files:\n            fpath = os.path.join(root, fname)\n\
          \            rel_path = os.path.relpath(fpath, adapter_path)\n         \
          \   with open(fpath, \"rb\") as f:\n                content = base64.b64encode(f.read()).decode(\"\
          utf-8\")\n            files_to_push.append({\"path\": rel_path, \"content\"\
          : content})\n\n    print(f\"Pushing {len(files_to_push)} files to {gitea_owner}/{gitea_repo}\"\
          )\n\n    # Push each file via Gitea contents API\n    pushed = 0\n    for\
          \ item in files_to_push:\n        file_api = f\"{repo_api}/contents/{item['path']}\"\
          \n\n        # Check if file already exists (need SHA for update)\n     \
          \   existing = requests.get(file_api, auth=auth, params={\"ref\": branch},\
          \ timeout=30)\n        payload = {\n            \"message\": commit_message,\n\
          \            \"content\": item[\"content\"],\n            \"branch\": branch,\n\
          \        }\n        if existing.status_code == 200:\n            payload[\"\
          sha\"] = existing.json()[\"sha\"]\n            resp = requests.put(file_api,\
          \ auth=auth, json=payload, timeout=60)\n        else:\n            resp\
          \ = requests.post(file_api, auth=auth, json=payload, timeout=60)\n\n   \
          \     if resp.status_code in (200, 201):\n            pushed += 1\n    \
          \        print(f\"  \u2713 {item['path']}\")\n        else:\n          \
          \  print(f\"  \u2717 {item['path']}: {resp.status_code} {resp.text[:200]}\"\
          )\n\n    repo_url = f\"{gitea_url}/{gitea_owner}/{gitea_repo}\"\n    print(f\"\
          Pushed {pushed}/{len(files_to_push)} files to {repo_url}\")\n\n    from\
          \ collections import namedtuple\n\n    return namedtuple(\"PushOutput\"\
          , [\"repo_url\", \"files_pushed\"])(\n        repo_url=repo_url, files_pushed=pushed\n\
          \    )\n\n"
        image: python:3.13-slim
    exec-train-qlora:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - train_qlora
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'torch' 'transformers'\
          \ 'peft' 'datasets' 'accelerate' 'bitsandbytes' 'scipy' 'trl' && \"$0\"\
          \ \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef train_qlora(\n    dataset_path: str,\n    base_model: str,\n\
          \    learning_rate: float = 2e-4,\n    num_epochs: int = 3,\n    batch_size:\
          \ int = 2,\n    gradient_accumulation_steps: int = 8,\n    max_seq_length:\
          \ int = 2048,\n    lora_r: int = 64,\n    lora_alpha: int = 16,\n    lora_dropout:\
          \ float = 0.05,\n) -> NamedTuple(\n    \"TrainOutput\",\n    [(\"adapter_path\"\
          , str), (\"train_loss\", float), (\"eval_loss\", float)],\n):\n    \"\"\"\
          QLoRA fine-tune Llama 3.1 70B with 4-bit NF4 quantization.\"\"\"\n    import\
          \ json\n    import os\n\n    import torch\n    from datasets import Dataset\n\
          \    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n\
          \    from transformers import (\n        AutoModelForCausalLM,\n       \
          \ AutoTokenizer,\n        BitsAndBytesConfig,\n        TrainingArguments,\n\
          \    )\n    from trl import SFTTrainer\n\n    output_dir = \"/tmp/qlora_output\"\
          \n    os.makedirs(output_dir, exist_ok=True)\n\n    # \u2500\u2500 Load\
          \ data \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n    with open(os.path.join(dataset_path,\
          \ \"train.json\")) as f:\n        train_data = json.load(f)\n    with open(os.path.join(dataset_path,\
          \ \"val.json\")) as f:\n        val_data = json.load(f)\n\n    print(f\"\
          Loaded {len(train_data)} train / {len(val_data)} val samples\")\n\n    #\
          \ \u2500\u2500 Tokenizer \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   \
          \ print(f\"Loading tokenizer: {base_model}\")\n    tokenizer = AutoTokenizer.from_pretrained(base_model,\
          \ trust_remote_code=True)\n    if tokenizer.pad_token is None:\n       \
          \ tokenizer.pad_token = tokenizer.eos_token\n    tokenizer.padding_side\
          \ = \"right\"\n\n    # \u2500\u2500 Format with chat template \u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\n    def format_chat(sample):\n        return {\"text\": tokenizer.apply_chat_template(\n\
          \            sample[\"messages\"], tokenize=False, add_generation_prompt=False\n\
          \        )}\n\n    train_ds = Dataset.from_list(train_data).map(format_chat)\n\
          \    val_ds = Dataset.from_list(val_data).map(format_chat)\n\n    # \u2500\
          \u2500 4-bit quantisation \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\n    bnb_config = BitsAndBytesConfig(\n        load_in_4bit=True,\n\
          \        bnb_4bit_quant_type=\"nf4\",\n        bnb_4bit_compute_dtype=torch.bfloat16,\n\
          \        bnb_4bit_use_double_quant=True,\n    )\n\n    print(f\"Loading\
          \ model: {base_model} (4-bit NF4)\")\n    model = AutoModelForCausalLM.from_pretrained(\n\
          \        base_model,\n        quantization_config=bnb_config,\n        device_map=\"\
          auto\",\n        trust_remote_code=True,\n        torch_dtype=torch.bfloat16,\n\
          \    )\n    model = prepare_model_for_kbit_training(model)\n\n    # \u2500\
          \u2500 LoRA config \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n    lora_config = LoraConfig(\n\
          \        r=lora_r,\n        lora_alpha=lora_alpha,\n        target_modules=[\n\
          \            \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n         \
          \   \"gate_proj\", \"up_proj\", \"down_proj\",\n        ],\n        lora_dropout=lora_dropout,\n\
          \        bias=\"none\",\n        task_type=\"CAUSAL_LM\",\n    )\n\n   \
          \ model = get_peft_model(model, lora_config)\n    model.print_trainable_parameters()\n\
          \n    # \u2500\u2500 Training args \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n    training_args = TrainingArguments(\n\
          \        output_dir=os.path.join(output_dir, \"checkpoints\"),\n       \
          \ num_train_epochs=num_epochs,\n        per_device_train_batch_size=batch_size,\n\
          \        per_device_eval_batch_size=batch_size,\n        gradient_accumulation_steps=gradient_accumulation_steps,\n\
          \        learning_rate=learning_rate,\n        bf16=True,\n        logging_steps=5,\n\
          \        eval_strategy=\"steps\",\n        eval_steps=50,\n        save_strategy=\"\
          steps\",\n        save_steps=100,\n        save_total_limit=2,\n       \
          \ load_best_model_at_end=True,\n        metric_for_best_model=\"eval_loss\"\
          ,\n        report_to=\"none\",\n        warmup_ratio=0.03,\n        lr_scheduler_type=\"\
          cosine\",\n        optim=\"paged_adamw_8bit\",\n        max_grad_norm=0.3,\n\
          \        group_by_length=True,\n    )\n\n    # \u2500\u2500 SFTTrainer \u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\n    trainer = SFTTrainer(\n        model=model,\n\
          \        args=training_args,\n        train_dataset=train_ds,\n        eval_dataset=val_ds,\n\
          \        tokenizer=tokenizer,\n        max_seq_length=max_seq_length,\n\
          \        dataset_text_field=\"text\",\n        packing=True,  # pack short\
          \ samples for efficiency\n    )\n\n    print(\"Starting QLoRA training \u2026\
          \")\n    result = trainer.train()\n    train_loss = result.training_loss\n\
          \n    eval_result = trainer.evaluate()\n    eval_loss = eval_result.get(\"\
          eval_loss\", 0.0)\n\n    print(f\"Train loss: {train_loss:.4f}, Eval loss:\
          \ {eval_loss:.4f}\")\n\n    # \u2500\u2500 Save adapter \u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
          \u2500\n    adapter_path = os.path.join(output_dir, \"adapter\")\n    model.save_pretrained(adapter_path)\n\
          \    tokenizer.save_pretrained(adapter_path)\n\n    metadata = {\n     \
          \   \"base_model\": base_model,\n        \"lora_r\": lora_r,\n        \"\
          lora_alpha\": lora_alpha,\n        \"lora_dropout\": lora_dropout,\n   \
          \     \"learning_rate\": learning_rate,\n        \"num_epochs\": num_epochs,\n\
          \        \"batch_size\": batch_size,\n        \"gradient_accumulation_steps\"\
          : gradient_accumulation_steps,\n        \"max_seq_length\": max_seq_length,\n\
          \        \"train_samples\": len(train_data),\n        \"val_samples\": len(val_data),\n\
          \        \"train_loss\": train_loss,\n        \"eval_loss\": eval_loss,\n\
          \    }\n    with open(os.path.join(adapter_path, \"training_metadata.json\"\
          ), \"w\") as f:\n        json.dump(metadata, f, indent=2)\n\n    print(f\"\
          Adapter saved to {adapter_path}\")\n\n    from collections import namedtuple\n\
          \n    return namedtuple(\"TrainOutput\", [\"adapter_path\", \"train_loss\"\
          , \"eval_loss\"])(\n        adapter_path=adapter_path,\n        train_loss=train_loss,\n\
          \        eval_loss=eval_loss,\n    )\n\n"
        image: python:3.13-slim
        resources:
          accelerator:
            resourceCount: '1'
            resourceType: gpu
pipelineInfo:
  description: Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects training-data
    bucket. Pushes the adapter to Gitea and logs metrics to MLflow.
  name: qlora-pdf-fine-tuning
root:
  dag:
    tasks:
      evaluate-adapter:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-evaluate-adapter
        dependentTasks:
        - train-qlora
        inputs:
          parameters:
            adapter_path:
              taskOutputParameter:
                outputParameterKey: adapter_path
                producerTask: train-qlora
            base_model:
              componentInputParameter: base_model
        taskInfo:
          name: evaluate-adapter
      fetch-pdfs-from-s3:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-fetch-pdfs-from-s3
        inputs:
          parameters:
            aws_access_key_id:
              componentInputParameter: aws_access_key_id
            aws_secret_access_key:
              componentInputParameter: aws_secret_access_key
            s3_bucket:
              componentInputParameter: s3_bucket
            s3_endpoint:
              componentInputParameter: s3_endpoint
            s3_prefix:
              componentInputParameter: s3_prefix
        taskInfo:
          name: fetch-pdfs-from-s3
      log-training-metrics:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-log-training-metrics
        dependentTasks:
        - fetch-pdfs-from-s3
        - prepare-training-data
        - push-adapter-to-gitea
        - train-qlora
        inputs:
          parameters:
            base_model:
              componentInputParameter: base_model
            eval_loss:
              taskOutputParameter:
                outputParameterKey: eval_loss
                producerTask: train-qlora
            learning_rate:
              componentInputParameter: learning_rate
            lora_alpha:
              componentInputParameter: lora_alpha
            lora_r:
              componentInputParameter: lora_r
            mlflow_tracking_uri:
              componentInputParameter: mlflow_tracking_uri
            num_epochs:
              componentInputParameter: num_epochs
            num_pdfs:
              taskOutputParameter:
                outputParameterKey: num_files
                producerTask: fetch-pdfs-from-s3
            num_train:
              taskOutputParameter:
                outputParameterKey: num_train
                producerTask: prepare-training-data
            num_val:
              taskOutputParameter:
                outputParameterKey: num_val
                producerTask: prepare-training-data
            repo_url:
              taskOutputParameter:
                outputParameterKey: repo_url
                producerTask: push-adapter-to-gitea
            train_loss:
              taskOutputParameter:
                outputParameterKey: train_loss
                producerTask: train-qlora
        taskInfo:
          name: log-training-metrics
      prepare-training-data:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-prepare-training-data
        dependentTasks:
        - fetch-pdfs-from-s3
        inputs:
          parameters:
            chunk_overlap:
              componentInputParameter: chunk_overlap
            chunk_size:
              componentInputParameter: chunk_size
            max_seq_length:
              componentInputParameter: max_seq_length
            pdf_dir:
              taskOutputParameter:
                outputParameterKey: pdf_dir
                producerTask: fetch-pdfs-from-s3
        taskInfo:
          name: prepare-training-data
      push-adapter-to-gitea:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-push-adapter-to-gitea
        dependentTasks:
        - train-qlora
        inputs:
          parameters:
            adapter_path:
              taskOutputParameter:
                outputParameterKey: adapter_path
                producerTask: train-qlora
            gitea_owner:
              componentInputParameter: gitea_owner
            gitea_password:
              componentInputParameter: gitea_password
            gitea_repo:
              componentInputParameter: gitea_repo
            gitea_url:
              componentInputParameter: gitea_url
            gitea_username:
              componentInputParameter: gitea_username
        taskInfo:
          name: push-adapter-to-gitea
      train-qlora:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-train-qlora
        dependentTasks:
        - prepare-training-data
        inputs:
          parameters:
            base_model:
              componentInputParameter: base_model
            batch_size:
              componentInputParameter: batch_size
            dataset_path:
              taskOutputParameter:
                outputParameterKey: dataset_path
                producerTask: prepare-training-data
            gradient_accumulation_steps:
              componentInputParameter: gradient_accumulation_steps
            learning_rate:
              componentInputParameter: learning_rate
            lora_alpha:
              componentInputParameter: lora_alpha
            lora_dropout:
              componentInputParameter: lora_dropout
            lora_r:
              componentInputParameter: lora_r
            max_seq_length:
              componentInputParameter: max_seq_length
            num_epochs:
              componentInputParameter: num_epochs
        taskInfo:
          name: train-qlora
  inputDefinitions:
    parameters:
      aws_access_key_id:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
      aws_secret_access_key:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
      base_model:
        defaultValue: meta-llama/Llama-3.1-70B-Instruct
        isOptional: true
        parameterType: STRING
      batch_size:
        defaultValue: 2.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      chunk_overlap:
        defaultValue: 64.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      chunk_size:
        defaultValue: 512.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      gitea_owner:
        defaultValue: daviestechlabs
        isOptional: true
        parameterType: STRING
      gitea_password:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
      gitea_repo:
        defaultValue: qlora-adapters
        isOptional: true
        parameterType: STRING
      gitea_url:
        defaultValue: http://gitea-http.gitea.svc.cluster.local:3000
        isOptional: true
        parameterType: STRING
      gitea_username:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
      gradient_accumulation_steps:
        defaultValue: 8.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      learning_rate:
        defaultValue: 0.0002
        isOptional: true
        parameterType: NUMBER_DOUBLE
      lora_alpha:
        defaultValue: 16.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      lora_dropout:
        defaultValue: 0.05
        isOptional: true
        parameterType: NUMBER_DOUBLE
      lora_r:
        defaultValue: 64.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      max_seq_length:
        defaultValue: 2048.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      mlflow_tracking_uri:
        defaultValue: http://mlflow.mlflow.svc.cluster.local:80
        isOptional: true
        parameterType: STRING
      num_epochs:
        defaultValue: 3.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      s3_bucket:
        defaultValue: training-data
        isOptional: true
        parameterType: STRING
      s3_endpoint:
        defaultValue: candlekeep.lab.daviestechlabs.io
        isOptional: true
        parameterType: STRING
      s3_prefix:
        defaultValue: ''
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1