feat: add QLoRA PDF pipeline and Gitea CI workflow

- qlora_pdf_pipeline.py: 6-step QLoRA fine-tuning pipeline (S3 PDFs → prepare data → train → evaluate → push to Gitea → MLflow) - .gitea/workflows/compile-upload.yaml: auto-compile and upload all pipelines to Kubeflow on push, with ntfy notifications
2026-02-13 10:28:53 -05:00
parent 45996a8dbf
commit 321eca5943
3 changed files with 1830 additions and 0 deletions
--- a/.gitea/workflows/compile-upload.yaml
+++ b/.gitea/workflows/compile-upload.yaml
@@ -0,0 +1,221 @@
+name: Compile and Upload Pipelines
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "**/*_pipeline.py"
+      - "**/*pipeline*.py"
+  workflow_dispatch:
+
+env:
+  NTFY_URL: http://ntfy.observability.svc.cluster.local:80
+  KUBEFLOW_HOST: http://ml-pipeline.kubeflow.svc.cluster.local:8888
+
+jobs:
+  compile-and-upload:
+    name: Compile & Upload
+    runs-on: ubuntu-latest
+    outputs:
+      compiled: ${{ steps.compile.outputs.compiled }}
+      failed: ${{ steps.compile.outputs.failed }}
+      uploaded: ${{ steps.upload.outputs.uploaded }}
+      upload_failed: ${{ steps.upload.outputs.failed }}
+      version: ${{ steps.upload.outputs.version }}
+      uploaded_names: ${{ steps.upload.outputs.uploaded_names }}
+      failed_names: ${{ steps.upload.outputs.failed_names }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      - name: Install KFP
+        run: pip install kfp==2.12.1
+
+      - name: Discover pipeline files
+        id: discover
+        run: |
+          # Find all pipeline Python files
+          FILES=$(find . -maxdepth 1 -name '*_pipeline.py' -o -name '*pipeline*.py' | sort)
+          COUNT=$(echo "$FILES" | grep -c '.' || true)
+          echo "files<<EOF" >> $GITHUB_OUTPUT
+          echo "$FILES" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+          echo "count=$COUNT" >> $GITHUB_OUTPUT
+          echo "Found $COUNT pipeline files:"
+          echo "$FILES"
+
+      - name: Compile pipelines
+        id: compile
+        run: |
+          COMPILED=0
+          FAILED=0
+          COMPILED_LIST=""
+          FAILED_LIST=""
+
+          for py_file in ${{ steps.discover.outputs.files }}; do
+            name=$(basename "$py_file" .py)
+            echo "::group::Compiling $name"
+
+            if python "$py_file"; then
+              yaml_file="${name}.yaml"
+              if [ -f "$yaml_file" ]; then
+                echo "✓ Compiled $name → $yaml_file"
+                COMPILED=$((COMPILED + 1))
+                COMPILED_LIST="${COMPILED_LIST}${name}\n"
+              else
+                echo "✗ $name produced no YAML output"
+                FAILED=$((FAILED + 1))
+                FAILED_LIST="${FAILED_LIST}${name}\n"
+              fi
+            else
+              echo "✗ Failed to compile $name"
+              FAILED=$((FAILED + 1))
+              FAILED_LIST="${FAILED_LIST}${name}\n"
+            fi
+
+            echo "::endgroup::"
+          done
+
+          echo "compiled=$COMPILED" >> $GITHUB_OUTPUT
+          echo "failed=$FAILED" >> $GITHUB_OUTPUT
+          echo "compiled_list<<EOF" >> $GITHUB_OUTPUT
+          echo -e "$COMPILED_LIST" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+          echo "failed_list<<EOF" >> $GITHUB_OUTPUT
+          echo -e "$FAILED_LIST" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+
+          echo ""
+          echo "=== Summary ==="
+          echo "Compiled: $COMPILED"
+          echo "Failed:   $FAILED"
+
+          if [ "$FAILED" -gt 0 ]; then
+            echo "::warning::$FAILED pipeline(s) failed to compile"
+          fi
+
+      - name: Upload pipelines to Kubeflow
+        id: upload
+        run: |
+          python3 << 'UPLOAD_SCRIPT'
+          import os
+          import sys
+          from pathlib import Path
+          from datetime import datetime
+          from kfp import Client
+
+          host = os.environ["KUBEFLOW_HOST"]
+          print(f"Connecting to Kubeflow at {host}")
+
+          try:
+              client = Client(host=host)
+              client.list_pipelines(page_size=1)
+              print("Connected to Kubeflow Pipelines")
+          except Exception as e:
+              print(f"ERROR: Cannot connect to Kubeflow: {e}")
+              sys.exit(1)
+
+          # Get all compiled YAML files
+          yaml_files = sorted(Path(".").glob("*_pipeline.yaml"))
+          if not yaml_files:
+              yaml_files = sorted(Path(".").glob("*pipeline*.yaml"))
+
+          uploaded = 0
+          failed = 0
+          uploaded_names = []
+          failed_names = []
+          version_tag = f"v{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+
+          for yaml_path in yaml_files:
+              pipeline_name = yaml_path.stem.replace("_", "-")
+              print(f"\n--- {pipeline_name} ---")
+
+              try:
+                  # Check if pipeline already exists
+                  existing = None
+                  all_pipelines = client.list_pipelines(page_size=200)
+                  if all_pipelines.pipelines:
+                      for p in all_pipelines.pipelines:
+                          if p.display_name == pipeline_name:
+                              existing = p
+                              break
+
+                  if existing:
+                      print(f"  Updating: {pipeline_name} ({existing.pipeline_id})")
+                      client.upload_pipeline_version(
+                          pipeline_package_path=str(yaml_path),
+                          pipeline_version_name=version_tag,
+                          pipeline_id=existing.pipeline_id,
+                      )
+                  else:
+                      print(f"  Creating: {pipeline_name}")
+                      client.upload_pipeline(
+                          pipeline_package_path=str(yaml_path),
+                          pipeline_name=pipeline_name,
+                      )
+
+                  uploaded += 1
+                  uploaded_names.append(pipeline_name)
+                  print(f"  ✓ Done")
+
+              except Exception as e:
+                  failed += 1
+                  failed_names.append(pipeline_name)
+                  print(f"  ✗ Error: {e}")
+
+          # Write outputs
+          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+              f.write(f"uploaded={uploaded}\n")
+              f.write(f"failed={failed}\n")
+              f.write(f"version={version_tag}\n")
+              f.write(f"uploaded_names={', '.join(uploaded_names)}\n")
+              f.write(f"failed_names={', '.join(failed_names)}\n")
+
+          print(f"\n=== Upload Summary ===")
+          print(f"Uploaded: {uploaded}")
+          print(f"Failed:   {failed}")
+
+          if failed > 0:
+              sys.exit(1)
+          UPLOAD_SCRIPT
+
+  notify:
+    name: Notify
+    runs-on: ubuntu-latest
+    needs: [compile-and-upload]
+    if: always()
+    steps:
+      - name: Notify on success
+        if: needs.compile-and-upload.result == 'success'
+        run: |
+          curl -s \
+            -H "Title: ✅ Pipelines uploaded to Kubeflow" \
+            -H "Priority: default" \
+            -H "Tags: white_check_mark,rocket" \
+            -H "Click: ${{ gitea.server_url }}/${{ gitea.repository }}/actions/runs/${{ gitea.run_id }}" \
+            -d "Branch: ${{ gitea.ref_name }}
+          Commit: ${{ gitea.event.head_commit.message || gitea.sha }}
+          Compiled: ${{ needs.compile-and-upload.outputs.compiled || '?' }} pipeline(s)
+          Uploaded: ${{ needs.compile-and-upload.outputs.uploaded || '?' }} pipeline(s)
+          Version: ${{ needs.compile-and-upload.outputs.version || 'n/a' }}" \
+            ${{ env.NTFY_URL }}/gitea-ci
+
+      - name: Notify on failure
+        if: needs.compile-and-upload.result == 'failure'
+        run: |
+          curl -s \
+            -H "Title: ❌ Pipeline upload failed" \
+            -H "Priority: high" \
+            -H "Tags: x,rocket" \
+            -H "Click: ${{ gitea.server_url }}/${{ gitea.repository }}/actions/runs/${{ gitea.run_id }}" \
+            -d "Branch: ${{ gitea.ref_name }}
+          Commit: ${{ gitea.event.head_commit.message || gitea.sha }}
+          Compiled: ${{ needs.compile-and-upload.outputs.compiled || '?' }}, Failed compile: ${{ needs.compile-and-upload.outputs.failed || '?' }}
+          Upload failures: ${{ needs.compile-and-upload.outputs.failed_names || 'unknown' }}
+          Check logs for details." \
+            ${{ env.NTFY_URL }}/gitea-ci
--- a/qlora_pdf_pipeline.py
+++ b/qlora_pdf_pipeline.py
@@ -0,0 +1,705 @@
+#!/usr/bin/env python3
+"""
+QLoRA Fine-Tuning Pipeline – Kubeflow Pipelines SDK
+
+Fetches PDFs from a Quobjects S3 bucket, extracts instruction-tuning
+data, trains a QLoRA adapter on the Llama 3.1 70B base model using
+the Strix Halo's 128 GB unified memory, evaluates it, and pushes the
+adapter weights to a Gitea repository.
+
+Usage:
+  pip install kfp==2.12.1
+  python qlora_pdf_pipeline.py
+  # Upload qlora_pdf_pipeline.yaml to Kubeflow Pipelines UI
+
+Prerequisites in-cluster:
+  - Secret mlpipeline-minio-artifact (namespace kubeflow) for S3 creds
+  - Secret gitea-admin-secret (namespace gitea) for Gitea push
+  - Node khelben with amd.com/gpu and the ROCm PyTorch image
+"""
+
+from kfp import compiler, dsl
+from typing import NamedTuple
+
+
+# ──────────────────────────────────────────────────────────────
+# 1. Fetch PDFs from Quobjects S3
+# ──────────────────────────────────────────────────────────────
+@dsl.component(
+    base_image="python:3.13-slim",
+    packages_to_install=["boto3"],
+)
+def fetch_pdfs_from_s3(
+    s3_endpoint: str,
+    s3_bucket: str,
+    s3_prefix: str,
+    aws_access_key_id: str,
+    aws_secret_access_key: str,
+) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]):
+    """Download all PDFs from a Quobjects S3 bucket."""
+    import os
+    import boto3
+    from botocore.client import Config
+
+    out_dir = "/tmp/pdfs"
+    os.makedirs(out_dir, exist_ok=True)
+
+    client = boto3.client(
+        "s3",
+        endpoint_url=f"http://{s3_endpoint}",
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+        region_name="us-east-1",
+        config=Config(signature_version="s3v4"),
+    )
+
+    paginator = client.get_paginator("list_objects_v2")
+    count = 0
+    for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix):
+        for obj in page.get("Contents", []):
+            key = obj["Key"]
+            if key.lower().endswith(".pdf"):
+                local_path = os.path.join(out_dir, os.path.basename(key))
+                print(f"Downloading: {key} → {local_path}")
+                client.download_file(s3_bucket, key, local_path)
+                count += 1
+
+    print(f"Downloaded {count} PDFs to {out_dir}")
+    from collections import namedtuple
+
+    return namedtuple("PDFOutput", ["pdf_dir", "num_files"])(
+        pdf_dir=out_dir, num_files=count
+    )
+
+
+# ──────────────────────────────────────────────────────────────
+# 2. Extract text from PDFs → instruction-tuning dataset
+# ──────────────────────────────────────────────────────────────
+@dsl.component(
+    base_image="python:3.13-slim",
+    packages_to_install=["pymupdf"],
+)
+def prepare_training_data(
+    pdf_dir: str,
+    max_seq_length: int = 2048,
+    chunk_size: int = 512,
+    chunk_overlap: int = 64,
+) -> NamedTuple("DataOutput", [("dataset_path", str), ("num_train", int), ("num_val", int)]):
+    """Extract text from PDFs, chunk it, and format as instruction-tuning pairs."""
+    import json
+    import os
+    import fitz  # PyMuPDF
+
+    out_dir = "/tmp/training_data"
+    os.makedirs(out_dir, exist_ok=True)
+
+    # 1. Extract text from all PDFs
+    all_chunks: list[dict] = []
+    for fname in sorted(os.listdir(pdf_dir)):
+        if not fname.lower().endswith(".pdf"):
+            continue
+        path = os.path.join(pdf_dir, fname)
+        print(f"Extracting: {fname}")
+        try:
+            doc = fitz.open(path)
+            full_text = ""
+            for page in doc:
+                full_text += page.get_text() + "\n"
+            doc.close()
+        except Exception as e:
+            print(f"  SKIP ({e})")
+            continue
+
+        # 2. Chunk text with overlap
+        words = full_text.split()
+        for i in range(0, len(words), chunk_size - chunk_overlap):
+            chunk_words = words[i : i + chunk_size]
+            if len(chunk_words) < 50:
+                continue  # skip tiny trailing chunks
+            chunk_text = " ".join(chunk_words)
+            all_chunks.append({"text": chunk_text, "source": fname})
+
+    print(f"Total chunks: {len(all_chunks)}")
+    if not all_chunks:
+        raise ValueError("No text extracted from PDFs — check your bucket")
+
+    # 3. Format as Llama 3 chat training pairs
+    #    We create self-supervised pairs: model learns to continue/explain the content
+    samples = []
+    for chunk in all_chunks:
+        text = chunk["text"]
+        source = chunk["source"]
+        # Split chunk roughly in half for input/output
+        words = text.split()
+        mid = len(words) // 2
+        context = " ".join(words[:mid])
+        continuation = " ".join(words[mid:])
+
+        samples.append(
+            {
+                "messages": [
+                    {
+                        "role": "system",
+                        "content": (
+                            "You are a knowledgeable assistant. "
+                            "Continue the information accurately and coherently."
+                        ),
+                    },
+                    {
+                        "role": "user",
+                        "content": f"Continue the following passage from {source}:\n\n{context}",
+                    },
+                    {"role": "assistant", "content": continuation},
+                ]
+            }
+        )
+
+    # 4. Train/val split (90/10)
+    import random
+
+    random.seed(42)
+    random.shuffle(samples)
+    split = int(len(samples) * 0.9)
+    train = samples[:split]
+    val = samples[split:]
+
+    train_path = os.path.join(out_dir, "train.json")
+    val_path = os.path.join(out_dir, "val.json")
+    with open(train_path, "w") as f:
+        json.dump(train, f)
+    with open(val_path, "w") as f:
+        json.dump(val, f)
+
+    print(f"Train: {len(train)} samples, Val: {len(val)} samples")
+    from collections import namedtuple
+
+    return namedtuple("DataOutput", ["dataset_path", "num_train", "num_val"])(
+        dataset_path=out_dir, num_train=len(train), num_val=len(val)
+    )
+
+
+# ──────────────────────────────────────────────────────────────
+# 3. QLoRA training on Strix Halo (ROCm, 128 GB unified)
+# ──────────────────────────────────────────────────────────────
+@dsl.component(
+    # Use a ROCm base image with PyTorch + PEFT pre-installed.
+    # Falls back to pip-installing if not present.
+    base_image="python:3.13-slim",
+    packages_to_install=[
+        "torch",
+        "transformers",
+        "peft",
+        "datasets",
+        "accelerate",
+        "bitsandbytes",
+        "scipy",
+        "trl",
+    ],
+)
+def train_qlora(
+    dataset_path: str,
+    base_model: str,
+    learning_rate: float = 2e-4,
+    num_epochs: int = 3,
+    batch_size: int = 2,
+    gradient_accumulation_steps: int = 8,
+    max_seq_length: int = 2048,
+    lora_r: int = 64,
+    lora_alpha: int = 16,
+    lora_dropout: float = 0.05,
+) -> NamedTuple(
+    "TrainOutput",
+    [("adapter_path", str), ("train_loss", float), ("eval_loss", float)],
+):
+    """QLoRA fine-tune Llama 3.1 70B with 4-bit NF4 quantization."""
+    import json
+    import os
+
+    import torch
+    from datasets import Dataset
+    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+    from transformers import (
+        AutoModelForCausalLM,
+        AutoTokenizer,
+        BitsAndBytesConfig,
+        TrainingArguments,
+    )
+    from trl import SFTTrainer
+
+    output_dir = "/tmp/qlora_output"
+    os.makedirs(output_dir, exist_ok=True)
+
+    # ── Load data ───────────────────────────────────────────
+    with open(os.path.join(dataset_path, "train.json")) as f:
+        train_data = json.load(f)
+    with open(os.path.join(dataset_path, "val.json")) as f:
+        val_data = json.load(f)
+
+    print(f"Loaded {len(train_data)} train / {len(val_data)} val samples")
+
+    # ── Tokenizer ───────────────────────────────────────────
+    print(f"Loading tokenizer: {base_model}")
+    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+
+    # ── Format with chat template ───────────────────────────
+    def format_chat(sample):
+        return {"text": tokenizer.apply_chat_template(
+            sample["messages"], tokenize=False, add_generation_prompt=False
+        )}
+
+    train_ds = Dataset.from_list(train_data).map(format_chat)
+    val_ds = Dataset.from_list(val_data).map(format_chat)
+
+    # ── 4-bit quantisation ──────────────────────────────────
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_use_double_quant=True,
+    )
+
+    print(f"Loading model: {base_model} (4-bit NF4)")
+    model = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+    )
+    model = prepare_model_for_kbit_training(model)
+
+    # ── LoRA config ─────────────────────────────────────────
+    lora_config = LoraConfig(
+        r=lora_r,
+        lora_alpha=lora_alpha,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        lora_dropout=lora_dropout,
+        bias="none",
+        task_type="CAUSAL_LM",
+    )
+
+    model = get_peft_model(model, lora_config)
+    model.print_trainable_parameters()
+
+    # ── Training args ───────────────────────────────────────
+    training_args = TrainingArguments(
+        output_dir=os.path.join(output_dir, "checkpoints"),
+        num_train_epochs=num_epochs,
+        per_device_train_batch_size=batch_size,
+        per_device_eval_batch_size=batch_size,
+        gradient_accumulation_steps=gradient_accumulation_steps,
+        learning_rate=learning_rate,
+        bf16=True,
+        logging_steps=5,
+        eval_strategy="steps",
+        eval_steps=50,
+        save_strategy="steps",
+        save_steps=100,
+        save_total_limit=2,
+        load_best_model_at_end=True,
+        metric_for_best_model="eval_loss",
+        report_to="none",
+        warmup_ratio=0.03,
+        lr_scheduler_type="cosine",
+        optim="paged_adamw_8bit",
+        max_grad_norm=0.3,
+        group_by_length=True,
+    )
+
+    # ── SFTTrainer ──────────────────────────────────────────
+    trainer = SFTTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_ds,
+        eval_dataset=val_ds,
+        tokenizer=tokenizer,
+        max_seq_length=max_seq_length,
+        dataset_text_field="text",
+        packing=True,  # pack short samples for efficiency
+    )
+
+    print("Starting QLoRA training …")
+    result = trainer.train()
+    train_loss = result.training_loss
+
+    eval_result = trainer.evaluate()
+    eval_loss = eval_result.get("eval_loss", 0.0)
+
+    print(f"Train loss: {train_loss:.4f}, Eval loss: {eval_loss:.4f}")
+
+    # ── Save adapter ────────────────────────────────────────
+    adapter_path = os.path.join(output_dir, "adapter")
+    model.save_pretrained(adapter_path)
+    tokenizer.save_pretrained(adapter_path)
+
+    metadata = {
+        "base_model": base_model,
+        "lora_r": lora_r,
+        "lora_alpha": lora_alpha,
+        "lora_dropout": lora_dropout,
+        "learning_rate": learning_rate,
+        "num_epochs": num_epochs,
+        "batch_size": batch_size,
+        "gradient_accumulation_steps": gradient_accumulation_steps,
+        "max_seq_length": max_seq_length,
+        "train_samples": len(train_data),
+        "val_samples": len(val_data),
+        "train_loss": train_loss,
+        "eval_loss": eval_loss,
+    }
+    with open(os.path.join(adapter_path, "training_metadata.json"), "w") as f:
+        json.dump(metadata, f, indent=2)
+
+    print(f"Adapter saved to {adapter_path}")
+
+    from collections import namedtuple
+
+    return namedtuple("TrainOutput", ["adapter_path", "train_loss", "eval_loss"])(
+        adapter_path=adapter_path,
+        train_loss=train_loss,
+        eval_loss=eval_loss,
+    )
+
+
+# ──────────────────────────────────────────────────────────────
+# 4. Quick sanity evaluation
+# ──────────────────────────────────────────────────────────────
+@dsl.component(
+    base_image="python:3.13-slim",
+    packages_to_install=[
+        "torch", "transformers", "peft", "bitsandbytes", "accelerate", "scipy",
+    ],
+)
+def evaluate_adapter(
+    adapter_path: str,
+    base_model: str,
+) -> NamedTuple("EvalOutput", [("report", str), ("passed", bool)]):
+    """Load the QLoRA adapter and run a few sanity-check prompts."""
+    import torch
+    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+    from peft import PeftModel
+
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_use_double_quant=True,
+    )
+
+    print(f"Loading base model {base_model} …")
+    model = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
+
+    print(f"Loading adapter from {adapter_path} …")
+    model = PeftModel.from_pretrained(model, adapter_path)
+    model.eval()
+
+    test_prompts = [
+        "Summarise the key points from the training material.",
+        "What are the main topics covered in the source documents?",
+        "Explain the most important concept from the training data.",
+    ]
+
+    lines = []
+    for prompt in test_prompts:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt},
+        ]
+        input_text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
+        with torch.no_grad():
+            out = model.generate(**inputs, max_new_tokens=128, temperature=0.7, do_sample=True)
+        response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+        lines.append(f"Q: {prompt}\nA: {response}\n")
+        print(lines[-1])
+
+    report = "\n".join(lines)
+    # Simple heuristic: did the model produce non-empty responses?
+    passed = all(len(l.split("A:")[1].strip()) > 10 for l in lines)
+    print(f"Evaluation passed: {passed}")
+
+    from collections import namedtuple
+
+    return namedtuple("EvalOutput", ["report", "passed"])(
+        report=report, passed=passed
+    )
+
+
+# ──────────────────────────────────────────────────────────────
+# 5. Push adapter to Gitea repo
+# ──────────────────────────────────────────────────────────────
+@dsl.component(
+    base_image="python:3.13-slim",
+    packages_to_install=["requests"],
+)
+def push_adapter_to_gitea(
+    adapter_path: str,
+    gitea_url: str,
+    gitea_owner: str,
+    gitea_repo: str,
+    gitea_username: str,
+    gitea_password: str,
+    branch: str = "main",
+    commit_message: str = "feat: add QLoRA adapter from PDF training pipeline",
+) -> NamedTuple("PushOutput", [("repo_url", str), ("files_pushed", int)]):
+    """Push the QLoRA adapter files to a Gitea repository via the API."""
+    import base64
+    import json
+    import os
+    import requests
+
+    api_base = f"{gitea_url}/api/v1"
+    auth = (gitea_username, gitea_password)
+    repo_api = f"{api_base}/repos/{gitea_owner}/{gitea_repo}"
+
+    # Check if repo exists, create if not
+    resp = requests.get(repo_api, auth=auth, timeout=30)
+    if resp.status_code == 404:
+        print(f"Creating repo {gitea_owner}/{gitea_repo} …")
+        create_resp = requests.post(
+            f"{api_base}/orgs/{gitea_owner}/repos"
+            if gitea_owner != gitea_username
+            else f"{api_base}/user/repos",
+            auth=auth,
+            json={
+                "name": gitea_repo,
+                "description": "QLoRA adapters trained from PDF documents",
+                "private": False,
+                "auto_init": True,
+            },
+            timeout=30,
+        )
+        create_resp.raise_for_status()
+        print(f"Created: {create_resp.json().get('html_url')}")
+
+    # Collect all adapter files
+    files_to_push = []
+    for root, dirs, files in os.walk(adapter_path):
+        for fname in files:
+            fpath = os.path.join(root, fname)
+            rel_path = os.path.relpath(fpath, adapter_path)
+            with open(fpath, "rb") as f:
+                content = base64.b64encode(f.read()).decode("utf-8")
+            files_to_push.append({"path": rel_path, "content": content})
+
+    print(f"Pushing {len(files_to_push)} files to {gitea_owner}/{gitea_repo}")
+
+    # Push each file via Gitea contents API
+    pushed = 0
+    for item in files_to_push:
+        file_api = f"{repo_api}/contents/{item['path']}"
+
+        # Check if file already exists (need SHA for update)
+        existing = requests.get(file_api, auth=auth, params={"ref": branch}, timeout=30)
+        payload = {
+            "message": commit_message,
+            "content": item["content"],
+            "branch": branch,
+        }
+        if existing.status_code == 200:
+            payload["sha"] = existing.json()["sha"]
+            resp = requests.put(file_api, auth=auth, json=payload, timeout=60)
+        else:
+            resp = requests.post(file_api, auth=auth, json=payload, timeout=60)
+
+        if resp.status_code in (200, 201):
+            pushed += 1
+            print(f"  ✓ {item['path']}")
+        else:
+            print(f"  ✗ {item['path']}: {resp.status_code} {resp.text[:200]}")
+
+    repo_url = f"{gitea_url}/{gitea_owner}/{gitea_repo}"
+    print(f"Pushed {pushed}/{len(files_to_push)} files to {repo_url}")
+
+    from collections import namedtuple
+
+    return namedtuple("PushOutput", ["repo_url", "files_pushed"])(
+        repo_url=repo_url, files_pushed=pushed
+    )
+
+
+# ──────────────────────────────────────────────────────────────
+# 6. Log metrics to MLflow
+# ──────────────────────────────────────────────────────────────
+@dsl.component(
+    base_image="python:3.13-slim",
+    packages_to_install=["mlflow==2.22.0"],
+)
+def log_training_metrics(
+    base_model: str,
+    train_loss: float,
+    eval_loss: float,
+    num_train: int,
+    num_val: int,
+    num_pdfs: int,
+    lora_r: int,
+    lora_alpha: int,
+    learning_rate: float,
+    num_epochs: int,
+    repo_url: str,
+    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
+    experiment_name: str = "qlora-pdf-training",
+):
+    """Log the full training run to MLflow."""
+    import mlflow
+
+    mlflow.set_tracking_uri(mlflow_tracking_uri)
+    mlflow.set_experiment(experiment_name)
+
+    with mlflow.start_run(run_name=f"qlora-{base_model.split('/')[-1]}"):
+        mlflow.log_params(
+            {
+                "base_model": base_model,
+                "lora_r": lora_r,
+                "lora_alpha": lora_alpha,
+                "learning_rate": learning_rate,
+                "num_epochs": num_epochs,
+                "num_pdfs": num_pdfs,
+                "data_source": "quobjects/training-data",
+            }
+        )
+        mlflow.log_metrics(
+            {
+                "train_loss": train_loss,
+                "eval_loss": eval_loss,
+                "train_samples": float(num_train),
+                "val_samples": float(num_val),
+            }
+        )
+        mlflow.set_tag("adapter_repo", repo_url)
+
+
+# ──────────────────────────────────────────────────────────────
+# Pipeline definition
+# ──────────────────────────────────────────────────────────────
+@dsl.pipeline(
+    name="QLoRA PDF Fine-Tuning",
+    description=(
+        "Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects "
+        "training-data bucket. Pushes the adapter to Gitea and logs "
+        "metrics to MLflow."
+    ),
+)
+def qlora_pdf_pipeline(
+    # ── S3 / Quobjects ──
+    s3_endpoint: str = "candlekeep.lab.daviestechlabs.io",
+    s3_bucket: str = "training-data",
+    s3_prefix: str = "",
+    aws_access_key_id: str = "",
+    aws_secret_access_key: str = "",
+    # ── Model ──
+    base_model: str = "meta-llama/Llama-3.1-70B-Instruct",
+    # ── Training hyper-params ──
+    learning_rate: float = 2e-4,
+    num_epochs: int = 3,
+    batch_size: int = 2,
+    gradient_accumulation_steps: int = 8,
+    max_seq_length: int = 2048,
+    lora_r: int = 64,
+    lora_alpha: int = 16,
+    lora_dropout: float = 0.05,
+    # ── Data prep ──
+    chunk_size: int = 512,
+    chunk_overlap: int = 64,
+    # ── Gitea ──
+    gitea_url: str = "http://gitea-http.gitea.svc.cluster.local:3000",
+    gitea_owner: str = "daviestechlabs",
+    gitea_repo: str = "qlora-adapters",
+    gitea_username: str = "",
+    gitea_password: str = "",
+    # ── MLflow ──
+    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
+):
+    # Step 1 — Fetch PDFs from S3
+    pdfs = fetch_pdfs_from_s3(
+        s3_endpoint=s3_endpoint,
+        s3_bucket=s3_bucket,
+        s3_prefix=s3_prefix,
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+    )
+
+    # Step 2 — Extract text and build training dataset
+    data = prepare_training_data(
+        pdf_dir=pdfs.outputs["pdf_dir"],
+        max_seq_length=max_seq_length,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+    )
+
+    # Step 3 — QLoRA training (GPU-heavy)
+    trained = train_qlora(
+        dataset_path=data.outputs["dataset_path"],
+        base_model=base_model,
+        learning_rate=learning_rate,
+        num_epochs=num_epochs,
+        batch_size=batch_size,
+        gradient_accumulation_steps=gradient_accumulation_steps,
+        max_seq_length=max_seq_length,
+        lora_r=lora_r,
+        lora_alpha=lora_alpha,
+        lora_dropout=lora_dropout,
+    )
+    # Ask for a GPU on khelben
+    trained.set_accelerator_type("gpu")
+    trained.set_gpu_limit(1)
+
+    # Step 4 — Quick evaluation
+    evaluated = evaluate_adapter(
+        adapter_path=trained.outputs["adapter_path"],
+        base_model=base_model,
+    )
+    evaluated.set_accelerator_type("gpu")
+    evaluated.set_gpu_limit(1)
+
+    # Step 5 — Push adapter to Gitea
+    pushed = push_adapter_to_gitea(
+        adapter_path=trained.outputs["adapter_path"],
+        gitea_url=gitea_url,
+        gitea_owner=gitea_owner,
+        gitea_repo=gitea_repo,
+        gitea_username=gitea_username,
+        gitea_password=gitea_password,
+    )
+
+    # Step 6 — Log to MLflow
+    log_training_metrics(
+        base_model=base_model,
+        train_loss=trained.outputs["train_loss"],
+        eval_loss=trained.outputs["eval_loss"],
+        num_train=data.outputs["num_train"],
+        num_val=data.outputs["num_val"],
+        num_pdfs=pdfs.outputs["num_files"],
+        lora_r=lora_r,
+        lora_alpha=lora_alpha,
+        learning_rate=learning_rate,
+        num_epochs=num_epochs,
+        repo_url=pushed.outputs["repo_url"],
+        mlflow_tracking_uri=mlflow_tracking_uri,
+    )
+
+
+# ──────────────────────────────────────────────────────────────
+# Compile
+# ──────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    compiler.Compiler().compile(
+        pipeline_func=qlora_pdf_pipeline,
+        package_path="qlora_pdf_pipeline.yaml",
+    )
+    print("Compiled: qlora_pdf_pipeline.yaml")
--- a/qlora_pdf_pipeline.yaml
+++ b/qlora_pdf_pipeline.yaml
@@ -0,0 +1,904 @@
+# PIPELINE DEFINITION
+# Name: qlora-pdf-fine-tuning
+# Description: Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects training-data bucket. Pushes the adapter to Gitea and logs metrics to MLflow.
+# Inputs:
+#    aws_access_key_id: str [Default: '']
+#    aws_secret_access_key: str [Default: '']
+#    base_model: str [Default: 'meta-llama/Llama-3.1-70B-Instruct']
+#    batch_size: int [Default: 2.0]
+#    chunk_overlap: int [Default: 64.0]
+#    chunk_size: int [Default: 512.0]
+#    gitea_owner: str [Default: 'daviestechlabs']
+#    gitea_password: str [Default: '']
+#    gitea_repo: str [Default: 'qlora-adapters']
+#    gitea_url: str [Default: 'http://gitea-http.gitea.svc.cluster.local:3000']
+#    gitea_username: str [Default: '']
+#    gradient_accumulation_steps: int [Default: 8.0]
+#    learning_rate: float [Default: 0.0002]
+#    lora_alpha: int [Default: 16.0]
+#    lora_dropout: float [Default: 0.05]
+#    lora_r: int [Default: 64.0]
+#    max_seq_length: int [Default: 2048.0]
+#    mlflow_tracking_uri: str [Default: 'http://mlflow.mlflow.svc.cluster.local:80']
+#    num_epochs: int [Default: 3.0]
+#    s3_bucket: str [Default: 'training-data']
+#    s3_endpoint: str [Default: 'candlekeep.lab.daviestechlabs.io']
+#    s3_prefix: str [Default: '']
+components:
+  comp-evaluate-adapter:
+    executorLabel: exec-evaluate-adapter
+    inputDefinitions:
+      parameters:
+        adapter_path:
+          parameterType: STRING
+        base_model:
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        passed:
+          parameterType: BOOLEAN
+        report:
+          parameterType: STRING
+  comp-fetch-pdfs-from-s3:
+    executorLabel: exec-fetch-pdfs-from-s3
+    inputDefinitions:
+      parameters:
+        aws_access_key_id:
+          parameterType: STRING
+        aws_secret_access_key:
+          parameterType: STRING
+        s3_bucket:
+          parameterType: STRING
+        s3_endpoint:
+          parameterType: STRING
+        s3_prefix:
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        num_files:
+          parameterType: NUMBER_INTEGER
+        pdf_dir:
+          parameterType: STRING
+  comp-log-training-metrics:
+    executorLabel: exec-log-training-metrics
+    inputDefinitions:
+      parameters:
+        base_model:
+          parameterType: STRING
+        eval_loss:
+          parameterType: NUMBER_DOUBLE
+        experiment_name:
+          defaultValue: qlora-pdf-training
+          isOptional: true
+          parameterType: STRING
+        learning_rate:
+          parameterType: NUMBER_DOUBLE
+        lora_alpha:
+          parameterType: NUMBER_INTEGER
+        lora_r:
+          parameterType: NUMBER_INTEGER
+        mlflow_tracking_uri:
+          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
+          isOptional: true
+          parameterType: STRING
+        num_epochs:
+          parameterType: NUMBER_INTEGER
+        num_pdfs:
+          parameterType: NUMBER_INTEGER
+        num_train:
+          parameterType: NUMBER_INTEGER
+        num_val:
+          parameterType: NUMBER_INTEGER
+        repo_url:
+          parameterType: STRING
+        train_loss:
+          parameterType: NUMBER_DOUBLE
+  comp-prepare-training-data:
+    executorLabel: exec-prepare-training-data
+    inputDefinitions:
+      parameters:
+        chunk_overlap:
+          defaultValue: 64.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+        chunk_size:
+          defaultValue: 512.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+        max_seq_length:
+          defaultValue: 2048.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+        pdf_dir:
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        dataset_path:
+          parameterType: STRING
+        num_train:
+          parameterType: NUMBER_INTEGER
+        num_val:
+          parameterType: NUMBER_INTEGER
+  comp-push-adapter-to-gitea:
+    executorLabel: exec-push-adapter-to-gitea
+    inputDefinitions:
+      parameters:
+        adapter_path:
+          parameterType: STRING
+        branch:
+          defaultValue: main
+          isOptional: true
+          parameterType: STRING
+        commit_message:
+          defaultValue: 'feat: add QLoRA adapter from PDF training pipeline'
+          isOptional: true
+          parameterType: STRING
+        gitea_owner:
+          parameterType: STRING
+        gitea_password:
+          parameterType: STRING
+        gitea_repo:
+          parameterType: STRING
+        gitea_url:
+          parameterType: STRING
+        gitea_username:
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        files_pushed:
+          parameterType: NUMBER_INTEGER
+        repo_url:
+          parameterType: STRING
+  comp-train-qlora:
+    executorLabel: exec-train-qlora
+    inputDefinitions:
+      parameters:
+        base_model:
+          parameterType: STRING
+        batch_size:
+          defaultValue: 2.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+        dataset_path:
+          parameterType: STRING
+        gradient_accumulation_steps:
+          defaultValue: 8.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+        learning_rate:
+          defaultValue: 0.0002
+          isOptional: true
+          parameterType: NUMBER_DOUBLE
+        lora_alpha:
+          defaultValue: 16.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+        lora_dropout:
+          defaultValue: 0.05
+          isOptional: true
+          parameterType: NUMBER_DOUBLE
+        lora_r:
+          defaultValue: 64.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+        max_seq_length:
+          defaultValue: 2048.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+        num_epochs:
+          defaultValue: 3.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+    outputDefinitions:
+      parameters:
+        adapter_path:
+          parameterType: STRING
+        eval_loss:
+          parameterType: NUMBER_DOUBLE
+        train_loss:
+          parameterType: NUMBER_DOUBLE
+deploymentSpec:
+  executors:
+    exec-evaluate-adapter:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - evaluate_adapter
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'torch' 'transformers'\
+          \ 'peft' 'bitsandbytes' 'accelerate' 'scipy' && \"$0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef evaluate_adapter(\n    adapter_path: str,\n    base_model: str,\n\
+          ) -> NamedTuple(\"EvalOutput\", [(\"report\", str), (\"passed\", bool)]):\n\
+          \    \"\"\"Load the QLoRA adapter and run a few sanity-check prompts.\"\"\
+          \"\n    import torch\n    from transformers import AutoModelForCausalLM,\
+          \ AutoTokenizer, BitsAndBytesConfig\n    from peft import PeftModel\n\n\
+          \    bnb_config = BitsAndBytesConfig(\n        load_in_4bit=True,\n    \
+          \    bnb_4bit_quant_type=\"nf4\",\n        bnb_4bit_compute_dtype=torch.bfloat16,\n\
+          \        bnb_4bit_use_double_quant=True,\n    )\n\n    print(f\"Loading\
+          \ base model {base_model} \u2026\")\n    model = AutoModelForCausalLM.from_pretrained(\n\
+          \        base_model,\n        quantization_config=bnb_config,\n        device_map=\"\
+          auto\",\n        trust_remote_code=True,\n        torch_dtype=torch.bfloat16,\n\
+          \    )\n    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)\n\
+          \n    print(f\"Loading adapter from {adapter_path} \u2026\")\n    model\
+          \ = PeftModel.from_pretrained(model, adapter_path)\n    model.eval()\n\n\
+          \    test_prompts = [\n        \"Summarise the key points from the training\
+          \ material.\",\n        \"What are the main topics covered in the source\
+          \ documents?\",\n        \"Explain the most important concept from the training\
+          \ data.\",\n    ]\n\n    lines = []\n    for prompt in test_prompts:\n \
+          \       messages = [\n            {\"role\": \"system\", \"content\": \"\
+          You are a helpful assistant.\"},\n            {\"role\": \"user\", \"content\"\
+          : prompt},\n        ]\n        input_text = tokenizer.apply_chat_template(\n\
+          \            messages, tokenize=False, add_generation_prompt=True\n    \
+          \    )\n        inputs = tokenizer(input_text, return_tensors=\"pt\").to(model.device)\n\
+          \        with torch.no_grad():\n            out = model.generate(**inputs,\
+          \ max_new_tokens=128, temperature=0.7, do_sample=True)\n        response\
+          \ = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n\
+          \        lines.append(f\"Q: {prompt}\\nA: {response}\\n\")\n        print(lines[-1])\n\
+          \n    report = \"\\n\".join(lines)\n    # Simple heuristic: did the model\
+          \ produce non-empty responses?\n    passed = all(len(l.split(\"A:\")[1].strip())\
+          \ > 10 for l in lines)\n    print(f\"Evaluation passed: {passed}\")\n\n\
+          \    from collections import namedtuple\n\n    return namedtuple(\"EvalOutput\"\
+          , [\"report\", \"passed\"])(\n        report=report, passed=passed\n   \
+          \ )\n\n"
+        image: python:3.13-slim
+        resources:
+          accelerator:
+            resourceCount: '1'
+            resourceType: gpu
+    exec-fetch-pdfs-from-s3:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - fetch_pdfs_from_s3
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'boto3' && \"\
+          $0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef fetch_pdfs_from_s3(\n    s3_endpoint: str,\n    s3_bucket: str,\n\
+          \    s3_prefix: str,\n    aws_access_key_id: str,\n    aws_secret_access_key:\
+          \ str,\n) -> NamedTuple(\"PDFOutput\", [(\"pdf_dir\", str), (\"num_files\"\
+          , int)]):\n    \"\"\"Download all PDFs from a Quobjects S3 bucket.\"\"\"\
+          \n    import os\n    import boto3\n    from botocore.client import Config\n\
+          \n    out_dir = \"/tmp/pdfs\"\n    os.makedirs(out_dir, exist_ok=True)\n\
+          \n    client = boto3.client(\n        \"s3\",\n        endpoint_url=f\"\
+          http://{s3_endpoint}\",\n        aws_access_key_id=aws_access_key_id,\n\
+          \        aws_secret_access_key=aws_secret_access_key,\n        region_name=\"\
+          us-east-1\",\n        config=Config(signature_version=\"s3v4\"),\n    )\n\
+          \n    paginator = client.get_paginator(\"list_objects_v2\")\n    count =\
+          \ 0\n    for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix):\n\
+          \        for obj in page.get(\"Contents\", []):\n            key = obj[\"\
+          Key\"]\n            if key.lower().endswith(\".pdf\"):\n               \
+          \ local_path = os.path.join(out_dir, os.path.basename(key))\n          \
+          \      print(f\"Downloading: {key} \u2192 {local_path}\")\n            \
+          \    client.download_file(s3_bucket, key, local_path)\n                count\
+          \ += 1\n\n    print(f\"Downloaded {count} PDFs to {out_dir}\")\n    from\
+          \ collections import namedtuple\n\n    return namedtuple(\"PDFOutput\",\
+          \ [\"pdf_dir\", \"num_files\"])(\n        pdf_dir=out_dir, num_files=count\n\
+          \    )\n\n"
+        image: python:3.13-slim
+    exec-log-training-metrics:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - log_training_metrics
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'mlflow==2.22.0'\
+          \ && \"$0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef log_training_metrics(\n    base_model: str,\n    train_loss:\
+          \ float,\n    eval_loss: float,\n    num_train: int,\n    num_val: int,\n\
+          \    num_pdfs: int,\n    lora_r: int,\n    lora_alpha: int,\n    learning_rate:\
+          \ float,\n    num_epochs: int,\n    repo_url: str,\n    mlflow_tracking_uri:\
+          \ str = \"http://mlflow.mlflow.svc.cluster.local:80\",\n    experiment_name:\
+          \ str = \"qlora-pdf-training\",\n):\n    \"\"\"Log the full training run\
+          \ to MLflow.\"\"\"\n    import mlflow\n\n    mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
+          \    mlflow.set_experiment(experiment_name)\n\n    with mlflow.start_run(run_name=f\"\
+          qlora-{base_model.split('/')[-1]}\"):\n        mlflow.log_params(\n    \
+          \        {\n                \"base_model\": base_model,\n              \
+          \  \"lora_r\": lora_r,\n                \"lora_alpha\": lora_alpha,\n  \
+          \              \"learning_rate\": learning_rate,\n                \"num_epochs\"\
+          : num_epochs,\n                \"num_pdfs\": num_pdfs,\n               \
+          \ \"data_source\": \"quobjects/training-data\",\n            }\n       \
+          \ )\n        mlflow.log_metrics(\n            {\n                \"train_loss\"\
+          : train_loss,\n                \"eval_loss\": eval_loss,\n             \
+          \   \"train_samples\": float(num_train),\n                \"val_samples\"\
+          : float(num_val),\n            }\n        )\n        mlflow.set_tag(\"adapter_repo\"\
+          , repo_url)\n\n"
+        image: python:3.13-slim
+    exec-prepare-training-data:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - prepare_training_data
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'pymupdf' &&\
+          \ \"$0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef prepare_training_data(\n    pdf_dir: str,\n    max_seq_length:\
+          \ int = 2048,\n    chunk_size: int = 512,\n    chunk_overlap: int = 64,\n\
+          ) -> NamedTuple(\"DataOutput\", [(\"dataset_path\", str), (\"num_train\"\
+          , int), (\"num_val\", int)]):\n    \"\"\"Extract text from PDFs, chunk it,\
+          \ and format as instruction-tuning pairs.\"\"\"\n    import json\n    import\
+          \ os\n    import fitz  # PyMuPDF\n\n    out_dir = \"/tmp/training_data\"\
+          \n    os.makedirs(out_dir, exist_ok=True)\n\n    # 1. Extract text from\
+          \ all PDFs\n    all_chunks: list[dict] = []\n    for fname in sorted(os.listdir(pdf_dir)):\n\
+          \        if not fname.lower().endswith(\".pdf\"):\n            continue\n\
+          \        path = os.path.join(pdf_dir, fname)\n        print(f\"Extracting:\
+          \ {fname}\")\n        try:\n            doc = fitz.open(path)\n        \
+          \    full_text = \"\"\n            for page in doc:\n                full_text\
+          \ += page.get_text() + \"\\n\"\n            doc.close()\n        except\
+          \ Exception as e:\n            print(f\"  SKIP ({e})\")\n            continue\n\
+          \n        # 2. Chunk text with overlap\n        words = full_text.split()\n\
+          \        for i in range(0, len(words), chunk_size - chunk_overlap):\n  \
+          \          chunk_words = words[i : i + chunk_size]\n            if len(chunk_words)\
+          \ < 50:\n                continue  # skip tiny trailing chunks\n       \
+          \     chunk_text = \" \".join(chunk_words)\n            all_chunks.append({\"\
+          text\": chunk_text, \"source\": fname})\n\n    print(f\"Total chunks: {len(all_chunks)}\"\
+          )\n    if not all_chunks:\n        raise ValueError(\"No text extracted\
+          \ from PDFs \u2014 check your bucket\")\n\n    # 3. Format as Llama 3 chat\
+          \ training pairs\n    #    We create self-supervised pairs: model learns\
+          \ to continue/explain the content\n    samples = []\n    for chunk in all_chunks:\n\
+          \        text = chunk[\"text\"]\n        source = chunk[\"source\"]\n  \
+          \      # Split chunk roughly in half for input/output\n        words = text.split()\n\
+          \        mid = len(words) // 2\n        context = \" \".join(words[:mid])\n\
+          \        continuation = \" \".join(words[mid:])\n\n        samples.append(\n\
+          \            {\n                \"messages\": [\n                    {\n\
+          \                        \"role\": \"system\",\n                       \
+          \ \"content\": (\n                            \"You are a knowledgeable\
+          \ assistant. \"\n                            \"Continue the information\
+          \ accurately and coherently.\"\n                        ),\n           \
+          \         },\n                    {\n                        \"role\": \"\
+          user\",\n                        \"content\": f\"Continue the following\
+          \ passage from {source}:\\n\\n{context}\",\n                    },\n   \
+          \                 {\"role\": \"assistant\", \"content\": continuation},\n\
+          \                ]\n            }\n        )\n\n    # 4. Train/val split\
+          \ (90/10)\n    import random\n\n    random.seed(42)\n    random.shuffle(samples)\n\
+          \    split = int(len(samples) * 0.9)\n    train = samples[:split]\n    val\
+          \ = samples[split:]\n\n    train_path = os.path.join(out_dir, \"train.json\"\
+          )\n    val_path = os.path.join(out_dir, \"val.json\")\n    with open(train_path,\
+          \ \"w\") as f:\n        json.dump(train, f)\n    with open(val_path, \"\
+          w\") as f:\n        json.dump(val, f)\n\n    print(f\"Train: {len(train)}\
+          \ samples, Val: {len(val)} samples\")\n    from collections import namedtuple\n\
+          \n    return namedtuple(\"DataOutput\", [\"dataset_path\", \"num_train\"\
+          , \"num_val\"])(\n        dataset_path=out_dir, num_train=len(train), num_val=len(val)\n\
+          \    )\n\n"
+        image: python:3.13-slim
+    exec-push-adapter-to-gitea:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - push_adapter_to_gitea
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'requests' &&\
+          \ \"$0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef push_adapter_to_gitea(\n    adapter_path: str,\n    gitea_url:\
+          \ str,\n    gitea_owner: str,\n    gitea_repo: str,\n    gitea_username:\
+          \ str,\n    gitea_password: str,\n    branch: str = \"main\",\n    commit_message:\
+          \ str = \"feat: add QLoRA adapter from PDF training pipeline\",\n) -> NamedTuple(\"\
+          PushOutput\", [(\"repo_url\", str), (\"files_pushed\", int)]):\n    \"\"\
+          \"Push the QLoRA adapter files to a Gitea repository via the API.\"\"\"\n\
+          \    import base64\n    import json\n    import os\n    import requests\n\
+          \n    api_base = f\"{gitea_url}/api/v1\"\n    auth = (gitea_username, gitea_password)\n\
+          \    repo_api = f\"{api_base}/repos/{gitea_owner}/{gitea_repo}\"\n\n   \
+          \ # Check if repo exists, create if not\n    resp = requests.get(repo_api,\
+          \ auth=auth, timeout=30)\n    if resp.status_code == 404:\n        print(f\"\
+          Creating repo {gitea_owner}/{gitea_repo} \u2026\")\n        create_resp\
+          \ = requests.post(\n            f\"{api_base}/orgs/{gitea_owner}/repos\"\
+          \n            if gitea_owner != gitea_username\n            else f\"{api_base}/user/repos\"\
+          ,\n            auth=auth,\n            json={\n                \"name\"\
+          : gitea_repo,\n                \"description\": \"QLoRA adapters trained\
+          \ from PDF documents\",\n                \"private\": False,\n         \
+          \       \"auto_init\": True,\n            },\n            timeout=30,\n\
+          \        )\n        create_resp.raise_for_status()\n        print(f\"Created:\
+          \ {create_resp.json().get('html_url')}\")\n\n    # Collect all adapter files\n\
+          \    files_to_push = []\n    for root, dirs, files in os.walk(adapter_path):\n\
+          \        for fname in files:\n            fpath = os.path.join(root, fname)\n\
+          \            rel_path = os.path.relpath(fpath, adapter_path)\n         \
+          \   with open(fpath, \"rb\") as f:\n                content = base64.b64encode(f.read()).decode(\"\
+          utf-8\")\n            files_to_push.append({\"path\": rel_path, \"content\"\
+          : content})\n\n    print(f\"Pushing {len(files_to_push)} files to {gitea_owner}/{gitea_repo}\"\
+          )\n\n    # Push each file via Gitea contents API\n    pushed = 0\n    for\
+          \ item in files_to_push:\n        file_api = f\"{repo_api}/contents/{item['path']}\"\
+          \n\n        # Check if file already exists (need SHA for update)\n     \
+          \   existing = requests.get(file_api, auth=auth, params={\"ref\": branch},\
+          \ timeout=30)\n        payload = {\n            \"message\": commit_message,\n\
+          \            \"content\": item[\"content\"],\n            \"branch\": branch,\n\
+          \        }\n        if existing.status_code == 200:\n            payload[\"\
+          sha\"] = existing.json()[\"sha\"]\n            resp = requests.put(file_api,\
+          \ auth=auth, json=payload, timeout=60)\n        else:\n            resp\
+          \ = requests.post(file_api, auth=auth, json=payload, timeout=60)\n\n   \
+          \     if resp.status_code in (200, 201):\n            pushed += 1\n    \
+          \        print(f\"  \u2713 {item['path']}\")\n        else:\n          \
+          \  print(f\"  \u2717 {item['path']}: {resp.status_code} {resp.text[:200]}\"\
+          )\n\n    repo_url = f\"{gitea_url}/{gitea_owner}/{gitea_repo}\"\n    print(f\"\
+          Pushed {pushed}/{len(files_to_push)} files to {repo_url}\")\n\n    from\
+          \ collections import namedtuple\n\n    return namedtuple(\"PushOutput\"\
+          , [\"repo_url\", \"files_pushed\"])(\n        repo_url=repo_url, files_pushed=pushed\n\
+          \    )\n\n"
+        image: python:3.13-slim
+    exec-train-qlora:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - train_qlora
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'torch' 'transformers'\
+          \ 'peft' 'datasets' 'accelerate' 'bitsandbytes' 'scipy' 'trl' && \"$0\"\
+          \ \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef train_qlora(\n    dataset_path: str,\n    base_model: str,\n\
+          \    learning_rate: float = 2e-4,\n    num_epochs: int = 3,\n    batch_size:\
+          \ int = 2,\n    gradient_accumulation_steps: int = 8,\n    max_seq_length:\
+          \ int = 2048,\n    lora_r: int = 64,\n    lora_alpha: int = 16,\n    lora_dropout:\
+          \ float = 0.05,\n) -> NamedTuple(\n    \"TrainOutput\",\n    [(\"adapter_path\"\
+          , str), (\"train_loss\", float), (\"eval_loss\", float)],\n):\n    \"\"\"\
+          QLoRA fine-tune Llama 3.1 70B with 4-bit NF4 quantization.\"\"\"\n    import\
+          \ json\n    import os\n\n    import torch\n    from datasets import Dataset\n\
+          \    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n\
+          \    from transformers import (\n        AutoModelForCausalLM,\n       \
+          \ AutoTokenizer,\n        BitsAndBytesConfig,\n        TrainingArguments,\n\
+          \    )\n    from trl import SFTTrainer\n\n    output_dir = \"/tmp/qlora_output\"\
+          \n    os.makedirs(output_dir, exist_ok=True)\n\n    # \u2500\u2500 Load\
+          \ data \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n    with open(os.path.join(dataset_path,\
+          \ \"train.json\")) as f:\n        train_data = json.load(f)\n    with open(os.path.join(dataset_path,\
+          \ \"val.json\")) as f:\n        val_data = json.load(f)\n\n    print(f\"\
+          Loaded {len(train_data)} train / {len(val_data)} val samples\")\n\n    #\
+          \ \u2500\u2500 Tokenizer \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n   \
+          \ print(f\"Loading tokenizer: {base_model}\")\n    tokenizer = AutoTokenizer.from_pretrained(base_model,\
+          \ trust_remote_code=True)\n    if tokenizer.pad_token is None:\n       \
+          \ tokenizer.pad_token = tokenizer.eos_token\n    tokenizer.padding_side\
+          \ = \"right\"\n\n    # \u2500\u2500 Format with chat template \u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\n    def format_chat(sample):\n        return {\"text\": tokenizer.apply_chat_template(\n\
+          \            sample[\"messages\"], tokenize=False, add_generation_prompt=False\n\
+          \        )}\n\n    train_ds = Dataset.from_list(train_data).map(format_chat)\n\
+          \    val_ds = Dataset.from_list(val_data).map(format_chat)\n\n    # \u2500\
+          \u2500 4-bit quantisation \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\n    bnb_config = BitsAndBytesConfig(\n        load_in_4bit=True,\n\
+          \        bnb_4bit_quant_type=\"nf4\",\n        bnb_4bit_compute_dtype=torch.bfloat16,\n\
+          \        bnb_4bit_use_double_quant=True,\n    )\n\n    print(f\"Loading\
+          \ model: {base_model} (4-bit NF4)\")\n    model = AutoModelForCausalLM.from_pretrained(\n\
+          \        base_model,\n        quantization_config=bnb_config,\n        device_map=\"\
+          auto\",\n        trust_remote_code=True,\n        torch_dtype=torch.bfloat16,\n\
+          \    )\n    model = prepare_model_for_kbit_training(model)\n\n    # \u2500\
+          \u2500 LoRA config \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n    lora_config = LoraConfig(\n\
+          \        r=lora_r,\n        lora_alpha=lora_alpha,\n        target_modules=[\n\
+          \            \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n         \
+          \   \"gate_proj\", \"up_proj\", \"down_proj\",\n        ],\n        lora_dropout=lora_dropout,\n\
+          \        bias=\"none\",\n        task_type=\"CAUSAL_LM\",\n    )\n\n   \
+          \ model = get_peft_model(model, lora_config)\n    model.print_trainable_parameters()\n\
+          \n    # \u2500\u2500 Training args \u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n    training_args = TrainingArguments(\n\
+          \        output_dir=os.path.join(output_dir, \"checkpoints\"),\n       \
+          \ num_train_epochs=num_epochs,\n        per_device_train_batch_size=batch_size,\n\
+          \        per_device_eval_batch_size=batch_size,\n        gradient_accumulation_steps=gradient_accumulation_steps,\n\
+          \        learning_rate=learning_rate,\n        bf16=True,\n        logging_steps=5,\n\
+          \        eval_strategy=\"steps\",\n        eval_steps=50,\n        save_strategy=\"\
+          steps\",\n        save_steps=100,\n        save_total_limit=2,\n       \
+          \ load_best_model_at_end=True,\n        metric_for_best_model=\"eval_loss\"\
+          ,\n        report_to=\"none\",\n        warmup_ratio=0.03,\n        lr_scheduler_type=\"\
+          cosine\",\n        optim=\"paged_adamw_8bit\",\n        max_grad_norm=0.3,\n\
+          \        group_by_length=True,\n    )\n\n    # \u2500\u2500 SFTTrainer \u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\n    trainer = SFTTrainer(\n        model=model,\n\
+          \        args=training_args,\n        train_dataset=train_ds,\n        eval_dataset=val_ds,\n\
+          \        tokenizer=tokenizer,\n        max_seq_length=max_seq_length,\n\
+          \        dataset_text_field=\"text\",\n        packing=True,  # pack short\
+          \ samples for efficiency\n    )\n\n    print(\"Starting QLoRA training \u2026\
+          \")\n    result = trainer.train()\n    train_loss = result.training_loss\n\
+          \n    eval_result = trainer.evaluate()\n    eval_loss = eval_result.get(\"\
+          eval_loss\", 0.0)\n\n    print(f\"Train loss: {train_loss:.4f}, Eval loss:\
+          \ {eval_loss:.4f}\")\n\n    # \u2500\u2500 Save adapter \u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\
+          \u2500\n    adapter_path = os.path.join(output_dir, \"adapter\")\n    model.save_pretrained(adapter_path)\n\
+          \    tokenizer.save_pretrained(adapter_path)\n\n    metadata = {\n     \
+          \   \"base_model\": base_model,\n        \"lora_r\": lora_r,\n        \"\
+          lora_alpha\": lora_alpha,\n        \"lora_dropout\": lora_dropout,\n   \
+          \     \"learning_rate\": learning_rate,\n        \"num_epochs\": num_epochs,\n\
+          \        \"batch_size\": batch_size,\n        \"gradient_accumulation_steps\"\
+          : gradient_accumulation_steps,\n        \"max_seq_length\": max_seq_length,\n\
+          \        \"train_samples\": len(train_data),\n        \"val_samples\": len(val_data),\n\
+          \        \"train_loss\": train_loss,\n        \"eval_loss\": eval_loss,\n\
+          \    }\n    with open(os.path.join(adapter_path, \"training_metadata.json\"\
+          ), \"w\") as f:\n        json.dump(metadata, f, indent=2)\n\n    print(f\"\
+          Adapter saved to {adapter_path}\")\n\n    from collections import namedtuple\n\
+          \n    return namedtuple(\"TrainOutput\", [\"adapter_path\", \"train_loss\"\
+          , \"eval_loss\"])(\n        adapter_path=adapter_path,\n        train_loss=train_loss,\n\
+          \        eval_loss=eval_loss,\n    )\n\n"
+        image: python:3.13-slim
+        resources:
+          accelerator:
+            resourceCount: '1'
+            resourceType: gpu
+pipelineInfo:
+  description: Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects training-data
+    bucket. Pushes the adapter to Gitea and logs metrics to MLflow.
+  name: qlora-pdf-fine-tuning
+root:
+  dag:
+    tasks:
+      evaluate-adapter:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-evaluate-adapter
+        dependentTasks:
+        - train-qlora
+        inputs:
+          parameters:
+            adapter_path:
+              taskOutputParameter:
+                outputParameterKey: adapter_path
+                producerTask: train-qlora
+            base_model:
+              componentInputParameter: base_model
+        taskInfo:
+          name: evaluate-adapter
+      fetch-pdfs-from-s3:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-fetch-pdfs-from-s3
+        inputs:
+          parameters:
+            aws_access_key_id:
+              componentInputParameter: aws_access_key_id
+            aws_secret_access_key:
+              componentInputParameter: aws_secret_access_key
+            s3_bucket:
+              componentInputParameter: s3_bucket
+            s3_endpoint:
+              componentInputParameter: s3_endpoint
+            s3_prefix:
+              componentInputParameter: s3_prefix
+        taskInfo:
+          name: fetch-pdfs-from-s3
+      log-training-metrics:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-log-training-metrics
+        dependentTasks:
+        - fetch-pdfs-from-s3
+        - prepare-training-data
+        - push-adapter-to-gitea
+        - train-qlora
+        inputs:
+          parameters:
+            base_model:
+              componentInputParameter: base_model
+            eval_loss:
+              taskOutputParameter:
+                outputParameterKey: eval_loss
+                producerTask: train-qlora
+            learning_rate:
+              componentInputParameter: learning_rate
+            lora_alpha:
+              componentInputParameter: lora_alpha
+            lora_r:
+              componentInputParameter: lora_r
+            mlflow_tracking_uri:
+              componentInputParameter: mlflow_tracking_uri
+            num_epochs:
+              componentInputParameter: num_epochs
+            num_pdfs:
+              taskOutputParameter:
+                outputParameterKey: num_files
+                producerTask: fetch-pdfs-from-s3
+            num_train:
+              taskOutputParameter:
+                outputParameterKey: num_train
+                producerTask: prepare-training-data
+            num_val:
+              taskOutputParameter:
+                outputParameterKey: num_val
+                producerTask: prepare-training-data
+            repo_url:
+              taskOutputParameter:
+                outputParameterKey: repo_url
+                producerTask: push-adapter-to-gitea
+            train_loss:
+              taskOutputParameter:
+                outputParameterKey: train_loss
+                producerTask: train-qlora
+        taskInfo:
+          name: log-training-metrics
+      prepare-training-data:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-prepare-training-data
+        dependentTasks:
+        - fetch-pdfs-from-s3
+        inputs:
+          parameters:
+            chunk_overlap:
+              componentInputParameter: chunk_overlap
+            chunk_size:
+              componentInputParameter: chunk_size
+            max_seq_length:
+              componentInputParameter: max_seq_length
+            pdf_dir:
+              taskOutputParameter:
+                outputParameterKey: pdf_dir
+                producerTask: fetch-pdfs-from-s3
+        taskInfo:
+          name: prepare-training-data
+      push-adapter-to-gitea:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-push-adapter-to-gitea
+        dependentTasks:
+        - train-qlora
+        inputs:
+          parameters:
+            adapter_path:
+              taskOutputParameter:
+                outputParameterKey: adapter_path
+                producerTask: train-qlora
+            gitea_owner:
+              componentInputParameter: gitea_owner
+            gitea_password:
+              componentInputParameter: gitea_password
+            gitea_repo:
+              componentInputParameter: gitea_repo
+            gitea_url:
+              componentInputParameter: gitea_url
+            gitea_username:
+              componentInputParameter: gitea_username
+        taskInfo:
+          name: push-adapter-to-gitea
+      train-qlora:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-train-qlora
+        dependentTasks:
+        - prepare-training-data
+        inputs:
+          parameters:
+            base_model:
+              componentInputParameter: base_model
+            batch_size:
+              componentInputParameter: batch_size
+            dataset_path:
+              taskOutputParameter:
+                outputParameterKey: dataset_path
+                producerTask: prepare-training-data
+            gradient_accumulation_steps:
+              componentInputParameter: gradient_accumulation_steps
+            learning_rate:
+              componentInputParameter: learning_rate
+            lora_alpha:
+              componentInputParameter: lora_alpha
+            lora_dropout:
+              componentInputParameter: lora_dropout
+            lora_r:
+              componentInputParameter: lora_r
+            max_seq_length:
+              componentInputParameter: max_seq_length
+            num_epochs:
+              componentInputParameter: num_epochs
+        taskInfo:
+          name: train-qlora
+  inputDefinitions:
+    parameters:
+      aws_access_key_id:
+        defaultValue: ''
+        isOptional: true
+        parameterType: STRING
+      aws_secret_access_key:
+        defaultValue: ''
+        isOptional: true
+        parameterType: STRING
+      base_model:
+        defaultValue: meta-llama/Llama-3.1-70B-Instruct
+        isOptional: true
+        parameterType: STRING
+      batch_size:
+        defaultValue: 2.0
+        isOptional: true
+        parameterType: NUMBER_INTEGER
+      chunk_overlap:
+        defaultValue: 64.0
+        isOptional: true
+        parameterType: NUMBER_INTEGER
+      chunk_size:
+        defaultValue: 512.0
+        isOptional: true
+        parameterType: NUMBER_INTEGER
+      gitea_owner:
+        defaultValue: daviestechlabs
+        isOptional: true
+        parameterType: STRING
+      gitea_password:
+        defaultValue: ''
+        isOptional: true
+        parameterType: STRING
+      gitea_repo:
+        defaultValue: qlora-adapters
+        isOptional: true
+        parameterType: STRING
+      gitea_url:
+        defaultValue: http://gitea-http.gitea.svc.cluster.local:3000
+        isOptional: true
+        parameterType: STRING
+      gitea_username:
+        defaultValue: ''
+        isOptional: true
+        parameterType: STRING
+      gradient_accumulation_steps:
+        defaultValue: 8.0
+        isOptional: true
+        parameterType: NUMBER_INTEGER
+      learning_rate:
+        defaultValue: 0.0002
+        isOptional: true
+        parameterType: NUMBER_DOUBLE
+      lora_alpha:
+        defaultValue: 16.0
+        isOptional: true
+        parameterType: NUMBER_INTEGER
+      lora_dropout:
+        defaultValue: 0.05
+        isOptional: true
+        parameterType: NUMBER_DOUBLE
+      lora_r:
+        defaultValue: 64.0
+        isOptional: true
+        parameterType: NUMBER_INTEGER
+      max_seq_length:
+        defaultValue: 2048.0
+        isOptional: true
+        parameterType: NUMBER_INTEGER
+      mlflow_tracking_uri:
+        defaultValue: http://mlflow.mlflow.svc.cluster.local:80
+        isOptional: true
+        parameterType: STRING
+      num_epochs:
+        defaultValue: 3.0
+        isOptional: true
+        parameterType: NUMBER_INTEGER
+      s3_bucket:
+        defaultValue: training-data
+        isOptional: true
+        parameterType: STRING
+      s3_endpoint:
+        defaultValue: candlekeep.lab.daviestechlabs.io
+        isOptional: true
+        parameterType: STRING
+      s3_prefix:
+        defaultValue: ''
+        isOptional: true
+        parameterType: STRING
+schemaVersion: 2.1.0
+sdkVersion: kfp-2.12.1