Pipelines go to gravenhollow now.
All checks were successful
Compile and Upload Pipelines / Compile & Upload (push) Successful in 15s
Compile and Upload Pipelines / Notify (push) Successful in 1s

This commit is contained in:
2026-02-18 07:14:12 -05:00
parent 7f2b011c95
commit d4eb54d92b
3 changed files with 14 additions and 14 deletions

View File

@@ -7,7 +7,7 @@ distributed across multiple cluster nodes via KubeRay RayJob.
GPUs remain 100 % dedicated to inference serving. GPUs remain 100 % dedicated to inference serving.
Architecture: Architecture:
1. Fetch PDFs from Quobjects S3 1. Fetch PDFs from RustFS S3
2. Prepare instruction-tuning dataset 2. Prepare instruction-tuning dataset
3. Upload prepared data to S3 (shared storage for Ray workers) 3. Upload prepared data to S3 (shared storage for Ray workers)
4. Submit a KubeRay RayJob that runs Ray Train TorchTrainer 4. Submit a KubeRay RayJob that runs Ray Train TorchTrainer
@@ -41,7 +41,7 @@ from typing import NamedTuple
# ────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────
# 1. Fetch PDFs from Quobjects S3 # 1. Fetch PDFs from RustFS S3
# ────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────
@dsl.component( @dsl.component(
base_image="python:3.13-slim", base_image="python:3.13-slim",
@@ -54,7 +54,7 @@ def fetch_pdfs_from_s3(
aws_access_key_id: str, aws_access_key_id: str,
aws_secret_access_key: str, aws_secret_access_key: str,
) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]): ) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]):
"""Download all PDFs from a Quobjects S3 bucket.""" """Download all PDFs from an S3 bucket."""
import os import os
import boto3 import boto3
@@ -994,7 +994,7 @@ def log_training_metrics(
"num_epochs": num_epochs, "num_epochs": num_epochs,
"num_pdfs": num_pdfs, "num_pdfs": num_pdfs,
"backend": "ray-train-gloo", "backend": "ray-train-gloo",
"data_source": "quobjects/training-data", "data_source": "rustfs/training-data",
} }
) )
mlflow.log_metrics( mlflow.log_metrics(
@@ -1023,7 +1023,7 @@ def log_training_metrics(
), ),
) )
def cpu_training_pipeline( def cpu_training_pipeline(
# ── S3 / Quobjects ── # ── S3 / RustFS ──
s3_endpoint: str = "https://gravenhollow.lab.daviestechlabs.io:30292", s3_endpoint: str = "https://gravenhollow.lab.daviestechlabs.io:30292",
s3_bucket: str = "training-data", s3_bucket: str = "training-data",
s3_prefix: str = "", s3_prefix: str = "",

View File

@@ -2,7 +2,7 @@
""" """
QLoRA Fine-Tuning Pipeline Kubeflow Pipelines SDK QLoRA Fine-Tuning Pipeline Kubeflow Pipelines SDK
Fetches PDFs from a Quobjects S3 bucket, extracts instruction-tuning Fetches PDFs from a RustFS S3 bucket, extracts instruction-tuning
data, trains a QLoRA adapter on the Llama 3.1 70B base model using data, trains a QLoRA adapter on the Llama 3.1 70B base model using
the Strix Halo's 128 GB unified memory, evaluates it, and pushes the the Strix Halo's 128 GB unified memory, evaluates it, and pushes the
adapter weights to a Gitea repository. adapter weights to a Gitea repository.
@@ -23,7 +23,7 @@ from typing import NamedTuple
# ────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────
# 1. Fetch PDFs from Quobjects S3 # 1. Fetch PDFs from RustFS S3
# ────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────
@dsl.component( @dsl.component(
base_image="python:3.13-slim", base_image="python:3.13-slim",
@@ -36,7 +36,7 @@ def fetch_pdfs_from_s3(
aws_access_key_id: str, aws_access_key_id: str,
aws_secret_access_key: str, aws_secret_access_key: str,
) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]): ) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]):
"""Download all PDFs from a Quobjects S3 bucket.""" """Download all PDFs from an S3 bucket."""
import os import os
import boto3 import boto3
from botocore.client import Config from botocore.client import Config
@@ -571,7 +571,7 @@ def log_training_metrics(
"learning_rate": learning_rate, "learning_rate": learning_rate,
"num_epochs": num_epochs, "num_epochs": num_epochs,
"num_pdfs": num_pdfs, "num_pdfs": num_pdfs,
"data_source": "quobjects/training-data", "data_source": "rustfs/training-data",
} }
) )
mlflow.log_metrics( mlflow.log_metrics(
@@ -591,13 +591,13 @@ def log_training_metrics(
@dsl.pipeline( @dsl.pipeline(
name="QLoRA PDF Fine-Tuning", name="QLoRA PDF Fine-Tuning",
description=( description=(
"Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects " "Fine-tune Llama 3.1 70B via QLoRA on PDFs from the RustFS "
"training-data bucket. Pushes the adapter to Gitea and logs " "training-data bucket. Pushes the adapter to Gitea and logs "
"metrics to MLflow." "metrics to MLflow."
), ),
) )
def qlora_pdf_pipeline( def qlora_pdf_pipeline(
# ── S3 / Quobjects ── # ── S3 / RustFS ──
s3_endpoint: str = "https://gravenhollow.lab.daviestechlabs.io:30292", s3_endpoint: str = "https://gravenhollow.lab.daviestechlabs.io:30292",
s3_bucket: str = "training-data", s3_bucket: str = "training-data",
s3_prefix: str = "", s3_prefix: str = "",

View File

@@ -29,7 +29,7 @@ def transcribe_and_diarise(
s3_key: str, s3_key: str,
whisper_url: str = "http://ai-inference-serve-svc.kuberay.svc.cluster.local:8000/whisper", whisper_url: str = "http://ai-inference-serve-svc.kuberay.svc.cluster.local:8000/whisper",
) -> NamedTuple("TranscriptOutput", [("transcript_json", str), ("speakers", str), ("audio_path", str)]): ) -> NamedTuple("TranscriptOutput", [("transcript_json", str), ("speakers", str), ("audio_path", str)]):
"""Download audio from Quobjects S3, transcribe via Whisper with timestamps.""" """Download audio from S3, transcribe via Whisper with timestamps."""
import json import json
import os import os
import subprocess import subprocess
@@ -41,7 +41,7 @@ def transcribe_and_diarise(
out = NamedTuple("TranscriptOutput", [("transcript_json", str), ("speakers", str), ("audio_path", str)]) out = NamedTuple("TranscriptOutput", [("transcript_json", str), ("speakers", str), ("audio_path", str)])
work = tempfile.mkdtemp() work = tempfile.mkdtemp()
# ── Download audio from Quobjects S3 ───────────────────── # ── Download audio from S3 ─────────────────────
ext = os.path.splitext(s3_key)[-1] or ".wav" ext = os.path.splitext(s3_key)[-1] or ".wav"
audio_path = os.path.join(work, f"audio_raw{ext}") audio_path = os.path.join(work, f"audio_raw{ext}")
@@ -609,7 +609,7 @@ def voice_cloning_pipeline(
# MLflow # MLflow
mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80", mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
): ):
# 1 - Download from Quobjects S3 and transcribe with Whisper # 1 - Download from S3 and transcribe with Whisper
transcribed = transcribe_and_diarise( transcribed = transcribe_and_diarise(
s3_endpoint=s3_endpoint, s3_endpoint=s3_endpoint,
s3_bucket=s3_bucket, s3_bucket=s3_bucket,