pipelines go to gravenhollow now.
This commit is contained in:
@@ -7,7 +7,7 @@ distributed across multiple cluster nodes via KubeRay RayJob.
|
|||||||
GPUs remain 100 % dedicated to inference serving.
|
GPUs remain 100 % dedicated to inference serving.
|
||||||
|
|
||||||
Architecture:
|
Architecture:
|
||||||
1. Fetch PDFs from Quobjects S3
|
1. Fetch PDFs from RustFS S3
|
||||||
2. Prepare instruction-tuning dataset
|
2. Prepare instruction-tuning dataset
|
||||||
3. Upload prepared data to S3 (shared storage for Ray workers)
|
3. Upload prepared data to S3 (shared storage for Ray workers)
|
||||||
4. Submit a KubeRay RayJob that runs Ray Train TorchTrainer
|
4. Submit a KubeRay RayJob that runs Ray Train TorchTrainer
|
||||||
@@ -41,7 +41,7 @@ from typing import NamedTuple
|
|||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────
|
||||||
# 1. Fetch PDFs from Quobjects S3
|
# 1. Fetch PDFs from RustFS S3
|
||||||
# ──────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────
|
||||||
@dsl.component(
|
@dsl.component(
|
||||||
base_image="python:3.13-slim",
|
base_image="python:3.13-slim",
|
||||||
@@ -54,7 +54,7 @@ def fetch_pdfs_from_s3(
|
|||||||
aws_access_key_id: str,
|
aws_access_key_id: str,
|
||||||
aws_secret_access_key: str,
|
aws_secret_access_key: str,
|
||||||
) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]):
|
) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]):
|
||||||
"""Download all PDFs from a Quobjects S3 bucket."""
|
"""Download all PDFs from an S3 bucket."""
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import boto3
|
import boto3
|
||||||
@@ -994,7 +994,7 @@ def log_training_metrics(
|
|||||||
"num_epochs": num_epochs,
|
"num_epochs": num_epochs,
|
||||||
"num_pdfs": num_pdfs,
|
"num_pdfs": num_pdfs,
|
||||||
"backend": "ray-train-gloo",
|
"backend": "ray-train-gloo",
|
||||||
"data_source": "quobjects/training-data",
|
"data_source": "rustfs/training-data",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
mlflow.log_metrics(
|
mlflow.log_metrics(
|
||||||
@@ -1023,7 +1023,7 @@ def log_training_metrics(
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
def cpu_training_pipeline(
|
def cpu_training_pipeline(
|
||||||
# ── S3 / Quobjects ──
|
# ── S3 / RustFS ──
|
||||||
s3_endpoint: str = "https://gravenhollow.lab.daviestechlabs.io:30292",
|
s3_endpoint: str = "https://gravenhollow.lab.daviestechlabs.io:30292",
|
||||||
s3_bucket: str = "training-data",
|
s3_bucket: str = "training-data",
|
||||||
s3_prefix: str = "",
|
s3_prefix: str = "",
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
QLoRA Fine-Tuning Pipeline – Kubeflow Pipelines SDK
|
QLoRA Fine-Tuning Pipeline – Kubeflow Pipelines SDK
|
||||||
|
|
||||||
Fetches PDFs from a Quobjects S3 bucket, extracts instruction-tuning
|
Fetches PDFs from a RustFS S3 bucket, extracts instruction-tuning
|
||||||
data, trains a QLoRA adapter on the Llama 3.1 70B base model using
|
data, trains a QLoRA adapter on the Llama 3.1 70B base model using
|
||||||
the Strix Halo's 128 GB unified memory, evaluates it, and pushes the
|
the Strix Halo's 128 GB unified memory, evaluates it, and pushes the
|
||||||
adapter weights to a Gitea repository.
|
adapter weights to a Gitea repository.
|
||||||
@@ -23,7 +23,7 @@ from typing import NamedTuple
|
|||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────
|
||||||
# 1. Fetch PDFs from Quobjects S3
|
# 1. Fetch PDFs from RustFS S3
|
||||||
# ──────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────
|
||||||
@dsl.component(
|
@dsl.component(
|
||||||
base_image="python:3.13-slim",
|
base_image="python:3.13-slim",
|
||||||
@@ -36,7 +36,7 @@ def fetch_pdfs_from_s3(
|
|||||||
aws_access_key_id: str,
|
aws_access_key_id: str,
|
||||||
aws_secret_access_key: str,
|
aws_secret_access_key: str,
|
||||||
) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]):
|
) -> NamedTuple("PDFOutput", [("pdf_dir", str), ("num_files", int)]):
|
||||||
"""Download all PDFs from a Quobjects S3 bucket."""
|
"""Download all PDFs from an S3 bucket."""
|
||||||
import os
|
import os
|
||||||
import boto3
|
import boto3
|
||||||
from botocore.client import Config
|
from botocore.client import Config
|
||||||
@@ -571,7 +571,7 @@ def log_training_metrics(
|
|||||||
"learning_rate": learning_rate,
|
"learning_rate": learning_rate,
|
||||||
"num_epochs": num_epochs,
|
"num_epochs": num_epochs,
|
||||||
"num_pdfs": num_pdfs,
|
"num_pdfs": num_pdfs,
|
||||||
"data_source": "quobjects/training-data",
|
"data_source": "rustfs/training-data",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
mlflow.log_metrics(
|
mlflow.log_metrics(
|
||||||
@@ -591,13 +591,13 @@ def log_training_metrics(
|
|||||||
@dsl.pipeline(
|
@dsl.pipeline(
|
||||||
name="QLoRA PDF Fine-Tuning",
|
name="QLoRA PDF Fine-Tuning",
|
||||||
description=(
|
description=(
|
||||||
"Fine-tune Llama 3.1 70B via QLoRA on PDFs from the Quobjects "
|
"Fine-tune Llama 3.1 70B via QLoRA on PDFs from the RustFS "
|
||||||
"training-data bucket. Pushes the adapter to Gitea and logs "
|
"training-data bucket. Pushes the adapter to Gitea and logs "
|
||||||
"metrics to MLflow."
|
"metrics to MLflow."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
def qlora_pdf_pipeline(
|
def qlora_pdf_pipeline(
|
||||||
# ── S3 / Quobjects ──
|
# ── S3 / RustFS ──
|
||||||
s3_endpoint: str = "https://gravenhollow.lab.daviestechlabs.io:30292",
|
s3_endpoint: str = "https://gravenhollow.lab.daviestechlabs.io:30292",
|
||||||
s3_bucket: str = "training-data",
|
s3_bucket: str = "training-data",
|
||||||
s3_prefix: str = "",
|
s3_prefix: str = "",
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ def transcribe_and_diarise(
|
|||||||
s3_key: str,
|
s3_key: str,
|
||||||
whisper_url: str = "http://ai-inference-serve-svc.kuberay.svc.cluster.local:8000/whisper",
|
whisper_url: str = "http://ai-inference-serve-svc.kuberay.svc.cluster.local:8000/whisper",
|
||||||
) -> NamedTuple("TranscriptOutput", [("transcript_json", str), ("speakers", str), ("audio_path", str)]):
|
) -> NamedTuple("TranscriptOutput", [("transcript_json", str), ("speakers", str), ("audio_path", str)]):
|
||||||
"""Download audio from Quobjects S3, transcribe via Whisper with timestamps."""
|
"""Download audio from S3, transcribe via Whisper with timestamps."""
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
@@ -41,7 +41,7 @@ def transcribe_and_diarise(
|
|||||||
out = NamedTuple("TranscriptOutput", [("transcript_json", str), ("speakers", str), ("audio_path", str)])
|
out = NamedTuple("TranscriptOutput", [("transcript_json", str), ("speakers", str), ("audio_path", str)])
|
||||||
work = tempfile.mkdtemp()
|
work = tempfile.mkdtemp()
|
||||||
|
|
||||||
# ── Download audio from Quobjects S3 ─────────────────────
|
# ── Download audio from S3 ─────────────────────
|
||||||
ext = os.path.splitext(s3_key)[-1] or ".wav"
|
ext = os.path.splitext(s3_key)[-1] or ".wav"
|
||||||
audio_path = os.path.join(work, f"audio_raw{ext}")
|
audio_path = os.path.join(work, f"audio_raw{ext}")
|
||||||
|
|
||||||
@@ -609,7 +609,7 @@ def voice_cloning_pipeline(
|
|||||||
# MLflow
|
# MLflow
|
||||||
mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
|
mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
|
||||||
):
|
):
|
||||||
# 1 - Download from Quobjects S3 and transcribe with Whisper
|
# 1 - Download from S3 and transcribe with Whisper
|
||||||
transcribed = transcribe_and_diarise(
|
transcribed = transcribe_and_diarise(
|
||||||
s3_endpoint=s3_endpoint,
|
s3_endpoint=s3_endpoint,
|
||||||
s3_bucket=s3_bucket,
|
s3_bucket=s3_bucket,
|
||||||
|
|||||||
Reference in New Issue
Block a user