feat: Add Kubeflow Pipeline definitions

- voice_pipeline: STT → RAG → LLM → TTS
- document_ingestion_pipeline: Extract → Chunk → Embed → Milvus
- document_ingestion_mlflow_pipeline: With MLflow tracking
- evaluation_pipeline: Model benchmarking
- kfp-sync-job: K8s job to sync pipelines
This commit is contained in:
2026-02-01 20:41:13 -05:00
parent c36655b570
commit c26e4e5ef0
6 changed files with 1690 additions and 1 deletions

347
kfp-sync-job.yaml Normal file
View File

@@ -0,0 +1,347 @@
# KFP Pipeline Sync Job
# Compiles the Python pipeline definitions and uploads them to Kubeflow.
# Runs as a CronJob so pipelines stay in sync with Git.
---
# RBAC: let the sync pods read GitRepository status, which carries the
# Flux source-controller artifact URL the script downloads from.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
rules:
  - apiGroups: ["source.toolkit.fluxcd.io"]
    resources: ["gitrepositories"]
    verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
subjects:
  - kind: ServiceAccount
    name: pipeline-bridge
    namespace: ai-ml
roleRef:
  kind: Role
  name: kfp-sync-flux-reader
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kfp-pipeline-sync
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
spec:
  # Run every 30 minutes
  schedule: "*/30 * * * *"
  # Never start a sync while a previous one is still running
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        metadata:
          labels:
            app: kfp-pipeline-sync
        spec:
          schedulerName: volcano
          restartPolicy: OnFailure
          serviceAccountName: pipeline-bridge
          containers:
            - name: sync
              image: python:3.13-slim
              command:
                - python
                - /scripts/sync_pipelines.py
              env:
                - name: KUBEFLOW_HOST
                  value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
                - name: GIT_REPO_NAME
                  value: "llm-workflows"
                - name: GIT_REPO_NAMESPACE
                  value: "ai-ml"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
              resources:
                requests:
                  cpu: 100m
                  memory: 256Mi
                limits:
                  cpu: 500m
                  memory: 512Mi
          volumes:
            - name: scripts
              configMap:
                name: kfp-sync-scripts
---
# Manual trigger job (run this to sync immediately)
apiVersion: batch/v1
kind: Job
metadata:
  name: kfp-pipeline-sync-manual
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
  annotations:
    description: "Delete and recreate to manually trigger sync"
spec:
  backoffLimit: 1
  template:
    metadata:
      labels:
        app: kfp-pipeline-sync
    spec:
      schedulerName: volcano
      restartPolicy: Never
      serviceAccountName: pipeline-bridge
      containers:
        - name: sync
          image: python:3.13-slim
          command:
            - python
            - /scripts/sync_pipelines.py
          env:
            - name: KUBEFLOW_HOST
              value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
            - name: GIT_REPO_NAME
              value: "llm-workflows"
            - name: GIT_REPO_NAMESPACE
              value: "ai-ml"
          volumeMounts:
            - name: scripts
              mountPath: /scripts
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        - name: scripts
          configMap:
            name: kfp-sync-scripts
---
# ConfigMap holding the sync script; mounted at /scripts by both the
# CronJob and the manual Job defined in this file.
apiVersion: v1
kind: ConfigMap
metadata:
name: kfp-sync-scripts
namespace: ai-ml
data:
sync_pipelines.py: |
#!/usr/bin/env python3
"""
KFP Pipeline Sync
Compiles Python pipeline definitions and uploads to Kubeflow Pipelines.
Downloads from Flux source-controller artifact for secure access.
"""
import os
import sys
import subprocess
import tempfile
import tarfile
import hashlib
from pathlib import Path
from datetime import datetime
# Install KFP and kubernetes client
# NOTE(review): installing at runtime keeps the pod on a stock
# python:3.13-slim image, at the cost of startup time and a PyPI
# dependency on every run.
subprocess.check_call([
sys.executable, "-m", "pip", "install", "-q",
"kfp==2.12.1", "httpx", "kubernetes"
])
# These imports require the packages installed just above, so they
# must stay below the pip invocation.
from kfp import Client
from kfp import compiler
import httpx
from kubernetes import client as k8s_client, config as k8s_config
# In-cluster KFP API endpoint (ml-pipeline service in the kubeflow namespace).
KUBEFLOW_HOST = os.environ.get("KUBEFLOW_HOST", "http://ml-pipeline.kubeflow.svc.cluster.local:8888")
# Optional local fallback directory of pipeline sources (used when no
# Flux artifact URL can be resolved).
PIPELINES_DIR = os.environ.get("PIPELINES_DIR", "/pipelines")
# GitRepository to get artifact from
GIT_REPO_NAME = os.environ.get("GIT_REPO_NAME", "llm-workflows")
GIT_REPO_NAMESPACE = os.environ.get("GIT_REPO_NAMESPACE", "ai-ml")
def get_flux_artifact_url() -> str:
    """Return the artifact tarball URL from the Flux GitRepository status.

    Best-effort: any failure (not running in-cluster, missing object,
    RBAC denial) is logged and an empty string returned so the caller
    can fall back to a local pipeline directory.
    """
    try:
        k8s_config.load_incluster_config()
        custom_api = k8s_client.CustomObjectsApi()
        repo = custom_api.get_namespaced_custom_object(
            group="source.toolkit.fluxcd.io",
            version="v1",
            namespace=GIT_REPO_NAMESPACE,
            plural="gitrepositories",
            name=GIT_REPO_NAME,
        )
    except Exception as e:
        print(f"Error getting GitRepository: {e}")
        return ""
    # status.artifact.url is only present once source-controller has
    # produced an artifact; default to "" otherwise.
    status = repo.get("status", {})
    return status.get("artifact", {}).get("url", "")
def get_file_hash(filepath: str) -> str:
    """Return the MD5 hex digest of *filepath* for change detection.

    Reads the file in fixed-size chunks so large compiled pipeline
    packages are not loaded into memory at once (the original read the
    whole file in one call). MD5 is used here only as a fast,
    non-cryptographic fingerprint.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
def compile_pipeline(py_file: Path, output_dir: Path) -> Path:
"""Compile a Python pipeline file to YAML."""
output_file = output_dir / f"{py_file.stem}.yaml"
# Execute the pipeline file to compile it
result = subprocess.run(
[sys.executable, str(py_file)],
cwd=str(py_file.parent),
capture_output=True,
text=True
)
if result.returncode != 0:
print(f"Warning: Failed to compile {py_file}: {result.stderr}")
return None
# Check if YAML was generated
generated = py_file.parent / f"{py_file.stem}.yaml"
if generated.exists():
generated.rename(output_file)
return output_file
return None
def upload_pipeline(client: Client, yaml_path: Path, pipeline_name: str) -> bool:
    """Upload a compiled pipeline, creating it or adding a new version.

    Matches the pipeline by display name client-side (the KFP v2 filter
    syntax is awkward), then either uploads a timestamped new version or
    creates the pipeline from scratch.

    Returns True on success, False on any API error — errors are logged
    rather than raised so one bad pipeline does not abort the whole sync.
    """
    try:
        # Walk every page of results: the original single
        # list_pipelines(page_size=100) call silently missed any
        # pipeline beyond the first 100.
        existing = None
        page_token = ""
        while existing is None:
            resp = client.list_pipelines(page_size=100, page_token=page_token)
            for p in resp.pipelines or []:
                if p.display_name == pipeline_name:
                    existing = p
                    break
            page_token = resp.next_page_token
            if not page_token:
                break
        if existing:
            # Update existing pipeline by attaching a new version
            print(f"Updating existing pipeline: {pipeline_name}")
            version_name = f"v{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            client.upload_pipeline_version(
                pipeline_package_path=str(yaml_path),
                pipeline_version_name=version_name,
                pipeline_id=existing.pipeline_id
            )
            print(f"Created version {version_name}")
        else:
            # Create new pipeline
            print(f"Creating new pipeline: {pipeline_name}")
            client.upload_pipeline(
                pipeline_package_path=str(yaml_path),
                pipeline_name=pipeline_name
            )
        return True
    except Exception as e:
        print(f"Error uploading {pipeline_name}: {e}")
        return False
def sync_from_flux_artifact(client: Client, artifact_url: str):
    """Download the Flux source artifact and sync its pipelines/ directory.

    The artifact is a tar.gz served by the in-cluster Flux
    source-controller; it is fetched into a temporary directory,
    extracted, and the contained ``pipelines`` directory handed to
    sync_directory(). Everything is cleaned up when the context exits.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tarball = Path(tmpdir) / "source.tar.gz"
        extract_dir = Path(tmpdir) / "source"
        extract_dir.mkdir()
        # Download artifact from Flux source-controller
        print(f"Downloading from Flux: {artifact_url}")
        response = httpx.get(artifact_url, follow_redirects=True, timeout=60.0)
        response.raise_for_status()
        tarball.write_bytes(response.content)
        # filter="data" (Python 3.12+, and the job runs python:3.13)
        # rejects absolute paths, ".." escapes, and special members —
        # guarding extraction against tar path traversal, which the
        # bare extractall() call allowed.
        with tarfile.open(tarball, 'r:gz') as tar:
            tar.extractall(extract_dir, filter="data")
        pipelines_dir = extract_dir / "pipelines"
        if not pipelines_dir.exists():
            print("No pipelines directory found in artifact")
            return
        sync_directory(client, pipelines_dir)
def sync_directory(client: Client, pipelines_dir: Path):
    """Compile and upload every pipeline definition found in *pipelines_dir*."""
    compiled_dir = Path("/tmp/compiled")
    compiled_dir.mkdir(exist_ok=True)
    # Prefer files following the pipeline naming conventions; fall back
    # to every .py file in the directory when none match.
    candidates = [
        *pipelines_dir.glob("*_pipeline.py"),
        *pipelines_dir.glob("*.pipeline.py"),
    ]
    if not candidates:
        candidates = list(pipelines_dir.glob("*.py"))
    print(f"Found {len(candidates)} pipeline files")
    for source in candidates:
        print(f"\nProcessing: {source.name}")
        compiled = compile_pipeline(source, compiled_dir)
        if not compiled:
            # Compilation failed or produced nothing — a pre-compiled
            # YAML may sit alongside the source instead.
            compiled = pipelines_dir / f"{source.stem}.yaml"
            if not compiled.exists():
                continue
        upload_pipeline(client, compiled, source.stem.replace("_", "-"))
def main():
    """Entry point: connect to KFP, then sync from Flux or a local dir.

    Exits 1 when Kubeflow is unreachable after retries or when no
    pipeline source is configured.
    """
    import time
    print(f"KFP Pipeline Sync starting...")
    print(f"Kubeflow host: {KUBEFLOW_HOST}")
    # Wait for Kubeflow to be ready.
    # BUG FIX: previously `client` was assigned *before* the probe call,
    # so when list_pipelines() raised on every attempt the variable still
    # held a half-initialized Client and the `if not client` guard below
    # never fired — the sync then proceeded with a broken client. Now we
    # only publish the client after the probe succeeds.
    client = None
    for _attempt in range(5):
        try:
            candidate = Client(host=KUBEFLOW_HOST)
            candidate.list_pipelines(page_size=1)  # probe the connection
            client = candidate
            print("Connected to Kubeflow Pipelines")
            break
        except Exception as e:
            print(f"Waiting for Kubeflow... ({e})")
            time.sleep(10)
    if not client:
        print("Failed to connect to Kubeflow")
        sys.exit(1)
    # Get artifact URL from Flux GitRepository; prefer Flux over local.
    artifact_url = get_flux_artifact_url()
    if artifact_url:
        print(f"\nSyncing from Flux artifact...")
        sync_from_flux_artifact(client, artifact_url)
    elif PIPELINES_DIR and Path(PIPELINES_DIR).exists():
        # Sync from local directory
        print(f"\nSyncing from local: {PIPELINES_DIR}")
        sync_directory(client, Path(PIPELINES_DIR))
    else:
        print("No pipeline source configured!")
        sys.exit(1)
    print("\nSync complete!")
if __name__ == "__main__":
    main()