feat: Add Kubeflow Pipeline definitions
- voice_pipeline: STT → RAG → LLM → TTS
- document_ingestion_pipeline: Extract → Chunk → Embed → Milvus
- document_ingestion_mlflow_pipeline: with MLflow tracking
- evaluation_pipeline: model benchmarking
- kfp-sync-job: Kubernetes job to sync pipelines
This commit is contained in:
347
kfp-sync-job.yaml
Normal file
347
kfp-sync-job.yaml
Normal file
@@ -0,0 +1,347 @@
|
||||
# KFP Pipeline Sync Job
# Automatically compiles Python pipeline definitions and uploads to Kubeflow
# Runs as a CronJob to keep pipelines in sync with Git
---
# RBAC to allow reading GitRepository status for artifact URL
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
rules:
  # Read-only access to Flux GitRepository objects; the sync script reads
  # .status.artifact.url to download the source tarball.
  - apiGroups: ["source.toolkit.fluxcd.io"]
    resources: ["gitrepositories"]
    verbs: ["get", "list"]
|
||||
---
# Grants the flux-reader Role to the service account the sync pods run under.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
subjects:
  - kind: ServiceAccount
    name: pipeline-bridge
    namespace: ai-ml
roleRef:
  kind: Role
  name: kfp-sync-flux-reader
  apiGroup: rbac.authorization.k8s.io
|
||||
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kfp-pipeline-sync
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
spec:
  # Run every 30 minutes
  schedule: "*/30 * * * *"
  # Never overlap two syncs
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        metadata:
          labels:
            app: kfp-pipeline-sync
        spec:
          schedulerName: volcano
          restartPolicy: OnFailure
          # Service account bound to the kfp-sync-flux-reader Role
          serviceAccountName: pipeline-bridge
          containers:
            - name: sync
              # Bare Python image; the script pip-installs kfp/httpx/kubernetes
              # at startup, so no custom image build is needed.
              image: python:3.13-slim
              command:
                - python
                - /scripts/sync_pipelines.py
              env:
                - name: KUBEFLOW_HOST
                  value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
                - name: GIT_REPO_NAME
                  value: "llm-workflows"
                - name: GIT_REPO_NAMESPACE
                  value: "ai-ml"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
              resources:
                requests:
                  cpu: 100m
                  memory: 256Mi
                limits:
                  cpu: 500m
                  memory: 512Mi
          volumes:
            # Sync script comes from the kfp-sync-scripts ConfigMap below
            - name: scripts
              configMap:
                name: kfp-sync-scripts
|
||||
---
# Manual trigger job (run this to sync immediately)
apiVersion: batch/v1
kind: Job
metadata:
  name: kfp-pipeline-sync-manual
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
  annotations:
    description: "Delete and recreate to manually trigger sync"
spec:
  backoffLimit: 1
  template:
    metadata:
      labels:
        app: kfp-pipeline-sync
    spec:
      schedulerName: volcano
      # Never: a manual one-shot should not restart on failure
      restartPolicy: Never
      serviceAccountName: pipeline-bridge
      containers:
        # Mirrors the CronJob container spec so manual and scheduled
        # syncs behave identically.
        - name: sync
          image: python:3.13-slim
          command:
            - python
            - /scripts/sync_pipelines.py
          env:
            - name: KUBEFLOW_HOST
              value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
            - name: GIT_REPO_NAME
              value: "llm-workflows"
            - name: GIT_REPO_NAMESPACE
              value: "ai-ml"
          volumeMounts:
            - name: scripts
              mountPath: /scripts
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        - name: scripts
          configMap:
            name: kfp-sync-scripts
|
||||
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kfp-sync-scripts
  namespace: ai-ml
data:
  # Mounted at /scripts and executed as `python /scripts/sync_pipelines.py`
  # by the CronJob and the manual Job in this file.
  sync_pipelines.py: |
|
||||
#!/usr/bin/env python3
"""
KFP Pipeline Sync

Compiles Python pipeline definitions and uploads to Kubeflow Pipelines.
Downloads from Flux source-controller artifact for secure access.
"""
import os
import sys
import subprocess
import tempfile
import tarfile
import hashlib
from pathlib import Path
from datetime import datetime

# Install KFP and kubernetes client at import time: the job runs on a bare
# python:3.13-slim image with no dependencies baked in.
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "kfp==2.12.1", "httpx", "kubernetes"
])

# These imports depend on the pip install above, so they must stay below it.
from kfp import Client
from kfp import compiler
import httpx
from kubernetes import client as k8s_client, config as k8s_config

# Kubeflow Pipelines API endpoint (in-cluster service by default)
KUBEFLOW_HOST = os.environ.get("KUBEFLOW_HOST", "http://ml-pipeline.kubeflow.svc.cluster.local:8888")
# Optional locally-mounted fallback directory of pipeline sources
PIPELINES_DIR = os.environ.get("PIPELINES_DIR", "/pipelines")
# GitRepository to get artifact from
GIT_REPO_NAME = os.environ.get("GIT_REPO_NAME", "llm-workflows")
GIT_REPO_NAMESPACE = os.environ.get("GIT_REPO_NAMESPACE", "ai-ml")
|
||||
|
||||
def get_flux_artifact_url() -> str:
    """Look up the Flux GitRepository and return its artifact URL.

    Returns an empty string when the object cannot be read or carries no
    artifact yet; the caller treats "" as "fall back to the local directory".
    """
    try:
        k8s_config.load_incluster_config()
        custom_api = k8s_client.CustomObjectsApi()
        repo_obj = custom_api.get_namespaced_custom_object(
            group="source.toolkit.fluxcd.io",
            version="v1",
            namespace=GIT_REPO_NAMESPACE,
            plural="gitrepositories",
            name=GIT_REPO_NAME
        )
        artifact = repo_obj.get("status", {}).get("artifact", {})
        return artifact.get("url", "")
    except Exception as exc:  # best-effort: log and signal "no artifact"
        print(f"Error getting GitRepository: {exc}")
        return ""
|
||||
|
||||
def get_file_hash(filepath: str) -> str:
    """Return the MD5 hex digest of a file's contents.

    Used only for change detection, not security. Reads the file in fixed
    64 KiB chunks so large compiled pipeline packages never have to fit in
    memory at once (the original slurped the whole file).
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
def compile_pipeline(py_file: Path, output_dir: Path) -> Path:
|
||||
"""Compile a Python pipeline file to YAML."""
|
||||
output_file = output_dir / f"{py_file.stem}.yaml"
|
||||
|
||||
# Execute the pipeline file to compile it
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(py_file)],
|
||||
cwd=str(py_file.parent),
|
||||
capture_output=True,
|
||||
text=True
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
print(f"Warning: Failed to compile {py_file}: {result.stderr}")
|
||||
return None
|
||||
|
||||
# Check if YAML was generated
|
||||
generated = py_file.parent / f"{py_file.stem}.yaml"
|
||||
if generated.exists():
|
||||
generated.rename(output_file)
|
||||
return output_file
|
||||
|
||||
return None
|
||||
|
||||
def upload_pipeline(client: Client, yaml_path: Path, pipeline_name: str) -> bool:
    """Upload a new pipeline, or add a timestamped version to an existing one.

    Matching is done client-side on display_name because the KFP v2 API
    filter syntax differs from v1. The original only inspected the first
    100 pipelines (``page_size=100``), silently duplicating any pipeline
    past page one; this version pages through the full listing.

    Returns:
        True on success, False when the upload failed (logged, non-fatal).
    """
    try:
        # Page through all pipelines looking for a display-name match.
        existing = None
        page_token = ""
        while existing is None:
            resp = client.list_pipelines(page_size=100, page_token=page_token)
            for p in resp.pipelines or []:
                if p.display_name == pipeline_name:
                    existing = p
                    break
            page_token = getattr(resp, "next_page_token", None)
            if not page_token:
                break

        if existing:
            # Update existing pipeline with a new timestamped version so
            # previous versions remain runnable.
            print(f"Updating existing pipeline: {pipeline_name}")

            version_name = f"v{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            client.upload_pipeline_version(
                pipeline_package_path=str(yaml_path),
                pipeline_version_name=version_name,
                pipeline_id=existing.pipeline_id
            )
            print(f"Created version {version_name}")
        else:
            # Create new pipeline
            print(f"Creating new pipeline: {pipeline_name}")
            client.upload_pipeline(
                pipeline_package_path=str(yaml_path),
                pipeline_name=pipeline_name
            )

        return True

    except Exception as e:
        print(f"Error uploading {pipeline_name}: {e}")
        return False
|
||||
|
||||
def sync_from_flux_artifact(client: Client, artifact_url: str):
    """Download the Flux source artifact, extract it, and sync its pipelines/.

    Args:
        client: Connected KFP client.
        artifact_url: Tarball URL from the GitRepository status.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tarball = Path(tmpdir) / "source.tar.gz"
        extract_dir = Path(tmpdir) / "source"
        extract_dir.mkdir()

        # Download artifact from the in-cluster Flux source-controller
        print(f"Downloading from Flux: {artifact_url}")
        response = httpx.get(artifact_url, follow_redirects=True, timeout=60.0)
        response.raise_for_status()
        tarball.write_bytes(response.content)

        # Extract tarball. filter="data" (Python 3.12+, image is 3.13)
        # rejects path traversal, absolute member names, and special files
        # — the original extractall() trusted the archive completely.
        with tarfile.open(tarball, 'r:gz') as tar:
            tar.extractall(extract_dir, filter="data")

        pipelines_dir = extract_dir / "pipelines"
        if not pipelines_dir.exists():
            print("No pipelines directory found in artifact")
            return

        sync_directory(client, pipelines_dir)
|
||||
|
||||
def sync_directory(client: Client, pipelines_dir: Path):
    """Compile every pipeline file in a directory and upload it to Kubeflow."""
    compiled_dir = Path("/tmp/compiled")
    compiled_dir.mkdir(exist_ok=True)

    # Prefer files that follow the pipeline naming conventions.
    candidates = [
        *pipelines_dir.glob("*_pipeline.py"),
        *pipelines_dir.glob("*.pipeline.py"),
    ]
    if not candidates:
        # Fall back to every Python file in the directory.
        candidates = list(pipelines_dir.glob("*.py"))

    print(f"Found {len(candidates)} pipeline files")

    for source in candidates:
        print(f"\nProcessing: {source.name}")

        # Compile; when compilation yields nothing, look for a
        # pre-compiled YAML sitting next to the source file.
        compiled = compile_pipeline(source, compiled_dir)
        if not compiled:
            compiled = pipelines_dir / f"{source.stem}.yaml"
            if not compiled.exists():
                continue

        upload_pipeline(client, compiled, source.stem.replace("_", "-"))
|
||||
|
||||
def main():
    """Entry point: connect to Kubeflow, then sync from the Flux artifact
    (preferred) or a locally mounted directory.

    Exits non-zero when no connection can be made or no pipeline source
    is configured, so the Job/CronJob records a failure.
    """
    import time

    print("KFP Pipeline Sync starting...")
    print(f"Kubeflow host: {KUBEFLOW_HOST}")

    # Wait for Kubeflow to be ready. Only keep the client once the smoke
    # test succeeds: the original assigned `client` before testing it, so
    # five failed list_pipelines() calls still left a non-None (dead)
    # client and the script proceeded anyway.
    client = None
    for attempt in range(5):
        try:
            candidate = Client(host=KUBEFLOW_HOST)
            candidate.list_pipelines(page_size=1)  # connection smoke test
            client = candidate
            print("Connected to Kubeflow Pipelines")
            break
        except Exception as e:
            print(f"Waiting for Kubeflow... ({e})")
            if attempt < 4:  # no pointless sleep after the final attempt
                time.sleep(10)

    if not client:
        print("Failed to connect to Kubeflow")
        sys.exit(1)

    # Prefer the Flux artifact; fall back to a locally mounted directory.
    artifact_url = get_flux_artifact_url()
    if artifact_url:
        print("\nSyncing from Flux artifact...")
        sync_from_flux_artifact(client, artifact_url)
    elif PIPELINES_DIR and Path(PIPELINES_DIR).exists():
        print(f"\nSyncing from local: {PIPELINES_DIR}")
        sync_directory(client, Path(PIPELINES_DIR))
    else:
        print("No pipeline source configured!")
        sys.exit(1)

    print("\nSync complete!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user