# Pipelines managed by this sync job:
#   - voice_pipeline: STT → RAG → LLM → TTS
#   - document_ingestion_pipeline: Extract → Chunk → Embed → Milvus
#   - document_ingestion_mlflow_pipeline: with MLflow tracking
#   - evaluation_pipeline: model benchmarking
#   - kfp-sync-job: K8s Job to sync pipelines
# KFP Pipeline Sync Job
#
# Automatically compiles Python pipeline definitions and uploads to Kubeflow.
# Runs as a CronJob to keep pipelines in sync with Git.
---
# RBAC to allow reading GitRepository status for artifact URL
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
rules:
  # Read-only access to Flux GitRepository objects so the sync script can
  # discover the source artifact URL from the resource status.
  - apiGroups: ["source.toolkit.fluxcd.io"]
    resources: ["gitrepositories"]
    verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
subjects:
  # Service account shared by the CronJob and the manual trigger Job.
  - kind: ServiceAccount
    name: pipeline-bridge
    namespace: ai-ml
roleRef:
  kind: Role
  name: kfp-sync-flux-reader
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kfp-pipeline-sync
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
spec:
  # Run every 30 minutes
  schedule: "*/30 * * * *"
  # Never let two sync runs overlap
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        metadata:
          labels:
            app: kfp-pipeline-sync
        spec:
          schedulerName: volcano
          restartPolicy: OnFailure
          # Needs RBAC (kfp-sync-flux-reader) to read GitRepository status
          serviceAccountName: pipeline-bridge
          containers:
            - name: sync
              image: python:3.13-slim
              command:
                - python
                - /scripts/sync_pipelines.py
              env:
                - name: KUBEFLOW_HOST
                  value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
                - name: GIT_REPO_NAME
                  value: "llm-workflows"
                - name: GIT_REPO_NAMESPACE
                  value: "ai-ml"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
              resources:
                requests:
                  cpu: 100m
                  memory: 256Mi
                limits:
                  cpu: 500m
                  memory: 512Mi
          volumes:
            # Sync script is delivered via the kfp-sync-scripts ConfigMap
            - name: scripts
              configMap:
                name: kfp-sync-scripts
---
# Manual trigger job (run this to sync immediately)
apiVersion: batch/v1
kind: Job
metadata:
  name: kfp-pipeline-sync-manual
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
  annotations:
    description: "Delete and recreate to manually trigger sync"
spec:
  backoffLimit: 1
  template:
    metadata:
      labels:
        app: kfp-pipeline-sync
    spec:
      schedulerName: volcano
      restartPolicy: Never
      serviceAccountName: pipeline-bridge
      containers:
        - name: sync
          image: python:3.13-slim
          command:
            - python
            - /scripts/sync_pipelines.py
          env:
            - name: KUBEFLOW_HOST
              value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
            - name: GIT_REPO_NAME
              value: "llm-workflows"
            - name: GIT_REPO_NAMESPACE
              value: "ai-ml"
          volumeMounts:
            - name: scripts
              mountPath: /scripts
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        # Same ConfigMap-mounted script as the CronJob
        - name: scripts
          configMap:
            name: kfp-sync-scripts
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kfp-sync-scripts
  namespace: ai-ml
data:
  sync_pipelines.py: |
#!/usr/bin/env python3
"""
KFP Pipeline Sync

Compiles Python pipeline definitions and uploads to Kubeflow Pipelines.
Downloads from Flux source-controller artifact for secure access.
"""
import os
import sys
import subprocess
import tempfile
import tarfile
import hashlib
from pathlib import Path
from datetime import datetime

# Install KFP and kubernetes client at runtime (the image is plain
# python:slim, so dependencies are not baked in).
# NOTE(review): a prebuilt image would remove the per-run network
# dependency and speed up each sync — consider for a follow-up.
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "kfp==2.12.1", "httpx", "kubernetes"
])

from kfp import Client
from kfp import compiler
import httpx
from kubernetes import client as k8s_client, config as k8s_config

# Kubeflow Pipelines API endpoint (in-cluster service address by default).
KUBEFLOW_HOST = os.environ.get("KUBEFLOW_HOST", "http://ml-pipeline.kubeflow.svc.cluster.local:8888")
# Optional local fallback directory containing pipeline sources.
PIPELINES_DIR = os.environ.get("PIPELINES_DIR", "/pipelines")
# GitRepository to get artifact from
GIT_REPO_NAME = os.environ.get("GIT_REPO_NAME", "llm-workflows")
GIT_REPO_NAMESPACE = os.environ.get("GIT_REPO_NAMESPACE", "ai-ml")
def get_flux_artifact_url() -> str:
    """Return the Flux GitRepository artifact URL, or "" if unavailable."""
    try:
        # In-cluster credentials come from the pod's service account.
        k8s_config.load_incluster_config()
        gitrepo = k8s_client.CustomObjectsApi().get_namespaced_custom_object(
            group="source.toolkit.fluxcd.io",
            version="v1",
            namespace=GIT_REPO_NAMESPACE,
            plural="gitrepositories",
            name=GIT_REPO_NAME,
        )
    except Exception as exc:
        # Best-effort: the caller falls back to a local directory.
        print(f"Error getting GitRepository: {exc}")
        return ""
    status = gitrepo.get("status", {})
    return status.get("artifact", {}).get("url", "")
def get_file_hash(filepath: str) -> str:
|
|
"""Get MD5 hash of file for change detection."""
|
|
with open(filepath, "rb") as f:
|
|
return hashlib.md5(f.read()).hexdigest()
|
|
|
|
def compile_pipeline(py_file: Path, output_dir: Path) -> Path:
|
|
"""Compile a Python pipeline file to YAML."""
|
|
output_file = output_dir / f"{py_file.stem}.yaml"
|
|
|
|
# Execute the pipeline file to compile it
|
|
result = subprocess.run(
|
|
[sys.executable, str(py_file)],
|
|
cwd=str(py_file.parent),
|
|
capture_output=True,
|
|
text=True
|
|
)
|
|
|
|
if result.returncode != 0:
|
|
print(f"Warning: Failed to compile {py_file}: {result.stderr}")
|
|
return None
|
|
|
|
# Check if YAML was generated
|
|
generated = py_file.parent / f"{py_file.stem}.yaml"
|
|
if generated.exists():
|
|
generated.rename(output_file)
|
|
return output_file
|
|
|
|
return None
|
|
|
|
def upload_pipeline(client: "Client", yaml_path: Path, pipeline_name: str) -> bool:
    """Upload or update a pipeline in Kubeflow.

    Creates the pipeline if no pipeline with *pipeline_name* exists;
    otherwise uploads a new timestamped version of the existing one.
    Returns True on success, False on any API error.
    """
    try:
        # Find an existing pipeline by display name. The KFP v2 server
        # filter syntax varies between versions, so match client-side —
        # and walk every page: the previous single page_size=100 call
        # silently missed pipelines past the first page, causing
        # duplicates instead of new versions.
        existing = None
        page_token = ""
        while True:
            resp = client.list_pipelines(page_size=100, page_token=page_token)
            for p in resp.pipelines or []:
                if p.display_name == pipeline_name:
                    existing = p
                    break
            page_token = getattr(resp, "next_page_token", None)
            if existing or not page_token:
                break

        if existing:
            # Update existing pipeline with a new, traceable version.
            print(f"Updating existing pipeline: {pipeline_name}")
            version_name = f"v{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            client.upload_pipeline_version(
                pipeline_package_path=str(yaml_path),
                pipeline_version_name=version_name,
                pipeline_id=existing.pipeline_id
            )
            print(f"Created version {version_name}")
        else:
            # Create new pipeline
            print(f"Creating new pipeline: {pipeline_name}")
            client.upload_pipeline(
                pipeline_package_path=str(yaml_path),
                pipeline_name=pipeline_name
            )

        return True

    except Exception as e:
        # Report and keep going with the remaining pipelines.
        print(f"Error uploading {pipeline_name}: {e}")
        return False
def sync_from_flux_artifact(client: Client, artifact_url: str):
    """Download the Flux source-controller tarball and sync its pipelines.

    Fetches *artifact_url*, extracts the archive into a temp dir, and
    hands the contained ``pipelines/`` directory to sync_directory().
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tarball = Path(tmpdir) / "source.tar.gz"
        extract_dir = Path(tmpdir) / "source"
        extract_dir.mkdir()

        # Download artifact from Flux source-controller
        print(f"Downloading from Flux: {artifact_url}")
        response = httpx.get(artifact_url, follow_redirects=True, timeout=60.0)
        response.raise_for_status()
        tarball.write_bytes(response.content)

        # Extract tarball. The "data" filter (Python 3.12+; the image
        # runs 3.13) rejects path-traversal members, absolute paths and
        # special files (CVE-2007-4559) — the bare extractall() here was
        # unsafe against a compromised artifact.
        with tarfile.open(tarball, 'r:gz') as tar:
            tar.extractall(extract_dir, filter="data")

        pipelines_dir = extract_dir / "pipelines"
        if not pipelines_dir.exists():
            print("No pipelines directory found in artifact")
            return

        sync_directory(client, pipelines_dir)
def sync_directory(client: Client, pipelines_dir: Path):
    """Compile and upload every pipeline definition in *pipelines_dir*."""
    compiled_dir = Path("/tmp/compiled")
    compiled_dir.mkdir(exist_ok=True)

    # Prefer files following the pipeline naming conventions; if none
    # match, fall back to every Python file in the directory.
    candidates = [
        *pipelines_dir.glob("*_pipeline.py"),
        *pipelines_dir.glob("*.pipeline.py"),
    ]
    if not candidates:
        candidates = list(pipelines_dir.glob("*.py"))

    print(f"Found {len(candidates)} pipeline files")

    for source in candidates:
        print(f"\nProcessing: {source.name}")

        # Compile the source; if that fails, fall back to a pre-compiled
        # YAML file sitting next to it.
        compiled = compile_pipeline(source, compiled_dir)
        if not compiled:
            compiled = pipelines_dir / f"{source.stem}.yaml"
            if not compiled.exists():
                continue

        # Kubeflow display names use hyphens rather than underscores.
        upload_pipeline(client, compiled, source.stem.replace("_", "-"))
def main():
    """Entry point: connect to KFP, locate a pipeline source, and sync."""
    import time

    print("KFP Pipeline Sync starting...")
    print(f"Kubeflow host: {KUBEFLOW_HOST}")

    # Wait for Kubeflow to be ready. Only keep `client` once the probe
    # call succeeds: previously a Client that constructed but failed its
    # list_pipelines() probe on every attempt was left assigned, so the
    # `if not client` guard passed and the broken client was used.
    client = None
    for _ in range(5):
        try:
            candidate = Client(host=KUBEFLOW_HOST)
            candidate.list_pipelines(page_size=1)  # verify the connection
        except Exception as e:
            print(f"Waiting for Kubeflow... ({e})")
            time.sleep(10)
        else:
            client = candidate
            print("Connected to Kubeflow Pipelines")
            break

    if not client:
        print("Failed to connect to Kubeflow")
        sys.exit(1)

    # Prefer the Flux artifact; fall back to a mounted local directory.
    artifact_url = get_flux_artifact_url()
    if artifact_url:
        print("\nSyncing from Flux artifact...")
        sync_from_flux_artifact(client, artifact_url)
    elif PIPELINES_DIR and Path(PIPELINES_DIR).exists():
        # Sync from local directory
        print(f"\nSyncing from local: {PIPELINES_DIR}")
        sync_directory(client, Path(PIPELINES_DIR))
    else:
        print("No pipeline source configured!")
        sys.exit(1)

    print("\nSync complete!")


if __name__ == "__main__":
    main()