feat: Add Kubeflow Pipeline definitions

- voice_pipeline: STT → RAG → LLM → TTS
- document_ingestion_pipeline: Extract → Chunk → Embed → Milvus
- document_ingestion_mlflow_pipeline: With MLflow tracking
- evaluation_pipeline: Model benchmarking
- kfp-sync-job: K8s job to sync pipelines
This commit is contained in:
2026-02-01 20:41:13 -05:00
parent c36655b570
commit c26e4e5ef0
6 changed files with 1690 additions and 1 deletions

347
kfp-sync-job.yaml Normal file
View File

@@ -0,0 +1,347 @@
# KFP Pipeline Sync Job
# Compiles the Python pipeline definitions and uploads them to Kubeflow.
# Runs as a CronJob so pipelines stay in sync with Git.
---
# RBAC: let the sync pods read GitRepository status, which carries the
# Flux source-controller artifact URL the script downloads from.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
rules:
  - apiGroups: ["source.toolkit.fluxcd.io"]
    resources: ["gitrepositories"]
    verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
subjects:
  - kind: ServiceAccount
    name: pipeline-bridge
    namespace: ai-ml
roleRef:
  kind: Role
  name: kfp-sync-flux-reader
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kfp-pipeline-sync
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
spec:
  # Run every 30 minutes
  schedule: "*/30 * * * *"
  # Never start a sync while a previous one is still running
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        metadata:
          labels:
            app: kfp-pipeline-sync
        spec:
          schedulerName: volcano
          restartPolicy: OnFailure
          serviceAccountName: pipeline-bridge
          containers:
            - name: sync
              image: python:3.13-slim
              command:
                - python
                - /scripts/sync_pipelines.py
              env:
                - name: KUBEFLOW_HOST
                  value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
                - name: GIT_REPO_NAME
                  value: "llm-workflows"
                - name: GIT_REPO_NAMESPACE
                  value: "ai-ml"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
              resources:
                requests:
                  cpu: 100m
                  memory: 256Mi
                limits:
                  cpu: 500m
                  memory: 512Mi
          volumes:
            - name: scripts
              configMap:
                name: kfp-sync-scripts
---
# Manual trigger job (run this to sync immediately)
apiVersion: batch/v1
kind: Job
metadata:
  name: kfp-pipeline-sync-manual
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
  annotations:
    description: "Delete and recreate to manually trigger sync"
spec:
  backoffLimit: 1
  template:
    metadata:
      labels:
        app: kfp-pipeline-sync
    spec:
      schedulerName: volcano
      restartPolicy: Never
      serviceAccountName: pipeline-bridge
      containers:
        - name: sync
          image: python:3.13-slim
          command:
            - python
            - /scripts/sync_pipelines.py
          env:
            - name: KUBEFLOW_HOST
              value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
            - name: GIT_REPO_NAME
              value: "llm-workflows"
            - name: GIT_REPO_NAMESPACE
              value: "ai-ml"
          volumeMounts:
            - name: scripts
              mountPath: /scripts
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        - name: scripts
          configMap:
            name: kfp-sync-scripts
---
# ConfigMap holding the sync script; mounted at /scripts by both the
# CronJob and the manual Job defined in this file.
apiVersion: v1
kind: ConfigMap
metadata:
name: kfp-sync-scripts
namespace: ai-ml
data:
sync_pipelines.py: |
#!/usr/bin/env python3
"""
KFP Pipeline Sync
Compiles Python pipeline definitions and uploads to Kubeflow Pipelines.
Downloads from Flux source-controller artifact for secure access.
"""
import os
import sys
import subprocess
import tempfile
import tarfile
import hashlib
from pathlib import Path
from datetime import datetime
# Install KFP and kubernetes client
# NOTE(review): installing at runtime keeps the pod on a stock
# python:3.13-slim image, at the cost of startup time and a PyPI
# dependency on every run.
subprocess.check_call([
sys.executable, "-m", "pip", "install", "-q",
"kfp==2.12.1", "httpx", "kubernetes"
])
# These imports require the packages installed just above, so they
# must stay below the pip invocation.
from kfp import Client
from kfp import compiler
import httpx
from kubernetes import client as k8s_client, config as k8s_config
# In-cluster KFP API endpoint (ml-pipeline service in the kubeflow namespace).
KUBEFLOW_HOST = os.environ.get("KUBEFLOW_HOST", "http://ml-pipeline.kubeflow.svc.cluster.local:8888")
# Optional local fallback directory of pipeline sources (used when no
# Flux artifact URL can be resolved).
PIPELINES_DIR = os.environ.get("PIPELINES_DIR", "/pipelines")
# GitRepository to get artifact from
GIT_REPO_NAME = os.environ.get("GIT_REPO_NAME", "llm-workflows")
GIT_REPO_NAMESPACE = os.environ.get("GIT_REPO_NAMESPACE", "ai-ml")
def get_flux_artifact_url() -> str:
    """Return the artifact tarball URL from the Flux GitRepository status.

    Best-effort: any failure (not running in-cluster, missing object,
    RBAC denial) is logged and an empty string returned so the caller
    can fall back to a local pipeline directory.
    """
    try:
        k8s_config.load_incluster_config()
        custom_api = k8s_client.CustomObjectsApi()
        repo = custom_api.get_namespaced_custom_object(
            group="source.toolkit.fluxcd.io",
            version="v1",
            namespace=GIT_REPO_NAMESPACE,
            plural="gitrepositories",
            name=GIT_REPO_NAME,
        )
    except Exception as e:
        print(f"Error getting GitRepository: {e}")
        return ""
    # status.artifact.url is only present once source-controller has
    # produced an artifact; default to "" otherwise.
    status = repo.get("status", {})
    return status.get("artifact", {}).get("url", "")
def get_file_hash(filepath: str) -> str:
    """Return the MD5 hex digest of *filepath* for change detection.

    Reads the file in fixed-size chunks so large compiled pipeline
    packages are not loaded into memory at once (the original read the
    whole file in one call). MD5 is used here only as a fast,
    non-cryptographic fingerprint.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
def compile_pipeline(py_file: Path, output_dir: Path) -> Path:
"""Compile a Python pipeline file to YAML."""
output_file = output_dir / f"{py_file.stem}.yaml"
# Execute the pipeline file to compile it
result = subprocess.run(
[sys.executable, str(py_file)],
cwd=str(py_file.parent),
capture_output=True,
text=True
)
if result.returncode != 0:
print(f"Warning: Failed to compile {py_file}: {result.stderr}")
return None
# Check if YAML was generated
generated = py_file.parent / f"{py_file.stem}.yaml"
if generated.exists():
generated.rename(output_file)
return output_file
return None
def upload_pipeline(client: Client, yaml_path: Path, pipeline_name: str) -> bool:
    """Upload a compiled pipeline, creating it or adding a new version.

    Matches the pipeline by display name client-side (the KFP v2 filter
    syntax is awkward), then either uploads a timestamped new version or
    creates the pipeline from scratch.

    Returns True on success, False on any API error — errors are logged
    rather than raised so one bad pipeline does not abort the whole sync.
    """
    try:
        # Walk every page of results: the original single
        # list_pipelines(page_size=100) call silently missed any
        # pipeline beyond the first 100.
        existing = None
        page_token = ""
        while existing is None:
            resp = client.list_pipelines(page_size=100, page_token=page_token)
            for p in resp.pipelines or []:
                if p.display_name == pipeline_name:
                    existing = p
                    break
            page_token = resp.next_page_token
            if not page_token:
                break
        if existing:
            # Update existing pipeline by attaching a new version
            print(f"Updating existing pipeline: {pipeline_name}")
            version_name = f"v{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            client.upload_pipeline_version(
                pipeline_package_path=str(yaml_path),
                pipeline_version_name=version_name,
                pipeline_id=existing.pipeline_id
            )
            print(f"Created version {version_name}")
        else:
            # Create new pipeline
            print(f"Creating new pipeline: {pipeline_name}")
            client.upload_pipeline(
                pipeline_package_path=str(yaml_path),
                pipeline_name=pipeline_name
            )
        return True
    except Exception as e:
        print(f"Error uploading {pipeline_name}: {e}")
        return False
def sync_from_flux_artifact(client: Client, artifact_url: str):
    """Download the Flux source artifact and sync its pipelines/ directory.

    The artifact is a tar.gz served by the in-cluster Flux
    source-controller; it is fetched into a temporary directory,
    extracted, and the contained ``pipelines`` directory handed to
    sync_directory(). Everything is cleaned up when the context exits.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tarball = Path(tmpdir) / "source.tar.gz"
        extract_dir = Path(tmpdir) / "source"
        extract_dir.mkdir()
        # Download artifact from Flux source-controller
        print(f"Downloading from Flux: {artifact_url}")
        response = httpx.get(artifact_url, follow_redirects=True, timeout=60.0)
        response.raise_for_status()
        tarball.write_bytes(response.content)
        # filter="data" (Python 3.12+, and the job runs python:3.13)
        # rejects absolute paths, ".." escapes, and special members —
        # guarding extraction against tar path traversal, which the
        # bare extractall() call allowed.
        with tarfile.open(tarball, 'r:gz') as tar:
            tar.extractall(extract_dir, filter="data")
        pipelines_dir = extract_dir / "pipelines"
        if not pipelines_dir.exists():
            print("No pipelines directory found in artifact")
            return
        sync_directory(client, pipelines_dir)
def sync_directory(client: Client, pipelines_dir: Path):
    """Compile and upload every pipeline definition found in *pipelines_dir*."""
    compiled_dir = Path("/tmp/compiled")
    compiled_dir.mkdir(exist_ok=True)
    # Prefer files following the pipeline naming conventions; fall back
    # to every .py file in the directory when none match.
    candidates = [
        *pipelines_dir.glob("*_pipeline.py"),
        *pipelines_dir.glob("*.pipeline.py"),
    ]
    if not candidates:
        candidates = list(pipelines_dir.glob("*.py"))
    print(f"Found {len(candidates)} pipeline files")
    for source in candidates:
        print(f"\nProcessing: {source.name}")
        compiled = compile_pipeline(source, compiled_dir)
        if not compiled:
            # Compilation failed or produced nothing — a pre-compiled
            # YAML may sit alongside the source instead.
            compiled = pipelines_dir / f"{source.stem}.yaml"
            if not compiled.exists():
                continue
        upload_pipeline(client, compiled, source.stem.replace("_", "-"))
def main():
    """Entry point: connect to KFP, then sync from Flux or a local dir.

    Exits 1 when Kubeflow is unreachable after retries or when no
    pipeline source is configured.
    """
    import time
    print(f"KFP Pipeline Sync starting...")
    print(f"Kubeflow host: {KUBEFLOW_HOST}")
    # Wait for Kubeflow to be ready.
    # BUG FIX: previously `client` was assigned *before* the probe call,
    # so when list_pipelines() raised on every attempt the variable still
    # held a half-initialized Client and the `if not client` guard below
    # never fired — the sync then proceeded with a broken client. Now we
    # only publish the client after the probe succeeds.
    client = None
    for _attempt in range(5):
        try:
            candidate = Client(host=KUBEFLOW_HOST)
            candidate.list_pipelines(page_size=1)  # probe the connection
            client = candidate
            print("Connected to Kubeflow Pipelines")
            break
        except Exception as e:
            print(f"Waiting for Kubeflow... ({e})")
            time.sleep(10)
    if not client:
        print("Failed to connect to Kubeflow")
        sys.exit(1)
    # Get artifact URL from Flux GitRepository; prefer Flux over local.
    artifact_url = get_flux_artifact_url()
    if artifact_url:
        print(f"\nSyncing from Flux artifact...")
        sync_from_flux_artifact(client, artifact_url)
    elif PIPELINES_DIR and Path(PIPELINES_DIR).exists():
        # Sync from local directory
        print(f"\nSyncing from local: {PIPELINES_DIR}")
        sync_directory(client, Path(PIPELINES_DIR))
    else:
        print("No pipeline source configured!")
        sys.exit(1)
    print("\nSync complete!")
if __name__ == "__main__":
    main()