# Pipelines managed by this sync job:
#   - voice_pipeline: STT → RAG → LLM → TTS
#   - document_ingestion_pipeline: Extract → Chunk → Embed → Milvus
#   - document_ingestion_mlflow_pipeline: with MLflow tracking
#   - evaluation_pipeline: model benchmarking
#   - kfp-sync-job: K8s Job to sync pipelines
# KFP Pipeline Sync Job
#
# Automatically compiles Python pipeline definitions and uploads to Kubeflow.
# Runs as a CronJob to keep pipelines in sync with Git.
---
# RBAC to allow reading GitRepository status for artifact URL
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
rules:
  # Read-only access to Flux GitRepository objects so the sync script can
  # discover the source artifact URL from the resource status.
  - apiGroups: ["source.toolkit.fluxcd.io"]
    resources: ["gitrepositories"]
    verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: kfp-sync-flux-reader
  namespace: ai-ml
subjects:
  # Service account shared by the CronJob and the manual trigger Job.
  - kind: ServiceAccount
    name: pipeline-bridge
    namespace: ai-ml
roleRef:
  kind: Role
  name: kfp-sync-flux-reader
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kfp-pipeline-sync
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
spec:
  # Run every 30 minutes
  schedule: "*/30 * * * *"
  # Never let two sync runs overlap
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        metadata:
          labels:
            app: kfp-pipeline-sync
        spec:
          schedulerName: volcano
          restartPolicy: OnFailure
          # Needs RBAC (kfp-sync-flux-reader) to read GitRepository status
          serviceAccountName: pipeline-bridge
          containers:
            - name: sync
              image: python:3.13-slim
              command:
                - python
                - /scripts/sync_pipelines.py
              env:
                - name: KUBEFLOW_HOST
                  value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
                - name: GIT_REPO_NAME
                  value: "llm-workflows"
                - name: GIT_REPO_NAMESPACE
                  value: "ai-ml"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
              resources:
                requests:
                  cpu: 100m
                  memory: 256Mi
                limits:
                  cpu: 500m
                  memory: 512Mi
          volumes:
            # Sync script is delivered via the kfp-sync-scripts ConfigMap
            - name: scripts
              configMap:
                name: kfp-sync-scripts
---
# Manual trigger job (run this to sync immediately)
apiVersion: batch/v1
kind: Job
metadata:
  name: kfp-pipeline-sync-manual
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-pipeline-sync
    app.kubernetes.io/part-of: llm-workflows
  annotations:
    description: "Delete and recreate to manually trigger sync"
spec:
  backoffLimit: 1
  template:
    metadata:
      labels:
        app: kfp-pipeline-sync
    spec:
      schedulerName: volcano
      restartPolicy: Never
      serviceAccountName: pipeline-bridge
      containers:
        - name: sync
          image: python:3.13-slim
          command:
            - python
            - /scripts/sync_pipelines.py
          env:
            - name: KUBEFLOW_HOST
              value: "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
            - name: GIT_REPO_NAME
              value: "llm-workflows"
            - name: GIT_REPO_NAMESPACE
              value: "ai-ml"
          volumeMounts:
            - name: scripts
              mountPath: /scripts
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        # Same ConfigMap-mounted script as the CronJob
        - name: scripts
          configMap:
            name: kfp-sync-scripts
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kfp-sync-scripts
  namespace: ai-ml
data:
  sync_pipelines.py: |
#!/usr/bin/env python3
"""
KFP Pipeline Sync

Compiles Python pipeline definitions and uploads to Kubeflow Pipelines.
Downloads from Flux source-controller artifact for secure access.
"""
import os
import sys
import subprocess
import tempfile
import tarfile
import hashlib
from pathlib import Path
from datetime import datetime

# Install KFP and kubernetes client at runtime (the image is plain
# python:slim, so dependencies are not baked in).
# NOTE(review): a prebuilt image would remove the per-run network
# dependency and speed up each sync — consider for a follow-up.
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "kfp==2.12.1", "httpx", "kubernetes"
])

from kfp import Client
from kfp import compiler
import httpx
from kubernetes import client as k8s_client, config as k8s_config

# Kubeflow Pipelines API endpoint (in-cluster service address by default).
KUBEFLOW_HOST = os.environ.get("KUBEFLOW_HOST", "http://ml-pipeline.kubeflow.svc.cluster.local:8888")
# Optional local fallback directory containing pipeline sources.
PIPELINES_DIR = os.environ.get("PIPELINES_DIR", "/pipelines")
# GitRepository to get artifact from
GIT_REPO_NAME = os.environ.get("GIT_REPO_NAME", "llm-workflows")
GIT_REPO_NAMESPACE = os.environ.get("GIT_REPO_NAMESPACE", "ai-ml")
def get_flux_artifact_url() -> str:
    """Return the Flux GitRepository artifact URL, or "" if unavailable."""
    try:
        # In-cluster credentials come from the pod's service account.
        k8s_config.load_incluster_config()
        gitrepo = k8s_client.CustomObjectsApi().get_namespaced_custom_object(
            group="source.toolkit.fluxcd.io",
            version="v1",
            namespace=GIT_REPO_NAMESPACE,
            plural="gitrepositories",
            name=GIT_REPO_NAME,
        )
    except Exception as exc:
        # Best-effort: the caller falls back to a local directory.
        print(f"Error getting GitRepository: {exc}")
        return ""
    status = gitrepo.get("status", {})
    return status.get("artifact", {}).get("url", "")
def get_file_hash(filepath: str) -> str:
|
|
"""Get MD5 hash of file for change detection."""
|
|
with open(filepath, "rb") as f:
|
|
return hashlib.md5(f.read()).hexdigest()
|
|
|
|
def compile_pipeline(py_file: Path, output_dir: Path) -> Path:
|
|
"""Compile a Python pipeline file to YAML."""
|
|
output_file = output_dir / f"{py_file.stem}.yaml"
|
|
|
|
# Execute the pipeline file to compile it
|
|
result = subprocess.run(
|
|
[sys.executable, str(py_file)],
|
|
cwd=str(py_file.parent),
|
|
capture_output=True,
|
|
text=True
|
|
)
|
|
|
|
if result.returncode != 0:
|
|
print(f"Warning: Failed to compile {py_file}: {result.stderr}")
|
|
return None
|
|
|
|
# Check if YAML was generated
|
|
generated = py_file.parent / f"{py_file.stem}.yaml"
|
|
if generated.exists():
|
|
generated.rename(output_file)
|
|
return output_file
|
|
|
|
return None
|
|
|
|
def upload_pipeline(client: "Client", yaml_path: Path, pipeline_name: str) -> bool:
    """Upload or update a pipeline in Kubeflow.

    Creates the pipeline if no pipeline with *pipeline_name* exists;
    otherwise uploads a new timestamped version of the existing one.
    Returns True on success, False on any API error.
    """
    try:
        # Find an existing pipeline by display name. The KFP v2 server
        # filter syntax varies between versions, so match client-side —
        # and walk every page: the previous single page_size=100 call
        # silently missed pipelines past the first page, causing
        # duplicates instead of new versions.
        existing = None
        page_token = ""
        while True:
            resp = client.list_pipelines(page_size=100, page_token=page_token)
            for p in resp.pipelines or []:
                if p.display_name == pipeline_name:
                    existing = p
                    break
            page_token = getattr(resp, "next_page_token", None)
            if existing or not page_token:
                break

        if existing:
            # Update existing pipeline with a new, traceable version.
            print(f"Updating existing pipeline: {pipeline_name}")
            version_name = f"v{datetime.now().strftime('%Y%m%d-%H%M%S')}"
            client.upload_pipeline_version(
                pipeline_package_path=str(yaml_path),
                pipeline_version_name=version_name,
                pipeline_id=existing.pipeline_id
            )
            print(f"Created version {version_name}")
        else:
            # Create new pipeline
            print(f"Creating new pipeline: {pipeline_name}")
            client.upload_pipeline(
                pipeline_package_path=str(yaml_path),
                pipeline_name=pipeline_name
            )

        return True

    except Exception as e:
        # Report and keep going with the remaining pipelines.
        print(f"Error uploading {pipeline_name}: {e}")
        return False
def sync_from_flux_artifact(client: Client, artifact_url: str):
    """Download the Flux source-controller tarball and sync its pipelines.

    Fetches *artifact_url*, extracts the archive into a temp dir, and
    hands the contained ``pipelines/`` directory to sync_directory().
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tarball = Path(tmpdir) / "source.tar.gz"
        extract_dir = Path(tmpdir) / "source"
        extract_dir.mkdir()

        # Download artifact from Flux source-controller
        print(f"Downloading from Flux: {artifact_url}")
        response = httpx.get(artifact_url, follow_redirects=True, timeout=60.0)
        response.raise_for_status()
        tarball.write_bytes(response.content)

        # Extract tarball. The "data" filter (Python 3.12+; the image
        # runs 3.13) rejects path-traversal members, absolute paths and
        # special files (CVE-2007-4559) — the bare extractall() here was
        # unsafe against a compromised artifact.
        with tarfile.open(tarball, 'r:gz') as tar:
            tar.extractall(extract_dir, filter="data")

        pipelines_dir = extract_dir / "pipelines"
        if not pipelines_dir.exists():
            print("No pipelines directory found in artifact")
            return

        sync_directory(client, pipelines_dir)
def sync_directory(client: Client, pipelines_dir: Path):
    """Compile and upload every pipeline definition in *pipelines_dir*."""
    compiled_dir = Path("/tmp/compiled")
    compiled_dir.mkdir(exist_ok=True)

    # Prefer files following the pipeline naming conventions; if none
    # match, fall back to every Python file in the directory.
    candidates = [
        *pipelines_dir.glob("*_pipeline.py"),
        *pipelines_dir.glob("*.pipeline.py"),
    ]
    if not candidates:
        candidates = list(pipelines_dir.glob("*.py"))

    print(f"Found {len(candidates)} pipeline files")

    for source in candidates:
        print(f"\nProcessing: {source.name}")

        # Compile the source; if that fails, fall back to a pre-compiled
        # YAML file sitting next to it.
        compiled = compile_pipeline(source, compiled_dir)
        if not compiled:
            compiled = pipelines_dir / f"{source.stem}.yaml"
            if not compiled.exists():
                continue

        # Kubeflow display names use hyphens rather than underscores.
        upload_pipeline(client, compiled, source.stem.replace("_", "-"))
def main():
    """Entry point: connect to KFP, locate a pipeline source, and sync."""
    import time

    print("KFP Pipeline Sync starting...")
    print(f"Kubeflow host: {KUBEFLOW_HOST}")

    # Wait for Kubeflow to be ready. Only keep `client` once the probe
    # call succeeds: previously a Client that constructed but failed its
    # list_pipelines() probe on every attempt was left assigned, so the
    # `if not client` guard passed and the broken client was used.
    client = None
    for _ in range(5):
        try:
            candidate = Client(host=KUBEFLOW_HOST)
            candidate.list_pipelines(page_size=1)  # verify the connection
        except Exception as e:
            print(f"Waiting for Kubeflow... ({e})")
            time.sleep(10)
        else:
            client = candidate
            print("Connected to Kubeflow Pipelines")
            break

    if not client:
        print("Failed to connect to Kubeflow")
        sys.exit(1)

    # Prefer the Flux artifact; fall back to a mounted local directory.
    artifact_url = get_flux_artifact_url()
    if artifact_url:
        print("\nSyncing from Flux artifact...")
        sync_from_flux_artifact(client, artifact_url)
    elif PIPELINES_DIR and Path(PIPELINES_DIR).exists():
        # Sync from local directory
        print(f"\nSyncing from local: {PIPELINES_DIR}")
        sync_directory(client, Path(PIPELINES_DIR))
    else:
        print("No pipeline source configured!")
        sys.exit(1)

    print("\nSync complete!")


if __name__ == "__main__":
    main()