feat: add pipeline bridge for NATS to Argo/Kubeflow

- pipeline_bridge.py: Standalone bridge service
- pipeline_bridge_v2.py: Handler-base version
- Supports Argo Workflows and Kubeflow Pipelines
- Workflow monitoring and status publishing
- Dockerfile variants for standalone and handler-base
This commit is contained in:
2026-02-02 06:23:21 -05:00
parent 57514f2b09
commit 50b1835688
7 changed files with 756 additions and 1 deletions

11
.gitignore vendored Normal file
View File

@@ -0,0 +1,11 @@
.venv/
__pycache__/
*.pyc
*.pyo
.pytest_cache/
.mypy_cache/
*.egg-info/
dist/
build/
.env
.env.local

29
Dockerfile Normal file
View File

@@ -0,0 +1,29 @@
FROM python:3.13-slim
WORKDIR /app
# Install uv for fast, reliable package management
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
# Install system dependencies
# NOTE(review): curl is installed but nothing in this Dockerfile uses it —
# confirm it is needed (e.g. for manual debugging in the container).
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first for better caching
COPY requirements.txt .
RUN uv pip install --system --no-cache -r requirements.txt
# Copy application code
COPY pipeline_bridge.py .
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
# Health check
# NOTE(review): this check always succeeds — it only proves a Python
# interpreter can start, not that the bridge is running or connected to
# NATS. Consider replacing it with a real liveness probe once the service
# exposes one.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python -c "print('healthy')" || exit 1
# Run the application
CMD ["python", "pipeline_bridge.py"]

12
Dockerfile.v2 Normal file
View File

@@ -0,0 +1,12 @@
# Pipeline Bridge v2 - Using handler-base
ARG BASE_TAG=local
FROM ghcr.io/billy-davies-2/handler-base:${BASE_TAG}
WORKDIR /app
# Additional dependency for Kubernetes API.
# The version specifier must be quoted: unquoted, the shell parses
# ">=28.0.0" as an output redirection, so the previous form installed an
# unpinned "kubernetes" and left a stray file named "=28.0.0" in the image.
# NOTE(review): pipeline_bridge_v2.py does not import `kubernetes` (it talks
# to Argo over HTTP) — confirm this dependency is still required.
RUN uv pip install --system --no-cache "kubernetes>=28.0.0"
COPY pipeline_bridge_v2.py ./pipeline_bridge.py
CMD ["python", "pipeline_bridge.py"]

110
README.md
View File

@@ -1,2 +1,110 @@
# Pipeline Bridge
Bridges NATS events to Kubeflow Pipelines and Argo Workflows.
## Overview
The Pipeline Bridge listens for pipeline trigger requests on NATS and submits them to the appropriate workflow engine (Argo Workflows or Kubeflow Pipelines). It monitors execution and publishes status updates back to NATS.
## NATS Subjects
| Subject | Direction | Description |
|---------|-----------|-------------|
| `ai.pipeline.trigger` | Subscribe | Pipeline trigger requests |
| `ai.pipeline.status.{request_id}` | Publish | Pipeline status updates |
## Supported Pipelines
| Pipeline | Engine | Description |
|----------|--------|-------------|
| `document-ingestion` | Argo | Ingest documents into Milvus |
| `batch-inference` | Argo | Run batch LLM inference |
| `model-evaluation` | Argo | Evaluate model performance |
| `rag-query` | Kubeflow | Execute RAG query pipeline |
| `voice-pipeline` | Kubeflow | Full voice assistant pipeline |
## Request Format
```json
{
"request_id": "uuid",
"pipeline": "document-ingestion",
"parameters": {
"source-url": "s3://bucket/docs/",
"collection-name": "knowledge_base"
}
}
```
## Response Format
```json
{
"request_id": "uuid",
"status": "submitted",
"pipeline": "document-ingestion",
"engine": "argo",
"run_id": "document-ingestion-abc123",
"message": "Pipeline submitted successfully",
"timestamp": "2026-01-03T12:00:00Z"
}
```
## Status Updates
The bridge publishes status updates as the workflow progresses:
- `submitted` - Workflow created
- `pending` - Waiting to start
- `running` - In progress
- `succeeded` - Completed successfully
- `failed` - Failed
- `error` - System error
## Variants
### pipeline_bridge.py (Standalone)
Self-contained service with pip install on startup. Good for simple deployments.
### pipeline_bridge_v2.py (handler-base)
Uses handler-base library for standardized NATS handling, telemetry, and health checks.
## Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `NATS_URL` | `nats://nats.ai-ml.svc.cluster.local:4222` | NATS server URL |
| `KUBEFLOW_HOST` | `http://ml-pipeline.kubeflow.svc.cluster.local:8888` | Kubeflow Pipelines API |
| `ARGO_HOST` | `http://argo-server.argo.svc.cluster.local:2746` | Argo Workflows API |
| `ARGO_NAMESPACE` | `ai-ml` | Namespace for Argo Workflows |
## Building
```bash
# Standalone version
docker build -t pipeline-bridge:latest .
# handler-base version
docker build -f Dockerfile.v2 -t pipeline-bridge:v2 --build-arg BASE_TAG=latest .
```
## Testing
```bash
# Port-forward NATS
kubectl port-forward -n ai-ml svc/nats 4222:4222
# Trigger document ingestion
nats pub ai.pipeline.trigger '{
"request_id": "test-1",
"pipeline": "document-ingestion",
"parameters": {"source-url": "https://example.com/docs.txt"}
}'
# Monitor status
nats sub "ai.pipeline.status.>"
```
## License
MIT

351
pipeline_bridge.py Normal file
View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""
Pipeline Bridge Service
Bridges NATS events to workflow engines:
1. Listen for pipeline triggers on "ai.pipeline.trigger"
2. Submit to Kubeflow Pipelines or Argo Workflows
3. Monitor execution and publish status updates
4. Publish completion to "ai.pipeline.status.{request_id}"
Supported pipelines:
- document-ingestion: Ingest documents into Milvus
- batch-inference: Run batch LLM inference
- model-evaluation: Evaluate model performance
"""
import asyncio
import json
import logging
import os
import signal
import subprocess
import sys
from datetime import datetime, timezone
from typing import Dict, Optional
# Install dependencies on startup
# NOTE(review): the Dockerfile already installs requirements.txt at build
# time, so this runtime install is normally redundant and assumes the file
# exists at /app/requirements.txt — confirm it is still needed.
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "-r", "/app/requirements.txt"
])
# Third-party imports are deliberately placed after the install above so the
# script can bootstrap its own dependencies when run outside the image.
import httpx
import nats
from kubernetes import client, config
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("pipeline-bridge")
# Configuration from environment
NATS_URL = os.environ.get("NATS_URL", "nats://nats.ai-ml.svc.cluster.local:4222")
KUBEFLOW_HOST = os.environ.get("KUBEFLOW_HOST", "http://ml-pipeline.kubeflow.svc.cluster.local:8888")
# NOTE(review): ARGO_HOST is read but never used in this module — Argo
# workflows are created via the Kubernetes API, not the Argo server.
ARGO_HOST = os.environ.get("ARGO_HOST", "http://argo-server.argo.svc.cluster.local:2746")
ARGO_NAMESPACE = os.environ.get("ARGO_NAMESPACE", "ai-ml")
# NATS subjects: triggers arrive on TRIGGER_SUBJECT; per-request status
# updates are published on "{STATUS_SUBJECT}.{request_id}".
TRIGGER_SUBJECT = "ai.pipeline.trigger"
STATUS_SUBJECT = "ai.pipeline.status"
# Pipeline definitions - maps pipeline names to their configurations.
# "engine" selects the submission path: Argo entries carry the
# WorkflowTemplate name in "template"; Kubeflow entries carry the
# pipeline id in "pipeline_id".
PIPELINES = {
    "document-ingestion": {
        "engine": "argo",
        "template": "document-ingestion",
        "description": "Ingest documents into Milvus vector database"
    },
    "batch-inference": {
        "engine": "argo",
        "template": "batch-inference",
        "description": "Run batch LLM inference on a dataset"
    },
    # Listed in the module docstring, README, and the v2 registry but was
    # missing here, so triggers for it were rejected as "Unknown pipeline".
    "model-evaluation": {
        "engine": "argo",
        "template": "model-evaluation",
        "description": "Evaluate model performance"
    },
    "rag-query": {
        "engine": "kubeflow",
        "pipeline_id": "rag-pipeline",
        "description": "Execute RAG query pipeline"
    },
    "voice-pipeline": {
        "engine": "kubeflow",
        "pipeline_id": "voice-pipeline",
        "description": "Full voice assistant pipeline"
    }
}
class PipelineBridge:
    """NATS-to-workflow-engine bridge.

    Listens for trigger requests on ``ai.pipeline.trigger``, submits them to
    Argo Workflows (via the Kubernetes custom-objects API) or Kubeflow
    Pipelines (via its REST API), and publishes status updates on
    ``ai.pipeline.status.{request_id}``.
    """

    def __init__(self):
        self.nc = None               # NATS connection, set in setup()
        self.http_client = None      # shared httpx client for Kubeflow calls
        self.running = True          # cleared by the signal handler to stop run()
        self.active_workflows = {}   # request_id -> {engine, run_id, started_at}

    async def setup(self):
        """Initialize NATS, HTTP, and Kubernetes clients."""
        self.nc = await nats.connect(NATS_URL)
        logger.info(f"Connected to NATS at {NATS_URL}")
        self.http_client = httpx.AsyncClient(timeout=60.0)
        # Argo workflows are created through the Kubernetes API, which needs
        # in-cluster credentials. Degrade gracefully outside the cluster so
        # Kubeflow-only operation still works.
        try:
            config.load_incluster_config()
            self.k8s_custom = client.CustomObjectsApi()
            logger.info("Kubernetes client initialized")
        except Exception as e:
            logger.warning(f"Kubernetes client failed: {e}")
            self.k8s_custom = None

    async def submit_argo_workflow(self, template: str, parameters: Dict, request_id: str) -> Optional[str]:
        """Create an Argo Workflow from a WorkflowTemplate.

        Returns the generated workflow name, or None on failure.
        """
        if not self.k8s_custom:
            logger.error("Kubernetes client not available")
            return None
        try:
            workflow = {
                "apiVersion": "argoproj.io/v1alpha1",
                "kind": "Workflow",
                "metadata": {
                    # generateName lets the API server append a unique suffix.
                    "generateName": f"{template}-",
                    "namespace": ARGO_NAMESPACE,
                    "labels": {
                        "app.kubernetes.io/managed-by": "pipeline-bridge",
                        "pipeline-bridge/request-id": request_id
                    }
                },
                "spec": {
                    "workflowTemplateRef": {
                        "name": template
                    },
                    "arguments": {
                        # Argo parameters must be strings.
                        "parameters": [
                            {"name": k, "value": str(v)}
                            for k, v in parameters.items()
                        ]
                    }
                }
            }
            result = self.k8s_custom.create_namespaced_custom_object(
                group="argoproj.io",
                version="v1alpha1",
                namespace=ARGO_NAMESPACE,
                plural="workflows",
                body=workflow
            )
            workflow_name = result["metadata"]["name"]
            logger.info(f"Submitted Argo workflow: {workflow_name}")
            return workflow_name
        except Exception as e:
            logger.error(f"Failed to submit Argo workflow: {e}")
            return None

    async def submit_kubeflow_pipeline(self, pipeline_id: str, parameters: Dict, request_id: str) -> Optional[str]:
        """Submit a Kubeflow Pipeline run via the v1beta1 REST API.

        Returns the run id, or None on failure.
        """
        try:
            run_request = {
                "name": f"{pipeline_id}-{request_id[:8]}",
                "pipeline_spec": {
                    "pipeline_id": pipeline_id
                },
                "resource_references": [],
                "parameters": [
                    {"name": k, "value": str(v)}
                    for k, v in parameters.items()
                ]
            }
            response = await self.http_client.post(
                f"{KUBEFLOW_HOST}/apis/v1beta1/runs",
                json=run_request
            )
            if response.status_code == 200:
                result = response.json()
                run_id = result.get("run", {}).get("id")
                logger.info(f"Submitted Kubeflow pipeline run: {run_id}")
                return run_id
            else:
                logger.error(f"Kubeflow API error: {response.status_code} - {response.text}")
                return None
        except Exception as e:
            logger.error(f"Failed to submit Kubeflow pipeline: {e}")
            return None

    async def get_argo_workflow_status(self, workflow_name: str) -> Dict:
        """Fetch the current status of an Argo Workflow.

        Never raises; errors are folded into the returned dict's phase/message.
        """
        if not self.k8s_custom:
            return {"phase": "Unknown", "message": "Kubernetes client not available"}
        try:
            result = self.k8s_custom.get_namespaced_custom_object(
                group="argoproj.io",
                version="v1alpha1",
                namespace=ARGO_NAMESPACE,
                plural="workflows",
                name=workflow_name
            )
            status = result.get("status", {})
            return {
                "phase": status.get("phase", "Pending"),
                "message": status.get("message", ""),
                "startedAt": status.get("startedAt"),
                "finishedAt": status.get("finishedAt"),
                "progress": status.get("progress", "0/0")
            }
        except Exception as e:
            logger.error(f"Failed to get workflow status: {e}")
            return {"phase": "Error", "message": str(e)}

    async def process_trigger(self, msg):
        """Handle one pipeline trigger message from NATS."""
        # Defined before the try block so the except handler cannot hit a
        # NameError when json.loads itself raises.
        data = {}
        try:
            data = json.loads(msg.data.decode())
            request_id = data.get("request_id", "unknown")
            pipeline_name = data.get("pipeline", "")
            parameters = data.get("parameters", {})
            logger.info(f"Processing pipeline trigger {request_id}: {pipeline_name}")
            # Validate pipeline
            if pipeline_name not in PIPELINES:
                await self.publish_status(request_id, {
                    "status": "error",
                    "message": f"Unknown pipeline: {pipeline_name}",
                    "available_pipelines": list(PIPELINES.keys())
                })
                return
            pipeline_config = PIPELINES[pipeline_name]
            engine = pipeline_config["engine"]
            # Submit to appropriate engine
            run_id = None
            if engine == "argo":
                run_id = await self.submit_argo_workflow(
                    pipeline_config["template"],
                    parameters,
                    request_id
                )
            elif engine == "kubeflow":
                run_id = await self.submit_kubeflow_pipeline(
                    pipeline_config["pipeline_id"],
                    parameters,
                    request_id
                )
            if run_id:
                # Track the workflow so monitor_workflows() publishes updates.
                self.active_workflows[request_id] = {
                    "engine": engine,
                    "run_id": run_id,
                    "started_at": datetime.now(timezone.utc).isoformat()
                }
                await self.publish_status(request_id, {
                    "status": "submitted",
                    "pipeline": pipeline_name,
                    "engine": engine,
                    "run_id": run_id,
                    "message": "Pipeline submitted successfully"
                })
            else:
                await self.publish_status(request_id, {
                    "status": "error",
                    "pipeline": pipeline_name,
                    "message": "Failed to submit pipeline"
                })
        except Exception as e:
            logger.error(f"Trigger processing failed: {e}")
            await self.publish_status(
                data.get("request_id", "unknown"),
                {"status": "error", "message": str(e)}
            )

    async def publish_status(self, request_id: str, status: Dict):
        """Publish a status dict on ai.pipeline.status.{request_id}."""
        status["request_id"] = request_id
        # Timezone-aware UTC; datetime.utcnow() is deprecated since 3.12.
        status["timestamp"] = datetime.now(timezone.utc).isoformat()
        await self.nc.publish(
            f"{STATUS_SUBJECT}.{request_id}",
            json.dumps(status).encode()
        )
        logger.info(f"Published status for {request_id}: {status.get('status')}")

    async def monitor_workflows(self):
        """Poll active workflows every 10s and publish their status."""
        while self.running:
            completed = []
            # Snapshot the items: process_trigger() can insert entries while
            # this loop awaits, which would otherwise raise "dictionary
            # changed size during iteration".
            for request_id, workflow in list(self.active_workflows.items()):
                try:
                    if workflow["engine"] == "argo":
                        status = await self.get_argo_workflow_status(workflow["run_id"])
                        await self.publish_status(request_id, {
                            "status": status["phase"].lower(),
                            "run_id": workflow["run_id"],
                            "progress": status.get("progress"),
                            "message": status.get("message", "")
                        })
                        # Terminal phases stop tracking.
                        if status["phase"] in ["Succeeded", "Failed", "Error"]:
                            completed.append(request_id)
                except Exception as e:
                    logger.error(f"Error monitoring workflow {request_id}: {e}")
            for request_id in completed:
                self.active_workflows.pop(request_id, None)
            await asyncio.sleep(10)  # Check every 10 seconds

    async def run(self):
        """Main run loop: subscribe, monitor, and wait for shutdown."""
        await self.setup()
        sub = await self.nc.subscribe(TRIGGER_SUBJECT, cb=self.process_trigger)
        logger.info(f"Subscribed to {TRIGGER_SUBJECT}")
        monitor_task = asyncio.create_task(self.monitor_workflows())

        # Graceful shutdown: flip the flag so both loops exit.
        def signal_handler():
            self.running = False

        # get_running_loop(): get_event_loop() is deprecated inside coroutines.
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, signal_handler)
        while self.running:
            await asyncio.sleep(1)
        # Cleanup
        monitor_task.cancel()
        await sub.unsubscribe()
        await self.nc.close()
        logger.info("Shutdown complete")
if __name__ == "__main__":
    # Entry point: run the bridge until SIGTERM/SIGINT stops it.
    asyncio.run(PipelineBridge().run())

241
pipeline_bridge_v2.py Normal file
View File

@@ -0,0 +1,241 @@
#!/usr/bin/env python3
"""
Pipeline Bridge Service (Refactored)
Bridges NATS events to workflow engines using handler-base:
1. Listen for pipeline triggers on "ai.pipeline.trigger"
2. Submit to Kubeflow Pipelines or Argo Workflows
3. Monitor execution and publish status updates
4. Publish completion to "ai.pipeline.status.{request_id}"
"""
import logging
from datetime import datetime, timezone
from typing import Any, Optional

import httpx
from nats.aio.msg import Msg

from handler_base import Handler, Settings
from handler_base.telemetry import create_span
logger = logging.getLogger("pipeline-bridge")
class PipelineSettings(Settings):
    """Pipeline bridge specific settings (defaults overridable via handler-base Settings)."""
    # Service identifier used by handler-base.
    service_name: str = "pipeline-bridge"
    # Kubeflow Pipelines REST API base URL.
    kubeflow_host: str = "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
    # Argo Workflows server base URL and the namespace workflows are created in.
    argo_host: str = "http://argo-server.argo.svc.cluster.local:2746"
    argo_namespace: str = "ai-ml"
# Pipeline definitions: maps a pipeline name (from the trigger request) to
# the engine that executes it plus the engine-specific identifier —
# "template" for Argo WorkflowTemplates, "pipeline_id" for Kubeflow runs.
PIPELINES = {
    "document-ingestion": {
        "engine": "argo",
        "template": "document-ingestion",
        "description": "Ingest documents into Milvus vector database",
    },
    "batch-inference": {
        "engine": "argo",
        "template": "batch-inference",
        "description": "Run batch LLM inference on a dataset",
    },
    "rag-query": {
        "engine": "kubeflow",
        "pipeline_id": "rag-pipeline",
        "description": "Execute RAG query pipeline",
    },
    "voice-pipeline": {
        "engine": "kubeflow",
        "pipeline_id": "voice-pipeline",
        "description": "Full voice assistant pipeline",
    },
    "model-evaluation": {
        "engine": "argo",
        "template": "model-evaluation",
        "description": "Evaluate model performance",
    },
}
class PipelineBridge(Handler):
    """
    Pipeline trigger handler.

    Subscribes (via handler-base) to ``ai.pipeline.trigger`` and submits the
    requested pipeline to Argo Workflows or Kubeflow Pipelines over HTTP.

    Request format:
    {
        "request_id": "uuid",
        "pipeline": "document-ingestion",
        "parameters": {"key": "value"}
    }
    Response format:
    {
        "request_id": "uuid",
        "status": "submitted",
        "run_id": "workflow-run-id",
        "engine": "argo|kubeflow"
    }
    """

    def __init__(self):
        # Kept on the instance so the submit helpers can read host settings.
        self.pipeline_settings = PipelineSettings()
        super().__init__(
            subject="ai.pipeline.trigger",
            settings=self.pipeline_settings,
            queue_group="pipeline-bridges",
        )
        self._http: Optional[httpx.AsyncClient] = None

    async def setup(self) -> None:
        """Initialize the shared HTTP client used for both engines."""
        logger.info("Initializing pipeline bridge...")
        self._http = httpx.AsyncClient(timeout=60.0)
        logger.info(f"Pipeline bridge ready. Available pipelines: {list(PIPELINES.keys())}")

    async def teardown(self) -> None:
        """Close the HTTP client on shutdown."""
        if self._http:
            await self._http.aclose()
        logger.info("Pipeline bridge closed")

    async def handle_message(self, msg: Msg, data: Any) -> Optional[dict]:
        """Validate a trigger request, submit it, and publish a status update.

        Returns the reply dict; errors are reported in-band via
        ``status: "error"`` rather than raised.
        """
        request_id = data.get("request_id", "unknown")
        pipeline_name = data.get("pipeline", "")
        parameters = data.get("parameters", {})
        logger.info(f"Triggering pipeline '{pipeline_name}' for request {request_id}")
        with create_span("pipeline.trigger") as span:
            if span:
                span.set_attribute("request.id", request_id)
                span.set_attribute("pipeline.name", pipeline_name)
            # Validate pipeline
            if pipeline_name not in PIPELINES:
                error = f"Unknown pipeline: {pipeline_name}"
                logger.error(error)
                return {
                    "request_id": request_id,
                    "status": "error",
                    "error": error,
                    "available_pipelines": list(PIPELINES.keys()),
                }
            pipeline = PIPELINES[pipeline_name]
            engine = pipeline["engine"]
            try:
                if engine == "argo":
                    run_id = await self._submit_argo(
                        pipeline["template"], parameters, request_id
                    )
                elif engine == "kubeflow":
                    run_id = await self._submit_kubeflow(
                        pipeline["pipeline_id"], parameters, request_id
                    )
                else:
                    # Registry misconfiguration. Previously a bare `else`
                    # routed unknown engines to Kubeflow and died with a
                    # confusing KeyError on "pipeline_id".
                    raise ValueError(f"Unsupported engine: {engine}")
                result = {
                    "request_id": request_id,
                    "status": "submitted",
                    "run_id": run_id,
                    "engine": engine,
                    "pipeline": pipeline_name,
                    # Timezone-aware UTC; datetime.utcnow() is deprecated.
                    "submitted_at": datetime.now(timezone.utc).isoformat(),
                }
                # Publish status update
                await self.nats.publish(
                    f"ai.pipeline.status.{request_id}", result
                )
                logger.info(f"Pipeline {pipeline_name} submitted: {run_id}")
                return result
            except Exception as e:
                logger.exception(f"Failed to submit pipeline {pipeline_name}")
                return {
                    "request_id": request_id,
                    "status": "error",
                    "error": str(e),
                }

    async def _submit_argo(
        self, template: str, parameters: dict, request_id: str
    ) -> str:
        """Create an Argo Workflow from a WorkflowTemplate via the Argo server API.

        Returns the generated workflow name; raises httpx errors on failure.
        """
        with create_span("pipeline.submit.argo") as span:
            if span:
                span.set_attribute("argo.template", template)
            workflow = {
                "apiVersion": "argoproj.io/v1alpha1",
                "kind": "Workflow",
                "metadata": {
                    # generateName lets the server append a unique suffix.
                    "generateName": f"{template}-",
                    "namespace": self.pipeline_settings.argo_namespace,
                    "labels": {
                        "request-id": request_id,
                    },
                },
                "spec": {
                    "workflowTemplateRef": {"name": template},
                    "arguments": {
                        # Argo parameters must be strings.
                        "parameters": [
                            {"name": k, "value": str(v)}
                            for k, v in parameters.items()
                        ]
                    },
                },
            }
            response = await self._http.post(
                f"{self.pipeline_settings.argo_host}/api/v1/workflows/{self.pipeline_settings.argo_namespace}",
                json={"workflow": workflow},
            )
            response.raise_for_status()
            result = response.json()
            return result["metadata"]["name"]

    async def _submit_kubeflow(
        self, pipeline_id: str, parameters: dict, request_id: str
    ) -> str:
        """Create a Kubeflow Pipelines run via the v1beta1 REST API.

        Returns the run id; raises httpx errors on failure.
        """
        with create_span("pipeline.submit.kubeflow") as span:
            if span:
                span.set_attribute("kubeflow.pipeline_id", pipeline_id)
            run_request = {
                "name": f"{pipeline_id}-{request_id[:8]}",
                "pipeline_spec": {
                    "pipeline_id": pipeline_id,
                    "parameters": [
                        {"name": k, "value": str(v)}
                        for k, v in parameters.items()
                    ],
                },
            }
            response = await self._http.post(
                f"{self.pipeline_settings.kubeflow_host}/apis/v1beta1/runs",
                json=run_request,
            )
            response.raise_for_status()
            result = response.json()
            return result["run"]["id"]
if __name__ == "__main__":
    # Entry point: handler-base's Handler.run() drives the event loop.
    bridge = PipelineBridge()
    bridge.run()

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
nats-py
httpx
kubernetes