Files
argo/kfp-integration.yaml
Billy D. 7104698eee feat: Add ML training and batch inference workflows
- batch-inference: LLM inference with optional RAG
- qlora-training: QLoRA adapter fine-tuning from Milvus
- hybrid-ml-training: Multi-GPU distributed training
- coqui-voice-training: XTTS voice cloning
- document-ingestion: Ingest documents to Milvus
- eventsource-kfp: Argo Events / Kubeflow integration
- kfp-integration: Bridge between Argo and Kubeflow
2026-02-01 20:39:42 -05:00

238 lines
7.3 KiB
YAML

# Argo Workflows + Kubeflow Pipelines Integration
# This template allows Argo Workflows to trigger KFP pipelines and vice versa
---
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: kfp-trigger
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-trigger
    app.kubernetes.io/part-of: llm-workflows
spec:
  entrypoint: trigger-kfp-pipeline
  serviceAccountName: argo-workflow
  arguments:
    parameters:
      - name: pipeline-id
        description: "Kubeflow Pipeline ID or name"
      - name: pipeline-params
        description: "JSON object of pipeline parameters"
        value: "{}"
      - name: experiment-name
        description: "KFP Experiment to use"
        value: "Default"
      - name: wait-for-completion
        description: "Wait for pipeline to complete"
        value: "true"
  templates:
    # Entry point: submit the KFP run, then optionally block until it
    # reaches a terminal state.
    - name: trigger-kfp-pipeline
      steps:
        - - name: submit-run
            template: submit-kfp-run
            arguments:
              parameters:
                - name: pipeline-id
                  value: "{{workflow.parameters.pipeline-id}}"
                - name: pipeline-params
                  value: "{{workflow.parameters.pipeline-params}}"
                - name: experiment-name
                  value: "{{workflow.parameters.experiment-name}}"
        - - name: wait-completion
            template: wait-for-kfp
            when: "{{workflow.parameters.wait-for-completion}} == true"
            arguments:
              parameters:
                - name: run-id
                  value: "{{steps.submit-run.outputs.parameters.run-id}}"

    # Submits a run to Kubeflow Pipelines and emits the run ID as an
    # output parameter for downstream steps.
    - name: submit-kfp-run
      inputs:
        parameters:
          - name: pipeline-id
          - name: pipeline-params
          - name: experiment-name
      outputs:
        parameters:
          - name: run-id
            valueFrom:
              path: /tmp/run-id
      script:
        image: python:3.13-slim
        command: [python]
        source: |
          import json
          import subprocess
          import sys
          subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "kfp==2.12.1"])
          from kfp import Client
          KUBEFLOW_HOST = "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
          client = Client(host=KUBEFLOW_HOST)
          pipeline_id = "{{inputs.parameters.pipeline-id}}"
          params = json.loads('''{{inputs.parameters.pipeline-params}}''')
          experiment_name = "{{inputs.parameters.experiment-name}}"
          # Get or create experiment. Catch Exception rather than a bare
          # except so SystemExit/KeyboardInterrupt are not swallowed.
          try:
              experiment = client.get_experiment(experiment_name=experiment_name)
          except Exception:
              experiment = client.create_experiment(name=experiment_name)
          # Resolve the pipeline: first treat the input as an ID, then fall
          # back to a display-name lookup. kfp 2.x removed the 1.x
          # filter='name="..."' syntax; get_pipeline_id() is the supported
          # name-to-ID helper and returns None when nothing matches.
          try:
              pipeline = client.get_pipeline(pipeline_id)
          except Exception:
              resolved_id = client.get_pipeline_id(pipeline_id)
              if resolved_id is None:
                  raise ValueError(f"Pipeline not found: {pipeline_id}")
              pipeline = client.get_pipeline(resolved_id)
          # Create run
          run = client.run_pipeline(
              experiment_id=experiment.experiment_id,
              job_name=f"{pipeline.display_name}-argo-{pipeline_id[:8]}",
              pipeline_id=pipeline.pipeline_id,
              params=params
          )
          print(f"Submitted KFP run: {run.run_id}")
          with open("/tmp/run-id", "w") as f:
              f.write(run.run_id)

    # Polls a KFP run every 30s until it reaches a terminal state;
    # fails the Argo step if the pipeline failed or was canceled.
    - name: wait-for-kfp
      inputs:
        parameters:
          - name: run-id
      outputs:
        parameters:
          - name: status
            valueFrom:
              path: /tmp/status
      script:
        image: python:3.13-slim
        command: [python]
        source: |
          import subprocess
          import sys
          import time
          subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "kfp==2.12.1"])
          from kfp import Client
          KUBEFLOW_HOST = "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
          run_id = "{{inputs.parameters.run-id}}"
          client = Client(host=KUBEFLOW_HOST)
          while True:
              run = client.get_run(run_id)
              # kfp 2.x get_run returns a V2beta1Run whose terminal state is
              # run.state; run.run.status was the kfp 1.x response shape and
              # raises AttributeError under kfp==2.12.1.
              state = run.state
              print(f"Run {run_id} status: {state}")
              if state in ["SUCCEEDED", "SKIPPED"]:
                  with open("/tmp/status", "w") as f:
                      f.write("SUCCEEDED")
                  break
              # v2beta1 spells the state CANCELED; keep the old spellings
              # too so this stays tolerant of backend variations.
              elif state in ["FAILED", "ERROR", "CANCELED", "CANCELLED"]:
                  with open("/tmp/status", "w") as f:
                      f.write(state)
                  raise Exception(f"Pipeline failed with status: {state}")
              time.sleep(30)
---
# WorkflowTemplate for running KFP pipeline components as Argo steps
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: kfp-component-runner
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-component-runner
    app.kubernetes.io/part-of: llm-workflows
spec:
  entrypoint: run-component
  serviceAccountName: argo-workflow
  arguments:
    parameters:
      - name: component-name
        description: "Name of the KFP component to run"
      - name: component-params
        description: "JSON parameters for the component"
        value: "{}"
  templates:
    # Posts the given JSON params to the in-cluster service that mirrors
    # the named KFP component, and exposes the JSON response as the
    # `result` output parameter.
    - name: run-component
      inputs:
        parameters:
          - name: component-name
          - name: component-params
      outputs:
        parameters:
          - name: result
            valueFrom:
              path: /tmp/result.json
      script:
        image: python:3.13-slim
        command: [python]
        source: |
          import json
          import subprocess
          import sys
          # Only httpx is used below; the previously installed pymilvus
          # was never imported and just slowed down every step.
          subprocess.check_call([
              sys.executable, "-m", "pip", "install", "-q", "httpx"
          ])
          import httpx
          component_name = "{{inputs.parameters.component-name}}"
          params = json.loads('''{{inputs.parameters.component-params}}''')
          # Component implementations (mirrors KFP components)
          COMPONENTS = {
              "transcribe_audio": {
                  "url": "http://whisper-predictor.ai-ml.svc.cluster.local",
                  "endpoint": "/v1/audio/transcriptions"
              },
              "generate_embeddings": {
                  "url": "http://embeddings-predictor.ai-ml.svc.cluster.local",
                  "endpoint": "/embeddings"
              },
              "generate_response": {
                  "url": "http://llm-draft.ai-ml.svc.cluster.local:8000",
                  "endpoint": "/v1/chat/completions"
              },
              "synthesize_speech": {
                  "url": "http://tts-predictor.ai-ml.svc.cluster.local",
                  "endpoint": "/v1/audio/speech"
              }
          }
          if component_name not in COMPONENTS:
              raise ValueError(f"Unknown component: {component_name}")
          config = COMPONENTS[component_name]
          with httpx.Client(timeout=120.0) as client:
              response = client.post(
                  f"{config['url']}{config['endpoint']}",
                  json=params
              )
              # Fail the step on HTTP 4xx/5xx instead of silently writing
              # an error payload to /tmp/result.json as if it succeeded.
              response.raise_for_status()
              result = response.json()
          with open("/tmp/result.json", "w") as f:
              json.dump(result, f)
          print(f"Component {component_name} completed")