# Workflow template catalog (this file provides the KFP integration pieces):
#   - batch-inference: LLM inference with optional RAG
#   - qlora-training: QLoRA adapter fine-tuning from Milvus
#   - hybrid-ml-training: Multi-GPU distributed training
#   - coqui-voice-training: XTTS voice cloning
#   - document-ingestion: Ingest documents to Milvus
#   - eventsource-kfp: Argo Events / Kubeflow integration
#   - kfp-integration: Bridge between Argo and Kubeflow
# Argo Workflows + Kubeflow Pipelines Integration
# This template allows Argo Workflows to trigger KFP pipelines and vice versa
---
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: kfp-trigger
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-trigger
    app.kubernetes.io/part-of: llm-workflows
spec:
  entrypoint: trigger-kfp-pipeline
  serviceAccountName: argo-workflow

  arguments:
    parameters:
      - name: pipeline-id
        description: "Kubeflow Pipeline ID or name"
      - name: pipeline-params
        description: "JSON object of pipeline parameters"
        value: "{}"
      - name: experiment-name
        description: "KFP Experiment to use"
        value: "Default"
      - name: wait-for-completion
        description: "Wait for pipeline to complete"
        value: "true"

  templates:
    # Entry point: submit the KFP run, then optionally block until it
    # reaches a terminal state.
    - name: trigger-kfp-pipeline
      steps:
        - - name: submit-run
            template: submit-kfp-run
            arguments:
              parameters:
                - name: pipeline-id
                  value: "{{workflow.parameters.pipeline-id}}"
                - name: pipeline-params
                  value: "{{workflow.parameters.pipeline-params}}"
                - name: experiment-name
                  value: "{{workflow.parameters.experiment-name}}"
        - - name: wait-completion
            template: wait-for-kfp
            when: "{{workflow.parameters.wait-for-completion}} == true"
            arguments:
              parameters:
                - name: run-id
                  value: "{{steps.submit-run.outputs.parameters.run-id}}"

    # Submits a run to the in-cluster KFP API and exports the run ID so
    # the wait step (and any caller) can track it.
    - name: submit-kfp-run
      inputs:
        parameters:
          - name: pipeline-id
          - name: pipeline-params
          - name: experiment-name
      outputs:
        parameters:
          - name: run-id
            valueFrom:
              path: /tmp/run-id
      script:
        image: python:3.13-slim
        command: [python]
        source: |
          import json
          import subprocess
          import sys

          # Base image is plain python:3.13-slim, so install the KFP SDK at runtime.
          subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "kfp==2.12.1"])

          from kfp import Client

          KUBEFLOW_HOST = "http://ml-pipeline.kubeflow.svc.cluster.local:8888"

          client = Client(host=KUBEFLOW_HOST)

          pipeline_id = "{{inputs.parameters.pipeline-id}}"
          params = json.loads('''{{inputs.parameters.pipeline-params}}''')
          experiment_name = "{{inputs.parameters.experiment-name}}"

          # Get or create experiment (get_experiment raises when it does not exist).
          try:
              experiment = client.get_experiment(experiment_name=experiment_name)
          except Exception:
              experiment = client.create_experiment(name=experiment_name)

          # Resolve the pipeline: first treat the input as an ID, then fall back
          # to a lookup by name.
          try:
              pipeline = client.get_pipeline(pipeline_id)
          except Exception:
              # Try by name
              # NOTE(review): kfp 2.x expects a JSON-encoded v2beta1 filter here;
              # this simple name="..." form is the v1 syntax — verify against the
              # deployed ml-pipeline API before relying on the name fallback.
              pipelines = client.list_pipelines(filter=f'name="{pipeline_id}"')
              if pipelines.pipelines:
                  pipeline = pipelines.pipelines[0]
              else:
                  raise ValueError(f"Pipeline not found: {pipeline_id}")

          # Create run
          run = client.run_pipeline(
              experiment_id=experiment.experiment_id,
              job_name=f"{pipeline.display_name}-argo-{pipeline_id[:8]}",
              pipeline_id=pipeline.pipeline_id,
              params=params
          )

          print(f"Submitted KFP run: {run.run_id}")
          with open("/tmp/run-id", "w") as f:
              f.write(run.run_id)

    # Polls the KFP API every 30s until the run terminates; writes the final
    # state to /tmp/status and fails the step on FAILED/ERROR/CANCELED.
    # No timeout here — bound total runtime via the Workflow's
    # activeDeadlineSeconds if needed.
    - name: wait-for-kfp
      inputs:
        parameters:
          - name: run-id
      outputs:
        parameters:
          - name: status
            valueFrom:
              path: /tmp/status
      script:
        image: python:3.13-slim
        command: [python]
        source: |
          import subprocess
          import sys
          import time

          subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "kfp==2.12.1"])

          from kfp import Client

          KUBEFLOW_HOST = "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
          run_id = "{{inputs.parameters.run-id}}"

          client = Client(host=KUBEFLOW_HOST)

          while True:
              run = client.get_run(run_id)
              # kfp 2.x (pinned above) returns a V2beta1Run whose state lives
              # in `run.state`; the old `run.run.status` shape is the v1 SDK
              # and raises AttributeError here.
              state = run.state

              print(f"Run {run_id} status: {state}")

              if state in ["SUCCEEDED", "SKIPPED"]:
                  with open("/tmp/status", "w") as f:
                      f.write("SUCCEEDED")
                  break
              # v2 API spells it CANCELED; keep CANCELLED too for safety.
              elif state in ["FAILED", "ERROR", "CANCELED", "CANCELLED"]:
                  with open("/tmp/status", "w") as f:
                      f.write(state)
                  raise Exception(f"Pipeline failed with status: {state}")

              time.sleep(30)
---
# WorkflowTemplate for running KFP pipeline components as Argo steps
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: kfp-component-runner
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: kfp-component-runner
    app.kubernetes.io/part-of: llm-workflows
spec:
  entrypoint: run-component
  serviceAccountName: argo-workflow

  arguments:
    parameters:
      - name: component-name
        description: "Name of the KFP component to run"
      - name: component-params
        description: "JSON parameters for the component"
        value: "{}"

  templates:
    # Dispatches a named component to the matching in-cluster inference
    # service via HTTP POST and exports the JSON response as `result`.
    - name: run-component
      inputs:
        parameters:
          - name: component-name
          - name: component-params
      outputs:
        parameters:
          - name: result
            valueFrom:
              path: /tmp/result.json
      script:
        image: python:3.13-slim
        command: [python]
        source: |
          import json
          import subprocess
          import sys

          # NOTE(review): pymilvus is installed but not imported below —
          # presumably reserved for future components; confirm before removing.
          subprocess.check_call([
              sys.executable, "-m", "pip", "install", "-q",
              "httpx", "pymilvus"
          ])

          import httpx

          component_name = "{{inputs.parameters.component-name}}"
          params = json.loads('''{{inputs.parameters.component-params}}''')

          # Component implementations (mirrors KFP components)
          COMPONENTS = {
              "transcribe_audio": {
                  "url": "http://whisper-predictor.ai-ml.svc.cluster.local",
                  "endpoint": "/v1/audio/transcriptions"
              },
              "generate_embeddings": {
                  "url": "http://embeddings-predictor.ai-ml.svc.cluster.local",
                  "endpoint": "/embeddings"
              },
              "generate_response": {
                  "url": "http://llm-draft.ai-ml.svc.cluster.local:8000",
                  "endpoint": "/v1/chat/completions"
              },
              "synthesize_speech": {
                  "url": "http://tts-predictor.ai-ml.svc.cluster.local",
                  "endpoint": "/v1/audio/speech"
              }
          }

          if component_name not in COMPONENTS:
              raise ValueError(f"Unknown component: {component_name}")

          config = COMPONENTS[component_name]
          with httpx.Client(timeout=120.0) as client:
              response = client.post(
                  f"{config['url']}{config['endpoint']}",
                  json=params
              )
              # Fail the step on HTTP errors instead of silently exporting an
              # error body as a successful result.
              response.raise_for_status()
              result = response.json()

          with open("/tmp/result.json", "w") as f:
              json.dump(result, f)

          print(f"Component {component_name} completed")