feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow

New:
- vllm_tuning_pipeline.py: A/B benchmark different vLLM configs,
  logs latency/TPS/TTFT to MLflow (vllm-tuning experiment)
- vllm_tuning_pipeline.yaml: compiled KFP YAML

Updated:
- voice_pipeline.py: per-step NamedTuple outputs with latency tracking,
  new log_pipeline_metrics MLflow component
- voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
This commit is contained in:
2026-02-13 08:24:11 -05:00
parent cee21f124c
commit bc4b230dd9
6 changed files with 2216 additions and 26 deletions

363
rag_pipeline.yaml Normal file
View File

@@ -0,0 +1,363 @@
# PIPELINE DEFINITION
# Name: rag-query-pipeline
# Description: RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM. Logs per-step latency to MLflow.
# Inputs:
#    collection_name: str [Default: 'knowledge_base']
#    query: str
#
# NOTE(review): compiled KFP IR (kfp==2.12.1), generated from the RAG pipeline
# Python source. Do not hand-edit; change the Python source and recompile.
components:
  # Step 1 -- embed the query text. Outputs: embedding vector + step latency.
  comp-generate-embeddings:
    executorLabel: exec-generate-embeddings
    inputDefinitions:
      parameters:
        embeddings_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
          isOptional: true
          parameterType: STRING
        text:
          parameterType: STRING
    outputDefinitions:
      parameters:
        embedding:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  # Step 4 -- final LLM answer over the reranked context.
  comp-generate-response:
    executorLabel: exec-generate-response
    inputDefinitions:
      parameters:
        context:
          parameterType: LIST
        model:
          defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
          isOptional: true
          parameterType: STRING
        query:
          parameterType: STRING
        vllm_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        completion_tokens:
          parameterType: NUMBER_INTEGER
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
  # Step 3 -- rerank retrieved docs, keep top_k by reranker score.
  comp-rerank-documents:
    executorLabel: exec-rerank-documents
    inputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        query:
          parameterType: STRING
        reranker_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  # Step 2 -- vector search in Milvus using the query embedding.
  comp-retrieve-context:
    executorLabel: exec-retrieve-context
    inputDefinitions:
      parameters:
        collection_name:
          defaultValue: knowledge_base
          isOptional: true
          parameterType: STRING
        embedding:
          parameterType: LIST
        milvus_host:
          defaultValue: milvus.ai-ml.svc.cluster.local
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 5.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
deploymentSpec:
executors:
exec-generate-embeddings:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_embeddings
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_embeddings(\n text: str,\n embeddings_url: str\
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\
\n) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\"\
, float)]):\n \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n \
\ import time\n import httpx\n from collections import namedtuple\n\
\n start = time.perf_counter()\n with httpx.Client(timeout=60.0) as\
\ client:\n response = client.post(\n f\"{embeddings_url}/embeddings\"\
,\n json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
\ )\n result = response.json()\n latency = time.perf_counter()\
\ - start\n\n EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\"\
, \"latency_s\"])\n return EmbedResult(result[\"data\"][0][\"embedding\"\
], latency)\n\n"
image: python:3.13-slim
exec-generate-response:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_response
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_response(\n query: str,\n context: list,\n \
\ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
,\n model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
\n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
\ (\"completion_tokens\", int)]):\n \"\"\"Generate response using vLLM.\"\
\"\"\n import time\n import httpx\n from collections import namedtuple\n\
\n # Build context\n if context:\n context_text = \"\\n\\n\"\
.join([doc[\"text\"] for doc in context])\n user_content = f\"Context:\\\
n{context_text}\\n\\nQuestion: {query}\"\n else:\n user_content\
\ = query\n\n system_prompt = \"\"\"You are a helpful voice assistant.\n\
Answer questions based on the provided context when available.\nKeep responses\
\ concise and natural for speech synthesis.\"\"\"\n\n messages = [\n\
\ {\"role\": \"system\", \"content\": system_prompt},\n {\"\
role\": \"user\", \"content\": user_content}\n ]\n\n start = time.perf_counter()\n\
\ with httpx.Client(timeout=180.0) as client:\n response = client.post(\n\
\ f\"{vllm_url}/v1/chat/completions\",\n json={\n\
\ \"model\": model,\n \"messages\": messages,\n\
\ \"max_tokens\": 512,\n \"temperature\":\
\ 0.7\n }\n )\n result = response.json()\n latency\
\ = time.perf_counter() - start\n\n text = result[\"choices\"][0][\"\
message\"][\"content\"]\n usage = result.get(\"usage\", {})\n completion_tokens\
\ = usage.get(\"completion_tokens\", len(text.split()))\n\n LLMResult\
\ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
])\n return LLMResult(text, latency, completion_tokens)\n\n"
image: python:3.13-slim
exec-rerank-documents:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- rerank_documents
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef rerank_documents(\n query: str,\n documents: list,\n \
\ reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
,\n top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Rerank documents using BGE\
\ reranker.\"\"\"\n import time\n import httpx\n from collections\
\ import namedtuple\n\n if not documents:\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([],\
\ 0.0)\n\n start = time.perf_counter()\n with httpx.Client(timeout=60.0)\
\ as client:\n response = client.post(\n f\"{reranker_url}/v1/rerank\"\
,\n json={\n \"query\": query,\n \
\ \"documents\": [doc[\"text\"] for doc in documents],\n \
\ \"model\": \"bge-reranker-v2-m3\"\n }\n )\n \
\ result = response.json()\n latency = time.perf_counter() - start\n\
\n # Sort by rerank score\n reranked = sorted(\n zip(documents,\
\ result.get(\"scores\", [0] * len(documents))),\n key=lambda x:\
\ x[1],\n reverse=True\n )[:top_k]\n\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([doc\
\ for doc, score in reranked], latency)\n\n"
image: python:3.13-slim
exec-retrieve-context:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- retrieve_context
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
\ \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef retrieve_context(\n embedding: list,\n milvus_host: str\
\ = \"milvus.ai-ml.svc.cluster.local\",\n collection_name: str = \"knowledge_base\"\
,\n top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Retrieve relevant documents\
\ from Milvus vector database.\"\"\"\n import time\n from pymilvus\
\ import connections, Collection, utility\n from collections import namedtuple\n\
\n start = time.perf_counter()\n connections.connect(host=milvus_host,\
\ port=19530)\n\n if not utility.has_collection(collection_name):\n \
\ latency = time.perf_counter() - start\n RetrieveResult =\
\ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult([], latency)\n\n collection = Collection(collection_name)\n\
\ collection.load()\n\n results = collection.search(\n data=[embedding],\n\
\ anns_field=\"embedding\",\n param={\"metric_type\": \"COSINE\"\
, \"params\": {\"nprobe\": 10}},\n limit=top_k,\n output_fields=[\"\
text\", \"source\"]\n )\n latency = time.perf_counter() - start\n\n\
\ documents = []\n for hits in results:\n for hit in hits:\n\
\ documents.append({\n \"text\": hit.entity.get(\"\
text\"),\n \"source\": hit.entity.get(\"source\"),\n \
\ \"score\": hit.distance\n })\n\n RetrieveResult\
\ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult(documents, latency)\n\n"
image: python:3.13-slim
pipelineInfo:
  description: 'RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM. Logs per-step
    latency to MLflow.'
  name: rag-query-pipeline
# Top-level DAG: query -> generate-embeddings -> retrieve-context ->
# rerank-documents -> generate-response; each task consumes the previous
# task's `documents`/`embedding` output parameter.
root:
  dag:
    tasks:
      generate-embeddings:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-embeddings
        inputs:
          parameters:
            text:
              componentInputParameter: query
        taskInfo:
          name: generate-embeddings
      generate-response:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-response
        dependentTasks:
        - rerank-documents
        inputs:
          parameters:
            context:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: rerank-documents
            query:
              componentInputParameter: query
        taskInfo:
          name: generate-response
      rerank-documents:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-rerank-documents
        dependentTasks:
        - retrieve-context
        inputs:
          parameters:
            documents:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: retrieve-context
            query:
              componentInputParameter: query
        taskInfo:
          name: rerank-documents
      retrieve-context:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-retrieve-context
        dependentTasks:
        - generate-embeddings
        inputs:
          parameters:
            collection_name:
              componentInputParameter: collection_name
            embedding:
              taskOutputParameter:
                outputParameterKey: embedding
                producerTask: generate-embeddings
        taskInfo:
          name: retrieve-context
  inputDefinitions:
    parameters:
      collection_name:
        defaultValue: knowledge_base
        description: Milvus collection name
        isOptional: true
        parameterType: STRING
      query:
        description: Text query
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1

87
tts_pipeline.yaml Normal file
View File

@@ -0,0 +1,87 @@
# PIPELINE DEFINITION
# Name: text-to-speech-pipeline
# Description: Simple text to speech pipeline
# Inputs:
#    text: str
#
# NOTE(review): compiled KFP IR (kfp==2.12.1), generated from the TTS pipeline
# Python source. Do not hand-edit; change the Python source and recompile.
components:
  # Single step: POST text to the TTS service; outputs base64 WAV + latency.
  comp-synthesize-speech:
    executorLabel: exec-synthesize-speech
    inputDefinitions:
      parameters:
        text:
          parameterType: STRING
        tts_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        audio_b64:
          parameterType: STRING
        latency_s:
          parameterType: NUMBER_DOUBLE
deploymentSpec:
executors:
exec-synthesize-speech:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- synthesize_speech
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef synthesize_speech(\n text: str,\n tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
\n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
\ float)]):\n \"\"\"Convert text to speech using TTS service.\"\"\"\n\
\ import base64\n import time\n import httpx\n from collections\
\ import namedtuple\n\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
\ as client:\n response = client.post(\n f\"{tts_url}/v1/audio/speech\"\
,\n json={\n \"input\": text,\n \
\ \"voice\": \"en_US-lessac-high\",\n \"response_format\"\
: \"wav\"\n }\n )\n audio_b64 = base64.b64encode(response.content).decode(\"\
utf-8\")\n latency = time.perf_counter() - start\n\n TTSResult = namedtuple(\"\
TTSResult\", [\"audio_b64\", \"latency_s\"])\n return TTSResult(audio_b64,\
\ latency)\n\n"
image: python:3.13-slim
pipelineInfo:
  description: Simple text to speech pipeline
  name: text-to-speech-pipeline
# Single-task DAG: the pipeline input `text` feeds synthesize-speech directly.
root:
  dag:
    tasks:
      synthesize-speech:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-synthesize-speech
        inputs:
          parameters:
            text:
              componentInputParameter: text
        taskInfo:
          name: synthesize-speech
  inputDefinitions:
    parameters:
      text:
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1

454
vllm_tuning_pipeline.py Normal file
View File

@@ -0,0 +1,454 @@
#!/usr/bin/env python3
"""
vLLM Tuning Evaluation Pipeline - Kubeflow Pipelines SDK
Runs inference benchmarks with different vLLM configurations and logs
results to MLflow so you can compare APC, chunked prefill, speculative
decoding, and GPU memory utilization settings side-by-side.
Usage:
pip install kfp==2.12.1
python vllm_tuning_pipeline.py
# Upload vllm_tuning_pipeline.yaml to Kubeflow Pipelines UI
"""
from kfp import dsl
from kfp import compiler
from typing import NamedTuple
MLFLOW_IMAGE = "python:3.13-slim"
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]
BENCH_PACKAGES = ["httpx"]
# ---- MLflow components ----
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def create_tuning_run(
    experiment_name: str,
    run_name: str,
    tuning_params: dict,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> NamedTuple("RunInfo", [("run_id", str), ("experiment_id", str)]):
    """Open an MLflow run for one vLLM tuning experiment.

    Looks up (or creates) the named experiment, starts a run tagged with
    the originating KFP run id, records every tuning knob as an MLflow
    param under the ``vllm.`` prefix, then closes the run so downstream
    steps can attach metrics to it by id.

    Args:
        experiment_name: MLflow experiment to file the run under.
        run_name: Display name for the new run.
        tuning_params: Mapping of vLLM config knob -> value to log.
        mlflow_tracking_uri: MLflow tracking server URL.

    Returns:
        RunInfo(run_id, experiment_id) for downstream logging steps.
    """
    import os
    import mlflow
    from mlflow.tracking import MlflowClient
    from collections import namedtuple

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    tracking = MlflowClient()

    # Reuse the experiment when it already exists; otherwise create it
    # with an explicit artifact location on the shared volume.
    existing = tracking.get_experiment_by_name(experiment_name)
    if existing:
        experiment_id = existing.experiment_id
    else:
        experiment_id = tracking.create_experiment(
            name=experiment_name,
            artifact_location=f"/mlflow/artifacts/{experiment_name}",
        )

    run_tags = {
        "pipeline.type": "vllm-tuning",
        "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
    }

    # Context manager ends the run (status FINISHED) on exit, matching an
    # explicit start_run()/end_run() pair.
    with mlflow.start_run(
        experiment_id=experiment_id, run_name=run_name, tags=run_tags
    ) as active_run:
        for knob, setting in tuning_params.items():
            mlflow.log_param(f"vllm.{knob}", setting)
        run_id = active_run.info.run_id

    RunInfo = namedtuple("RunInfo", ["run_id", "experiment_id"])
    return RunInfo(run_id, experiment_id)
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_benchmark_results(
    run_id: str,
    metrics: dict,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
    """Attach benchmark metrics to an existing MLflow run and finish it.

    Every entry of ``metrics`` is logged as an individual MLflow metric,
    the full dict is stored as a JSON artifact for later inspection, and
    the run is then marked FINISHED.

    Args:
        run_id: Id of the run opened by create_tuning_run.
        metrics: Flat mapping of metric name -> numeric value.
        mlflow_tracking_uri: MLflow tracking server URL.

    Returns:
        The same run_id, for chaining.
    """
    import json
    import tempfile
    import mlflow
    from mlflow.tracking import MlflowClient
    from pathlib import Path

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    tracking = MlflowClient()

    for metric_name, metric_value in metrics.items():
        tracking.log_metric(run_id, metric_name, float(metric_value))

    # Keep the raw results alongside the run as a JSON artifact.
    with tempfile.TemporaryDirectory() as scratch_dir:
        results_file = Path(scratch_dir) / "benchmark_results.json"
        results_file.write_text(json.dumps(metrics, indent=2))
        tracking.log_artifact(run_id, str(results_file))

    tracking.set_terminated(run_id, status="FINISHED")
    return run_id
# ---- Benchmark components ----
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=BENCH_PACKAGES,
)
def build_prompt_suite() -> list:
    """Return a list of test prompts spanning short, medium, and long inputs.

    The suite covers quick factual questions, mid-size reasoning and
    coding tasks, one long structured tutorial request, and one prompt
    that repeats an earlier system+user prefix so automatic prefix
    caching has something to hit.
    """
    homelab_system = {
        "role": "system",
        "content": "You are a helpful AI assistant running on a homelab.",
    }

    def case(prompt_id, category, messages, max_tokens):
        # Uniform prompt shape consumed by run_benchmark.
        return {
            "id": prompt_id,
            "category": category,
            "messages": messages,
            "max_tokens": max_tokens,
        }

    return [
        case(
            "short-1",
            "short",
            [{"role": "user", "content": "What is the capital of France?"}],
            64,
        ),
        case(
            "short-2",
            "short",
            [{"role": "user", "content": "Explain quantum computing in one sentence."}],
            64,
        ),
        case(
            "medium-1",
            "medium",
            [
                homelab_system,
                {
                    "role": "user",
                    "content": (
                        "Compare and contrast supervised and unsupervised "
                        "machine learning. Give examples of each and explain "
                        "when you would choose one over the other."
                    ),
                },
            ],
            512,
        ),
        case(
            "medium-2",
            "medium",
            [
                {
                    "role": "user",
                    "content": (
                        "Write a Python function that implements a binary search "
                        "tree with insert, search, and delete operations. Include "
                        "docstrings and type hints."
                    ),
                },
            ],
            1024,
        ),
        case(
            "long-1",
            "long",
            [
                {
                    "role": "system",
                    "content": "You are a technical writer for a Kubernetes homelab blog.",
                },
                {
                    "role": "user",
                    "content": (
                        "Write a detailed tutorial on setting up a multi-node "
                        "Kubernetes cluster with Talos Linux, covering: "
                        "1) Hardware requirements and network topology, "
                        "2) Talos machine config generation, "
                        "3) Control plane bootstrapping, "
                        "4) Worker node joining, "
                        "5) CNI setup with Cilium, "
                        "6) Storage with Rook-Ceph, "
                        "7) GitOps with Flux CD. "
                        "Include YAML examples for each step."
                    ),
                },
            ],
            2048,
        ),
        # Shares the system prompt and opening sentence with medium-1 so a
        # prefix-cache-enabled server can reuse the cached prefix.
        case(
            "repeat-prefix-1",
            "prefix-cache-test",
            [
                homelab_system,
                {
                    "role": "user",
                    "content": (
                        "Compare and contrast supervised and unsupervised "
                        "machine learning. Now focus specifically on "
                        "reinforcement learning and how it differs."
                    ),
                },
            ],
            512,
        ),
    ]
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=BENCH_PACKAGES,
)
def run_benchmark(
    prompts: list,
    llm_endpoint: str,
    model_name: str,
    num_warmup: int = 2,
    num_iterations: int = 3,
) -> dict:
    """
    Run all prompts through the LLM endpoint and collect timing metrics.

    Streams each completion so time-to-first-token (TTFT) can be measured,
    then aggregates p50/p95/mean latency, tokens/sec, and TTFT across
    ``num_iterations`` passes over the prompt suite.

    Fixes vs. previous version: SSE chunks are parsed as JSON and only
    content-bearing deltas count as tokens, so the initial role-only delta
    no longer skews TTFT and the empty finish chunk no longer inflates the
    token count / TPS.

    Args:
        prompts: Prompt suite (dicts with id/category/messages/max_tokens).
        llm_endpoint: Base URL of the OpenAI-compatible vLLM endpoint.
        model_name: Model identifier sent with each request.
        num_warmup: Untimed warmup requests issued before benchmarking.
        num_iterations: Number of passes over the full prompt suite.

    Returns:
        Flat dict of aggregate metrics (suitable for MLflow log_metric),
        with the same keys as before.
    """
    import json
    import statistics
    import time
    import httpx

    all_latencies: list[float] = []
    all_tps: list[float] = []
    all_ttft: list[float] = []
    per_category: dict[str, list[float]] = {}

    def iter_delta_contents(resp):
        """Yield the delta-content string of each SSE data chunk (may be '')."""
        for line in resp.iter_lines():
            if not line.startswith("data: "):
                continue
            chunk = line[6:].strip()
            if chunk == "[DONE]":
                return
            # Fall back to treating an unparseable chunk as one token so a
            # non-conforming server still produces (approximate) counts.
            content = chunk
            try:
                event = json.loads(chunk)
                choices = event.get("choices") or []
                if choices:
                    content = (choices[0].get("delta") or {}).get("content") or ""
                else:
                    content = ""
            except (ValueError, TypeError, AttributeError, IndexError):
                pass
            yield content

    with httpx.Client(timeout=300.0) as client:
        # Warmup: trigger model load / cache population; never timed.
        for _ in range(num_warmup):
            try:
                client.post(
                    f"{llm_endpoint}/v1/chat/completions",
                    json={
                        "model": model_name,
                        "messages": [{"role": "user", "content": "Hi"}],
                        "max_tokens": 8,
                        "temperature": 0,
                    },
                )
            except Exception:
                pass  # best-effort warmup; failures here are not benchmarked

        for _ in range(num_iterations):
            for prompt in prompts:
                category = prompt.get("category", "unknown")
                payload = {
                    "model": model_name,
                    "messages": prompt["messages"],
                    "max_tokens": prompt.get("max_tokens", 256),
                    "temperature": 0,
                    "stream": True,
                }
                try:
                    t_start = time.perf_counter()
                    first_token_time = None
                    completion_tokens = 0
                    with client.stream(
                        "POST",
                        f"{llm_endpoint}/v1/chat/completions",
                        json=payload,
                    ) as resp:
                        resp.raise_for_status()
                        for content in iter_delta_contents(resp):
                            if not content:
                                # Role-only / finish chunks carry no text.
                                continue
                            if first_token_time is None:
                                first_token_time = time.perf_counter()
                            completion_tokens += 1
                    t_end = time.perf_counter()
                    latency = t_end - t_start
                    ttft = (
                        (first_token_time - t_start)
                        if first_token_time
                        else latency
                    )
                    tps = completion_tokens / latency if latency > 0 else 0
                    all_latencies.append(latency)
                    all_tps.append(tps)
                    all_ttft.append(ttft)
                    per_category.setdefault(category, []).append(latency)
                except Exception:
                    # Record the failure but keep benchmarking the rest.
                    all_latencies.append(-1)
                    all_tps.append(0)
                    all_ttft.append(-1)

    # Failed requests were recorded as -1 / 0; exclude them from stats.
    valid_latencies = [l for l in all_latencies if l > 0]
    valid_tps = [t for t in all_tps if t > 0]
    valid_ttft = [t for t in all_ttft if t > 0]

    def mean_or_zero(values):
        return statistics.mean(values) if values else 0

    def median_or_zero(values):
        return statistics.median(values) if values else 0

    def p95_or_zero(values):
        # Nearest-rank p95; index clamped so it is safe for any list size.
        if not values:
            return 0
        ordered = sorted(values)
        return ordered[min(int(len(ordered) * 0.95), len(ordered) - 1)]

    metrics = {
        "total_requests": len(all_latencies),
        "successful_requests": len(valid_latencies),
        "failed_requests": len(all_latencies) - len(valid_latencies),
        # Latency
        "latency_mean_s": mean_or_zero(valid_latencies),
        "latency_p50_s": median_or_zero(valid_latencies),
        "latency_p95_s": p95_or_zero(valid_latencies),
        # Throughput
        "tokens_per_second_mean": mean_or_zero(valid_tps),
        "tokens_per_second_p50": median_or_zero(valid_tps),
        # Time to first token
        "ttft_mean_s": mean_or_zero(valid_ttft),
        "ttft_p50_s": median_or_zero(valid_ttft),
        "ttft_p95_s": p95_or_zero(valid_ttft),
    }

    # Per-category mean latency (category names come from the prompt suite).
    for cat, lats in per_category.items():
        valid = [l for l in lats if l > 0]
        if valid:
            metrics[f"latency_mean_{cat}_s"] = statistics.mean(valid)

    return metrics
# ---- Pipeline ----
@dsl.pipeline(
    name="vllm-tuning-evaluation",
    description=(
        "Benchmark vLLM with different tuning configurations. "
        "Logs latency, TPS, and TTFT to MLflow for A/B comparison."
    ),
)
def vllm_tuning_pipeline(
    llm_endpoint: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm",
    model_name: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    # Tuning knobs (match env vars in rayservice.yaml)
    enable_prefix_caching: str = "true",
    enable_chunked_prefill: str = "true",
    num_speculative_tokens: str = "3",
    ngram_prompt_lookup_max: str = "4",
    gpu_memory_utilization: str = "0.90",
    # Benchmark config
    num_warmup: int = 2,
    num_iterations: int = 3,
    run_label: str = "baseline",
):
    """
    vLLM Tuning Evaluation Pipeline

    Run this multiple times with different tuning params, then compare
    runs in the MLflow "vllm-tuning" experiment.

    Args:
        llm_endpoint: vLLM inference endpoint URL
        model_name: HF model identifier
        enable_prefix_caching: "true" or "false"
        enable_chunked_prefill: "true" or "false"
        num_speculative_tokens: number of speculative tokens (0 = off)
        ngram_prompt_lookup_max: ngram window for spec decode (0 = off)
        gpu_memory_utilization: 0.0 - 1.0
        num_warmup: warmup requests before timing
        num_iterations: how many times to repeat the prompt suite
        run_label: human-readable label (e.g. "apc-on-spec3")
    """
    # Everything logged as an MLflow param so runs are comparable later.
    tuning_params = {
        "enable_prefix_caching": enable_prefix_caching,
        "enable_chunked_prefill": enable_chunked_prefill,
        "num_speculative_tokens": num_speculative_tokens,
        "ngram_prompt_lookup_max": ngram_prompt_lookup_max,
        "gpu_memory_utilization": gpu_memory_utilization,
        "model_name": model_name,
        "llm_endpoint": llm_endpoint,
        "num_warmup": str(num_warmup),
        "num_iterations": str(num_iterations),
    }

    # 1. Open the MLflow run that the benchmark metrics will attach to.
    run_task = create_tuning_run(
        experiment_name="vllm-tuning",
        run_name=f"vllm-{run_label}",
        tuning_params=tuning_params,
    )

    # 2. The prompt suite is static, so caching its output is safe.
    suite_task = build_prompt_suite()
    suite_task.set_caching_options(enable_caching=True)

    # 3. Never cache the benchmark itself -- timings must be fresh.
    benchmark_task = run_benchmark(
        prompts=suite_task.output,
        llm_endpoint=llm_endpoint,
        model_name=model_name,
        num_warmup=num_warmup,
        num_iterations=num_iterations,
    )
    benchmark_task.set_caching_options(enable_caching=False)

    # 4. Persist metrics and close the MLflow run (data-flow dependency on
    # both the run id and the benchmark output orders this step last).
    log_benchmark_results(
        run_id=run_task.outputs["run_id"],
        metrics=benchmark_task.output,
    )
if __name__ == "__main__":
    # Compile the pipeline to KFP IR YAML for upload to the Pipelines UI.
    compiler.Compiler().compile(
        vllm_tuning_pipeline,
        "vllm_tuning_pipeline.yaml",
    )

    # Usage hints for A/B comparing configurations in MLflow.
    usage = [
        "Compiled: vllm_tuning_pipeline.yaml",
        "",
        "Example runs to compare configurations:",
        " # Baseline (current config)",
        " kfp run submit vllm_tuning_pipeline.yaml --run-label=baseline",
        "",
        " # APC disabled",
        " kfp run submit vllm_tuning_pipeline.yaml \\",
        " --enable-prefix-caching=false --run-label=no-apc",
        "",
        " # No speculative decoding",
        " kfp run submit vllm_tuning_pipeline.yaml \\",
        " --num-speculative-tokens=0 --run-label=no-spec",
        "",
        " # Aggressive spec decode",
        " kfp run submit vllm_tuning_pipeline.yaml \\",
        " --num-speculative-tokens=5 --ngram-prompt-lookup-max=6 --run-label=spec5-ngram6",
    ]
    print("\n".join(usage))

501
vllm_tuning_pipeline.yaml Normal file
View File

@@ -0,0 +1,501 @@
# PIPELINE DEFINITION
# Name: vllm-tuning-evaluation
# Description: Benchmark vLLM with different tuning configurations. Logs latency, TPS, and TTFT to MLflow for A/B comparison.
# NOTE: auto-generated by compiling vllm_tuning_pipeline.py (kfp 2.12.1); do not edit by hand -- recompile instead.
# Inputs:
# enable_chunked_prefill: str [Default: 'true']
# enable_prefix_caching: str [Default: 'true']
# gpu_memory_utilization: str [Default: '0.90']
# llm_endpoint: str [Default: 'http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm']
# model_name: str [Default: 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4']
# ngram_prompt_lookup_max: str [Default: '4']
# num_iterations: int [Default: 3.0]
# num_speculative_tokens: str [Default: '3']
# num_warmup: int [Default: 2.0]
# run_label: str [Default: 'baseline']
components:
  # Component interface declarations (one per @dsl.component in the source).
  # build_prompt_suite: no inputs; emits the benchmark prompt list.
  comp-build-prompt-suite:
    executorLabel: exec-build-prompt-suite
    outputDefinitions:
      parameters:
        Output:
          parameterType: LIST
  # create_tuning_run: opens an MLflow run tagged with the vLLM tuning params
  # (STRUCT) and returns the MLflow run_id/experiment_id for downstream use.
  comp-create-tuning-run:
    executorLabel: exec-create-tuning-run
    inputDefinitions:
      parameters:
        experiment_name:
          parameterType: STRING
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_name:
          parameterType: STRING
        tuning_params:
          parameterType: STRUCT
    outputDefinitions:
      parameters:
        experiment_id:
          parameterType: STRING
        run_id:
          parameterType: STRING
  # log_benchmark_results: writes the metrics STRUCT to the given MLflow run,
  # attaches the raw results JSON as an artifact, and marks the run FINISHED.
  comp-log-benchmark-results:
    executorLabel: exec-log-benchmark-results
    inputDefinitions:
      parameters:
        metrics:
          parameterType: STRUCT
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_id:
          parameterType: STRING
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  # run_benchmark: streams each prompt through the vLLM chat-completions
  # endpoint and returns aggregate latency/TPS/TTFT metrics as a STRUCT.
  comp-run-benchmark:
    executorLabel: exec-run-benchmark
    inputDefinitions:
      parameters:
        llm_endpoint:
          parameterType: STRING
        model_name:
          parameterType: STRING
        num_iterations:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        num_warmup:
          defaultValue: 2.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        prompts:
          parameterType: LIST
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRUCT
deploymentSpec:
executors:
exec-build-prompt-suite:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- build_prompt_suite
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef build_prompt_suite() -> list:\n \"\"\"Return a list of test\
\ prompts spanning short, medium, and long inputs.\"\"\"\n return [\n\
\ {\n \"id\": \"short-1\",\n \"category\":\
\ \"short\",\n \"messages\": [\n {\"role\": \"\
user\", \"content\": \"What is the capital of France?\"}\n ],\n\
\ \"max_tokens\": 64,\n },\n {\n \"\
id\": \"short-2\",\n \"category\": \"short\",\n \"\
messages\": [\n {\"role\": \"user\", \"content\": \"Explain\
\ quantum computing in one sentence.\"}\n ],\n \"\
max_tokens\": 64,\n },\n {\n \"id\": \"medium-1\"\
,\n \"category\": \"medium\",\n \"messages\": [\n\
\ {\n \"role\": \"system\",\n \
\ \"content\": \"You are a helpful AI assistant running on a\
\ homelab.\",\n },\n {\n \
\ \"role\": \"user\",\n \"content\": (\n \
\ \"Compare and contrast supervised and unsupervised \"\n \
\ \"machine learning. Give examples of each and explain\
\ \"\n \"when you would choose one over the other.\"\
\n ),\n },\n ],\n \
\ \"max_tokens\": 512,\n },\n {\n \"id\": \"\
medium-2\",\n \"category\": \"medium\",\n \"messages\"\
: [\n {\n \"role\": \"user\",\n \
\ \"content\": (\n \"Write a Python\
\ function that implements a binary search \"\n \"\
tree with insert, search, and delete operations. Include \"\n \
\ \"docstrings and type hints.\"\n ),\n\
\ },\n ],\n \"max_tokens\": 1024,\n\
\ },\n {\n \"id\": \"long-1\",\n \"\
category\": \"long\",\n \"messages\": [\n {\n\
\ \"role\": \"system\",\n \"content\"\
: \"You are a technical writer for a Kubernetes homelab blog.\",\n \
\ },\n {\n \"role\": \"user\"\
,\n \"content\": (\n \"Write a\
\ detailed tutorial on setting up a multi-node \"\n \
\ \"Kubernetes cluster with Talos Linux, covering: \"\n \
\ \"1) Hardware requirements and network topology, \"\n \
\ \"2) Talos machine config generation, \"\n \
\ \"3) Control plane bootstrapping, \"\n \
\ \"4) Worker node joining, \"\n \"5) CNI setup\
\ with Cilium, \"\n \"6) Storage with Rook-Ceph,\
\ \"\n \"7) GitOps with Flux CD. \"\n \
\ \"Include YAML examples for each step.\"\n \
\ ),\n },\n ],\n \"max_tokens\"\
: 2048,\n },\n {\n \"id\": \"repeat-prefix-1\"\
,\n \"category\": \"prefix-cache-test\",\n \"messages\"\
: [\n {\n \"role\": \"system\",\n \
\ \"content\": \"You are a helpful AI assistant running on\
\ a homelab.\",\n },\n {\n \
\ \"role\": \"user\",\n \"content\": (\n \
\ \"Compare and contrast supervised and unsupervised \"\n\
\ \"machine learning. Now focus specifically on \"\
\n \"reinforcement learning and how it differs.\"\
\n ),\n },\n ],\n \
\ \"max_tokens\": 512,\n },\n ]\n\n"
image: python:3.13-slim
exec-create-tuning-run:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- create_tuning_run
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef create_tuning_run(\n experiment_name: str,\n run_name:\
\ str,\n tuning_params: dict,\n mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
,\n) -> NamedTuple(\"RunInfo\", [(\"run_id\", str), (\"experiment_id\",\
\ str)]):\n \"\"\"Create an MLflow run for a vLLM tuning experiment.\"\
\"\"\n import os\n import mlflow\n from mlflow.tracking import\
\ MlflowClient\n from collections import namedtuple\n\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
\ client = MlflowClient()\n\n exp = client.get_experiment_by_name(experiment_name)\n\
\ experiment_id = (\n exp.experiment_id\n if exp\n \
\ else client.create_experiment(\n name=experiment_name,\n\
\ artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
,\n )\n )\n\n tags = {\n \"pipeline.type\": \"vllm-tuning\"\
,\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\", \"unknown\"),\n\
\ }\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\
\ run_name=run_name, tags=tags\n )\n # Log every tuning param\n \
\ for key, value in tuning_params.items():\n mlflow.log_param(f\"\
vllm.{key}\", value)\n run_id = run.info.run_id\n mlflow.end_run()\n\
\n RunInfo = namedtuple(\"RunInfo\", [\"run_id\", \"experiment_id\"])\n\
\ return RunInfo(run_id, experiment_id)\n\n"
image: python:3.13-slim
exec-log-benchmark-results:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- log_benchmark_results
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef log_benchmark_results(\n run_id: str,\n metrics: dict,\n\
\ mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
,\n) -> str:\n \"\"\"Log benchmark metrics to MLflow and close the run.\"\
\"\"\n import json\n import tempfile\n import mlflow\n from\
\ mlflow.tracking import MlflowClient\n from pathlib import Path\n\n\
\ mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
\n for key, value in metrics.items():\n client.log_metric(run_id,\
\ key, float(value))\n\n # Save full results as artifact\n with tempfile.TemporaryDirectory()\
\ as tmpdir:\n path = Path(tmpdir) / \"benchmark_results.json\"\n\
\ path.write_text(json.dumps(metrics, indent=2))\n client.log_artifact(run_id,\
\ str(path))\n\n client.set_terminated(run_id, status=\"FINISHED\")\n\
\ return run_id\n\n"
image: python:3.13-slim
exec-run-benchmark:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- run_benchmark
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_benchmark(\n prompts: list,\n llm_endpoint: str,\n\
\ model_name: str,\n num_warmup: int = 2,\n num_iterations: int\
\ = 3,\n) -> dict:\n \"\"\"\n Run all prompts through the LLM endpoint\
\ and collect timing metrics.\n\n Returns aggregate metrics: p50/p95/mean\
\ latency, tokens/sec, TTFT.\n \"\"\"\n import time\n import statistics\n\
\ import httpx\n\n all_latencies: list[float] = []\n all_tps: list[float]\
\ = []\n all_ttft: list[float] = []\n per_category: dict[str, list[float]]\
\ = {}\n\n with httpx.Client(timeout=300.0) as client:\n # Warmup\n\
\ for _ in range(num_warmup):\n try:\n \
\ client.post(\n f\"{llm_endpoint}/v1/chat/completions\"\
,\n json={\n \"model\": model_name,\n\
\ \"messages\": [{\"role\": \"user\", \"content\"\
: \"Hi\"}],\n \"max_tokens\": 8,\n \
\ \"temperature\": 0,\n },\n \
\ )\n except Exception:\n pass\n\n # Benchmark\n\
\ for iteration in range(num_iterations):\n for prompt\
\ in prompts:\n category = prompt.get(\"category\", \"unknown\"\
)\n payload = {\n \"model\": model_name,\n\
\ \"messages\": prompt[\"messages\"],\n \
\ \"max_tokens\": prompt.get(\"max_tokens\", 256),\n \
\ \"temperature\": 0,\n \"stream\": True,\n \
\ }\n\n try:\n t_start = time.perf_counter()\n\
\ first_token_time = None\n\n with\
\ client.stream(\n \"POST\",\n \
\ f\"{llm_endpoint}/v1/chat/completions\",\n \
\ json=payload,\n ) as resp:\n \
\ resp.raise_for_status()\n completion_tokens =\
\ 0\n for line in resp.iter_lines():\n \
\ if not line.startswith(\"data: \"):\n \
\ continue\n chunk = line[6:]\n\
\ if chunk == \"[DONE]\":\n \
\ break\n if first_token_time is\
\ None:\n first_token_time = time.perf_counter()\n\
\ completion_tokens += 1\n\n \
\ t_end = time.perf_counter()\n latency = t_end -\
\ t_start\n ttft = (\n (first_token_time\
\ - t_start)\n if first_token_time\n \
\ else latency\n )\n tps\
\ = (\n completion_tokens / latency if latency >\
\ 0 else 0\n )\n\n all_latencies.append(latency)\n\
\ all_tps.append(tps)\n all_ttft.append(ttft)\n\
\ per_category.setdefault(category, []).append(latency)\n\
\n except Exception as exc:\n # Record\
\ failure but keep going\n all_latencies.append(-1)\n\
\ all_tps.append(0)\n all_ttft.append(-1)\n\
\n # Compute aggregates\n valid_latencies = [l for l in all_latencies\
\ if l > 0]\n valid_tps = [t for t in all_tps if t > 0]\n valid_ttft\
\ = [t for t in all_ttft if t > 0]\n\n def safe_stat(values, func):\n\
\ return func(values) if values else 0\n\n metrics = {\n \
\ \"total_requests\": len(all_latencies),\n \"successful_requests\"\
: len(valid_latencies),\n \"failed_requests\": len(all_latencies)\
\ - len(valid_latencies),\n # Latency\n \"latency_mean_s\"\
: safe_stat(valid_latencies, statistics.mean),\n \"latency_p50_s\"\
: safe_stat(\n valid_latencies,\n lambda v: statistics.median(v),\n\
\ ),\n \"latency_p95_s\": safe_stat(\n valid_latencies,\n\
\ lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,\n \
\ ),\n # Throughput\n \"tokens_per_second_mean\": safe_stat(valid_tps,\
\ statistics.mean),\n \"tokens_per_second_p50\": safe_stat(\n \
\ valid_tps, lambda v: statistics.median(v)\n ),\n \
\ # Time to first token\n \"ttft_mean_s\": safe_stat(valid_ttft,\
\ statistics.mean),\n \"ttft_p50_s\": safe_stat(valid_ttft, lambda\
\ v: statistics.median(v)),\n \"ttft_p95_s\": safe_stat(\n \
\ valid_ttft,\n lambda v: sorted(v)[int(len(v) * 0.95)]\
\ if v else 0,\n ),\n }\n\n # Per-category latency\n for\
\ cat, lats in per_category.items():\n valid = [l for l in lats if\
\ l > 0]\n if valid:\n metrics[f\"latency_mean_{cat}_s\"\
] = statistics.mean(valid)\n\n return metrics\n\n"
image: python:3.13-slim
pipelineInfo:
  description: Benchmark vLLM with different tuning configurations. Logs latency,
    TPS, and TTFT to MLflow for A/B comparison.
  name: vllm-tuning-evaluation
# Execution graph: build-prompt-suite and create-tuning-run have no mutual
# dependency and start in parallel; run-benchmark consumes the prompt suite;
# log-benchmark-results joins the benchmark metrics with the MLflow run_id.
root:
  dag:
    tasks:
      # Cached: the prompt suite is a static list (source sets
      # set_caching_options(enable_caching=True)).
      build-prompt-suite:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-build-prompt-suite
        taskInfo:
          name: build-prompt-suite
      # NOTE(review): enableCache: true on a step whose executor creates an
      # MLflow run -- a cache hit would reuse a stale run_id instead of
      # opening a fresh run; confirm this is intended.
      create-tuning-run:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-create-tuning-run
        inputs:
          parameters:
            experiment_name:
              runtimeValue:
                constant: vllm-tuning
            pipelinechannel--enable_chunked_prefill:
              componentInputParameter: enable_chunked_prefill
            pipelinechannel--enable_prefix_caching:
              componentInputParameter: enable_prefix_caching
            pipelinechannel--gpu_memory_utilization:
              componentInputParameter: gpu_memory_utilization
            pipelinechannel--llm_endpoint:
              componentInputParameter: llm_endpoint
            pipelinechannel--model_name:
              componentInputParameter: model_name
            pipelinechannel--ngram_prompt_lookup_max:
              componentInputParameter: ngram_prompt_lookup_max
            pipelinechannel--num_iterations:
              componentInputParameter: num_iterations
            pipelinechannel--num_speculative_tokens:
              componentInputParameter: num_speculative_tokens
            pipelinechannel--num_warmup:
              componentInputParameter: num_warmup
            pipelinechannel--run_label:
              componentInputParameter: run_label
            run_name:
              runtimeValue:
                constant: vllm-{{$.inputs.parameters['pipelinechannel--run_label']}}
            tuning_params:
              runtimeValue:
                constant:
                  enable_chunked_prefill: '{{$.inputs.parameters[''pipelinechannel--enable_chunked_prefill'']}}'
                  enable_prefix_caching: '{{$.inputs.parameters[''pipelinechannel--enable_prefix_caching'']}}'
                  gpu_memory_utilization: '{{$.inputs.parameters[''pipelinechannel--gpu_memory_utilization'']}}'
                  llm_endpoint: '{{$.inputs.parameters[''pipelinechannel--llm_endpoint'']}}'
                  model_name: '{{$.inputs.parameters[''pipelinechannel--model_name'']}}'
                  ngram_prompt_lookup_max: '{{$.inputs.parameters[''pipelinechannel--ngram_prompt_lookup_max'']}}'
                  num_iterations: '{{$.inputs.parameters[''pipelinechannel--num_iterations'']}}'
                  num_speculative_tokens: '{{$.inputs.parameters[''pipelinechannel--num_speculative_tokens'']}}'
                  num_warmup: '{{$.inputs.parameters[''pipelinechannel--num_warmup'']}}'
        taskInfo:
          name: create-tuning-run
      # NOTE(review): also cached -- on a cache hit the benchmark metrics would
      # never be written to MLflow and the run would stay open; confirm.
      log-benchmark-results:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-log-benchmark-results
        dependentTasks:
        - create-tuning-run
        - run-benchmark
        inputs:
          parameters:
            metrics:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: run-benchmark
            run_id:
              taskOutputParameter:
                outputParameterKey: run_id
                producerTask: create-tuning-run
        taskInfo:
          name: log-benchmark-results
      # Caching explicitly disabled (empty cachingOptions): the benchmark must
      # actually hit the endpoint on every pipeline run.
      run-benchmark:
        cachingOptions: {}
        componentRef:
          name: comp-run-benchmark
        dependentTasks:
        - build-prompt-suite
        inputs:
          parameters:
            llm_endpoint:
              componentInputParameter: llm_endpoint
            model_name:
              componentInputParameter: model_name
            num_iterations:
              componentInputParameter: num_iterations
            num_warmup:
              componentInputParameter: num_warmup
            prompts:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: build-prompt-suite
        taskInfo:
          name: run-benchmark
  # Pipeline-level inputs. The vLLM knobs are STRING-typed with quoted
  # defaults ('true', '0.90', '4', '3'); presumably so boolean-/number-looking
  # values pass through to MLflow params verbatim -- keep them quoted.
  inputDefinitions:
    parameters:
      enable_chunked_prefill:
        defaultValue: 'true'
        description: '"true" or "false"'
        isOptional: true
        parameterType: STRING
      enable_prefix_caching:
        defaultValue: 'true'
        description: '"true" or "false"'
        isOptional: true
        parameterType: STRING
      gpu_memory_utilization:
        defaultValue: '0.90'
        description: 0.0 - 1.0
        isOptional: true
        parameterType: STRING
      llm_endpoint:
        defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
        description: vLLM inference endpoint URL
        isOptional: true
        parameterType: STRING
      model_name:
        defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
        description: HF model identifier
        isOptional: true
        parameterType: STRING
      ngram_prompt_lookup_max:
        defaultValue: '4'
        description: ngram window for spec decode (0 = off)
        isOptional: true
        parameterType: STRING
      num_iterations:
        defaultValue: 3.0
        description: how many times to repeat the prompt suite
        isOptional: true
        parameterType: NUMBER_INTEGER
      num_speculative_tokens:
        defaultValue: '3'
        description: number of speculative tokens (0 = off)
        isOptional: true
        parameterType: STRING
      num_warmup:
        defaultValue: 2.0
        description: warmup requests before timing
        isOptional: true
        parameterType: NUMBER_INTEGER
      run_label:
        defaultValue: baseline
        description: human-readable label (e.g. "apc-on-spec3")
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1

View File

@@ -12,6 +12,11 @@ Usage:
from kfp import dsl
from kfp import compiler
from typing import NamedTuple
MLFLOW_IMAGE = "python:3.13-slim"
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]
@dsl.component(
@@ -21,13 +26,16 @@ from kfp import compiler
def transcribe_audio(
audio_b64: str,
whisper_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper"
) -> str:
) -> NamedTuple("STTResult", [("text", str), ("latency_s", float), ("audio_duration_s", float)]):
"""Transcribe audio using Whisper STT service."""
import base64
import time
import httpx
from collections import namedtuple
audio_bytes = base64.b64decode(audio_b64)
start = time.perf_counter()
with httpx.Client(timeout=120.0) as client:
response = client.post(
f"{whisper_url}/v1/audio/transcriptions",
@@ -35,8 +43,14 @@ def transcribe_audio(
data={"model": "whisper-large-v3", "language": "en"}
)
result = response.json()
latency = time.perf_counter() - start
return result.get("text", "")
text = result.get("text", "")
# Estimate audio duration from WAV header (16-bit PCM, 16kHz)
audio_duration = max(len(audio_bytes) / (16000 * 2), 0.1)
STTResult = namedtuple("STTResult", ["text", "latency_s", "audio_duration_s"])
return STTResult(text, latency, audio_duration)
@dsl.component(
@@ -46,18 +60,23 @@ def transcribe_audio(
def generate_embeddings(
text: str,
embeddings_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings"
) -> list:
) -> NamedTuple("EmbedResult", [("embedding", list), ("latency_s", float)]):
"""Generate embeddings for RAG retrieval."""
import time
import httpx
from collections import namedtuple
start = time.perf_counter()
with httpx.Client(timeout=60.0) as client:
response = client.post(
f"{embeddings_url}/embeddings",
json={"input": text, "model": "bge-small-en-v1.5"}
)
result = response.json()
latency = time.perf_counter() - start
return result["data"][0]["embedding"]
EmbedResult = namedtuple("EmbedResult", ["embedding", "latency_s"])
return EmbedResult(result["data"][0]["embedding"], latency)
@dsl.component(
@@ -69,14 +88,19 @@ def retrieve_context(
milvus_host: str = "milvus.ai-ml.svc.cluster.local",
collection_name: str = "knowledge_base",
top_k: int = 5
) -> list:
) -> NamedTuple("RetrieveResult", [("documents", list), ("latency_s", float)]):
"""Retrieve relevant documents from Milvus vector database."""
import time
from pymilvus import connections, Collection, utility
from collections import namedtuple
start = time.perf_counter()
connections.connect(host=milvus_host, port=19530)
if not utility.has_collection(collection_name):
return []
latency = time.perf_counter() - start
RetrieveResult = namedtuple("RetrieveResult", ["documents", "latency_s"])
return RetrieveResult([], latency)
collection = Collection(collection_name)
collection.load()
@@ -88,6 +112,7 @@ def retrieve_context(
limit=top_k,
output_fields=["text", "source"]
)
latency = time.perf_counter() - start
documents = []
for hits in results:
@@ -98,7 +123,8 @@ def retrieve_context(
"score": hit.distance
})
return documents
RetrieveResult = namedtuple("RetrieveResult", ["documents", "latency_s"])
return RetrieveResult(documents, latency)
@dsl.component(
@@ -110,13 +136,17 @@ def rerank_documents(
documents: list,
reranker_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker",
top_k: int = 3
) -> list:
) -> NamedTuple("RerankResult", [("documents", list), ("latency_s", float)]):
"""Rerank documents using BGE reranker."""
import time
import httpx
from collections import namedtuple
if not documents:
return []
RerankResult = namedtuple("RerankResult", ["documents", "latency_s"])
return RerankResult([], 0.0)
start = time.perf_counter()
with httpx.Client(timeout=60.0) as client:
response = client.post(
f"{reranker_url}/v1/rerank",
@@ -127,6 +157,7 @@ def rerank_documents(
}
)
result = response.json()
latency = time.perf_counter() - start
# Sort by rerank score
reranked = sorted(
@@ -135,7 +166,8 @@ def rerank_documents(
reverse=True
)[:top_k]
return [doc for doc, score in reranked]
RerankResult = namedtuple("RerankResult", ["documents", "latency_s"])
return RerankResult([doc for doc, score in reranked], latency)
@dsl.component(
@@ -147,9 +179,11 @@ def generate_response(
context: list,
vllm_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm",
model: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
) -> str:
) -> NamedTuple("LLMResult", [("text", str), ("latency_s", float), ("completion_tokens", int)]):
"""Generate response using vLLM."""
import time
import httpx
from collections import namedtuple
# Build context
if context:
@@ -167,6 +201,7 @@ Keep responses concise and natural for speech synthesis."""
{"role": "user", "content": user_content}
]
start = time.perf_counter()
with httpx.Client(timeout=180.0) as client:
response = client.post(
f"{vllm_url}/v1/chat/completions",
@@ -178,8 +213,14 @@ Keep responses concise and natural for speech synthesis."""
}
)
result = response.json()
latency = time.perf_counter() - start
return result["choices"][0]["message"]["content"]
text = result["choices"][0]["message"]["content"]
usage = result.get("usage", {})
completion_tokens = usage.get("completion_tokens", len(text.split()))
LLMResult = namedtuple("LLMResult", ["text", "latency_s", "completion_tokens"])
return LLMResult(text, latency, completion_tokens)
@dsl.component(
@@ -189,11 +230,14 @@ Keep responses concise and natural for speech synthesis."""
def synthesize_speech(
text: str,
tts_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts"
) -> str:
) -> NamedTuple("TTSResult", [("audio_b64", str), ("latency_s", float)]):
"""Convert text to speech using TTS service."""
import base64
import time
import httpx
from collections import namedtuple
start = time.perf_counter()
with httpx.Client(timeout=120.0) as client:
response = client.post(
f"{tts_url}/v1/audio/speech",
@@ -204,13 +248,86 @@ def synthesize_speech(
}
)
audio_b64 = base64.b64encode(response.content).decode("utf-8")
latency = time.perf_counter() - start
return audio_b64
TTSResult = namedtuple("TTSResult", ["audio_b64", "latency_s"])
return TTSResult(audio_b64, latency)
# ---- MLflow logging component ----
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_pipeline_metrics(
stt_latency: float,
stt_audio_duration: float,
embed_latency: float,
retrieve_latency: float,
rerank_latency: float,
llm_latency: float,
llm_completion_tokens: int,
tts_latency: float,
experiment_name: str = "voice-pipeline-metrics",
run_name: str = "voice-pipeline",
mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
"""Log per-step latency metrics to MLflow for the full voice pipeline."""
import os
import mlflow
from mlflow.tracking import MlflowClient
mlflow.set_tracking_uri(mlflow_tracking_uri)
client = MlflowClient()
exp = client.get_experiment_by_name(experiment_name)
experiment_id = (
exp.experiment_id
if exp
else client.create_experiment(
name=experiment_name,
artifact_location=f"/mlflow/artifacts/{experiment_name}",
)
)
run = mlflow.start_run(
experiment_id=experiment_id,
run_name=run_name,
tags={
"pipeline.type": "voice-assistant",
"kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
},
)
total_latency = (
stt_latency + embed_latency + retrieve_latency
+ rerank_latency + llm_latency + tts_latency
)
stt_rtf = stt_latency / stt_audio_duration if stt_audio_duration > 0 else 0
llm_tps = llm_completion_tokens / llm_latency if llm_latency > 0 else 0
mlflow.log_metrics({
"stt_latency_s": stt_latency,
"stt_audio_duration_s": stt_audio_duration,
"stt_realtime_factor": stt_rtf,
"embed_latency_s": embed_latency,
"retrieve_latency_s": retrieve_latency,
"rerank_latency_s": rerank_latency,
"llm_latency_s": llm_latency,
"llm_completion_tokens": llm_completion_tokens,
"llm_tokens_per_second": llm_tps,
"tts_latency_s": tts_latency,
"total_pipeline_latency_s": total_latency,
})
mlflow.end_run()
return run.info.run_id
# ---- Pipelines ----
@dsl.pipeline(
name="voice-assistant-rag-pipeline",
description="End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS"
description="End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow."
)
def voice_assistant_pipeline(
audio_b64: str,
@@ -229,29 +346,41 @@ def voice_assistant_pipeline(
transcribe_task.set_caching_options(enable_caching=False)
# Step 2: Generate embeddings
embed_task = generate_embeddings(text=transcribe_task.output)
embed_task = generate_embeddings(text=transcribe_task.outputs["text"])
embed_task.set_caching_options(enable_caching=True)
# Step 3: Retrieve context from Milvus
retrieve_task = retrieve_context(
embedding=embed_task.output,
embedding=embed_task.outputs["embedding"],
collection_name=collection_name
)
# Step 4: Rerank documents
rerank_task = rerank_documents(
query=transcribe_task.output,
documents=retrieve_task.output
query=transcribe_task.outputs["text"],
documents=retrieve_task.outputs["documents"]
)
# Step 5: Generate response with context
llm_task = generate_response(
query=transcribe_task.output,
context=rerank_task.output
query=transcribe_task.outputs["text"],
context=rerank_task.outputs["documents"]
)
# Step 6: Synthesize speech
tts_task = synthesize_speech(text=llm_task.output)
tts_task = synthesize_speech(text=llm_task.outputs["text"])
# Step 7: Log all per-step latencies to MLflow
log_task = log_pipeline_metrics(
stt_latency=transcribe_task.outputs["latency_s"],
stt_audio_duration=transcribe_task.outputs["audio_duration_s"],
embed_latency=embed_task.outputs["latency_s"],
retrieve_latency=retrieve_task.outputs["latency_s"],
rerank_latency=rerank_task.outputs["latency_s"],
llm_latency=llm_task.outputs["latency_s"],
llm_completion_tokens=llm_task.outputs["completion_tokens"],
tts_latency=tts_task.outputs["latency_s"],
)
@dsl.pipeline(
@@ -265,7 +394,7 @@ def text_to_speech_pipeline(text: str):
@dsl.pipeline(
name="rag-query-pipeline",
description="RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM"
description="RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM. Logs per-step latency to MLflow."
)
def rag_query_pipeline(
query: str,
@@ -283,20 +412,20 @@ def rag_query_pipeline(
# Retrieve from Milvus
retrieve_task = retrieve_context(
embedding=embed_task.output,
embedding=embed_task.outputs["embedding"],
collection_name=collection_name
)
# Rerank
rerank_task = rerank_documents(
query=query,
documents=retrieve_task.output
documents=retrieve_task.outputs["documents"]
)
# Generate response
llm_task = generate_response(
query=query,
context=rerank_task.output
context=rerank_task.outputs["documents"]
)

656
voice_pipeline.yaml Normal file
View File

@@ -0,0 +1,656 @@
# PIPELINE DEFINITION
# Name: voice-assistant-rag-pipeline
# Description: End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow.
# Inputs:
# audio_b64: str
# collection_name: str [Default: 'knowledge_base']
# Component I/O declarations (KFP v2 IR). Each comp-* entry is bound to the
# matching exec-* executor in deploymentSpec via executorLabel. Every worker
# component also emits a latency_s output consumed by comp-log-pipeline-metrics.
components:
# text -> embedding vector (LIST) + call latency.
comp-generate-embeddings:
executorLabel: exec-generate-embeddings
inputDefinitions:
parameters:
embeddings_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
isOptional: true
parameterType: STRING
text:
parameterType: STRING
outputDefinitions:
parameters:
embedding:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
# (query, reranked context docs) -> LLM answer text, latency, completion-token count.
comp-generate-response:
executorLabel: exec-generate-response
inputDefinitions:
parameters:
context:
parameterType: LIST
model:
defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
isOptional: true
parameterType: STRING
query:
parameterType: STRING
vllm_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
completion_tokens:
parameterType: NUMBER_INTEGER
latency_s:
parameterType: NUMBER_DOUBLE
text:
parameterType: STRING
# Fan-in metrics sink: takes every step's latency (plus STT audio duration and
# LLM token count) and returns the created MLflow run id as "Output".
comp-log-pipeline-metrics:
executorLabel: exec-log-pipeline-metrics
inputDefinitions:
parameters:
embed_latency:
parameterType: NUMBER_DOUBLE
experiment_name:
defaultValue: voice-pipeline-metrics
isOptional: true
parameterType: STRING
llm_completion_tokens:
parameterType: NUMBER_INTEGER
llm_latency:
parameterType: NUMBER_DOUBLE
mlflow_tracking_uri:
defaultValue: http://mlflow.mlflow.svc.cluster.local:80
isOptional: true
parameterType: STRING
rerank_latency:
parameterType: NUMBER_DOUBLE
retrieve_latency:
parameterType: NUMBER_DOUBLE
run_name:
defaultValue: voice-pipeline
isOptional: true
parameterType: STRING
stt_audio_duration:
parameterType: NUMBER_DOUBLE
stt_latency:
parameterType: NUMBER_DOUBLE
tts_latency:
parameterType: NUMBER_DOUBLE
outputDefinitions:
parameters:
Output:
parameterType: STRING
# (query, candidate docs) -> top_k reranked docs + latency.
comp-rerank-documents:
executorLabel: exec-rerank-documents
inputDefinitions:
parameters:
documents:
parameterType: LIST
query:
parameterType: STRING
reranker_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
isOptional: true
parameterType: STRING
# Integer default rendered as 3.0 — KFP IR serializes int defaults via
# protobuf doubles; parameterType stays NUMBER_INTEGER. Not a bug.
top_k:
defaultValue: 3.0
isOptional: true
parameterType: NUMBER_INTEGER
outputDefinitions:
parameters:
documents:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
# embedding -> top_k nearest docs from the Milvus collection + latency.
comp-retrieve-context:
executorLabel: exec-retrieve-context
inputDefinitions:
parameters:
collection_name:
defaultValue: knowledge_base
isOptional: true
parameterType: STRING
embedding:
parameterType: LIST
milvus_host:
defaultValue: milvus.ai-ml.svc.cluster.local
isOptional: true
parameterType: STRING
top_k:
defaultValue: 5.0
isOptional: true
parameterType: NUMBER_INTEGER
outputDefinitions:
parameters:
documents:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
# text -> base64-encoded WAV audio + latency.
comp-synthesize-speech:
executorLabel: exec-synthesize-speech
inputDefinitions:
parameters:
text:
parameterType: STRING
tts_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
audio_b64:
parameterType: STRING
latency_s:
parameterType: NUMBER_DOUBLE
# base64 audio -> transcript text, latency, estimated source-audio duration.
comp-transcribe-audio:
executorLabel: exec-transcribe-audio
inputDefinitions:
parameters:
audio_b64:
parameterType: STRING
whisper_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
audio_duration_s:
parameterType: NUMBER_DOUBLE
latency_s:
parameterType: NUMBER_DOUBLE
text:
parameterType: STRING
# Executor definitions. Each runs python:3.13-slim, pip-installs kfp + its
# runtime deps at container start, then executes the embedded component
# function (the final double-quoted scalar holds its Python source verbatim
# — do not hand-edit these strings; regenerate by recompiling voice_pipeline.py).
deploymentSpec:
executors:
# Embeds the transcript via httpx POST, model bge-small-en-v1.5.
# NOTE(review): default embeddings_url already ends in /embeddings and the code
# posts to f"{embeddings_url}/embeddings", i.e. .../embeddings/embeddings —
# every other executor appends /v1/<op> to its base URL. Confirm the gateway
# route actually serves this doubled path.
exec-generate-embeddings:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_embeddings
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_embeddings(\n text: str,\n embeddings_url: str\
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\
\n) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\"\
, float)]):\n \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n \
\ import time\n import httpx\n from collections import namedtuple\n\
\n start = time.perf_counter()\n with httpx.Client(timeout=60.0) as\
\ client:\n response = client.post(\n f\"{embeddings_url}/embeddings\"\
,\n json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
\ )\n result = response.json()\n latency = time.perf_counter()\
\ - start\n\n EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\"\
, \"latency_s\"])\n return EmbedResult(result[\"data\"][0][\"embedding\"\
], latency)\n\n"
image: python:3.13-slim
# Chat completion against vLLM: prepends retrieved docs as "Context:" in the
# user message when present, fixed system prompt tuned for speech output,
# max_tokens=512, temperature=0.7, 180 s timeout. Falls back to a whitespace
# token count if the server omits usage.completion_tokens.
exec-generate-response:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_response
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_response(\n query: str,\n context: list,\n \
\ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
,\n model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
\n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
\ (\"completion_tokens\", int)]):\n \"\"\"Generate response using vLLM.\"\
\"\"\n import time\n import httpx\n from collections import namedtuple\n\
\n # Build context\n if context:\n context_text = \"\\n\\n\"\
.join([doc[\"text\"] for doc in context])\n user_content = f\"Context:\\\
n{context_text}\\n\\nQuestion: {query}\"\n else:\n user_content\
\ = query\n\n system_prompt = \"\"\"You are a helpful voice assistant.\n\
Answer questions based on the provided context when available.\nKeep responses\
\ concise and natural for speech synthesis.\"\"\"\n\n messages = [\n\
\ {\"role\": \"system\", \"content\": system_prompt},\n {\"\
role\": \"user\", \"content\": user_content}\n ]\n\n start = time.perf_counter()\n\
\ with httpx.Client(timeout=180.0) as client:\n response = client.post(\n\
\ f\"{vllm_url}/v1/chat/completions\",\n json={\n\
\ \"model\": model,\n \"messages\": messages,\n\
\ \"max_tokens\": 512,\n \"temperature\":\
\ 0.7\n }\n )\n result = response.json()\n latency\
\ = time.perf_counter() - start\n\n text = result[\"choices\"][0][\"\
message\"][\"content\"]\n usage = result.get(\"usage\", {})\n completion_tokens\
\ = usage.get(\"completion_tokens\", len(text.split()))\n\n LLMResult\
\ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
])\n return LLMResult(text, latency, completion_tokens)\n\n"
image: python:3.13-slim
# Gets-or-creates the MLflow experiment, starts a run tagged with the KFP run
# id (from env KFP_RUN_ID, "unknown" if unset), logs all per-step latencies
# plus derived stt_realtime_factor and llm_tokens_per_second (guarded against
# division by zero), then returns the run id. Installs mlflow/boto3/
# psycopg2-binary at container start, so this step's own startup is slow.
exec-log-pipeline-metrics:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- log_pipeline_metrics
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef log_pipeline_metrics(\n stt_latency: float,\n stt_audio_duration:\
\ float,\n embed_latency: float,\n retrieve_latency: float,\n rerank_latency:\
\ float,\n llm_latency: float,\n llm_completion_tokens: int,\n \
\ tts_latency: float,\n experiment_name: str = \"voice-pipeline-metrics\"\
,\n run_name: str = \"voice-pipeline\",\n mlflow_tracking_uri: str\
\ = \"http://mlflow.mlflow.svc.cluster.local:80\",\n) -> str:\n \"\"\"\
Log per-step latency metrics to MLflow for the full voice pipeline.\"\"\"\
\n import os\n import mlflow\n from mlflow.tracking import MlflowClient\n\
\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
\n exp = client.get_experiment_by_name(experiment_name)\n experiment_id\
\ = (\n exp.experiment_id\n if exp\n else client.create_experiment(\n\
\ name=experiment_name,\n artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
,\n )\n )\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\n\
\ run_name=run_name,\n tags={\n \"pipeline.type\"\
: \"voice-assistant\",\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\"\
, \"unknown\"),\n },\n )\n\n total_latency = (\n stt_latency\
\ + embed_latency + retrieve_latency\n + rerank_latency + llm_latency\
\ + tts_latency\n )\n stt_rtf = stt_latency / stt_audio_duration if\
\ stt_audio_duration > 0 else 0\n llm_tps = llm_completion_tokens / llm_latency\
\ if llm_latency > 0 else 0\n\n mlflow.log_metrics({\n \"stt_latency_s\"\
: stt_latency,\n \"stt_audio_duration_s\": stt_audio_duration,\n\
\ \"stt_realtime_factor\": stt_rtf,\n \"embed_latency_s\"\
: embed_latency,\n \"retrieve_latency_s\": retrieve_latency,\n \
\ \"rerank_latency_s\": rerank_latency,\n \"llm_latency_s\"\
: llm_latency,\n \"llm_completion_tokens\": llm_completion_tokens,\n\
\ \"llm_tokens_per_second\": llm_tps,\n \"tts_latency_s\"\
: tts_latency,\n \"total_pipeline_latency_s\": total_latency,\n \
\ })\n mlflow.end_run()\n return run.info.run_id\n\n"
image: python:3.13-slim
# Reranks retrieved docs via POST {reranker_url}/v1/rerank (bge-reranker-v2-m3),
# sorts by score descending and truncates to top_k. Short-circuits an empty
# document list with latency 0.0 (no HTTP call). If the response omits
# "scores", all scores default to 0 and the stable sort keeps original order.
exec-rerank-documents:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- rerank_documents
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef rerank_documents(\n query: str,\n documents: list,\n \
\ reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
,\n top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Rerank documents using BGE\
\ reranker.\"\"\"\n import time\n import httpx\n from collections\
\ import namedtuple\n\n if not documents:\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([],\
\ 0.0)\n\n start = time.perf_counter()\n with httpx.Client(timeout=60.0)\
\ as client:\n response = client.post(\n f\"{reranker_url}/v1/rerank\"\
,\n json={\n \"query\": query,\n \
\ \"documents\": [doc[\"text\"] for doc in documents],\n \
\ \"model\": \"bge-reranker-v2-m3\"\n }\n )\n \
\ result = response.json()\n latency = time.perf_counter() - start\n\
\n # Sort by rerank score\n reranked = sorted(\n zip(documents,\
\ result.get(\"scores\", [0] * len(documents))),\n key=lambda x:\
\ x[1],\n reverse=True\n )[:top_k]\n\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([doc\
\ for doc, score in reranked], latency)\n\n"
image: python:3.13-slim
# Vector search in Milvus (port 19530 hard-coded in the embedded code):
# COSINE metric, nprobe=10, returns up to top_k docs as
# {text, source, score}. Returns [] (with connect latency) when the
# collection does not exist rather than failing the pipeline.
exec-retrieve-context:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- retrieve_context
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
\ \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef retrieve_context(\n embedding: list,\n milvus_host: str\
\ = \"milvus.ai-ml.svc.cluster.local\",\n collection_name: str = \"knowledge_base\"\
,\n top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Retrieve relevant documents\
\ from Milvus vector database.\"\"\"\n import time\n from pymilvus\
\ import connections, Collection, utility\n from collections import namedtuple\n\
\n start = time.perf_counter()\n connections.connect(host=milvus_host,\
\ port=19530)\n\n if not utility.has_collection(collection_name):\n \
\ latency = time.perf_counter() - start\n RetrieveResult =\
\ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult([], latency)\n\n collection = Collection(collection_name)\n\
\ collection.load()\n\n results = collection.search(\n data=[embedding],\n\
\ anns_field=\"embedding\",\n param={\"metric_type\": \"COSINE\"\
, \"params\": {\"nprobe\": 10}},\n limit=top_k,\n output_fields=[\"\
text\", \"source\"]\n )\n latency = time.perf_counter() - start\n\n\
\ documents = []\n for hits in results:\n for hit in hits:\n\
\ documents.append({\n \"text\": hit.entity.get(\"\
text\"),\n \"source\": hit.entity.get(\"source\"),\n \
\ \"score\": hit.distance\n })\n\n RetrieveResult\
\ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult(documents, latency)\n\n"
image: python:3.13-slim
# TTS via POST {tts_url}/v1/audio/speech (voice en_US-lessac-high, wav);
# base64-encodes the raw response body. Note the latency_s it reports is
# measured after the base64 encode, so it includes encoding time.
exec-synthesize-speech:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- synthesize_speech
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef synthesize_speech(\n text: str,\n tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
\n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
\ float)]):\n \"\"\"Convert text to speech using TTS service.\"\"\"\n\
\ import base64\n import time\n import httpx\n from collections\
\ import namedtuple\n\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
\ as client:\n response = client.post(\n f\"{tts_url}/v1/audio/speech\"\
,\n json={\n \"input\": text,\n \
\ \"voice\": \"en_US-lessac-high\",\n \"response_format\"\
: \"wav\"\n }\n )\n audio_b64 = base64.b64encode(response.content).decode(\"\
utf-8\")\n latency = time.perf_counter() - start\n\n TTSResult = namedtuple(\"\
TTSResult\", [\"audio_b64\", \"latency_s\"])\n return TTSResult(audio_b64,\
\ latency)\n\n"
image: python:3.13-slim
# Whisper STT via multipart POST to /v1/audio/transcriptions
# (model whisper-large-v3, language en). audio_duration_s is only an
# estimate: total byte length / (16000 Hz * 2 bytes), floored at 0.1 s —
# it assumes 16 kHz 16-bit mono and counts the WAV header bytes too, so the
# derived stt_realtime_factor is approximate.
exec-transcribe-audio:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- transcribe_audio
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef transcribe_audio(\n audio_b64: str,\n whisper_url: str\
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper\"\
\n) -> NamedTuple(\"STTResult\", [(\"text\", str), (\"latency_s\", float),\
\ (\"audio_duration_s\", float)]):\n \"\"\"Transcribe audio using Whisper\
\ STT service.\"\"\"\n import base64\n import time\n import httpx\n\
\ from collections import namedtuple\n\n audio_bytes = base64.b64decode(audio_b64)\n\
\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
\ as client:\n response = client.post(\n f\"{whisper_url}/v1/audio/transcriptions\"\
,\n files={\"file\": (\"audio.wav\", audio_bytes, \"audio/wav\"\
)},\n data={\"model\": \"whisper-large-v3\", \"language\": \"\
en\"}\n )\n result = response.json()\n latency = time.perf_counter()\
\ - start\n\n text = result.get(\"text\", \"\")\n # Estimate audio\
\ duration from WAV header (16-bit PCM, 16kHz)\n audio_duration = max(len(audio_bytes)\
\ / (16000 * 2), 0.1)\n\n STTResult = namedtuple(\"STTResult\", [\"text\"\
, \"latency_s\", \"audio_duration_s\"])\n return STTResult(text, latency,\
\ audio_duration)\n\n"
image: python:3.13-slim
# Pipeline identity shown in the KFP UI; name must stay stable for runs to
# group under the same pipeline.
pipelineInfo:
description: 'End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus ->
Rerank -> LLM -> TTS. Logs per-step latency to MLflow.'
name: voice-assistant-rag-pipeline
# DAG wiring: transcribe-audio -> generate-embeddings -> retrieve-context ->
# rerank-documents -> generate-response -> synthesize-speech, with
# log-pipeline-metrics fanning in every step's latency output.
# NOTE(review): all worker tasks except transcribe-audio have enableCache: true;
# a cache hit replays the stored latency_s instead of measuring a fresh call —
# confirm that is acceptable for the MLflow latency metrics.
root:
dag:
tasks:
# Embed the STT transcript (input: transcribe-audio.text).
generate-embeddings:
cachingOptions:
enableCache: true
componentRef:
name: comp-generate-embeddings
dependentTasks:
- transcribe-audio
inputs:
parameters:
text:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: generate-embeddings
# LLM answer from the transcript plus reranked context docs.
generate-response:
cachingOptions:
enableCache: true
componentRef:
name: comp-generate-response
dependentTasks:
- rerank-documents
- transcribe-audio
inputs:
parameters:
context:
taskOutputParameter:
outputParameterKey: documents
producerTask: rerank-documents
query:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: generate-response
# Fan-in sink: collects every step's latency output and writes one MLflow run.
# FIX: caching was enabled here (cachingOptions: enableCache: true). Writing to
# MLflow is a side effect — a KFP cache hit skips execution entirely, so
# repeated runs would log nothing. Caching is now disabled (empty
# cachingOptions, same form the transcribe-audio task already uses).
# Keep in sync with voice_pipeline.py: add
# log_task.set_caching_options(False) there before the next recompile so this
# artifact is not regenerated with caching re-enabled.
log-pipeline-metrics:
cachingOptions: {}
componentRef:
name: comp-log-pipeline-metrics
dependentTasks:
- generate-embeddings
- generate-response
- rerank-documents
- retrieve-context
- synthesize-speech
- transcribe-audio
inputs:
parameters:
embed_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: generate-embeddings
llm_completion_tokens:
taskOutputParameter:
outputParameterKey: completion_tokens
producerTask: generate-response
llm_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: generate-response
rerank_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: rerank-documents
retrieve_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: retrieve-context
stt_audio_duration:
taskOutputParameter:
outputParameterKey: audio_duration_s
producerTask: transcribe-audio
stt_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: transcribe-audio
tts_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: synthesize-speech
taskInfo:
name: log-pipeline-metrics
# Rerank retrieved docs against the transcript query.
rerank-documents:
cachingOptions:
enableCache: true
componentRef:
name: comp-rerank-documents
dependentTasks:
- retrieve-context
- transcribe-audio
inputs:
parameters:
documents:
taskOutputParameter:
outputParameterKey: documents
producerTask: retrieve-context
query:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: rerank-documents
# Milvus vector search on the transcript embedding; collection_name comes
# from the pipeline input.
retrieve-context:
cachingOptions:
enableCache: true
componentRef:
name: comp-retrieve-context
dependentTasks:
- generate-embeddings
inputs:
parameters:
collection_name:
componentInputParameter: collection_name
embedding:
taskOutputParameter:
outputParameterKey: embedding
producerTask: generate-embeddings
taskInfo:
name: retrieve-context
# TTS of the LLM answer text.
synthesize-speech:
cachingOptions:
enableCache: true
componentRef:
name: comp-synthesize-speech
dependentTasks:
- generate-response
inputs:
parameters:
text:
taskOutputParameter:
outputParameterKey: text
producerTask: generate-response
taskInfo:
name: synthesize-speech
# Pipeline entry task. Caching deliberately disabled (empty cachingOptions):
# transcription must run on every submission.
transcribe-audio:
cachingOptions: {}
componentRef:
name: comp-transcribe-audio
inputs:
parameters:
audio_b64:
componentInputParameter: audio_b64
taskInfo:
name: transcribe-audio
# Pipeline-level inputs: only the audio payload is required.
inputDefinitions:
parameters:
audio_b64:
description: Base64-encoded audio file
parameterType: STRING
collection_name:
defaultValue: knowledge_base
description: Milvus collection for RAG
isOptional: true
parameterType: STRING
# KFP v2 IR schema and the SDK that produced this file; regenerate with the
# same kfp version to avoid spurious diffs.
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1