feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow

New: - vllm_tuning_pipeline.py: A/B benchmark different vLLM configs, logs latency/TPS/TTFT to MLflow (vllm-tuning experiment) - vllm_tuning_pipeline.yaml: compiled KFP YAML Updated: - voice_pipeline.py: per-step NamedTuple outputs with latency tracking, new log_pipeline_metrics MLflow component - voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
2026-02-13 08:24:11 -05:00
parent cee21f124c
commit bc4b230dd9
6 changed files with 2216 additions and 26 deletions
--- a/voice_pipeline.yaml
+++ b/voice_pipeline.yaml
@@ -0,0 +1,656 @@
+# PIPELINE DEFINITION
+# Name: voice-assistant-rag-pipeline
+# Description: End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS.  Logs per-step latency to MLflow.
+# Inputs:
+#    audio_b64: str
+#    collection_name: str [Default: 'knowledge_base']
+components:
+  comp-generate-embeddings:
+    executorLabel: exec-generate-embeddings
+    inputDefinitions:
+      parameters:
+        embeddings_url:
+          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
+          isOptional: true
+          parameterType: STRING
+        text:
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        embedding:
+          parameterType: LIST
+        latency_s:
+          parameterType: NUMBER_DOUBLE
+  comp-generate-response:
+    executorLabel: exec-generate-response
+    inputDefinitions:
+      parameters:
+        context:
+          parameterType: LIST
+        model:
+          defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
+          isOptional: true
+          parameterType: STRING
+        query:
+          parameterType: STRING
+        vllm_url:
+          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
+          isOptional: true
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        completion_tokens:
+          parameterType: NUMBER_INTEGER
+        latency_s:
+          parameterType: NUMBER_DOUBLE
+        text:
+          parameterType: STRING
+  comp-log-pipeline-metrics:
+    executorLabel: exec-log-pipeline-metrics
+    inputDefinitions:
+      parameters:
+        embed_latency:
+          parameterType: NUMBER_DOUBLE
+        experiment_name:
+          defaultValue: voice-pipeline-metrics
+          isOptional: true
+          parameterType: STRING
+        llm_completion_tokens:
+          parameterType: NUMBER_INTEGER
+        llm_latency:
+          parameterType: NUMBER_DOUBLE
+        mlflow_tracking_uri:
+          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
+          isOptional: true
+          parameterType: STRING
+        rerank_latency:
+          parameterType: NUMBER_DOUBLE
+        retrieve_latency:
+          parameterType: NUMBER_DOUBLE
+        run_name:
+          defaultValue: voice-pipeline
+          isOptional: true
+          parameterType: STRING
+        stt_audio_duration:
+          parameterType: NUMBER_DOUBLE
+        stt_latency:
+          parameterType: NUMBER_DOUBLE
+        tts_latency:
+          parameterType: NUMBER_DOUBLE
+    outputDefinitions:
+      parameters:
+        Output:
+          parameterType: STRING
+  comp-rerank-documents:
+    executorLabel: exec-rerank-documents
+    inputDefinitions:
+      parameters:
+        documents:
+          parameterType: LIST
+        query:
+          parameterType: STRING
+        reranker_url:
+          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
+          isOptional: true
+          parameterType: STRING
+        top_k:
+          defaultValue: 3.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+    outputDefinitions:
+      parameters:
+        documents:
+          parameterType: LIST
+        latency_s:
+          parameterType: NUMBER_DOUBLE
+  comp-retrieve-context:
+    executorLabel: exec-retrieve-context
+    inputDefinitions:
+      parameters:
+        collection_name:
+          defaultValue: knowledge_base
+          isOptional: true
+          parameterType: STRING
+        embedding:
+          parameterType: LIST
+        milvus_host:
+          defaultValue: milvus.ai-ml.svc.cluster.local
+          isOptional: true
+          parameterType: STRING
+        top_k:
+          defaultValue: 5.0
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+    outputDefinitions:
+      parameters:
+        documents:
+          parameterType: LIST
+        latency_s:
+          parameterType: NUMBER_DOUBLE
+  comp-synthesize-speech:
+    executorLabel: exec-synthesize-speech
+    inputDefinitions:
+      parameters:
+        text:
+          parameterType: STRING
+        tts_url:
+          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
+          isOptional: true
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        audio_b64:
+          parameterType: STRING
+        latency_s:
+          parameterType: NUMBER_DOUBLE
+  comp-transcribe-audio:
+    executorLabel: exec-transcribe-audio
+    inputDefinitions:
+      parameters:
+        audio_b64:
+          parameterType: STRING
+        whisper_url:
+          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper
+          isOptional: true
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        audio_duration_s:
+          parameterType: NUMBER_DOUBLE
+        latency_s:
+          parameterType: NUMBER_DOUBLE
+        text:
+          parameterType: STRING
+deploymentSpec:
+  executors:
+    exec-generate-embeddings:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - generate_embeddings
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
+          $0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef generate_embeddings(\n    text: str,\n    embeddings_url: str\
+          \ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\
+          \n) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\"\
+          , float)]):\n    \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n  \
+          \  import time\n    import httpx\n    from collections import namedtuple\n\
+          \n    start = time.perf_counter()\n    with httpx.Client(timeout=60.0) as\
+          \ client:\n        response = client.post(\n            f\"{embeddings_url}/embeddings\"\
+          ,\n            json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
+          \        )\n        result = response.json()\n    latency = time.perf_counter()\
+          \ - start\n\n    EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\"\
+          , \"latency_s\"])\n    return EmbedResult(result[\"data\"][0][\"embedding\"\
+          ], latency)\n\n"
+        image: python:3.13-slim
+    exec-generate-response:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - generate_response
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
+          $0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef generate_response(\n    query: str,\n    context: list,\n   \
+          \ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
+          ,\n    model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
+          \n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
+          \ (\"completion_tokens\", int)]):\n    \"\"\"Generate response using vLLM.\"\
+          \"\"\n    import time\n    import httpx\n    from collections import namedtuple\n\
+          \n    # Build context\n    if context:\n        context_text = \"\\n\\n\"\
+          .join([doc[\"text\"] for doc in context])\n        user_content = f\"Context:\\\
+          n{context_text}\\n\\nQuestion: {query}\"\n    else:\n        user_content\
+          \ = query\n\n    system_prompt = \"\"\"You are a helpful voice assistant.\n\
+          Answer questions based on the provided context when available.\nKeep responses\
+          \ concise and natural for speech synthesis.\"\"\"\n\n    messages = [\n\
+          \        {\"role\": \"system\", \"content\": system_prompt},\n        {\"\
+          role\": \"user\", \"content\": user_content}\n    ]\n\n    start = time.perf_counter()\n\
+          \    with httpx.Client(timeout=180.0) as client:\n        response = client.post(\n\
+          \            f\"{vllm_url}/v1/chat/completions\",\n            json={\n\
+          \                \"model\": model,\n                \"messages\": messages,\n\
+          \                \"max_tokens\": 512,\n                \"temperature\":\
+          \ 0.7\n            }\n        )\n        result = response.json()\n    latency\
+          \ = time.perf_counter() - start\n\n    text = result[\"choices\"][0][\"\
+          message\"][\"content\"]\n    usage = result.get(\"usage\", {})\n    completion_tokens\
+          \ = usage.get(\"completion_tokens\", len(text.split()))\n\n    LLMResult\
+          \ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
+          ])\n    return LLMResult(text, latency, completion_tokens)\n\n"
+        image: python:3.13-slim
+    exec-log-pipeline-metrics:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - log_pipeline_metrics
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
+          \ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef log_pipeline_metrics(\n    stt_latency: float,\n    stt_audio_duration:\
+          \ float,\n    embed_latency: float,\n    retrieve_latency: float,\n    rerank_latency:\
+          \ float,\n    llm_latency: float,\n    llm_completion_tokens: int,\n   \
+          \ tts_latency: float,\n    experiment_name: str = \"voice-pipeline-metrics\"\
+          ,\n    run_name: str = \"voice-pipeline\",\n    mlflow_tracking_uri: str\
+          \ = \"http://mlflow.mlflow.svc.cluster.local:80\",\n) -> str:\n    \"\"\"\
+          Log per-step latency metrics to MLflow for the full voice pipeline.\"\"\"\
+          \n    import os\n    import mlflow\n    from mlflow.tracking import MlflowClient\n\
+          \n    mlflow.set_tracking_uri(mlflow_tracking_uri)\n    client = MlflowClient()\n\
+          \n    exp = client.get_experiment_by_name(experiment_name)\n    experiment_id\
+          \ = (\n        exp.experiment_id\n        if exp\n        else client.create_experiment(\n\
+          \            name=experiment_name,\n            artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
+          ,\n        )\n    )\n\n    run = mlflow.start_run(\n        experiment_id=experiment_id,\n\
+          \        run_name=run_name,\n        tags={\n            \"pipeline.type\"\
+          : \"voice-assistant\",\n            \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\"\
+          , \"unknown\"),\n        },\n    )\n\n    total_latency = (\n        stt_latency\
+          \ + embed_latency + retrieve_latency\n        + rerank_latency + llm_latency\
+          \ + tts_latency\n    )\n    stt_rtf = stt_latency / stt_audio_duration if\
+          \ stt_audio_duration > 0 else 0\n    llm_tps = llm_completion_tokens / llm_latency\
+          \ if llm_latency > 0 else 0\n\n    mlflow.log_metrics({\n        \"stt_latency_s\"\
+          : stt_latency,\n        \"stt_audio_duration_s\": stt_audio_duration,\n\
+          \        \"stt_realtime_factor\": stt_rtf,\n        \"embed_latency_s\"\
+          : embed_latency,\n        \"retrieve_latency_s\": retrieve_latency,\n  \
+          \      \"rerank_latency_s\": rerank_latency,\n        \"llm_latency_s\"\
+          : llm_latency,\n        \"llm_completion_tokens\": llm_completion_tokens,\n\
+          \        \"llm_tokens_per_second\": llm_tps,\n        \"tts_latency_s\"\
+          : tts_latency,\n        \"total_pipeline_latency_s\": total_latency,\n \
+          \   })\n    mlflow.end_run()\n    return run.info.run_id\n\n"
+        image: python:3.13-slim
+    exec-rerank-documents:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - rerank_documents
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
+          $0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef rerank_documents(\n    query: str,\n    documents: list,\n  \
+          \  reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
+          ,\n    top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
+          , list), (\"latency_s\", float)]):\n    \"\"\"Rerank documents using BGE\
+          \ reranker.\"\"\"\n    import time\n    import httpx\n    from collections\
+          \ import namedtuple\n\n    if not documents:\n        RerankResult = namedtuple(\"\
+          RerankResult\", [\"documents\", \"latency_s\"])\n        return RerankResult([],\
+          \ 0.0)\n\n    start = time.perf_counter()\n    with httpx.Client(timeout=60.0)\
+          \ as client:\n        response = client.post(\n            f\"{reranker_url}/v1/rerank\"\
+          ,\n            json={\n                \"query\": query,\n             \
+          \   \"documents\": [doc[\"text\"] for doc in documents],\n             \
+          \   \"model\": \"bge-reranker-v2-m3\"\n            }\n        )\n      \
+          \  result = response.json()\n    latency = time.perf_counter() - start\n\
+          \n    # Sort by rerank score\n    reranked = sorted(\n        zip(documents,\
+          \ result.get(\"scores\", [0] * len(documents))),\n        key=lambda x:\
+          \ x[1],\n        reverse=True\n    )[:top_k]\n\n    RerankResult = namedtuple(\"\
+          RerankResult\", [\"documents\", \"latency_s\"])\n    return RerankResult([doc\
+          \ for doc, score in reranked], latency)\n\n"
+        image: python:3.13-slim
+    exec-retrieve-context:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - retrieve_context
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
+          \ \"$0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef retrieve_context(\n    embedding: list,\n    milvus_host: str\
+          \ = \"milvus.ai-ml.svc.cluster.local\",\n    collection_name: str = \"knowledge_base\"\
+          ,\n    top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
+          , list), (\"latency_s\", float)]):\n    \"\"\"Retrieve relevant documents\
+          \ from Milvus vector database.\"\"\"\n    import time\n    from pymilvus\
+          \ import connections, Collection, utility\n    from collections import namedtuple\n\
+          \n    start = time.perf_counter()\n    connections.connect(host=milvus_host,\
+          \ port=19530)\n\n    if not utility.has_collection(collection_name):\n \
+          \       latency = time.perf_counter() - start\n        RetrieveResult =\
+          \ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n     \
+          \   return RetrieveResult([], latency)\n\n    collection = Collection(collection_name)\n\
+          \    collection.load()\n\n    results = collection.search(\n        data=[embedding],\n\
+          \        anns_field=\"embedding\",\n        param={\"metric_type\": \"COSINE\"\
+          , \"params\": {\"nprobe\": 10}},\n        limit=top_k,\n        output_fields=[\"\
+          text\", \"source\"]\n    )\n    latency = time.perf_counter() - start\n\n\
+          \    documents = []\n    for hits in results:\n        for hit in hits:\n\
+          \            documents.append({\n                \"text\": hit.entity.get(\"\
+          text\"),\n                \"source\": hit.entity.get(\"source\"),\n    \
+          \            \"score\": hit.distance\n            })\n\n    RetrieveResult\
+          \ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n   \
+          \ return RetrieveResult(documents, latency)\n\n"
+        image: python:3.13-slim
+    exec-synthesize-speech:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - synthesize_speech
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
+          $0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef synthesize_speech(\n    text: str,\n    tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
+          \n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
+          \ float)]):\n    \"\"\"Convert text to speech using TTS service.\"\"\"\n\
+          \    import base64\n    import time\n    import httpx\n    from collections\
+          \ import namedtuple\n\n    start = time.perf_counter()\n    with httpx.Client(timeout=120.0)\
+          \ as client:\n        response = client.post(\n            f\"{tts_url}/v1/audio/speech\"\
+          ,\n            json={\n                \"input\": text,\n              \
+          \  \"voice\": \"en_US-lessac-high\",\n                \"response_format\"\
+          : \"wav\"\n            }\n        )\n        audio_b64 = base64.b64encode(response.content).decode(\"\
+          utf-8\")\n    latency = time.perf_counter() - start\n\n    TTSResult = namedtuple(\"\
+          TTSResult\", [\"audio_b64\", \"latency_s\"])\n    return TTSResult(audio_b64,\
+          \ latency)\n\n"
+        image: python:3.13-slim
+    exec-transcribe-audio:
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - transcribe_audio
+        command:
+        - sh
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
+          $0\" \"$@\"\n"
+        - sh
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef transcribe_audio(\n    audio_b64: str,\n    whisper_url: str\
+          \ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper\"\
+          \n) -> NamedTuple(\"STTResult\", [(\"text\", str), (\"latency_s\", float),\
+          \ (\"audio_duration_s\", float)]):\n    \"\"\"Transcribe audio using Whisper\
+          \ STT service.\"\"\"\n    import base64\n    import time\n    import httpx\n\
+          \    from collections import namedtuple\n\n    audio_bytes = base64.b64decode(audio_b64)\n\
+          \n    start = time.perf_counter()\n    with httpx.Client(timeout=120.0)\
+          \ as client:\n        response = client.post(\n            f\"{whisper_url}/v1/audio/transcriptions\"\
+          ,\n            files={\"file\": (\"audio.wav\", audio_bytes, \"audio/wav\"\
+          )},\n            data={\"model\": \"whisper-large-v3\", \"language\": \"\
+          en\"}\n        )\n        result = response.json()\n    latency = time.perf_counter()\
+          \ - start\n\n    text = result.get(\"text\", \"\")\n    # Estimate audio\
+          \ duration from WAV header (16-bit PCM, 16kHz)\n    audio_duration = max(len(audio_bytes)\
+          \ / (16000 * 2), 0.1)\n\n    STTResult = namedtuple(\"STTResult\", [\"text\"\
+          , \"latency_s\", \"audio_duration_s\"])\n    return STTResult(text, latency,\
+          \ audio_duration)\n\n"
+        image: python:3.13-slim
+pipelineInfo:
+  description: 'End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus ->
+    Rerank -> LLM -> TTS.  Logs per-step latency to MLflow.'
+  name: voice-assistant-rag-pipeline
+root:
+  dag:
+    tasks:
+      generate-embeddings:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-generate-embeddings
+        dependentTasks:
+        - transcribe-audio
+        inputs:
+          parameters:
+            text:
+              taskOutputParameter:
+                outputParameterKey: text
+                producerTask: transcribe-audio
+        taskInfo:
+          name: generate-embeddings
+      generate-response:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-generate-response
+        dependentTasks:
+        - rerank-documents
+        - transcribe-audio
+        inputs:
+          parameters:
+            context:
+              taskOutputParameter:
+                outputParameterKey: documents
+                producerTask: rerank-documents
+            query:
+              taskOutputParameter:
+                outputParameterKey: text
+                producerTask: transcribe-audio
+        taskInfo:
+          name: generate-response
+      log-pipeline-metrics:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-log-pipeline-metrics
+        dependentTasks:
+        - generate-embeddings
+        - generate-response
+        - rerank-documents
+        - retrieve-context
+        - synthesize-speech
+        - transcribe-audio
+        inputs:
+          parameters:
+            embed_latency:
+              taskOutputParameter:
+                outputParameterKey: latency_s
+                producerTask: generate-embeddings
+            llm_completion_tokens:
+              taskOutputParameter:
+                outputParameterKey: completion_tokens
+                producerTask: generate-response
+            llm_latency:
+              taskOutputParameter:
+                outputParameterKey: latency_s
+                producerTask: generate-response
+            rerank_latency:
+              taskOutputParameter:
+                outputParameterKey: latency_s
+                producerTask: rerank-documents
+            retrieve_latency:
+              taskOutputParameter:
+                outputParameterKey: latency_s
+                producerTask: retrieve-context
+            stt_audio_duration:
+              taskOutputParameter:
+                outputParameterKey: audio_duration_s
+                producerTask: transcribe-audio
+            stt_latency:
+              taskOutputParameter:
+                outputParameterKey: latency_s
+                producerTask: transcribe-audio
+            tts_latency:
+              taskOutputParameter:
+                outputParameterKey: latency_s
+                producerTask: synthesize-speech
+        taskInfo:
+          name: log-pipeline-metrics
+      rerank-documents:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-rerank-documents
+        dependentTasks:
+        - retrieve-context
+        - transcribe-audio
+        inputs:
+          parameters:
+            documents:
+              taskOutputParameter:
+                outputParameterKey: documents
+                producerTask: retrieve-context
+            query:
+              taskOutputParameter:
+                outputParameterKey: text
+                producerTask: transcribe-audio
+        taskInfo:
+          name: rerank-documents
+      retrieve-context:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-retrieve-context
+        dependentTasks:
+        - generate-embeddings
+        inputs:
+          parameters:
+            collection_name:
+              componentInputParameter: collection_name
+            embedding:
+              taskOutputParameter:
+                outputParameterKey: embedding
+                producerTask: generate-embeddings
+        taskInfo:
+          name: retrieve-context
+      synthesize-speech:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-synthesize-speech
+        dependentTasks:
+        - generate-response
+        inputs:
+          parameters:
+            text:
+              taskOutputParameter:
+                outputParameterKey: text
+                producerTask: generate-response
+        taskInfo:
+          name: synthesize-speech
+      transcribe-audio:
+        cachingOptions: {}
+        componentRef:
+          name: comp-transcribe-audio
+        inputs:
+          parameters:
+            audio_b64:
+              componentInputParameter: audio_b64
+        taskInfo:
+          name: transcribe-audio
+  inputDefinitions:
+    parameters:
+      audio_b64:
+        description: Base64-encoded audio file
+        parameterType: STRING
+      collection_name:
+        defaultValue: knowledge_base
+        description: Milvus collection for RAG
+        isOptional: true
+        parameterType: STRING
+schemaVersion: 2.1.0
+sdkVersion: kfp-2.12.1