# Change notes (from commit message):
#   New:
#     - vllm_tuning_pipeline.py: A/B benchmark different vLLM configs; logs
#       latency/TPS/TTFT to MLflow (vllm-tuning experiment)
#     - vllm_tuning_pipeline.yaml: compiled KFP YAML
#   Updated:
#     - voice_pipeline.py: per-step NamedTuple outputs with latency tracking,
#       new log_pipeline_metrics MLflow component
#     - voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
# PIPELINE DEFINITION
# Name: voice-assistant-rag-pipeline
# Description: End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus
#              -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow.
# Inputs:
#    audio_b64: str
#    collection_name: str [Default: 'knowledge_base']
# Component interface declarations: typed inputs/outputs for each pipeline step.
components:
  comp-generate-embeddings:
    executorLabel: exec-generate-embeddings
    inputDefinitions:
      parameters:
        embeddings_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
          isOptional: true
          parameterType: STRING
        text:
          parameterType: STRING
    outputDefinitions:
      parameters:
        embedding:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-generate-response:
    executorLabel: exec-generate-response
    inputDefinitions:
      parameters:
        context:
          parameterType: LIST
        model:
          defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
          isOptional: true
          parameterType: STRING
        query:
          parameterType: STRING
        vllm_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        completion_tokens:
          parameterType: NUMBER_INTEGER
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
  comp-log-pipeline-metrics:
    executorLabel: exec-log-pipeline-metrics
    inputDefinitions:
      parameters:
        embed_latency:
          parameterType: NUMBER_DOUBLE
        experiment_name:
          defaultValue: voice-pipeline-metrics
          isOptional: true
          parameterType: STRING
        llm_completion_tokens:
          parameterType: NUMBER_INTEGER
        llm_latency:
          parameterType: NUMBER_DOUBLE
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        rerank_latency:
          parameterType: NUMBER_DOUBLE
        retrieve_latency:
          parameterType: NUMBER_DOUBLE
        run_name:
          defaultValue: voice-pipeline
          isOptional: true
          parameterType: STRING
        stt_audio_duration:
          parameterType: NUMBER_DOUBLE
        stt_latency:
          parameterType: NUMBER_DOUBLE
        tts_latency:
          parameterType: NUMBER_DOUBLE
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  comp-rerank-documents:
    executorLabel: exec-rerank-documents
    inputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        query:
          parameterType: STRING
        reranker_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-retrieve-context:
    executorLabel: exec-retrieve-context
    inputDefinitions:
      parameters:
        collection_name:
          defaultValue: knowledge_base
          isOptional: true
          parameterType: STRING
        embedding:
          parameterType: LIST
        milvus_host:
          defaultValue: milvus.ai-ml.svc.cluster.local
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 5.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-synthesize-speech:
    executorLabel: exec-synthesize-speech
    inputDefinitions:
      parameters:
        text:
          parameterType: STRING
        tts_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        audio_b64:
          parameterType: STRING
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-transcribe-audio:
    executorLabel: exec-transcribe-audio
    inputDefinitions:
      parameters:
        audio_b64:
          parameterType: STRING
        whisper_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        audio_duration_s:
          parameterType: NUMBER_DOUBLE
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
# Runtime definitions: each executor installs its dependencies at container
# start, then runs the serialized component function via KFP's
# ephemeral-component shim. Scripts are literal block scalars for readability.
deploymentSpec:
  executors:
    exec-generate-embeddings:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - generate_embeddings
        command:
        - sh
        - -c
        - |

          if ! [ -x "$(command -v pip)" ]; then
              python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)
          printf "%s" "$0" > "$program_path/ephemeral_component.py"
          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
        - |

          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def generate_embeddings(
              text: str,
              embeddings_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings"
          ) -> NamedTuple("EmbedResult", [("embedding", list), ("latency_s", float)]):
              """Generate embeddings for RAG retrieval."""
              import time
              import httpx
              from collections import namedtuple

              start = time.perf_counter()
              with httpx.Client(timeout=60.0) as client:
                  response = client.post(
                      f"{embeddings_url}/embeddings",
                      json={"input": text, "model": "bge-small-en-v1.5"}
                  )
                  result = response.json()
              latency = time.perf_counter() - start

              EmbedResult = namedtuple("EmbedResult", ["embedding", "latency_s"])
              return EmbedResult(result["data"][0]["embedding"], latency)
        image: python:3.13-slim
exec-generate-response:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- generate_response
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef generate_response(\n query: str,\n context: list,\n \
|
|
\ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
|
|
,\n model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
|
|
\n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
|
|
\ (\"completion_tokens\", int)]):\n \"\"\"Generate response using vLLM.\"\
|
|
\"\"\n import time\n import httpx\n from collections import namedtuple\n\
|
|
\n # Build context\n if context:\n context_text = \"\\n\\n\"\
|
|
.join([doc[\"text\"] for doc in context])\n user_content = f\"Context:\\\
|
|
n{context_text}\\n\\nQuestion: {query}\"\n else:\n user_content\
|
|
\ = query\n\n system_prompt = \"\"\"You are a helpful voice assistant.\n\
|
|
Answer questions based on the provided context when available.\nKeep responses\
|
|
\ concise and natural for speech synthesis.\"\"\"\n\n messages = [\n\
|
|
\ {\"role\": \"system\", \"content\": system_prompt},\n {\"\
|
|
role\": \"user\", \"content\": user_content}\n ]\n\n start = time.perf_counter()\n\
|
|
\ with httpx.Client(timeout=180.0) as client:\n response = client.post(\n\
|
|
\ f\"{vllm_url}/v1/chat/completions\",\n json={\n\
|
|
\ \"model\": model,\n \"messages\": messages,\n\
|
|
\ \"max_tokens\": 512,\n \"temperature\":\
|
|
\ 0.7\n }\n )\n result = response.json()\n latency\
|
|
\ = time.perf_counter() - start\n\n text = result[\"choices\"][0][\"\
|
|
message\"][\"content\"]\n usage = result.get(\"usage\", {})\n completion_tokens\
|
|
\ = usage.get(\"completion_tokens\", len(text.split()))\n\n LLMResult\
|
|
\ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
|
|
])\n return LLMResult(text, latency, completion_tokens)\n\n"
|
|
image: python:3.13-slim
|
|
exec-log-pipeline-metrics:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- log_pipeline_metrics
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
|
|
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef log_pipeline_metrics(\n stt_latency: float,\n stt_audio_duration:\
|
|
\ float,\n embed_latency: float,\n retrieve_latency: float,\n rerank_latency:\
|
|
\ float,\n llm_latency: float,\n llm_completion_tokens: int,\n \
|
|
\ tts_latency: float,\n experiment_name: str = \"voice-pipeline-metrics\"\
|
|
,\n run_name: str = \"voice-pipeline\",\n mlflow_tracking_uri: str\
|
|
\ = \"http://mlflow.mlflow.svc.cluster.local:80\",\n) -> str:\n \"\"\"\
|
|
Log per-step latency metrics to MLflow for the full voice pipeline.\"\"\"\
|
|
\n import os\n import mlflow\n from mlflow.tracking import MlflowClient\n\
|
|
\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
|
|
\n exp = client.get_experiment_by_name(experiment_name)\n experiment_id\
|
|
\ = (\n exp.experiment_id\n if exp\n else client.create_experiment(\n\
|
|
\ name=experiment_name,\n artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
|
|
,\n )\n )\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\n\
|
|
\ run_name=run_name,\n tags={\n \"pipeline.type\"\
|
|
: \"voice-assistant\",\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\"\
|
|
, \"unknown\"),\n },\n )\n\n total_latency = (\n stt_latency\
|
|
\ + embed_latency + retrieve_latency\n + rerank_latency + llm_latency\
|
|
\ + tts_latency\n )\n stt_rtf = stt_latency / stt_audio_duration if\
|
|
\ stt_audio_duration > 0 else 0\n llm_tps = llm_completion_tokens / llm_latency\
|
|
\ if llm_latency > 0 else 0\n\n mlflow.log_metrics({\n \"stt_latency_s\"\
|
|
: stt_latency,\n \"stt_audio_duration_s\": stt_audio_duration,\n\
|
|
\ \"stt_realtime_factor\": stt_rtf,\n \"embed_latency_s\"\
|
|
: embed_latency,\n \"retrieve_latency_s\": retrieve_latency,\n \
|
|
\ \"rerank_latency_s\": rerank_latency,\n \"llm_latency_s\"\
|
|
: llm_latency,\n \"llm_completion_tokens\": llm_completion_tokens,\n\
|
|
\ \"llm_tokens_per_second\": llm_tps,\n \"tts_latency_s\"\
|
|
: tts_latency,\n \"total_pipeline_latency_s\": total_latency,\n \
|
|
\ })\n mlflow.end_run()\n return run.info.run_id\n\n"
|
|
image: python:3.13-slim
|
|
exec-rerank-documents:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- rerank_documents
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef rerank_documents(\n query: str,\n documents: list,\n \
|
|
\ reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
|
|
,\n top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
|
|
, list), (\"latency_s\", float)]):\n \"\"\"Rerank documents using BGE\
|
|
\ reranker.\"\"\"\n import time\n import httpx\n from collections\
|
|
\ import namedtuple\n\n if not documents:\n RerankResult = namedtuple(\"\
|
|
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([],\
|
|
\ 0.0)\n\n start = time.perf_counter()\n with httpx.Client(timeout=60.0)\
|
|
\ as client:\n response = client.post(\n f\"{reranker_url}/v1/rerank\"\
|
|
,\n json={\n \"query\": query,\n \
|
|
\ \"documents\": [doc[\"text\"] for doc in documents],\n \
|
|
\ \"model\": \"bge-reranker-v2-m3\"\n }\n )\n \
|
|
\ result = response.json()\n latency = time.perf_counter() - start\n\
|
|
\n # Sort by rerank score\n reranked = sorted(\n zip(documents,\
|
|
\ result.get(\"scores\", [0] * len(documents))),\n key=lambda x:\
|
|
\ x[1],\n reverse=True\n )[:top_k]\n\n RerankResult = namedtuple(\"\
|
|
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([doc\
|
|
\ for doc, score in reranked], latency)\n\n"
|
|
image: python:3.13-slim
|
|
exec-retrieve-context:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- retrieve_context
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
|
|
\ \"$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef retrieve_context(\n embedding: list,\n milvus_host: str\
|
|
\ = \"milvus.ai-ml.svc.cluster.local\",\n collection_name: str = \"knowledge_base\"\
|
|
,\n top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
|
|
, list), (\"latency_s\", float)]):\n \"\"\"Retrieve relevant documents\
|
|
\ from Milvus vector database.\"\"\"\n import time\n from pymilvus\
|
|
\ import connections, Collection, utility\n from collections import namedtuple\n\
|
|
\n start = time.perf_counter()\n connections.connect(host=milvus_host,\
|
|
\ port=19530)\n\n if not utility.has_collection(collection_name):\n \
|
|
\ latency = time.perf_counter() - start\n RetrieveResult =\
|
|
\ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
|
|
\ return RetrieveResult([], latency)\n\n collection = Collection(collection_name)\n\
|
|
\ collection.load()\n\n results = collection.search(\n data=[embedding],\n\
|
|
\ anns_field=\"embedding\",\n param={\"metric_type\": \"COSINE\"\
|
|
, \"params\": {\"nprobe\": 10}},\n limit=top_k,\n output_fields=[\"\
|
|
text\", \"source\"]\n )\n latency = time.perf_counter() - start\n\n\
|
|
\ documents = []\n for hits in results:\n for hit in hits:\n\
|
|
\ documents.append({\n \"text\": hit.entity.get(\"\
|
|
text\"),\n \"source\": hit.entity.get(\"source\"),\n \
|
|
\ \"score\": hit.distance\n })\n\n RetrieveResult\
|
|
\ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
|
|
\ return RetrieveResult(documents, latency)\n\n"
|
|
image: python:3.13-slim
|
|
exec-synthesize-speech:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- synthesize_speech
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef synthesize_speech(\n text: str,\n tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
|
|
\n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
|
|
\ float)]):\n \"\"\"Convert text to speech using TTS service.\"\"\"\n\
|
|
\ import base64\n import time\n import httpx\n from collections\
|
|
\ import namedtuple\n\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
|
|
\ as client:\n response = client.post(\n f\"{tts_url}/v1/audio/speech\"\
|
|
,\n json={\n \"input\": text,\n \
|
|
\ \"voice\": \"en_US-lessac-high\",\n \"response_format\"\
|
|
: \"wav\"\n }\n )\n audio_b64 = base64.b64encode(response.content).decode(\"\
|
|
utf-8\")\n latency = time.perf_counter() - start\n\n TTSResult = namedtuple(\"\
|
|
TTSResult\", [\"audio_b64\", \"latency_s\"])\n return TTSResult(audio_b64,\
|
|
\ latency)\n\n"
|
|
image: python:3.13-slim
|
|
exec-transcribe-audio:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- transcribe_audio
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef transcribe_audio(\n audio_b64: str,\n whisper_url: str\
|
|
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper\"\
|
|
\n) -> NamedTuple(\"STTResult\", [(\"text\", str), (\"latency_s\", float),\
|
|
\ (\"audio_duration_s\", float)]):\n \"\"\"Transcribe audio using Whisper\
|
|
\ STT service.\"\"\"\n import base64\n import time\n import httpx\n\
|
|
\ from collections import namedtuple\n\n audio_bytes = base64.b64decode(audio_b64)\n\
|
|
\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
|
|
\ as client:\n response = client.post(\n f\"{whisper_url}/v1/audio/transcriptions\"\
|
|
,\n files={\"file\": (\"audio.wav\", audio_bytes, \"audio/wav\"\
|
|
)},\n data={\"model\": \"whisper-large-v3\", \"language\": \"\
|
|
en\"}\n )\n result = response.json()\n latency = time.perf_counter()\
|
|
\ - start\n\n text = result.get(\"text\", \"\")\n # Estimate audio\
|
|
\ duration from WAV header (16-bit PCM, 16kHz)\n audio_duration = max(len(audio_bytes)\
|
|
\ / (16000 * 2), 0.1)\n\n STTResult = namedtuple(\"STTResult\", [\"text\"\
|
|
, \"latency_s\", \"audio_duration_s\"])\n return STTResult(text, latency,\
|
|
\ audio_duration)\n\n"
|
|
image: python:3.13-slim
|
|
pipelineInfo:
  description: 'End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus ->
    Rerank -> LLM -> TTS. Logs per-step latency to MLflow.'
  name: voice-assistant-rag-pipeline
# Pipeline DAG: task wiring (dependencies + parameter routing) and the
# pipeline-level input definitions.
root:
  dag:
    tasks:
      generate-embeddings:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-embeddings
        dependentTasks:
        - transcribe-audio
        inputs:
          parameters:
            text:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: generate-embeddings
      generate-response:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-response
        dependentTasks:
        - rerank-documents
        - transcribe-audio
        inputs:
          parameters:
            context:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: rerank-documents
            query:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: generate-response
      log-pipeline-metrics:
        # NOTE(review): caching disabled — this step's purpose is its MLflow
        # write side effect; a cache hit would silently skip logging. Mirrors
        # the already-uncached transcribe-audio task.
        cachingOptions: {}
        componentRef:
          name: comp-log-pipeline-metrics
        dependentTasks:
        - generate-embeddings
        - generate-response
        - rerank-documents
        - retrieve-context
        - synthesize-speech
        - transcribe-audio
        inputs:
          parameters:
            embed_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: generate-embeddings
            llm_completion_tokens:
              taskOutputParameter:
                outputParameterKey: completion_tokens
                producerTask: generate-response
            llm_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: generate-response
            rerank_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: rerank-documents
            retrieve_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: retrieve-context
            stt_audio_duration:
              taskOutputParameter:
                outputParameterKey: audio_duration_s
                producerTask: transcribe-audio
            stt_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: transcribe-audio
            tts_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: synthesize-speech
        taskInfo:
          name: log-pipeline-metrics
      rerank-documents:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-rerank-documents
        dependentTasks:
        - retrieve-context
        - transcribe-audio
        inputs:
          parameters:
            documents:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: retrieve-context
            query:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: rerank-documents
      retrieve-context:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-retrieve-context
        dependentTasks:
        - generate-embeddings
        inputs:
          parameters:
            collection_name:
              componentInputParameter: collection_name
            embedding:
              taskOutputParameter:
                outputParameterKey: embedding
                producerTask: generate-embeddings
        taskInfo:
          name: retrieve-context
      synthesize-speech:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-synthesize-speech
        dependentTasks:
        - generate-response
        inputs:
          parameters:
            text:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: generate-response
        taskInfo:
          name: synthesize-speech
      transcribe-audio:
        cachingOptions: {}
        componentRef:
          name: comp-transcribe-audio
        inputs:
          parameters:
            audio_b64:
              componentInputParameter: audio_b64
        taskInfo:
          name: transcribe-audio
  inputDefinitions:
    parameters:
      audio_b64:
        description: Base64-encoded audio file
        parameterType: STRING
      collection_name:
        defaultValue: knowledge_base
        description: Milvus collection for RAG
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1