feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow

New:
- vllm_tuning_pipeline.py: A/B benchmark of different vLLM configs; logs latency/TPS/TTFT to MLflow (vllm-tuning experiment)
- vllm_tuning_pipeline.yaml: compiled KFP YAML

Updated:
- voice_pipeline.py: per-step NamedTuple outputs with latency tracking, new log_pipeline_metrics MLflow component
- voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
2026-02-13 08:24:11 -05:00
parent cee21f124c
commit bc4b230dd9
6 changed files with 2216 additions and 26 deletions
--- a/rag_pipeline.yaml
+++ b/rag_pipeline.yaml
@@ -0,0 +1,363 @@
+# PIPELINE DEFINITION
+# Name: rag-query-pipeline
+# Description: RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM.  Logs per-step latency to MLflow.
+# Inputs:
+#    collection_name: str [Default: 'knowledge_base']
+#    query: str
+components:
+  comp-generate-embeddings:  # Interface of the embedding step: text in, embedding vector and latency out.
+    executorLabel: exec-generate-embeddings  # implementation lives under deploymentSpec.executors
+    inputDefinitions:
+      parameters:
+        embeddings_url:
+          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings  # NOTE(review): the executor appends another "/embeddings" to this value; confirm the doubled path is intended
+          isOptional: true
+          parameterType: STRING
+        text:  # required (no defaultValue / isOptional)
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        embedding:  # result["data"][0]["embedding"] of the service response
+          parameterType: LIST
+        latency_s:  # time.perf_counter() span around the HTTP request
+          parameterType: NUMBER_DOUBLE
+  comp-generate-response:  # Interface of the LLM answer step (chat completion over the supplied context).
+    executorLabel: exec-generate-response  # implementation lives under deploymentSpec.executors
+    inputDefinitions:
+      parameters:
+        context:  # list of documents; the executor reads doc["text"] from each item
+          parameterType: LIST
+        model:  # sent as the "model" field of the chat-completions request
+          defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
+          isOptional: true
+          parameterType: STRING
+        query:  # required (no defaultValue / isOptional)
+          parameterType: STRING
+        vllm_url:  # base URL; the executor appends "/v1/chat/completions"
+          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
+          isOptional: true
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        completion_tokens:  # usage["completion_tokens"] if present, else a whitespace word count of the reply
+          parameterType: NUMBER_INTEGER
+        latency_s:  # time.perf_counter() span around the HTTP request
+          parameterType: NUMBER_DOUBLE
+        text:  # choices[0]["message"]["content"] of the response
+          parameterType: STRING
+  comp-rerank-documents:  # Interface of the rerank step: reorders retrieved documents against the query.
+    executorLabel: exec-rerank-documents  # implementation lives under deploymentSpec.executors
+    inputDefinitions:
+      parameters:
+        documents:  # the executor sends doc["text"] of each item to the reranker
+          parameterType: LIST
+        query:  # required (no defaultValue / isOptional)
+          parameterType: STRING
+        reranker_url:  # base URL; the executor appends "/v1/rerank"
+          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
+          isOptional: true
+          parameterType: STRING
+        top_k:  # number of highest-scoring documents kept
+          defaultValue: 3.0  # written as a float here; the Python signature default is the int 3
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+    outputDefinitions:
+      parameters:
+        documents:  # input documents sorted by score (descending), truncated to top_k
+          parameterType: LIST
+        latency_s:  # 0.0 when the input list is empty (no request is made in that case)
+          parameterType: NUMBER_DOUBLE
+  comp-retrieve-context:  # Interface of the vector-search step against Milvus.
+    executorLabel: exec-retrieve-context  # implementation lives under deploymentSpec.executors
+    inputDefinitions:
+      parameters:
+        collection_name:  # collection to search; an absent collection yields an empty document list
+          defaultValue: knowledge_base
+          isOptional: true
+          parameterType: STRING
+        embedding:  # query vector; passed to collection.search as data=[embedding]
+          parameterType: LIST
+        milvus_host:  # host only; the executor always connects on port 19530
+          defaultValue: milvus.ai-ml.svc.cluster.local
+          isOptional: true
+          parameterType: STRING
+        top_k:  # used as the search `limit`
+          defaultValue: 5.0  # written as a float here; the Python signature default is the int 5
+          isOptional: true
+          parameterType: NUMBER_INTEGER
+    outputDefinitions:
+      parameters:
+        documents:  # list of {"text", "source", "score"} dicts, score being hit.distance
+          parameterType: LIST
+        latency_s:  # measured from before connections.connect() until the search returns
+          parameterType: NUMBER_DOUBLE
+deploymentSpec:
+  executors:
+    exec-generate-embeddings:  # Container spec that runs generate_embeddings(): POSTs the text to the embeddings service and times the call.
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - generate_embeddings  # NOTE(review): the function calls response.json() without checking the HTTP status; confirm error handling
+        command:
+        - sh  # stage 1: ensure pip exists, install kfp==2.12.1 and httpx, then run the remaining entries via "$0" "$@"
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
+          $0\" \"$@\"\n"
+        - sh  # stage 2: write "$0" (the Python source in the next entry) to ephemeral_component.py and run kfp.dsl.executor_main on it
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef generate_embeddings(\n    text: str,\n    embeddings_url: str\
+          \ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\
+          \n) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\"\
+          , float)]):\n    \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n  \
+          \  import time\n    import httpx\n    from collections import namedtuple\n\
+          \n    start = time.perf_counter()\n    with httpx.Client(timeout=60.0) as\
+          \ client:\n        response = client.post(\n            f\"{embeddings_url}/embeddings\"\
+          ,\n            json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
+          \        )\n        result = response.json()\n    latency = time.perf_counter()\
+          \ - start\n\n    EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\"\
+          , \"latency_s\"])\n    return EmbedResult(result[\"data\"][0][\"embedding\"\
+          ], latency)\n\n"
+        image: python:3.13-slim  # dependencies are installed at container start by stage 1, not baked into the image
+    exec-generate-response:  # Container spec that runs generate_response(): builds a system+user chat prompt and POSTs it to vLLM.
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - generate_response  # NOTE(review): the function calls response.json() without checking the HTTP status; confirm error handling
+        command:
+        - sh  # stage 1: ensure pip exists, install kfp==2.12.1 and httpx, then run the remaining entries via "$0" "$@"
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
+          $0\" \"$@\"\n"
+        - sh  # stage 2: write "$0" (the Python source in the next entry) to ephemeral_component.py and run kfp.dsl.executor_main on it
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef generate_response(\n    query: str,\n    context: list,\n   \
+          \ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
+          ,\n    model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
+          \n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
+          \ (\"completion_tokens\", int)]):\n    \"\"\"Generate response using vLLM.\"\
+          \"\"\n    import time\n    import httpx\n    from collections import namedtuple\n\
+          \n    # Build context\n    if context:\n        context_text = \"\\n\\n\"\
+          .join([doc[\"text\"] for doc in context])\n        user_content = f\"Context:\\\
+          n{context_text}\\n\\nQuestion: {query}\"\n    else:\n        user_content\
+          \ = query\n\n    system_prompt = \"\"\"You are a helpful voice assistant.\n\
+          Answer questions based on the provided context when available.\nKeep responses\
+          \ concise and natural for speech synthesis.\"\"\"\n\n    messages = [\n\
+          \        {\"role\": \"system\", \"content\": system_prompt},\n        {\"\
+          role\": \"user\", \"content\": user_content}\n    ]\n\n    start = time.perf_counter()\n\
+          \    with httpx.Client(timeout=180.0) as client:\n        response = client.post(\n\
+          \            f\"{vllm_url}/v1/chat/completions\",\n            json={\n\
+          \                \"model\": model,\n                \"messages\": messages,\n\
+          \                \"max_tokens\": 512,\n                \"temperature\":\
+          \ 0.7\n            }\n        )\n        result = response.json()\n    latency\
+          \ = time.perf_counter() - start\n\n    text = result[\"choices\"][0][\"\
+          message\"][\"content\"]\n    usage = result.get(\"usage\", {})\n    completion_tokens\
+          \ = usage.get(\"completion_tokens\", len(text.split()))\n\n    LLMResult\
+          \ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
+          ])\n    return LLMResult(text, latency, completion_tokens)\n\n"
+        image: python:3.13-slim  # dependencies are installed at container start by stage 1, not baked into the image
+    exec-rerank-documents:  # Container spec that runs rerank_documents(): scores documents with the reranker service and keeps the top_k best.
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - rerank_documents  # NOTE(review): no HTTP status check, and a response without "scores" silently falls back to all-zero scores; confirm
+        command:
+        - sh  # stage 1: ensure pip exists, install kfp==2.12.1 and httpx, then run the remaining entries via "$0" "$@"
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
+          $0\" \"$@\"\n"
+        - sh  # stage 2: write "$0" (the Python source in the next entry) to ephemeral_component.py and run kfp.dsl.executor_main on it
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef rerank_documents(\n    query: str,\n    documents: list,\n  \
+          \  reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
+          ,\n    top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
+          , list), (\"latency_s\", float)]):\n    \"\"\"Rerank documents using BGE\
+          \ reranker.\"\"\"\n    import time\n    import httpx\n    from collections\
+          \ import namedtuple\n\n    if not documents:\n        RerankResult = namedtuple(\"\
+          RerankResult\", [\"documents\", \"latency_s\"])\n        return RerankResult([],\
+          \ 0.0)\n\n    start = time.perf_counter()\n    with httpx.Client(timeout=60.0)\
+          \ as client:\n        response = client.post(\n            f\"{reranker_url}/v1/rerank\"\
+          ,\n            json={\n                \"query\": query,\n             \
+          \   \"documents\": [doc[\"text\"] for doc in documents],\n             \
+          \   \"model\": \"bge-reranker-v2-m3\"\n            }\n        )\n      \
+          \  result = response.json()\n    latency = time.perf_counter() - start\n\
+          \n    # Sort by rerank score\n    reranked = sorted(\n        zip(documents,\
+          \ result.get(\"scores\", [0] * len(documents))),\n        key=lambda x:\
+          \ x[1],\n        reverse=True\n    )[:top_k]\n\n    RerankResult = namedtuple(\"\
+          RerankResult\", [\"documents\", \"latency_s\"])\n    return RerankResult([doc\
+          \ for doc, score in reranked], latency)\n\n"
+        image: python:3.13-slim  # dependencies are installed at container start by stage 1, not baked into the image
+    exec-retrieve-context:  # Container spec that runs retrieve_context(): COSINE vector search in a Milvus collection.
+      container:
+        args:
+        - --executor_input
+        - '{{$}}'
+        - --function_to_execute
+        - retrieve_context  # NOTE(review): latency_s spans connect + collection.load() + search, and the connection is never explicitly closed; confirm
+        command:
+        - sh  # stage 1: ensure pip exists, install kfp==2.12.1 and pymilvus, then run the remaining entries via "$0" "$@"
+        - -c
+        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
+          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
+          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
+          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+          \  python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
+          \ \"$0\" \"$@\"\n"
+        - sh  # stage 2: write "$0" (the Python source in the next entry) to ephemeral_component.py and run kfp.dsl.executor_main on it
+        - -ec
+        - 'program_path=$(mktemp -d)
+
+
+          printf "%s" "$0" > "$program_path/ephemeral_component.py"
+
+          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+
+          '
+        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
+          \ *\n\ndef retrieve_context(\n    embedding: list,\n    milvus_host: str\
+          \ = \"milvus.ai-ml.svc.cluster.local\",\n    collection_name: str = \"knowledge_base\"\
+          ,\n    top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
+          , list), (\"latency_s\", float)]):\n    \"\"\"Retrieve relevant documents\
+          \ from Milvus vector database.\"\"\"\n    import time\n    from pymilvus\
+          \ import connections, Collection, utility\n    from collections import namedtuple\n\
+          \n    start = time.perf_counter()\n    connections.connect(host=milvus_host,\
+          \ port=19530)\n\n    if not utility.has_collection(collection_name):\n \
+          \       latency = time.perf_counter() - start\n        RetrieveResult =\
+          \ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n     \
+          \   return RetrieveResult([], latency)\n\n    collection = Collection(collection_name)\n\
+          \    collection.load()\n\n    results = collection.search(\n        data=[embedding],\n\
+          \        anns_field=\"embedding\",\n        param={\"metric_type\": \"COSINE\"\
+          , \"params\": {\"nprobe\": 10}},\n        limit=top_k,\n        output_fields=[\"\
+          text\", \"source\"]\n    )\n    latency = time.perf_counter() - start\n\n\
+          \    documents = []\n    for hits in results:\n        for hit in hits:\n\
+          \            documents.append({\n                \"text\": hit.entity.get(\"\
+          text\"),\n                \"source\": hit.entity.get(\"source\"),\n    \
+          \            \"score\": hit.distance\n            })\n\n    RetrieveResult\
+          \ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n   \
+          \ return RetrieveResult(documents, latency)\n\n"
+        image: python:3.13-slim  # dependencies are installed at container start by stage 1, not baked into the image
+pipelineInfo:  # Pipeline-level metadata: human-readable description and the pipeline name.
+  description: 'RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM.  Logs per-step
+    latency to MLflow.'  # NOTE(review): root.dag in this file defines no MLflow logging task; latency is only exposed as step outputs. Confirm the description.
+  name: rag-query-pipeline
+root:
+  dag:
+    tasks:
+      generate-embeddings:  # Entry task (no dependentTasks): embeds the pipeline's `query` input.
+        cachingOptions:
+          enableCache: true  # NOTE(review): a cache hit would presumably replay the stored latency_s; confirm caching is wanted on a timed step
+        componentRef:
+          name: comp-generate-embeddings
+        inputs:
+          parameters:
+            text:
+              componentInputParameter: query  # embeddings_url is not wired here, so the component default applies
+        taskInfo:
+          name: generate-embeddings
+      generate-response:  # Final task: asks the LLM to answer `query` using the reranked documents as context.
+        cachingOptions:
+          enableCache: true  # NOTE(review): a cache hit would presumably replay the stored text and latency_s; confirm caching is wanted here
+        componentRef:
+          name: comp-generate-response
+        dependentTasks:
+        - rerank-documents  # runs after reranking because it consumes that task's `documents` output
+        inputs:
+          parameters:
+            context:
+              taskOutputParameter:
+                outputParameterKey: documents
+                producerTask: rerank-documents
+            query:
+              componentInputParameter: query  # model and vllm_url are not wired here, so the component defaults apply
+        taskInfo:
+          name: generate-response
+      rerank-documents:  # Third task: reranks the documents returned by retrieve-context against `query`.
+        cachingOptions:
+          enableCache: true  # NOTE(review): a cache hit would presumably replay the stored latency_s; confirm caching is wanted on a timed step
+        componentRef:
+          name: comp-rerank-documents
+        dependentTasks:
+        - retrieve-context  # runs after retrieval because it consumes that task's `documents` output
+        inputs:
+          parameters:
+            documents:
+              taskOutputParameter:
+                outputParameterKey: documents
+                producerTask: retrieve-context
+            query:
+              componentInputParameter: query  # reranker_url and top_k are not wired here, so the component defaults apply
+        taskInfo:
+          name: rerank-documents
+      retrieve-context:  # Second task: searches Milvus with the embedding produced by generate-embeddings.
+        cachingOptions:
+          enableCache: true  # NOTE(review): a cache hit would presumably skip the Milvus search and replay earlier documents; confirm caching is wanted here
+        componentRef:
+          name: comp-retrieve-context
+        dependentTasks:
+        - generate-embeddings  # runs after embedding because it consumes that task's `embedding` output
+        inputs:
+          parameters:
+            collection_name:
+              componentInputParameter: collection_name  # milvus_host and top_k are not wired here, so the component defaults apply
+            embedding:
+              taskOutputParameter:
+                outputParameterKey: embedding
+                producerTask: generate-embeddings
+        taskInfo:
+          name: retrieve-context
+  inputDefinitions:  # Run-time inputs of the pipeline as a whole.
+    parameters:
+      collection_name:  # optional; forwarded to the retrieve-context task
+        defaultValue: knowledge_base
+        description: Milvus collection name
+        isOptional: true
+        parameterType: STRING
+      query:  # required; forwarded to generate-embeddings, rerank-documents and generate-response
+        description: Text query
+        parameterType: STRING
+schemaVersion: 2.1.0
+sdkVersion: kfp-2.12.1