feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow

New:
- vllm_tuning_pipeline.py: A/B-benchmarks alternative vLLM configs and logs
  latency, tokens/sec (TPS), and time-to-first-token (TTFT) to MLflow
  (vllm-tuning experiment); see the sketch after this list
- vllm_tuning_pipeline.yaml: compiled KFP YAML
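
A minimal sketch of the benchmark pattern (the function name, prompt, and
SSE-chunk token counting are illustrative, not the exact contents of
vllm_tuning_pipeline.py; the URLs match the cluster defaults used in the
compiled YAML below):

    import time
    import httpx
    import mlflow

    def benchmark_config(vllm_url: str, model: str, prompt: str,
                         max_tokens: int = 256) -> dict:
        """One streaming completion; measure TTFT, total latency, and TPS."""
        start = time.perf_counter()
        ttft = None
        tokens = 0
        with httpx.Client(timeout=180.0) as client:
            with client.stream(
                "POST", f"{vllm_url}/v1/completions",
                json={"model": model, "prompt": prompt,
                      "max_tokens": max_tokens, "stream": True},
            ) as response:
                for line in response.iter_lines():
                    if not line.startswith("data:") or "[DONE]" in line:
                        continue
                    if ttft is None:
                        ttft = time.perf_counter() - start  # time to first token
                    tokens += 1  # roughly one token per SSE chunk
        latency = time.perf_counter() - start
        return {
            "ttft_s": ttft if ttft is not None else latency,
            "latency_s": latency,
            "tokens_per_second": tokens / latency if latency > 0 else 0.0,
        }

    mlflow.set_tracking_uri("http://mlflow.mlflow.svc.cluster.local:80")
    mlflow.set_experiment("vllm-tuning")
    with mlflow.start_run(run_name="config-a"):
        mlflow.log_metrics(benchmark_config(
            "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm",
            "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
            "Explain retrieval-augmented generation in one sentence."))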

Updated:
- voice_pipeline.py: every step now returns a NamedTuple carrying its
  payload plus a latency_s field; new log_pipeline_metrics component
  aggregates them into a single MLflow run (pattern sketch after this list)
- voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
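
The per-step output pattern, reconstructed from the compiled YAML below
(the @dsl.component decorator options are an assumption about the source
file; the function body is taken verbatim from the generate-embeddings
executor):

    from typing import NamedTuple
    from kfp import dsl

    @dsl.component(base_image="python:3.13-slim", packages_to_install=["httpx"])
    def generate_embeddings(
        text: str,
        embeddings_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings",
    ) -> NamedTuple("EmbedResult", [("embedding", list), ("latency_s", float)]):
        """Generate embeddings for RAG retrieval, returning payload + latency."""
        import time
        import httpx
        from collections import namedtuple

        start = time.perf_counter()
        with httpx.Client(timeout=60.0) as client:
            response = client.post(
                f"{embeddings_url}/embeddings",
                json={"input": text, "model": "bge-small-en-v1.5"},
            )
            result = response.json()
        latency = time.perf_counter() - start

        EmbedResult = namedtuple("EmbedResult", ["embedding", "latency_s"])
        return EmbedResult(result["data"][0]["embedding"], latency)

Every component follows this shape, which is what lets log_pipeline_metrics
consume each step's latency_s output parameter and write one MLflow run.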

commit bc4b230dd9 (parent cee21f124c)
Date: 2026-02-13 08:24:11 -05:00
6 changed files with 2216 additions and 26 deletions

voice_pipeline.yaml (new file, 656 lines)

@@ -0,0 +1,656 @@
# PIPELINE DEFINITION
# Name: voice-assistant-rag-pipeline
# Description: End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow.
# Inputs:
# audio_b64: str
# collection_name: str [Default: 'knowledge_base']
components:
comp-generate-embeddings:
executorLabel: exec-generate-embeddings
inputDefinitions:
parameters:
embeddings_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
isOptional: true
parameterType: STRING
text:
parameterType: STRING
outputDefinitions:
parameters:
embedding:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
comp-generate-response:
executorLabel: exec-generate-response
inputDefinitions:
parameters:
context:
parameterType: LIST
model:
defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
isOptional: true
parameterType: STRING
query:
parameterType: STRING
vllm_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
completion_tokens:
parameterType: NUMBER_INTEGER
latency_s:
parameterType: NUMBER_DOUBLE
text:
parameterType: STRING
comp-log-pipeline-metrics:
executorLabel: exec-log-pipeline-metrics
inputDefinitions:
parameters:
embed_latency:
parameterType: NUMBER_DOUBLE
experiment_name:
defaultValue: voice-pipeline-metrics
isOptional: true
parameterType: STRING
llm_completion_tokens:
parameterType: NUMBER_INTEGER
llm_latency:
parameterType: NUMBER_DOUBLE
mlflow_tracking_uri:
defaultValue: http://mlflow.mlflow.svc.cluster.local:80
isOptional: true
parameterType: STRING
rerank_latency:
parameterType: NUMBER_DOUBLE
retrieve_latency:
parameterType: NUMBER_DOUBLE
run_name:
defaultValue: voice-pipeline
isOptional: true
parameterType: STRING
stt_audio_duration:
parameterType: NUMBER_DOUBLE
stt_latency:
parameterType: NUMBER_DOUBLE
tts_latency:
parameterType: NUMBER_DOUBLE
outputDefinitions:
parameters:
Output:
parameterType: STRING
comp-rerank-documents:
executorLabel: exec-rerank-documents
inputDefinitions:
parameters:
documents:
parameterType: LIST
query:
parameterType: STRING
reranker_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
isOptional: true
parameterType: STRING
top_k:
defaultValue: 3.0
isOptional: true
parameterType: NUMBER_INTEGER
outputDefinitions:
parameters:
documents:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
comp-retrieve-context:
executorLabel: exec-retrieve-context
inputDefinitions:
parameters:
collection_name:
defaultValue: knowledge_base
isOptional: true
parameterType: STRING
embedding:
parameterType: LIST
milvus_host:
defaultValue: milvus.ai-ml.svc.cluster.local
isOptional: true
parameterType: STRING
top_k:
defaultValue: 5.0
isOptional: true
parameterType: NUMBER_INTEGER
outputDefinitions:
parameters:
documents:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
comp-synthesize-speech:
executorLabel: exec-synthesize-speech
inputDefinitions:
parameters:
text:
parameterType: STRING
tts_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
audio_b64:
parameterType: STRING
latency_s:
parameterType: NUMBER_DOUBLE
comp-transcribe-audio:
executorLabel: exec-transcribe-audio
inputDefinitions:
parameters:
audio_b64:
parameterType: STRING
whisper_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
audio_duration_s:
parameterType: NUMBER_DOUBLE
latency_s:
parameterType: NUMBER_DOUBLE
text:
parameterType: STRING
deploymentSpec:
executors:
exec-generate-embeddings:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_embeddings
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_embeddings(\n text: str,\n embeddings_url: str\
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\
\n) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\"\
, float)]):\n \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n \
\ import time\n import httpx\n from collections import namedtuple\n\
\n start = time.perf_counter()\n with httpx.Client(timeout=60.0) as\
\ client:\n response = client.post(\n f\"{embeddings_url}/embeddings\"\
,\n json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
\ )\n result = response.json()\n latency = time.perf_counter()\
\ - start\n\n EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\"\
, \"latency_s\"])\n return EmbedResult(result[\"data\"][0][\"embedding\"\
], latency)\n\n"
image: python:3.13-slim
exec-generate-response:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_response
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_response(\n query: str,\n context: list,\n \
\ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
,\n model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
\n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
\ (\"completion_tokens\", int)]):\n \"\"\"Generate response using vLLM.\"\
\"\"\n import time\n import httpx\n from collections import namedtuple\n\
\n # Build context\n if context:\n context_text = \"\\n\\n\"\
.join([doc[\"text\"] for doc in context])\n user_content = f\"Context:\\\
n{context_text}\\n\\nQuestion: {query}\"\n else:\n user_content\
\ = query\n\n system_prompt = \"\"\"You are a helpful voice assistant.\n\
Answer questions based on the provided context when available.\nKeep responses\
\ concise and natural for speech synthesis.\"\"\"\n\n messages = [\n\
\ {\"role\": \"system\", \"content\": system_prompt},\n {\"\
role\": \"user\", \"content\": user_content}\n ]\n\n start = time.perf_counter()\n\
\ with httpx.Client(timeout=180.0) as client:\n response = client.post(\n\
\ f\"{vllm_url}/v1/chat/completions\",\n json={\n\
\ \"model\": model,\n \"messages\": messages,\n\
\ \"max_tokens\": 512,\n \"temperature\":\
\ 0.7\n }\n )\n result = response.json()\n latency\
\ = time.perf_counter() - start\n\n text = result[\"choices\"][0][\"\
message\"][\"content\"]\n usage = result.get(\"usage\", {})\n completion_tokens\
\ = usage.get(\"completion_tokens\", len(text.split()))\n\n LLMResult\
\ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
])\n return LLMResult(text, latency, completion_tokens)\n\n"
image: python:3.13-slim
exec-log-pipeline-metrics:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- log_pipeline_metrics
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef log_pipeline_metrics(\n stt_latency: float,\n stt_audio_duration:\
\ float,\n embed_latency: float,\n retrieve_latency: float,\n rerank_latency:\
\ float,\n llm_latency: float,\n llm_completion_tokens: int,\n \
\ tts_latency: float,\n experiment_name: str = \"voice-pipeline-metrics\"\
,\n run_name: str = \"voice-pipeline\",\n mlflow_tracking_uri: str\
\ = \"http://mlflow.mlflow.svc.cluster.local:80\",\n) -> str:\n \"\"\"\
Log per-step latency metrics to MLflow for the full voice pipeline.\"\"\"\
\n import os\n import mlflow\n from mlflow.tracking import MlflowClient\n\
\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
\n exp = client.get_experiment_by_name(experiment_name)\n experiment_id\
\ = (\n exp.experiment_id\n if exp\n else client.create_experiment(\n\
\ name=experiment_name,\n artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
,\n )\n )\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\n\
\ run_name=run_name,\n tags={\n \"pipeline.type\"\
: \"voice-assistant\",\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\"\
, \"unknown\"),\n },\n )\n\n total_latency = (\n stt_latency\
\ + embed_latency + retrieve_latency\n + rerank_latency + llm_latency\
\ + tts_latency\n )\n stt_rtf = stt_latency / stt_audio_duration if\
\ stt_audio_duration > 0 else 0\n llm_tps = llm_completion_tokens / llm_latency\
\ if llm_latency > 0 else 0\n\n mlflow.log_metrics({\n \"stt_latency_s\"\
: stt_latency,\n \"stt_audio_duration_s\": stt_audio_duration,\n\
\ \"stt_realtime_factor\": stt_rtf,\n \"embed_latency_s\"\
: embed_latency,\n \"retrieve_latency_s\": retrieve_latency,\n \
\ \"rerank_latency_s\": rerank_latency,\n \"llm_latency_s\"\
: llm_latency,\n \"llm_completion_tokens\": llm_completion_tokens,\n\
\ \"llm_tokens_per_second\": llm_tps,\n \"tts_latency_s\"\
: tts_latency,\n \"total_pipeline_latency_s\": total_latency,\n \
\ })\n mlflow.end_run()\n return run.info.run_id\n\n"
image: python:3.13-slim
exec-rerank-documents:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- rerank_documents
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef rerank_documents(\n query: str,\n documents: list,\n \
\ reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
,\n top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Rerank documents using BGE\
\ reranker.\"\"\"\n import time\n import httpx\n from collections\
\ import namedtuple\n\n if not documents:\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([],\
\ 0.0)\n\n start = time.perf_counter()\n with httpx.Client(timeout=60.0)\
\ as client:\n response = client.post(\n f\"{reranker_url}/v1/rerank\"\
,\n json={\n \"query\": query,\n \
\ \"documents\": [doc[\"text\"] for doc in documents],\n \
\ \"model\": \"bge-reranker-v2-m3\"\n }\n )\n \
\ result = response.json()\n latency = time.perf_counter() - start\n\
\n # Sort by rerank score\n reranked = sorted(\n zip(documents,\
\ result.get(\"scores\", [0] * len(documents))),\n key=lambda x:\
\ x[1],\n reverse=True\n )[:top_k]\n\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([doc\
\ for doc, score in reranked], latency)\n\n"
image: python:3.13-slim
exec-retrieve-context:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- retrieve_context
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
\ \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef retrieve_context(\n embedding: list,\n milvus_host: str\
\ = \"milvus.ai-ml.svc.cluster.local\",\n collection_name: str = \"knowledge_base\"\
,\n top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Retrieve relevant documents\
\ from Milvus vector database.\"\"\"\n import time\n from pymilvus\
\ import connections, Collection, utility\n from collections import namedtuple\n\
\n start = time.perf_counter()\n connections.connect(host=milvus_host,\
\ port=19530)\n\n if not utility.has_collection(collection_name):\n \
\ latency = time.perf_counter() - start\n RetrieveResult =\
\ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult([], latency)\n\n collection = Collection(collection_name)\n\
\ collection.load()\n\n results = collection.search(\n data=[embedding],\n\
\ anns_field=\"embedding\",\n param={\"metric_type\": \"COSINE\"\
, \"params\": {\"nprobe\": 10}},\n limit=top_k,\n output_fields=[\"\
text\", \"source\"]\n )\n latency = time.perf_counter() - start\n\n\
\ documents = []\n for hits in results:\n for hit in hits:\n\
\ documents.append({\n \"text\": hit.entity.get(\"\
text\"),\n \"source\": hit.entity.get(\"source\"),\n \
\ \"score\": hit.distance\n })\n\n RetrieveResult\
\ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult(documents, latency)\n\n"
image: python:3.13-slim
exec-synthesize-speech:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- synthesize_speech
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef synthesize_speech(\n text: str,\n tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
\n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
\ float)]):\n \"\"\"Convert text to speech using TTS service.\"\"\"\n\
\ import base64\n import time\n import httpx\n from collections\
\ import namedtuple\n\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
\ as client:\n response = client.post(\n f\"{tts_url}/v1/audio/speech\"\
,\n json={\n \"input\": text,\n \
\ \"voice\": \"en_US-lessac-high\",\n \"response_format\"\
: \"wav\"\n }\n )\n audio_b64 = base64.b64encode(response.content).decode(\"\
utf-8\")\n latency = time.perf_counter() - start\n\n TTSResult = namedtuple(\"\
TTSResult\", [\"audio_b64\", \"latency_s\"])\n return TTSResult(audio_b64,\
\ latency)\n\n"
image: python:3.13-slim
exec-transcribe-audio:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- transcribe_audio
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef transcribe_audio(\n audio_b64: str,\n whisper_url: str\
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper\"\
\n) -> NamedTuple(\"STTResult\", [(\"text\", str), (\"latency_s\", float),\
\ (\"audio_duration_s\", float)]):\n \"\"\"Transcribe audio using Whisper\
\ STT service.\"\"\"\n import base64\n import time\n import httpx\n\
\ from collections import namedtuple\n\n audio_bytes = base64.b64decode(audio_b64)\n\
\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
\ as client:\n response = client.post(\n f\"{whisper_url}/v1/audio/transcriptions\"\
,\n files={\"file\": (\"audio.wav\", audio_bytes, \"audio/wav\"\
)},\n data={\"model\": \"whisper-large-v3\", \"language\": \"\
en\"}\n )\n result = response.json()\n latency = time.perf_counter()\
\ - start\n\n text = result.get(\"text\", \"\")\n # Estimate audio\
\ duration from WAV header (16-bit PCM, 16kHz)\n audio_duration = max(len(audio_bytes)\
\ / (16000 * 2), 0.1)\n\n STTResult = namedtuple(\"STTResult\", [\"text\"\
, \"latency_s\", \"audio_duration_s\"])\n return STTResult(text, latency,\
\ audio_duration)\n\n"
image: python:3.13-slim
pipelineInfo:
description: 'End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus ->
Rerank -> LLM -> TTS. Logs per-step latency to MLflow.'
name: voice-assistant-rag-pipeline
root:
dag:
tasks:
generate-embeddings:
cachingOptions:
enableCache: true
componentRef:
name: comp-generate-embeddings
dependentTasks:
- transcribe-audio
inputs:
parameters:
text:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: generate-embeddings
generate-response:
cachingOptions:
enableCache: true
componentRef:
name: comp-generate-response
dependentTasks:
- rerank-documents
- transcribe-audio
inputs:
parameters:
context:
taskOutputParameter:
outputParameterKey: documents
producerTask: rerank-documents
query:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: generate-response
log-pipeline-metrics:
cachingOptions:
enableCache: true
componentRef:
name: comp-log-pipeline-metrics
dependentTasks:
- generate-embeddings
- generate-response
- rerank-documents
- retrieve-context
- synthesize-speech
- transcribe-audio
inputs:
parameters:
embed_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: generate-embeddings
llm_completion_tokens:
taskOutputParameter:
outputParameterKey: completion_tokens
producerTask: generate-response
llm_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: generate-response
rerank_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: rerank-documents
retrieve_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: retrieve-context
stt_audio_duration:
taskOutputParameter:
outputParameterKey: audio_duration_s
producerTask: transcribe-audio
stt_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: transcribe-audio
tts_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: synthesize-speech
taskInfo:
name: log-pipeline-metrics
rerank-documents:
cachingOptions:
enableCache: true
componentRef:
name: comp-rerank-documents
dependentTasks:
- retrieve-context
- transcribe-audio
inputs:
parameters:
documents:
taskOutputParameter:
outputParameterKey: documents
producerTask: retrieve-context
query:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: rerank-documents
retrieve-context:
cachingOptions:
enableCache: true
componentRef:
name: comp-retrieve-context
dependentTasks:
- generate-embeddings
inputs:
parameters:
collection_name:
componentInputParameter: collection_name
embedding:
taskOutputParameter:
outputParameterKey: embedding
producerTask: generate-embeddings
taskInfo:
name: retrieve-context
synthesize-speech:
cachingOptions:
enableCache: true
componentRef:
name: comp-synthesize-speech
dependentTasks:
- generate-response
inputs:
parameters:
text:
taskOutputParameter:
outputParameterKey: text
producerTask: generate-response
taskInfo:
name: synthesize-speech
transcribe-audio:
cachingOptions: {}
componentRef:
name: comp-transcribe-audio
inputs:
parameters:
audio_b64:
componentInputParameter: audio_b64
taskInfo:
name: transcribe-audio
inputDefinitions:
parameters:
audio_b64:
description: Base64-encoded audio file
parameterType: STRING
collection_name:
defaultValue: knowledge_base
description: Milvus collection for RAG
isOptional: true
parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1
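
A hypothetical submission sketch for the compiled pipeline (the KFP host is
an assumption about this cluster; the two arguments are the pipeline's
declared inputs):

    from kfp.client import Client

    client = Client(host="http://ml-pipeline-ui.kubeflow.svc.cluster.local:80")
    client.create_run_from_pipeline_package(
        "voice_pipeline.yaml",
        arguments={"audio_b64": "<base64-encoded WAV>",
                   "collection_name": "knowledge_base"},
        enable_caching=False,  # force a fresh run; only transcribe-audio disables caching itself
    )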