# Change notes (from commit message):
#   New:
#     - vllm_tuning_pipeline.py: A/B benchmark different vLLM configs; logs
#       latency/TPS/TTFT to MLflow (vllm-tuning experiment)
#     - vllm_tuning_pipeline.yaml: compiled KFP YAML
#   Updated:
#     - voice_pipeline.py: per-step NamedTuple outputs with latency tracking,
#       new log_pipeline_metrics MLflow component
#     - voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
# PIPELINE DEFINITION
# Name: voice-assistant-rag-pipeline
# Description: End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus
#              -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow.
# Inputs:
#    audio_b64: str
#    collection_name: str [Default: 'knowledge_base']
# Component interface declarations: typed inputs/outputs for each pipeline step.
components:
  comp-generate-embeddings:
    executorLabel: exec-generate-embeddings
    inputDefinitions:
      parameters:
        embeddings_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
          isOptional: true
          parameterType: STRING
        text:
          parameterType: STRING
    outputDefinitions:
      parameters:
        embedding:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-generate-response:
    executorLabel: exec-generate-response
    inputDefinitions:
      parameters:
        context:
          parameterType: LIST
        model:
          defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
          isOptional: true
          parameterType: STRING
        query:
          parameterType: STRING
        vllm_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        completion_tokens:
          parameterType: NUMBER_INTEGER
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
  comp-log-pipeline-metrics:
    executorLabel: exec-log-pipeline-metrics
    inputDefinitions:
      parameters:
        embed_latency:
          parameterType: NUMBER_DOUBLE
        experiment_name:
          defaultValue: voice-pipeline-metrics
          isOptional: true
          parameterType: STRING
        llm_completion_tokens:
          parameterType: NUMBER_INTEGER
        llm_latency:
          parameterType: NUMBER_DOUBLE
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        rerank_latency:
          parameterType: NUMBER_DOUBLE
        retrieve_latency:
          parameterType: NUMBER_DOUBLE
        run_name:
          defaultValue: voice-pipeline
          isOptional: true
          parameterType: STRING
        stt_audio_duration:
          parameterType: NUMBER_DOUBLE
        stt_latency:
          parameterType: NUMBER_DOUBLE
        tts_latency:
          parameterType: NUMBER_DOUBLE
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  comp-rerank-documents:
    executorLabel: exec-rerank-documents
    inputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        query:
          parameterType: STRING
        reranker_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-retrieve-context:
    executorLabel: exec-retrieve-context
    inputDefinitions:
      parameters:
        collection_name:
          defaultValue: knowledge_base
          isOptional: true
          parameterType: STRING
        embedding:
          parameterType: LIST
        milvus_host:
          defaultValue: milvus.ai-ml.svc.cluster.local
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 5.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-synthesize-speech:
    executorLabel: exec-synthesize-speech
    inputDefinitions:
      parameters:
        text:
          parameterType: STRING
        tts_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        audio_b64:
          parameterType: STRING
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-transcribe-audio:
    executorLabel: exec-transcribe-audio
    inputDefinitions:
      parameters:
        audio_b64:
          parameterType: STRING
        whisper_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        audio_duration_s:
          parameterType: NUMBER_DOUBLE
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
# Runtime definitions: each executor installs its dependencies at container
# start, then runs the serialized component function via KFP's
# ephemeral-component shim. Scripts are literal block scalars for readability.
deploymentSpec:
  executors:
    exec-generate-embeddings:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - generate_embeddings
        command:
        - sh
        - -c
        - |

          if ! [ -x "$(command -v pip)" ]; then
              python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)
          printf "%s" "$0" > "$program_path/ephemeral_component.py"
          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
        - |

          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def generate_embeddings(
              text: str,
              embeddings_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings"
          ) -> NamedTuple("EmbedResult", [("embedding", list), ("latency_s", float)]):
              """Generate embeddings for RAG retrieval."""
              import time
              import httpx
              from collections import namedtuple

              start = time.perf_counter()
              with httpx.Client(timeout=60.0) as client:
                  response = client.post(
                      f"{embeddings_url}/embeddings",
                      json={"input": text, "model": "bge-small-en-v1.5"}
                  )
                  result = response.json()
              latency = time.perf_counter() - start

              EmbedResult = namedtuple("EmbedResult", ["embedding", "latency_s"])
              return EmbedResult(result["data"][0]["embedding"], latency)
        image: python:3.13-slim
exec-generate-response:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- generate_response
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef generate_response(\n query: str,\n context: list,\n \
|
|
\ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
|
|
,\n model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
|
|
\n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
|
|
\ (\"completion_tokens\", int)]):\n \"\"\"Generate response using vLLM.\"\
|
|
\"\"\n import time\n import httpx\n from collections import namedtuple\n\
|
|
\n # Build context\n if context:\n context_text = \"\\n\\n\"\
|
|
.join([doc[\"text\"] for doc in context])\n user_content = f\"Context:\\\
|
|
n{context_text}\\n\\nQuestion: {query}\"\n else:\n user_content\
|
|
\ = query\n\n system_prompt = \"\"\"You are a helpful voice assistant.\n\
|
|
Answer questions based on the provided context when available.\nKeep responses\
|
|
\ concise and natural for speech synthesis.\"\"\"\n\n messages = [\n\
|
|
\ {\"role\": \"system\", \"content\": system_prompt},\n {\"\
|
|
role\": \"user\", \"content\": user_content}\n ]\n\n start = time.perf_counter()\n\
|
|
\ with httpx.Client(timeout=180.0) as client:\n response = client.post(\n\
|
|
\ f\"{vllm_url}/v1/chat/completions\",\n json={\n\
|
|
\ \"model\": model,\n \"messages\": messages,\n\
|
|
\ \"max_tokens\": 512,\n \"temperature\":\
|
|
\ 0.7\n }\n )\n result = response.json()\n latency\
|
|
\ = time.perf_counter() - start\n\n text = result[\"choices\"][0][\"\
|
|
message\"][\"content\"]\n usage = result.get(\"usage\", {})\n completion_tokens\
|
|
\ = usage.get(\"completion_tokens\", len(text.split()))\n\n LLMResult\
|
|
\ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
|
|
])\n return LLMResult(text, latency, completion_tokens)\n\n"
|
|
image: python:3.13-slim
|
|
exec-log-pipeline-metrics:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- log_pipeline_metrics
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
|
|
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef log_pipeline_metrics(\n stt_latency: float,\n stt_audio_duration:\
|
|
\ float,\n embed_latency: float,\n retrieve_latency: float,\n rerank_latency:\
|
|
\ float,\n llm_latency: float,\n llm_completion_tokens: int,\n \
|
|
\ tts_latency: float,\n experiment_name: str = \"voice-pipeline-metrics\"\
|
|
,\n run_name: str = \"voice-pipeline\",\n mlflow_tracking_uri: str\
|
|
\ = \"http://mlflow.mlflow.svc.cluster.local:80\",\n) -> str:\n \"\"\"\
|
|
Log per-step latency metrics to MLflow for the full voice pipeline.\"\"\"\
|
|
\n import os\n import mlflow\n from mlflow.tracking import MlflowClient\n\
|
|
\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
|
|
\n exp = client.get_experiment_by_name(experiment_name)\n experiment_id\
|
|
\ = (\n exp.experiment_id\n if exp\n else client.create_experiment(\n\
|
|
\ name=experiment_name,\n artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
|
|
,\n )\n )\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\n\
|
|
\ run_name=run_name,\n tags={\n \"pipeline.type\"\
|
|
: \"voice-assistant\",\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\"\
|
|
, \"unknown\"),\n },\n )\n\n total_latency = (\n stt_latency\
|
|
\ + embed_latency + retrieve_latency\n + rerank_latency + llm_latency\
|
|
\ + tts_latency\n )\n stt_rtf = stt_latency / stt_audio_duration if\
|
|
\ stt_audio_duration > 0 else 0\n llm_tps = llm_completion_tokens / llm_latency\
|
|
\ if llm_latency > 0 else 0\n\n mlflow.log_metrics({\n \"stt_latency_s\"\
|
|
: stt_latency,\n \"stt_audio_duration_s\": stt_audio_duration,\n\
|
|
\ \"stt_realtime_factor\": stt_rtf,\n \"embed_latency_s\"\
|
|
: embed_latency,\n \"retrieve_latency_s\": retrieve_latency,\n \
|
|
\ \"rerank_latency_s\": rerank_latency,\n \"llm_latency_s\"\
|
|
: llm_latency,\n \"llm_completion_tokens\": llm_completion_tokens,\n\
|
|
\ \"llm_tokens_per_second\": llm_tps,\n \"tts_latency_s\"\
|
|
: tts_latency,\n \"total_pipeline_latency_s\": total_latency,\n \
|
|
\ })\n mlflow.end_run()\n return run.info.run_id\n\n"
|
|
image: python:3.13-slim
|
|
exec-rerank-documents:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- rerank_documents
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef rerank_documents(\n query: str,\n documents: list,\n \
|
|
\ reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
|
|
,\n top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
|
|
, list), (\"latency_s\", float)]):\n \"\"\"Rerank documents using BGE\
|
|
\ reranker.\"\"\"\n import time\n import httpx\n from collections\
|
|
\ import namedtuple\n\n if not documents:\n RerankResult = namedtuple(\"\
|
|
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([],\
|
|
\ 0.0)\n\n start = time.perf_counter()\n with httpx.Client(timeout=60.0)\
|
|
\ as client:\n response = client.post(\n f\"{reranker_url}/v1/rerank\"\
|
|
,\n json={\n \"query\": query,\n \
|
|
\ \"documents\": [doc[\"text\"] for doc in documents],\n \
|
|
\ \"model\": \"bge-reranker-v2-m3\"\n }\n )\n \
|
|
\ result = response.json()\n latency = time.perf_counter() - start\n\
|
|
\n # Sort by rerank score\n reranked = sorted(\n zip(documents,\
|
|
\ result.get(\"scores\", [0] * len(documents))),\n key=lambda x:\
|
|
\ x[1],\n reverse=True\n )[:top_k]\n\n RerankResult = namedtuple(\"\
|
|
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([doc\
|
|
\ for doc, score in reranked], latency)\n\n"
|
|
image: python:3.13-slim
|
|
exec-retrieve-context:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- retrieve_context
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
|
|
\ \"$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef retrieve_context(\n embedding: list,\n milvus_host: str\
|
|
\ = \"milvus.ai-ml.svc.cluster.local\",\n collection_name: str = \"knowledge_base\"\
|
|
,\n top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
|
|
, list), (\"latency_s\", float)]):\n \"\"\"Retrieve relevant documents\
|
|
\ from Milvus vector database.\"\"\"\n import time\n from pymilvus\
|
|
\ import connections, Collection, utility\n from collections import namedtuple\n\
|
|
\n start = time.perf_counter()\n connections.connect(host=milvus_host,\
|
|
\ port=19530)\n\n if not utility.has_collection(collection_name):\n \
|
|
\ latency = time.perf_counter() - start\n RetrieveResult =\
|
|
\ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
|
|
\ return RetrieveResult([], latency)\n\n collection = Collection(collection_name)\n\
|
|
\ collection.load()\n\n results = collection.search(\n data=[embedding],\n\
|
|
\ anns_field=\"embedding\",\n param={\"metric_type\": \"COSINE\"\
|
|
, \"params\": {\"nprobe\": 10}},\n limit=top_k,\n output_fields=[\"\
|
|
text\", \"source\"]\n )\n latency = time.perf_counter() - start\n\n\
|
|
\ documents = []\n for hits in results:\n for hit in hits:\n\
|
|
\ documents.append({\n \"text\": hit.entity.get(\"\
|
|
text\"),\n \"source\": hit.entity.get(\"source\"),\n \
|
|
\ \"score\": hit.distance\n })\n\n RetrieveResult\
|
|
\ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
|
|
\ return RetrieveResult(documents, latency)\n\n"
|
|
image: python:3.13-slim
|
|
exec-synthesize-speech:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- synthesize_speech
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef synthesize_speech(\n text: str,\n tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
|
|
\n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
|
|
\ float)]):\n \"\"\"Convert text to speech using TTS service.\"\"\"\n\
|
|
\ import base64\n import time\n import httpx\n from collections\
|
|
\ import namedtuple\n\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
|
|
\ as client:\n response = client.post(\n f\"{tts_url}/v1/audio/speech\"\
|
|
,\n json={\n \"input\": text,\n \
|
|
\ \"voice\": \"en_US-lessac-high\",\n \"response_format\"\
|
|
: \"wav\"\n }\n )\n audio_b64 = base64.b64encode(response.content).decode(\"\
|
|
utf-8\")\n latency = time.perf_counter() - start\n\n TTSResult = namedtuple(\"\
|
|
TTSResult\", [\"audio_b64\", \"latency_s\"])\n return TTSResult(audio_b64,\
|
|
\ latency)\n\n"
|
|
image: python:3.13-slim
|
|
exec-transcribe-audio:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- transcribe_audio
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef transcribe_audio(\n audio_b64: str,\n whisper_url: str\
|
|
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper\"\
|
|
\n) -> NamedTuple(\"STTResult\", [(\"text\", str), (\"latency_s\", float),\
|
|
\ (\"audio_duration_s\", float)]):\n \"\"\"Transcribe audio using Whisper\
|
|
\ STT service.\"\"\"\n import base64\n import time\n import httpx\n\
|
|
\ from collections import namedtuple\n\n audio_bytes = base64.b64decode(audio_b64)\n\
|
|
\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
|
|
\ as client:\n response = client.post(\n f\"{whisper_url}/v1/audio/transcriptions\"\
|
|
,\n files={\"file\": (\"audio.wav\", audio_bytes, \"audio/wav\"\
|
|
)},\n data={\"model\": \"whisper-large-v3\", \"language\": \"\
|
|
en\"}\n )\n result = response.json()\n latency = time.perf_counter()\
|
|
\ - start\n\n text = result.get(\"text\", \"\")\n # Estimate audio\
|
|
\ duration from WAV header (16-bit PCM, 16kHz)\n audio_duration = max(len(audio_bytes)\
|
|
\ / (16000 * 2), 0.1)\n\n STTResult = namedtuple(\"STTResult\", [\"text\"\
|
|
, \"latency_s\", \"audio_duration_s\"])\n return STTResult(text, latency,\
|
|
\ audio_duration)\n\n"
|
|
image: python:3.13-slim
|
|
pipelineInfo:
  description: 'End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus ->
    Rerank -> LLM -> TTS. Logs per-step latency to MLflow.'
  name: voice-assistant-rag-pipeline
# Pipeline DAG: task wiring (dependencies + parameter routing) and the
# pipeline-level input definitions.
root:
  dag:
    tasks:
      generate-embeddings:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-embeddings
        dependentTasks:
        - transcribe-audio
        inputs:
          parameters:
            text:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: generate-embeddings
      generate-response:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-response
        dependentTasks:
        - rerank-documents
        - transcribe-audio
        inputs:
          parameters:
            context:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: rerank-documents
            query:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: generate-response
      log-pipeline-metrics:
        # NOTE(review): caching disabled — this step's purpose is its MLflow
        # write side effect; a cache hit would silently skip logging. Mirrors
        # the already-uncached transcribe-audio task.
        cachingOptions: {}
        componentRef:
          name: comp-log-pipeline-metrics
        dependentTasks:
        - generate-embeddings
        - generate-response
        - rerank-documents
        - retrieve-context
        - synthesize-speech
        - transcribe-audio
        inputs:
          parameters:
            embed_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: generate-embeddings
            llm_completion_tokens:
              taskOutputParameter:
                outputParameterKey: completion_tokens
                producerTask: generate-response
            llm_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: generate-response
            rerank_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: rerank-documents
            retrieve_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: retrieve-context
            stt_audio_duration:
              taskOutputParameter:
                outputParameterKey: audio_duration_s
                producerTask: transcribe-audio
            stt_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: transcribe-audio
            tts_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: synthesize-speech
        taskInfo:
          name: log-pipeline-metrics
      rerank-documents:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-rerank-documents
        dependentTasks:
        - retrieve-context
        - transcribe-audio
        inputs:
          parameters:
            documents:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: retrieve-context
            query:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: rerank-documents
      retrieve-context:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-retrieve-context
        dependentTasks:
        - generate-embeddings
        inputs:
          parameters:
            collection_name:
              componentInputParameter: collection_name
            embedding:
              taskOutputParameter:
                outputParameterKey: embedding
                producerTask: generate-embeddings
        taskInfo:
          name: retrieve-context
      synthesize-speech:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-synthesize-speech
        dependentTasks:
        - generate-response
        inputs:
          parameters:
            text:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: generate-response
        taskInfo:
          name: synthesize-speech
      transcribe-audio:
        cachingOptions: {}
        componentRef:
          name: comp-transcribe-audio
        inputs:
          parameters:
            audio_b64:
              componentInputParameter: audio_b64
        taskInfo:
          name: transcribe-audio
  inputDefinitions:
    parameters:
      audio_b64:
        description: Base64-encoded audio file
        parameterType: STRING
      collection_name:
        defaultValue: knowledge_base
        description: Milvus collection for RAG
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1