feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow
New: - vllm_tuning_pipeline.py: A/B benchmark different vLLM configs, logs latency/TPS/TTFT to MLflow (vllm-tuning experiment) - vllm_tuning_pipeline.yaml: compiled KFP YAML Updated: - voice_pipeline.py: per-step NamedTuple outputs with latency tracking, new log_pipeline_metrics MLflow component - voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
This commit is contained in:
656
voice_pipeline.yaml
Normal file
656
voice_pipeline.yaml
Normal file
@@ -0,0 +1,656 @@
|
||||
# PIPELINE DEFINITION
|
||||
# Name: voice-assistant-rag-pipeline
|
||||
# Description: End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow.
|
||||
# Inputs:
|
||||
# audio_b64: str
|
||||
# collection_name: str [Default: 'knowledge_base']
|
||||
components:
|
||||
comp-generate-embeddings:
|
||||
executorLabel: exec-generate-embeddings
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
embeddings_url:
|
||||
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
text:
|
||||
parameterType: STRING
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
embedding:
|
||||
parameterType: LIST
|
||||
latency_s:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
comp-generate-response:
|
||||
executorLabel: exec-generate-response
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
context:
|
||||
parameterType: LIST
|
||||
model:
|
||||
defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
query:
|
||||
parameterType: STRING
|
||||
vllm_url:
|
||||
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
completion_tokens:
|
||||
parameterType: NUMBER_INTEGER
|
||||
latency_s:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
text:
|
||||
parameterType: STRING
|
||||
comp-log-pipeline-metrics:
|
||||
executorLabel: exec-log-pipeline-metrics
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
embed_latency:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
experiment_name:
|
||||
defaultValue: voice-pipeline-metrics
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
llm_completion_tokens:
|
||||
parameterType: NUMBER_INTEGER
|
||||
llm_latency:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
mlflow_tracking_uri:
|
||||
defaultValue: http://mlflow.mlflow.svc.cluster.local:80
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
rerank_latency:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
retrieve_latency:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
run_name:
|
||||
defaultValue: voice-pipeline
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
stt_audio_duration:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
stt_latency:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
tts_latency:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
Output:
|
||||
parameterType: STRING
|
||||
comp-rerank-documents:
|
||||
executorLabel: exec-rerank-documents
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
documents:
|
||||
parameterType: LIST
|
||||
query:
|
||||
parameterType: STRING
|
||||
reranker_url:
|
||||
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
top_k:
|
||||
defaultValue: 3.0
|
||||
isOptional: true
|
||||
parameterType: NUMBER_INTEGER
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
documents:
|
||||
parameterType: LIST
|
||||
latency_s:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
comp-retrieve-context:
|
||||
executorLabel: exec-retrieve-context
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
collection_name:
|
||||
defaultValue: knowledge_base
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
embedding:
|
||||
parameterType: LIST
|
||||
milvus_host:
|
||||
defaultValue: milvus.ai-ml.svc.cluster.local
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
top_k:
|
||||
defaultValue: 5.0
|
||||
isOptional: true
|
||||
parameterType: NUMBER_INTEGER
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
documents:
|
||||
parameterType: LIST
|
||||
latency_s:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
comp-synthesize-speech:
|
||||
executorLabel: exec-synthesize-speech
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
text:
|
||||
parameterType: STRING
|
||||
tts_url:
|
||||
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
audio_b64:
|
||||
parameterType: STRING
|
||||
latency_s:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
comp-transcribe-audio:
|
||||
executorLabel: exec-transcribe-audio
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
audio_b64:
|
||||
parameterType: STRING
|
||||
whisper_url:
|
||||
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
audio_duration_s:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
latency_s:
|
||||
parameterType: NUMBER_DOUBLE
|
||||
text:
|
||||
parameterType: STRING
|
||||
deploymentSpec:
|
||||
executors:
|
||||
exec-generate-embeddings:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- generate_embeddings
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
||||
$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef generate_embeddings(\n text: str,\n embeddings_url: str\
|
||||
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\
|
||||
\n) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\"\
|
||||
, float)]):\n \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n \
|
||||
\ import time\n import httpx\n from collections import namedtuple\n\
|
||||
\n start = time.perf_counter()\n with httpx.Client(timeout=60.0) as\
|
||||
\ client:\n response = client.post(\n f\"{embeddings_url}/embeddings\"\
|
||||
,\n json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
|
||||
\ )\n result = response.json()\n latency = time.perf_counter()\
|
||||
\ - start\n\n EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\"\
|
||||
, \"latency_s\"])\n return EmbedResult(result[\"data\"][0][\"embedding\"\
|
||||
], latency)\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-generate-response:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- generate_response
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
||||
$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef generate_response(\n query: str,\n context: list,\n \
|
||||
\ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
|
||||
,\n model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
|
||||
\n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
|
||||
\ (\"completion_tokens\", int)]):\n \"\"\"Generate response using vLLM.\"\
|
||||
\"\"\n import time\n import httpx\n from collections import namedtuple\n\
|
||||
\n # Build context\n if context:\n context_text = \"\\n\\n\"\
|
||||
.join([doc[\"text\"] for doc in context])\n user_content = f\"Context:\\\
|
||||
n{context_text}\\n\\nQuestion: {query}\"\n else:\n user_content\
|
||||
\ = query\n\n system_prompt = \"\"\"You are a helpful voice assistant.\n\
|
||||
Answer questions based on the provided context when available.\nKeep responses\
|
||||
\ concise and natural for speech synthesis.\"\"\"\n\n messages = [\n\
|
||||
\ {\"role\": \"system\", \"content\": system_prompt},\n {\"\
|
||||
role\": \"user\", \"content\": user_content}\n ]\n\n start = time.perf_counter()\n\
|
||||
\ with httpx.Client(timeout=180.0) as client:\n response = client.post(\n\
|
||||
\ f\"{vllm_url}/v1/chat/completions\",\n json={\n\
|
||||
\ \"model\": model,\n \"messages\": messages,\n\
|
||||
\ \"max_tokens\": 512,\n \"temperature\":\
|
||||
\ 0.7\n }\n )\n result = response.json()\n latency\
|
||||
\ = time.perf_counter() - start\n\n text = result[\"choices\"][0][\"\
|
||||
message\"][\"content\"]\n usage = result.get(\"usage\", {})\n completion_tokens\
|
||||
\ = usage.get(\"completion_tokens\", len(text.split()))\n\n LLMResult\
|
||||
\ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
|
||||
])\n return LLMResult(text, latency, completion_tokens)\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-log-pipeline-metrics:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- log_pipeline_metrics
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
|
||||
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef log_pipeline_metrics(\n stt_latency: float,\n stt_audio_duration:\
|
||||
\ float,\n embed_latency: float,\n retrieve_latency: float,\n rerank_latency:\
|
||||
\ float,\n llm_latency: float,\n llm_completion_tokens: int,\n \
|
||||
\ tts_latency: float,\n experiment_name: str = \"voice-pipeline-metrics\"\
|
||||
,\n run_name: str = \"voice-pipeline\",\n mlflow_tracking_uri: str\
|
||||
\ = \"http://mlflow.mlflow.svc.cluster.local:80\",\n) -> str:\n \"\"\"\
|
||||
Log per-step latency metrics to MLflow for the full voice pipeline.\"\"\"\
|
||||
\n import os\n import mlflow\n from mlflow.tracking import MlflowClient\n\
|
||||
\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
|
||||
\n exp = client.get_experiment_by_name(experiment_name)\n experiment_id\
|
||||
\ = (\n exp.experiment_id\n if exp\n else client.create_experiment(\n\
|
||||
\ name=experiment_name,\n artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
|
||||
,\n )\n )\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\n\
|
||||
\ run_name=run_name,\n tags={\n \"pipeline.type\"\
|
||||
: \"voice-assistant\",\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\"\
|
||||
, \"unknown\"),\n },\n )\n\n total_latency = (\n stt_latency\
|
||||
\ + embed_latency + retrieve_latency\n + rerank_latency + llm_latency\
|
||||
\ + tts_latency\n )\n stt_rtf = stt_latency / stt_audio_duration if\
|
||||
\ stt_audio_duration > 0 else 0\n llm_tps = llm_completion_tokens / llm_latency\
|
||||
\ if llm_latency > 0 else 0\n\n mlflow.log_metrics({\n \"stt_latency_s\"\
|
||||
: stt_latency,\n \"stt_audio_duration_s\": stt_audio_duration,\n\
|
||||
\ \"stt_realtime_factor\": stt_rtf,\n \"embed_latency_s\"\
|
||||
: embed_latency,\n \"retrieve_latency_s\": retrieve_latency,\n \
|
||||
\ \"rerank_latency_s\": rerank_latency,\n \"llm_latency_s\"\
|
||||
: llm_latency,\n \"llm_completion_tokens\": llm_completion_tokens,\n\
|
||||
\ \"llm_tokens_per_second\": llm_tps,\n \"tts_latency_s\"\
|
||||
: tts_latency,\n \"total_pipeline_latency_s\": total_latency,\n \
|
||||
\ })\n mlflow.end_run()\n return run.info.run_id\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-rerank-documents:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- rerank_documents
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
||||
$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef rerank_documents(\n query: str,\n documents: list,\n \
|
||||
\ reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
|
||||
,\n top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
|
||||
, list), (\"latency_s\", float)]):\n \"\"\"Rerank documents using BGE\
|
||||
\ reranker.\"\"\"\n import time\n import httpx\n from collections\
|
||||
\ import namedtuple\n\n if not documents:\n RerankResult = namedtuple(\"\
|
||||
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([],\
|
||||
\ 0.0)\n\n start = time.perf_counter()\n with httpx.Client(timeout=60.0)\
|
||||
\ as client:\n response = client.post(\n f\"{reranker_url}/v1/rerank\"\
|
||||
,\n json={\n \"query\": query,\n \
|
||||
\ \"documents\": [doc[\"text\"] for doc in documents],\n \
|
||||
\ \"model\": \"bge-reranker-v2-m3\"\n }\n )\n \
|
||||
\ result = response.json()\n latency = time.perf_counter() - start\n\
|
||||
\n # Sort by rerank score\n reranked = sorted(\n zip(documents,\
|
||||
\ result.get(\"scores\", [0] * len(documents))),\n key=lambda x:\
|
||||
\ x[1],\n reverse=True\n )[:top_k]\n\n RerankResult = namedtuple(\"\
|
||||
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([doc\
|
||||
\ for doc, score in reranked], latency)\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-retrieve-context:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- retrieve_context
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
|
||||
\ \"$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef retrieve_context(\n embedding: list,\n milvus_host: str\
|
||||
\ = \"milvus.ai-ml.svc.cluster.local\",\n collection_name: str = \"knowledge_base\"\
|
||||
,\n top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
|
||||
, list), (\"latency_s\", float)]):\n \"\"\"Retrieve relevant documents\
|
||||
\ from Milvus vector database.\"\"\"\n import time\n from pymilvus\
|
||||
\ import connections, Collection, utility\n from collections import namedtuple\n\
|
||||
\n start = time.perf_counter()\n connections.connect(host=milvus_host,\
|
||||
\ port=19530)\n\n if not utility.has_collection(collection_name):\n \
|
||||
\ latency = time.perf_counter() - start\n RetrieveResult =\
|
||||
\ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
|
||||
\ return RetrieveResult([], latency)\n\n collection = Collection(collection_name)\n\
|
||||
\ collection.load()\n\n results = collection.search(\n data=[embedding],\n\
|
||||
\ anns_field=\"embedding\",\n param={\"metric_type\": \"COSINE\"\
|
||||
, \"params\": {\"nprobe\": 10}},\n limit=top_k,\n output_fields=[\"\
|
||||
text\", \"source\"]\n )\n latency = time.perf_counter() - start\n\n\
|
||||
\ documents = []\n for hits in results:\n for hit in hits:\n\
|
||||
\ documents.append({\n \"text\": hit.entity.get(\"\
|
||||
text\"),\n \"source\": hit.entity.get(\"source\"),\n \
|
||||
\ \"score\": hit.distance\n })\n\n RetrieveResult\
|
||||
\ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
|
||||
\ return RetrieveResult(documents, latency)\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-synthesize-speech:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- synthesize_speech
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
||||
$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef synthesize_speech(\n text: str,\n tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
|
||||
\n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
|
||||
\ float)]):\n \"\"\"Convert text to speech using TTS service.\"\"\"\n\
|
||||
\ import base64\n import time\n import httpx\n from collections\
|
||||
\ import namedtuple\n\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
|
||||
\ as client:\n response = client.post(\n f\"{tts_url}/v1/audio/speech\"\
|
||||
,\n json={\n \"input\": text,\n \
|
||||
\ \"voice\": \"en_US-lessac-high\",\n \"response_format\"\
|
||||
: \"wav\"\n }\n )\n audio_b64 = base64.b64encode(response.content).decode(\"\
|
||||
utf-8\")\n latency = time.perf_counter() - start\n\n TTSResult = namedtuple(\"\
|
||||
TTSResult\", [\"audio_b64\", \"latency_s\"])\n return TTSResult(audio_b64,\
|
||||
\ latency)\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-transcribe-audio:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- transcribe_audio
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
||||
$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef transcribe_audio(\n audio_b64: str,\n whisper_url: str\
|
||||
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper\"\
|
||||
\n) -> NamedTuple(\"STTResult\", [(\"text\", str), (\"latency_s\", float),\
|
||||
\ (\"audio_duration_s\", float)]):\n \"\"\"Transcribe audio using Whisper\
|
||||
\ STT service.\"\"\"\n import base64\n import time\n import httpx\n\
|
||||
\ from collections import namedtuple\n\n audio_bytes = base64.b64decode(audio_b64)\n\
|
||||
\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
|
||||
\ as client:\n response = client.post(\n f\"{whisper_url}/v1/audio/transcriptions\"\
|
||||
,\n files={\"file\": (\"audio.wav\", audio_bytes, \"audio/wav\"\
|
||||
)},\n data={\"model\": \"whisper-large-v3\", \"language\": \"\
|
||||
en\"}\n )\n result = response.json()\n latency = time.perf_counter()\
|
||||
\ - start\n\n text = result.get(\"text\", \"\")\n # Estimate audio\
|
||||
\ duration from WAV header (16-bit PCM, 16kHz)\n audio_duration = max(len(audio_bytes)\
|
||||
\ / (16000 * 2), 0.1)\n\n STTResult = namedtuple(\"STTResult\", [\"text\"\
|
||||
, \"latency_s\", \"audio_duration_s\"])\n return STTResult(text, latency,\
|
||||
\ audio_duration)\n\n"
|
||||
image: python:3.13-slim
|
||||
pipelineInfo:
|
||||
description: 'End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus ->
|
||||
Rerank -> LLM -> TTS. Logs per-step latency to MLflow.'
|
||||
name: voice-assistant-rag-pipeline
|
||||
root:
|
||||
dag:
|
||||
tasks:
|
||||
generate-embeddings:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-generate-embeddings
|
||||
dependentTasks:
|
||||
- transcribe-audio
|
||||
inputs:
|
||||
parameters:
|
||||
text:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: text
|
||||
producerTask: transcribe-audio
|
||||
taskInfo:
|
||||
name: generate-embeddings
|
||||
generate-response:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-generate-response
|
||||
dependentTasks:
|
||||
- rerank-documents
|
||||
- transcribe-audio
|
||||
inputs:
|
||||
parameters:
|
||||
context:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: documents
|
||||
producerTask: rerank-documents
|
||||
query:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: text
|
||||
producerTask: transcribe-audio
|
||||
taskInfo:
|
||||
name: generate-response
|
||||
log-pipeline-metrics:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-log-pipeline-metrics
|
||||
dependentTasks:
|
||||
- generate-embeddings
|
||||
- generate-response
|
||||
- rerank-documents
|
||||
- retrieve-context
|
||||
- synthesize-speech
|
||||
- transcribe-audio
|
||||
inputs:
|
||||
parameters:
|
||||
embed_latency:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: latency_s
|
||||
producerTask: generate-embeddings
|
||||
llm_completion_tokens:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: completion_tokens
|
||||
producerTask: generate-response
|
||||
llm_latency:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: latency_s
|
||||
producerTask: generate-response
|
||||
rerank_latency:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: latency_s
|
||||
producerTask: rerank-documents
|
||||
retrieve_latency:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: latency_s
|
||||
producerTask: retrieve-context
|
||||
stt_audio_duration:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: audio_duration_s
|
||||
producerTask: transcribe-audio
|
||||
stt_latency:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: latency_s
|
||||
producerTask: transcribe-audio
|
||||
tts_latency:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: latency_s
|
||||
producerTask: synthesize-speech
|
||||
taskInfo:
|
||||
name: log-pipeline-metrics
|
||||
rerank-documents:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-rerank-documents
|
||||
dependentTasks:
|
||||
- retrieve-context
|
||||
- transcribe-audio
|
||||
inputs:
|
||||
parameters:
|
||||
documents:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: documents
|
||||
producerTask: retrieve-context
|
||||
query:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: text
|
||||
producerTask: transcribe-audio
|
||||
taskInfo:
|
||||
name: rerank-documents
|
||||
retrieve-context:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-retrieve-context
|
||||
dependentTasks:
|
||||
- generate-embeddings
|
||||
inputs:
|
||||
parameters:
|
||||
collection_name:
|
||||
componentInputParameter: collection_name
|
||||
embedding:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: embedding
|
||||
producerTask: generate-embeddings
|
||||
taskInfo:
|
||||
name: retrieve-context
|
||||
synthesize-speech:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-synthesize-speech
|
||||
dependentTasks:
|
||||
- generate-response
|
||||
inputs:
|
||||
parameters:
|
||||
text:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: text
|
||||
producerTask: generate-response
|
||||
taskInfo:
|
||||
name: synthesize-speech
|
||||
transcribe-audio:
|
||||
cachingOptions: {}
|
||||
componentRef:
|
||||
name: comp-transcribe-audio
|
||||
inputs:
|
||||
parameters:
|
||||
audio_b64:
|
||||
componentInputParameter: audio_b64
|
||||
taskInfo:
|
||||
name: transcribe-audio
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
audio_b64:
|
||||
description: Base64-encoded audio file
|
||||
parameterType: STRING
|
||||
collection_name:
|
||||
defaultValue: knowledge_base
|
||||
description: Milvus collection for RAG
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
schemaVersion: 2.1.0
|
||||
sdkVersion: kfp-2.12.1
|
||||
Reference in New Issue
Block a user