feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow

New:
- vllm_tuning_pipeline.py: A/B benchmark different vLLM configs,
  logs latency/TPS/TTFT to MLflow (vllm-tuning experiment)
- vllm_tuning_pipeline.yaml: compiled KFP YAML

Updated:
- voice_pipeline.py: per-step NamedTuple outputs with latency tracking,
  new log_pipeline_metrics MLflow component
- voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
This commit is contained in:
2026-02-13 08:24:11 -05:00
parent cee21f124c
commit bc4b230dd9
6 changed files with 2216 additions and 26 deletions

363
rag_pipeline.yaml Normal file
View File

@@ -0,0 +1,363 @@
# PIPELINE DEFINITION
# Name: rag-query-pipeline
# Description: RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM. Logs per-step latency to MLflow.
# Inputs:
#    collection_name: str [Default: 'knowledge_base']
#    query: str
#
# NOTE(review): compiled KFP IR (kfp==2.12.1), generated from the RAG pipeline
# Python source. Do not hand-edit; change the Python source and recompile.
components:
  # Step 1 -- embed the query text. Outputs: embedding vector + step latency.
  comp-generate-embeddings:
    executorLabel: exec-generate-embeddings
    inputDefinitions:
      parameters:
        embeddings_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
          isOptional: true
          parameterType: STRING
        text:
          parameterType: STRING
    outputDefinitions:
      parameters:
        embedding:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  # Step 4 -- final LLM answer over the reranked context.
  comp-generate-response:
    executorLabel: exec-generate-response
    inputDefinitions:
      parameters:
        context:
          parameterType: LIST
        model:
          defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
          isOptional: true
          parameterType: STRING
        query:
          parameterType: STRING
        vllm_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        completion_tokens:
          parameterType: NUMBER_INTEGER
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
  # Step 3 -- rerank retrieved docs, keep top_k by reranker score.
  comp-rerank-documents:
    executorLabel: exec-rerank-documents
    inputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        query:
          parameterType: STRING
        reranker_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  # Step 2 -- vector search in Milvus using the query embedding.
  comp-retrieve-context:
    executorLabel: exec-retrieve-context
    inputDefinitions:
      parameters:
        collection_name:
          defaultValue: knowledge_base
          isOptional: true
          parameterType: STRING
        embedding:
          parameterType: LIST
        milvus_host:
          defaultValue: milvus.ai-ml.svc.cluster.local
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 5.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
deploymentSpec:
executors:
exec-generate-embeddings:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_embeddings
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_embeddings(\n text: str,\n embeddings_url: str\
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\
\n) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\"\
, float)]):\n \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n \
\ import time\n import httpx\n from collections import namedtuple\n\
\n start = time.perf_counter()\n with httpx.Client(timeout=60.0) as\
\ client:\n response = client.post(\n f\"{embeddings_url}/embeddings\"\
,\n json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
\ )\n result = response.json()\n latency = time.perf_counter()\
\ - start\n\n EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\"\
, \"latency_s\"])\n return EmbedResult(result[\"data\"][0][\"embedding\"\
], latency)\n\n"
image: python:3.13-slim
exec-generate-response:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_response
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_response(\n query: str,\n context: list,\n \
\ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
,\n model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
\n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
\ (\"completion_tokens\", int)]):\n \"\"\"Generate response using vLLM.\"\
\"\"\n import time\n import httpx\n from collections import namedtuple\n\
\n # Build context\n if context:\n context_text = \"\\n\\n\"\
.join([doc[\"text\"] for doc in context])\n user_content = f\"Context:\\\
n{context_text}\\n\\nQuestion: {query}\"\n else:\n user_content\
\ = query\n\n system_prompt = \"\"\"You are a helpful voice assistant.\n\
Answer questions based on the provided context when available.\nKeep responses\
\ concise and natural for speech synthesis.\"\"\"\n\n messages = [\n\
\ {\"role\": \"system\", \"content\": system_prompt},\n {\"\
role\": \"user\", \"content\": user_content}\n ]\n\n start = time.perf_counter()\n\
\ with httpx.Client(timeout=180.0) as client:\n response = client.post(\n\
\ f\"{vllm_url}/v1/chat/completions\",\n json={\n\
\ \"model\": model,\n \"messages\": messages,\n\
\ \"max_tokens\": 512,\n \"temperature\":\
\ 0.7\n }\n )\n result = response.json()\n latency\
\ = time.perf_counter() - start\n\n text = result[\"choices\"][0][\"\
message\"][\"content\"]\n usage = result.get(\"usage\", {})\n completion_tokens\
\ = usage.get(\"completion_tokens\", len(text.split()))\n\n LLMResult\
\ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
])\n return LLMResult(text, latency, completion_tokens)\n\n"
image: python:3.13-slim
exec-rerank-documents:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- rerank_documents
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef rerank_documents(\n query: str,\n documents: list,\n \
\ reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
,\n top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Rerank documents using BGE\
\ reranker.\"\"\"\n import time\n import httpx\n from collections\
\ import namedtuple\n\n if not documents:\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([],\
\ 0.0)\n\n start = time.perf_counter()\n with httpx.Client(timeout=60.0)\
\ as client:\n response = client.post(\n f\"{reranker_url}/v1/rerank\"\
,\n json={\n \"query\": query,\n \
\ \"documents\": [doc[\"text\"] for doc in documents],\n \
\ \"model\": \"bge-reranker-v2-m3\"\n }\n )\n \
\ result = response.json()\n latency = time.perf_counter() - start\n\
\n # Sort by rerank score\n reranked = sorted(\n zip(documents,\
\ result.get(\"scores\", [0] * len(documents))),\n key=lambda x:\
\ x[1],\n reverse=True\n )[:top_k]\n\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([doc\
\ for doc, score in reranked], latency)\n\n"
image: python:3.13-slim
exec-retrieve-context:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- retrieve_context
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
\ \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef retrieve_context(\n embedding: list,\n milvus_host: str\
\ = \"milvus.ai-ml.svc.cluster.local\",\n collection_name: str = \"knowledge_base\"\
,\n top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Retrieve relevant documents\
\ from Milvus vector database.\"\"\"\n import time\n from pymilvus\
\ import connections, Collection, utility\n from collections import namedtuple\n\
\n start = time.perf_counter()\n connections.connect(host=milvus_host,\
\ port=19530)\n\n if not utility.has_collection(collection_name):\n \
\ latency = time.perf_counter() - start\n RetrieveResult =\
\ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult([], latency)\n\n collection = Collection(collection_name)\n\
\ collection.load()\n\n results = collection.search(\n data=[embedding],\n\
\ anns_field=\"embedding\",\n param={\"metric_type\": \"COSINE\"\
, \"params\": {\"nprobe\": 10}},\n limit=top_k,\n output_fields=[\"\
text\", \"source\"]\n )\n latency = time.perf_counter() - start\n\n\
\ documents = []\n for hits in results:\n for hit in hits:\n\
\ documents.append({\n \"text\": hit.entity.get(\"\
text\"),\n \"source\": hit.entity.get(\"source\"),\n \
\ \"score\": hit.distance\n })\n\n RetrieveResult\
\ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult(documents, latency)\n\n"
image: python:3.13-slim
pipelineInfo:
  description: 'RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM. Logs per-step
    latency to MLflow.'
  name: rag-query-pipeline
# Top-level DAG: query -> generate-embeddings -> retrieve-context ->
# rerank-documents -> generate-response; each task consumes the previous
# task's `documents`/`embedding` output parameter.
root:
  dag:
    tasks:
      generate-embeddings:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-embeddings
        inputs:
          parameters:
            text:
              componentInputParameter: query
        taskInfo:
          name: generate-embeddings
      generate-response:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-response
        dependentTasks:
        - rerank-documents
        inputs:
          parameters:
            context:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: rerank-documents
            query:
              componentInputParameter: query
        taskInfo:
          name: generate-response
      rerank-documents:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-rerank-documents
        dependentTasks:
        - retrieve-context
        inputs:
          parameters:
            documents:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: retrieve-context
            query:
              componentInputParameter: query
        taskInfo:
          name: rerank-documents
      retrieve-context:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-retrieve-context
        dependentTasks:
        - generate-embeddings
        inputs:
          parameters:
            collection_name:
              componentInputParameter: collection_name
            embedding:
              taskOutputParameter:
                outputParameterKey: embedding
                producerTask: generate-embeddings
        taskInfo:
          name: retrieve-context
  inputDefinitions:
    parameters:
      collection_name:
        defaultValue: knowledge_base
        description: Milvus collection name
        isOptional: true
        parameterType: STRING
      query:
        description: Text query
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1

87
tts_pipeline.yaml Normal file
View File

@@ -0,0 +1,87 @@
# PIPELINE DEFINITION
# Name: text-to-speech-pipeline
# Description: Simple text to speech pipeline
# Inputs:
#    text: str
#
# NOTE(review): compiled KFP IR (kfp==2.12.1), generated from the TTS pipeline
# Python source. Do not hand-edit; change the Python source and recompile.
components:
  # Single step: POST text to the TTS service; outputs base64 WAV + latency.
  comp-synthesize-speech:
    executorLabel: exec-synthesize-speech
    inputDefinitions:
      parameters:
        text:
          parameterType: STRING
        tts_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        audio_b64:
          parameterType: STRING
        latency_s:
          parameterType: NUMBER_DOUBLE
deploymentSpec:
executors:
exec-synthesize-speech:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- synthesize_speech
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef synthesize_speech(\n text: str,\n tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
\n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
\ float)]):\n \"\"\"Convert text to speech using TTS service.\"\"\"\n\
\ import base64\n import time\n import httpx\n from collections\
\ import namedtuple\n\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
\ as client:\n response = client.post(\n f\"{tts_url}/v1/audio/speech\"\
,\n json={\n \"input\": text,\n \
\ \"voice\": \"en_US-lessac-high\",\n \"response_format\"\
: \"wav\"\n }\n )\n audio_b64 = base64.b64encode(response.content).decode(\"\
utf-8\")\n latency = time.perf_counter() - start\n\n TTSResult = namedtuple(\"\
TTSResult\", [\"audio_b64\", \"latency_s\"])\n return TTSResult(audio_b64,\
\ latency)\n\n"
image: python:3.13-slim
pipelineInfo:
  description: Simple text to speech pipeline
  name: text-to-speech-pipeline
# Single-task DAG: the pipeline input `text` feeds synthesize-speech directly.
root:
  dag:
    tasks:
      synthesize-speech:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-synthesize-speech
        inputs:
          parameters:
            text:
              componentInputParameter: text
        taskInfo:
          name: synthesize-speech
  inputDefinitions:
    parameters:
      text:
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1

454
vllm_tuning_pipeline.py Normal file
View File

@@ -0,0 +1,454 @@
#!/usr/bin/env python3
"""
vLLM Tuning Evaluation Pipeline - Kubeflow Pipelines SDK
Runs inference benchmarks with different vLLM configurations and logs
results to MLflow so you can compare APC, chunked prefill, speculative
decoding, and GPU memory utilization settings side-by-side.
Usage:
pip install kfp==2.12.1
python vllm_tuning_pipeline.py
# Upload vllm_tuning_pipeline.yaml to Kubeflow Pipelines UI
"""
from kfp import dsl
from kfp import compiler
from typing import NamedTuple
MLFLOW_IMAGE = "python:3.13-slim"
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]
BENCH_PACKAGES = ["httpx"]
# ---- MLflow components ----
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def create_tuning_run(
    experiment_name: str,
    run_name: str,
    tuning_params: dict,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> NamedTuple("RunInfo", [("run_id", str), ("experiment_id", str)]):
    """Open an MLflow run for one vLLM tuning experiment.

    Looks up (or creates) the named experiment, starts a run tagged with
    the originating KFP run id, records every tuning knob as an MLflow
    param under the ``vllm.`` prefix, then closes the run so downstream
    steps can attach metrics to it by id.

    Args:
        experiment_name: MLflow experiment to file the run under.
        run_name: Display name for the new run.
        tuning_params: Mapping of vLLM config knob -> value to log.
        mlflow_tracking_uri: MLflow tracking server URL.

    Returns:
        RunInfo(run_id, experiment_id) for downstream logging steps.
    """
    import os
    import mlflow
    from mlflow.tracking import MlflowClient
    from collections import namedtuple

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    tracking = MlflowClient()

    # Reuse the experiment when it already exists; otherwise create it
    # with an explicit artifact location on the shared volume.
    existing = tracking.get_experiment_by_name(experiment_name)
    if existing:
        experiment_id = existing.experiment_id
    else:
        experiment_id = tracking.create_experiment(
            name=experiment_name,
            artifact_location=f"/mlflow/artifacts/{experiment_name}",
        )

    run_tags = {
        "pipeline.type": "vllm-tuning",
        "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
    }

    # Context manager ends the run (status FINISHED) on exit, matching an
    # explicit start_run()/end_run() pair.
    with mlflow.start_run(
        experiment_id=experiment_id, run_name=run_name, tags=run_tags
    ) as active_run:
        for knob, setting in tuning_params.items():
            mlflow.log_param(f"vllm.{knob}", setting)
        run_id = active_run.info.run_id

    RunInfo = namedtuple("RunInfo", ["run_id", "experiment_id"])
    return RunInfo(run_id, experiment_id)
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_benchmark_results(
    run_id: str,
    metrics: dict,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
    """Attach benchmark metrics to an existing MLflow run and finish it.

    Every entry of ``metrics`` is logged as an individual MLflow metric,
    the full dict is stored as a JSON artifact for later inspection, and
    the run is then marked FINISHED.

    Args:
        run_id: Id of the run opened by create_tuning_run.
        metrics: Flat mapping of metric name -> numeric value.
        mlflow_tracking_uri: MLflow tracking server URL.

    Returns:
        The same run_id, for chaining.
    """
    import json
    import tempfile
    import mlflow
    from mlflow.tracking import MlflowClient
    from pathlib import Path

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    tracking = MlflowClient()

    for metric_name, metric_value in metrics.items():
        tracking.log_metric(run_id, metric_name, float(metric_value))

    # Keep the raw results alongside the run as a JSON artifact.
    with tempfile.TemporaryDirectory() as scratch_dir:
        results_file = Path(scratch_dir) / "benchmark_results.json"
        results_file.write_text(json.dumps(metrics, indent=2))
        tracking.log_artifact(run_id, str(results_file))

    tracking.set_terminated(run_id, status="FINISHED")
    return run_id
# ---- Benchmark components ----
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=BENCH_PACKAGES,
)
def build_prompt_suite() -> list:
    """Return a list of test prompts spanning short, medium, and long inputs.

    The suite covers quick factual questions, mid-size reasoning and
    coding tasks, one long structured tutorial request, and one prompt
    that repeats an earlier system+user prefix so automatic prefix
    caching has something to hit.
    """
    homelab_system = {
        "role": "system",
        "content": "You are a helpful AI assistant running on a homelab.",
    }

    def case(prompt_id, category, messages, max_tokens):
        # Uniform prompt shape consumed by run_benchmark.
        return {
            "id": prompt_id,
            "category": category,
            "messages": messages,
            "max_tokens": max_tokens,
        }

    return [
        case(
            "short-1",
            "short",
            [{"role": "user", "content": "What is the capital of France?"}],
            64,
        ),
        case(
            "short-2",
            "short",
            [{"role": "user", "content": "Explain quantum computing in one sentence."}],
            64,
        ),
        case(
            "medium-1",
            "medium",
            [
                homelab_system,
                {
                    "role": "user",
                    "content": (
                        "Compare and contrast supervised and unsupervised "
                        "machine learning. Give examples of each and explain "
                        "when you would choose one over the other."
                    ),
                },
            ],
            512,
        ),
        case(
            "medium-2",
            "medium",
            [
                {
                    "role": "user",
                    "content": (
                        "Write a Python function that implements a binary search "
                        "tree with insert, search, and delete operations. Include "
                        "docstrings and type hints."
                    ),
                },
            ],
            1024,
        ),
        case(
            "long-1",
            "long",
            [
                {
                    "role": "system",
                    "content": "You are a technical writer for a Kubernetes homelab blog.",
                },
                {
                    "role": "user",
                    "content": (
                        "Write a detailed tutorial on setting up a multi-node "
                        "Kubernetes cluster with Talos Linux, covering: "
                        "1) Hardware requirements and network topology, "
                        "2) Talos machine config generation, "
                        "3) Control plane bootstrapping, "
                        "4) Worker node joining, "
                        "5) CNI setup with Cilium, "
                        "6) Storage with Rook-Ceph, "
                        "7) GitOps with Flux CD. "
                        "Include YAML examples for each step."
                    ),
                },
            ],
            2048,
        ),
        # Shares the system prompt and opening sentence with medium-1 so a
        # prefix-cache-enabled server can reuse the cached prefix.
        case(
            "repeat-prefix-1",
            "prefix-cache-test",
            [
                homelab_system,
                {
                    "role": "user",
                    "content": (
                        "Compare and contrast supervised and unsupervised "
                        "machine learning. Now focus specifically on "
                        "reinforcement learning and how it differs."
                    ),
                },
            ],
            512,
        ),
    ]
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=BENCH_PACKAGES,
)
def run_benchmark(
    prompts: list,
    llm_endpoint: str,
    model_name: str,
    num_warmup: int = 2,
    num_iterations: int = 3,
) -> dict:
    """
    Run all prompts through the LLM endpoint and collect timing metrics.

    Streams each completion so time-to-first-token (TTFT) can be measured,
    then aggregates p50/p95/mean latency, tokens/sec, and TTFT across
    ``num_iterations`` passes over the prompt suite.

    Fixes vs. previous version: SSE chunks are parsed as JSON and only
    content-bearing deltas count as tokens, so the initial role-only delta
    no longer skews TTFT and the empty finish chunk no longer inflates the
    token count / TPS.

    Args:
        prompts: Prompt suite (dicts with id/category/messages/max_tokens).
        llm_endpoint: Base URL of the OpenAI-compatible vLLM endpoint.
        model_name: Model identifier sent with each request.
        num_warmup: Untimed warmup requests issued before benchmarking.
        num_iterations: Number of passes over the full prompt suite.

    Returns:
        Flat dict of aggregate metrics (suitable for MLflow log_metric),
        with the same keys as before.
    """
    import json
    import statistics
    import time
    import httpx

    all_latencies: list[float] = []
    all_tps: list[float] = []
    all_ttft: list[float] = []
    per_category: dict[str, list[float]] = {}

    def iter_delta_contents(resp):
        """Yield the delta-content string of each SSE data chunk (may be '')."""
        for line in resp.iter_lines():
            if not line.startswith("data: "):
                continue
            chunk = line[6:].strip()
            if chunk == "[DONE]":
                return
            # Fall back to treating an unparseable chunk as one token so a
            # non-conforming server still produces (approximate) counts.
            content = chunk
            try:
                event = json.loads(chunk)
                choices = event.get("choices") or []
                if choices:
                    content = (choices[0].get("delta") or {}).get("content") or ""
                else:
                    content = ""
            except (ValueError, TypeError, AttributeError, IndexError):
                pass
            yield content

    with httpx.Client(timeout=300.0) as client:
        # Warmup: trigger model load / cache population; never timed.
        for _ in range(num_warmup):
            try:
                client.post(
                    f"{llm_endpoint}/v1/chat/completions",
                    json={
                        "model": model_name,
                        "messages": [{"role": "user", "content": "Hi"}],
                        "max_tokens": 8,
                        "temperature": 0,
                    },
                )
            except Exception:
                pass  # best-effort warmup; failures here are not benchmarked

        for _ in range(num_iterations):
            for prompt in prompts:
                category = prompt.get("category", "unknown")
                payload = {
                    "model": model_name,
                    "messages": prompt["messages"],
                    "max_tokens": prompt.get("max_tokens", 256),
                    "temperature": 0,
                    "stream": True,
                }
                try:
                    t_start = time.perf_counter()
                    first_token_time = None
                    completion_tokens = 0
                    with client.stream(
                        "POST",
                        f"{llm_endpoint}/v1/chat/completions",
                        json=payload,
                    ) as resp:
                        resp.raise_for_status()
                        for content in iter_delta_contents(resp):
                            if not content:
                                # Role-only / finish chunks carry no text.
                                continue
                            if first_token_time is None:
                                first_token_time = time.perf_counter()
                            completion_tokens += 1
                    t_end = time.perf_counter()
                    latency = t_end - t_start
                    ttft = (
                        (first_token_time - t_start)
                        if first_token_time
                        else latency
                    )
                    tps = completion_tokens / latency if latency > 0 else 0
                    all_latencies.append(latency)
                    all_tps.append(tps)
                    all_ttft.append(ttft)
                    per_category.setdefault(category, []).append(latency)
                except Exception:
                    # Record the failure but keep benchmarking the rest.
                    all_latencies.append(-1)
                    all_tps.append(0)
                    all_ttft.append(-1)

    # Failed requests were recorded as -1 / 0; exclude them from stats.
    valid_latencies = [l for l in all_latencies if l > 0]
    valid_tps = [t for t in all_tps if t > 0]
    valid_ttft = [t for t in all_ttft if t > 0]

    def mean_or_zero(values):
        return statistics.mean(values) if values else 0

    def median_or_zero(values):
        return statistics.median(values) if values else 0

    def p95_or_zero(values):
        # Nearest-rank p95; index clamped so it is safe for any list size.
        if not values:
            return 0
        ordered = sorted(values)
        return ordered[min(int(len(ordered) * 0.95), len(ordered) - 1)]

    metrics = {
        "total_requests": len(all_latencies),
        "successful_requests": len(valid_latencies),
        "failed_requests": len(all_latencies) - len(valid_latencies),
        # Latency
        "latency_mean_s": mean_or_zero(valid_latencies),
        "latency_p50_s": median_or_zero(valid_latencies),
        "latency_p95_s": p95_or_zero(valid_latencies),
        # Throughput
        "tokens_per_second_mean": mean_or_zero(valid_tps),
        "tokens_per_second_p50": median_or_zero(valid_tps),
        # Time to first token
        "ttft_mean_s": mean_or_zero(valid_ttft),
        "ttft_p50_s": median_or_zero(valid_ttft),
        "ttft_p95_s": p95_or_zero(valid_ttft),
    }

    # Per-category mean latency (category names come from the prompt suite).
    for cat, lats in per_category.items():
        valid = [l for l in lats if l > 0]
        if valid:
            metrics[f"latency_mean_{cat}_s"] = statistics.mean(valid)

    return metrics
# ---- Pipeline ----
@dsl.pipeline(
    name="vllm-tuning-evaluation",
    description=(
        "Benchmark vLLM with different tuning configurations. "
        "Logs latency, TPS, and TTFT to MLflow for A/B comparison."
    ),
)
def vllm_tuning_pipeline(
    llm_endpoint: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm",
    model_name: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    # Tuning knobs (match env vars in rayservice.yaml)
    enable_prefix_caching: str = "true",
    enable_chunked_prefill: str = "true",
    num_speculative_tokens: str = "3",
    ngram_prompt_lookup_max: str = "4",
    gpu_memory_utilization: str = "0.90",
    # Benchmark config
    num_warmup: int = 2,
    num_iterations: int = 3,
    run_label: str = "baseline",
):
    """
    vLLM Tuning Evaluation Pipeline

    Run this multiple times with different tuning params, then compare
    runs in the MLflow "vllm-tuning" experiment.

    Args:
        llm_endpoint: vLLM inference endpoint URL
        model_name: HF model identifier
        enable_prefix_caching: "true" or "false"
        enable_chunked_prefill: "true" or "false"
        num_speculative_tokens: number of speculative tokens (0 = off)
        ngram_prompt_lookup_max: ngram window for spec decode (0 = off)
        gpu_memory_utilization: 0.0 - 1.0
        num_warmup: warmup requests before timing
        num_iterations: how many times to repeat the prompt suite
        run_label: human-readable label (e.g. "apc-on-spec3")
    """
    # Everything logged as an MLflow param so runs are comparable later.
    tuning_params = {
        "enable_prefix_caching": enable_prefix_caching,
        "enable_chunked_prefill": enable_chunked_prefill,
        "num_speculative_tokens": num_speculative_tokens,
        "ngram_prompt_lookup_max": ngram_prompt_lookup_max,
        "gpu_memory_utilization": gpu_memory_utilization,
        "model_name": model_name,
        "llm_endpoint": llm_endpoint,
        "num_warmup": str(num_warmup),
        "num_iterations": str(num_iterations),
    }

    # 1. Open the MLflow run that the benchmark metrics will attach to.
    run_task = create_tuning_run(
        experiment_name="vllm-tuning",
        run_name=f"vllm-{run_label}",
        tuning_params=tuning_params,
    )

    # 2. The prompt suite is static, so caching its output is safe.
    suite_task = build_prompt_suite()
    suite_task.set_caching_options(enable_caching=True)

    # 3. Never cache the benchmark itself -- timings must be fresh.
    benchmark_task = run_benchmark(
        prompts=suite_task.output,
        llm_endpoint=llm_endpoint,
        model_name=model_name,
        num_warmup=num_warmup,
        num_iterations=num_iterations,
    )
    benchmark_task.set_caching_options(enable_caching=False)

    # 4. Persist metrics and close the MLflow run (data-flow dependency on
    # both the run id and the benchmark output orders this step last).
    log_benchmark_results(
        run_id=run_task.outputs["run_id"],
        metrics=benchmark_task.output,
    )
if __name__ == "__main__":
    # Compile the pipeline to KFP IR YAML for upload to the Pipelines UI.
    compiler.Compiler().compile(
        vllm_tuning_pipeline,
        "vllm_tuning_pipeline.yaml",
    )

    # Usage hints for A/B comparing configurations in MLflow.
    usage = [
        "Compiled: vllm_tuning_pipeline.yaml",
        "",
        "Example runs to compare configurations:",
        " # Baseline (current config)",
        " kfp run submit vllm_tuning_pipeline.yaml --run-label=baseline",
        "",
        " # APC disabled",
        " kfp run submit vllm_tuning_pipeline.yaml \\",
        " --enable-prefix-caching=false --run-label=no-apc",
        "",
        " # No speculative decoding",
        " kfp run submit vllm_tuning_pipeline.yaml \\",
        " --num-speculative-tokens=0 --run-label=no-spec",
        "",
        " # Aggressive spec decode",
        " kfp run submit vllm_tuning_pipeline.yaml \\",
        " --num-speculative-tokens=5 --ngram-prompt-lookup-max=6 --run-label=spec5-ngram6",
    ]
    print("\n".join(usage))

501
vllm_tuning_pipeline.yaml Normal file
View File

@@ -0,0 +1,501 @@
# PIPELINE DEFINITION
# Name: vllm-tuning-evaluation
# Description: Benchmark vLLM with different tuning configurations. Logs latency, TPS, and TTFT to MLflow for A/B comparison.
# NOTE: auto-generated by compiling vllm_tuning_pipeline.py (kfp 2.12.1); do not edit by hand -- recompile instead.
# Inputs:
# enable_chunked_prefill: str [Default: 'true']
# enable_prefix_caching: str [Default: 'true']
# gpu_memory_utilization: str [Default: '0.90']
# llm_endpoint: str [Default: 'http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm']
# model_name: str [Default: 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4']
# ngram_prompt_lookup_max: str [Default: '4']
# num_iterations: int [Default: 3.0]
# num_speculative_tokens: str [Default: '3']
# num_warmup: int [Default: 2.0]
# run_label: str [Default: 'baseline']
components:
  # Component interface declarations (one per @dsl.component in the source).
  # build_prompt_suite: no inputs; emits the benchmark prompt list.
  comp-build-prompt-suite:
    executorLabel: exec-build-prompt-suite
    outputDefinitions:
      parameters:
        Output:
          parameterType: LIST
  # create_tuning_run: opens an MLflow run tagged with the vLLM tuning params
  # (STRUCT) and returns the MLflow run_id/experiment_id for downstream use.
  comp-create-tuning-run:
    executorLabel: exec-create-tuning-run
    inputDefinitions:
      parameters:
        experiment_name:
          parameterType: STRING
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_name:
          parameterType: STRING
        tuning_params:
          parameterType: STRUCT
    outputDefinitions:
      parameters:
        experiment_id:
          parameterType: STRING
        run_id:
          parameterType: STRING
  # log_benchmark_results: writes the metrics STRUCT to the given MLflow run,
  # attaches the raw results JSON as an artifact, and marks the run FINISHED.
  comp-log-benchmark-results:
    executorLabel: exec-log-benchmark-results
    inputDefinitions:
      parameters:
        metrics:
          parameterType: STRUCT
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_id:
          parameterType: STRING
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  # run_benchmark: streams each prompt through the vLLM chat-completions
  # endpoint and returns aggregate latency/TPS/TTFT metrics as a STRUCT.
  comp-run-benchmark:
    executorLabel: exec-run-benchmark
    inputDefinitions:
      parameters:
        llm_endpoint:
          parameterType: STRING
        model_name:
          parameterType: STRING
        num_iterations:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        num_warmup:
          defaultValue: 2.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        prompts:
          parameterType: LIST
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRUCT
deploymentSpec:
executors:
exec-build-prompt-suite:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- build_prompt_suite
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef build_prompt_suite() -> list:\n \"\"\"Return a list of test\
\ prompts spanning short, medium, and long inputs.\"\"\"\n return [\n\
\ {\n \"id\": \"short-1\",\n \"category\":\
\ \"short\",\n \"messages\": [\n {\"role\": \"\
user\", \"content\": \"What is the capital of France?\"}\n ],\n\
\ \"max_tokens\": 64,\n },\n {\n \"\
id\": \"short-2\",\n \"category\": \"short\",\n \"\
messages\": [\n {\"role\": \"user\", \"content\": \"Explain\
\ quantum computing in one sentence.\"}\n ],\n \"\
max_tokens\": 64,\n },\n {\n \"id\": \"medium-1\"\
,\n \"category\": \"medium\",\n \"messages\": [\n\
\ {\n \"role\": \"system\",\n \
\ \"content\": \"You are a helpful AI assistant running on a\
\ homelab.\",\n },\n {\n \
\ \"role\": \"user\",\n \"content\": (\n \
\ \"Compare and contrast supervised and unsupervised \"\n \
\ \"machine learning. Give examples of each and explain\
\ \"\n \"when you would choose one over the other.\"\
\n ),\n },\n ],\n \
\ \"max_tokens\": 512,\n },\n {\n \"id\": \"\
medium-2\",\n \"category\": \"medium\",\n \"messages\"\
: [\n {\n \"role\": \"user\",\n \
\ \"content\": (\n \"Write a Python\
\ function that implements a binary search \"\n \"\
tree with insert, search, and delete operations. Include \"\n \
\ \"docstrings and type hints.\"\n ),\n\
\ },\n ],\n \"max_tokens\": 1024,\n\
\ },\n {\n \"id\": \"long-1\",\n \"\
category\": \"long\",\n \"messages\": [\n {\n\
\ \"role\": \"system\",\n \"content\"\
: \"You are a technical writer for a Kubernetes homelab blog.\",\n \
\ },\n {\n \"role\": \"user\"\
,\n \"content\": (\n \"Write a\
\ detailed tutorial on setting up a multi-node \"\n \
\ \"Kubernetes cluster with Talos Linux, covering: \"\n \
\ \"1) Hardware requirements and network topology, \"\n \
\ \"2) Talos machine config generation, \"\n \
\ \"3) Control plane bootstrapping, \"\n \
\ \"4) Worker node joining, \"\n \"5) CNI setup\
\ with Cilium, \"\n \"6) Storage with Rook-Ceph,\
\ \"\n \"7) GitOps with Flux CD. \"\n \
\ \"Include YAML examples for each step.\"\n \
\ ),\n },\n ],\n \"max_tokens\"\
: 2048,\n },\n {\n \"id\": \"repeat-prefix-1\"\
,\n \"category\": \"prefix-cache-test\",\n \"messages\"\
: [\n {\n \"role\": \"system\",\n \
\ \"content\": \"You are a helpful AI assistant running on\
\ a homelab.\",\n },\n {\n \
\ \"role\": \"user\",\n \"content\": (\n \
\ \"Compare and contrast supervised and unsupervised \"\n\
\ \"machine learning. Now focus specifically on \"\
\n \"reinforcement learning and how it differs.\"\
\n ),\n },\n ],\n \
\ \"max_tokens\": 512,\n },\n ]\n\n"
image: python:3.13-slim
exec-create-tuning-run:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- create_tuning_run
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef create_tuning_run(\n experiment_name: str,\n run_name:\
\ str,\n tuning_params: dict,\n mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
,\n) -> NamedTuple(\"RunInfo\", [(\"run_id\", str), (\"experiment_id\",\
\ str)]):\n \"\"\"Create an MLflow run for a vLLM tuning experiment.\"\
\"\"\n import os\n import mlflow\n from mlflow.tracking import\
\ MlflowClient\n from collections import namedtuple\n\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
\ client = MlflowClient()\n\n exp = client.get_experiment_by_name(experiment_name)\n\
\ experiment_id = (\n exp.experiment_id\n if exp\n \
\ else client.create_experiment(\n name=experiment_name,\n\
\ artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
,\n )\n )\n\n tags = {\n \"pipeline.type\": \"vllm-tuning\"\
,\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\", \"unknown\"),\n\
\ }\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\
\ run_name=run_name, tags=tags\n )\n # Log every tuning param\n \
\ for key, value in tuning_params.items():\n mlflow.log_param(f\"\
vllm.{key}\", value)\n run_id = run.info.run_id\n mlflow.end_run()\n\
\n RunInfo = namedtuple(\"RunInfo\", [\"run_id\", \"experiment_id\"])\n\
\ return RunInfo(run_id, experiment_id)\n\n"
image: python:3.13-slim
exec-log-benchmark-results:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- log_benchmark_results
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef log_benchmark_results(\n run_id: str,\n metrics: dict,\n\
\ mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
,\n) -> str:\n \"\"\"Log benchmark metrics to MLflow and close the run.\"\
\"\"\n import json\n import tempfile\n import mlflow\n from\
\ mlflow.tracking import MlflowClient\n from pathlib import Path\n\n\
\ mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
\n for key, value in metrics.items():\n client.log_metric(run_id,\
\ key, float(value))\n\n # Save full results as artifact\n with tempfile.TemporaryDirectory()\
\ as tmpdir:\n path = Path(tmpdir) / \"benchmark_results.json\"\n\
\ path.write_text(json.dumps(metrics, indent=2))\n client.log_artifact(run_id,\
\ str(path))\n\n client.set_terminated(run_id, status=\"FINISHED\")\n\
\ return run_id\n\n"
image: python:3.13-slim
exec-run-benchmark:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- run_benchmark
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_benchmark(\n prompts: list,\n llm_endpoint: str,\n\
\ model_name: str,\n num_warmup: int = 2,\n num_iterations: int\
\ = 3,\n) -> dict:\n \"\"\"\n Run all prompts through the LLM endpoint\
\ and collect timing metrics.\n\n Returns aggregate metrics: p50/p95/mean\
\ latency, tokens/sec, TTFT.\n \"\"\"\n import time\n import statistics\n\
\ import httpx\n\n all_latencies: list[float] = []\n all_tps: list[float]\
\ = []\n all_ttft: list[float] = []\n per_category: dict[str, list[float]]\
\ = {}\n\n with httpx.Client(timeout=300.0) as client:\n # Warmup\n\
\ for _ in range(num_warmup):\n try:\n \
\ client.post(\n f\"{llm_endpoint}/v1/chat/completions\"\
,\n json={\n \"model\": model_name,\n\
\ \"messages\": [{\"role\": \"user\", \"content\"\
: \"Hi\"}],\n \"max_tokens\": 8,\n \
\ \"temperature\": 0,\n },\n \
\ )\n except Exception:\n pass\n\n # Benchmark\n\
\ for iteration in range(num_iterations):\n for prompt\
\ in prompts:\n category = prompt.get(\"category\", \"unknown\"\
)\n payload = {\n \"model\": model_name,\n\
\ \"messages\": prompt[\"messages\"],\n \
\ \"max_tokens\": prompt.get(\"max_tokens\", 256),\n \
\ \"temperature\": 0,\n \"stream\": True,\n \
\ }\n\n try:\n t_start = time.perf_counter()\n\
\ first_token_time = None\n\n with\
\ client.stream(\n \"POST\",\n \
\ f\"{llm_endpoint}/v1/chat/completions\",\n \
\ json=payload,\n ) as resp:\n \
\ resp.raise_for_status()\n completion_tokens =\
\ 0\n for line in resp.iter_lines():\n \
\ if not line.startswith(\"data: \"):\n \
\ continue\n chunk = line[6:]\n\
\ if chunk == \"[DONE]\":\n \
\ break\n if first_token_time is\
\ None:\n first_token_time = time.perf_counter()\n\
\ completion_tokens += 1\n\n \
\ t_end = time.perf_counter()\n latency = t_end -\
\ t_start\n ttft = (\n (first_token_time\
\ - t_start)\n if first_token_time\n \
\ else latency\n )\n tps\
\ = (\n completion_tokens / latency if latency >\
\ 0 else 0\n )\n\n all_latencies.append(latency)\n\
\ all_tps.append(tps)\n all_ttft.append(ttft)\n\
\ per_category.setdefault(category, []).append(latency)\n\
\n except Exception as exc:\n # Record\
\ failure but keep going\n all_latencies.append(-1)\n\
\ all_tps.append(0)\n all_ttft.append(-1)\n\
\n # Compute aggregates\n valid_latencies = [l for l in all_latencies\
\ if l > 0]\n valid_tps = [t for t in all_tps if t > 0]\n valid_ttft\
\ = [t for t in all_ttft if t > 0]\n\n def safe_stat(values, func):\n\
\ return func(values) if values else 0\n\n metrics = {\n \
\ \"total_requests\": len(all_latencies),\n \"successful_requests\"\
: len(valid_latencies),\n \"failed_requests\": len(all_latencies)\
\ - len(valid_latencies),\n # Latency\n \"latency_mean_s\"\
: safe_stat(valid_latencies, statistics.mean),\n \"latency_p50_s\"\
: safe_stat(\n valid_latencies,\n lambda v: statistics.median(v),\n\
\ ),\n \"latency_p95_s\": safe_stat(\n valid_latencies,\n\
\ lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,\n \
\ ),\n # Throughput\n \"tokens_per_second_mean\": safe_stat(valid_tps,\
\ statistics.mean),\n \"tokens_per_second_p50\": safe_stat(\n \
\ valid_tps, lambda v: statistics.median(v)\n ),\n \
\ # Time to first token\n \"ttft_mean_s\": safe_stat(valid_ttft,\
\ statistics.mean),\n \"ttft_p50_s\": safe_stat(valid_ttft, lambda\
\ v: statistics.median(v)),\n \"ttft_p95_s\": safe_stat(\n \
\ valid_ttft,\n lambda v: sorted(v)[int(len(v) * 0.95)]\
\ if v else 0,\n ),\n }\n\n # Per-category latency\n for\
\ cat, lats in per_category.items():\n valid = [l for l in lats if\
\ l > 0]\n if valid:\n metrics[f\"latency_mean_{cat}_s\"\
] = statistics.mean(valid)\n\n return metrics\n\n"
image: python:3.13-slim
pipelineInfo:
  description: Benchmark vLLM with different tuning configurations. Logs latency,
    TPS, and TTFT to MLflow for A/B comparison.
  name: vllm-tuning-evaluation
# Execution graph: build-prompt-suite and create-tuning-run have no mutual
# dependency and start in parallel; run-benchmark consumes the prompt suite;
# log-benchmark-results joins the benchmark metrics with the MLflow run_id.
root:
  dag:
    tasks:
      # Cached: the prompt suite is a static list (source sets
      # set_caching_options(enable_caching=True)).
      build-prompt-suite:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-build-prompt-suite
        taskInfo:
          name: build-prompt-suite
      # NOTE(review): enableCache: true on a step whose executor creates an
      # MLflow run -- a cache hit would reuse a stale run_id instead of
      # opening a fresh run; confirm this is intended.
      create-tuning-run:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-create-tuning-run
        inputs:
          parameters:
            experiment_name:
              runtimeValue:
                constant: vllm-tuning
            pipelinechannel--enable_chunked_prefill:
              componentInputParameter: enable_chunked_prefill
            pipelinechannel--enable_prefix_caching:
              componentInputParameter: enable_prefix_caching
            pipelinechannel--gpu_memory_utilization:
              componentInputParameter: gpu_memory_utilization
            pipelinechannel--llm_endpoint:
              componentInputParameter: llm_endpoint
            pipelinechannel--model_name:
              componentInputParameter: model_name
            pipelinechannel--ngram_prompt_lookup_max:
              componentInputParameter: ngram_prompt_lookup_max
            pipelinechannel--num_iterations:
              componentInputParameter: num_iterations
            pipelinechannel--num_speculative_tokens:
              componentInputParameter: num_speculative_tokens
            pipelinechannel--num_warmup:
              componentInputParameter: num_warmup
            pipelinechannel--run_label:
              componentInputParameter: run_label
            run_name:
              runtimeValue:
                constant: vllm-{{$.inputs.parameters['pipelinechannel--run_label']}}
            tuning_params:
              runtimeValue:
                constant:
                  enable_chunked_prefill: '{{$.inputs.parameters[''pipelinechannel--enable_chunked_prefill'']}}'
                  enable_prefix_caching: '{{$.inputs.parameters[''pipelinechannel--enable_prefix_caching'']}}'
                  gpu_memory_utilization: '{{$.inputs.parameters[''pipelinechannel--gpu_memory_utilization'']}}'
                  llm_endpoint: '{{$.inputs.parameters[''pipelinechannel--llm_endpoint'']}}'
                  model_name: '{{$.inputs.parameters[''pipelinechannel--model_name'']}}'
                  ngram_prompt_lookup_max: '{{$.inputs.parameters[''pipelinechannel--ngram_prompt_lookup_max'']}}'
                  num_iterations: '{{$.inputs.parameters[''pipelinechannel--num_iterations'']}}'
                  num_speculative_tokens: '{{$.inputs.parameters[''pipelinechannel--num_speculative_tokens'']}}'
                  num_warmup: '{{$.inputs.parameters[''pipelinechannel--num_warmup'']}}'
        taskInfo:
          name: create-tuning-run
      # NOTE(review): also cached -- on a cache hit the benchmark metrics would
      # never be written to MLflow and the run would stay open; confirm.
      log-benchmark-results:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-log-benchmark-results
        dependentTasks:
        - create-tuning-run
        - run-benchmark
        inputs:
          parameters:
            metrics:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: run-benchmark
            run_id:
              taskOutputParameter:
                outputParameterKey: run_id
                producerTask: create-tuning-run
        taskInfo:
          name: log-benchmark-results
      # Caching explicitly disabled (empty cachingOptions): the benchmark must
      # actually hit the endpoint on every pipeline run.
      run-benchmark:
        cachingOptions: {}
        componentRef:
          name: comp-run-benchmark
        dependentTasks:
        - build-prompt-suite
        inputs:
          parameters:
            llm_endpoint:
              componentInputParameter: llm_endpoint
            model_name:
              componentInputParameter: model_name
            num_iterations:
              componentInputParameter: num_iterations
            num_warmup:
              componentInputParameter: num_warmup
            prompts:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: build-prompt-suite
        taskInfo:
          name: run-benchmark
  # Pipeline-level inputs. The vLLM knobs are STRING-typed with quoted
  # defaults ('true', '0.90', '4', '3'); presumably so boolean-/number-looking
  # values pass through to MLflow params verbatim -- keep them quoted.
  inputDefinitions:
    parameters:
      enable_chunked_prefill:
        defaultValue: 'true'
        description: '"true" or "false"'
        isOptional: true
        parameterType: STRING
      enable_prefix_caching:
        defaultValue: 'true'
        description: '"true" or "false"'
        isOptional: true
        parameterType: STRING
      gpu_memory_utilization:
        defaultValue: '0.90'
        description: 0.0 - 1.0
        isOptional: true
        parameterType: STRING
      llm_endpoint:
        defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
        description: vLLM inference endpoint URL
        isOptional: true
        parameterType: STRING
      model_name:
        defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
        description: HF model identifier
        isOptional: true
        parameterType: STRING
      ngram_prompt_lookup_max:
        defaultValue: '4'
        description: ngram window for spec decode (0 = off)
        isOptional: true
        parameterType: STRING
      num_iterations:
        defaultValue: 3.0
        description: how many times to repeat the prompt suite
        isOptional: true
        parameterType: NUMBER_INTEGER
      num_speculative_tokens:
        defaultValue: '3'
        description: number of speculative tokens (0 = off)
        isOptional: true
        parameterType: STRING
      num_warmup:
        defaultValue: 2.0
        description: warmup requests before timing
        isOptional: true
        parameterType: NUMBER_INTEGER
      run_label:
        defaultValue: baseline
        description: human-readable label (e.g. "apc-on-spec3")
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1

View File

@@ -12,6 +12,11 @@ Usage:
from kfp import dsl
from kfp import compiler
from typing import NamedTuple
MLFLOW_IMAGE = "python:3.13-slim"
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]
@dsl.component(
@@ -21,13 +26,16 @@ from kfp import compiler
def transcribe_audio(
audio_b64: str,
whisper_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper"
) -> str:
) -> NamedTuple("STTResult", [("text", str), ("latency_s", float), ("audio_duration_s", float)]):
"""Transcribe audio using Whisper STT service."""
import base64
import time
import httpx
from collections import namedtuple
audio_bytes = base64.b64decode(audio_b64)
start = time.perf_counter()
with httpx.Client(timeout=120.0) as client:
response = client.post(
f"{whisper_url}/v1/audio/transcriptions",
@@ -35,8 +43,14 @@ def transcribe_audio(
data={"model": "whisper-large-v3", "language": "en"}
)
result = response.json()
latency = time.perf_counter() - start
return result.get("text", "")
text = result.get("text", "")
# Estimate audio duration from WAV header (16-bit PCM, 16kHz)
audio_duration = max(len(audio_bytes) / (16000 * 2), 0.1)
STTResult = namedtuple("STTResult", ["text", "latency_s", "audio_duration_s"])
return STTResult(text, latency, audio_duration)
@dsl.component(
@@ -46,18 +60,23 @@ def transcribe_audio(
def generate_embeddings(
text: str,
embeddings_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings"
) -> list:
) -> NamedTuple("EmbedResult", [("embedding", list), ("latency_s", float)]):
"""Generate embeddings for RAG retrieval."""
import time
import httpx
from collections import namedtuple
start = time.perf_counter()
with httpx.Client(timeout=60.0) as client:
response = client.post(
f"{embeddings_url}/embeddings",
json={"input": text, "model": "bge-small-en-v1.5"}
)
result = response.json()
latency = time.perf_counter() - start
return result["data"][0]["embedding"]
EmbedResult = namedtuple("EmbedResult", ["embedding", "latency_s"])
return EmbedResult(result["data"][0]["embedding"], latency)
@dsl.component(
@@ -69,14 +88,19 @@ def retrieve_context(
milvus_host: str = "milvus.ai-ml.svc.cluster.local",
collection_name: str = "knowledge_base",
top_k: int = 5
) -> list:
) -> NamedTuple("RetrieveResult", [("documents", list), ("latency_s", float)]):
"""Retrieve relevant documents from Milvus vector database."""
import time
from pymilvus import connections, Collection, utility
from collections import namedtuple
start = time.perf_counter()
connections.connect(host=milvus_host, port=19530)
if not utility.has_collection(collection_name):
return []
latency = time.perf_counter() - start
RetrieveResult = namedtuple("RetrieveResult", ["documents", "latency_s"])
return RetrieveResult([], latency)
collection = Collection(collection_name)
collection.load()
@@ -88,6 +112,7 @@ def retrieve_context(
limit=top_k,
output_fields=["text", "source"]
)
latency = time.perf_counter() - start
documents = []
for hits in results:
@@ -98,7 +123,8 @@ def retrieve_context(
"score": hit.distance
})
return documents
RetrieveResult = namedtuple("RetrieveResult", ["documents", "latency_s"])
return RetrieveResult(documents, latency)
@dsl.component(
@@ -110,13 +136,17 @@ def rerank_documents(
documents: list,
reranker_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker",
top_k: int = 3
) -> list:
) -> NamedTuple("RerankResult", [("documents", list), ("latency_s", float)]):
"""Rerank documents using BGE reranker."""
import time
import httpx
from collections import namedtuple
if not documents:
return []
RerankResult = namedtuple("RerankResult", ["documents", "latency_s"])
return RerankResult([], 0.0)
start = time.perf_counter()
with httpx.Client(timeout=60.0) as client:
response = client.post(
f"{reranker_url}/v1/rerank",
@@ -127,6 +157,7 @@ def rerank_documents(
}
)
result = response.json()
latency = time.perf_counter() - start
# Sort by rerank score
reranked = sorted(
@@ -135,7 +166,8 @@ def rerank_documents(
reverse=True
)[:top_k]
return [doc for doc, score in reranked]
RerankResult = namedtuple("RerankResult", ["documents", "latency_s"])
return RerankResult([doc for doc, score in reranked], latency)
@dsl.component(
@@ -147,9 +179,11 @@ def generate_response(
context: list,
vllm_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm",
model: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
) -> str:
) -> NamedTuple("LLMResult", [("text", str), ("latency_s", float), ("completion_tokens", int)]):
"""Generate response using vLLM."""
import time
import httpx
from collections import namedtuple
# Build context
if context:
@@ -167,6 +201,7 @@ Keep responses concise and natural for speech synthesis."""
{"role": "user", "content": user_content}
]
start = time.perf_counter()
with httpx.Client(timeout=180.0) as client:
response = client.post(
f"{vllm_url}/v1/chat/completions",
@@ -178,8 +213,14 @@ Keep responses concise and natural for speech synthesis."""
}
)
result = response.json()
latency = time.perf_counter() - start
return result["choices"][0]["message"]["content"]
text = result["choices"][0]["message"]["content"]
usage = result.get("usage", {})
completion_tokens = usage.get("completion_tokens", len(text.split()))
LLMResult = namedtuple("LLMResult", ["text", "latency_s", "completion_tokens"])
return LLMResult(text, latency, completion_tokens)
@dsl.component(
@@ -189,11 +230,14 @@ Keep responses concise and natural for speech synthesis."""
def synthesize_speech(
text: str,
tts_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts"
) -> str:
) -> NamedTuple("TTSResult", [("audio_b64", str), ("latency_s", float)]):
"""Convert text to speech using TTS service."""
import base64
import time
import httpx
from collections import namedtuple
start = time.perf_counter()
with httpx.Client(timeout=120.0) as client:
response = client.post(
f"{tts_url}/v1/audio/speech",
@@ -204,13 +248,86 @@ def synthesize_speech(
}
)
audio_b64 = base64.b64encode(response.content).decode("utf-8")
latency = time.perf_counter() - start
return audio_b64
TTSResult = namedtuple("TTSResult", ["audio_b64", "latency_s"])
return TTSResult(audio_b64, latency)
# ---- MLflow logging component ----
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_pipeline_metrics(
stt_latency: float,
stt_audio_duration: float,
embed_latency: float,
retrieve_latency: float,
rerank_latency: float,
llm_latency: float,
llm_completion_tokens: int,
tts_latency: float,
experiment_name: str = "voice-pipeline-metrics",
run_name: str = "voice-pipeline",
mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
"""Log per-step latency metrics to MLflow for the full voice pipeline."""
import os
import mlflow
from mlflow.tracking import MlflowClient
mlflow.set_tracking_uri(mlflow_tracking_uri)
client = MlflowClient()
exp = client.get_experiment_by_name(experiment_name)
experiment_id = (
exp.experiment_id
if exp
else client.create_experiment(
name=experiment_name,
artifact_location=f"/mlflow/artifacts/{experiment_name}",
)
)
run = mlflow.start_run(
experiment_id=experiment_id,
run_name=run_name,
tags={
"pipeline.type": "voice-assistant",
"kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
},
)
total_latency = (
stt_latency + embed_latency + retrieve_latency
+ rerank_latency + llm_latency + tts_latency
)
stt_rtf = stt_latency / stt_audio_duration if stt_audio_duration > 0 else 0
llm_tps = llm_completion_tokens / llm_latency if llm_latency > 0 else 0
mlflow.log_metrics({
"stt_latency_s": stt_latency,
"stt_audio_duration_s": stt_audio_duration,
"stt_realtime_factor": stt_rtf,
"embed_latency_s": embed_latency,
"retrieve_latency_s": retrieve_latency,
"rerank_latency_s": rerank_latency,
"llm_latency_s": llm_latency,
"llm_completion_tokens": llm_completion_tokens,
"llm_tokens_per_second": llm_tps,
"tts_latency_s": tts_latency,
"total_pipeline_latency_s": total_latency,
})
mlflow.end_run()
return run.info.run_id
# ---- Pipelines ----
@dsl.pipeline(
name="voice-assistant-rag-pipeline",
description="End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS"
description="End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow."
)
def voice_assistant_pipeline(
audio_b64: str,
@@ -229,29 +346,41 @@ def voice_assistant_pipeline(
transcribe_task.set_caching_options(enable_caching=False)
# Step 2: Generate embeddings
embed_task = generate_embeddings(text=transcribe_task.output)
embed_task = generate_embeddings(text=transcribe_task.outputs["text"])
embed_task.set_caching_options(enable_caching=True)
# Step 3: Retrieve context from Milvus
retrieve_task = retrieve_context(
embedding=embed_task.output,
embedding=embed_task.outputs["embedding"],
collection_name=collection_name
)
# Step 4: Rerank documents
rerank_task = rerank_documents(
query=transcribe_task.output,
documents=retrieve_task.output
query=transcribe_task.outputs["text"],
documents=retrieve_task.outputs["documents"]
)
# Step 5: Generate response with context
llm_task = generate_response(
query=transcribe_task.output,
context=rerank_task.output
query=transcribe_task.outputs["text"],
context=rerank_task.outputs["documents"]
)
# Step 6: Synthesize speech
tts_task = synthesize_speech(text=llm_task.output)
tts_task = synthesize_speech(text=llm_task.outputs["text"])
# Step 7: Log all per-step latencies to MLflow
log_task = log_pipeline_metrics(
stt_latency=transcribe_task.outputs["latency_s"],
stt_audio_duration=transcribe_task.outputs["audio_duration_s"],
embed_latency=embed_task.outputs["latency_s"],
retrieve_latency=retrieve_task.outputs["latency_s"],
rerank_latency=rerank_task.outputs["latency_s"],
llm_latency=llm_task.outputs["latency_s"],
llm_completion_tokens=llm_task.outputs["completion_tokens"],
tts_latency=tts_task.outputs["latency_s"],
)
@dsl.pipeline(
@@ -265,7 +394,7 @@ def text_to_speech_pipeline(text: str):
@dsl.pipeline(
name="rag-query-pipeline",
description="RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM"
description="RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM. Logs per-step latency to MLflow."
)
def rag_query_pipeline(
query: str,
@@ -283,20 +412,20 @@ def rag_query_pipeline(
# Retrieve from Milvus
retrieve_task = retrieve_context(
embedding=embed_task.output,
embedding=embed_task.outputs["embedding"],
collection_name=collection_name
)
# Rerank
rerank_task = rerank_documents(
query=query,
documents=retrieve_task.output
documents=retrieve_task.outputs["documents"]
)
# Generate response
llm_task = generate_response(
query=query,
context=rerank_task.output
context=rerank_task.outputs["documents"]
)

656
voice_pipeline.yaml Normal file
View File

@@ -0,0 +1,656 @@
# PIPELINE DEFINITION
# Name: voice-assistant-rag-pipeline
# Description: End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow.
# Inputs:
# audio_b64: str
# collection_name: str [Default: 'knowledge_base']
# Component I/O declarations (KFP v2 IR). Each comp-* entry is bound to the
# matching exec-* executor in deploymentSpec via executorLabel. Every worker
# component also emits a latency_s output consumed by comp-log-pipeline-metrics.
components:
# text -> embedding vector (LIST) + call latency.
comp-generate-embeddings:
executorLabel: exec-generate-embeddings
inputDefinitions:
parameters:
embeddings_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
isOptional: true
parameterType: STRING
text:
parameterType: STRING
outputDefinitions:
parameters:
embedding:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
# (query, reranked context docs) -> LLM answer text, latency, completion-token count.
comp-generate-response:
executorLabel: exec-generate-response
inputDefinitions:
parameters:
context:
parameterType: LIST
model:
defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
isOptional: true
parameterType: STRING
query:
parameterType: STRING
vllm_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
completion_tokens:
parameterType: NUMBER_INTEGER
latency_s:
parameterType: NUMBER_DOUBLE
text:
parameterType: STRING
# Fan-in metrics sink: takes every step's latency (plus STT audio duration and
# LLM token count) and returns the created MLflow run id as "Output".
comp-log-pipeline-metrics:
executorLabel: exec-log-pipeline-metrics
inputDefinitions:
parameters:
embed_latency:
parameterType: NUMBER_DOUBLE
experiment_name:
defaultValue: voice-pipeline-metrics
isOptional: true
parameterType: STRING
llm_completion_tokens:
parameterType: NUMBER_INTEGER
llm_latency:
parameterType: NUMBER_DOUBLE
mlflow_tracking_uri:
defaultValue: http://mlflow.mlflow.svc.cluster.local:80
isOptional: true
parameterType: STRING
rerank_latency:
parameterType: NUMBER_DOUBLE
retrieve_latency:
parameterType: NUMBER_DOUBLE
run_name:
defaultValue: voice-pipeline
isOptional: true
parameterType: STRING
stt_audio_duration:
parameterType: NUMBER_DOUBLE
stt_latency:
parameterType: NUMBER_DOUBLE
tts_latency:
parameterType: NUMBER_DOUBLE
outputDefinitions:
parameters:
Output:
parameterType: STRING
# (query, candidate docs) -> top_k reranked docs + latency.
comp-rerank-documents:
executorLabel: exec-rerank-documents
inputDefinitions:
parameters:
documents:
parameterType: LIST
query:
parameterType: STRING
reranker_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
isOptional: true
parameterType: STRING
# Integer default rendered as 3.0 — KFP IR serializes int defaults via
# protobuf doubles; parameterType stays NUMBER_INTEGER. Not a bug.
top_k:
defaultValue: 3.0
isOptional: true
parameterType: NUMBER_INTEGER
outputDefinitions:
parameters:
documents:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
# embedding -> top_k nearest docs from the Milvus collection + latency.
comp-retrieve-context:
executorLabel: exec-retrieve-context
inputDefinitions:
parameters:
collection_name:
defaultValue: knowledge_base
isOptional: true
parameterType: STRING
embedding:
parameterType: LIST
milvus_host:
defaultValue: milvus.ai-ml.svc.cluster.local
isOptional: true
parameterType: STRING
top_k:
defaultValue: 5.0
isOptional: true
parameterType: NUMBER_INTEGER
outputDefinitions:
parameters:
documents:
parameterType: LIST
latency_s:
parameterType: NUMBER_DOUBLE
# text -> base64-encoded WAV audio + latency.
comp-synthesize-speech:
executorLabel: exec-synthesize-speech
inputDefinitions:
parameters:
text:
parameterType: STRING
tts_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
audio_b64:
parameterType: STRING
latency_s:
parameterType: NUMBER_DOUBLE
# base64 audio -> transcript text, latency, estimated source-audio duration.
comp-transcribe-audio:
executorLabel: exec-transcribe-audio
inputDefinitions:
parameters:
audio_b64:
parameterType: STRING
whisper_url:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
audio_duration_s:
parameterType: NUMBER_DOUBLE
latency_s:
parameterType: NUMBER_DOUBLE
text:
parameterType: STRING
# Executor definitions. Each runs python:3.13-slim, pip-installs kfp + its
# runtime deps at container start, then executes the embedded component
# function (the final double-quoted scalar holds its Python source verbatim
# — do not hand-edit these strings; regenerate by recompiling voice_pipeline.py).
deploymentSpec:
executors:
# Embeds the transcript via httpx POST, model bge-small-en-v1.5.
# NOTE(review): default embeddings_url already ends in /embeddings and the code
# posts to f"{embeddings_url}/embeddings", i.e. .../embeddings/embeddings —
# every other executor appends /v1/<op> to its base URL. Confirm the gateway
# route actually serves this doubled path.
exec-generate-embeddings:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_embeddings
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_embeddings(\n text: str,\n embeddings_url: str\
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\
\n) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\"\
, float)]):\n \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n \
\ import time\n import httpx\n from collections import namedtuple\n\
\n start = time.perf_counter()\n with httpx.Client(timeout=60.0) as\
\ client:\n response = client.post(\n f\"{embeddings_url}/embeddings\"\
,\n json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
\ )\n result = response.json()\n latency = time.perf_counter()\
\ - start\n\n EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\"\
, \"latency_s\"])\n return EmbedResult(result[\"data\"][0][\"embedding\"\
], latency)\n\n"
image: python:3.13-slim
# Chat completion against vLLM: prepends retrieved docs as "Context:" in the
# user message when present, fixed system prompt tuned for speech output,
# max_tokens=512, temperature=0.7, 180 s timeout. Falls back to a whitespace
# token count if the server omits usage.completion_tokens.
exec-generate-response:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_response
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_response(\n query: str,\n context: list,\n \
\ vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\"\
,\n model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\
\n) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float),\
\ (\"completion_tokens\", int)]):\n \"\"\"Generate response using vLLM.\"\
\"\"\n import time\n import httpx\n from collections import namedtuple\n\
\n # Build context\n if context:\n context_text = \"\\n\\n\"\
.join([doc[\"text\"] for doc in context])\n user_content = f\"Context:\\\
n{context_text}\\n\\nQuestion: {query}\"\n else:\n user_content\
\ = query\n\n system_prompt = \"\"\"You are a helpful voice assistant.\n\
Answer questions based on the provided context when available.\nKeep responses\
\ concise and natural for speech synthesis.\"\"\"\n\n messages = [\n\
\ {\"role\": \"system\", \"content\": system_prompt},\n {\"\
role\": \"user\", \"content\": user_content}\n ]\n\n start = time.perf_counter()\n\
\ with httpx.Client(timeout=180.0) as client:\n response = client.post(\n\
\ f\"{vllm_url}/v1/chat/completions\",\n json={\n\
\ \"model\": model,\n \"messages\": messages,\n\
\ \"max_tokens\": 512,\n \"temperature\":\
\ 0.7\n }\n )\n result = response.json()\n latency\
\ = time.perf_counter() - start\n\n text = result[\"choices\"][0][\"\
message\"][\"content\"]\n usage = result.get(\"usage\", {})\n completion_tokens\
\ = usage.get(\"completion_tokens\", len(text.split()))\n\n LLMResult\
\ = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"\
])\n return LLMResult(text, latency, completion_tokens)\n\n"
image: python:3.13-slim
# Gets-or-creates the MLflow experiment, starts a run tagged with the KFP run
# id (from env KFP_RUN_ID, "unknown" if unset), logs all per-step latencies
# plus derived stt_realtime_factor and llm_tokens_per_second (guarded against
# division by zero), then returns the run id. Installs mlflow/boto3/
# psycopg2-binary at container start, so this step's own startup is slow.
exec-log-pipeline-metrics:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- log_pipeline_metrics
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef log_pipeline_metrics(\n stt_latency: float,\n stt_audio_duration:\
\ float,\n embed_latency: float,\n retrieve_latency: float,\n rerank_latency:\
\ float,\n llm_latency: float,\n llm_completion_tokens: int,\n \
\ tts_latency: float,\n experiment_name: str = \"voice-pipeline-metrics\"\
,\n run_name: str = \"voice-pipeline\",\n mlflow_tracking_uri: str\
\ = \"http://mlflow.mlflow.svc.cluster.local:80\",\n) -> str:\n \"\"\"\
Log per-step latency metrics to MLflow for the full voice pipeline.\"\"\"\
\n import os\n import mlflow\n from mlflow.tracking import MlflowClient\n\
\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
\n exp = client.get_experiment_by_name(experiment_name)\n experiment_id\
\ = (\n exp.experiment_id\n if exp\n else client.create_experiment(\n\
\ name=experiment_name,\n artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
,\n )\n )\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\n\
\ run_name=run_name,\n tags={\n \"pipeline.type\"\
: \"voice-assistant\",\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\"\
, \"unknown\"),\n },\n )\n\n total_latency = (\n stt_latency\
\ + embed_latency + retrieve_latency\n + rerank_latency + llm_latency\
\ + tts_latency\n )\n stt_rtf = stt_latency / stt_audio_duration if\
\ stt_audio_duration > 0 else 0\n llm_tps = llm_completion_tokens / llm_latency\
\ if llm_latency > 0 else 0\n\n mlflow.log_metrics({\n \"stt_latency_s\"\
: stt_latency,\n \"stt_audio_duration_s\": stt_audio_duration,\n\
\ \"stt_realtime_factor\": stt_rtf,\n \"embed_latency_s\"\
: embed_latency,\n \"retrieve_latency_s\": retrieve_latency,\n \
\ \"rerank_latency_s\": rerank_latency,\n \"llm_latency_s\"\
: llm_latency,\n \"llm_completion_tokens\": llm_completion_tokens,\n\
\ \"llm_tokens_per_second\": llm_tps,\n \"tts_latency_s\"\
: tts_latency,\n \"total_pipeline_latency_s\": total_latency,\n \
\ })\n mlflow.end_run()\n return run.info.run_id\n\n"
image: python:3.13-slim
# Reranks retrieved docs via POST {reranker_url}/v1/rerank (bge-reranker-v2-m3),
# sorts by score descending and truncates to top_k. Short-circuits an empty
# document list with latency 0.0 (no HTTP call). If the response omits
# "scores", all scores default to 0 and the stable sort keeps original order.
exec-rerank-documents:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- rerank_documents
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef rerank_documents(\n query: str,\n documents: list,\n \
\ reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\"\
,\n top_k: int = 3\n) -> NamedTuple(\"RerankResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Rerank documents using BGE\
\ reranker.\"\"\"\n import time\n import httpx\n from collections\
\ import namedtuple\n\n if not documents:\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([],\
\ 0.0)\n\n start = time.perf_counter()\n with httpx.Client(timeout=60.0)\
\ as client:\n response = client.post(\n f\"{reranker_url}/v1/rerank\"\
,\n json={\n \"query\": query,\n \
\ \"documents\": [doc[\"text\"] for doc in documents],\n \
\ \"model\": \"bge-reranker-v2-m3\"\n }\n )\n \
\ result = response.json()\n latency = time.perf_counter() - start\n\
\n # Sort by rerank score\n reranked = sorted(\n zip(documents,\
\ result.get(\"scores\", [0] * len(documents))),\n key=lambda x:\
\ x[1],\n reverse=True\n )[:top_k]\n\n RerankResult = namedtuple(\"\
RerankResult\", [\"documents\", \"latency_s\"])\n return RerankResult([doc\
\ for doc, score in reranked], latency)\n\n"
image: python:3.13-slim
# Vector search in Milvus (port 19530 hard-coded in the embedded code):
# COSINE metric, nprobe=10, returns up to top_k docs as
# {text, source, score}. Returns [] (with connect latency) when the
# collection does not exist rather than failing the pipeline.
exec-retrieve-context:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- retrieve_context
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
\ \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef retrieve_context(\n embedding: list,\n milvus_host: str\
\ = \"milvus.ai-ml.svc.cluster.local\",\n collection_name: str = \"knowledge_base\"\
,\n top_k: int = 5\n) -> NamedTuple(\"RetrieveResult\", [(\"documents\"\
, list), (\"latency_s\", float)]):\n \"\"\"Retrieve relevant documents\
\ from Milvus vector database.\"\"\"\n import time\n from pymilvus\
\ import connections, Collection, utility\n from collections import namedtuple\n\
\n start = time.perf_counter()\n connections.connect(host=milvus_host,\
\ port=19530)\n\n if not utility.has_collection(collection_name):\n \
\ latency = time.perf_counter() - start\n RetrieveResult =\
\ namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult([], latency)\n\n collection = Collection(collection_name)\n\
\ collection.load()\n\n results = collection.search(\n data=[embedding],\n\
\ anns_field=\"embedding\",\n param={\"metric_type\": \"COSINE\"\
, \"params\": {\"nprobe\": 10}},\n limit=top_k,\n output_fields=[\"\
text\", \"source\"]\n )\n latency = time.perf_counter() - start\n\n\
\ documents = []\n for hits in results:\n for hit in hits:\n\
\ documents.append({\n \"text\": hit.entity.get(\"\
text\"),\n \"source\": hit.entity.get(\"source\"),\n \
\ \"score\": hit.distance\n })\n\n RetrieveResult\
\ = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n \
\ return RetrieveResult(documents, latency)\n\n"
image: python:3.13-slim
# TTS via POST {tts_url}/v1/audio/speech (voice en_US-lessac-high, wav);
# base64-encodes the raw response body. Note the latency_s it reports is
# measured after the base64 encode, so it includes encoding time.
exec-synthesize-speech:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- synthesize_speech
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef synthesize_speech(\n text: str,\n tts_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts\"\
\n) -> NamedTuple(\"TTSResult\", [(\"audio_b64\", str), (\"latency_s\",\
\ float)]):\n \"\"\"Convert text to speech using TTS service.\"\"\"\n\
\ import base64\n import time\n import httpx\n from collections\
\ import namedtuple\n\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
\ as client:\n response = client.post(\n f\"{tts_url}/v1/audio/speech\"\
,\n json={\n \"input\": text,\n \
\ \"voice\": \"en_US-lessac-high\",\n \"response_format\"\
: \"wav\"\n }\n )\n audio_b64 = base64.b64encode(response.content).decode(\"\
utf-8\")\n latency = time.perf_counter() - start\n\n TTSResult = namedtuple(\"\
TTSResult\", [\"audio_b64\", \"latency_s\"])\n return TTSResult(audio_b64,\
\ latency)\n\n"
image: python:3.13-slim
# Whisper STT via multipart POST to /v1/audio/transcriptions
# (model whisper-large-v3, language en). audio_duration_s is only an
# estimate: total byte length / (16000 Hz * 2 bytes), floored at 0.1 s —
# it assumes 16 kHz 16-bit mono and counts the WAV header bytes too, so the
# derived stt_realtime_factor is approximate.
exec-transcribe-audio:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- transcribe_audio
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef transcribe_audio(\n audio_b64: str,\n whisper_url: str\
\ = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper\"\
\n) -> NamedTuple(\"STTResult\", [(\"text\", str), (\"latency_s\", float),\
\ (\"audio_duration_s\", float)]):\n \"\"\"Transcribe audio using Whisper\
\ STT service.\"\"\"\n import base64\n import time\n import httpx\n\
\ from collections import namedtuple\n\n audio_bytes = base64.b64decode(audio_b64)\n\
\n start = time.perf_counter()\n with httpx.Client(timeout=120.0)\
\ as client:\n response = client.post(\n f\"{whisper_url}/v1/audio/transcriptions\"\
,\n files={\"file\": (\"audio.wav\", audio_bytes, \"audio/wav\"\
)},\n data={\"model\": \"whisper-large-v3\", \"language\": \"\
en\"}\n )\n result = response.json()\n latency = time.perf_counter()\
\ - start\n\n text = result.get(\"text\", \"\")\n # Estimate audio\
\ duration from WAV header (16-bit PCM, 16kHz)\n audio_duration = max(len(audio_bytes)\
\ / (16000 * 2), 0.1)\n\n STTResult = namedtuple(\"STTResult\", [\"text\"\
, \"latency_s\", \"audio_duration_s\"])\n return STTResult(text, latency,\
\ audio_duration)\n\n"
image: python:3.13-slim
# Pipeline identity shown in the KFP UI; name must stay stable for runs to
# group under the same pipeline.
pipelineInfo:
description: 'End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus ->
Rerank -> LLM -> TTS. Logs per-step latency to MLflow.'
name: voice-assistant-rag-pipeline
# DAG wiring: transcribe-audio -> generate-embeddings -> retrieve-context ->
# rerank-documents -> generate-response -> synthesize-speech, with
# log-pipeline-metrics fanning in every step's latency output.
# NOTE(review): all worker tasks except transcribe-audio have enableCache: true;
# a cache hit replays the stored latency_s instead of measuring a fresh call —
# confirm that is acceptable for the MLflow latency metrics.
root:
dag:
tasks:
# Embed the STT transcript (input: transcribe-audio.text).
generate-embeddings:
cachingOptions:
enableCache: true
componentRef:
name: comp-generate-embeddings
dependentTasks:
- transcribe-audio
inputs:
parameters:
text:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: generate-embeddings
# LLM answer from the transcript plus reranked context docs.
generate-response:
cachingOptions:
enableCache: true
componentRef:
name: comp-generate-response
dependentTasks:
- rerank-documents
- transcribe-audio
inputs:
parameters:
context:
taskOutputParameter:
outputParameterKey: documents
producerTask: rerank-documents
query:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: generate-response
# Fan-in sink: collects every step's latency output and writes one MLflow run.
# FIX: caching was enabled here (cachingOptions: enableCache: true). Writing to
# MLflow is a side effect — a KFP cache hit skips execution entirely, so
# repeated runs would log nothing. Caching is now disabled (empty
# cachingOptions, same form the transcribe-audio task already uses).
# Keep in sync with voice_pipeline.py: add
# log_task.set_caching_options(False) there before the next recompile so this
# artifact is not regenerated with caching re-enabled.
log-pipeline-metrics:
cachingOptions: {}
componentRef:
name: comp-log-pipeline-metrics
dependentTasks:
- generate-embeddings
- generate-response
- rerank-documents
- retrieve-context
- synthesize-speech
- transcribe-audio
inputs:
parameters:
embed_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: generate-embeddings
llm_completion_tokens:
taskOutputParameter:
outputParameterKey: completion_tokens
producerTask: generate-response
llm_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: generate-response
rerank_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: rerank-documents
retrieve_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: retrieve-context
stt_audio_duration:
taskOutputParameter:
outputParameterKey: audio_duration_s
producerTask: transcribe-audio
stt_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: transcribe-audio
tts_latency:
taskOutputParameter:
outputParameterKey: latency_s
producerTask: synthesize-speech
taskInfo:
name: log-pipeline-metrics
# Rerank retrieved docs against the transcript query.
rerank-documents:
cachingOptions:
enableCache: true
componentRef:
name: comp-rerank-documents
dependentTasks:
- retrieve-context
- transcribe-audio
inputs:
parameters:
documents:
taskOutputParameter:
outputParameterKey: documents
producerTask: retrieve-context
query:
taskOutputParameter:
outputParameterKey: text
producerTask: transcribe-audio
taskInfo:
name: rerank-documents
# Milvus vector search on the transcript embedding; collection_name comes
# from the pipeline input.
retrieve-context:
cachingOptions:
enableCache: true
componentRef:
name: comp-retrieve-context
dependentTasks:
- generate-embeddings
inputs:
parameters:
collection_name:
componentInputParameter: collection_name
embedding:
taskOutputParameter:
outputParameterKey: embedding
producerTask: generate-embeddings
taskInfo:
name: retrieve-context
# TTS of the LLM answer text.
synthesize-speech:
cachingOptions:
enableCache: true
componentRef:
name: comp-synthesize-speech
dependentTasks:
- generate-response
inputs:
parameters:
text:
taskOutputParameter:
outputParameterKey: text
producerTask: generate-response
taskInfo:
name: synthesize-speech
# Pipeline entry task. Caching deliberately disabled (empty cachingOptions):
# transcription must run on every submission.
transcribe-audio:
cachingOptions: {}
componentRef:
name: comp-transcribe-audio
inputs:
parameters:
audio_b64:
componentInputParameter: audio_b64
taskInfo:
name: transcribe-audio
# Pipeline-level inputs: only the audio payload is required.
inputDefinitions:
parameters:
audio_b64:
description: Base64-encoded audio file
parameterType: STRING
collection_name:
defaultValue: knowledge_base
description: Milvus collection for RAG
isOptional: true
parameterType: STRING
# KFP v2 IR schema and the SDK that produced this file; regenerate with the
# same kfp version to avoid spurious diffs.
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1