feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow

New:
- vllm_tuning_pipeline.py: A/B benchmark different vLLM configs,
  logs latency/TPS/TTFT to MLflow (vllm-tuning experiment)
- vllm_tuning_pipeline.yaml: compiled KFP YAML

Updated:
- voice_pipeline.py: per-step NamedTuple outputs with latency tracking,
  new log_pipeline_metrics MLflow component
- voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
This commit is contained in:
2026-02-13 08:24:11 -05:00
parent cee21f124c
commit bc4b230dd9
6 changed files with 2216 additions and 26 deletions

501
vllm_tuning_pipeline.yaml Normal file
View File

@@ -0,0 +1,501 @@
# PIPELINE DEFINITION
# Name: vllm-tuning-evaluation
# Description: Benchmark vLLM with different tuning configurations. Logs latency, TPS, and TTFT to MLflow for A/B comparison.
# Inputs:
# enable_chunked_prefill: str [Default: 'true']
# enable_prefix_caching: str [Default: 'true']
# gpu_memory_utilization: str [Default: '0.90']
# llm_endpoint: str [Default: 'http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm']
# model_name: str [Default: 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4']
# ngram_prompt_lookup_max: str [Default: '4']
# num_iterations: int [Default: 3.0]
# num_speculative_tokens: str [Default: '3']
# num_warmup: int [Default: 2.0]
# run_label: str [Default: 'baseline']
# Interface declarations for each pipeline step. Each comp-* entry defines the
# input/output parameter schema; the matching container implementations live
# under deploymentSpec.executors.
components:
  comp-build-prompt-suite:
    executorLabel: exec-build-prompt-suite
    outputDefinitions:
      parameters:
        Output:
          parameterType: LIST
  comp-create-tuning-run:
    executorLabel: exec-create-tuning-run
    inputDefinitions:
      parameters:
        experiment_name:
          parameterType: STRING
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_name:
          parameterType: STRING
        tuning_params:
          parameterType: STRUCT
    outputDefinitions:
      parameters:
        experiment_id:
          parameterType: STRING
        run_id:
          parameterType: STRING
  comp-log-benchmark-results:
    executorLabel: exec-log-benchmark-results
    inputDefinitions:
      parameters:
        metrics:
          parameterType: STRUCT
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_id:
          parameterType: STRING
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  comp-run-benchmark:
    executorLabel: exec-run-benchmark
    inputDefinitions:
      parameters:
        llm_endpoint:
          parameterType: STRING
        model_name:
          parameterType: STRING
        num_iterations:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        num_warmup:
          defaultValue: 2.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        prompts:
          parameterType: LIST
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRUCT
deploymentSpec:
  executors:
    # Each executor starts from a bare python:3.13-slim image: it pip-installs
    # kfp plus the component's own dependencies, writes the embedded component
    # source to a temp file, then hands off to kfp's executor_main.
    # NOTE(review): this file is compiler output — prefer regenerating it from
    # vllm_tuning_pipeline.py over hand-editing the embedded sources.
    exec-build-prompt-suite:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - build_prompt_suite
        command:
        - sh
        - -c
        - |
          if ! [ -x "$(command -v pip)" ]; then
            python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
        - |
          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def build_prompt_suite() -> list:
              """Return a list of test prompts spanning short, medium, and long inputs."""
              return [
                  {
                      "id": "short-1",
                      "category": "short",
                      "messages": [
                          {"role": "user", "content": "What is the capital of France?"}
                      ],
                      "max_tokens": 64,
                  },
                  {
                      "id": "short-2",
                      "category": "short",
                      "messages": [
                          {"role": "user", "content": "Explain quantum computing in one sentence."}
                      ],
                      "max_tokens": 64,
                  },
                  {
                      "id": "medium-1",
                      "category": "medium",
                      "messages": [
                          {
                              "role": "system",
                              "content": "You are a helpful AI assistant running on a homelab.",
                          },
                          {
                              "role": "user",
                              "content": (
                                  "Compare and contrast supervised and unsupervised "
                                  "machine learning. Give examples of each and explain "
                                  "when you would choose one over the other."
                              ),
                          },
                      ],
                      "max_tokens": 512,
                  },
                  {
                      "id": "medium-2",
                      "category": "medium",
                      "messages": [
                          {
                              "role": "user",
                              "content": (
                                  "Write a Python function that implements a binary search "
                                  "tree with insert, search, and delete operations. Include "
                                  "docstrings and type hints."
                              ),
                          },
                      ],
                      "max_tokens": 1024,
                  },
                  {
                      "id": "long-1",
                      "category": "long",
                      "messages": [
                          {
                              "role": "system",
                              "content": "You are a technical writer for a Kubernetes homelab blog.",
                          },
                          {
                              "role": "user",
                              "content": (
                                  "Write a detailed tutorial on setting up a multi-node "
                                  "Kubernetes cluster with Talos Linux, covering: "
                                  "1) Hardware requirements and network topology, "
                                  "2) Talos machine config generation, "
                                  "3) Control plane bootstrapping, "
                                  "4) Worker node joining, "
                                  "5) CNI setup with Cilium, "
                                  "6) Storage with Rook-Ceph, "
                                  "7) GitOps with Flux CD. "
                                  "Include YAML examples for each step."
                              ),
                          },
                      ],
                      "max_tokens": 2048,
                  },
                  {
                      # Repeats the medium-1 system prompt + user prefix so runs
                      # with prefix caching enabled can be compared against it.
                      "id": "repeat-prefix-1",
                      "category": "prefix-cache-test",
                      "messages": [
                          {
                              "role": "system",
                              "content": "You are a helpful AI assistant running on a homelab.",
                          },
                          {
                              "role": "user",
                              "content": (
                                  "Compare and contrast supervised and unsupervised "
                                  "machine learning. Now focus specifically on "
                                  "reinforcement learning and how it differs."
                              ),
                          },
                      ],
                      "max_tokens": 512,
                  },
              ]
        image: python:3.13-slim
exec-create-tuning-run:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- create_tuning_run
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef create_tuning_run(\n experiment_name: str,\n run_name:\
\ str,\n tuning_params: dict,\n mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
,\n) -> NamedTuple(\"RunInfo\", [(\"run_id\", str), (\"experiment_id\",\
\ str)]):\n \"\"\"Create an MLflow run for a vLLM tuning experiment.\"\
\"\"\n import os\n import mlflow\n from mlflow.tracking import\
\ MlflowClient\n from collections import namedtuple\n\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
\ client = MlflowClient()\n\n exp = client.get_experiment_by_name(experiment_name)\n\
\ experiment_id = (\n exp.experiment_id\n if exp\n \
\ else client.create_experiment(\n name=experiment_name,\n\
\ artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
,\n )\n )\n\n tags = {\n \"pipeline.type\": \"vllm-tuning\"\
,\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\", \"unknown\"),\n\
\ }\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\
\ run_name=run_name, tags=tags\n )\n # Log every tuning param\n \
\ for key, value in tuning_params.items():\n mlflow.log_param(f\"\
vllm.{key}\", value)\n run_id = run.info.run_id\n mlflow.end_run()\n\
\n RunInfo = namedtuple(\"RunInfo\", [\"run_id\", \"experiment_id\"])\n\
\ return RunInfo(run_id, experiment_id)\n\n"
image: python:3.13-slim
exec-log-benchmark-results:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- log_benchmark_results
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef log_benchmark_results(\n run_id: str,\n metrics: dict,\n\
\ mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
,\n) -> str:\n \"\"\"Log benchmark metrics to MLflow and close the run.\"\
\"\"\n import json\n import tempfile\n import mlflow\n from\
\ mlflow.tracking import MlflowClient\n from pathlib import Path\n\n\
\ mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
\n for key, value in metrics.items():\n client.log_metric(run_id,\
\ key, float(value))\n\n # Save full results as artifact\n with tempfile.TemporaryDirectory()\
\ as tmpdir:\n path = Path(tmpdir) / \"benchmark_results.json\"\n\
\ path.write_text(json.dumps(metrics, indent=2))\n client.log_artifact(run_id,\
\ str(path))\n\n client.set_terminated(run_id, status=\"FINISHED\")\n\
\ return run_id\n\n"
image: python:3.13-slim
exec-run-benchmark:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- run_benchmark
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_benchmark(\n prompts: list,\n llm_endpoint: str,\n\
\ model_name: str,\n num_warmup: int = 2,\n num_iterations: int\
\ = 3,\n) -> dict:\n \"\"\"\n Run all prompts through the LLM endpoint\
\ and collect timing metrics.\n\n Returns aggregate metrics: p50/p95/mean\
\ latency, tokens/sec, TTFT.\n \"\"\"\n import time\n import statistics\n\
\ import httpx\n\n all_latencies: list[float] = []\n all_tps: list[float]\
\ = []\n all_ttft: list[float] = []\n per_category: dict[str, list[float]]\
\ = {}\n\n with httpx.Client(timeout=300.0) as client:\n # Warmup\n\
\ for _ in range(num_warmup):\n try:\n \
\ client.post(\n f\"{llm_endpoint}/v1/chat/completions\"\
,\n json={\n \"model\": model_name,\n\
\ \"messages\": [{\"role\": \"user\", \"content\"\
: \"Hi\"}],\n \"max_tokens\": 8,\n \
\ \"temperature\": 0,\n },\n \
\ )\n except Exception:\n pass\n\n # Benchmark\n\
\ for iteration in range(num_iterations):\n for prompt\
\ in prompts:\n category = prompt.get(\"category\", \"unknown\"\
)\n payload = {\n \"model\": model_name,\n\
\ \"messages\": prompt[\"messages\"],\n \
\ \"max_tokens\": prompt.get(\"max_tokens\", 256),\n \
\ \"temperature\": 0,\n \"stream\": True,\n \
\ }\n\n try:\n t_start = time.perf_counter()\n\
\ first_token_time = None\n\n with\
\ client.stream(\n \"POST\",\n \
\ f\"{llm_endpoint}/v1/chat/completions\",\n \
\ json=payload,\n ) as resp:\n \
\ resp.raise_for_status()\n completion_tokens =\
\ 0\n for line in resp.iter_lines():\n \
\ if not line.startswith(\"data: \"):\n \
\ continue\n chunk = line[6:]\n\
\ if chunk == \"[DONE]\":\n \
\ break\n if first_token_time is\
\ None:\n first_token_time = time.perf_counter()\n\
\ completion_tokens += 1\n\n \
\ t_end = time.perf_counter()\n latency = t_end -\
\ t_start\n ttft = (\n (first_token_time\
\ - t_start)\n if first_token_time\n \
\ else latency\n )\n tps\
\ = (\n completion_tokens / latency if latency >\
\ 0 else 0\n )\n\n all_latencies.append(latency)\n\
\ all_tps.append(tps)\n all_ttft.append(ttft)\n\
\ per_category.setdefault(category, []).append(latency)\n\
\n except Exception as exc:\n # Record\
\ failure but keep going\n all_latencies.append(-1)\n\
\ all_tps.append(0)\n all_ttft.append(-1)\n\
\n # Compute aggregates\n valid_latencies = [l for l in all_latencies\
\ if l > 0]\n valid_tps = [t for t in all_tps if t > 0]\n valid_ttft\
\ = [t for t in all_ttft if t > 0]\n\n def safe_stat(values, func):\n\
\ return func(values) if values else 0\n\n metrics = {\n \
\ \"total_requests\": len(all_latencies),\n \"successful_requests\"\
: len(valid_latencies),\n \"failed_requests\": len(all_latencies)\
\ - len(valid_latencies),\n # Latency\n \"latency_mean_s\"\
: safe_stat(valid_latencies, statistics.mean),\n \"latency_p50_s\"\
: safe_stat(\n valid_latencies,\n lambda v: statistics.median(v),\n\
\ ),\n \"latency_p95_s\": safe_stat(\n valid_latencies,\n\
\ lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,\n \
\ ),\n # Throughput\n \"tokens_per_second_mean\": safe_stat(valid_tps,\
\ statistics.mean),\n \"tokens_per_second_p50\": safe_stat(\n \
\ valid_tps, lambda v: statistics.median(v)\n ),\n \
\ # Time to first token\n \"ttft_mean_s\": safe_stat(valid_ttft,\
\ statistics.mean),\n \"ttft_p50_s\": safe_stat(valid_ttft, lambda\
\ v: statistics.median(v)),\n \"ttft_p95_s\": safe_stat(\n \
\ valid_ttft,\n lambda v: sorted(v)[int(len(v) * 0.95)]\
\ if v else 0,\n ),\n }\n\n # Per-category latency\n for\
\ cat, lats in per_category.items():\n valid = [l for l in lats if\
\ l > 0]\n if valid:\n metrics[f\"latency_mean_{cat}_s\"\
] = statistics.mean(valid)\n\n return metrics\n\n"
image: python:3.13-slim
pipelineInfo:
  description: Benchmark vLLM with different tuning configurations. Logs latency,
    TPS, and TTFT to MLflow for A/B comparison.
  name: vllm-tuning-evaluation
# Task graph: build-prompt-suite -> run-benchmark -> log-benchmark-results,
# with create-tuning-run supplying the MLflow run_id to the logging step.
root:
  dag:
    tasks:
      build-prompt-suite:
        # Pure data producer; safe to cache.
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-build-prompt-suite
        taskInfo:
          name: build-prompt-suite
      create-tuning-run:
        # FIX: caching disabled (was enableCache: true). This step creates a
        # new MLflow run — a side effect. A cache hit would replay a stale
        # run_id for a run already marked FINISHED, so log-benchmark-results
        # would append to an old run instead of creating a fresh one.
        # run-benchmark below already disables caching for the same reason.
        # Mirror this with .set_caching_options(False) in the pipeline .py
        # source so a recompile does not reintroduce the problem.
        cachingOptions: {}
        componentRef:
          name: comp-create-tuning-run
        inputs:
          parameters:
            experiment_name:
              runtimeValue:
                constant: vllm-tuning
            pipelinechannel--enable_chunked_prefill:
              componentInputParameter: enable_chunked_prefill
            pipelinechannel--enable_prefix_caching:
              componentInputParameter: enable_prefix_caching
            pipelinechannel--gpu_memory_utilization:
              componentInputParameter: gpu_memory_utilization
            pipelinechannel--llm_endpoint:
              componentInputParameter: llm_endpoint
            pipelinechannel--model_name:
              componentInputParameter: model_name
            pipelinechannel--ngram_prompt_lookup_max:
              componentInputParameter: ngram_prompt_lookup_max
            pipelinechannel--num_iterations:
              componentInputParameter: num_iterations
            pipelinechannel--num_speculative_tokens:
              componentInputParameter: num_speculative_tokens
            pipelinechannel--num_warmup:
              componentInputParameter: num_warmup
            pipelinechannel--run_label:
              componentInputParameter: run_label
            run_name:
              runtimeValue:
                constant: vllm-{{$.inputs.parameters['pipelinechannel--run_label']}}
            tuning_params:
              runtimeValue:
                constant:
                  enable_chunked_prefill: '{{$.inputs.parameters[''pipelinechannel--enable_chunked_prefill'']}}'
                  enable_prefix_caching: '{{$.inputs.parameters[''pipelinechannel--enable_prefix_caching'']}}'
                  gpu_memory_utilization: '{{$.inputs.parameters[''pipelinechannel--gpu_memory_utilization'']}}'
                  llm_endpoint: '{{$.inputs.parameters[''pipelinechannel--llm_endpoint'']}}'
                  model_name: '{{$.inputs.parameters[''pipelinechannel--model_name'']}}'
                  ngram_prompt_lookup_max: '{{$.inputs.parameters[''pipelinechannel--ngram_prompt_lookup_max'']}}'
                  num_iterations: '{{$.inputs.parameters[''pipelinechannel--num_iterations'']}}'
                  num_speculative_tokens: '{{$.inputs.parameters[''pipelinechannel--num_speculative_tokens'']}}'
                  num_warmup: '{{$.inputs.parameters[''pipelinechannel--num_warmup'']}}'
        taskInfo:
          name: create-tuning-run
      log-benchmark-results:
        # FIX: caching disabled (was enableCache: true). Logging metrics and
        # terminating the MLflow run are side effects that must happen on
        # every execution, not be served from cache.
        cachingOptions: {}
        componentRef:
          name: comp-log-benchmark-results
        dependentTasks:
        - create-tuning-run
        - run-benchmark
        inputs:
          parameters:
            metrics:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: run-benchmark
            run_id:
              taskOutputParameter:
                outputParameterKey: run_id
                producerTask: create-tuning-run
        taskInfo:
          name: log-benchmark-results
      run-benchmark:
        # Caching intentionally disabled: timing measurements must be fresh.
        cachingOptions: {}
        componentRef:
          name: comp-run-benchmark
        dependentTasks:
        - build-prompt-suite
        inputs:
          parameters:
            llm_endpoint:
              componentInputParameter: llm_endpoint
            model_name:
              componentInputParameter: model_name
            num_iterations:
              componentInputParameter: num_iterations
            num_warmup:
              componentInputParameter: num_warmup
            prompts:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: build-prompt-suite
        taskInfo:
          name: run-benchmark
inputDefinitions:
parameters:
enable_chunked_prefill:
defaultValue: 'true'
description: '"true" or "false"'
isOptional: true
parameterType: STRING
enable_prefix_caching:
defaultValue: 'true'
description: '"true" or "false"'
isOptional: true
parameterType: STRING
gpu_memory_utilization:
defaultValue: '0.90'
description: 0.0 - 1.0
isOptional: true
parameterType: STRING
llm_endpoint:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
description: vLLM inference endpoint URL
isOptional: true
parameterType: STRING
model_name:
defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
description: HF model identifier
isOptional: true
parameterType: STRING
ngram_prompt_lookup_max:
defaultValue: '4'
description: ngram window for spec decode (0 = off)
isOptional: true
parameterType: STRING
num_iterations:
defaultValue: 3.0
description: how many times to repeat the prompt suite
isOptional: true
parameterType: NUMBER_INTEGER
num_speculative_tokens:
defaultValue: '3'
description: number of speculative tokens (0 = off)
isOptional: true
parameterType: STRING
num_warmup:
defaultValue: 2.0
description: warmup requests before timing
isOptional: true
parameterType: NUMBER_INTEGER
run_label:
defaultValue: baseline
description: human-readable label (e.g. "apc-on-spec3")
isOptional: true
parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1