# PIPELINE DEFINITION
# Name: vllm-tuning-evaluation
# Description: Benchmark vLLM with different tuning configurations. Logs latency,
#              TPS, and TTFT to MLflow for A/B comparison.
# Inputs:
#    enable_chunked_prefill: str [Default: 'true']
#    enable_prefix_caching: str [Default: 'true']
#    gpu_memory_utilization: str [Default: '0.90']
#    llm_endpoint: str [Default: 'http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm']
#    model_name: str [Default: 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4']
#    ngram_prompt_lookup_max: str [Default: '4']
#    num_iterations: int [Default: 3.0]
#    num_speculative_tokens: str [Default: '3']
#    num_warmup: int [Default: 2.0]
#    run_label: str [Default: 'baseline']
#
# NOTE(review): this is compiled KFP IR — prefer regenerating from the pipeline
# DSL over hand-editing.
components:
  # Emits the list of benchmark prompts (LIST output parameter).
  comp-build-prompt-suite:
    executorLabel: exec-build-prompt-suite
    outputDefinitions:
      parameters:
        Output:
          parameterType: LIST
  # Opens an MLflow run and logs the tuning parameters; returns run/experiment ids.
  comp-create-tuning-run:
    executorLabel: exec-create-tuning-run
    inputDefinitions:
      parameters:
        experiment_name:
          parameterType: STRING
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_name:
          parameterType: STRING
        tuning_params:
          parameterType: STRUCT
    outputDefinitions:
      parameters:
        experiment_id:
          parameterType: STRING
        run_id:
          parameterType: STRING
  # Writes aggregate benchmark metrics into the MLflow run and terminates it.
  comp-log-benchmark-results:
    executorLabel: exec-log-benchmark-results
    inputDefinitions:
      parameters:
        metrics:
          parameterType: STRUCT
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_id:
          parameterType: STRING
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  # Drives the prompt suite against the vLLM endpoint and returns metrics (STRUCT).
  comp-run-benchmark:
    executorLabel: exec-run-benchmark
    inputDefinitions:
      parameters:
        llm_endpoint:
          parameterType: STRING
        model_name:
          parameterType: STRING
        num_iterations:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        num_warmup:
          defaultValue: 2.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        prompts:
          parameterType: LIST
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRUCT
deploymentSpec:
  executors:
    # Builds the static prompt suite. httpx is installed but unused here; the
    # bootstrap is shared with exec-run-benchmark.
    exec-build-prompt-suite:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - build_prompt_suite
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def build_prompt_suite() -> list:
                """Return a list of test prompts spanning short, medium, and long inputs."""
                return [
                    {
                        "id": "short-1",
                        "category": "short",
                        "messages": [
                            {"role": "user", "content": "What is the capital of France?"}
                        ],
                        "max_tokens": 64,
                    },
                    {
                        "id": "short-2",
                        "category": "short",
                        "messages": [
                            {"role": "user", "content": "Explain quantum computing in one sentence."}
                        ],
                        "max_tokens": 64,
                    },
                    {
                        "id": "medium-1",
                        "category": "medium",
                        "messages": [
                            {
                                "role": "system",
                                "content": "You are a helpful AI assistant running on a homelab.",
                            },
                            {
                                "role": "user",
                                "content": (
                                    "Compare and contrast supervised and unsupervised "
                                    "machine learning. Give examples of each and explain "
                                    "when you would choose one over the other."
                                ),
                            },
                        ],
                        "max_tokens": 512,
                    },
                    {
                        "id": "medium-2",
                        "category": "medium",
                        "messages": [
                            {
                                "role": "user",
                                "content": (
                                    "Write a Python function that implements a binary search "
                                    "tree with insert, search, and delete operations. Include "
                                    "docstrings and type hints."
                                ),
                            },
                        ],
                        "max_tokens": 1024,
                    },
                    {
                        "id": "long-1",
                        "category": "long",
                        "messages": [
                            {
                                "role": "system",
                                "content": "You are a technical writer for a Kubernetes homelab blog.",
                            },
                            {
                                "role": "user",
                                "content": (
                                    "Write a detailed tutorial on setting up a multi-node "
                                    "Kubernetes cluster with Talos Linux, covering: "
                                    "1) Hardware requirements and network topology, "
                                    "2) Talos machine config generation, "
                                    "3) Control plane bootstrapping, "
                                    "4) Worker node joining, "
                                    "5) CNI setup with Cilium, "
                                    "6) Storage with Rook-Ceph, "
                                    "7) GitOps with Flux CD. "
                                    "Include YAML examples for each step."
                                ),
                            },
                        ],
                        "max_tokens": 2048,
                    },
                    # Shares its system prompt and opening user text with medium-1 so
                    # prefix-caching configurations have a reusable prefix to hit.
                    {
                        "id": "repeat-prefix-1",
                        "category": "prefix-cache-test",
                        "messages": [
                            {
                                "role": "system",
                                "content": "You are a helpful AI assistant running on a homelab.",
                            },
                            {
                                "role": "user",
                                "content": (
                                    "Compare and contrast supervised and unsupervised "
                                    "machine learning. Now focus specifically on "
                                    "reinforcement learning and how it differs."
                                ),
                            },
                        ],
                        "max_tokens": 512,
                    },
                ]
        image: python:3.13-slim
    exec-create-tuning-run:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - create_tuning_run
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0' 'boto3' 'psycopg2-binary' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def create_tuning_run(
                experiment_name: str,
                run_name: str,
                tuning_params: dict,
                mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
            ) -> NamedTuple("RunInfo", [("run_id", str), ("experiment_id", str)]):
                """Create an MLflow run for a vLLM tuning experiment."""
                import os
                import mlflow
                from mlflow.tracking import MlflowClient
                from collections import namedtuple

                mlflow.set_tracking_uri(mlflow_tracking_uri)
                client = MlflowClient()

                # Reuse the experiment if it already exists; otherwise create it
                # with a fixed artifact root.
                exp = client.get_experiment_by_name(experiment_name)
                experiment_id = (
                    exp.experiment_id
                    if exp
                    else client.create_experiment(
                        name=experiment_name,
                        artifact_location=f"/mlflow/artifacts/{experiment_name}",
                    )
                )

                tags = {
                    "pipeline.type": "vllm-tuning",
                    "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
                }

                run = mlflow.start_run(
                    experiment_id=experiment_id, run_name=run_name, tags=tags
                )
                # Log every tuning param
                for key, value in tuning_params.items():
                    mlflow.log_param(f"vllm.{key}", value)
                run_id = run.info.run_id
                # End the run here; log-benchmark-results re-opens it by id and
                # marks it FINISHED after metrics are attached.
                mlflow.end_run()

                RunInfo = namedtuple("RunInfo", ["run_id", "experiment_id"])
                return RunInfo(run_id, experiment_id)
        image: python:3.13-slim
    exec-log-benchmark-results:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - log_benchmark_results
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0' 'boto3' 'psycopg2-binary' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def log_benchmark_results(
                run_id: str,
                metrics: dict,
                mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
            ) -> str:
                """Log benchmark metrics to MLflow and close the run."""
                import json
                import tempfile
                import mlflow
                from mlflow.tracking import MlflowClient
                from pathlib import Path

                mlflow.set_tracking_uri(mlflow_tracking_uri)
                client = MlflowClient()

                # Metrics produced by run_benchmark are all numeric; float() makes
                # that explicit for MLflow's API.
                for key, value in metrics.items():
                    client.log_metric(run_id, key, float(value))

                # Save full results as artifact
                with tempfile.TemporaryDirectory() as tmpdir:
                    path = Path(tmpdir) / "benchmark_results.json"
                    path.write_text(json.dumps(metrics, indent=2))
                    client.log_artifact(run_id, str(path))

                client.set_terminated(run_id, status="FINISHED")
                return run_id
        image: python:3.13-slim
    exec-run-benchmark:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - run_benchmark
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)

            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def run_benchmark(
                prompts: list,
                llm_endpoint: str,
                model_name: str,
                num_warmup: int = 2,
                num_iterations: int = 3,
            ) -> dict:
                """
                Run all prompts through the LLM endpoint and collect timing metrics.

                Returns aggregate metrics: p50/p95/mean latency, tokens/sec, TTFT.
                """
                import time
                import statistics
                import httpx

                all_latencies: list[float] = []
                all_tps: list[float] = []
                all_ttft: list[float] = []
                per_category: dict[str, list[float]] = {}

                with httpx.Client(timeout=300.0) as client:
                    # Warmup requests are best-effort: failures here must not abort
                    # the benchmark, so errors are deliberately swallowed.
                    for _ in range(num_warmup):
                        try:
                            client.post(
                                f"{llm_endpoint}/v1/chat/completions",
                                json={
                                    "model": model_name,
                                    "messages": [{"role": "user", "content": "Hi"}],
                                    "max_tokens": 8,
                                    "temperature": 0,
                                },
                            )
                        except Exception:
                            pass

                    # Benchmark: repeat the whole suite num_iterations times.
                    for _ in range(num_iterations):
                        for prompt in prompts:
                            category = prompt.get("category", "unknown")
                            payload = {
                                "model": model_name,
                                "messages": prompt["messages"],
                                "max_tokens": prompt.get("max_tokens", 256),
                                "temperature": 0,
                                "stream": True,
                            }

                            try:
                                t_start = time.perf_counter()
                                first_token_time = None

                                with client.stream(
                                    "POST",
                                    f"{llm_endpoint}/v1/chat/completions",
                                    json=payload,
                                ) as resp:
                                    resp.raise_for_status()
                                    completion_tokens = 0
                                    for line in resp.iter_lines():
                                        if not line.startswith("data: "):
                                            continue
                                        chunk = line[6:]
                                        if chunk == "[DONE]":
                                            break
                                        if first_token_time is None:
                                            first_token_time = time.perf_counter()
                                        # Counts SSE chunks as a proxy for tokens;
                                        # presumably one delta-token per chunk —
                                        # TODO confirm against vLLM's usage stats.
                                        completion_tokens += 1

                                t_end = time.perf_counter()
                                latency = t_end - t_start
                                # Fix: compare against None explicitly — the original
                                # truthiness test would misfire for a timestamp of 0.0.
                                ttft = (
                                    (first_token_time - t_start)
                                    if first_token_time is not None
                                    else latency
                                )
                                tps = (
                                    completion_tokens / latency if latency > 0 else 0
                                )

                                all_latencies.append(latency)
                                all_tps.append(tps)
                                all_ttft.append(ttft)
                                per_category.setdefault(category, []).append(latency)

                            except Exception:
                                # Record failure but keep going; -1 sentinels are
                                # filtered out of the aggregates below.
                                all_latencies.append(-1)
                                all_tps.append(0)
                                all_ttft.append(-1)

                # Compute aggregates over successful requests only.
                valid_latencies = [l for l in all_latencies if l > 0]
                valid_tps = [t for t in all_tps if t > 0]
                valid_ttft = [t for t in all_ttft if t > 0]

                def safe_stat(values, func):
                    # 0 when no request succeeded, so downstream float() never fails.
                    return func(values) if values else 0

                metrics = {
                    "total_requests": len(all_latencies),
                    "successful_requests": len(valid_latencies),
                    "failed_requests": len(all_latencies) - len(valid_latencies),
                    # Latency
                    "latency_mean_s": safe_stat(valid_latencies, statistics.mean),
                    "latency_p50_s": safe_stat(
                        valid_latencies,
                        lambda v: statistics.median(v),
                    ),
                    # Nearest-rank p95: int(len(v) * 0.95) is always < len(v).
                    "latency_p95_s": safe_stat(
                        valid_latencies,
                        lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,
                    ),
                    # Throughput
                    "tokens_per_second_mean": safe_stat(valid_tps, statistics.mean),
                    "tokens_per_second_p50": safe_stat(
                        valid_tps, lambda v: statistics.median(v)
                    ),
                    # Time to first token
                    "ttft_mean_s": safe_stat(valid_ttft, statistics.mean),
                    "ttft_p50_s": safe_stat(valid_ttft, lambda v: statistics.median(v)),
                    "ttft_p95_s": safe_stat(
                        valid_ttft,
                        lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,
                    ),
                }

                # Per-category latency
                for cat, lats in per_category.items():
                    valid = [l for l in lats if l > 0]
                    if valid:
                        metrics[f"latency_mean_{cat}_s"] = statistics.mean(valid)

                return metrics
        image: python:3.13-slim
pipelineInfo:
  description: Benchmark vLLM with different tuning configurations. Logs latency,
    TPS, and TTFT to MLflow for A/B comparison.
  name: vllm-tuning-evaluation
root:
  dag:
    tasks:
      build-prompt-suite:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-build-prompt-suite
        taskInfo:
          name: build-prompt-suite
      # NOTE(review): enableCache=true on a task that creates an MLflow run means
      # a repeat execution with identical params reuses the old run_id — confirm
      # this is intended.
      create-tuning-run:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-create-tuning-run
        inputs:
          parameters:
            experiment_name:
              runtimeValue:
                constant: vllm-tuning
            pipelinechannel--enable_chunked_prefill:
              componentInputParameter: enable_chunked_prefill
            pipelinechannel--enable_prefix_caching:
              componentInputParameter: enable_prefix_caching
            pipelinechannel--gpu_memory_utilization:
              componentInputParameter: gpu_memory_utilization
            pipelinechannel--llm_endpoint:
              componentInputParameter: llm_endpoint
            pipelinechannel--model_name:
              componentInputParameter: model_name
            pipelinechannel--ngram_prompt_lookup_max:
              componentInputParameter: ngram_prompt_lookup_max
            pipelinechannel--num_iterations:
              componentInputParameter: num_iterations
            pipelinechannel--num_speculative_tokens:
              componentInputParameter: num_speculative_tokens
            pipelinechannel--num_warmup:
              componentInputParameter: num_warmup
            pipelinechannel--run_label:
              componentInputParameter: run_label
            run_name:
              runtimeValue:
                constant: vllm-{{$.inputs.parameters['pipelinechannel--run_label']}}
            tuning_params:
              runtimeValue:
                constant:
                  enable_chunked_prefill: '{{$.inputs.parameters[''pipelinechannel--enable_chunked_prefill'']}}'
                  enable_prefix_caching: '{{$.inputs.parameters[''pipelinechannel--enable_prefix_caching'']}}'
                  gpu_memory_utilization: '{{$.inputs.parameters[''pipelinechannel--gpu_memory_utilization'']}}'
                  llm_endpoint: '{{$.inputs.parameters[''pipelinechannel--llm_endpoint'']}}'
                  model_name: '{{$.inputs.parameters[''pipelinechannel--model_name'']}}'
                  ngram_prompt_lookup_max: '{{$.inputs.parameters[''pipelinechannel--ngram_prompt_lookup_max'']}}'
                  num_iterations: '{{$.inputs.parameters[''pipelinechannel--num_iterations'']}}'
                  num_speculative_tokens: '{{$.inputs.parameters[''pipelinechannel--num_speculative_tokens'']}}'
                  num_warmup: '{{$.inputs.parameters[''pipelinechannel--num_warmup'']}}'
        taskInfo:
          name: create-tuning-run
      # NOTE(review): this task logs metrics (a side effect) — with enableCache=true
      # a cache hit skips the MLflow write and the run stays unterminated; verify.
      log-benchmark-results:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-log-benchmark-results
        dependentTasks:
          - create-tuning-run
          - run-benchmark
        inputs:
          parameters:
            metrics:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: run-benchmark
            run_id:
              taskOutputParameter:
                outputParameterKey: run_id
                producerTask: create-tuning-run
        taskInfo:
          name: log-benchmark-results
      run-benchmark:
        # Empty cachingOptions: caching disabled so the benchmark always re-runs.
        cachingOptions: {}
        componentRef:
          name: comp-run-benchmark
        dependentTasks:
          - build-prompt-suite
        inputs:
          parameters:
            llm_endpoint:
              componentInputParameter: llm_endpoint
            model_name:
              componentInputParameter: model_name
            num_iterations:
              componentInputParameter: num_iterations
            num_warmup:
              componentInputParameter: num_warmup
            prompts:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: build-prompt-suite
        taskInfo:
          name: run-benchmark
  inputDefinitions:
    parameters:
      enable_chunked_prefill:
        defaultValue: 'true'
        description: '"true" or "false"'
        isOptional: true
        parameterType: STRING
      enable_prefix_caching:
        defaultValue: 'true'
        description: '"true" or "false"'
        isOptional: true
        parameterType: STRING
      gpu_memory_utilization:
        defaultValue: '0.90'
        description: '0.0 - 1.0'
        isOptional: true
        parameterType: STRING
      llm_endpoint:
        defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
        description: vLLM inference endpoint URL
        isOptional: true
        parameterType: STRING
      model_name:
        defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
        description: HF model identifier
        isOptional: true
        parameterType: STRING
      ngram_prompt_lookup_max:
        defaultValue: '4'
        description: ngram window for spec decode (0 = off)
        isOptional: true
        parameterType: STRING
      num_iterations:
        defaultValue: 3.0
        description: how many times to repeat the prompt suite
        isOptional: true
        parameterType: NUMBER_INTEGER
      num_speculative_tokens:
        defaultValue: '3'
        description: number of speculative tokens (0 = off)
        isOptional: true
        parameterType: STRING
      num_warmup:
        defaultValue: 2.0
        description: warmup requests before timing
        isOptional: true
        parameterType: NUMBER_INTEGER
      run_label:
        defaultValue: baseline
        description: human-readable label (e.g. "apc-on-spec3")
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1