#!/usr/bin/env python3
"""
vLLM Tuning Evaluation Pipeline - Kubeflow Pipelines SDK

Runs inference benchmarks with different vLLM configurations and logs results
to MLflow so you can compare APC, chunked prefill, speculative decoding, and
GPU memory utilisation settings side-by-side.

Usage:
    pip install kfp==2.12.1
    python vllm_tuning_pipeline.py
    # Upload vllm_tuning_pipeline.yaml to Kubeflow Pipelines UI
"""

from typing import NamedTuple

from kfp import compiler
from kfp import dsl

MLFLOW_IMAGE = "python:3.13-slim"
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]
BENCH_PACKAGES = ["httpx"]


# ---- MLflow components ----


@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def create_tuning_run(
    experiment_name: str,
    run_name: str,
    tuning_params: dict,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> NamedTuple("RunInfo", [("run_id", str), ("experiment_id", str)]):
    """Create an MLflow run for a vLLM tuning experiment.

    Creates the experiment on first use, starts a run tagged with the KFP run
    id, logs every tuning knob as a ``vllm.*`` param, and closes the run
    immediately (metrics are appended later by ``log_benchmark_results``).

    Args:
        experiment_name: MLflow experiment to create or reuse.
        run_name: Human-readable run name shown in the MLflow UI.
        tuning_params: vLLM configuration values to log as params.
        mlflow_tracking_uri: MLflow tracking server URL.

    Returns:
        RunInfo(run_id, experiment_id) for downstream components.
    """
    import os
    from collections import namedtuple

    import mlflow
    from mlflow.tracking import MlflowClient

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient()

    # Reuse the experiment if it already exists; otherwise create it with a
    # fixed artifact location so all tuning runs land in one place.
    exp = client.get_experiment_by_name(experiment_name)
    experiment_id = (
        exp.experiment_id
        if exp
        else client.create_experiment(
            name=experiment_name,
            artifact_location=f"/mlflow/artifacts/{experiment_name}",
        )
    )

    tags = {
        "pipeline.type": "vllm-tuning",
        # KFP_RUN_ID links this MLflow run back to the Kubeflow run, when the
        # env var is injected; "unknown" otherwise.
        "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
    }
    run = mlflow.start_run(experiment_id=experiment_id, run_name=run_name, tags=tags)

    # Log every tuning param under a "vllm." prefix for easy filtering.
    for key, value in tuning_params.items():
        mlflow.log_param(f"vllm.{key}", value)

    run_id = run.info.run_id
    mlflow.end_run()

    RunInfo = namedtuple("RunInfo", ["run_id", "experiment_id"])
    return RunInfo(run_id, experiment_id)


@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_benchmark_results(
    run_id: str,
    metrics: dict,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
    """Log benchmark metrics to MLflow and close the run.

    Each metric is logged individually, the full metrics dict is attached as a
    JSON artifact, and the run is marked FINISHED.

    Args:
        run_id: MLflow run id created by ``create_tuning_run``.
        metrics: Flat mapping of metric name -> numeric value.
        mlflow_tracking_uri: MLflow tracking server URL.

    Returns:
        The same ``run_id``, for chaining.
    """
    import json
    import tempfile
    from pathlib import Path

    import mlflow
    from mlflow.tracking import MlflowClient

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient()

    for key, value in metrics.items():
        client.log_metric(run_id, key, float(value))

    # Save the full results as a JSON artifact alongside the metrics.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = Path(tmpdir) / "benchmark_results.json"
        path.write_text(json.dumps(metrics, indent=2))
        client.log_artifact(run_id, str(path))

    client.set_terminated(run_id, status="FINISHED")
    return run_id


# ---- Benchmark components ----


@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=BENCH_PACKAGES,
)
def build_prompt_suite() -> list:
    """Return a list of test prompts spanning short, medium, and long inputs.

    The final "repeat-prefix-1" prompt shares its system message and opening
    user text with "medium-1" on purpose, so automatic prefix caching (APC)
    has something to hit.
    """
    return [
        {
            "id": "short-1",
            "category": "short",
            "messages": [
                {"role": "user", "content": "What is the capital of France?"}
            ],
            "max_tokens": 64,
        },
        {
            "id": "short-2",
            "category": "short",
            "messages": [
                {"role": "user", "content": "Explain quantum computing in one sentence."}
            ],
            "max_tokens": 64,
        },
        {
            "id": "medium-1",
            "category": "medium",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful AI assistant running on a homelab.",
                },
                {
                    "role": "user",
                    "content": (
                        "Compare and contrast supervised and unsupervised "
                        "machine learning. Give examples of each and explain "
                        "when you would choose one over the other."
                    ),
                },
            ],
            "max_tokens": 512,
        },
        {
            "id": "medium-2",
            "category": "medium",
            "messages": [
                {
                    "role": "user",
                    "content": (
                        "Write a Python function that implements a binary search "
                        "tree with insert, search, and delete operations. Include "
                        "docstrings and type hints."
                    ),
                },
            ],
            "max_tokens": 1024,
        },
        {
            "id": "long-1",
            "category": "long",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a technical writer for a Kubernetes homelab blog.",
                },
                {
                    "role": "user",
                    "content": (
                        "Write a detailed tutorial on setting up a multi-node "
                        "Kubernetes cluster with Talos Linux, covering: "
                        "1) Hardware requirements and network topology, "
                        "2) Talos machine config generation, "
                        "3) Control plane bootstrapping, "
                        "4) Worker node joining, "
                        "5) CNI setup with Cilium, "
                        "6) Storage with Rook-Ceph, "
                        "7) GitOps with Flux CD. "
                        "Include YAML examples for each step."
                    ),
                },
            ],
            "max_tokens": 2048,
        },
        {
            "id": "repeat-prefix-1",
            "category": "prefix-cache-test",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful AI assistant running on a homelab.",
                },
                {
                    "role": "user",
                    "content": (
                        "Compare and contrast supervised and unsupervised "
                        "machine learning. Now focus specifically on "
                        "reinforcement learning and how it differs."
                    ),
                },
            ],
            "max_tokens": 512,
        },
    ]


@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=BENCH_PACKAGES,
)
def run_benchmark(
    prompts: list,
    llm_endpoint: str,
    model_name: str,
    num_warmup: int = 2,
    num_iterations: int = 3,
) -> dict:
    """
    Run all prompts through the LLM endpoint and collect timing metrics.

    Streams each chat completion and records, per request:
      * end-to-end latency,
      * time to first token (TTFT, measured at the first SSE data line —
        which may be a role-only delta, so this is a close approximation),
      * tokens/sec, where "tokens" counts SSE chunks (an approximation of
        completion tokens, since vLLM emits roughly one chunk per token).

    Failed requests are recorded with sentinel values (-1 latency/TTFT,
    0 tps) and excluded from the aggregates, so a flaky endpoint does not
    abort the benchmark.

    Returns aggregate metrics: p50/p95/mean latency, tokens/sec, TTFT.
    """
    import statistics
    import time

    import httpx

    all_latencies: list[float] = []
    all_tps: list[float] = []
    all_ttft: list[float] = []
    per_category: dict[str, list[float]] = {}

    with httpx.Client(timeout=300.0) as client:
        # Warmup: prime the server (model load, CUDA graphs, caches) before
        # timing. Failures here are deliberately ignored — warmup is
        # best-effort and the real requests below will surface any problem.
        for _ in range(num_warmup):
            try:
                client.post(
                    f"{llm_endpoint}/v1/chat/completions",
                    json={
                        "model": model_name,
                        "messages": [{"role": "user", "content": "Hi"}],
                        "max_tokens": 8,
                        "temperature": 0,
                    },
                )
            except Exception:
                pass

        # Benchmark: every prompt, num_iterations times.
        for iteration in range(num_iterations):
            for prompt in prompts:
                category = prompt.get("category", "unknown")
                payload = {
                    "model": model_name,
                    "messages": prompt["messages"],
                    "max_tokens": prompt.get("max_tokens", 256),
                    "temperature": 0,
                    "stream": True,
                }
                try:
                    t_start = time.perf_counter()
                    first_token_time = None
                    with client.stream(
                        "POST",
                        f"{llm_endpoint}/v1/chat/completions",
                        json=payload,
                    ) as resp:
                        resp.raise_for_status()
                        completion_tokens = 0
                        # OpenAI-style SSE: payload lines are prefixed
                        # "data: " and the stream ends with "[DONE]".
                        for line in resp.iter_lines():
                            if not line.startswith("data: "):
                                continue
                            chunk = line[6:]
                            if chunk == "[DONE]":
                                break
                            if first_token_time is None:
                                first_token_time = time.perf_counter()
                            completion_tokens += 1
                    t_end = time.perf_counter()

                    latency = t_end - t_start
                    # If no data line ever arrived, fall back to total latency.
                    ttft = (
                        (first_token_time - t_start)
                        if first_token_time
                        else latency
                    )
                    tps = completion_tokens / latency if latency > 0 else 0

                    all_latencies.append(latency)
                    all_tps.append(tps)
                    all_ttft.append(ttft)
                    per_category.setdefault(category, []).append(latency)
                except Exception:
                    # Record failure sentinels but keep going; aggregates
                    # below filter them out via the `> 0` checks.
                    all_latencies.append(-1)
                    all_tps.append(0)
                    all_ttft.append(-1)

    # Compute aggregates over successful requests only.
    valid_latencies = [l for l in all_latencies if l > 0]
    valid_tps = [t for t in all_tps if t > 0]
    valid_ttft = [t for t in all_ttft if t > 0]

    def safe_stat(values, func):
        """Apply func to values, or return 0 when there were no samples."""
        return func(values) if values else 0

    def p95(values):
        """Nearest-rank 95th percentile (callers guarantee non-empty)."""
        return sorted(values)[int(len(values) * 0.95)]

    metrics = {
        "total_requests": len(all_latencies),
        "successful_requests": len(valid_latencies),
        "failed_requests": len(all_latencies) - len(valid_latencies),
        # Latency
        "latency_mean_s": safe_stat(valid_latencies, statistics.mean),
        "latency_p50_s": safe_stat(valid_latencies, statistics.median),
        "latency_p95_s": safe_stat(valid_latencies, p95),
        # Throughput
        "tokens_per_second_mean": safe_stat(valid_tps, statistics.mean),
        "tokens_per_second_p50": safe_stat(valid_tps, statistics.median),
        # Time to first token
        "ttft_mean_s": safe_stat(valid_ttft, statistics.mean),
        "ttft_p50_s": safe_stat(valid_ttft, statistics.median),
        "ttft_p95_s": safe_stat(valid_ttft, p95),
    }

    # Per-category latency means (failures excluded).
    for cat, lats in per_category.items():
        valid = [l for l in lats if l > 0]
        if valid:
            metrics[f"latency_mean_{cat}_s"] = statistics.mean(valid)

    return metrics


# ---- Pipeline ----


@dsl.pipeline(
    name="vllm-tuning-evaluation",
    description=(
        "Benchmark vLLM with different tuning configurations. "
        "Logs latency, TPS, and TTFT to MLflow for A/B comparison."
    ),
)
def vllm_tuning_pipeline(
    llm_endpoint: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm",
    model_name: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    # Tuning knobs (match env vars in rayservice.yaml)
    enable_prefix_caching: str = "true",
    enable_chunked_prefill: str = "true",
    num_speculative_tokens: str = "3",
    ngram_prompt_lookup_max: str = "4",
    gpu_memory_utilization: str = "0.90",
    # Benchmark config
    num_warmup: int = 2,
    num_iterations: int = 3,
    run_label: str = "baseline",
):
    """
    vLLM Tuning Evaluation Pipeline

    Run this multiple times with different tuning params, then compare runs
    in the MLflow "vllm-tuning" experiment.

    Args:
        llm_endpoint: vLLM inference endpoint URL
        model_name: HF model identifier
        enable_prefix_caching: "true" or "false"
        enable_chunked_prefill: "true" or "false"
        num_speculative_tokens: number of speculative tokens (0 = off)
        ngram_prompt_lookup_max: ngram window for spec decode (0 = off)
        gpu_memory_utilization: 0.0 - 1.0
        num_warmup: warmup requests before timing
        num_iterations: how many times to repeat the prompt suite
        run_label: human-readable label (e.g. "apc-on-spec3")
    """
    # NOTE: the tuning knobs are logged to MLflow for comparison; applying
    # them to the serving deployment itself happens outside this pipeline
    # (see rayservice.yaml).
    tuning_params = {
        "enable_prefix_caching": enable_prefix_caching,
        "enable_chunked_prefill": enable_chunked_prefill,
        "num_speculative_tokens": num_speculative_tokens,
        "ngram_prompt_lookup_max": ngram_prompt_lookup_max,
        "gpu_memory_utilization": gpu_memory_utilization,
        "model_name": model_name,
        "llm_endpoint": llm_endpoint,
        "num_warmup": str(num_warmup),
        "num_iterations": str(num_iterations),
    }

    # 1. Create MLflow run
    mlflow_run = create_tuning_run(
        experiment_name="vllm-tuning",
        run_name=f"vllm-{run_label}",
        tuning_params=tuning_params,
    )

    # 2. Build prompt suite (cacheable — the suite is deterministic)
    prompts_task = build_prompt_suite()
    prompts_task.set_caching_options(enable_caching=True)

    # 3. Run benchmark (never cached — we always want fresh timings)
    bench_task = run_benchmark(
        prompts=prompts_task.output,
        llm_endpoint=llm_endpoint,
        model_name=model_name,
        num_warmup=num_warmup,
        num_iterations=num_iterations,
    )
    bench_task.set_caching_options(enable_caching=False)

    # 4. Log results to MLflow
    log_task = log_benchmark_results(
        run_id=mlflow_run.outputs["run_id"],
        metrics=bench_task.output,
    )


if __name__ == "__main__":
    compiler.Compiler().compile(
        vllm_tuning_pipeline,
        "vllm_tuning_pipeline.yaml",
    )
    print("Compiled: vllm_tuning_pipeline.yaml")
    print()
    print("Example runs to compare configurations:")
    print("  # Baseline (current config)")
    print("  kfp run submit vllm_tuning_pipeline.yaml --run-label=baseline")
    print()
    print("  # APC disabled")
    print("  kfp run submit vllm_tuning_pipeline.yaml \\")
    print("    --enable-prefix-caching=false --run-label=no-apc")
    print()
    print("  # No speculative decoding")
    print("  kfp run submit vllm_tuning_pipeline.yaml \\")
    print("    --num-speculative-tokens=0 --run-label=no-spec")
    print()
    print("  # Aggressive spec decode")
    print("  kfp run submit vllm_tuning_pipeline.yaml \\")
    print("    --num-speculative-tokens=5 --ngram-prompt-lookup-max=6 --run-label=spec5-ngram6")