kubeflow/vllm_tuning_pipeline.yaml

# PIPELINE DEFINITION
# Name: vllm-tuning-evaluation
# Description: Benchmark vLLM with different tuning configurations. Logs latency, TPS, and TTFT to MLflow for A/B comparison.
# Inputs:
#    enable_chunked_prefill: str [Default: 'true']
#    enable_prefix_caching: str [Default: 'true']
#    gpu_memory_utilization: str [Default: '0.90']
#    llm_endpoint: str [Default: 'http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm']
#    model_name: str [Default: 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4']
#    ngram_prompt_lookup_max: str [Default: '4']
#    num_iterations: int [Default: 3.0]
#    num_speculative_tokens: str [Default: '3']
#    num_warmup: int [Default: 2.0]
#    run_label: str [Default: 'baseline']
components:
  comp-build-prompt-suite:
    executorLabel: exec-build-prompt-suite
    outputDefinitions:
      parameters:
        Output:
          parameterType: LIST
  comp-create-tuning-run:
    executorLabel: exec-create-tuning-run
    inputDefinitions:
      parameters:
        experiment_name:
          parameterType: STRING
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_name:
          parameterType: STRING
        tuning_params:
          parameterType: STRUCT
    outputDefinitions:
      parameters:
        experiment_id:
          parameterType: STRING
        run_id:
          parameterType: STRING
  comp-log-benchmark-results:
    executorLabel: exec-log-benchmark-results
    inputDefinitions:
      parameters:
        metrics:
          parameterType: STRUCT
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_id:
          parameterType: STRING
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  comp-run-benchmark:
    executorLabel: exec-run-benchmark
    inputDefinitions:
      parameters:
        llm_endpoint:
          parameterType: STRING
        model_name:
          parameterType: STRING
        num_iterations:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        num_warmup:
          defaultValue: 2.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        prompts:
          parameterType: LIST
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRUCT
deploymentSpec:
  executors:
    exec-build-prompt-suite:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - build_prompt_suite
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef build_prompt_suite() -> list:\n    \"\"\"Return a list of test\
          \ prompts spanning short, medium, and long inputs.\"\"\"\n    return [\n\
          \        {\n            \"id\": \"short-1\",\n            \"category\":\
          \ \"short\",\n            \"messages\": [\n                {\"role\": \"\
          user\", \"content\": \"What is the capital of France?\"}\n            ],\n\
          \            \"max_tokens\": 64,\n        },\n        {\n            \"\
          id\": \"short-2\",\n            \"category\": \"short\",\n            \"\
          messages\": [\n                {\"role\": \"user\", \"content\": \"Explain\
          \ quantum computing in one sentence.\"}\n            ],\n            \"\
          max_tokens\": 64,\n        },\n        {\n            \"id\": \"medium-1\"\
          ,\n            \"category\": \"medium\",\n            \"messages\": [\n\
          \                {\n                    \"role\": \"system\",\n        \
          \            \"content\": \"You are a helpful AI assistant running on a\
          \ homelab.\",\n                },\n                {\n                 \
          \   \"role\": \"user\",\n                    \"content\": (\n          \
          \              \"Compare and contrast supervised and unsupervised \"\n \
          \                       \"machine learning. Give examples of each and explain\
          \ \"\n                        \"when you would choose one over the other.\"\
          \n                    ),\n                },\n            ],\n         \
          \   \"max_tokens\": 512,\n        },\n        {\n            \"id\": \"\
          medium-2\",\n            \"category\": \"medium\",\n            \"messages\"\
          : [\n                {\n                    \"role\": \"user\",\n      \
          \              \"content\": (\n                        \"Write a Python\
          \ function that implements a binary search \"\n                        \"\
          tree with insert, search, and delete operations. Include \"\n          \
          \              \"docstrings and type hints.\"\n                    ),\n\
          \                },\n            ],\n            \"max_tokens\": 1024,\n\
          \        },\n        {\n            \"id\": \"long-1\",\n            \"\
          category\": \"long\",\n            \"messages\": [\n                {\n\
          \                    \"role\": \"system\",\n                    \"content\"\
          : \"You are a technical writer for a Kubernetes homelab blog.\",\n     \
          \           },\n                {\n                    \"role\": \"user\"\
          ,\n                    \"content\": (\n                        \"Write a\
          \ detailed tutorial on setting up a multi-node \"\n                    \
          \    \"Kubernetes cluster with Talos Linux, covering: \"\n             \
          \           \"1) Hardware requirements and network topology, \"\n      \
          \                  \"2) Talos machine config generation, \"\n          \
          \              \"3) Control plane bootstrapping, \"\n                  \
          \      \"4) Worker node joining, \"\n                        \"5) CNI setup\
          \ with Cilium, \"\n                        \"6) Storage with Rook-Ceph,\
          \ \"\n                        \"7) GitOps with Flux CD. \"\n           \
          \             \"Include YAML examples for each step.\"\n               \
          \     ),\n                },\n            ],\n            \"max_tokens\"\
          : 2048,\n        },\n        {\n            \"id\": \"repeat-prefix-1\"\
          ,\n            \"category\": \"prefix-cache-test\",\n            \"messages\"\
          : [\n                {\n                    \"role\": \"system\",\n    \
          \                \"content\": \"You are a helpful AI assistant running on\
          \ a homelab.\",\n                },\n                {\n               \
          \     \"role\": \"user\",\n                    \"content\": (\n        \
          \                \"Compare and contrast supervised and unsupervised \"\n\
          \                        \"machine learning. Now focus specifically on \"\
          \n                        \"reinforcement learning and how it differs.\"\
          \n                    ),\n                },\n            ],\n         \
          \   \"max_tokens\": 512,\n        },\n    ]\n\n"
        image: python:3.13-slim
    exec-create-tuning-run:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - create_tuning_run
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
          \ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef create_tuning_run(\n    experiment_name: str,\n    run_name:\
          \ str,\n    tuning_params: dict,\n    mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
          ,\n) -> NamedTuple(\"RunInfo\", [(\"run_id\", str), (\"experiment_id\",\
          \ str)]):\n    \"\"\"Create an MLflow run for a vLLM tuning experiment.\"\
          \"\"\n    import os\n    import mlflow\n    from mlflow.tracking import\
          \ MlflowClient\n    from collections import namedtuple\n\n    mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
          \    client = MlflowClient()\n\n    exp = client.get_experiment_by_name(experiment_name)\n\
          \    experiment_id = (\n        exp.experiment_id\n        if exp\n    \
          \    else client.create_experiment(\n            name=experiment_name,\n\
          \            artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
          ,\n        )\n    )\n\n    tags = {\n        \"pipeline.type\": \"vllm-tuning\"\
          ,\n        \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\", \"unknown\"),\n\
          \    }\n\n    run = mlflow.start_run(\n        experiment_id=experiment_id,\
          \ run_name=run_name, tags=tags\n    )\n    # Log every tuning param\n  \
          \  for key, value in tuning_params.items():\n        mlflow.log_param(f\"\
          vllm.{key}\", value)\n    run_id = run.info.run_id\n    mlflow.end_run()\n\
          \n    RunInfo = namedtuple(\"RunInfo\", [\"run_id\", \"experiment_id\"])\n\
          \    return RunInfo(run_id, experiment_id)\n\n"
        image: python:3.13-slim
    exec-log-benchmark-results:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - log_benchmark_results
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
          \ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef log_benchmark_results(\n    run_id: str,\n    metrics: dict,\n\
          \    mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
          ,\n) -> str:\n    \"\"\"Log benchmark metrics to MLflow and close the run.\"\
          \"\"\n    import json\n    import tempfile\n    import mlflow\n    from\
          \ mlflow.tracking import MlflowClient\n    from pathlib import Path\n\n\
          \    mlflow.set_tracking_uri(mlflow_tracking_uri)\n    client = MlflowClient()\n\
          \n    for key, value in metrics.items():\n        client.log_metric(run_id,\
          \ key, float(value))\n\n    # Save full results as artifact\n    with tempfile.TemporaryDirectory()\
          \ as tmpdir:\n        path = Path(tmpdir) / \"benchmark_results.json\"\n\
          \        path.write_text(json.dumps(metrics, indent=2))\n        client.log_artifact(run_id,\
          \ str(path))\n\n    client.set_terminated(run_id, status=\"FINISHED\")\n\
          \    return run_id\n\n"
        image: python:3.13-slim
    exec-run-benchmark:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - run_benchmark
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
          \  python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef run_benchmark(\n    prompts: list,\n    llm_endpoint: str,\n\
          \    model_name: str,\n    num_warmup: int = 2,\n    num_iterations: int\
          \ = 3,\n) -> dict:\n    \"\"\"\n    Run all prompts through the LLM endpoint\
          \ and collect timing metrics.\n\n    Returns aggregate metrics: p50/p95/mean\
          \ latency, tokens/sec, TTFT.\n    \"\"\"\n    import time\n    import statistics\n\
          \    import httpx\n\n    all_latencies: list[float] = []\n    all_tps: list[float]\
          \ = []\n    all_ttft: list[float] = []\n    per_category: dict[str, list[float]]\
          \ = {}\n\n    with httpx.Client(timeout=300.0) as client:\n        # Warmup\n\
          \        for _ in range(num_warmup):\n            try:\n               \
          \ client.post(\n                    f\"{llm_endpoint}/v1/chat/completions\"\
          ,\n                    json={\n                        \"model\": model_name,\n\
          \                        \"messages\": [{\"role\": \"user\", \"content\"\
          : \"Hi\"}],\n                        \"max_tokens\": 8,\n              \
          \          \"temperature\": 0,\n                    },\n               \
          \ )\n            except Exception:\n                pass\n\n        # Benchmark\n\
          \        for iteration in range(num_iterations):\n            for prompt\
          \ in prompts:\n                category = prompt.get(\"category\", \"unknown\"\
          )\n                payload = {\n                    \"model\": model_name,\n\
          \                    \"messages\": prompt[\"messages\"],\n             \
          \       \"max_tokens\": prompt.get(\"max_tokens\", 256),\n             \
          \       \"temperature\": 0,\n                    \"stream\": True,\n   \
          \             }\n\n                try:\n                    t_start = time.perf_counter()\n\
          \                    first_token_time = None\n\n                    with\
          \ client.stream(\n                        \"POST\",\n                  \
          \      f\"{llm_endpoint}/v1/chat/completions\",\n                      \
          \  json=payload,\n                    ) as resp:\n                     \
          \   resp.raise_for_status()\n                        completion_tokens =\
          \ 0\n                        for line in resp.iter_lines():\n          \
          \                  if not line.startswith(\"data: \"):\n               \
          \                 continue\n                            chunk = line[6:]\n\
          \                            if chunk == \"[DONE]\":\n                 \
          \               break\n                            if first_token_time is\
          \ None:\n                                first_token_time = time.perf_counter()\n\
          \                            completion_tokens += 1\n\n                \
          \    t_end = time.perf_counter()\n                    latency = t_end -\
          \ t_start\n                    ttft = (\n                        (first_token_time\
          \ - t_start)\n                        if first_token_time\n            \
          \            else latency\n                    )\n                    tps\
          \ = (\n                        completion_tokens / latency if latency >\
          \ 0 else 0\n                    )\n\n                    all_latencies.append(latency)\n\
          \                    all_tps.append(tps)\n                    all_ttft.append(ttft)\n\
          \                    per_category.setdefault(category, []).append(latency)\n\
          \n                except Exception as exc:\n                    # Record\
          \ failure but keep going\n                    all_latencies.append(-1)\n\
          \                    all_tps.append(0)\n                    all_ttft.append(-1)\n\
          \n    # Compute aggregates\n    valid_latencies = [l for l in all_latencies\
          \ if l > 0]\n    valid_tps = [t for t in all_tps if t > 0]\n    valid_ttft\
          \ = [t for t in all_ttft if t > 0]\n\n    def safe_stat(values, func):\n\
          \        return func(values) if values else 0\n\n    metrics = {\n     \
          \   \"total_requests\": len(all_latencies),\n        \"successful_requests\"\
          : len(valid_latencies),\n        \"failed_requests\": len(all_latencies)\
          \ - len(valid_latencies),\n        # Latency\n        \"latency_mean_s\"\
          : safe_stat(valid_latencies, statistics.mean),\n        \"latency_p50_s\"\
          : safe_stat(\n            valid_latencies,\n            lambda v: statistics.median(v),\n\
          \        ),\n        \"latency_p95_s\": safe_stat(\n            valid_latencies,\n\
          \            lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,\n    \
          \    ),\n        # Throughput\n        \"tokens_per_second_mean\": safe_stat(valid_tps,\
          \ statistics.mean),\n        \"tokens_per_second_p50\": safe_stat(\n   \
          \         valid_tps, lambda v: statistics.median(v)\n        ),\n      \
          \  # Time to first token\n        \"ttft_mean_s\": safe_stat(valid_ttft,\
          \ statistics.mean),\n        \"ttft_p50_s\": safe_stat(valid_ttft, lambda\
          \ v: statistics.median(v)),\n        \"ttft_p95_s\": safe_stat(\n      \
          \      valid_ttft,\n            lambda v: sorted(v)[int(len(v) * 0.95)]\
          \ if v else 0,\n        ),\n    }\n\n    # Per-category latency\n    for\
          \ cat, lats in per_category.items():\n        valid = [l for l in lats if\
          \ l > 0]\n        if valid:\n            metrics[f\"latency_mean_{cat}_s\"\
          ] = statistics.mean(valid)\n\n    return metrics\n\n"
        image: python:3.13-slim
pipelineInfo:
  description: Benchmark vLLM with different tuning configurations. Logs latency,
    TPS, and TTFT to MLflow for A/B comparison.
  name: vllm-tuning-evaluation
root:
  dag:
    tasks:
      build-prompt-suite:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-build-prompt-suite
        taskInfo:
          name: build-prompt-suite
      create-tuning-run:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-create-tuning-run
        inputs:
          parameters:
            experiment_name:
              runtimeValue:
                constant: vllm-tuning
            pipelinechannel--enable_chunked_prefill:
              componentInputParameter: enable_chunked_prefill
            pipelinechannel--enable_prefix_caching:
              componentInputParameter: enable_prefix_caching
            pipelinechannel--gpu_memory_utilization:
              componentInputParameter: gpu_memory_utilization
            pipelinechannel--llm_endpoint:
              componentInputParameter: llm_endpoint
            pipelinechannel--model_name:
              componentInputParameter: model_name
            pipelinechannel--ngram_prompt_lookup_max:
              componentInputParameter: ngram_prompt_lookup_max
            pipelinechannel--num_iterations:
              componentInputParameter: num_iterations
            pipelinechannel--num_speculative_tokens:
              componentInputParameter: num_speculative_tokens
            pipelinechannel--num_warmup:
              componentInputParameter: num_warmup
            pipelinechannel--run_label:
              componentInputParameter: run_label
            run_name:
              runtimeValue:
                constant: vllm-{{$.inputs.parameters['pipelinechannel--run_label']}}
            tuning_params:
              runtimeValue:
                constant:
                  enable_chunked_prefill: '{{$.inputs.parameters[''pipelinechannel--enable_chunked_prefill'']}}'
                  enable_prefix_caching: '{{$.inputs.parameters[''pipelinechannel--enable_prefix_caching'']}}'
                  gpu_memory_utilization: '{{$.inputs.parameters[''pipelinechannel--gpu_memory_utilization'']}}'
                  llm_endpoint: '{{$.inputs.parameters[''pipelinechannel--llm_endpoint'']}}'
                  model_name: '{{$.inputs.parameters[''pipelinechannel--model_name'']}}'
                  ngram_prompt_lookup_max: '{{$.inputs.parameters[''pipelinechannel--ngram_prompt_lookup_max'']}}'
                  num_iterations: '{{$.inputs.parameters[''pipelinechannel--num_iterations'']}}'
                  num_speculative_tokens: '{{$.inputs.parameters[''pipelinechannel--num_speculative_tokens'']}}'
                  num_warmup: '{{$.inputs.parameters[''pipelinechannel--num_warmup'']}}'
        taskInfo:
          name: create-tuning-run
      log-benchmark-results:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-log-benchmark-results
        dependentTasks:
        - create-tuning-run
        - run-benchmark
        inputs:
          parameters:
            metrics:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: run-benchmark
            run_id:
              taskOutputParameter:
                outputParameterKey: run_id
                producerTask: create-tuning-run
        taskInfo:
          name: log-benchmark-results
      run-benchmark:
        cachingOptions: {}
        componentRef:
          name: comp-run-benchmark
        dependentTasks:
        - build-prompt-suite
        inputs:
          parameters:
            llm_endpoint:
              componentInputParameter: llm_endpoint
            model_name:
              componentInputParameter: model_name
            num_iterations:
              componentInputParameter: num_iterations
            num_warmup:
              componentInputParameter: num_warmup
            prompts:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: build-prompt-suite
        taskInfo:
          name: run-benchmark
  inputDefinitions:
    parameters:
      enable_chunked_prefill:
        defaultValue: 'true'
        description: '"true" or "false"'
        isOptional: true
        parameterType: STRING
      enable_prefix_caching:
        defaultValue: 'true'
        description: '"true" or "false"'
        isOptional: true
        parameterType: STRING
      gpu_memory_utilization:
        defaultValue: '0.90'
        description: 0.0 - 1.0
        isOptional: true
        parameterType: STRING
      llm_endpoint:
        defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
        description: vLLM inference endpoint URL
        isOptional: true
        parameterType: STRING
      model_name:
        defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
        description: HF model identifier
        isOptional: true
        parameterType: STRING
      ngram_prompt_lookup_max:
        defaultValue: '4'
        description: ngram window for spec decode (0 = off)
        isOptional: true
        parameterType: STRING
      num_iterations:
        defaultValue: 3.0
        description: how many times to repeat the prompt suite
        isOptional: true
        parameterType: NUMBER_INTEGER
      num_speculative_tokens:
        defaultValue: '3'
        description: number of speculative tokens (0 = off)
        isOptional: true
        parameterType: STRING
      num_warmup:
        defaultValue: 2.0
        description: warmup requests before timing
        isOptional: true
        parameterType: NUMBER_INTEGER
      run_label:
        defaultValue: baseline
        description: human-readable label (e.g. "apc-on-spec3")
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1