feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow

New:
- vllm_tuning_pipeline.py: A/B benchmark different vLLM configs,
  logs latency/TPS/TTFT to MLflow (vllm-tuning experiment)
- vllm_tuning_pipeline.yaml: compiled KFP YAML

Updated:
- voice_pipeline.py: per-step NamedTuple outputs with latency tracking,
  new log_pipeline_metrics MLflow component
- voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
This commit is contained in:
2026-02-13 08:24:11 -05:00
parent cee21f124c
commit bc4b230dd9
6 changed files with 2216 additions and 26 deletions

501
vllm_tuning_pipeline.yaml Normal file
View File

@@ -0,0 +1,501 @@
# PIPELINE DEFINITION
# Name: vllm-tuning-evaluation
# Description: Benchmark vLLM with different tuning configurations. Logs latency, TPS, and TTFT to MLflow for A/B comparison.
# Inputs:
# enable_chunked_prefill: str [Default: 'true']
# enable_prefix_caching: str [Default: 'true']
# gpu_memory_utilization: str [Default: '0.90']
# llm_endpoint: str [Default: 'http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm']
# model_name: str [Default: 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4']
# ngram_prompt_lookup_max: str [Default: '4']
# num_iterations: int [Default: 3.0]
# num_speculative_tokens: str [Default: '3']
# num_warmup: int [Default: 2.0]
# run_label: str [Default: 'baseline']
# Interface declarations for each pipeline step. Each comp-* entry defines the
# input/output parameter schema; the matching container implementations live
# under deploymentSpec.executors.
components:
  comp-build-prompt-suite:
    executorLabel: exec-build-prompt-suite
    outputDefinitions:
      parameters:
        Output:
          parameterType: LIST
  comp-create-tuning-run:
    executorLabel: exec-create-tuning-run
    inputDefinitions:
      parameters:
        experiment_name:
          parameterType: STRING
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_name:
          parameterType: STRING
        tuning_params:
          parameterType: STRUCT
    outputDefinitions:
      parameters:
        experiment_id:
          parameterType: STRING
        run_id:
          parameterType: STRING
  comp-log-benchmark-results:
    executorLabel: exec-log-benchmark-results
    inputDefinitions:
      parameters:
        metrics:
          parameterType: STRUCT
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        run_id:
          parameterType: STRING
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  comp-run-benchmark:
    executorLabel: exec-run-benchmark
    inputDefinitions:
      parameters:
        llm_endpoint:
          parameterType: STRING
        model_name:
          parameterType: STRING
        num_iterations:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        num_warmup:
          defaultValue: 2.0
          isOptional: true
          parameterType: NUMBER_INTEGER
        prompts:
          parameterType: LIST
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRUCT
deploymentSpec:
  executors:
    # Each executor starts from a bare python:3.13-slim image: it pip-installs
    # kfp plus the component's own dependencies, writes the embedded component
    # source to a temp file, then hands off to kfp's executor_main.
    # NOTE(review): this file is compiler output — prefer regenerating it from
    # vllm_tuning_pipeline.py over hand-editing the embedded sources.
    exec-build-prompt-suite:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - build_prompt_suite
        command:
        - sh
        - -c
        - |
          if ! [ -x "$(command -v pip)" ]; then
            python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
        - |
          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def build_prompt_suite() -> list:
              """Return a list of test prompts spanning short, medium, and long inputs."""
              return [
                  {
                      "id": "short-1",
                      "category": "short",
                      "messages": [
                          {"role": "user", "content": "What is the capital of France?"}
                      ],
                      "max_tokens": 64,
                  },
                  {
                      "id": "short-2",
                      "category": "short",
                      "messages": [
                          {"role": "user", "content": "Explain quantum computing in one sentence."}
                      ],
                      "max_tokens": 64,
                  },
                  {
                      "id": "medium-1",
                      "category": "medium",
                      "messages": [
                          {
                              "role": "system",
                              "content": "You are a helpful AI assistant running on a homelab.",
                          },
                          {
                              "role": "user",
                              "content": (
                                  "Compare and contrast supervised and unsupervised "
                                  "machine learning. Give examples of each and explain "
                                  "when you would choose one over the other."
                              ),
                          },
                      ],
                      "max_tokens": 512,
                  },
                  {
                      "id": "medium-2",
                      "category": "medium",
                      "messages": [
                          {
                              "role": "user",
                              "content": (
                                  "Write a Python function that implements a binary search "
                                  "tree with insert, search, and delete operations. Include "
                                  "docstrings and type hints."
                              ),
                          },
                      ],
                      "max_tokens": 1024,
                  },
                  {
                      "id": "long-1",
                      "category": "long",
                      "messages": [
                          {
                              "role": "system",
                              "content": "You are a technical writer for a Kubernetes homelab blog.",
                          },
                          {
                              "role": "user",
                              "content": (
                                  "Write a detailed tutorial on setting up a multi-node "
                                  "Kubernetes cluster with Talos Linux, covering: "
                                  "1) Hardware requirements and network topology, "
                                  "2) Talos machine config generation, "
                                  "3) Control plane bootstrapping, "
                                  "4) Worker node joining, "
                                  "5) CNI setup with Cilium, "
                                  "6) Storage with Rook-Ceph, "
                                  "7) GitOps with Flux CD. "
                                  "Include YAML examples for each step."
                              ),
                          },
                      ],
                      "max_tokens": 2048,
                  },
                  {
                      # Repeats the medium-1 system prompt + user prefix so runs
                      # with prefix caching enabled can be compared against it.
                      "id": "repeat-prefix-1",
                      "category": "prefix-cache-test",
                      "messages": [
                          {
                              "role": "system",
                              "content": "You are a helpful AI assistant running on a homelab.",
                          },
                          {
                              "role": "user",
                              "content": (
                                  "Compare and contrast supervised and unsupervised "
                                  "machine learning. Now focus specifically on "
                                  "reinforcement learning and how it differs."
                              ),
                          },
                      ],
                      "max_tokens": 512,
                  },
              ]
        image: python:3.13-slim
exec-create-tuning-run:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- create_tuning_run
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef create_tuning_run(\n experiment_name: str,\n run_name:\
\ str,\n tuning_params: dict,\n mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
,\n) -> NamedTuple(\"RunInfo\", [(\"run_id\", str), (\"experiment_id\",\
\ str)]):\n \"\"\"Create an MLflow run for a vLLM tuning experiment.\"\
\"\"\n import os\n import mlflow\n from mlflow.tracking import\
\ MlflowClient\n from collections import namedtuple\n\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
\ client = MlflowClient()\n\n exp = client.get_experiment_by_name(experiment_name)\n\
\ experiment_id = (\n exp.experiment_id\n if exp\n \
\ else client.create_experiment(\n name=experiment_name,\n\
\ artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
,\n )\n )\n\n tags = {\n \"pipeline.type\": \"vllm-tuning\"\
,\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\", \"unknown\"),\n\
\ }\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\
\ run_name=run_name, tags=tags\n )\n # Log every tuning param\n \
\ for key, value in tuning_params.items():\n mlflow.log_param(f\"\
vllm.{key}\", value)\n run_id = run.info.run_id\n mlflow.end_run()\n\
\n RunInfo = namedtuple(\"RunInfo\", [\"run_id\", \"experiment_id\"])\n\
\ return RunInfo(run_id, experiment_id)\n\n"
image: python:3.13-slim
exec-log-benchmark-results:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- log_benchmark_results
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef log_benchmark_results(\n run_id: str,\n metrics: dict,\n\
\ mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
,\n) -> str:\n \"\"\"Log benchmark metrics to MLflow and close the run.\"\
\"\"\n import json\n import tempfile\n import mlflow\n from\
\ mlflow.tracking import MlflowClient\n from pathlib import Path\n\n\
\ mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
\n for key, value in metrics.items():\n client.log_metric(run_id,\
\ key, float(value))\n\n # Save full results as artifact\n with tempfile.TemporaryDirectory()\
\ as tmpdir:\n path = Path(tmpdir) / \"benchmark_results.json\"\n\
\ path.write_text(json.dumps(metrics, indent=2))\n client.log_artifact(run_id,\
\ str(path))\n\n client.set_terminated(run_id, status=\"FINISHED\")\n\
\ return run_id\n\n"
image: python:3.13-slim
exec-run-benchmark:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- run_benchmark
command:
- sh
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_benchmark(\n prompts: list,\n llm_endpoint: str,\n\
\ model_name: str,\n num_warmup: int = 2,\n num_iterations: int\
\ = 3,\n) -> dict:\n \"\"\"\n Run all prompts through the LLM endpoint\
\ and collect timing metrics.\n\n Returns aggregate metrics: p50/p95/mean\
\ latency, tokens/sec, TTFT.\n \"\"\"\n import time\n import statistics\n\
\ import httpx\n\n all_latencies: list[float] = []\n all_tps: list[float]\
\ = []\n all_ttft: list[float] = []\n per_category: dict[str, list[float]]\
\ = {}\n\n with httpx.Client(timeout=300.0) as client:\n # Warmup\n\
\ for _ in range(num_warmup):\n try:\n \
\ client.post(\n f\"{llm_endpoint}/v1/chat/completions\"\
,\n json={\n \"model\": model_name,\n\
\ \"messages\": [{\"role\": \"user\", \"content\"\
: \"Hi\"}],\n \"max_tokens\": 8,\n \
\ \"temperature\": 0,\n },\n \
\ )\n except Exception:\n pass\n\n # Benchmark\n\
\ for iteration in range(num_iterations):\n for prompt\
\ in prompts:\n category = prompt.get(\"category\", \"unknown\"\
)\n payload = {\n \"model\": model_name,\n\
\ \"messages\": prompt[\"messages\"],\n \
\ \"max_tokens\": prompt.get(\"max_tokens\", 256),\n \
\ \"temperature\": 0,\n \"stream\": True,\n \
\ }\n\n try:\n t_start = time.perf_counter()\n\
\ first_token_time = None\n\n with\
\ client.stream(\n \"POST\",\n \
\ f\"{llm_endpoint}/v1/chat/completions\",\n \
\ json=payload,\n ) as resp:\n \
\ resp.raise_for_status()\n completion_tokens =\
\ 0\n for line in resp.iter_lines():\n \
\ if not line.startswith(\"data: \"):\n \
\ continue\n chunk = line[6:]\n\
\ if chunk == \"[DONE]\":\n \
\ break\n if first_token_time is\
\ None:\n first_token_time = time.perf_counter()\n\
\ completion_tokens += 1\n\n \
\ t_end = time.perf_counter()\n latency = t_end -\
\ t_start\n ttft = (\n (first_token_time\
\ - t_start)\n if first_token_time\n \
\ else latency\n )\n tps\
\ = (\n completion_tokens / latency if latency >\
\ 0 else 0\n )\n\n all_latencies.append(latency)\n\
\ all_tps.append(tps)\n all_ttft.append(ttft)\n\
\ per_category.setdefault(category, []).append(latency)\n\
\n except Exception as exc:\n # Record\
\ failure but keep going\n all_latencies.append(-1)\n\
\ all_tps.append(0)\n all_ttft.append(-1)\n\
\n # Compute aggregates\n valid_latencies = [l for l in all_latencies\
\ if l > 0]\n valid_tps = [t for t in all_tps if t > 0]\n valid_ttft\
\ = [t for t in all_ttft if t > 0]\n\n def safe_stat(values, func):\n\
\ return func(values) if values else 0\n\n metrics = {\n \
\ \"total_requests\": len(all_latencies),\n \"successful_requests\"\
: len(valid_latencies),\n \"failed_requests\": len(all_latencies)\
\ - len(valid_latencies),\n # Latency\n \"latency_mean_s\"\
: safe_stat(valid_latencies, statistics.mean),\n \"latency_p50_s\"\
: safe_stat(\n valid_latencies,\n lambda v: statistics.median(v),\n\
\ ),\n \"latency_p95_s\": safe_stat(\n valid_latencies,\n\
\ lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,\n \
\ ),\n # Throughput\n \"tokens_per_second_mean\": safe_stat(valid_tps,\
\ statistics.mean),\n \"tokens_per_second_p50\": safe_stat(\n \
\ valid_tps, lambda v: statistics.median(v)\n ),\n \
\ # Time to first token\n \"ttft_mean_s\": safe_stat(valid_ttft,\
\ statistics.mean),\n \"ttft_p50_s\": safe_stat(valid_ttft, lambda\
\ v: statistics.median(v)),\n \"ttft_p95_s\": safe_stat(\n \
\ valid_ttft,\n lambda v: sorted(v)[int(len(v) * 0.95)]\
\ if v else 0,\n ),\n }\n\n # Per-category latency\n for\
\ cat, lats in per_category.items():\n valid = [l for l in lats if\
\ l > 0]\n if valid:\n metrics[f\"latency_mean_{cat}_s\"\
] = statistics.mean(valid)\n\n return metrics\n\n"
image: python:3.13-slim
pipelineInfo:
  description: Benchmark vLLM with different tuning configurations. Logs latency,
    TPS, and TTFT to MLflow for A/B comparison.
  name: vllm-tuning-evaluation
# Task graph: build-prompt-suite -> run-benchmark -> log-benchmark-results,
# with create-tuning-run supplying the MLflow run_id to the logging step.
root:
  dag:
    tasks:
      build-prompt-suite:
        # Pure data producer; safe to cache.
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-build-prompt-suite
        taskInfo:
          name: build-prompt-suite
      create-tuning-run:
        # FIX: caching disabled (was enableCache: true). This step creates a
        # new MLflow run — a side effect. A cache hit would replay a stale
        # run_id for a run already marked FINISHED, so log-benchmark-results
        # would append to an old run instead of creating a fresh one.
        # run-benchmark below already disables caching for the same reason.
        # Mirror this with .set_caching_options(False) in the pipeline .py
        # source so a recompile does not reintroduce the problem.
        cachingOptions: {}
        componentRef:
          name: comp-create-tuning-run
        inputs:
          parameters:
            experiment_name:
              runtimeValue:
                constant: vllm-tuning
            pipelinechannel--enable_chunked_prefill:
              componentInputParameter: enable_chunked_prefill
            pipelinechannel--enable_prefix_caching:
              componentInputParameter: enable_prefix_caching
            pipelinechannel--gpu_memory_utilization:
              componentInputParameter: gpu_memory_utilization
            pipelinechannel--llm_endpoint:
              componentInputParameter: llm_endpoint
            pipelinechannel--model_name:
              componentInputParameter: model_name
            pipelinechannel--ngram_prompt_lookup_max:
              componentInputParameter: ngram_prompt_lookup_max
            pipelinechannel--num_iterations:
              componentInputParameter: num_iterations
            pipelinechannel--num_speculative_tokens:
              componentInputParameter: num_speculative_tokens
            pipelinechannel--num_warmup:
              componentInputParameter: num_warmup
            pipelinechannel--run_label:
              componentInputParameter: run_label
            run_name:
              runtimeValue:
                constant: vllm-{{$.inputs.parameters['pipelinechannel--run_label']}}
            tuning_params:
              runtimeValue:
                constant:
                  enable_chunked_prefill: '{{$.inputs.parameters[''pipelinechannel--enable_chunked_prefill'']}}'
                  enable_prefix_caching: '{{$.inputs.parameters[''pipelinechannel--enable_prefix_caching'']}}'
                  gpu_memory_utilization: '{{$.inputs.parameters[''pipelinechannel--gpu_memory_utilization'']}}'
                  llm_endpoint: '{{$.inputs.parameters[''pipelinechannel--llm_endpoint'']}}'
                  model_name: '{{$.inputs.parameters[''pipelinechannel--model_name'']}}'
                  ngram_prompt_lookup_max: '{{$.inputs.parameters[''pipelinechannel--ngram_prompt_lookup_max'']}}'
                  num_iterations: '{{$.inputs.parameters[''pipelinechannel--num_iterations'']}}'
                  num_speculative_tokens: '{{$.inputs.parameters[''pipelinechannel--num_speculative_tokens'']}}'
                  num_warmup: '{{$.inputs.parameters[''pipelinechannel--num_warmup'']}}'
        taskInfo:
          name: create-tuning-run
      log-benchmark-results:
        # FIX: caching disabled (was enableCache: true). Logging metrics and
        # terminating the MLflow run are side effects that must happen on
        # every execution, not be served from cache.
        cachingOptions: {}
        componentRef:
          name: comp-log-benchmark-results
        dependentTasks:
        - create-tuning-run
        - run-benchmark
        inputs:
          parameters:
            metrics:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: run-benchmark
            run_id:
              taskOutputParameter:
                outputParameterKey: run_id
                producerTask: create-tuning-run
        taskInfo:
          name: log-benchmark-results
      run-benchmark:
        # Caching intentionally disabled: timing measurements must be fresh.
        cachingOptions: {}
        componentRef:
          name: comp-run-benchmark
        dependentTasks:
        - build-prompt-suite
        inputs:
          parameters:
            llm_endpoint:
              componentInputParameter: llm_endpoint
            model_name:
              componentInputParameter: model_name
            num_iterations:
              componentInputParameter: num_iterations
            num_warmup:
              componentInputParameter: num_warmup
            prompts:
              taskOutputParameter:
                outputParameterKey: Output
                producerTask: build-prompt-suite
        taskInfo:
          name: run-benchmark
inputDefinitions:
parameters:
enable_chunked_prefill:
defaultValue: 'true'
description: '"true" or "false"'
isOptional: true
parameterType: STRING
enable_prefix_caching:
defaultValue: 'true'
description: '"true" or "false"'
isOptional: true
parameterType: STRING
gpu_memory_utilization:
defaultValue: '0.90'
description: 0.0 - 1.0
isOptional: true
parameterType: STRING
llm_endpoint:
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
description: vLLM inference endpoint URL
isOptional: true
parameterType: STRING
model_name:
defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
description: HF model identifier
isOptional: true
parameterType: STRING
ngram_prompt_lookup_max:
defaultValue: '4'
description: ngram window for spec decode (0 = off)
isOptional: true
parameterType: STRING
num_iterations:
defaultValue: 3.0
description: how many times to repeat the prompt suite
isOptional: true
parameterType: NUMBER_INTEGER
num_speculative_tokens:
defaultValue: '3'
description: number of speculative tokens (0 = off)
isOptional: true
parameterType: STRING
num_warmup:
defaultValue: 2.0
description: warmup requests before timing
isOptional: true
parameterType: NUMBER_INTEGER
run_label:
defaultValue: baseline
description: human-readable label (e.g. "apc-on-spec3")
isOptional: true
parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1