New:
- vllm_tuning_pipeline.py: A/B-benchmark different vLLM configs; logs latency/TPS/TTFT to MLflow ("vllm-tuning" experiment)
- vllm_tuning_pipeline.yaml: compiled KFP YAML

Updated:
- voice_pipeline.py: per-step NamedTuple outputs with latency tracking; new log_pipeline_metrics MLflow component
- voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled

502 lines · 26 KiB · YAML
# PIPELINE DEFINITION
# NOTE: auto-generated by the KFP compiler from vllm_tuning_pipeline.py.
# Do not hand-edit; change the Python source and recompile instead.
# Name: vllm-tuning-evaluation
|
|
# Description: Benchmark vLLM with different tuning configurations. Logs latency, TPS, and TTFT to MLflow for A/B comparison.
|
|
# Inputs:
|
|
# enable_chunked_prefill: str [Default: 'true']
|
|
# enable_prefix_caching: str [Default: 'true']
|
|
# gpu_memory_utilization: str [Default: '0.90']
|
|
# llm_endpoint: str [Default: 'http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm']
|
|
# model_name: str [Default: 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4']
|
|
# ngram_prompt_lookup_max: str [Default: '4']
|
|
# num_iterations: int [Default: 3.0]
|
|
# num_speculative_tokens: str [Default: '3']
|
|
# num_warmup: int [Default: 2.0]
|
|
# run_label: str [Default: 'baseline']
|
|
components:
|
|
comp-build-prompt-suite:
|
|
executorLabel: exec-build-prompt-suite
|
|
outputDefinitions:
|
|
parameters:
|
|
Output:
|
|
parameterType: LIST
|
|
comp-create-tuning-run:
|
|
executorLabel: exec-create-tuning-run
|
|
inputDefinitions:
|
|
parameters:
|
|
experiment_name:
|
|
parameterType: STRING
|
|
mlflow_tracking_uri:
|
|
defaultValue: http://mlflow.mlflow.svc.cluster.local:80
|
|
isOptional: true
|
|
parameterType: STRING
|
|
run_name:
|
|
parameterType: STRING
|
|
tuning_params:
|
|
parameterType: STRUCT
|
|
outputDefinitions:
|
|
parameters:
|
|
experiment_id:
|
|
parameterType: STRING
|
|
run_id:
|
|
parameterType: STRING
|
|
comp-log-benchmark-results:
|
|
executorLabel: exec-log-benchmark-results
|
|
inputDefinitions:
|
|
parameters:
|
|
metrics:
|
|
parameterType: STRUCT
|
|
mlflow_tracking_uri:
|
|
defaultValue: http://mlflow.mlflow.svc.cluster.local:80
|
|
isOptional: true
|
|
parameterType: STRING
|
|
run_id:
|
|
parameterType: STRING
|
|
outputDefinitions:
|
|
parameters:
|
|
Output:
|
|
parameterType: STRING
|
|
comp-run-benchmark:
|
|
executorLabel: exec-run-benchmark
|
|
inputDefinitions:
|
|
parameters:
|
|
llm_endpoint:
|
|
parameterType: STRING
|
|
model_name:
|
|
parameterType: STRING
|
|
num_iterations:
|
|
defaultValue: 3.0
|
|
isOptional: true
|
|
parameterType: NUMBER_INTEGER
|
|
num_warmup:
|
|
defaultValue: 2.0
|
|
isOptional: true
|
|
parameterType: NUMBER_INTEGER
|
|
prompts:
|
|
parameterType: LIST
|
|
outputDefinitions:
|
|
parameters:
|
|
Output:
|
|
parameterType: STRUCT
|
|
deploymentSpec:
|
|
executors:
|
|
exec-build-prompt-suite:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- build_prompt_suite
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef build_prompt_suite() -> list:\n \"\"\"Return a list of test\
|
|
\ prompts spanning short, medium, and long inputs.\"\"\"\n return [\n\
|
|
\ {\n \"id\": \"short-1\",\n \"category\":\
|
|
\ \"short\",\n \"messages\": [\n {\"role\": \"\
|
|
user\", \"content\": \"What is the capital of France?\"}\n ],\n\
|
|
\ \"max_tokens\": 64,\n },\n {\n \"\
|
|
id\": \"short-2\",\n \"category\": \"short\",\n \"\
|
|
messages\": [\n {\"role\": \"user\", \"content\": \"Explain\
|
|
\ quantum computing in one sentence.\"}\n ],\n \"\
|
|
max_tokens\": 64,\n },\n {\n \"id\": \"medium-1\"\
|
|
,\n \"category\": \"medium\",\n \"messages\": [\n\
|
|
\ {\n \"role\": \"system\",\n \
|
|
\ \"content\": \"You are a helpful AI assistant running on a\
|
|
\ homelab.\",\n },\n {\n \
|
|
\ \"role\": \"user\",\n \"content\": (\n \
|
|
\ \"Compare and contrast supervised and unsupervised \"\n \
|
|
\ \"machine learning. Give examples of each and explain\
|
|
\ \"\n \"when you would choose one over the other.\"\
|
|
\n ),\n },\n ],\n \
|
|
\ \"max_tokens\": 512,\n },\n {\n \"id\": \"\
|
|
medium-2\",\n \"category\": \"medium\",\n \"messages\"\
|
|
: [\n {\n \"role\": \"user\",\n \
|
|
\ \"content\": (\n \"Write a Python\
|
|
\ function that implements a binary search \"\n \"\
|
|
tree with insert, search, and delete operations. Include \"\n \
|
|
\ \"docstrings and type hints.\"\n ),\n\
|
|
\ },\n ],\n \"max_tokens\": 1024,\n\
|
|
\ },\n {\n \"id\": \"long-1\",\n \"\
|
|
category\": \"long\",\n \"messages\": [\n {\n\
|
|
\ \"role\": \"system\",\n \"content\"\
|
|
: \"You are a technical writer for a Kubernetes homelab blog.\",\n \
|
|
\ },\n {\n \"role\": \"user\"\
|
|
,\n \"content\": (\n \"Write a\
|
|
\ detailed tutorial on setting up a multi-node \"\n \
|
|
\ \"Kubernetes cluster with Talos Linux, covering: \"\n \
|
|
\ \"1) Hardware requirements and network topology, \"\n \
|
|
\ \"2) Talos machine config generation, \"\n \
|
|
\ \"3) Control plane bootstrapping, \"\n \
|
|
\ \"4) Worker node joining, \"\n \"5) CNI setup\
|
|
\ with Cilium, \"\n \"6) Storage with Rook-Ceph,\
|
|
\ \"\n \"7) GitOps with Flux CD. \"\n \
|
|
\ \"Include YAML examples for each step.\"\n \
|
|
\ ),\n },\n ],\n \"max_tokens\"\
|
|
: 2048,\n },\n {\n \"id\": \"repeat-prefix-1\"\
|
|
,\n \"category\": \"prefix-cache-test\",\n \"messages\"\
|
|
: [\n {\n \"role\": \"system\",\n \
|
|
\ \"content\": \"You are a helpful AI assistant running on\
|
|
\ a homelab.\",\n },\n {\n \
|
|
\ \"role\": \"user\",\n \"content\": (\n \
|
|
\ \"Compare and contrast supervised and unsupervised \"\n\
|
|
\ \"machine learning. Now focus specifically on \"\
|
|
\n \"reinforcement learning and how it differs.\"\
|
|
\n ),\n },\n ],\n \
|
|
\ \"max_tokens\": 512,\n },\n ]\n\n"
|
|
image: python:3.13-slim
|
|
exec-create-tuning-run:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- create_tuning_run
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
|
|
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef create_tuning_run(\n experiment_name: str,\n run_name:\
|
|
\ str,\n tuning_params: dict,\n mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
|
|
,\n) -> NamedTuple(\"RunInfo\", [(\"run_id\", str), (\"experiment_id\",\
|
|
\ str)]):\n \"\"\"Create an MLflow run for a vLLM tuning experiment.\"\
|
|
\"\"\n import os\n import mlflow\n from mlflow.tracking import\
|
|
\ MlflowClient\n from collections import namedtuple\n\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
|
|
\ client = MlflowClient()\n\n exp = client.get_experiment_by_name(experiment_name)\n\
|
|
\ experiment_id = (\n exp.experiment_id\n if exp\n \
|
|
\ else client.create_experiment(\n name=experiment_name,\n\
|
|
\ artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
|
|
,\n )\n )\n\n tags = {\n \"pipeline.type\": \"vllm-tuning\"\
|
|
,\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\", \"unknown\"),\n\
|
|
\ }\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\
|
|
\ run_name=run_name, tags=tags\n )\n # Log every tuning param\n \
|
|
\ for key, value in tuning_params.items():\n mlflow.log_param(f\"\
|
|
vllm.{key}\", value)\n run_id = run.info.run_id\n mlflow.end_run()\n\
|
|
\n RunInfo = namedtuple(\"RunInfo\", [\"run_id\", \"experiment_id\"])\n\
|
|
\ return RunInfo(run_id, experiment_id)\n\n"
|
|
image: python:3.13-slim
|
|
exec-log-benchmark-results:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- log_benchmark_results
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
|
|
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef log_benchmark_results(\n run_id: str,\n metrics: dict,\n\
|
|
\ mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
|
|
,\n) -> str:\n \"\"\"Log benchmark metrics to MLflow and close the run.\"\
|
|
\"\"\n import json\n import tempfile\n import mlflow\n from\
|
|
\ mlflow.tracking import MlflowClient\n from pathlib import Path\n\n\
|
|
\ mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
|
|
\n for key, value in metrics.items():\n client.log_metric(run_id,\
|
|
\ key, float(value))\n\n # Save full results as artifact\n with tempfile.TemporaryDirectory()\
|
|
\ as tmpdir:\n path = Path(tmpdir) / \"benchmark_results.json\"\n\
|
|
\ path.write_text(json.dumps(metrics, indent=2))\n client.log_artifact(run_id,\
|
|
\ str(path))\n\n client.set_terminated(run_id, status=\"FINISHED\")\n\
|
|
\ return run_id\n\n"
|
|
image: python:3.13-slim
|
|
exec-run-benchmark:
|
|
container:
|
|
args:
|
|
- --executor_input
|
|
- '{{$}}'
|
|
- --function_to_execute
|
|
- run_benchmark
|
|
command:
|
|
- sh
|
|
- -c
|
|
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
|
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
|
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
|
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
|
$0\" \"$@\"\n"
|
|
- sh
|
|
- -ec
|
|
- 'program_path=$(mktemp -d)
|
|
|
|
|
|
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
|
|
|
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
|
|
|
'
|
|
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
|
\ *\n\ndef run_benchmark(\n prompts: list,\n llm_endpoint: str,\n\
|
|
\ model_name: str,\n num_warmup: int = 2,\n num_iterations: int\
|
|
\ = 3,\n) -> dict:\n \"\"\"\n Run all prompts through the LLM endpoint\
|
|
\ and collect timing metrics.\n\n Returns aggregate metrics: p50/p95/mean\
|
|
\ latency, tokens/sec, TTFT.\n \"\"\"\n import time\n import statistics\n\
|
|
\ import httpx\n\n all_latencies: list[float] = []\n all_tps: list[float]\
|
|
\ = []\n all_ttft: list[float] = []\n per_category: dict[str, list[float]]\
|
|
\ = {}\n\n with httpx.Client(timeout=300.0) as client:\n # Warmup\n\
|
|
\ for _ in range(num_warmup):\n try:\n \
|
|
\ client.post(\n f\"{llm_endpoint}/v1/chat/completions\"\
|
|
,\n json={\n \"model\": model_name,\n\
|
|
\ \"messages\": [{\"role\": \"user\", \"content\"\
|
|
: \"Hi\"}],\n \"max_tokens\": 8,\n \
|
|
\ \"temperature\": 0,\n },\n \
|
|
\ )\n except Exception:\n pass\n\n # Benchmark\n\
|
|
\ for iteration in range(num_iterations):\n for prompt\
|
|
\ in prompts:\n category = prompt.get(\"category\", \"unknown\"\
|
|
)\n payload = {\n \"model\": model_name,\n\
|
|
\ \"messages\": prompt[\"messages\"],\n \
|
|
\ \"max_tokens\": prompt.get(\"max_tokens\", 256),\n \
|
|
\ \"temperature\": 0,\n \"stream\": True,\n \
|
|
\ }\n\n try:\n t_start = time.perf_counter()\n\
|
|
\ first_token_time = None\n\n with\
|
|
\ client.stream(\n \"POST\",\n \
|
|
\ f\"{llm_endpoint}/v1/chat/completions\",\n \
|
|
\ json=payload,\n ) as resp:\n \
|
|
\ resp.raise_for_status()\n completion_tokens =\
|
|
\ 0\n for line in resp.iter_lines():\n \
|
|
\ if not line.startswith(\"data: \"):\n \
|
|
\ continue\n chunk = line[6:]\n\
|
|
\ if chunk == \"[DONE]\":\n \
|
|
\ break\n if first_token_time is\
|
|
\ None:\n first_token_time = time.perf_counter()\n\
|
|
\ completion_tokens += 1\n\n \
|
|
\ t_end = time.perf_counter()\n latency = t_end -\
|
|
\ t_start\n ttft = (\n (first_token_time\
|
|
\ - t_start)\n if first_token_time\n \
|
|
\ else latency\n )\n tps\
|
|
\ = (\n completion_tokens / latency if latency >\
|
|
\ 0 else 0\n )\n\n all_latencies.append(latency)\n\
|
|
\ all_tps.append(tps)\n all_ttft.append(ttft)\n\
|
|
\ per_category.setdefault(category, []).append(latency)\n\
|
|
\n except Exception as exc:\n # Record\
|
|
\ failure but keep going\n all_latencies.append(-1)\n\
|
|
\ all_tps.append(0)\n all_ttft.append(-1)\n\
|
|
\n # Compute aggregates\n valid_latencies = [l for l in all_latencies\
|
|
\ if l > 0]\n valid_tps = [t for t in all_tps if t > 0]\n valid_ttft\
|
|
\ = [t for t in all_ttft if t > 0]\n\n def safe_stat(values, func):\n\
|
|
\ return func(values) if values else 0\n\n metrics = {\n \
|
|
\ \"total_requests\": len(all_latencies),\n \"successful_requests\"\
|
|
: len(valid_latencies),\n \"failed_requests\": len(all_latencies)\
|
|
\ - len(valid_latencies),\n # Latency\n \"latency_mean_s\"\
|
|
: safe_stat(valid_latencies, statistics.mean),\n \"latency_p50_s\"\
|
|
: safe_stat(\n valid_latencies,\n lambda v: statistics.median(v),\n\
|
|
\ ),\n \"latency_p95_s\": safe_stat(\n valid_latencies,\n\
|
|
\ lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,\n \
|
|
\ ),\n # Throughput\n \"tokens_per_second_mean\": safe_stat(valid_tps,\
|
|
\ statistics.mean),\n \"tokens_per_second_p50\": safe_stat(\n \
|
|
\ valid_tps, lambda v: statistics.median(v)\n ),\n \
|
|
\ # Time to first token\n \"ttft_mean_s\": safe_stat(valid_ttft,\
|
|
\ statistics.mean),\n \"ttft_p50_s\": safe_stat(valid_ttft, lambda\
|
|
\ v: statistics.median(v)),\n \"ttft_p95_s\": safe_stat(\n \
|
|
\ valid_ttft,\n lambda v: sorted(v)[int(len(v) * 0.95)]\
|
|
\ if v else 0,\n ),\n }\n\n # Per-category latency\n for\
|
|
\ cat, lats in per_category.items():\n valid = [l for l in lats if\
|
|
\ l > 0]\n if valid:\n metrics[f\"latency_mean_{cat}_s\"\
|
|
] = statistics.mean(valid)\n\n return metrics\n\n"
|
|
image: python:3.13-slim
|
|
pipelineInfo:
|
|
description: Benchmark vLLM with different tuning configurations. Logs latency,
|
|
TPS, and TTFT to MLflow for A/B comparison.
|
|
name: vllm-tuning-evaluation
|
|
root:
|
|
dag:
|
|
tasks:
|
|
build-prompt-suite:
|
|
cachingOptions:
|
|
enableCache: true
|
|
componentRef:
|
|
name: comp-build-prompt-suite
|
|
taskInfo:
|
|
name: build-prompt-suite
|
|
create-tuning-run:
|
|
cachingOptions:
  # Do not cache: this step creates a fresh MLflow run on every
  # execution. A cache hit would replay a stale run_id whose MLflow run
  # was already terminated (status=FINISHED), so the downstream
  # log-benchmark-results step would log metrics into a closed run.
  # (run-benchmark already disables caching for the same reason.)
  enableCache: false
|
|
componentRef:
|
|
name: comp-create-tuning-run
|
|
inputs:
|
|
parameters:
|
|
experiment_name:
|
|
runtimeValue:
|
|
constant: vllm-tuning
|
|
pipelinechannel--enable_chunked_prefill:
|
|
componentInputParameter: enable_chunked_prefill
|
|
pipelinechannel--enable_prefix_caching:
|
|
componentInputParameter: enable_prefix_caching
|
|
pipelinechannel--gpu_memory_utilization:
|
|
componentInputParameter: gpu_memory_utilization
|
|
pipelinechannel--llm_endpoint:
|
|
componentInputParameter: llm_endpoint
|
|
pipelinechannel--model_name:
|
|
componentInputParameter: model_name
|
|
pipelinechannel--ngram_prompt_lookup_max:
|
|
componentInputParameter: ngram_prompt_lookup_max
|
|
pipelinechannel--num_iterations:
|
|
componentInputParameter: num_iterations
|
|
pipelinechannel--num_speculative_tokens:
|
|
componentInputParameter: num_speculative_tokens
|
|
pipelinechannel--num_warmup:
|
|
componentInputParameter: num_warmup
|
|
pipelinechannel--run_label:
|
|
componentInputParameter: run_label
|
|
run_name:
|
|
runtimeValue:
|
|
constant: vllm-{{$.inputs.parameters['pipelinechannel--run_label']}}
|
|
tuning_params:
|
|
runtimeValue:
|
|
constant:
|
|
enable_chunked_prefill: '{{$.inputs.parameters[''pipelinechannel--enable_chunked_prefill'']}}'
|
|
enable_prefix_caching: '{{$.inputs.parameters[''pipelinechannel--enable_prefix_caching'']}}'
|
|
gpu_memory_utilization: '{{$.inputs.parameters[''pipelinechannel--gpu_memory_utilization'']}}'
|
|
llm_endpoint: '{{$.inputs.parameters[''pipelinechannel--llm_endpoint'']}}'
|
|
model_name: '{{$.inputs.parameters[''pipelinechannel--model_name'']}}'
|
|
ngram_prompt_lookup_max: '{{$.inputs.parameters[''pipelinechannel--ngram_prompt_lookup_max'']}}'
|
|
num_iterations: '{{$.inputs.parameters[''pipelinechannel--num_iterations'']}}'
|
|
num_speculative_tokens: '{{$.inputs.parameters[''pipelinechannel--num_speculative_tokens'']}}'
|
|
num_warmup: '{{$.inputs.parameters[''pipelinechannel--num_warmup'']}}'
|
|
taskInfo:
|
|
name: create-tuning-run
|
|
log-benchmark-results:
|
|
cachingOptions:
  # Do not cache: this step's only purpose is external side effects
  # (log metrics/artifacts to MLflow and terminate the run). A cache hit
  # would silently skip logging for a repeated parameter combination.
  enableCache: false
|
|
componentRef:
|
|
name: comp-log-benchmark-results
|
|
dependentTasks:
|
|
- create-tuning-run
|
|
- run-benchmark
|
|
inputs:
|
|
parameters:
|
|
metrics:
|
|
taskOutputParameter:
|
|
outputParameterKey: Output
|
|
producerTask: run-benchmark
|
|
run_id:
|
|
taskOutputParameter:
|
|
outputParameterKey: run_id
|
|
producerTask: create-tuning-run
|
|
taskInfo:
|
|
name: log-benchmark-results
|
|
run-benchmark:
|
|
cachingOptions: {}
|
|
componentRef:
|
|
name: comp-run-benchmark
|
|
dependentTasks:
|
|
- build-prompt-suite
|
|
inputs:
|
|
parameters:
|
|
llm_endpoint:
|
|
componentInputParameter: llm_endpoint
|
|
model_name:
|
|
componentInputParameter: model_name
|
|
num_iterations:
|
|
componentInputParameter: num_iterations
|
|
num_warmup:
|
|
componentInputParameter: num_warmup
|
|
prompts:
|
|
taskOutputParameter:
|
|
outputParameterKey: Output
|
|
producerTask: build-prompt-suite
|
|
taskInfo:
|
|
name: run-benchmark
|
|
inputDefinitions:
|
|
parameters:
|
|
enable_chunked_prefill:
|
|
defaultValue: 'true'
|
|
description: '"true" or "false"'
|
|
isOptional: true
|
|
parameterType: STRING
|
|
enable_prefix_caching:
|
|
defaultValue: 'true'
|
|
description: '"true" or "false"'
|
|
isOptional: true
|
|
parameterType: STRING
|
|
gpu_memory_utilization:
|
|
defaultValue: '0.90'
|
|
description: 0.0 - 1.0
|
|
isOptional: true
|
|
parameterType: STRING
|
|
llm_endpoint:
|
|
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
|
|
description: vLLM inference endpoint URL
|
|
isOptional: true
|
|
parameterType: STRING
|
|
model_name:
|
|
defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
|
|
description: HF model identifier
|
|
isOptional: true
|
|
parameterType: STRING
|
|
ngram_prompt_lookup_max:
|
|
defaultValue: '4'
|
|
description: ngram window for spec decode (0 = off)
|
|
isOptional: true
|
|
parameterType: STRING
|
|
num_iterations:
|
|
defaultValue: 3.0
|
|
description: how many times to repeat the prompt suite
|
|
isOptional: true
|
|
parameterType: NUMBER_INTEGER
|
|
num_speculative_tokens:
|
|
defaultValue: '3'
|
|
description: number of speculative tokens (0 = off)
|
|
isOptional: true
|
|
parameterType: STRING
|
|
num_warmup:
|
|
defaultValue: 2.0
|
|
description: warmup requests before timing
|
|
isOptional: true
|
|
parameterType: NUMBER_INTEGER
|
|
run_label:
|
|
defaultValue: baseline
|
|
description: human-readable label (e.g. "apc-on-spec3")
|
|
isOptional: true
|
|
parameterType: STRING
|
|
schemaVersion: 2.1.0
|
|
sdkVersion: kfp-2.12.1
|