feat: add vLLM tuning pipeline + recompile voice pipelines with MLflow
New: - vllm_tuning_pipeline.py: A/B benchmark different vLLM configs, logs latency/TPS/TTFT to MLflow (vllm-tuning experiment) - vllm_tuning_pipeline.yaml: compiled KFP YAML Updated: - voice_pipeline.py: per-step NamedTuple outputs with latency tracking, new log_pipeline_metrics MLflow component - voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml: recompiled
This commit is contained in:
501
vllm_tuning_pipeline.yaml
Normal file
501
vllm_tuning_pipeline.yaml
Normal file
@@ -0,0 +1,501 @@
|
||||
# PIPELINE DEFINITION
|
||||
# Name: vllm-tuning-evaluation
|
||||
# Description: Benchmark vLLM with different tuning configurations. Logs latency, TPS, and TTFT to MLflow for A/B comparison.
|
||||
# Inputs:
|
||||
# enable_chunked_prefill: str [Default: 'true']
|
||||
# enable_prefix_caching: str [Default: 'true']
|
||||
# gpu_memory_utilization: str [Default: '0.90']
|
||||
# llm_endpoint: str [Default: 'http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm']
|
||||
# model_name: str [Default: 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4']
|
||||
# ngram_prompt_lookup_max: str [Default: '4']
|
||||
# num_iterations: int [Default: 3.0]
|
||||
# num_speculative_tokens: str [Default: '3']
|
||||
# num_warmup: int [Default: 2.0]
|
||||
# run_label: str [Default: 'baseline']
|
||||
components:
|
||||
comp-build-prompt-suite:
|
||||
executorLabel: exec-build-prompt-suite
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
Output:
|
||||
parameterType: LIST
|
||||
comp-create-tuning-run:
|
||||
executorLabel: exec-create-tuning-run
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
experiment_name:
|
||||
parameterType: STRING
|
||||
mlflow_tracking_uri:
|
||||
defaultValue: http://mlflow.mlflow.svc.cluster.local:80
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
run_name:
|
||||
parameterType: STRING
|
||||
tuning_params:
|
||||
parameterType: STRUCT
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
experiment_id:
|
||||
parameterType: STRING
|
||||
run_id:
|
||||
parameterType: STRING
|
||||
comp-log-benchmark-results:
|
||||
executorLabel: exec-log-benchmark-results
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
metrics:
|
||||
parameterType: STRUCT
|
||||
mlflow_tracking_uri:
|
||||
defaultValue: http://mlflow.mlflow.svc.cluster.local:80
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
run_id:
|
||||
parameterType: STRING
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
Output:
|
||||
parameterType: STRING
|
||||
comp-run-benchmark:
|
||||
executorLabel: exec-run-benchmark
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
llm_endpoint:
|
||||
parameterType: STRING
|
||||
model_name:
|
||||
parameterType: STRING
|
||||
num_iterations:
|
||||
defaultValue: 3.0
|
||||
isOptional: true
|
||||
parameterType: NUMBER_INTEGER
|
||||
num_warmup:
|
||||
defaultValue: 2.0
|
||||
isOptional: true
|
||||
parameterType: NUMBER_INTEGER
|
||||
prompts:
|
||||
parameterType: LIST
|
||||
outputDefinitions:
|
||||
parameters:
|
||||
Output:
|
||||
parameterType: STRUCT
|
||||
deploymentSpec:
|
||||
executors:
|
||||
exec-build-prompt-suite:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- build_prompt_suite
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
||||
$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef build_prompt_suite() -> list:\n \"\"\"Return a list of test\
|
||||
\ prompts spanning short, medium, and long inputs.\"\"\"\n return [\n\
|
||||
\ {\n \"id\": \"short-1\",\n \"category\":\
|
||||
\ \"short\",\n \"messages\": [\n {\"role\": \"\
|
||||
user\", \"content\": \"What is the capital of France?\"}\n ],\n\
|
||||
\ \"max_tokens\": 64,\n },\n {\n \"\
|
||||
id\": \"short-2\",\n \"category\": \"short\",\n \"\
|
||||
messages\": [\n {\"role\": \"user\", \"content\": \"Explain\
|
||||
\ quantum computing in one sentence.\"}\n ],\n \"\
|
||||
max_tokens\": 64,\n },\n {\n \"id\": \"medium-1\"\
|
||||
,\n \"category\": \"medium\",\n \"messages\": [\n\
|
||||
\ {\n \"role\": \"system\",\n \
|
||||
\ \"content\": \"You are a helpful AI assistant running on a\
|
||||
\ homelab.\",\n },\n {\n \
|
||||
\ \"role\": \"user\",\n \"content\": (\n \
|
||||
\ \"Compare and contrast supervised and unsupervised \"\n \
|
||||
\ \"machine learning. Give examples of each and explain\
|
||||
\ \"\n \"when you would choose one over the other.\"\
|
||||
\n ),\n },\n ],\n \
|
||||
\ \"max_tokens\": 512,\n },\n {\n \"id\": \"\
|
||||
medium-2\",\n \"category\": \"medium\",\n \"messages\"\
|
||||
: [\n {\n \"role\": \"user\",\n \
|
||||
\ \"content\": (\n \"Write a Python\
|
||||
\ function that implements a binary search \"\n \"\
|
||||
tree with insert, search, and delete operations. Include \"\n \
|
||||
\ \"docstrings and type hints.\"\n ),\n\
|
||||
\ },\n ],\n \"max_tokens\": 1024,\n\
|
||||
\ },\n {\n \"id\": \"long-1\",\n \"\
|
||||
category\": \"long\",\n \"messages\": [\n {\n\
|
||||
\ \"role\": \"system\",\n \"content\"\
|
||||
: \"You are a technical writer for a Kubernetes homelab blog.\",\n \
|
||||
\ },\n {\n \"role\": \"user\"\
|
||||
,\n \"content\": (\n \"Write a\
|
||||
\ detailed tutorial on setting up a multi-node \"\n \
|
||||
\ \"Kubernetes cluster with Talos Linux, covering: \"\n \
|
||||
\ \"1) Hardware requirements and network topology, \"\n \
|
||||
\ \"2) Talos machine config generation, \"\n \
|
||||
\ \"3) Control plane bootstrapping, \"\n \
|
||||
\ \"4) Worker node joining, \"\n \"5) CNI setup\
|
||||
\ with Cilium, \"\n \"6) Storage with Rook-Ceph,\
|
||||
\ \"\n \"7) GitOps with Flux CD. \"\n \
|
||||
\ \"Include YAML examples for each step.\"\n \
|
||||
\ ),\n },\n ],\n \"max_tokens\"\
|
||||
: 2048,\n },\n {\n \"id\": \"repeat-prefix-1\"\
|
||||
,\n \"category\": \"prefix-cache-test\",\n \"messages\"\
|
||||
: [\n {\n \"role\": \"system\",\n \
|
||||
\ \"content\": \"You are a helpful AI assistant running on\
|
||||
\ a homelab.\",\n },\n {\n \
|
||||
\ \"role\": \"user\",\n \"content\": (\n \
|
||||
\ \"Compare and contrast supervised and unsupervised \"\n\
|
||||
\ \"machine learning. Now focus specifically on \"\
|
||||
\n \"reinforcement learning and how it differs.\"\
|
||||
\n ),\n },\n ],\n \
|
||||
\ \"max_tokens\": 512,\n },\n ]\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-create-tuning-run:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- create_tuning_run
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
|
||||
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef create_tuning_run(\n experiment_name: str,\n run_name:\
|
||||
\ str,\n tuning_params: dict,\n mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
|
||||
,\n) -> NamedTuple(\"RunInfo\", [(\"run_id\", str), (\"experiment_id\",\
|
||||
\ str)]):\n \"\"\"Create an MLflow run for a vLLM tuning experiment.\"\
|
||||
\"\"\n import os\n import mlflow\n from mlflow.tracking import\
|
||||
\ MlflowClient\n from collections import namedtuple\n\n mlflow.set_tracking_uri(mlflow_tracking_uri)\n\
|
||||
\ client = MlflowClient()\n\n exp = client.get_experiment_by_name(experiment_name)\n\
|
||||
\ experiment_id = (\n exp.experiment_id\n if exp\n \
|
||||
\ else client.create_experiment(\n name=experiment_name,\n\
|
||||
\ artifact_location=f\"/mlflow/artifacts/{experiment_name}\"\
|
||||
,\n )\n )\n\n tags = {\n \"pipeline.type\": \"vllm-tuning\"\
|
||||
,\n \"kfp.run_id\": os.environ.get(\"KFP_RUN_ID\", \"unknown\"),\n\
|
||||
\ }\n\n run = mlflow.start_run(\n experiment_id=experiment_id,\
|
||||
\ run_name=run_name, tags=tags\n )\n # Log every tuning param\n \
|
||||
\ for key, value in tuning_params.items():\n mlflow.log_param(f\"\
|
||||
vllm.{key}\", value)\n run_id = run.info.run_id\n mlflow.end_run()\n\
|
||||
\n RunInfo = namedtuple(\"RunInfo\", [\"run_id\", \"experiment_id\"])\n\
|
||||
\ return RunInfo(run_id, experiment_id)\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-log-benchmark-results:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- log_benchmark_results
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0'\
|
||||
\ 'boto3' 'psycopg2-binary' && \"$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef log_benchmark_results(\n run_id: str,\n metrics: dict,\n\
|
||||
\ mlflow_tracking_uri: str = \"http://mlflow.mlflow.svc.cluster.local:80\"\
|
||||
,\n) -> str:\n \"\"\"Log benchmark metrics to MLflow and close the run.\"\
|
||||
\"\"\n import json\n import tempfile\n import mlflow\n from\
|
||||
\ mlflow.tracking import MlflowClient\n from pathlib import Path\n\n\
|
||||
\ mlflow.set_tracking_uri(mlflow_tracking_uri)\n client = MlflowClient()\n\
|
||||
\n for key, value in metrics.items():\n client.log_metric(run_id,\
|
||||
\ key, float(value))\n\n # Save full results as artifact\n with tempfile.TemporaryDirectory()\
|
||||
\ as tmpdir:\n path = Path(tmpdir) / \"benchmark_results.json\"\n\
|
||||
\ path.write_text(json.dumps(metrics, indent=2))\n client.log_artifact(run_id,\
|
||||
\ str(path))\n\n client.set_terminated(run_id, status=\"FINISHED\")\n\
|
||||
\ return run_id\n\n"
|
||||
image: python:3.13-slim
|
||||
exec-run-benchmark:
|
||||
container:
|
||||
args:
|
||||
- --executor_input
|
||||
- '{{$}}'
|
||||
- --function_to_execute
|
||||
- run_benchmark
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
|
||||
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
|
||||
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
|
||||
\ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
|
||||
$0\" \"$@\"\n"
|
||||
- sh
|
||||
- -ec
|
||||
- 'program_path=$(mktemp -d)
|
||||
|
||||
|
||||
printf "%s" "$0" > "$program_path/ephemeral_component.py"
|
||||
|
||||
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
|
||||
|
||||
'
|
||||
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
|
||||
\ *\n\ndef run_benchmark(\n prompts: list,\n llm_endpoint: str,\n\
|
||||
\ model_name: str,\n num_warmup: int = 2,\n num_iterations: int\
|
||||
\ = 3,\n) -> dict:\n \"\"\"\n Run all prompts through the LLM endpoint\
|
||||
\ and collect timing metrics.\n\n Returns aggregate metrics: p50/p95/mean\
|
||||
\ latency, tokens/sec, TTFT.\n \"\"\"\n import time\n import statistics\n\
|
||||
\ import httpx\n\n all_latencies: list[float] = []\n all_tps: list[float]\
|
||||
\ = []\n all_ttft: list[float] = []\n per_category: dict[str, list[float]]\
|
||||
\ = {}\n\n with httpx.Client(timeout=300.0) as client:\n # Warmup\n\
|
||||
\ for _ in range(num_warmup):\n try:\n \
|
||||
\ client.post(\n f\"{llm_endpoint}/v1/chat/completions\"\
|
||||
,\n json={\n \"model\": model_name,\n\
|
||||
\ \"messages\": [{\"role\": \"user\", \"content\"\
|
||||
: \"Hi\"}],\n \"max_tokens\": 8,\n \
|
||||
\ \"temperature\": 0,\n },\n \
|
||||
\ )\n except Exception:\n pass\n\n # Benchmark\n\
|
||||
\ for iteration in range(num_iterations):\n for prompt\
|
||||
\ in prompts:\n category = prompt.get(\"category\", \"unknown\"\
|
||||
)\n payload = {\n \"model\": model_name,\n\
|
||||
\ \"messages\": prompt[\"messages\"],\n \
|
||||
\ \"max_tokens\": prompt.get(\"max_tokens\", 256),\n \
|
||||
\ \"temperature\": 0,\n \"stream\": True,\n \
|
||||
\ }\n\n try:\n t_start = time.perf_counter()\n\
|
||||
\ first_token_time = None\n\n with\
|
||||
\ client.stream(\n \"POST\",\n \
|
||||
\ f\"{llm_endpoint}/v1/chat/completions\",\n \
|
||||
\ json=payload,\n ) as resp:\n \
|
||||
\ resp.raise_for_status()\n completion_tokens =\
|
||||
\ 0\n for line in resp.iter_lines():\n \
|
||||
\ if not line.startswith(\"data: \"):\n \
|
||||
\ continue\n chunk = line[6:]\n\
|
||||
\ if chunk == \"[DONE]\":\n \
|
||||
\ break\n if first_token_time is\
|
||||
\ None:\n first_token_time = time.perf_counter()\n\
|
||||
\ completion_tokens += 1\n\n \
|
||||
\ t_end = time.perf_counter()\n latency = t_end -\
|
||||
\ t_start\n ttft = (\n (first_token_time\
|
||||
\ - t_start)\n if first_token_time\n \
|
||||
\ else latency\n )\n tps\
|
||||
\ = (\n completion_tokens / latency if latency >\
|
||||
\ 0 else 0\n )\n\n all_latencies.append(latency)\n\
|
||||
\ all_tps.append(tps)\n all_ttft.append(ttft)\n\
|
||||
\ per_category.setdefault(category, []).append(latency)\n\
|
||||
\n except Exception as exc:\n # Record\
|
||||
\ failure but keep going\n all_latencies.append(-1)\n\
|
||||
\ all_tps.append(0)\n all_ttft.append(-1)\n\
|
||||
\n # Compute aggregates\n valid_latencies = [l for l in all_latencies\
|
||||
\ if l > 0]\n valid_tps = [t for t in all_tps if t > 0]\n valid_ttft\
|
||||
\ = [t for t in all_ttft if t > 0]\n\n def safe_stat(values, func):\n\
|
||||
\ return func(values) if values else 0\n\n metrics = {\n \
|
||||
\ \"total_requests\": len(all_latencies),\n \"successful_requests\"\
|
||||
: len(valid_latencies),\n \"failed_requests\": len(all_latencies)\
|
||||
\ - len(valid_latencies),\n # Latency\n \"latency_mean_s\"\
|
||||
: safe_stat(valid_latencies, statistics.mean),\n \"latency_p50_s\"\
|
||||
: safe_stat(\n valid_latencies,\n lambda v: statistics.median(v),\n\
|
||||
\ ),\n \"latency_p95_s\": safe_stat(\n valid_latencies,\n\
|
||||
\ lambda v: sorted(v)[int(len(v) * 0.95)] if v else 0,\n \
|
||||
\ ),\n # Throughput\n \"tokens_per_second_mean\": safe_stat(valid_tps,\
|
||||
\ statistics.mean),\n \"tokens_per_second_p50\": safe_stat(\n \
|
||||
\ valid_tps, lambda v: statistics.median(v)\n ),\n \
|
||||
\ # Time to first token\n \"ttft_mean_s\": safe_stat(valid_ttft,\
|
||||
\ statistics.mean),\n \"ttft_p50_s\": safe_stat(valid_ttft, lambda\
|
||||
\ v: statistics.median(v)),\n \"ttft_p95_s\": safe_stat(\n \
|
||||
\ valid_ttft,\n lambda v: sorted(v)[int(len(v) * 0.95)]\
|
||||
\ if v else 0,\n ),\n }\n\n # Per-category latency\n for\
|
||||
\ cat, lats in per_category.items():\n valid = [l for l in lats if\
|
||||
\ l > 0]\n if valid:\n metrics[f\"latency_mean_{cat}_s\"\
|
||||
] = statistics.mean(valid)\n\n return metrics\n\n"
|
||||
image: python:3.13-slim
|
||||
pipelineInfo:
|
||||
description: Benchmark vLLM with different tuning configurations. Logs latency,
|
||||
TPS, and TTFT to MLflow for A/B comparison.
|
||||
name: vllm-tuning-evaluation
|
||||
root:
|
||||
dag:
|
||||
tasks:
|
||||
build-prompt-suite:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-build-prompt-suite
|
||||
taskInfo:
|
||||
name: build-prompt-suite
|
||||
create-tuning-run:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-create-tuning-run
|
||||
inputs:
|
||||
parameters:
|
||||
experiment_name:
|
||||
runtimeValue:
|
||||
constant: vllm-tuning
|
||||
pipelinechannel--enable_chunked_prefill:
|
||||
componentInputParameter: enable_chunked_prefill
|
||||
pipelinechannel--enable_prefix_caching:
|
||||
componentInputParameter: enable_prefix_caching
|
||||
pipelinechannel--gpu_memory_utilization:
|
||||
componentInputParameter: gpu_memory_utilization
|
||||
pipelinechannel--llm_endpoint:
|
||||
componentInputParameter: llm_endpoint
|
||||
pipelinechannel--model_name:
|
||||
componentInputParameter: model_name
|
||||
pipelinechannel--ngram_prompt_lookup_max:
|
||||
componentInputParameter: ngram_prompt_lookup_max
|
||||
pipelinechannel--num_iterations:
|
||||
componentInputParameter: num_iterations
|
||||
pipelinechannel--num_speculative_tokens:
|
||||
componentInputParameter: num_speculative_tokens
|
||||
pipelinechannel--num_warmup:
|
||||
componentInputParameter: num_warmup
|
||||
pipelinechannel--run_label:
|
||||
componentInputParameter: run_label
|
||||
run_name:
|
||||
runtimeValue:
|
||||
constant: vllm-{{$.inputs.parameters['pipelinechannel--run_label']}}
|
||||
tuning_params:
|
||||
runtimeValue:
|
||||
constant:
|
||||
enable_chunked_prefill: '{{$.inputs.parameters[''pipelinechannel--enable_chunked_prefill'']}}'
|
||||
enable_prefix_caching: '{{$.inputs.parameters[''pipelinechannel--enable_prefix_caching'']}}'
|
||||
gpu_memory_utilization: '{{$.inputs.parameters[''pipelinechannel--gpu_memory_utilization'']}}'
|
||||
llm_endpoint: '{{$.inputs.parameters[''pipelinechannel--llm_endpoint'']}}'
|
||||
model_name: '{{$.inputs.parameters[''pipelinechannel--model_name'']}}'
|
||||
ngram_prompt_lookup_max: '{{$.inputs.parameters[''pipelinechannel--ngram_prompt_lookup_max'']}}'
|
||||
num_iterations: '{{$.inputs.parameters[''pipelinechannel--num_iterations'']}}'
|
||||
num_speculative_tokens: '{{$.inputs.parameters[''pipelinechannel--num_speculative_tokens'']}}'
|
||||
num_warmup: '{{$.inputs.parameters[''pipelinechannel--num_warmup'']}}'
|
||||
taskInfo:
|
||||
name: create-tuning-run
|
||||
log-benchmark-results:
|
||||
cachingOptions:
|
||||
enableCache: true
|
||||
componentRef:
|
||||
name: comp-log-benchmark-results
|
||||
dependentTasks:
|
||||
- create-tuning-run
|
||||
- run-benchmark
|
||||
inputs:
|
||||
parameters:
|
||||
metrics:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: Output
|
||||
producerTask: run-benchmark
|
||||
run_id:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: run_id
|
||||
producerTask: create-tuning-run
|
||||
taskInfo:
|
||||
name: log-benchmark-results
|
||||
run-benchmark:
|
||||
cachingOptions: {}
|
||||
componentRef:
|
||||
name: comp-run-benchmark
|
||||
dependentTasks:
|
||||
- build-prompt-suite
|
||||
inputs:
|
||||
parameters:
|
||||
llm_endpoint:
|
||||
componentInputParameter: llm_endpoint
|
||||
model_name:
|
||||
componentInputParameter: model_name
|
||||
num_iterations:
|
||||
componentInputParameter: num_iterations
|
||||
num_warmup:
|
||||
componentInputParameter: num_warmup
|
||||
prompts:
|
||||
taskOutputParameter:
|
||||
outputParameterKey: Output
|
||||
producerTask: build-prompt-suite
|
||||
taskInfo:
|
||||
name: run-benchmark
|
||||
inputDefinitions:
|
||||
parameters:
|
||||
enable_chunked_prefill:
|
||||
defaultValue: 'true'
|
||||
description: '"true" or "false"'
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
enable_prefix_caching:
|
||||
defaultValue: 'true'
|
||||
description: '"true" or "false"'
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
gpu_memory_utilization:
|
||||
defaultValue: '0.90'
|
||||
description: 0.0 - 1.0
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
llm_endpoint:
|
||||
defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
|
||||
description: vLLM inference endpoint URL
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
model_name:
|
||||
defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
|
||||
description: HF model identifier
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
ngram_prompt_lookup_max:
|
||||
defaultValue: '4'
|
||||
description: ngram window for spec decode (0 = off)
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
num_iterations:
|
||||
defaultValue: 3.0
|
||||
description: how many times to repeat the prompt suite
|
||||
isOptional: true
|
||||
parameterType: NUMBER_INTEGER
|
||||
num_speculative_tokens:
|
||||
defaultValue: '3'
|
||||
description: number of speculative tokens (0 = off)
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
num_warmup:
|
||||
defaultValue: 2.0
|
||||
description: warmup requests before timing
|
||||
isOptional: true
|
||||
parameterType: NUMBER_INTEGER
|
||||
run_label:
|
||||
defaultValue: baseline
|
||||
description: human-readable label (e.g. "apc-on-spec3")
|
||||
isOptional: true
|
||||
parameterType: STRING
|
||||
schemaVersion: 2.1.0
|
||||
sdkVersion: kfp-2.12.1
|
||||
Reference in New Issue
Block a user