# NOTE(review): reconstructed from a copy/paste-garbled dump (viewer gutter "|"
# lines removed, indentation restored to KFP compiler conventions). Runs of
# spaces inside the embedded Python/shell scalars were collapsed in the paste;
# they have been restored from Python/shell syntax — confirm against the
# original compiled rag_pipeline.yaml before deploying.
#
# Changelog text that accompanied the paste:
#   New: vllm_tuning_pipeline.py — A/B benchmark different vLLM configs, logs
#   latency/TPS/TTFT to MLflow (vllm-tuning experiment); vllm_tuning_pipeline.yaml
#   (compiled KFP YAML). Updated: voice_pipeline.py — per-step NamedTuple outputs
#   with latency tracking, new log_pipeline_metrics MLflow component;
#   voice_pipeline.yaml, tts_pipeline.yaml, rag_pipeline.yaml recompiled.
#
# PIPELINE DEFINITION
# Name: rag-query-pipeline
# Description: RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM. Logs per-step
#              latency to MLflow.
# Inputs:
#    collection_name: str [Default: 'knowledge_base']
#    query: str
components:
  comp-generate-embeddings:
    executorLabel: exec-generate-embeddings
    inputDefinitions:
      parameters:
        embeddings_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
          isOptional: true
          parameterType: STRING
        text:
          parameterType: STRING
    outputDefinitions:
      parameters:
        embedding:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-generate-response:
    executorLabel: exec-generate-response
    inputDefinitions:
      parameters:
        context:
          parameterType: LIST
        model:
          defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
          isOptional: true
          parameterType: STRING
        query:
          parameterType: STRING
        vllm_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        completion_tokens:
          parameterType: NUMBER_INTEGER
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
  comp-rerank-documents:
    executorLabel: exec-rerank-documents
    inputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        query:
          parameterType: STRING
        reranker_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-retrieve-context:
    executorLabel: exec-retrieve-context
    inputDefinitions:
      parameters:
        collection_name:
          defaultValue: knowledge_base
          isOptional: true
          parameterType: STRING
        embedding:
          parameterType: LIST
        milvus_host:
          defaultValue: milvus.ai-ml.svc.cluster.local
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 5.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
deploymentSpec:
  executors:
    exec-generate-embeddings:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - generate_embeddings
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
          \ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import *\n\
          \ndef generate_embeddings(\n\
          \    text: str,\n\
          \    embeddings_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings\"\n\
          ) -> NamedTuple(\"EmbedResult\", [(\"embedding\", list), (\"latency_s\", float)]):\n\
          \    \"\"\"Generate embeddings for RAG retrieval.\"\"\"\n\
          \    import time\n\
          \    import httpx\n\
          \    from collections import namedtuple\n\
          \n\
          \    start = time.perf_counter()\n\
          \    with httpx.Client(timeout=60.0) as client:\n\
          \        response = client.post(\n\
          \            f\"{embeddings_url}/embeddings\",\n\
          \            json={\"input\": text, \"model\": \"bge-small-en-v1.5\"}\n\
          \        )\n\
          \    result = response.json()\n\
          \    latency = time.perf_counter() - start\n\
          \n\
          \    EmbedResult = namedtuple(\"EmbedResult\", [\"embedding\", \"latency_s\"])\n\
          \    return EmbedResult(result[\"data\"][0][\"embedding\"], latency)\n\n"
        image: python:3.13-slim
    exec-generate-response:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - generate_response
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
          \ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import *\n\
          \ndef generate_response(\n\
          \    query: str,\n\
          \    context: list,\n\
          \    vllm_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm\",\n\
          \    model: str = \"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4\"\n\
          ) -> NamedTuple(\"LLMResult\", [(\"text\", str), (\"latency_s\", float), (\"completion_tokens\", int)]):\n\
          \    \"\"\"Generate response using vLLM.\"\"\"\n\
          \    import time\n\
          \    import httpx\n\
          \    from collections import namedtuple\n\
          \n\
          \    # Build context\n\
          \    if context:\n\
          \        context_text = \"\\n\\n\".join([doc[\"text\"] for doc in context])\n\
          \        user_content = f\"Context:\\n{context_text}\\n\\nQuestion: {query}\"\n\
          \    else:\n\
          \        user_content = query\n\
          \n\
          \    system_prompt = \"\"\"You are a helpful voice assistant.\n\
          Answer questions based on the provided context when available.\n\
          Keep responses concise and natural for speech synthesis.\"\"\"\n\
          \n\
          \    messages = [\n\
          \        {\"role\": \"system\", \"content\": system_prompt},\n\
          \        {\"role\": \"user\", \"content\": user_content}\n\
          \    ]\n\
          \n\
          \    start = time.perf_counter()\n\
          \    with httpx.Client(timeout=180.0) as client:\n\
          \        response = client.post(\n\
          \            f\"{vllm_url}/v1/chat/completions\",\n\
          \            json={\n\
          \                \"model\": model,\n\
          \                \"messages\": messages,\n\
          \                \"max_tokens\": 512,\n\
          \                \"temperature\": 0.7\n\
          \            }\n\
          \        )\n\
          \    result = response.json()\n\
          \    latency = time.perf_counter() - start\n\
          \n\
          \    text = result[\"choices\"][0][\"message\"][\"content\"]\n\
          \    usage = result.get(\"usage\", {})\n\
          \    completion_tokens = usage.get(\"completion_tokens\", len(text.split()))\n\
          \n\
          \    LLMResult = namedtuple(\"LLMResult\", [\"text\", \"latency_s\", \"completion_tokens\"])\n\
          \    return LLMResult(text, latency, completion_tokens)\n\n"
        image: python:3.13-slim
    exec-rerank-documents:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - rerank_documents
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
          \ python3 -m pip install --quiet --no-warn-script-location 'httpx' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import *\n\
          \ndef rerank_documents(\n\
          \    query: str,\n\
          \    documents: list,\n\
          \    reranker_url: str = \"http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker\",\n\
          \    top_k: int = 3\n\
          ) -> NamedTuple(\"RerankResult\", [(\"documents\", list), (\"latency_s\", float)]):\n\
          \    \"\"\"Rerank documents using BGE reranker.\"\"\"\n\
          \    import time\n\
          \    import httpx\n\
          \    from collections import namedtuple\n\
          \n\
          \    if not documents:\n\
          \        RerankResult = namedtuple(\"RerankResult\", [\"documents\", \"latency_s\"])\n\
          \        return RerankResult([], 0.0)\n\
          \n\
          \    start = time.perf_counter()\n\
          \    with httpx.Client(timeout=60.0) as client:\n\
          \        response = client.post(\n\
          \            f\"{reranker_url}/v1/rerank\",\n\
          \            json={\n\
          \                \"query\": query,\n\
          \                \"documents\": [doc[\"text\"] for doc in documents],\n\
          \                \"model\": \"bge-reranker-v2-m3\"\n\
          \            }\n\
          \        )\n\
          \    result = response.json()\n\
          \    latency = time.perf_counter() - start\n\
          \n\
          \    # Sort by rerank score\n\
          \    reranked = sorted(\n\
          \        zip(documents, result.get(\"scores\", [0] * len(documents))),\n\
          \        key=lambda x: x[1],\n\
          \        reverse=True\n\
          \    )[:top_k]\n\
          \n\
          \    RerankResult = namedtuple(\"RerankResult\", [\"documents\", \"latency_s\"])\n\
          \    return RerankResult([doc for doc, score in reranked], latency)\n\n"
        image: python:3.13-slim
    exec-retrieve-context:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - retrieve_context
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
          \ python3 -m pip install --quiet --no-warn-script-location 'pymilvus' &&\
          \ \"$0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)


          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import *\n\
          \ndef retrieve_context(\n\
          \    embedding: list,\n\
          \    milvus_host: str = \"milvus.ai-ml.svc.cluster.local\",\n\
          \    collection_name: str = \"knowledge_base\",\n\
          \    top_k: int = 5\n\
          ) -> NamedTuple(\"RetrieveResult\", [(\"documents\", list), (\"latency_s\", float)]):\n\
          \    \"\"\"Retrieve relevant documents from Milvus vector database.\"\"\"\n\
          \    import time\n\
          \    from pymilvus import connections, Collection, utility\n\
          \    from collections import namedtuple\n\
          \n\
          \    start = time.perf_counter()\n\
          \    connections.connect(host=milvus_host, port=19530)\n\
          \n\
          \    if not utility.has_collection(collection_name):\n\
          \        latency = time.perf_counter() - start\n\
          \        RetrieveResult = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n\
          \        return RetrieveResult([], latency)\n\
          \n\
          \    collection = Collection(collection_name)\n\
          \    collection.load()\n\
          \n\
          \    results = collection.search(\n\
          \        data=[embedding],\n\
          \        anns_field=\"embedding\",\n\
          \        param={\"metric_type\": \"COSINE\", \"params\": {\"nprobe\": 10}},\n\
          \        limit=top_k,\n\
          \        output_fields=[\"text\", \"source\"]\n\
          \    )\n\
          \    latency = time.perf_counter() - start\n\
          \n\
          \    documents = []\n\
          \    for hits in results:\n\
          \        for hit in hits:\n\
          \            documents.append({\n\
          \                \"text\": hit.entity.get(\"text\"),\n\
          \                \"source\": hit.entity.get(\"source\"),\n\
          \                \"score\": hit.distance\n\
          \            })\n\
          \n\
          \    RetrieveResult = namedtuple(\"RetrieveResult\", [\"documents\", \"latency_s\"])\n\
          \    return RetrieveResult(documents, latency)\n\n"
        image: python:3.13-slim
pipelineInfo:
  description: 'RAG query pipeline: Embed -> Retrieve -> Rerank -> LLM. Logs per-step
    latency to MLflow.'
  name: rag-query-pipeline
root:
  dag:
    tasks:
      generate-embeddings:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-embeddings
        inputs:
          parameters:
            text:
              componentInputParameter: query
        taskInfo:
          name: generate-embeddings
      generate-response:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-response
        dependentTasks:
        - rerank-documents
        inputs:
          parameters:
            context:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: rerank-documents
            query:
              componentInputParameter: query
        taskInfo:
          name: generate-response
      rerank-documents:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-rerank-documents
        dependentTasks:
        - retrieve-context
        inputs:
          parameters:
            documents:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: retrieve-context
            query:
              componentInputParameter: query
        taskInfo:
          name: rerank-documents
      retrieve-context:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-retrieve-context
        dependentTasks:
        - generate-embeddings
        inputs:
          parameters:
            collection_name:
              componentInputParameter: collection_name
            embedding:
              taskOutputParameter:
                outputParameterKey: embedding
                producerTask: generate-embeddings
        taskInfo:
          name: retrieve-context
  inputDefinitions:
    parameters:
      collection_name:
        defaultValue: knowledge_base
        description: Milvus collection name
        isOptional: true
        parameterType: STRING
      query:
        description: Text query
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1