# PIPELINE DEFINITION
# Name: voice-assistant-rag-pipeline
# Description: End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow.
# Inputs:
#    audio_b64: str
#    collection_name: str [Default: 'knowledge_base']
#
# NOTE(review): this is KFP-compiled IR YAML (schemaVersion 2.1.0). The source
# of truth is the @dsl.pipeline Python module it was compiled from; keep edits
# here in sync with a recompile.
components:
  comp-generate-embeddings:
    executorLabel: exec-generate-embeddings
    inputDefinitions:
      parameters:
        embeddings_url:
          # NOTE(review): default already ends in /embeddings and the component
          # code appends /embeddings again (final path /embeddings/embeddings).
          # All services share one host with distinct path prefixes, so this is
          # presumably gateway prefix routing -- confirm against the gateway.
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings
          isOptional: true
          parameterType: STRING
        text:
          parameterType: STRING
    outputDefinitions:
      parameters:
        embedding:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-generate-response:
    executorLabel: exec-generate-response
    inputDefinitions:
      parameters:
        context:
          parameterType: LIST
        model:
          defaultValue: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
          isOptional: true
          parameterType: STRING
        query:
          parameterType: STRING
        vllm_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        completion_tokens:
          parameterType: NUMBER_INTEGER
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
  comp-log-pipeline-metrics:
    executorLabel: exec-log-pipeline-metrics
    inputDefinitions:
      parameters:
        embed_latency:
          parameterType: NUMBER_DOUBLE
        experiment_name:
          defaultValue: voice-pipeline-metrics
          isOptional: true
          parameterType: STRING
        llm_completion_tokens:
          parameterType: NUMBER_INTEGER
        llm_latency:
          parameterType: NUMBER_DOUBLE
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        rerank_latency:
          parameterType: NUMBER_DOUBLE
        retrieve_latency:
          parameterType: NUMBER_DOUBLE
        run_name:
          defaultValue: voice-pipeline
          isOptional: true
          parameterType: STRING
        stt_audio_duration:
          parameterType: NUMBER_DOUBLE
        stt_latency:
          parameterType: NUMBER_DOUBLE
        tts_latency:
          parameterType: NUMBER_DOUBLE
    outputDefinitions:
      parameters:
        Output:
          parameterType: STRING
  comp-rerank-documents:
    executorLabel: exec-rerank-documents
    inputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        query:
          parameterType: STRING
        reranker_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker
          isOptional: true
          parameterType: STRING
        top_k:
          # KFP serializes integer defaults as floats; the runtime coerces back
          # to int per parameterType.
          defaultValue: 3.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-retrieve-context:
    executorLabel: exec-retrieve-context
    inputDefinitions:
      parameters:
        collection_name:
          defaultValue: knowledge_base
          isOptional: true
          parameterType: STRING
        embedding:
          parameterType: LIST
        milvus_host:
          defaultValue: milvus.ai-ml.svc.cluster.local
          isOptional: true
          parameterType: STRING
        top_k:
          defaultValue: 5.0
          isOptional: true
          parameterType: NUMBER_INTEGER
    outputDefinitions:
      parameters:
        documents:
          parameterType: LIST
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-synthesize-speech:
    executorLabel: exec-synthesize-speech
    inputDefinitions:
      parameters:
        text:
          parameterType: STRING
        tts_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        audio_b64:
          parameterType: STRING
        latency_s:
          parameterType: NUMBER_DOUBLE
  comp-transcribe-audio:
    executorLabel: exec-transcribe-audio
    inputDefinitions:
      parameters:
        audio_b64:
          parameterType: STRING
        whisper_url:
          defaultValue: http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        audio_duration_s:
          parameterType: NUMBER_DOUBLE
        latency_s:
          parameterType: NUMBER_DOUBLE
        text:
          parameterType: STRING
deploymentSpec:
  executors:
    exec-generate-embeddings:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - generate_embeddings
        command:
          - sh
          - -c
          # Standard KFP bootstrap: ensure pip, install kfp + runtime deps,
          # then exec the ephemeral component script.
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)
            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def generate_embeddings(
                text: str,
                embeddings_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/embeddings"
            ) -> NamedTuple("EmbedResult", [("embedding", list), ("latency_s", float)]):
                """Generate embeddings for RAG retrieval."""
                import time
                import httpx
                from collections import namedtuple

                start = time.perf_counter()
                with httpx.Client(timeout=60.0) as client:
                    response = client.post(
                        f"{embeddings_url}/embeddings",
                        json={"input": text, "model": "bge-small-en-v1.5"}
                    )
                    result = response.json()
                latency = time.perf_counter() - start

                EmbedResult = namedtuple("EmbedResult", ["embedding", "latency_s"])
                return EmbedResult(result["data"][0]["embedding"], latency)
        image: python:3.13-slim
    exec-generate-response:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - generate_response
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)
            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def generate_response(
                query: str,
                context: list,
                vllm_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm",
                model: str = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
            ) -> NamedTuple("LLMResult", [("text", str), ("latency_s", float), ("completion_tokens", int)]):
                """Generate response using vLLM."""
                import time
                import httpx
                from collections import namedtuple

                # Build context
                if context:
                    context_text = "\n\n".join([doc["text"] for doc in context])
                    user_content = f"Context:\n{context_text}\n\nQuestion: {query}"
                else:
                    user_content = query

                system_prompt = """You are a helpful voice assistant.
            Answer questions based on the provided context when available.
            Keep responses concise and natural for speech synthesis."""

                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content}
                ]

                start = time.perf_counter()
                with httpx.Client(timeout=180.0) as client:
                    response = client.post(
                        f"{vllm_url}/v1/chat/completions",
                        json={
                            "model": model,
                            "messages": messages,
                            "max_tokens": 512,
                            "temperature": 0.7
                        }
                    )
                    result = response.json()
                latency = time.perf_counter() - start

                text = result["choices"][0]["message"]["content"]
                usage = result.get("usage", {})
                completion_tokens = usage.get("completion_tokens", len(text.split()))

                LLMResult = namedtuple("LLMResult", ["text", "latency_s", "completion_tokens"])
                return LLMResult(text, latency, completion_tokens)
        image: python:3.13-slim
    exec-log-pipeline-metrics:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - log_pipeline_metrics
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'mlflow>=2.10.0' 'boto3' 'psycopg2-binary' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)
            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def log_pipeline_metrics(
                stt_latency: float,
                stt_audio_duration: float,
                embed_latency: float,
                retrieve_latency: float,
                rerank_latency: float,
                llm_latency: float,
                llm_completion_tokens: int,
                tts_latency: float,
                experiment_name: str = "voice-pipeline-metrics",
                run_name: str = "voice-pipeline",
                mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
            ) -> str:
                """Log per-step latency metrics to MLflow for the full voice pipeline."""
                import os
                import mlflow
                from mlflow.tracking import MlflowClient

                mlflow.set_tracking_uri(mlflow_tracking_uri)
                client = MlflowClient()

                exp = client.get_experiment_by_name(experiment_name)
                experiment_id = (
                    exp.experiment_id
                    if exp
                    else client.create_experiment(
                        name=experiment_name,
                        artifact_location=f"/mlflow/artifacts/{experiment_name}",
                    )
                )

                run = mlflow.start_run(
                    experiment_id=experiment_id,
                    run_name=run_name,
                    tags={
                        "pipeline.type": "voice-assistant",
                        "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
                    },
                )

                total_latency = (
                    stt_latency + embed_latency + retrieve_latency
                    + rerank_latency + llm_latency + tts_latency
                )
                stt_rtf = stt_latency / stt_audio_duration if stt_audio_duration > 0 else 0
                llm_tps = llm_completion_tokens / llm_latency if llm_latency > 0 else 0

                mlflow.log_metrics({
                    "stt_latency_s": stt_latency,
                    "stt_audio_duration_s": stt_audio_duration,
                    "stt_realtime_factor": stt_rtf,
                    "embed_latency_s": embed_latency,
                    "retrieve_latency_s": retrieve_latency,
                    "rerank_latency_s": rerank_latency,
                    "llm_latency_s": llm_latency,
                    "llm_completion_tokens": llm_completion_tokens,
                    "llm_tokens_per_second": llm_tps,
                    "tts_latency_s": tts_latency,
                    "total_pipeline_latency_s": total_latency,
                })
                mlflow.end_run()
                return run.info.run_id
        image: python:3.13-slim
    exec-rerank-documents:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - rerank_documents
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)
            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def rerank_documents(
                query: str,
                documents: list,
                reranker_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/reranker",
                top_k: int = 3
            ) -> NamedTuple("RerankResult", [("documents", list), ("latency_s", float)]):
                """Rerank documents using BGE reranker."""
                import time
                import httpx
                from collections import namedtuple

                if not documents:
                    RerankResult = namedtuple("RerankResult", ["documents", "latency_s"])
                    return RerankResult([], 0.0)

                start = time.perf_counter()
                with httpx.Client(timeout=60.0) as client:
                    response = client.post(
                        f"{reranker_url}/v1/rerank",
                        json={
                            "query": query,
                            "documents": [doc["text"] for doc in documents],
                            "model": "bge-reranker-v2-m3"
                        }
                    )
                    result = response.json()
                latency = time.perf_counter() - start

                # Sort by rerank score
                reranked = sorted(
                    zip(documents, result.get("scores", [0] * len(documents))),
                    key=lambda x: x[1],
                    reverse=True
                )[:top_k]

                RerankResult = namedtuple("RerankResult", ["documents", "latency_s"])
                return RerankResult([doc for doc, score in reranked], latency)
        image: python:3.13-slim
    exec-retrieve-context:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - retrieve_context
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'pymilvus' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)
            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def retrieve_context(
                embedding: list,
                milvus_host: str = "milvus.ai-ml.svc.cluster.local",
                collection_name: str = "knowledge_base",
                top_k: int = 5
            ) -> NamedTuple("RetrieveResult", [("documents", list), ("latency_s", float)]):
                """Retrieve relevant documents from Milvus vector database."""
                import time
                from pymilvus import connections, Collection, utility
                from collections import namedtuple

                start = time.perf_counter()
                connections.connect(host=milvus_host, port=19530)

                if not utility.has_collection(collection_name):
                    latency = time.perf_counter() - start
                    RetrieveResult = namedtuple("RetrieveResult", ["documents", "latency_s"])
                    return RetrieveResult([], latency)

                collection = Collection(collection_name)
                collection.load()

                results = collection.search(
                    data=[embedding],
                    anns_field="embedding",
                    param={"metric_type": "COSINE", "params": {"nprobe": 10}},
                    limit=top_k,
                    output_fields=["text", "source"]
                )
                latency = time.perf_counter() - start

                documents = []
                for hits in results:
                    for hit in hits:
                        documents.append({
                            "text": hit.entity.get("text"),
                            "source": hit.entity.get("source"),
                            "score": hit.distance
                        })

                RetrieveResult = namedtuple("RetrieveResult", ["documents", "latency_s"])
                return RetrieveResult(documents, latency)
        image: python:3.13-slim
    exec-synthesize-speech:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - synthesize_speech
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)
            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def synthesize_speech(
                text: str,
                tts_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts"
            ) -> NamedTuple("TTSResult", [("audio_b64", str), ("latency_s", float)]):
                """Convert text to speech using TTS service."""
                import base64
                import time
                import httpx
                from collections import namedtuple

                start = time.perf_counter()
                with httpx.Client(timeout=120.0) as client:
                    response = client.post(
                        f"{tts_url}/v1/audio/speech",
                        json={
                            "input": text,
                            "voice": "en_US-lessac-high",
                            "response_format": "wav"
                        }
                    )
                    audio_b64 = base64.b64encode(response.content).decode("utf-8")
                latency = time.perf_counter() - start

                TTSResult = namedtuple("TTSResult", ["audio_b64", "latency_s"])
                return TTSResult(audio_b64, latency)
        image: python:3.13-slim
    exec-transcribe-audio:
      container:
        args:
          - --executor_input
          - '{{$}}'
          - --function_to_execute
          - transcribe_audio
        command:
          - sh
          - -c
          - |

            if ! [ -x "$(command -v pip)" ]; then
                python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
            fi

            PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'kfp==2.12.1' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<"3.9"' && python3 -m pip install --quiet --no-warn-script-location 'httpx' && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp -d)
            printf "%s" "$0" > "$program_path/ephemeral_component.py"
            _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
          - |

            import kfp
            from kfp import dsl
            from kfp.dsl import *
            from typing import *

            def transcribe_audio(
                audio_b64: str,
                whisper_url: str = "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/whisper"
            ) -> NamedTuple("STTResult", [("text", str), ("latency_s", float), ("audio_duration_s", float)]):
                """Transcribe audio using Whisper STT service."""
                import base64
                import time
                import httpx
                from collections import namedtuple

                audio_bytes = base64.b64decode(audio_b64)

                start = time.perf_counter()
                with httpx.Client(timeout=120.0) as client:
                    response = client.post(
                        f"{whisper_url}/v1/audio/transcriptions",
                        files={"file": ("audio.wav", audio_bytes, "audio/wav")},
                        data={"model": "whisper-large-v3", "language": "en"}
                    )
                    result = response.json()
                latency = time.perf_counter() - start

                text = result.get("text", "")
                # Estimate audio duration from WAV header (16-bit PCM, 16kHz)
                audio_duration = max(len(audio_bytes) / (16000 * 2), 0.1)

                STTResult = namedtuple("STTResult", ["text", "latency_s", "audio_duration_s"])
                return STTResult(text, latency, audio_duration)
        image: python:3.13-slim
pipelineInfo:
  description: 'End-to-end voice assistant with RAG: STT -> Embeddings -> Milvus
    -> Rerank -> LLM -> TTS. Logs per-step latency to MLflow.'
  name: voice-assistant-rag-pipeline
root:
  dag:
    tasks:
      generate-embeddings:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-embeddings
        dependentTasks:
          - transcribe-audio
        inputs:
          parameters:
            text:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: generate-embeddings
      generate-response:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-generate-response
        dependentTasks:
          - rerank-documents
          - transcribe-audio
        inputs:
          parameters:
            context:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: rerank-documents
            query:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: generate-response
      log-pipeline-metrics:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-log-pipeline-metrics
        dependentTasks:
          - generate-embeddings
          - generate-response
          - rerank-documents
          - retrieve-context
          - synthesize-speech
          - transcribe-audio
        inputs:
          parameters:
            embed_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: generate-embeddings
            llm_completion_tokens:
              taskOutputParameter:
                outputParameterKey: completion_tokens
                producerTask: generate-response
            llm_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: generate-response
            rerank_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: rerank-documents
            retrieve_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: retrieve-context
            stt_audio_duration:
              taskOutputParameter:
                outputParameterKey: audio_duration_s
                producerTask: transcribe-audio
            stt_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: transcribe-audio
            tts_latency:
              taskOutputParameter:
                outputParameterKey: latency_s
                producerTask: synthesize-speech
        taskInfo:
          name: log-pipeline-metrics
      rerank-documents:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-rerank-documents
        dependentTasks:
          - retrieve-context
          - transcribe-audio
        inputs:
          parameters:
            documents:
              taskOutputParameter:
                outputParameterKey: documents
                producerTask: retrieve-context
            query:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: transcribe-audio
        taskInfo:
          name: rerank-documents
      retrieve-context:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-retrieve-context
        dependentTasks:
          - generate-embeddings
        inputs:
          parameters:
            collection_name:
              componentInputParameter: collection_name
            embedding:
              taskOutputParameter:
                outputParameterKey: embedding
                producerTask: generate-embeddings
        taskInfo:
          name: retrieve-context
      synthesize-speech:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-synthesize-speech
        dependentTasks:
          - generate-response
        inputs:
          parameters:
            text:
              taskOutputParameter:
                outputParameterKey: text
                producerTask: generate-response
        taskInfo:
          name: synthesize-speech
      transcribe-audio:
        # Empty cachingOptions: caching disabled for the entry step, so fresh
        # audio inputs are always transcribed.
        cachingOptions: {}
        componentRef:
          name: comp-transcribe-audio
        inputs:
          parameters:
            audio_b64:
              componentInputParameter: audio_b64
        taskInfo:
          name: transcribe-audio
  inputDefinitions:
    parameters:
      audio_b64:
        description: Base64-encoded audio file
        parameterType: STRING
      collection_name:
        defaultValue: knowledge_base
        description: Milvus collection for RAG
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1