# PIPELINE DEFINITION
# Name: dvd-video-transcription
# Description: Extract audio from a DVD or video file, transcribe it via Whisper STT, and produce a timestamped transcript (SRT/VTT/TXT).
# Inputs:
#    chunk_duration_s: int [Default: 300.0]
#    language: str [Default: 'en']
#    mlflow_tracking_uri: str [Default: 'http://mlflow.mlflow.svc.cluster.local:80']
#    output_format: str [Default: 'srt']
#    source_path: str [Default: '/data/dvd/movie.mkv']
#    whisper_url: str [Default: 'http://ai-inference-serve-svc.kuberay.svc.cluster.local:8000/whisper']
#
# Task order (see root.dag.tasks):
#   extract-audio -> chunk-audio -> transcribe-chunks
#   transcribe-chunks -> format-transcript
#   transcribe-chunks -> log-transcription-metrics
#
# NOTE(review): each executor below has its own container spec, and the tasks
# hand file paths under /tmp to one another as STRING/LIST parameters. No
# volume mount is visible in this spec -- confirm that /tmp/dvd_audio and
# /tmp/dvd_chunks live on storage shared between tasks; otherwise downstream
# tasks (and cache hits, since enableCache is true everywhere) will receive
# paths to files that do not exist in their container.

# Component interfaces: typed inputs/outputs for each executor.
components:
  # Splits the extracted WAV into fixed-duration chunk files.
  comp-chunk-audio:
    executorLabel: exec-chunk-audio
    inputDefinitions:
      parameters:
        chunk_duration_s:
          defaultValue: 300.0
          description: Seconds per chunk (default 5 minutes).
          isOptional: true
          parameterType: NUMBER_INTEGER
        wav_path:
          description: Path to the mono 16 kHz WAV.
          parameterType: STRING
    outputDefinitions:
      parameters:
        chunk_paths:
          parameterType: LIST
        num_chunks:
          parameterType: NUMBER_INTEGER
  # Runs ffmpeg to produce a 16-bit PCM WAV and reports its duration.
  comp-extract-audio:
    executorLabel: exec-extract-audio
    inputDefinitions:
      parameters:
        mono:
          defaultValue: true
          description: Down-mix to mono (Whisper expects single-channel).
          isOptional: true
          parameterType: BOOLEAN
        sample_rate:
          defaultValue: 16000.0
          description: Target sample rate (16 kHz is optimal for Whisper).
          isOptional: true
          parameterType: NUMBER_INTEGER
        source_path:
          description: 'Path to DVD ISO, VOB, MKV, MP4, or any ffmpeg-supported file. Can also be a /dev/sr0 device.'
          parameterType: STRING
    outputDefinitions:
      parameters:
        duration_s:
          parameterType: NUMBER_DOUBLE
        wav_path:
          parameterType: STRING
  # Renders the segments as SRT/VTT (or the full text as TXT) and writes a file.
  comp-format-transcript:
    executorLabel: exec-format-transcript
    inputDefinitions:
      parameters:
        full_text:
          description: Full concatenated text.
          parameterType: STRING
        output_format:
          defaultValue: srt
          description: "'srt', 'vtt', or 'txt'."
          isOptional: true
          parameterType: STRING
        segments:
          description: List of segment dicts with start/end/text.
          parameterType: LIST
        total_duration_s:
          description: Total audio duration in seconds.
          parameterType: NUMBER_DOUBLE
    outputDefinitions:
      parameters:
        output_path:
          parameterType: STRING
        transcript:
          parameterType: STRING
  # Records run parameters and metrics in MLflow; has no outputs.
  comp-log-transcription-metrics:
    executorLabel: exec-log-transcription-metrics
    inputDefinitions:
      parameters:
        experiment_name:
          defaultValue: dvd-transcription
          isOptional: true
          parameterType: STRING
        full_text:
          parameterType: STRING
        mlflow_tracking_uri:
          defaultValue: http://mlflow.mlflow.svc.cluster.local:80
          isOptional: true
          parameterType: STRING
        source_path:
          parameterType: STRING
        total_duration_s:
          parameterType: NUMBER_DOUBLE
  # Posts each chunk to the Whisper endpoint and merges the results.
  comp-transcribe-chunks:
    executorLabel: exec-transcribe-chunks
    inputDefinitions:
      parameters:
        chunk_paths:
          description: List of WAV file paths to transcribe.
          parameterType: LIST
        language:
          defaultValue: en
          description: Language code for Whisper (None for auto-detect).
          isOptional: true
          parameterType: STRING
        response_format:
          defaultValue: verbose_json
          description: "'json', 'verbose_json', or 'text'."
          isOptional: true
          parameterType: STRING
        whisper_url:
          defaultValue: http://ai-inference-serve-svc.kuberay.svc.cluster.local:8000/whisper
          description: In-cluster Whisper endpoint URL.
          isOptional: true
          parameterType: STRING
    outputDefinitions:
      parameters:
        full_text:
          parameterType: STRING
        segments:
          parameterType: LIST
        total_duration_s:
          parameterType: NUMBER_DOUBLE

# Executor implementations. Every executor uses the same three-part command:
#   1. an `sh -c` script that makes sure pip exists, installs kfp (plus any
#      extra packages) and then execs the remaining arguments;
#   2. an `sh -ec` script that writes its $0 (the Python source that follows)
#      to a temp file and runs it through kfp.dsl.executor_main;
#   3. the Python source of the component function.
deploymentSpec:
  executors:
    exec-chunk-audio:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - chunk_audio
        command:
        - sh
        - -c
        - |
          if ! [ -x "$(command -v pip)" ]; then
              python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet \
              --no-warn-script-location 'kfp==2.12.1' '--no-deps' \
              'typing-extensions>=3.7.4,<5; python_version<"3.9"' \
              && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"
          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main \
              --component_module_path "$program_path/ephemeral_component.py" \
              "$@"
        - |
          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def chunk_audio(
              wav_path: str,
              chunk_duration_s: int = 300,
          ) -> NamedTuple("ChunkOutput", [("chunk_paths", list), ("num_chunks", int)]):
              """Split a WAV file into fixed-duration chunks.

              Args:
                  wav_path:          Path to the mono 16 kHz WAV.
                  chunk_duration_s:  Seconds per chunk (default 5 minutes).

              Returns:
                  ChunkOutput(chunk_paths, num_chunks): sorted chunk file paths
                  under /tmp/dvd_chunks and how many there are.

              Raises:
                  RuntimeError: if ffmpeg exits with a non-zero status.
                  subprocess.CalledProcessError: if installing ffmpeg fails.
              """
              import os
              import subprocess
              import glob

              # ffmpeg is not part of the base image; install it at run time.
              subprocess.run(
                  ["apt-get", "update", "-qq"],
                  check=True, capture_output=True,
              )
              subprocess.run(
                  ["apt-get", "install", "-y", "-qq", "ffmpeg"],
                  check=True, capture_output=True,
              )

              out_dir = "/tmp/dvd_chunks"
              os.makedirs(out_dir, exist_ok=True)
              pattern = os.path.join(out_dir, "chunk_%04d.wav")

              # Stream-copy into numbered segments; the zero-padded index keeps
              # lexical sort order equal to chronological order.
              cmd = [
                  "ffmpeg", "-y",
                  "-i", wav_path,
                  "-f", "segment",
                  "-segment_time", str(chunk_duration_s),
                  "-c", "copy",
                  pattern,
              ]
              print(f"Chunking: {' '.join(cmd)}")
              result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
              if result.returncode != 0:
                  raise RuntimeError(f"ffmpeg chunk failed:\n{result.stderr}")

              chunks = sorted(glob.glob(os.path.join(out_dir, "chunk_*.wav")))
              print(f"Created {len(chunks)} chunks of ~{chunk_duration_s}s each")

              from collections import namedtuple
              ChunkOutput = namedtuple("ChunkOutput", ["chunk_paths", "num_chunks"])
              return ChunkOutput(chunk_paths=chunks, num_chunks=len(chunks))
        image: python:3.13-slim
    exec-extract-audio:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - extract_audio
        command:
        - sh
        - -c
        - |
          if ! [ -x "$(command -v pip)" ]; then
              python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet \
              --no-warn-script-location 'kfp==2.12.1' '--no-deps' \
              'typing-extensions>=3.7.4,<5; python_version<"3.9"' \
              && python3 -m pip install --quiet --no-warn-script-location 'requests' \
              && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"
          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main \
              --component_module_path "$program_path/ephemeral_component.py" \
              "$@"
        - |
          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def extract_audio(
              source_path: str,
              sample_rate: int = 16000,
              mono: bool = True,
          ) -> NamedTuple("AudioOutput", [("wav_path", str), ("duration_s", float)]):
              """Extract audio from a video/DVD file using ffmpeg.

              Args:
                  source_path:  Path to DVD ISO, VOB, MKV, MP4, or any
                                ffmpeg-supported file. Can also be a /dev/sr0 device.
                  sample_rate:  Target sample rate (16 kHz is optimal for Whisper).
                  mono:         Down-mix to mono (Whisper expects single-channel).

              Returns:
                  AudioOutput(wav_path, duration_s): path of the extracted WAV
                  (/tmp/dvd_audio/full_audio.wav) and its duration as reported
                  by ffprobe.

              Raises:
                  RuntimeError: if ffmpeg or ffprobe exits with a non-zero status.
                  subprocess.CalledProcessError: if installing ffmpeg fails.
              """
              import os
              import subprocess
              import json

              # Install ffmpeg inside the container
              subprocess.run(
                  ["apt-get", "update", "-qq"],
                  check=True, capture_output=True,
              )
              subprocess.run(
                  ["apt-get", "install", "-y", "-qq", "ffmpeg"],
                  check=True, capture_output=True,
              )

              out_dir = "/tmp/dvd_audio"
              os.makedirs(out_dir, exist_ok=True)
              wav_path = os.path.join(out_dir, "full_audio.wav")

              # Build ffmpeg command
              cmd = ["ffmpeg", "-y"]

              # Handle DVD ISOs – mount via concat demuxer or direct input
              if source_path.lower().endswith(".iso"):
                  # For ISOs, ffmpeg can read via dvdread protocol
                  # NOTE(review): confirm the ffmpeg build installed above
                  # accepts a dvd:// input; not verifiable from this file.
                  cmd += ["-i", f"dvd://{source_path}"]
              else:
                  cmd += ["-i", source_path]

              # Audio extraction options
              cmd += [
                  "-vn",                          # drop video
                  "-acodec", "pcm_s16le",         # 16-bit WAV
                  "-ar", str(sample_rate),        # resample
              ]
              if mono:
                  cmd += ["-ac", "1"]             # down-mix to mono

              cmd += [wav_path]

              print(f"Running: {' '.join(cmd)}")
              result = subprocess.run(cmd, capture_output=True, text=True, timeout=7200)
              if result.returncode != 0:
                  raise RuntimeError(f"ffmpeg failed:\n{result.stderr}")

              # Get duration via ffprobe
              probe = subprocess.run(
                  [
                      "ffprobe", "-v", "quiet",
                      "-show_entries", "format=duration",
                      "-of", "json", wav_path,
                  ],
                  capture_output=True, text=True,
              )
              # Fail with a clear message instead of an opaque JSON/KeyError
              # when ffprobe itself could not read the file.
              if probe.returncode != 0:
                  raise RuntimeError(
                      f"ffprobe failed (exit code {probe.returncode}) for {wav_path}"
                  )
              duration_s = float(json.loads(probe.stdout)["format"]["duration"])
              file_size_mb = os.path.getsize(wav_path) / (1024 * 1024)
              print(f"Extracted: {wav_path} ({file_size_mb:.1f} MB, {duration_s:.1f}s)")

              from collections import namedtuple
              AudioOutput = namedtuple("AudioOutput", ["wav_path", "duration_s"])
              return AudioOutput(wav_path=wav_path, duration_s=duration_s)
        image: python:3.13-slim
    exec-format-transcript:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - format_transcript
        command:
        - sh
        - -c
        - |
          if ! [ -x "$(command -v pip)" ]; then
              python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet \
              --no-warn-script-location 'kfp==2.12.1' '--no-deps' \
              'typing-extensions>=3.7.4,<5; python_version<"3.9"' \
              && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"
          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main \
              --component_module_path "$program_path/ephemeral_component.py" \
              "$@"
        - |
          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def format_transcript(
              segments: list,
              full_text: str,
              total_duration_s: float,
              output_format: str = "srt",
          ) -> NamedTuple("FormattedOutput", [("transcript", str), ("output_path", str)]):
              """Format the transcript as SRT, VTT, or plain text.

              Args:
                  segments:          List of segment dicts with start/end/text.
                  full_text:         Full concatenated text.
                  total_duration_s:  Total audio duration in seconds.
                  output_format:     'srt', 'vtt', or 'txt' (case-insensitive).
                                     Any other value falls back to plain text.

              Returns:
                  FormattedOutput(transcript, output_path): the rendered
                  transcript and the file it was written to under
                  /tmp/dvd_transcript.
              """
              import os

              def _fmt_ts(seconds: float, ms_sep: str) -> str:
                  """Render seconds as HH:MM:SS<ms_sep>mmm."""
                  # Work in whole milliseconds: int((seconds % 1) * 1000) loses a
                  # millisecond to float error (1.001 s would render as ...,000).
                  # Negative inputs are clamped to zero.
                  total_ms = max(0, int(round(seconds * 1000)))
                  h, rem = divmod(total_ms, 3_600_000)
                  m, rem = divmod(rem, 60_000)
                  s, ms = divmod(rem, 1000)
                  return f"{h:02d}:{m:02d}:{s:02d}{ms_sep}{ms:03d}"

              # Accept "SRT", " vtt ", etc. instead of silently writing plain text.
              fmt = output_format.strip().lower()

              out_dir = "/tmp/dvd_transcript"
              os.makedirs(out_dir, exist_ok=True)

              if fmt == "srt":
                  # SRT: numbered cues, comma before the milliseconds.
                  lines = []
                  for i, seg in enumerate(segments, 1):
                      start_ts = _fmt_ts(seg["start"], ",")
                      end_ts = _fmt_ts(seg["end"], ",")
                      text = seg.get("text", "").strip()
                      lines.append(f"{i}\n{start_ts} --> {end_ts}\n{text}\n")
                  transcript = "\n".join(lines)
                  ext = "srt"

              elif fmt == "vtt":
                  # WebVTT: header line, unnumbered cues, dot before the milliseconds.
                  lines = ["WEBVTT\n"]
                  for seg in segments:
                      start_ts = _fmt_ts(seg["start"], ".")
                      end_ts = _fmt_ts(seg["end"], ".")
                      text = seg.get("text", "").strip()
                      lines.append(f"{start_ts} --> {end_ts}\n{text}\n")
                  transcript = "\n".join(lines)
                  ext = "vtt"

              else:  # txt
                  transcript = full_text
                  ext = "txt"

              out_path = os.path.join(out_dir, f"transcript.{ext}")
              with open(out_path, "w", encoding="utf-8") as f:
                  f.write(transcript)

              h = int(total_duration_s // 3600)
              m = int((total_duration_s % 3600) // 60)
              print(f"Transcript saved: {out_path}")
              print(f"Audio duration: {h}h {m}m, Segments: {len(segments)}")
              # Report the format actually written (differs from the request only
              # when an unknown format fell back to plain text).
              print(f"Format: {ext.upper()}, Size: {len(transcript)} chars")

              from collections import namedtuple
              FormattedOutput = namedtuple("FormattedOutput", ["transcript", "output_path"])
              return FormattedOutput(transcript=transcript, output_path=out_path)
        image: python:3.13-slim
    exec-log-transcription-metrics:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - log_transcription_metrics
        command:
        - sh
        - -c
        - |
          if ! [ -x "$(command -v pip)" ]; then
              python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet \
              --no-warn-script-location 'kfp==2.12.1' '--no-deps' \
              'typing-extensions>=3.7.4,<5; python_version<"3.9"' \
              && python3 -m pip install --quiet --no-warn-script-location 'mlflow==2.22.0' \
              && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"
          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main \
              --component_module_path "$program_path/ephemeral_component.py" \
              "$@"
        - |
          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def log_transcription_metrics(
              total_duration_s: float,
              full_text: str,
              source_path: str,
              mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
              experiment_name: str = "dvd-transcription",
          ):
              """Log transcription run metrics to MLflow.

              Args:
                  total_duration_s:     Logged as the audio_duration_s metric.
                  full_text:            Its length is logged as transcript_chars.
                  source_path:          Logged as a param; its last path component
                                        names the run.
                  mlflow_tracking_uri:  MLflow tracking server URI.
                  experiment_name:      MLflow experiment to log under.
              """
              import mlflow

              mlflow.set_tracking_uri(mlflow_tracking_uri)
              mlflow.set_experiment(experiment_name)

              with mlflow.start_run(run_name=f"transcribe-{source_path.split('/')[-1]}"):
                  mlflow.log_params({
                      "source_path": source_path,
                      "model": "whisper-large-v3",
                  })
                  mlflow.log_metrics({
                      "audio_duration_s": total_duration_s,
                      "transcript_chars": float(len(full_text)),
                  })
        image: python:3.13-slim
    exec-transcribe-chunks:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - transcribe_chunks
        command:
        - sh
        - -c
        - |
          if ! [ -x "$(command -v pip)" ]; then
              python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip
          fi

          PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet \
              --no-warn-script-location 'kfp==2.12.1' '--no-deps' \
              'typing-extensions>=3.7.4,<5; python_version<"3.9"' \
              && python3 -m pip install --quiet --no-warn-script-location 'requests' \
              && "$0" "$@"
        - sh
        - -ec
        - |
          program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"
          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main \
              --component_module_path "$program_path/ephemeral_component.py" \
              "$@"
        - |
          import kfp
          from kfp import dsl
          from kfp.dsl import *
          from typing import *

          def transcribe_chunks(
              chunk_paths: list,
              whisper_url: str = "http://ai-inference-serve-svc.kuberay.svc.cluster.local:8000/whisper",
              language: str = "en",
              response_format: str = "verbose_json",
          ) -> NamedTuple("TranscriptOutput", [("segments", list), ("full_text", str), ("total_duration_s", float)]):
              """Send each audio chunk to the Whisper STT endpoint.

              Args:
                  chunk_paths:      List of WAV file paths to transcribe.
                  whisper_url:      In-cluster Whisper endpoint URL.
                  language:         Language code for Whisper (None for auto-detect).
                  response_format:  'json', 'verbose_json', or 'text'.

              Returns:
                  TranscriptOutput(segments, full_text, total_duration_s):
                  segments with timestamps shifted to be absolute within the
                  full audio, the newline-joined chunk texts, and the summed
                  chunk durations.

              Raises:
                  requests.HTTPError: if the endpoint returns an error status.
              """
              import base64
              import time
              import wave
              import requests

              def _wav_duration_s(wav_file: str) -> float:
                  """Return the duration from the WAV header, or 0.0 if unreadable."""
                  try:
                      with wave.open(wav_file, "rb") as wf:
                          rate = wf.getframerate()
                          return wf.getnframes() / float(rate) if rate else 0.0
                  except (wave.Error, EOFError, OSError):
                      return 0.0

              all_segments = []
              full_text_parts = []
              total_audio_duration = 0.0
              time_offset = 0.0  # cumulative offset for absolute timestamps

              for i, path in enumerate(chunk_paths):
                  print(f"Transcribing chunk {i + 1}/{len(chunk_paths)}: {path}")

                  # Read and base64-encode the chunk
                  with open(path, "rb") as f:
                      audio_b64 = base64.b64encode(f.read()).decode("utf-8")

                  payload = {
                      "audio": audio_b64,
                      "audio_format": "wav",
                      "language": language,
                      "task": "transcribe",
                      "response_format": response_format,
                      "word_timestamps": False,
                  }

                  start = time.time()
                  resp = requests.post(whisper_url, json=payload, timeout=600)
                  elapsed = time.time() - start
                  resp.raise_for_status()
                  data = resp.json()

                  # Prefer the duration reported by the service; if it is missing
                  # or zero, measure the chunk itself so that time_offset still
                  # advances and later chunks do not overlap earlier ones.
                  chunk_duration = float(data.get("duration") or _wav_duration_s(path))
                  total_audio_duration += chunk_duration

                  if "segments" in data:
                      for seg in data["segments"]:
                          # Offset timestamps to be absolute within the full audio
                          seg["start"] += time_offset
                          seg["end"] += time_offset
                          all_segments.append(seg)

                  chunk_text = data.get("text", "")
                  full_text_parts.append(chunk_text)
                  time_offset += chunk_duration
                  # Real-time factor: processing seconds per second of audio.
                  rtf = elapsed / chunk_duration if chunk_duration > 0 else 0
                  print(f"  → {len(chunk_text)} chars, {chunk_duration:.1f}s audio in {elapsed:.1f}s (RTF={rtf:.2f})")

              full_text = "\n".join(full_text_parts)
              print(f"\nTotal: {len(all_segments)} segments, {total_audio_duration:.1f}s audio")
              print(f"Transcript length: {len(full_text)} characters")

              from collections import namedtuple
              TranscriptOutput = namedtuple("TranscriptOutput", ["segments", "full_text", "total_duration_s"])
              return TranscriptOutput(
                  segments=all_segments,
                  full_text=full_text.strip(),
                  total_duration_s=total_audio_duration,
              )
        image: python:3.13-slim

pipelineInfo:
  description: >-
    Extract audio from a DVD or video file, transcribe it via Whisper STT,
    and produce a timestamped transcript (SRT/VTT/TXT).
  name: dvd-video-transcription

# Task graph: wires pipeline inputs and upstream task outputs into each component.
root:
  dag:
    tasks:
      chunk-audio:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-chunk-audio
        dependentTasks:
        - extract-audio
        inputs:
          parameters:
            chunk_duration_s:
              componentInputParameter: chunk_duration_s
            wav_path:
              taskOutputParameter:
                outputParameterKey: wav_path
                producerTask: extract-audio
        taskInfo:
          name: chunk-audio
      extract-audio:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-extract-audio
        inputs:
          parameters:
            mono:
              runtimeValue:
                constant: true
            sample_rate:
              runtimeValue:
                constant: 16000.0
            source_path:
              componentInputParameter: source_path
        taskInfo:
          name: extract-audio
      format-transcript:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-format-transcript
        dependentTasks:
        - transcribe-chunks
        inputs:
          parameters:
            full_text:
              taskOutputParameter:
                outputParameterKey: full_text
                producerTask: transcribe-chunks
            output_format:
              componentInputParameter: output_format
            segments:
              taskOutputParameter:
                outputParameterKey: segments
                producerTask: transcribe-chunks
            # Duration comes from transcribe-chunks (sum of chunk durations),
            # not from extract-audio's duration_s output.
            total_duration_s:
              taskOutputParameter:
                outputParameterKey: total_duration_s
                producerTask: transcribe-chunks
        taskInfo:
          name: format-transcript
      log-transcription-metrics:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-log-transcription-metrics
        dependentTasks:
        - transcribe-chunks
        inputs:
          parameters:
            full_text:
              taskOutputParameter:
                outputParameterKey: full_text
                producerTask: transcribe-chunks
            mlflow_tracking_uri:
              componentInputParameter: mlflow_tracking_uri
            source_path:
              componentInputParameter: source_path
            total_duration_s:
              taskOutputParameter:
                outputParameterKey: total_duration_s
                producerTask: transcribe-chunks
        taskInfo:
          name: log-transcription-metrics
      transcribe-chunks:
        cachingOptions:
          enableCache: true
        componentRef:
          name: comp-transcribe-chunks
        dependentTasks:
        - chunk-audio
        inputs:
          parameters:
            chunk_paths:
              taskOutputParameter:
                outputParameterKey: chunk_paths
                producerTask: chunk-audio
            language:
              componentInputParameter: language
            # Pinned to verbose_json; the component reads "segments" and
            # "duration" from the response.
            response_format:
              runtimeValue:
                constant: verbose_json
            whisper_url:
              componentInputParameter: whisper_url
        taskInfo:
          name: transcribe-chunks
  inputDefinitions:
    parameters:
      chunk_duration_s:
        defaultValue: 300.0
        isOptional: true
        parameterType: NUMBER_INTEGER
      language:
        defaultValue: en
        isOptional: true
        parameterType: STRING
      mlflow_tracking_uri:
        defaultValue: http://mlflow.mlflow.svc.cluster.local:80
        isOptional: true
        parameterType: STRING
      output_format:
        defaultValue: srt
        isOptional: true
        parameterType: STRING
      source_path:
        defaultValue: /data/dvd/movie.mkv
        isOptional: true
        parameterType: STRING
      whisper_url:
        defaultValue: http://ai-inference-serve-svc.kuberay.svc.cluster.local:8000/whisper
        isOptional: true
        parameterType: STRING
schemaVersion: 2.1.0
sdkVersion: kfp-2.12.1