feat: add streaming STT service with Whisper backend

- stt_streaming.py: HTTP-based STT using external Whisper service
- stt_streaming_local.py: ROCm-based local Whisper inference
- Voice Activity Detection (VAD) with WebRTC
- Interrupt detection for barge-in support
- Session state management (listening/responding)
- OpenTelemetry instrumentation with HyperDX support
- Dockerfile variants for HTTP and ROCm deployments
This commit is contained in:
2026-02-02 06:23:12 -05:00
parent 680e43fe39
commit 8fc5eb1193
9 changed files with 1473 additions and 1 deletion

52
Dockerfile.rocm Normal file
View File

@@ -0,0 +1,52 @@
# syntax=docker/dockerfile:1
# STT Streaming Service with ROCm for AMD GPU Whisper inference
# Targets AMD Strix Halo (gfx1151 / RDNA 3.5) but includes RDNA 3 compatibility
#
# Uses OpenAI Whisper with PyTorch ROCm backend
#
FROM docker.io/rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 AS base

WORKDIR /app

# Install system dependencies: ffmpeg for audio decoding, libsndfile1 for
# soundfile-based audio I/O. --no-install-recommends keeps the layer small,
# and the apt list cleanup happens in the same layer so it never persists.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault in
# libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
# TheRock nightly builds work correctly. Install BEFORE other deps since
# openai-whisper depends on torch.
# NOTE(review): the nightly index is unpinned, so this layer is not
# reproducible; pin exact wheel versions once a fixed release ships.
RUN pip install --no-cache-dir --break-system-packages \
    --index-url https://rocm.nightlies.amd.com/v2/gfx1151/ \
    torch torchaudio torchvision --force-reinstall

# Install Python dependencies for STT streaming.
# Copy only the requirements file first so this layer stays cached when
# application code changes. Use pip directly (more reliable than uv in this
# base image).
COPY requirements-rocm.txt .
RUN pip install --no-cache-dir --break-system-packages -r requirements-rocm.txt

# Download Whisper model at build time for faster startup
# Using medium model for good accuracy/speed balance
ARG WHISPER_MODEL=medium
ENV WHISPER_MODEL_SIZE=${WHISPER_MODEL}
# Pre-download the model during build (whisper is installed as openai-whisper).
# Use python3 to ensure correct interpreter. The "|| echo" fallback is
# deliberate: a build-time download failure (e.g. an offline builder) degrades
# to a runtime download instead of failing the whole build.
RUN python3 -c "import whisper; whisper.load_model('${WHISPER_MODEL}')" || echo "Model will be downloaded at runtime"

# Copy application code last so code edits don't invalidate dependency layers.
COPY stt_streaming_local.py .
COPY healthcheck.py .

# Set ROCm environment for AMD Strix Halo (gfx1151 / RDNA 3.5)
ENV HIP_VISIBLE_DEVICES=0
ENV HSA_ENABLE_SDMA=0
# Ensure PyTorch uses ROCm with expandable segments for large models
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
# Target gfx1151 (Strix Halo) - ROCm 7.1+ has native support
# Falls back to runtime override if kernels not available
ENV ROCM_TARGET_LST=gfx1151,gfx1100

# NOTE(review): container runs as root. GPU access typically requires
# /dev/kfd and /dev/dri (video/render host groups); adding a non-root USER
# needs matching supplementary GIDs on the host — confirm before changing.

# Let orchestrators detect a wedged service via the bundled probe script.
# Generous start period covers model load into VRAM on first boot.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
  CMD ["python3", "healthcheck.py"]

# Run the service. Use python3 explicitly for consistency with the build-time
# interpreter; bare "python" is not guaranteed on Ubuntu 24.04 bases.
CMD ["python3", "stt_streaming_local.py"]