feat: add streaming STT service with Whisper backend

- stt_streaming.py: HTTP-based STT using external Whisper service
- stt_streaming_local.py: ROCm-based local Whisper inference
- Voice Activity Detection (VAD) with WebRTC
- Interrupt detection for barge-in support
- Session state management (listening/responding)
- OpenTelemetry instrumentation with HyperDX support
- Dockerfile variants for HTTP and ROCm deployments
This commit is contained in:
2026-02-02 06:23:12 -05:00
parent 680e43fe39
commit 8fc5eb1193
9 changed files with 1473 additions and 1 deletion

52
Dockerfile.rocm Normal file
View File

@@ -0,0 +1,52 @@
# syntax=docker/dockerfile:1
# STT Streaming Service with ROCm for AMD GPU Whisper inference
# Targets AMD Strix Halo (gfx1151 / RDNA 3.5) but includes RDNA 3 compatibility
#
# Uses OpenAI Whisper with PyTorch ROCm backend
#
FROM docker.io/rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 AS base

WORKDIR /app

# Install system dependencies: ffmpeg for audio decoding, libsndfile1 for
# soundfile-based audio I/O. --no-install-recommends keeps the layer small,
# and the apt list cleanup happens in the same layer so it never persists.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault in
# libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
# TheRock nightly builds work correctly. Install BEFORE other deps since
# openai-whisper depends on torch.
# NOTE(review): the nightly index is unpinned, so this layer is not
# reproducible; pin exact wheel versions once a fixed release ships.
RUN pip install --no-cache-dir --break-system-packages \
    --index-url https://rocm.nightlies.amd.com/v2/gfx1151/ \
    torch torchaudio torchvision --force-reinstall

# Install Python dependencies for STT streaming.
# Copy only the requirements file first so this layer stays cached when
# application code changes. Use pip directly (more reliable than uv in this
# base image).
COPY requirements-rocm.txt .
RUN pip install --no-cache-dir --break-system-packages -r requirements-rocm.txt

# Download Whisper model at build time for faster startup
# Using medium model for good accuracy/speed balance
ARG WHISPER_MODEL=medium
ENV WHISPER_MODEL_SIZE=${WHISPER_MODEL}
# Pre-download the model during build (whisper is installed as openai-whisper).
# Use python3 to ensure correct interpreter. The "|| echo" fallback is
# deliberate: a build-time download failure (e.g. an offline builder) degrades
# to a runtime download instead of failing the whole build.
RUN python3 -c "import whisper; whisper.load_model('${WHISPER_MODEL}')" || echo "Model will be downloaded at runtime"

# Copy application code last so code edits don't invalidate dependency layers.
COPY stt_streaming_local.py .
COPY healthcheck.py .

# Set ROCm environment for AMD Strix Halo (gfx1151 / RDNA 3.5)
ENV HIP_VISIBLE_DEVICES=0
ENV HSA_ENABLE_SDMA=0
# Ensure PyTorch uses ROCm with expandable segments for large models
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
# Target gfx1151 (Strix Halo) - ROCm 7.1+ has native support
# Falls back to runtime override if kernels not available
ENV ROCM_TARGET_LST=gfx1151,gfx1100

# NOTE(review): container runs as root. GPU access typically requires
# /dev/kfd and /dev/dri (video/render host groups); adding a non-root USER
# needs matching supplementary GIDs on the host — confirm before changing.

# Let orchestrators detect a wedged service via the bundled probe script.
# Generous start period covers model load into VRAM on first boot.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
  CMD ["python3", "healthcheck.py"]

# Run the service. Use python3 explicitly for consistency with the build-time
# interpreter; bare "python" is not guaranteed on Ubuntu 24.04 bases.
CMD ["python3", "stt_streaming_local.py"]