feat: rewrite stt-module (HTTP variant) in Go

Replace the Python streaming STT service with a Go implementation to get smaller container images. The local Whisper/ROCm variant (stt_streaming_local.py, Dockerfile.rocm) stays in Python.

- AudioBuffer with session state management (listening/responding)
- RMS-based voice activity detection (pure Go, no cgo)
- Interrupt detection during LLM response playback
- JetStream AI_VOICE_STREAM setup
- Session auto-creation and cleanup
- Dockerfile: multi-stage golang:1.25-alpine → scratch
- CI: Gitea Actions with lint/test/release/docker/notify
2026-02-19 18:04:15 -05:00
parent 43109cc931
commit 9d4d48e693
15 changed files with 1073 additions and 1852 deletions
--- a/main_test.go
+++ b/main_test.go
@@ -0,0 +1,200 @@
+package main
+
+import (
+	"encoding/binary"
+	"encoding/json"
+	"math"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+// TestCalculateAudioRMS checks calculateAudioRMS on three inputs: an all-zero
+// buffer, a buffer filled with the largest positive 16-bit little-endian
+// sample, and a single byte that is too short to hold one 16-bit sample.
+func TestCalculateAudioRMS(t *testing.T) {
+	// Silence: all zeros
+	silence := make([]byte, 200)
+	if rms := calculateAudioRMS(silence); rms != 0.0 {
+		t.Errorf("silence RMS = %f, want 0.0", rms)
+	}
+
+	// Max amplitude: every 16-bit sample set to the largest positive value.
+	loud := make([]byte, 200)
+	for off := 0; off < len(loud); off += 2 {
+		binary.LittleEndian.PutUint16(loud[off:], uint16(math.MaxInt16))
+	}
+	// Bound the result on both sides: a lower bound alone would also accept
+	// a result far above the ~1.0 that the failure message asks for.
+	if rms := calculateAudioRMS(loud); rms < 0.99 || rms > 1.01 {
+		t.Errorf("max amplitude RMS = %f, want ~1.0", rms)
+	}
+
+	// Small input: one byte cannot hold a 16-bit sample.
+	if rms := calculateAudioRMS([]byte{0}); rms != 0.0 {
+		t.Errorf("single byte RMS = %f", rms)
+	}
+}
+
+func TestDetectVoiceActivity(t *testing.T) {
+	// Silence should not be detected as voice
+	silence := make([]byte, 200)
+	if detectVoiceActivity(silence) {
+		t.Error("silence detected as voice")
+	}
+
+	// Loud audio should be detected as voice
+	loud := make([]byte, 200)
+	for i := 0; i < 100; i++ {
+		val := int16(16000 * math.Sin(float64(i)*0.1))
+		binary.LittleEndian.PutUint16(loud[i*2:], uint16(val))
+	}
+	if !detectVoiceActivity(loud) {
+		t.Error("loud audio not detected as voice")
+	}
+}
+
+func TestAudioBufferBasic(t *testing.T) {
+	ab := newAudioBuffer("test-session")
+	if ab.sessionID != "test-session" {
+		t.Error("wrong session ID")
+	}
+	if ab.state != stateListening {
+		t.Error("initial state should be listening")
+	}
+
+	// Add chunk
+	chunk := make([]byte, 1000)
+	ab.addChunk(chunk)
+	if ab.totalBytes != 1000 {
+		t.Errorf("totalBytes = %d, want 1000", ab.totalBytes)
+	}
+
+	// Get audio
+	audio := ab.getAudio()
+	if len(audio) != 1000 {
+		t.Errorf("getAudio len = %d, want 1000", len(audio))
+	}
+
+	// Clear
+	ab.clear()
+	if ab.totalBytes != 0 {
+		t.Error("totalBytes should be 0 after clear")
+	}
+	if ab.sequence != 1 {
+		t.Errorf("sequence = %d, want 1", ab.sequence)
+	}
+}
+
+// TestAudioBufferStateChange checks that setState(stateResponding) takes
+// effect and that a following setState("invalid") leaves the state untouched.
+func TestAudioBufferStateChange(t *testing.T) {
+	buf := newAudioBuffer("s1")
+
+	// expectResponding reads the state while holding the buffer's mutex and
+	// reports msg when the state is anything other than stateResponding.
+	expectResponding := func(msg string) {
+		buf.mu.Lock()
+		defer buf.mu.Unlock()
+		if buf.state != stateResponding {
+			t.Error(msg)
+		}
+	}
+
+	buf.setState(stateResponding)
+	expectResponding("state should be responding")
+
+	buf.setState("invalid")
+	expectResponding("state should still be responding")
+}
+
+func TestAudioBufferShouldProcess(t *testing.T) {
+	ab := newAudioBuffer("s2")
+	// Empty buffer, recent time — should not process
+	if ab.shouldProcess(512000, 5120000, 2.0) {
+		t.Error("empty buffer should not process")
+	}
+
+	// Add enough data to meet threshold
+	ab.addChunk(make([]byte, 512000))
+	ab.hasVoiceActivity = true
+	if !ab.shouldProcess(512000, 5120000, 2.0) {
+		t.Error("full buffer should process")
+	}
+}
+
+// TestAudioBufferTimeout buffers only 100 bytes, marks voice activity, moves
+// lastChunkTime three seconds into the past, and expects
+// shouldProcess(512000, 5120000, 2.0) to report true.
+func TestAudioBufferTimeout(t *testing.T) {
+	buf := newAudioBuffer("s3")
+	buf.addChunk(make([]byte, 100))
+	buf.hasVoiceActivity = true
+
+	// Backdate the last chunk so the buffer looks idle for three seconds.
+	stale := time.Now().Add(-3 * time.Second)
+	buf.mu.Lock()
+	buf.lastChunkTime = stale
+	buf.mu.Unlock()
+
+	if !buf.shouldProcess(512000, 5120000, 2.0) {
+		t.Error("timed-out buffer should process")
+	}
+}
+
+// TestAudioBufferCheckInterrupt calls checkInterrupt with the same loud audio
+// and arguments (true, 0.001, 0.0) twice: once on a fresh buffer, where no
+// interrupt is expected, and once after setState(stateResponding), where an
+// interrupt is expected.
+func TestAudioBufferCheckInterrupt(t *testing.T) {
+	speech := makeLoudAudio(100)
+	buf := newAudioBuffer("s4")
+
+	// Fresh buffer, state never switched to responding: no interrupt.
+	if buf.checkInterrupt(speech, true, 0.001, 0.0) {
+		t.Error("should not interrupt in listening state")
+	}
+
+	// Once responding, the 0.0 final argument means an immediate interrupt.
+	buf.setState(stateResponding)
+	if !buf.checkInterrupt(speech, true, 0.001, 0.0) {
+		t.Error("should interrupt in responding state with loud audio")
+	}
+}
+
+func TestTranscribeHTTP(t *testing.T) {
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/v1/audio/transcriptions" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		if r.Method != http.MethodPost {
+			t.Errorf("expected POST, got %s", r.Method)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(map[string]string{"text": "hello world"})
+	}))
+	defer ts.Close()
+
+	// Verify the mock responds correctly
+	resp, err := http.Post(ts.URL+"/v1/audio/transcriptions", "audio/wav", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != 200 {
+		t.Errorf("status = %d", resp.StatusCode)
+	}
+}
+
+// TestHelpers sets four environment variables and checks that getEnv,
+// getEnvInt, getEnvFloat and getEnvBool each return the value that was set
+// rather than the fallback passed as the second argument.
+func TestHelpers(t *testing.T) {
+	// Each helper is checked against its own key, so all variables can be
+	// set up front.
+	t.Setenv("STT_TEST", "val")
+	t.Setenv("STT_PORT", "9090")
+	t.Setenv("STT_TIMEOUT", "1.5")
+	t.Setenv("STT_FLAG", "true")
+
+	if s := getEnv("STT_TEST", "x"); s != "val" {
+		t.Errorf("getEnv = %q", s)
+	}
+	if n := getEnvInt("STT_PORT", 0); n != 9090 {
+		t.Errorf("getEnvInt = %d", n)
+	}
+	if f := getEnvFloat("STT_TIMEOUT", 0); f != 1.5 {
+		t.Errorf("getEnvFloat = %f", f)
+	}
+	if !getEnvBool("STT_FLAG", false) {
+		t.Error("getEnvBool should be true")
+	}
+}
+
+// makeLoudAudio creates a 16-bit PCM audio buffer with high amplitude.
+func makeLoudAudio(numSamples int) []byte {
+	buf := make([]byte, numSamples*2)
+	for i := 0; i < numSamples; i++ {
+		val := int16(20000 * math.Sin(float64(i)*0.3))
+		binary.LittleEndian.PutUint16(buf[i*2:], uint16(val))
+	}
+	return buf
+}