feat: rewrite stt-module (HTTP variant) in Go
Replace the Python streaming STT service with Go for smaller container images. The local Whisper/ROCm variant (stt_streaming_local.py, Dockerfile.rocm) stays Python.

- AudioBuffer with session state management (listening/responding)
- RMS-based voice activity detection (pure Go, no cgo)
- Interrupt detection during LLM response playback
- JetStream AI_VOICE_STREAM setup
- Session auto-creation and cleanup
- Dockerfile: multi-stage golang:1.25-alpine → scratch
- CI: Gitea Actions with lint/test/release/docker/notify
This commit is contained in:
200
main_test.go
Normal file
200
main_test.go
Normal file
@@ -0,0 +1,200 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestCalculateAudioRMS(t *testing.T) {
|
||||
// Silence: all zeros
|
||||
silence := make([]byte, 200)
|
||||
rms := calculateAudioRMS(silence)
|
||||
if rms != 0.0 {
|
||||
t.Errorf("silence RMS = %f, want 0.0", rms)
|
||||
}
|
||||
|
||||
// Max amplitude: 16-bit samples at max value
|
||||
loud := make([]byte, 200)
|
||||
for i := 0; i < 100; i++ {
|
||||
binary.LittleEndian.PutUint16(loud[i*2:], uint16(32767))
|
||||
}
|
||||
rms = calculateAudioRMS(loud)
|
||||
if rms < 0.99 {
|
||||
t.Errorf("max amplitude RMS = %f, want ~1.0", rms)
|
||||
}
|
||||
|
||||
// Small input
|
||||
if rms := calculateAudioRMS([]byte{0}); rms != 0.0 {
|
||||
t.Errorf("single byte RMS = %f", rms)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectVoiceActivity(t *testing.T) {
|
||||
// Silence should not be detected as voice
|
||||
silence := make([]byte, 200)
|
||||
if detectVoiceActivity(silence) {
|
||||
t.Error("silence detected as voice")
|
||||
}
|
||||
|
||||
// Loud audio should be detected as voice
|
||||
loud := make([]byte, 200)
|
||||
for i := 0; i < 100; i++ {
|
||||
val := int16(16000 * math.Sin(float64(i)*0.1))
|
||||
binary.LittleEndian.PutUint16(loud[i*2:], uint16(val))
|
||||
}
|
||||
if !detectVoiceActivity(loud) {
|
||||
t.Error("loud audio not detected as voice")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAudioBufferBasic(t *testing.T) {
|
||||
ab := newAudioBuffer("test-session")
|
||||
if ab.sessionID != "test-session" {
|
||||
t.Error("wrong session ID")
|
||||
}
|
||||
if ab.state != stateListening {
|
||||
t.Error("initial state should be listening")
|
||||
}
|
||||
|
||||
// Add chunk
|
||||
chunk := make([]byte, 1000)
|
||||
ab.addChunk(chunk)
|
||||
if ab.totalBytes != 1000 {
|
||||
t.Errorf("totalBytes = %d, want 1000", ab.totalBytes)
|
||||
}
|
||||
|
||||
// Get audio
|
||||
audio := ab.getAudio()
|
||||
if len(audio) != 1000 {
|
||||
t.Errorf("getAudio len = %d, want 1000", len(audio))
|
||||
}
|
||||
|
||||
// Clear
|
||||
ab.clear()
|
||||
if ab.totalBytes != 0 {
|
||||
t.Error("totalBytes should be 0 after clear")
|
||||
}
|
||||
if ab.sequence != 1 {
|
||||
t.Errorf("sequence = %d, want 1", ab.sequence)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAudioBufferStateChange(t *testing.T) {
|
||||
ab := newAudioBuffer("s1")
|
||||
ab.setState(stateResponding)
|
||||
ab.mu.Lock()
|
||||
if ab.state != stateResponding {
|
||||
t.Error("state should be responding")
|
||||
}
|
||||
ab.mu.Unlock()
|
||||
|
||||
ab.setState("invalid")
|
||||
ab.mu.Lock()
|
||||
if ab.state != stateResponding {
|
||||
t.Error("state should still be responding")
|
||||
}
|
||||
ab.mu.Unlock()
|
||||
}
|
||||
|
||||
func TestAudioBufferShouldProcess(t *testing.T) {
|
||||
ab := newAudioBuffer("s2")
|
||||
// Empty buffer, recent time — should not process
|
||||
if ab.shouldProcess(512000, 5120000, 2.0) {
|
||||
t.Error("empty buffer should not process")
|
||||
}
|
||||
|
||||
// Add enough data to meet threshold
|
||||
ab.addChunk(make([]byte, 512000))
|
||||
ab.hasVoiceActivity = true
|
||||
if !ab.shouldProcess(512000, 5120000, 2.0) {
|
||||
t.Error("full buffer should process")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAudioBufferTimeout(t *testing.T) {
|
||||
ab := newAudioBuffer("s3")
|
||||
ab.addChunk(make([]byte, 100))
|
||||
ab.hasVoiceActivity = true
|
||||
// Simulate old lastChunkTime
|
||||
ab.mu.Lock()
|
||||
ab.lastChunkTime = time.Now().Add(-3 * time.Second)
|
||||
ab.mu.Unlock()
|
||||
|
||||
if !ab.shouldProcess(512000, 5120000, 2.0) {
|
||||
t.Error("timed-out buffer should process")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAudioBufferCheckInterrupt(t *testing.T) {
|
||||
ab := newAudioBuffer("s4")
|
||||
// Not in responding state — no interrupt
|
||||
loud := makeLoudAudio(100)
|
||||
if ab.checkInterrupt(loud, true, 0.001, 0.0) {
|
||||
t.Error("should not interrupt in listening state")
|
||||
}
|
||||
|
||||
// Switch to responding
|
||||
ab.setState(stateResponding)
|
||||
// With 0 duration threshold, immediate interrupt
|
||||
if !ab.checkInterrupt(loud, true, 0.001, 0.0) {
|
||||
t.Error("should interrupt in responding state with loud audio")
|
||||
}
|
||||
}
|
||||
|
||||
// TestTranscribeHTTP starts a stub transcription endpoint and performs one
// POST round trip against it, checking the request path and method on the
// server side and the status code and decoded JSON payload on the client side.
//
// NOTE(review): this only exercises the stub server through http.Post; it does
// not call any transcription code from the package under test.
func TestTranscribeHTTP(t *testing.T) {
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/v1/audio/transcriptions" {
			t.Errorf("unexpected path: %s", r.URL.Path)
		}
		if r.Method != http.MethodPost {
			t.Errorf("expected POST, got %s", r.Method)
		}
		w.Header().Set("Content-Type", "application/json")
		// Report an encoding failure instead of silently dropping it.
		if err := json.NewEncoder(w).Encode(map[string]string{"text": "hello world"}); err != nil {
			t.Errorf("encode mock response: %v", err)
		}
	}))
	defer ts.Close()

	// Verify the mock responds correctly.
	resp, err := http.Post(ts.URL+"/v1/audio/transcriptions", "audio/wav", nil)
	if err != nil {
		t.Fatal(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		t.Errorf("status = %d", resp.StatusCode)
	}

	// Decode the body so the payload the stub sends is actually verified.
	var payload struct {
		Text string `json:"text"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		t.Fatalf("decode response: %v", err)
	}
	if payload.Text != "hello world" {
		t.Errorf("text = %q, want %q", payload.Text, "hello world")
	}
}
|
||||
|
||||
func TestHelpers(t *testing.T) {
|
||||
t.Setenv("STT_TEST", "val")
|
||||
if got := getEnv("STT_TEST", "x"); got != "val" {
|
||||
t.Errorf("getEnv = %q", got)
|
||||
}
|
||||
t.Setenv("STT_PORT", "9090")
|
||||
if got := getEnvInt("STT_PORT", 0); got != 9090 {
|
||||
t.Errorf("getEnvInt = %d", got)
|
||||
}
|
||||
t.Setenv("STT_TIMEOUT", "1.5")
|
||||
if got := getEnvFloat("STT_TIMEOUT", 0); got != 1.5 {
|
||||
t.Errorf("getEnvFloat = %f", got)
|
||||
}
|
||||
t.Setenv("STT_FLAG", "true")
|
||||
if got := getEnvBool("STT_FLAG", false); !got {
|
||||
t.Error("getEnvBool should be true")
|
||||
}
|
||||
}
|
||||
|
||||
// makeLoudAudio returns numSamples samples of a sine wave with peak amplitude
// 20000, encoded as 16-bit little-endian PCM (two bytes per sample).
func makeLoudAudio(numSamples int) []byte {
	pcm := make([]byte, 2*numSamples)
	for off := 0; off < len(pcm); off += 2 {
		idx := off / 2
		sample := int16(20000 * math.Sin(float64(idx)*0.3))
		// Store the sample's two's-complement bit pattern.
		binary.LittleEndian.PutUint16(pcm[off:], uint16(sample))
	}
	return pcm
}
|
||||
Reference in New Issue
Block a user