package main import ( "bytes" "encoding/binary" "encoding/json" "math" "mime/multipart" "net/http" "net/http/httptest" "strings" "sync" "testing" "time" ) // ──────────────────────────────────────────────────────────────────────────── // E2E tests: audio buffer lifecycle + transcription pipeline // ──────────────────────────────────────────────────────────────────────────── func TestAudioBufferE2E_FullLifecycle(t *testing.T) { // Simulate a full session: start → chunks → process → end ab := newAudioBuffer("e2e-session") // Send 10 chunks of 50 KB each for i := 0; i < 10; i++ { ab.addChunk(make([]byte, 50000)) } ab.mu.Lock() if ab.totalBytes != 500000 { t.Errorf("totalBytes = %d, want 500000", ab.totalBytes) } ab.mu.Unlock() // Should process (meets 512KB threshold approximately) if !ab.shouldProcess(512000, 5120000, 2.0) { // Under threshold but check with voice activity ab.hasVoiceActivity = true } // Get audio and verify concatenation audio := ab.getAudio() if len(audio) != 500000 { t.Errorf("getAudio() len = %d, want 500000", len(audio)) } // Clear and verify ab.clear() ab.mu.Lock() seq := ab.sequence total := ab.totalBytes ab.mu.Unlock() if seq != 1 { t.Errorf("sequence = %d, want 1", seq) } if total != 0 { t.Errorf("totalBytes after clear = %d", total) } // Mark complete ab.markComplete() ab.mu.Lock() if !ab.isComplete { t.Error("expected isComplete=true") } ab.mu.Unlock() } func TestAudioBufferE2E_InterruptDuringResponse(t *testing.T) { ab := newAudioBuffer("interrupt-session") ab.setState(stateResponding) // Simulate loud speech during response (user interrupting) loud := makeLoudAudio(500) // First check: starts tracking interrupt timer ab.checkInterrupt(loud, true, 0.01, 0.1) // Sleep to exceed duration threshold time.Sleep(150 * time.Millisecond) // Second check: should now confirm interrupt interrupted := ab.checkInterrupt(loud, true, 0.01, 0.1) if !interrupted { t.Error("expected interrupt after duration threshold") } } func TestAudioBufferE2E_InterruptDisabled(t *testing.T) { ab := newAudioBuffer("no-interrupt") ab.setState(stateResponding) loud := makeLoudAudio(500) if ab.checkInterrupt(loud, false, 0.01, 0.0) { t.Error("interrupt should not trigger when disabled") } } func TestAudioBufferE2E_ConcurrentChunks(t *testing.T) { ab := newAudioBuffer("concurrent") var wg sync.WaitGroup numGoroutines := 20 chunkSize := 1000 wg.Add(numGoroutines) for i := 0; i < numGoroutines; i++ { go func() { defer wg.Done() ab.addChunk(make([]byte, chunkSize)) }() } wg.Wait() ab.mu.Lock() total := ab.totalBytes chunks := len(ab.chunks) ab.mu.Unlock() if total != numGoroutines*chunkSize { t.Errorf("totalBytes = %d, want %d", total, numGoroutines*chunkSize) } if chunks != numGoroutines { t.Errorf("chunks = %d, want %d", chunks, numGoroutines) } audio := ab.getAudio() if len(audio) != numGoroutines*chunkSize { t.Errorf("getAudio len = %d, want %d", len(audio), numGoroutines*chunkSize) } } func TestTranscriptionE2E_MockWhisper(t *testing.T) { // Full mock Whisper server that validates multipart form upload whisperSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { t.Errorf("expected POST, got %s", r.Method) } ct := r.Header.Get("Content-Type") if ct == "" { t.Error("missing Content-Type") } // Parse multipart form if present if strings.HasPrefix(ct, "multipart/form-data") { if err := r.ParseMultipartForm(32 << 20); err != nil { t.Errorf("multipart parse: %v", err) } file, _, err := r.FormFile("file") if err != nil { t.Errorf("missing 'file' field: %v", err) } else { _ = file.Close() } } w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(map[string]string{"text": "hello world"}) })) defer whisperSrv.Close() // Build a proper multipart request like stt-module's transcribe() does var buf bytes.Buffer writer := multipart.NewWriter(&buf) part, err := writer.CreateFormFile("file", "audio.wav") if err != nil { t.Fatal(err) } _, _ = part.Write(make([]byte, 8000)) // simulated audio _ = writer.Close() resp, err := http.Post(whisperSrv.URL+"/v1/audio/transcriptions", writer.FormDataContentType(), &buf) if err != nil { t.Fatal(err) } defer func() { _ = resp.Body.Close() }() if resp.StatusCode != 200 { t.Errorf("status = %d", resp.StatusCode) } var result map[string]string _ = json.NewDecoder(resp.Body).Decode(&result) if result["text"] != "hello world" { t.Errorf("text = %q, want %q", result["text"], "hello world") } } func TestAudioRMSE2E_RealisticSignal(t *testing.T) { // Generate a sine wave at 440 Hz, 16kHz sample rate sampleRate := 16000 duration := 0.1 // 100ms numSamples := int(float64(sampleRate) * duration) audio := make([]byte, numSamples*2) amplitude := 16000.0 for i := 0; i < numSamples; i++ { sample := int16(amplitude * math.Sin(2*math.Pi*440*float64(i)/float64(sampleRate))) binary.LittleEndian.PutUint16(audio[i*2:], uint16(sample)) } rms := calculateAudioRMS(audio) // RMS of a sine wave = amplitude / sqrt(2) / 32768 expectedRMS := amplitude / math.Sqrt(2) / 32768.0 tolerance := 0.01 if math.Abs(rms-expectedRMS) > tolerance { t.Errorf("RMS = %.4f, expected ~%.4f (±%.2f)", rms, expectedRMS, tolerance) } if !detectVoiceActivity(audio) { t.Error("440 Hz sine at amplitude 16000 should be detected as voice") } } // ──────────────────────────────────────────────────────────────────────────── // Benchmarks // ──────────────────────────────────────────────────────────────────────────── func BenchmarkAudioBufferAddChunk(b *testing.B) { ab := newAudioBuffer("bench") chunk := make([]byte, 4096) b.ResetTimer() for b.Loop() { ab.addChunk(chunk) } } func BenchmarkAudioBufferGetAudio(b *testing.B) { ab := newAudioBuffer("bench") for i := 0; i < 100; i++ { ab.addChunk(make([]byte, 4096)) } b.ResetTimer() for b.Loop() { _ = ab.getAudio() } } func BenchmarkCalculateAudioRMS(b *testing.B) { audio := make([]byte, 32000) // 1 second at 16kHz, 16-bit for i := 0; i < 16000; i++ { val := int16(16000 * math.Sin(float64(i)*0.1)) binary.LittleEndian.PutUint16(audio[i*2:], uint16(val)) } b.ResetTimer() for b.Loop() { calculateAudioRMS(audio) } } func BenchmarkDetectVoiceActivity(b *testing.B) { audio := make([]byte, 8000) // 250ms at 16kHz for i := 0; i < 4000; i++ { val := int16(10000 * math.Sin(float64(i)*0.2)) binary.LittleEndian.PutUint16(audio[i*2:], uint16(val)) } b.ResetTimer() for b.Loop() { detectVoiceActivity(audio) } } func BenchmarkCheckInterrupt(b *testing.B) { ab := newAudioBuffer("bench") ab.setState(stateResponding) audio := makeLoudAudio(500) b.ResetTimer() for b.Loop() { ab.checkInterrupt(audio, true, 0.02, 999.0) // high duration so it never triggers ab.mu.Lock() ab.interruptStartTime = nil // reset for clean iteration ab.mu.Unlock() } }