diff --git a/e2e_test.go b/e2e_test.go
new file mode 100644
index 0000000..132fb42
--- /dev/null
+++ b/e2e_test.go
@@ -0,0 +1,320 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"encoding/json"
+	"math"
+	"mime/multipart"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+)
+
+// ────────────────────────────────────────────────────────────────────────────
+// E2E tests: audio buffer lifecycle + transcription pipeline
+// ────────────────────────────────────────────────────────────────────────────
+
+// TestAudioBufferE2E_FullLifecycle walks one buffer through a whole session:
+// chunks are added, the audio is read back, the buffer is cleared, and the
+// session is marked complete.
+func TestAudioBufferE2E_FullLifecycle(t *testing.T) {
+	ab := newAudioBuffer("e2e-session")
+
+	// Send 10 chunks of 50 KB each.
+	const (
+		numChunks = 10
+		chunkSize = 50000
+		wantTotal = numChunks * chunkSize
+	)
+	for i := 0; i < numChunks; i++ {
+		ab.addChunk(make([]byte, chunkSize))
+	}
+
+	ab.mu.Lock()
+	if ab.totalBytes != wantTotal {
+		t.Errorf("totalBytes = %d, want %d", ab.totalBytes, wantTotal)
+	}
+	ab.mu.Unlock()
+
+	// NOTE(review): the shouldProcess result is not asserted. 500000 bytes is
+	// below the 512000 passed as the first argument, and the decision rules
+	// are not visible from this test. When it reports false, voice activity is
+	// flagged under the lock, like every other field access in this file.
+	if !ab.shouldProcess(512000, 5120000, 2.0) {
+		ab.mu.Lock()
+		ab.hasVoiceActivity = true
+		ab.mu.Unlock()
+	}
+
+	// Get audio and verify concatenation.
+	if audio := ab.getAudio(); len(audio) != wantTotal {
+		t.Errorf("getAudio() len = %d, want %d", len(audio), wantTotal)
+	}
+
+	// Clear and verify.
+	ab.clear()
+	ab.mu.Lock()
+	seq, total := ab.sequence, ab.totalBytes
+	ab.mu.Unlock()
+	if seq != 1 {
+		t.Errorf("sequence = %d, want 1", seq)
+	}
+	if total != 0 {
+		t.Errorf("totalBytes after clear = %d, want 0", total)
+	}
+
+	// Mark complete.
+	ab.markComplete()
+	ab.mu.Lock()
+	complete := ab.isComplete
+	ab.mu.Unlock()
+	if !complete {
+		t.Error("expected isComplete=true")
+	}
+}
+
+// TestAudioBufferE2E_InterruptDuringResponse checks that loud audio arriving
+// while the buffer is in stateResponding is reported as an interrupt once the
+// duration threshold has elapsed.
+func TestAudioBufferE2E_InterruptDuringResponse(t *testing.T) {
+	ab := newAudioBuffer("interrupt-session")
+	ab.setState(stateResponding)
+
+	// Simulate loud speech during response (user interrupting).
+	loud := makeLoudAudio(500)
+
+	// First check: starts tracking the interrupt timer. Only the second call
+	// is asserted.
+	ab.checkInterrupt(loud, true, 0.01, 0.1)
+
+	// Sleep to exceed the duration threshold (0.1, presumably seconds).
+	time.Sleep(150 * time.Millisecond)
+
+	// Second check: should now confirm the interrupt.
+	if !ab.checkInterrupt(loud, true, 0.01, 0.1) {
+		t.Error("expected interrupt after duration threshold")
+	}
+}
+
+// TestAudioBufferE2E_InterruptDisabled checks that loud audio does not trigger
+// an interrupt when checkInterrupt is called with interrupts disabled.
+func TestAudioBufferE2E_InterruptDisabled(t *testing.T) {
+	ab := newAudioBuffer("no-interrupt")
+	ab.setState(stateResponding)
+	loud := makeLoudAudio(500)
+
+	if ab.checkInterrupt(loud, false, 0.01, 0.0) {
+		t.Error("interrupt should not trigger when disabled")
+	}
+}
+
+// TestAudioBufferE2E_ConcurrentChunks adds chunks from many goroutines at once
+// and checks that every chunk and byte is accounted for afterwards.
+func TestAudioBufferE2E_ConcurrentChunks(t *testing.T) {
+	const (
+		numGoroutines = 20
+		chunkSize     = 1000
+		wantTotal     = numGoroutines * chunkSize
+	)
+	ab := newAudioBuffer("concurrent")
+
+	var wg sync.WaitGroup
+	wg.Add(numGoroutines)
+	for i := 0; i < numGoroutines; i++ {
+		go func() {
+			defer wg.Done()
+			ab.addChunk(make([]byte, chunkSize))
+		}()
+	}
+	wg.Wait()
+
+	ab.mu.Lock()
+	total := ab.totalBytes
+	chunks := len(ab.chunks)
+	ab.mu.Unlock()
+
+	if total != wantTotal {
+		t.Errorf("totalBytes = %d, want %d", total, wantTotal)
+	}
+	if chunks != numGoroutines {
+		t.Errorf("chunks = %d, want %d", chunks, numGoroutines)
+	}
+
+	if audio := ab.getAudio(); len(audio) != wantTotal {
+		t.Errorf("getAudio len = %d, want %d", len(audio), wantTotal)
+	}
+}
+
+// TestTranscriptionE2E_MockWhisper posts a multipart upload to a mock Whisper
+// server and checks both what the server received and the JSON it returned.
+func TestTranscriptionE2E_MockWhisper(t *testing.T) {
+	const audioSize = 8000 // bytes of simulated audio in the upload
+
+	// Mock Whisper server that validates the multipart form upload.
+	whisperSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.Method != http.MethodPost {
+			t.Errorf("expected POST, got %s", r.Method)
+		}
+		ct := r.Header.Get("Content-Type")
+		if ct == "" {
+			t.Error("missing Content-Type")
+		}
+
+		// Parse multipart form if present.
+		if strings.HasPrefix(ct, "multipart/form-data") {
+			if err := r.ParseMultipartForm(32 << 20); err != nil {
+				t.Errorf("multipart parse: %v", err)
+			}
+			file, hdr, err := r.FormFile("file")
+			if err != nil {
+				t.Errorf("missing 'file' field: %v", err)
+			} else {
+				if hdr.Size != audioSize {
+					t.Errorf("uploaded file size = %d, want %d", hdr.Size, audioSize)
+				}
+				file.Close()
+			}
+		}
+
+		w.Header().Set("Content-Type", "application/json")
+		if err := json.NewEncoder(w).Encode(map[string]string{"text": "hello world"}); err != nil {
+			t.Errorf("encode response: %v", err)
+		}
+	}))
+	defer whisperSrv.Close()
+
+	// Build a proper multipart request like stt-module's transcribe() does.
+	var buf bytes.Buffer
+	writer := multipart.NewWriter(&buf)
+	part, err := writer.CreateFormFile("file", "audio.wav")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err := part.Write(make([]byte, audioSize)); err != nil {
+		t.Fatalf("write audio part: %v", err)
+	}
+	// Close writes the trailing boundary that terminates the multipart body.
+	if err := writer.Close(); err != nil {
+		t.Fatalf("close multipart writer: %v", err)
+	}
+
+	resp, err := http.Post(whisperSrv.URL+"/v1/audio/transcriptions", writer.FormDataContentType(), &buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("status = %d, want %d", resp.StatusCode, http.StatusOK)
+	}
+
+	var result map[string]string
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		t.Fatalf("decode response: %v", err)
+	}
+	if got, want := result["text"], "hello world"; got != want {
+		t.Errorf("text = %q, want %q", got, want)
+	}
+}
+
+// TestAudioRMSE2E_RealisticSignal feeds a 440 Hz sine wave, encoded as 16-bit
+// little-endian samples, to calculateAudioRMS and detectVoiceActivity.
+func TestAudioRMSE2E_RealisticSignal(t *testing.T) {
+	// Generate a sine wave at 440 Hz, 16 kHz sample rate.
+	sampleRate := 16000
+	duration := 0.1 // 100 ms
+	numSamples := int(float64(sampleRate) * duration)
+	audio := make([]byte, numSamples*2) // two bytes per sample
+
+	amplitude := 16000.0
+	for i := 0; i < numSamples; i++ {
+		sample := int16(amplitude * math.Sin(2*math.Pi*440*float64(i)/float64(sampleRate)))
+		binary.LittleEndian.PutUint16(audio[i*2:], uint16(sample))
+	}
+
+	rms := calculateAudioRMS(audio)
+	// RMS of a sine wave = amplitude / sqrt(2), normalized here by 32768.
+	expectedRMS := amplitude / math.Sqrt(2) / 32768.0
+	tolerance := 0.01
+	if math.Abs(rms-expectedRMS) > tolerance {
+		t.Errorf("RMS = %.4f, expected ~%.4f (±%.2f)", rms, expectedRMS, tolerance)
+	}
+
+	if !detectVoiceActivity(audio) {
+		t.Error("440 Hz sine at amplitude 16000 should be detected as voice")
+	}
+}
+
+// ────────────────────────────────────────────────────────────────────────────
+// Benchmarks
+// ────────────────────────────────────────────────────────────────────────────
+
+// BenchmarkAudioBufferAddChunk measures addChunk with a reused 4096-byte chunk.
+func BenchmarkAudioBufferAddChunk(b *testing.B) {
+	ab := newAudioBuffer("bench")
+	chunk := make([]byte, 4096)
+
+	// b.Loop resets the timer on its first call, so the setup is not timed.
+	for b.Loop() {
+		ab.addChunk(chunk)
+	}
+}
+
+// BenchmarkAudioBufferGetAudio measures getAudio on a buffer pre-filled with
+// 100 chunks of 4096 bytes.
+func BenchmarkAudioBufferGetAudio(b *testing.B) {
+	ab := newAudioBuffer("bench")
+	for i := 0; i < 100; i++ {
+		ab.addChunk(make([]byte, 4096))
+	}
+
+	for b.Loop() {
+		_ = ab.getAudio()
+	}
+}
+
+// BenchmarkCalculateAudioRMS measures calculateAudioRMS over 16000 16-bit
+// samples of a sine wave.
+func BenchmarkCalculateAudioRMS(b *testing.B) {
+	audio := make([]byte, 32000) // 1 second at 16kHz, 16-bit
+	for i := 0; i < 16000; i++ {
+		val := int16(16000 * math.Sin(float64(i)*0.1))
+		binary.LittleEndian.PutUint16(audio[i*2:], uint16(val))
+	}
+
+	for b.Loop() {
+		calculateAudioRMS(audio)
+	}
+}
+
+// BenchmarkDetectVoiceActivity measures detectVoiceActivity over 4000 16-bit
+// samples of a sine wave.
+func BenchmarkDetectVoiceActivity(b *testing.B) {
+	audio := make([]byte, 8000) // 250ms at 16kHz
+	for i := 0; i < 4000; i++ {
+		val := int16(10000 * math.Sin(float64(i)*0.2))
+		binary.LittleEndian.PutUint16(audio[i*2:], uint16(val))
+	}
+
+	for b.Loop() {
+		detectVoiceActivity(audio)
+	}
+}
+
+// BenchmarkCheckInterrupt measures a single checkInterrupt call on loud audio
+// while the buffer is in stateResponding.
+func BenchmarkCheckInterrupt(b *testing.B) {
+	ab := newAudioBuffer("bench")
+	ab.setState(stateResponding)
+	audio := makeLoudAudio(500)
+
+	for b.Loop() {
+		ab.checkInterrupt(audio, true, 0.02, 999.0) // high duration so it never triggers
+		ab.mu.Lock()
+		ab.interruptStartTime = nil // reset for clean iteration
+		ab.mu.Unlock()
+	}
+}
diff --git a/go.mod b/go.mod
index d14f721..7dd92d6 100644
--- a/go.mod
+++ b/go.mod
@@ -11,6 +11,7 @@ require (
 require (
 	github.com/cenkalti/backoff/v5 v5.0.3 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/go-logr/logr v1.4.3 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/google/uuid v1.6.0 // indirect
diff --git a/go.sum b/go.sum
index 4a3f959..b9a1b68 100644
--- a/go.sum
+++ b/go.sum
@@ -4,6 +4,8 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF
 github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
+github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
 github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=