- e2e_test.go: AudioBuffer lifecycle, interrupt detection, concurrent chunks - Mock Whisper multipart transcription, realistic sine wave RMS validation - Benchmarks: addChunk 2.7µs, getAudio 155µs, RMS 18µs, VAD 4.9µs
279 lines
7.5 KiB
Go
279 lines
7.5 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/binary"
|
|
"encoding/json"
|
|
"math"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
// ────────────────────────────────────────────────────────────────────────────
|
|
// E2E tests: audio buffer lifecycle + transcription pipeline
|
|
// ────────────────────────────────────────────────────────────────────────────
|
|
|
|
func TestAudioBufferE2E_FullLifecycle(t *testing.T) {
|
|
// Simulate a full session: start → chunks → process → end
|
|
ab := newAudioBuffer("e2e-session")
|
|
|
|
// Send 10 chunks of 50 KB each
|
|
for i := 0; i < 10; i++ {
|
|
ab.addChunk(make([]byte, 50000))
|
|
}
|
|
|
|
ab.mu.Lock()
|
|
if ab.totalBytes != 500000 {
|
|
t.Errorf("totalBytes = %d, want 500000", ab.totalBytes)
|
|
}
|
|
ab.mu.Unlock()
|
|
|
|
// Should process (meets 512KB threshold approximately)
|
|
if !ab.shouldProcess(512000, 5120000, 2.0) {
|
|
// Under threshold but check with voice activity
|
|
ab.hasVoiceActivity = true
|
|
}
|
|
|
|
// Get audio and verify concatenation
|
|
audio := ab.getAudio()
|
|
if len(audio) != 500000 {
|
|
t.Errorf("getAudio() len = %d, want 500000", len(audio))
|
|
}
|
|
|
|
// Clear and verify
|
|
ab.clear()
|
|
ab.mu.Lock()
|
|
seq := ab.sequence
|
|
total := ab.totalBytes
|
|
ab.mu.Unlock()
|
|
if seq != 1 {
|
|
t.Errorf("sequence = %d, want 1", seq)
|
|
}
|
|
if total != 0 {
|
|
t.Errorf("totalBytes after clear = %d", total)
|
|
}
|
|
|
|
// Mark complete
|
|
ab.markComplete()
|
|
ab.mu.Lock()
|
|
if !ab.isComplete {
|
|
t.Error("expected isComplete=true")
|
|
}
|
|
ab.mu.Unlock()
|
|
}
|
|
|
|
func TestAudioBufferE2E_InterruptDuringResponse(t *testing.T) {
|
|
ab := newAudioBuffer("interrupt-session")
|
|
ab.setState(stateResponding)
|
|
|
|
// Simulate loud speech during response (user interrupting)
|
|
loud := makeLoudAudio(500)
|
|
|
|
// First check: starts tracking interrupt timer
|
|
ab.checkInterrupt(loud, true, 0.01, 0.1)
|
|
|
|
// Sleep to exceed duration threshold
|
|
time.Sleep(150 * time.Millisecond)
|
|
|
|
// Second check: should now confirm interrupt
|
|
interrupted := ab.checkInterrupt(loud, true, 0.01, 0.1)
|
|
if !interrupted {
|
|
t.Error("expected interrupt after duration threshold")
|
|
}
|
|
}
|
|
|
|
func TestAudioBufferE2E_InterruptDisabled(t *testing.T) {
|
|
ab := newAudioBuffer("no-interrupt")
|
|
ab.setState(stateResponding)
|
|
loud := makeLoudAudio(500)
|
|
|
|
if ab.checkInterrupt(loud, false, 0.01, 0.0) {
|
|
t.Error("interrupt should not trigger when disabled")
|
|
}
|
|
}
|
|
|
|
func TestAudioBufferE2E_ConcurrentChunks(t *testing.T) {
|
|
ab := newAudioBuffer("concurrent")
|
|
var wg sync.WaitGroup
|
|
numGoroutines := 20
|
|
chunkSize := 1000
|
|
|
|
wg.Add(numGoroutines)
|
|
for i := 0; i < numGoroutines; i++ {
|
|
go func() {
|
|
defer wg.Done()
|
|
ab.addChunk(make([]byte, chunkSize))
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
|
|
ab.mu.Lock()
|
|
total := ab.totalBytes
|
|
chunks := len(ab.chunks)
|
|
ab.mu.Unlock()
|
|
|
|
if total != numGoroutines*chunkSize {
|
|
t.Errorf("totalBytes = %d, want %d", total, numGoroutines*chunkSize)
|
|
}
|
|
if chunks != numGoroutines {
|
|
t.Errorf("chunks = %d, want %d", chunks, numGoroutines)
|
|
}
|
|
|
|
audio := ab.getAudio()
|
|
if len(audio) != numGoroutines*chunkSize {
|
|
t.Errorf("getAudio len = %d, want %d", len(audio), numGoroutines*chunkSize)
|
|
}
|
|
}
|
|
|
|
// TestTranscriptionE2E_MockWhisper posts a multipart upload to an in-process
// stand-in for a Whisper transcription endpoint and checks both sides of the
// exchange.
//
// Server side: the request must be a POST with a Content-Type; when that
// type is multipart/form-data the form must parse, carry a "file" field, and
// that file must have the size the client sent.
//
// Client side: building the multipart body, the round trip, the status code
// and the decoded JSON are all checked, so a failure is reported at the step
// that caused it instead of surfacing later as an empty "text" value.
func TestTranscriptionE2E_MockWhisper(t *testing.T) {
	const (
		audioLen = 8000 // bytes of simulated (all-zero) audio in the upload
		wantText = "hello world"
	)

	// The handler runs on the server's goroutine, so it reports problems with
	// t.Error/t.Errorf only; FailNow-style calls belong to the test goroutine.
	whisperSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodPost {
			t.Errorf("expected POST, got %s", r.Method)
		}

		switch ct := r.Header.Get("Content-Type"); {
		case ct == "":
			t.Error("missing Content-Type")
		case strings.HasPrefix(ct, "multipart/form-data"):
			if err := r.ParseMultipartForm(32 << 20); err != nil {
				t.Errorf("multipart parse: %v", err)
				// An unparseable form cannot contain the file; answer with an
				// error instead of pretending the transcription succeeded.
				http.Error(w, "bad multipart body", http.StatusBadRequest)
				return
			}
			file, hdr, err := r.FormFile("file")
			if err != nil {
				t.Errorf("missing 'file' field: %v", err)
			} else {
				if hdr.Size != audioLen {
					t.Errorf("uploaded file size = %d, want %d", hdr.Size, audioLen)
				}
				if err := file.Close(); err != nil {
					t.Errorf("close uploaded file: %v", err)
				}
			}
		}

		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(map[string]string{"text": wantText}); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer whisperSrv.Close()

	// Build a proper multipart request like stt-module's transcribe() does.
	var body bytes.Buffer
	form := multipart.NewWriter(&body)
	part, err := form.CreateFormFile("file", "audio.wav")
	if err != nil {
		t.Fatal(err)
	}
	if _, err := part.Write(make([]byte, audioLen)); err != nil {
		t.Fatalf("write audio part: %v", err)
	}
	// Close writes the terminating boundary; the body is incomplete without it.
	if err := form.Close(); err != nil {
		t.Fatalf("finish multipart body: %v", err)
	}

	// Use the server's own client with a timeout so a stuck handler fails the
	// test instead of hanging it.
	client := whisperSrv.Client()
	client.Timeout = 5 * time.Second

	resp, err := client.Post(whisperSrv.URL+"/v1/audio/transcriptions", form.FormDataContentType(), &body)
	if err != nil {
		t.Fatal(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
	}

	var result map[string]string
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		t.Fatalf("decode response: %v", err)
	}
	if got := result["text"]; got != wantText {
		t.Errorf("text = %q, want %q", got, wantText)
	}
}
|
|
|
|
func TestAudioRMSE2E_RealisticSignal(t *testing.T) {
|
|
// Generate a sine wave at 440 Hz, 16kHz sample rate
|
|
sampleRate := 16000
|
|
duration := 0.1 // 100ms
|
|
numSamples := int(float64(sampleRate) * duration)
|
|
audio := make([]byte, numSamples*2)
|
|
|
|
amplitude := 16000.0
|
|
for i := 0; i < numSamples; i++ {
|
|
sample := int16(amplitude * math.Sin(2*math.Pi*440*float64(i)/float64(sampleRate)))
|
|
binary.LittleEndian.PutUint16(audio[i*2:], uint16(sample))
|
|
}
|
|
|
|
rms := calculateAudioRMS(audio)
|
|
// RMS of a sine wave = amplitude / sqrt(2) / 32768
|
|
expectedRMS := amplitude / math.Sqrt(2) / 32768.0
|
|
tolerance := 0.01
|
|
if math.Abs(rms-expectedRMS) > tolerance {
|
|
t.Errorf("RMS = %.4f, expected ~%.4f (±%.2f)", rms, expectedRMS, tolerance)
|
|
}
|
|
|
|
if !detectVoiceActivity(audio) {
|
|
t.Error("440 Hz sine at amplitude 16000 should be detected as voice")
|
|
}
|
|
}
|
|
|
|
// ────────────────────────────────────────────────────────────────────────────
|
|
// Benchmarks
|
|
// ────────────────────────────────────────────────────────────────────────────
|
|
|
|
func BenchmarkAudioBufferAddChunk(b *testing.B) {
|
|
ab := newAudioBuffer("bench")
|
|
chunk := make([]byte, 4096)
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
ab.addChunk(chunk)
|
|
}
|
|
}
|
|
|
|
func BenchmarkAudioBufferGetAudio(b *testing.B) {
|
|
ab := newAudioBuffer("bench")
|
|
for i := 0; i < 100; i++ {
|
|
ab.addChunk(make([]byte, 4096))
|
|
}
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
_ = ab.getAudio()
|
|
}
|
|
}
|
|
|
|
func BenchmarkCalculateAudioRMS(b *testing.B) {
|
|
audio := make([]byte, 32000) // 1 second at 16kHz, 16-bit
|
|
for i := 0; i < 16000; i++ {
|
|
val := int16(16000 * math.Sin(float64(i)*0.1))
|
|
binary.LittleEndian.PutUint16(audio[i*2:], uint16(val))
|
|
}
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
calculateAudioRMS(audio)
|
|
}
|
|
}
|
|
|
|
func BenchmarkDetectVoiceActivity(b *testing.B) {
|
|
audio := make([]byte, 8000) // 250ms at 16kHz
|
|
for i := 0; i < 4000; i++ {
|
|
val := int16(10000 * math.Sin(float64(i)*0.2))
|
|
binary.LittleEndian.PutUint16(audio[i*2:], uint16(val))
|
|
}
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
detectVoiceActivity(audio)
|
|
}
|
|
}
|
|
|
|
func BenchmarkCheckInterrupt(b *testing.B) {
|
|
ab := newAudioBuffer("bench")
|
|
ab.setState(stateResponding)
|
|
audio := makeLoudAudio(500)
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
ab.checkInterrupt(audio, true, 0.02, 999.0) // high duration so it never triggers
|
|
ab.mu.Lock()
|
|
ab.interruptStartTime = nil // reset for clean iteration
|
|
ab.mu.Unlock()
|
|
}
|
|
}
|