// stt-module/e2e_test.go

package main

import (
	"bytes"
	"encoding/binary"
	"encoding/json"
	"math"
	"mime/multipart"
	"net/http"
	"net/http/httptest"
	"strings"
	"sync"
	"testing"
	"time"
)

// ────────────────────────────────────────────────────────────────────────────
// E2E tests: audio buffer lifecycle + transcription pipeline
// ────────────────────────────────────────────────────────────────────────────

// TestAudioBufferE2E_FullLifecycle walks one buffer through a whole session:
// chunks are appended, the buffered audio is read back, the buffer is cleared,
// and finally the session is marked complete.
//
// Every direct field read or write goes through ab.mu, and values are
// snapshotted under the lock and asserted only after it has been released.
func TestAudioBufferE2E_FullLifecycle(t *testing.T) {
	const (
		chunkCount = 10
		chunkBytes = 50000 // 50 KB per chunk
		wantTotal  = chunkCount * chunkBytes
	)

	ab := newAudioBuffer("e2e-session")

	for i := 0; i < chunkCount; i++ {
		ab.addChunk(make([]byte, chunkBytes))
	}

	ab.mu.Lock()
	buffered := ab.totalBytes
	ab.mu.Unlock()
	if buffered != wantTotal {
		t.Errorf("totalBytes = %d, want 500000", buffered)
	}

	// The 500000 buffered bytes are below the 512000 passed as the first
	// argument, so shouldProcess may well report false here. In that case
	// the buffer is flagged as containing voice activity.
	// NOTE(review): the result is never asserted; the argument meanings
	// (presumably min size, max size, timeout) should be confirmed against
	// shouldProcess before adding an assertion.
	if !ab.shouldProcess(512000, 5120000, 2.0) {
		ab.mu.Lock()
		ab.hasVoiceActivity = true
		ab.mu.Unlock()
	}

	// The concatenated audio must contain every appended byte.
	audio := ab.getAudio()
	if len(audio) != wantTotal {
		t.Errorf("getAudio() len = %d, want 500000", len(audio))
	}

	// After clear() the test expects the sequence to be 1 and the byte
	// counter to be back at zero.
	ab.clear()
	ab.mu.Lock()
	seq := ab.sequence
	total := ab.totalBytes
	ab.mu.Unlock()
	if seq != 1 {
		t.Errorf("sequence = %d, want 1", seq)
	}
	if total != 0 {
		t.Errorf("totalBytes after clear = %d", total)
	}

	ab.markComplete()
	ab.mu.Lock()
	complete := ab.isComplete
	ab.mu.Unlock()
	if !complete {
		t.Error("expected isComplete=true")
	}
}

// TestAudioBufferE2E_InterruptDuringResponse checks that loud audio arriving
// while the buffer is in stateResponding is reported as an interrupt once it
// has persisted long enough.
//
// checkInterrupt is called twice with identical arguments, 150ms apart; only
// the second result is asserted.
// NOTE(review): the trailing 0.1 is presumably a duration threshold in seconds
// and 0.01 an RMS threshold — confirm against checkInterrupt.
func TestAudioBufferE2E_InterruptDuringResponse(t *testing.T) {
	const (
		interruptEnabled = true
		rmsThreshold     = 0.01
		minDuration      = 0.1
	)

	buf := newAudioBuffer("interrupt-session")
	buf.setState(stateResponding)

	speech := makeLoudAudio(500)
	probe := func() bool {
		return buf.checkInterrupt(speech, interruptEnabled, rmsThreshold, minDuration)
	}

	// First probe: expected to start the interrupt timer; its result is
	// deliberately ignored.
	probe()

	// Wait long enough to exceed the duration threshold.
	time.Sleep(150 * time.Millisecond)

	// Second probe: the interrupt must now be confirmed.
	if !probe() {
		t.Error("expected interrupt after duration threshold")
	}
}

// TestAudioBufferE2E_InterruptDisabled checks that checkInterrupt reports no
// interrupt when its second argument (the enable flag) is false, even with
// loud audio and a zero final argument while the buffer is in
// stateResponding.
func TestAudioBufferE2E_InterruptDisabled(t *testing.T) {
	buf := newAudioBuffer("no-interrupt")
	buf.setState(stateResponding)

	triggered := buf.checkInterrupt(makeLoudAudio(500), false, 0.01, 0.0)
	if triggered {
		t.Error("interrupt should not trigger when disabled")
	}
}

// TestAudioBufferE2E_ConcurrentChunks appends one chunk from each of 20
// goroutines and then verifies that no chunk was lost: the byte counter, the
// number of stored chunks and the length of the concatenated audio must all
// match what was written.
func TestAudioBufferE2E_ConcurrentChunks(t *testing.T) {
	const (
		writers   = 20
		chunkSize = 1000
		wantBytes = writers * chunkSize
	)

	buf := newAudioBuffer("concurrent")

	var pending sync.WaitGroup
	for w := 0; w < writers; w++ {
		pending.Add(1)
		go func() {
			defer pending.Done()
			buf.addChunk(make([]byte, chunkSize))
		}()
	}
	pending.Wait()

	// Snapshot the counters under the lock; assert after releasing it.
	buf.mu.Lock()
	gotBytes, gotChunks := buf.totalBytes, len(buf.chunks)
	buf.mu.Unlock()

	if gotBytes != wantBytes {
		t.Errorf("totalBytes = %d, want %d", gotBytes, wantBytes)
	}
	if gotChunks != writers {
		t.Errorf("chunks = %d, want %d", gotChunks, writers)
	}

	if joined := buf.getAudio(); len(joined) != wantBytes {
		t.Errorf("getAudio len = %d, want %d", len(joined), wantBytes)
	}
}

// TestTranscriptionE2E_MockWhisper posts a multipart upload to a stand-in
// Whisper endpoint and checks both sides of the exchange.
//
// Server side: the request must be a POST carrying multipart/form-data with a
// "file" part. Client side: the response must be 200 with a JSON body whose
// "text" field is "hello world".
func TestTranscriptionE2E_MockWhisper(t *testing.T) {
	const wantText = "hello world"

	whisperSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on a server goroutine, so only the non-fatal
		// t.Error/t.Errorf are used here; FailNow-style calls must stay on
		// the test goroutine.
		if r.Method != http.MethodPost {
			t.Errorf("expected POST, got %s", r.Method)
		}

		ct := r.Header.Get("Content-Type")
		switch {
		case ct == "":
			t.Error("missing Content-Type")
		case !strings.HasPrefix(ct, "multipart/form-data"):
			// This test exists to validate the multipart upload, so any
			// other content type is a failure rather than a silent pass.
			t.Errorf("Content-Type = %q, want multipart/form-data", ct)
		default:
			// Stop at the first failure: looking up the file part after a
			// failed parse would only add a second, misleading error.
			if err := r.ParseMultipartForm(32 << 20); err != nil {
				t.Errorf("multipart parse: %v", err)
			} else if file, _, err := r.FormFile("file"); err != nil {
				t.Errorf("missing 'file' field: %v", err)
			} else if err := file.Close(); err != nil {
				t.Errorf("close uploaded file: %v", err)
			}
		}

		w.Header().Set("Content-Type", "application/json")
		if err := json.NewEncoder(w).Encode(map[string]string{"text": wantText}); err != nil {
			t.Errorf("encode response: %v", err)
		}
	}))
	defer whisperSrv.Close()

	// Build a multipart request body with a single "file" part holding
	// 8000 zero bytes of simulated audio.
	var body bytes.Buffer
	mw := multipart.NewWriter(&body)
	part, err := mw.CreateFormFile("file", "audio.wav")
	if err != nil {
		t.Fatal(err)
	}
	if _, err := part.Write(make([]byte, 8000)); err != nil {
		t.Fatalf("write audio part: %v", err)
	}
	// Close writes the terminating boundary; without it the body is not a
	// valid multipart message.
	if err := mw.Close(); err != nil {
		t.Fatalf("close multipart writer: %v", err)
	}

	// Use the server's own client so its connections are torn down together
	// with the server instead of lingering in the shared default transport.
	resp, err := whisperSrv.Client().Post(whisperSrv.URL+"/v1/audio/transcriptions", mw.FormDataContentType(), &body)
	if err != nil {
		t.Fatal(err)
	}
	// Best-effort close: a close error on a fully handled response carries
	// no information for this test.
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		t.Errorf("status = %d", resp.StatusCode)
	}

	var result map[string]string
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		t.Fatalf("decode response: %v", err)
	}
	if result["text"] != wantText {
		t.Errorf("text = %q, want %q", result["text"], wantText)
	}
}

// TestAudioRMSE2E_RealisticSignal feeds a synthetic 440 Hz tone (16 kHz sample
// rate, 100 ms, peak amplitude 16000) encoded as little-endian 16-bit samples
// to calculateAudioRMS and detectVoiceActivity.
//
// The RMS is expected to be within 0.01 of amplitude / sqrt(2) / 32768, and
// the tone is expected to be classified as voice.
func TestAudioRMSE2E_RealisticSignal(t *testing.T) {
	const (
		sampleRate     = 16000
		toneHz         = 440
		numSamples     = 1600 // 100 ms at 16 kHz
		bytesPerSample = 2
		amplitude      = 16000.0
		tolerance      = 0.01
	)

	pcm := make([]byte, numSamples*bytesPerSample)
	for off := 0; off < len(pcm); off += bytesPerSample {
		i := off / bytesPerSample
		phase := 2 * math.Pi * toneHz * float64(i) / float64(sampleRate)
		sample := int16(amplitude * math.Sin(phase))
		binary.LittleEndian.PutUint16(pcm[off:], uint16(sample))
	}

	// RMS of a sine wave is its peak over sqrt(2); dividing by 32768 puts it
	// on the same scale as the value the test expects back.
	expectedRMS := amplitude / math.Sqrt(2) / 32768.0
	rms := calculateAudioRMS(pcm)
	if diff := math.Abs(rms - expectedRMS); diff > tolerance {
		t.Errorf("RMS = %.4f, expected ~%.4f (±%.2f)", rms, expectedRMS, tolerance)
	}

	if voiced := detectVoiceActivity(pcm); !voiced {
		t.Error("440 Hz sine at amplitude 16000 should be detected as voice")
	}
}

// ────────────────────────────────────────────────────────────────────────────
// Benchmarks
// ────────────────────────────────────────────────────────────────────────────

// BenchmarkAudioBufferAddChunk measures addChunk when the same 4096-byte
// slice is appended repeatedly to a single buffer.
func BenchmarkAudioBufferAddChunk(b *testing.B) {
	const chunkBytes = 4096

	buf := newAudioBuffer("bench")
	payload := make([]byte, chunkBytes)

	b.ResetTimer()
	for b.Loop() {
		buf.addChunk(payload)
	}
}

// BenchmarkAudioBufferGetAudio measures getAudio on a buffer pre-filled with
// 100 chunks of 4096 bytes each. The fill happens before the timer is reset.
func BenchmarkAudioBufferGetAudio(b *testing.B) {
	const (
		chunkCount = 100
		chunkBytes = 4096
	)

	buf := newAudioBuffer("bench")
	for remaining := chunkCount; remaining > 0; remaining-- {
		buf.addChunk(make([]byte, chunkBytes))
	}

	b.ResetTimer()
	for b.Loop() {
		_ = buf.getAudio()
	}
}

// BenchmarkCalculateAudioRMS measures calculateAudioRMS on 32000 bytes of
// little-endian 16-bit samples (one second at 16 kHz) holding a sine with
// peak amplitude 16000.
func BenchmarkCalculateAudioRMS(b *testing.B) {
	const (
		sampleCount    = 16000
		bytesPerSample = 2
	)

	pcm := make([]byte, sampleCount*bytesPerSample)
	for off := 0; off < len(pcm); off += bytesPerSample {
		i := off / bytesPerSample
		sample := int16(16000 * math.Sin(float64(i)*0.1))
		binary.LittleEndian.PutUint16(pcm[off:], uint16(sample))
	}

	b.ResetTimer()
	for b.Loop() {
		calculateAudioRMS(pcm)
	}
}

// BenchmarkDetectVoiceActivity measures detectVoiceActivity on 8000 bytes of
// little-endian 16-bit samples (250 ms at 16 kHz) holding a sine with peak
// amplitude 10000.
func BenchmarkDetectVoiceActivity(b *testing.B) {
	const (
		sampleCount    = 4000
		bytesPerSample = 2
	)

	pcm := make([]byte, sampleCount*bytesPerSample)
	for off := 0; off < len(pcm); off += bytesPerSample {
		i := off / bytesPerSample
		sample := int16(10000 * math.Sin(float64(i)*0.2))
		binary.LittleEndian.PutUint16(pcm[off:], uint16(sample))
	}

	b.ResetTimer()
	for b.Loop() {
		detectVoiceActivity(pcm)
	}
}

// BenchmarkCheckInterrupt measures checkInterrupt on loud audio while the
// buffer is in stateResponding.
//
// The final argument is set very high (999.0) so that, per the original
// author's intent, the interrupt never actually triggers. Each iteration also
// clears interruptStartTime under the lock so the next call starts from a
// clean state; that reset is part of the timed region.
func BenchmarkCheckInterrupt(b *testing.B) {
	buf := newAudioBuffer("bench")
	buf.setState(stateResponding)
	speech := makeLoudAudio(500)

	b.ResetTimer()
	for b.Loop() {
		buf.checkInterrupt(speech, true, 0.02, 999.0)

		// Reset the tracked start time for the next iteration.
		buf.mu.Lock()
		buf.interruptStartTime = nil
		buf.mu.Unlock()
	}
}