feat: migrate to typed messages, drop base64, fix AudioBuffer
Some checks failed
CI / Lint (pull_request) Failing after 57s
CI / Test (pull_request) Failing after 1m23s
CI / Release (pull_request) Has been skipped
CI / Docker Build & Push (pull_request) Has been skipped
CI / Notify (pull_request) Successful in 1s

- AudioBuffer getAudio(): use ab.totalBytes directly (eliminates triple-copy)
- Decode STTStreamMessage via natsutil.Decode[messages.STTStreamMessage]
- Audio chunks arrive as raw []byte (no base64 decode needed)
- Publish STTTranscription struct (not map[string]any)
- Interrupts use messages.STTInterrupt
- Remove encoding/base64 import
- Add .dockerignore, GOAMD64=v3 in Dockerfile
- All 15 tests pass
This commit is contained in:
2026-02-20 07:11:23 -05:00
parent 2c578aad44
commit af9f8cc01e
3 changed files with 39 additions and 49 deletions

9
.dockerignore Normal file
View File

@@ -0,0 +1,9 @@
.git
.gitignore
*.md
LICENSE
renovate.json
*_test.go
e2e_test.go
__pycache__
.env*

View File

@@ -10,7 +10,7 @@ RUN go mod download
COPY . . COPY . .
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-w -s" -o /stt-module . RUN CGO_ENABLED=0 GOOS=linux GOAMD64=v3 go build -ldflags="-w -s" -o /stt-module .
# Runtime stage # Runtime stage
FROM scratch FROM scratch

77
main.go
View File

@@ -3,7 +3,6 @@ package main
import ( import (
"bytes" "bytes"
"context" "context"
"encoding/base64"
"encoding/binary" "encoding/binary"
"encoding/json" "encoding/json"
"fmt" "fmt"
@@ -25,6 +24,7 @@ import (
"git.daviestechlabs.io/daviestechlabs/handler-base/config" "git.daviestechlabs.io/daviestechlabs/handler-base/config"
"git.daviestechlabs.io/daviestechlabs/handler-base/health" "git.daviestechlabs.io/daviestechlabs/handler-base/health"
"git.daviestechlabs.io/daviestechlabs/handler-base/messages"
"git.daviestechlabs.io/daviestechlabs/handler-base/natsutil" "git.daviestechlabs.io/daviestechlabs/handler-base/natsutil"
"git.daviestechlabs.io/daviestechlabs/handler-base/telemetry" "git.daviestechlabs.io/daviestechlabs/handler-base/telemetry"
) )
@@ -137,11 +137,7 @@ func (ab *AudioBuffer) shouldProcess(bufferSize, maxBufferSize int, chunkTimeout
func (ab *AudioBuffer) getAudio() []byte { func (ab *AudioBuffer) getAudio() []byte {
ab.mu.Lock() ab.mu.Lock()
defer ab.mu.Unlock() defer ab.mu.Unlock()
var total int result := make([]byte, 0, ab.totalBytes)
for _, c := range ab.chunks {
total += len(c)
}
result := make([]byte, 0, total)
for _, c := range ab.chunks { for _, c := range ab.chunks {
result = append(result, c...) result = append(result, c...)
} }
@@ -327,16 +323,16 @@ func main() {
} }
if transcript != "" { if transcript != "" {
result := map[string]any{ result := &messages.STTTranscription{
"session_id": sessionID, SessionID: sessionID,
"transcript": transcript, Transcript: transcript,
"sequence": seq, Sequence: seq,
"is_partial": !complete, IsPartial: !complete,
"is_final": complete, IsFinal: complete,
"timestamp": time.Now().Unix(), Timestamp: time.Now().Unix(),
"speaker_id": speakerID, SpeakerID: speakerID,
"has_voice_activity": hasVoice, HasVoiceActivity: hasVoice,
"state": state, State: state,
} }
packed, _ := msgpack.Marshal(result) packed, _ := msgpack.Marshal(result)
nc.Conn().Publish(fmt.Sprintf("%s.%s", transcriptionSubjectPrefix, sessionID), packed) nc.Conn().Publish(fmt.Sprintf("%s.%s", transcriptionSubjectPrefix, sessionID), packed)
@@ -382,26 +378,21 @@ func main() {
} }
sessionID := parts[3] sessionID := parts[3]
data, err := natsutil.DecodeMsgpackMap(natMsg.Data) streamMsg, err := natsutil.Decode[messages.STTStreamMessage](natMsg.Data)
if err != nil { if err != nil {
slog.Error("decode error", "error", err) slog.Error("decode error", "error", err)
return return
} }
msgType := "" switch streamMsg.Type {
if t, ok := data["type"].(string); ok {
msgType = t
}
switch msgType {
case "start": case "start":
slog.Info("starting stream session", "session", sessionID) slog.Info("starting stream session", "session", sessionID)
buf := newAudioBuffer(sessionID) buf := newAudioBuffer(sessionID)
if s, ok := data["state"].(string); ok { if streamMsg.State != "" {
buf.setState(s) buf.setState(streamMsg.State)
} }
if s, ok := data["speaker_id"].(string); ok { if streamMsg.SpeakerID != "" {
buf.speakerID = s buf.speakerID = streamMsg.SpeakerID
} }
sessionsMu.Lock() sessionsMu.Lock()
sessions[sessionID] = buf sessions[sessionID] = buf
@@ -412,10 +403,8 @@ func main() {
sessionsMu.RLock() sessionsMu.RLock()
buffer, ok := sessions[sessionID] buffer, ok := sessions[sessionID]
sessionsMu.RUnlock() sessionsMu.RUnlock()
if ok { if ok && streamMsg.State != "" {
if s, ok := data["state"].(string); ok { buffer.setState(streamMsg.State)
buffer.setState(s)
}
} }
case "end": case "end":
@@ -434,16 +423,8 @@ func main() {
} }
case "chunk": case "chunk":
audioB64 := "" // Audio arrives as raw bytes — no base64 decode needed
if s, ok := data["audio_b64"].(string); ok { if len(streamMsg.Audio) == 0 {
audioB64 = s
}
if audioB64 == "" {
return
}
audioBytes, err := base64.StdEncoding.DecodeString(audioB64)
if err != nil {
slog.Error("base64 decode failed", "error", err)
return return
} }
@@ -459,12 +440,12 @@ func main() {
sessionsMu.Unlock() sessionsMu.Unlock()
// Check for interrupt // Check for interrupt
if buffer.checkInterrupt(audioBytes, enableInterrupt, audioLevelThreshold, interruptDuration) { if buffer.checkInterrupt(streamMsg.Audio, enableInterrupt, audioLevelThreshold, interruptDuration) {
interruptMsg := map[string]any{ interruptMsg := &messages.STTInterrupt{
"session_id": sessionID, SessionID: sessionID,
"type": "interrupt", Type: "interrupt",
"timestamp": time.Now().Unix(), Timestamp: time.Now().Unix(),
"speaker_id": buffer.speakerID, SpeakerID: buffer.speakerID,
} }
packed, _ := msgpack.Marshal(interruptMsg) packed, _ := msgpack.Marshal(interruptMsg)
nc.Conn().Publish(fmt.Sprintf("%s.%s", transcriptionSubjectPrefix, sessionID), packed) nc.Conn().Publish(fmt.Sprintf("%s.%s", transcriptionSubjectPrefix, sessionID), packed)
@@ -472,7 +453,7 @@ func main() {
buffer.setState(stateListening) buffer.setState(stateListening)
} }
buffer.addChunk(audioBytes) buffer.addChunk(streamMsg.Audio)
} }
} }