feat: migrate to typed messages, drop base64, fix AudioBuffer
- AudioBuffer getAudio(): use ab.totalBytes directly (eliminates triple-copy)
- Decode STTStreamMessage via natsutil.Decode[messages.STTStreamMessage]
- Audio chunks arrive as raw []byte (no base64 decode needed)
- Publish STTTranscription struct (not map[string]any)
- Interrupts use messages.STTInterrupt
- Remove encoding/base64 import
- Add .dockerignore, GOAMD64=v3 in Dockerfile
- All 15 tests pass
This commit is contained in:
9
.dockerignore
Normal file
9
.dockerignore
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
.git
|
||||||
|
.gitignore
|
||||||
|
*.md
|
||||||
|
LICENSE
|
||||||
|
renovate.json
|
||||||
|
*_test.go
|
||||||
|
e2e_test.go
|
||||||
|
__pycache__
|
||||||
|
.env*
|
||||||
@@ -10,7 +10,7 @@ RUN go mod download
|
|||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-w -s" -o /stt-module .
|
RUN CGO_ENABLED=0 GOOS=linux GOAMD64=v3 go build -ldflags="-w -s" -o /stt-module .
|
||||||
|
|
||||||
# Runtime stage
|
# Runtime stage
|
||||||
FROM scratch
|
FROM scratch
|
||||||
|
|||||||
77
main.go
77
main.go
@@ -3,7 +3,6 @@ package main
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"encoding/base64"
|
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -25,6 +24,7 @@ import (
|
|||||||
|
|
||||||
"git.daviestechlabs.io/daviestechlabs/handler-base/config"
|
"git.daviestechlabs.io/daviestechlabs/handler-base/config"
|
||||||
"git.daviestechlabs.io/daviestechlabs/handler-base/health"
|
"git.daviestechlabs.io/daviestechlabs/handler-base/health"
|
||||||
|
"git.daviestechlabs.io/daviestechlabs/handler-base/messages"
|
||||||
"git.daviestechlabs.io/daviestechlabs/handler-base/natsutil"
|
"git.daviestechlabs.io/daviestechlabs/handler-base/natsutil"
|
||||||
"git.daviestechlabs.io/daviestechlabs/handler-base/telemetry"
|
"git.daviestechlabs.io/daviestechlabs/handler-base/telemetry"
|
||||||
)
|
)
|
||||||
@@ -137,11 +137,7 @@ func (ab *AudioBuffer) shouldProcess(bufferSize, maxBufferSize int, chunkTimeout
|
|||||||
func (ab *AudioBuffer) getAudio() []byte {
|
func (ab *AudioBuffer) getAudio() []byte {
|
||||||
ab.mu.Lock()
|
ab.mu.Lock()
|
||||||
defer ab.mu.Unlock()
|
defer ab.mu.Unlock()
|
||||||
var total int
|
result := make([]byte, 0, ab.totalBytes)
|
||||||
for _, c := range ab.chunks {
|
|
||||||
total += len(c)
|
|
||||||
}
|
|
||||||
result := make([]byte, 0, total)
|
|
||||||
for _, c := range ab.chunks {
|
for _, c := range ab.chunks {
|
||||||
result = append(result, c...)
|
result = append(result, c...)
|
||||||
}
|
}
|
||||||
@@ -327,16 +323,16 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if transcript != "" {
|
if transcript != "" {
|
||||||
result := map[string]any{
|
result := &messages.STTTranscription{
|
||||||
"session_id": sessionID,
|
SessionID: sessionID,
|
||||||
"transcript": transcript,
|
Transcript: transcript,
|
||||||
"sequence": seq,
|
Sequence: seq,
|
||||||
"is_partial": !complete,
|
IsPartial: !complete,
|
||||||
"is_final": complete,
|
IsFinal: complete,
|
||||||
"timestamp": time.Now().Unix(),
|
Timestamp: time.Now().Unix(),
|
||||||
"speaker_id": speakerID,
|
SpeakerID: speakerID,
|
||||||
"has_voice_activity": hasVoice,
|
HasVoiceActivity: hasVoice,
|
||||||
"state": state,
|
State: state,
|
||||||
}
|
}
|
||||||
packed, _ := msgpack.Marshal(result)
|
packed, _ := msgpack.Marshal(result)
|
||||||
nc.Conn().Publish(fmt.Sprintf("%s.%s", transcriptionSubjectPrefix, sessionID), packed)
|
nc.Conn().Publish(fmt.Sprintf("%s.%s", transcriptionSubjectPrefix, sessionID), packed)
|
||||||
@@ -382,26 +378,21 @@ func main() {
|
|||||||
}
|
}
|
||||||
sessionID := parts[3]
|
sessionID := parts[3]
|
||||||
|
|
||||||
data, err := natsutil.DecodeMsgpackMap(natMsg.Data)
|
streamMsg, err := natsutil.Decode[messages.STTStreamMessage](natMsg.Data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Error("decode error", "error", err)
|
slog.Error("decode error", "error", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
msgType := ""
|
switch streamMsg.Type {
|
||||||
if t, ok := data["type"].(string); ok {
|
|
||||||
msgType = t
|
|
||||||
}
|
|
||||||
|
|
||||||
switch msgType {
|
|
||||||
case "start":
|
case "start":
|
||||||
slog.Info("starting stream session", "session", sessionID)
|
slog.Info("starting stream session", "session", sessionID)
|
||||||
buf := newAudioBuffer(sessionID)
|
buf := newAudioBuffer(sessionID)
|
||||||
if s, ok := data["state"].(string); ok {
|
if streamMsg.State != "" {
|
||||||
buf.setState(s)
|
buf.setState(streamMsg.State)
|
||||||
}
|
}
|
||||||
if s, ok := data["speaker_id"].(string); ok {
|
if streamMsg.SpeakerID != "" {
|
||||||
buf.speakerID = s
|
buf.speakerID = streamMsg.SpeakerID
|
||||||
}
|
}
|
||||||
sessionsMu.Lock()
|
sessionsMu.Lock()
|
||||||
sessions[sessionID] = buf
|
sessions[sessionID] = buf
|
||||||
@@ -412,10 +403,8 @@ func main() {
|
|||||||
sessionsMu.RLock()
|
sessionsMu.RLock()
|
||||||
buffer, ok := sessions[sessionID]
|
buffer, ok := sessions[sessionID]
|
||||||
sessionsMu.RUnlock()
|
sessionsMu.RUnlock()
|
||||||
if ok {
|
if ok && streamMsg.State != "" {
|
||||||
if s, ok := data["state"].(string); ok {
|
buffer.setState(streamMsg.State)
|
||||||
buffer.setState(s)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
case "end":
|
case "end":
|
||||||
@@ -434,16 +423,8 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
case "chunk":
|
case "chunk":
|
||||||
audioB64 := ""
|
// Audio arrives as raw bytes — no base64 decode needed
|
||||||
if s, ok := data["audio_b64"].(string); ok {
|
if len(streamMsg.Audio) == 0 {
|
||||||
audioB64 = s
|
|
||||||
}
|
|
||||||
if audioB64 == "" {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
audioBytes, err := base64.StdEncoding.DecodeString(audioB64)
|
|
||||||
if err != nil {
|
|
||||||
slog.Error("base64 decode failed", "error", err)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -459,12 +440,12 @@ func main() {
|
|||||||
sessionsMu.Unlock()
|
sessionsMu.Unlock()
|
||||||
|
|
||||||
// Check for interrupt
|
// Check for interrupt
|
||||||
if buffer.checkInterrupt(audioBytes, enableInterrupt, audioLevelThreshold, interruptDuration) {
|
if buffer.checkInterrupt(streamMsg.Audio, enableInterrupt, audioLevelThreshold, interruptDuration) {
|
||||||
interruptMsg := map[string]any{
|
interruptMsg := &messages.STTInterrupt{
|
||||||
"session_id": sessionID,
|
SessionID: sessionID,
|
||||||
"type": "interrupt",
|
Type: "interrupt",
|
||||||
"timestamp": time.Now().Unix(),
|
Timestamp: time.Now().Unix(),
|
||||||
"speaker_id": buffer.speakerID,
|
SpeakerID: buffer.speakerID,
|
||||||
}
|
}
|
||||||
packed, _ := msgpack.Marshal(interruptMsg)
|
packed, _ := msgpack.Marshal(interruptMsg)
|
||||||
nc.Conn().Publish(fmt.Sprintf("%s.%s", transcriptionSubjectPrefix, sessionID), packed)
|
nc.Conn().Publish(fmt.Sprintf("%s.%s", transcriptionSubjectPrefix, sessionID), packed)
|
||||||
@@ -472,7 +453,7 @@ func main() {
|
|||||||
buffer.setState(stateListening)
|
buffer.setState(stateListening)
|
||||||
}
|
}
|
||||||
|
|
||||||
buffer.addChunk(audioBytes)
|
buffer.addChunk(streamMsg.Audio)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user