feat: replace fake streaming with real SSE StreamGenerate
Some checks failed
CI / Lint (push) Successful in 3m0s
CI / Test (push) Successful in 3m23s
CI / Docker Build & Push (push) Failing after 4m55s
CI / Release (push) Successful in 1m4s
CI / Notify (push) Successful in 1s

Use handler-based StreamGenerate() to publish real token-by-token
ChatStreamChunk messages to NATS as they arrive from Ray Serve,
instead of calling Generate() and splitting into 4-word chunks.

Add 8 streaming tests: happy path, system prompt, RAG context,
nil callback, timeout, HTTP error, context canceled, fallback.
This commit is contained in:
2026-02-21 09:23:57 -05:00
parent 7678d911fe
commit 87d0545d2c
2 changed files with 259 additions and 29 deletions

51
main.go
View File

@@ -157,8 +157,27 @@ func main() {
}
}
// 5. Generate LLM response
responseText, err := llm.Generate(ctx, query, contextText, systemPrompt)
// 5. Generate LLM response (streaming when requested)
var responseText string
if req.EnableStreaming {
streamSubject := fmt.Sprintf("ai.chat.response.stream.%s", requestID)
responseText, err = llm.StreamGenerate(ctx, query, contextText, systemPrompt, func(token string) {
_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
RequestID: requestID,
Type: "chunk",
Content: token,
Timestamp: messages.Timestamp(),
})
})
_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
RequestID: requestID,
Type: "done",
Done: true,
Timestamp: messages.Timestamp(),
})
} else {
responseText, err = llm.Generate(ctx, query, contextText, systemPrompt)
}
if err != nil {
slog.Error("LLM generation failed", "error", err)
return &messages.ChatResponse{
@@ -168,33 +187,7 @@ func main() {
}, nil
}
// 6. Stream chunks if requested
if req.EnableStreaming {
streamSubject := fmt.Sprintf("ai.chat.response.stream.%s", requestID)
words := strings.Fields(responseText)
chunkSize := 4
for i := 0; i < len(words); i += chunkSize {
end := i + chunkSize
if end > len(words) {
end = len(words)
}
chunk := strings.Join(words[i:end], " ")
_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
RequestID: requestID,
Type: "chunk",
Content: chunk,
Timestamp: messages.Timestamp(),
})
}
_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
RequestID: requestID,
Type: "done",
Done: true,
Timestamp: messages.Timestamp(),
})
}
// 7. Optional TTS — audio as raw bytes (no base64)
// 6. Optional TTS — audio as raw bytes (no base64)
var audio []byte
if reqEnableTTS && tts != nil {
audioBytes, err := tts.Synthesize(ctx, responseText, ttsLanguage, "")