feat: replace fake streaming with real SSE StreamGenerate
Use handler-base's StreamGenerate() to publish real token-by-token ChatStreamChunk messages to NATS as they arrive from Ray Serve, instead of calling Generate() and then splitting the finished response into 4-word chunks. Add 8 streaming tests: happy path, system prompt, RAG context, nil callback, timeout, HTTP error, context canceled, and fallback.
This commit is contained in:
51
main.go
51
main.go
@@ -157,8 +157,27 @@ func main() {
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Generate LLM response
|
||||
responseText, err := llm.Generate(ctx, query, contextText, systemPrompt)
|
||||
// 5. Generate LLM response (streaming when requested)
|
||||
var responseText string
|
||||
if req.EnableStreaming {
|
||||
streamSubject := fmt.Sprintf("ai.chat.response.stream.%s", requestID)
|
||||
responseText, err = llm.StreamGenerate(ctx, query, contextText, systemPrompt, func(token string) {
|
||||
_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
|
||||
RequestID: requestID,
|
||||
Type: "chunk",
|
||||
Content: token,
|
||||
Timestamp: messages.Timestamp(),
|
||||
})
|
||||
})
|
||||
_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
|
||||
RequestID: requestID,
|
||||
Type: "done",
|
||||
Done: true,
|
||||
Timestamp: messages.Timestamp(),
|
||||
})
|
||||
} else {
|
||||
responseText, err = llm.Generate(ctx, query, contextText, systemPrompt)
|
||||
}
|
||||
if err != nil {
|
||||
slog.Error("LLM generation failed", "error", err)
|
||||
return &messages.ChatResponse{
|
||||
@@ -168,33 +187,7 @@ func main() {
|
||||
}, nil
|
||||
}
|
||||
|
||||
// 6. Stream chunks if requested
|
||||
if req.EnableStreaming {
|
||||
streamSubject := fmt.Sprintf("ai.chat.response.stream.%s", requestID)
|
||||
words := strings.Fields(responseText)
|
||||
chunkSize := 4
|
||||
for i := 0; i < len(words); i += chunkSize {
|
||||
end := i + chunkSize
|
||||
if end > len(words) {
|
||||
end = len(words)
|
||||
}
|
||||
chunk := strings.Join(words[i:end], " ")
|
||||
_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
|
||||
RequestID: requestID,
|
||||
Type: "chunk",
|
||||
Content: chunk,
|
||||
Timestamp: messages.Timestamp(),
|
||||
})
|
||||
}
|
||||
_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
|
||||
RequestID: requestID,
|
||||
Type: "done",
|
||||
Done: true,
|
||||
Timestamp: messages.Timestamp(),
|
||||
})
|
||||
}
|
||||
|
||||
// 7. Optional TTS — audio as raw bytes (no base64)
|
||||
// 6. Optional TTS — audio as raw bytes (no base64)
|
||||
var audio []byte
|
||||
if reqEnableTTS && tts != nil {
|
||||
audioBytes, err := tts.Synthesize(ctx, responseText, ttsLanguage, "")
|
||||
|
||||
Reference in New Issue
Block a user