From 84ffeca8f259b6b58ceaccd81f603af26a8e9570 Mon Sep 17 00:00:00 2001
From: "Billy D."
Date: Sun, 22 Feb 2026 12:19:06 -0500
Subject: [PATCH] fix(tts): add /health endpoint, fix language param for single-lang models

- Add GET /health endpoint returning model name and GPU status
- Don't pass language/speaker to Coqui TTS when model doesn't support
  multilingual/multi-speaker (fixes 500 on ljspeech/tacotron2-DDC)
- Applied to all three endpoints: POST /, GET /api/tts, POST /stream
---
 ray_serve/serve_tts.py | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/ray_serve/serve_tts.py b/ray_serve/serve_tts.py
index e213444..59306da 100644
--- a/ray_serve/serve_tts.py
+++ b/ray_serve/serve_tts.py
@@ -120,6 +120,17 @@ class TTSDeployment:
             realtime_factor=elapsed / duration if duration > 0 else 0,
         )
 
+    # ── GET /health — simple liveness check ─────────────────────────────
+
+    @_fastapi.get("/health")
+    def health(self) -> dict[str, Any]:
+        """Simple health/readiness check."""
+        return {
+            "status": "ok",
+            "model": self.model_name,
+            "gpu": self.use_gpu,
+        }
+
     # ── POST / — JSON API (base64 audio in response) ────────────────────
 
     @_fastapi.post("/")
@@ -140,6 +151,12 @@ class TTSDeployment:
         output_format = request.get("output_format", "wav")
         return_base64 = request.get("return_base64", True)
 
+        # Only pass language/speaker if the model supports it
+        if not (hasattr(self.tts, "is_multi_lingual") and self.tts.is_multi_lingual):
+            language = None
+        if not (hasattr(self.tts, "is_multi_speaker") and self.tts.is_multi_speaker):
+            speaker = None
+
         try:
             audio_bytes, sample_rate, duration = self._synthesize(
                 text, speaker, language, speed
@@ -174,9 +191,13 @@ class TTSDeployment:
         if not text:
             return Response(content="text parameter required", status_code=400)
 
+        # Only pass language/speaker if the model is multi-lingual/multi-speaker
+        lang = language_id if hasattr(self.tts, "is_multi_lingual") and self.tts.is_multi_lingual else None
+        spk = speaker_id if hasattr(self.tts, "is_multi_speaker") and self.tts.is_multi_speaker else None
+
         try:
             audio_bytes, _sr, duration = self._synthesize(
-                text, speaker_id, language_id
+                text, spk, lang
             )
             self._log(_start, duration, len(text))
             return Response(content=audio_bytes, media_type="audio/wav")
@@ -212,6 +233,13 @@ class TTSDeployment:
         speaker = body.get("speaker")
         language = body.get("language")
         speed = body.get("speed", 1.0)
+
+        # Only pass language/speaker if the model supports it
+        if not (hasattr(self.tts, "is_multi_lingual") and self.tts.is_multi_lingual):
+            language = None
+        if not (hasattr(self.tts, "is_multi_speaker") and self.tts.is_multi_speaker):
+            speaker = None
+
         sentences = _split_sentences(text)
 
         async def _generate():