fix(tts): add /health endpoint, fix language param for single-lang models
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 1m54s

- Add GET /health endpoint returning model name and GPU status
- Don't pass language/speaker to Coqui TTS when the model is not
  multilingual/multi-speaker (fixes HTTP 500 on ljspeech/tacotron2-DDC)
- Applied to all three endpoints: POST /, GET /api/tts, POST /stream
This commit is contained in:
2026-02-22 12:19:06 -05:00
parent 194a431e8c
commit 84ffeca8f2

View File

@@ -120,6 +120,17 @@ class TTSDeployment:
realtime_factor=elapsed / duration if duration > 0 else 0, realtime_factor=elapsed / duration if duration > 0 else 0,
) )
# ── GET /health — simple liveness check ─────────────────────────────
@_fastapi.get("/health")
def health(self) -> dict[str, Any]:
    """Return a small status payload for health probes.

    The payload always carries ``"status": "ok"`` and echoes two
    attributes of this deployment instance: ``model_name`` under
    ``"model"`` and ``use_gpu`` under ``"gpu"``. No synthesis is
    attempted, so the call does not exercise the TTS model itself.
    """
    payload: dict[str, Any] = {"status": "ok"}
    payload["model"] = self.model_name
    payload["gpu"] = self.use_gpu
    return payload
# ── POST / — JSON API (base64 audio in response) ──────────────────── # ── POST / — JSON API (base64 audio in response) ────────────────────
@_fastapi.post("/") @_fastapi.post("/")
@@ -140,6 +151,12 @@ class TTSDeployment:
output_format = request.get("output_format", "wav") output_format = request.get("output_format", "wav")
return_base64 = request.get("return_base64", True) return_base64 = request.get("return_base64", True)
# Only pass language/speaker if the model supports it
if not (hasattr(self.tts, "is_multi_lingual") and self.tts.is_multi_lingual):
language = None
if not (hasattr(self.tts, "is_multi_speaker") and self.tts.is_multi_speaker):
speaker = None
try: try:
audio_bytes, sample_rate, duration = self._synthesize( audio_bytes, sample_rate, duration = self._synthesize(
text, speaker, language, speed text, speaker, language, speed
@@ -174,9 +191,13 @@ class TTSDeployment:
if not text: if not text:
return Response(content="text parameter required", status_code=400) return Response(content="text parameter required", status_code=400)
# Only pass language/speaker if the model is multi-lingual/multi-speaker
lang = language_id if hasattr(self.tts, "is_multi_lingual") and self.tts.is_multi_lingual else None
spk = speaker_id if hasattr(self.tts, "is_multi_speaker") and self.tts.is_multi_speaker else None
try: try:
audio_bytes, _sr, duration = self._synthesize( audio_bytes, _sr, duration = self._synthesize(
text, speaker_id, language_id text, spk, lang
) )
self._log(_start, duration, len(text)) self._log(_start, duration, len(text))
return Response(content=audio_bytes, media_type="audio/wav") return Response(content=audio_bytes, media_type="audio/wav")
@@ -212,6 +233,13 @@ class TTSDeployment:
speaker = body.get("speaker") speaker = body.get("speaker")
language = body.get("language") language = body.get("language")
speed = body.get("speed", 1.0) speed = body.get("speed", 1.0)
# Only pass language/speaker if the model supports it
if not (hasattr(self.tts, "is_multi_lingual") and self.tts.is_multi_lingual):
language = None
if not (hasattr(self.tts, "is_multi_speaker") and self.tts.is_multi_speaker):
speaker = None
sentences = _split_sentences(text) sentences = _split_sentences(text)
async def _generate(): async def _generate():