From 0fb325fa0525d7efac9b16c5f10ee4eb63d83a8d Mon Sep 17 00:00:00 2001 From: "Billy D." Date: Sat, 21 Feb 2026 12:49:44 -0500 Subject: [PATCH] =?UTF-8?q?feat:=20FastAPI=20ingress=20for=20TTS=20?= =?UTF-8?q?=E2=80=94=20GET=20/api/tts=20returns=20raw=20WAV?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add FastAPI ingress to TTSDeployment with two routes: POST / — JSON API with base64 audio (backward compat) GET /api/tts?text=&language_id= — raw WAV bytes (zero overhead) - GET /speakers endpoint for speaker listing - Properly uses _fastapi naming to avoid collision with Ray binding - app = TTSDeployment.bind() for rayservice.yaml compatibility --- ray_serve/serve_tts.py | 148 +++++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 65 deletions(-) diff --git a/ray_serve/serve_tts.py b/ray_serve/serve_tts.py index 68486e6..b409933 100644 --- a/ray_serve/serve_tts.py +++ b/ray_serve/serve_tts.py @@ -1,6 +1,10 @@ """ Ray Serve deployment for Coqui TTS. Runs on: elminster (RTX 2070 8GB, CUDA) + +Provides two API styles: + POST /tts — JSON body → JSON response with base64 audio + GET /tts/api/tts — Coqui-compatible query params → raw WAV bytes """ import base64 @@ -9,6 +13,8 @@ import os import time from typing import Any +from fastapi import FastAPI, Query +from fastapi.responses import Response from ray import serve try: @@ -16,8 +22,11 @@ try: except ImportError: InferenceLogger = None +_fastapi = FastAPI() + @serve.deployment(name="TTSDeployment", num_replicas=1) +@serve.ingress(_fastapi) class TTSDeployment: def __init__(self): import torch @@ -50,100 +59,109 @@ class TTSDeployment: else: self._mlflow = None - async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: - """ - Handle text-to-speech requests. + # ── internal synthesis helpers ──────────────────────────────────────── - Expected request format: - { - "text": "Text to synthesize", - "speaker": "speaker_name", - "language": "en", - "speed": 1.0, - "output_format": "wav", - "return_base64": true - } - """ + def _synthesize(self, text: str, speaker: str | None = None, + language: str | None = None, speed: float = 1.0): + """Return (wav_bytes: bytes, sample_rate: int, duration: float).""" import numpy as np from scipy.io import wavfile + wav = self.tts.tts(text=text, speaker=speaker, language=language, speed=speed) + if not isinstance(wav, np.ndarray): + wav = np.array(wav) + + wav_int16 = (wav * 32767).astype(np.int16) + sample_rate = ( + self.tts.synthesizer.output_sample_rate + if hasattr(self.tts, "synthesizer") + else 22050 + ) + buf = io.BytesIO() + wavfile.write(buf, sample_rate, wav_int16) + return buf.getvalue(), sample_rate, len(wav) / sample_rate + + def _log(self, start: float, duration: float, text_len: int): + if self._mlflow: + elapsed = time.time() - start + self._mlflow.log_request( + latency_s=elapsed, + audio_duration_s=duration, + text_chars=text_len, + realtime_factor=elapsed / duration if duration > 0 else 0, + ) + + # ── POST / — JSON API (base64 audio in response) ──────────────────── + + @_fastapi.post("/") + async def generate_json(self, request: dict[str, Any]) -> dict[str, Any]: + """ + JSON API — POST body: + {"text": "...", "speaker": "...", "language": "en", "speed": 1.0, + "output_format": "wav", "return_base64": true} + """ _start = time.time() text = request.get("text", "") + if not text: + return {"error": "No text provided"} + speaker = request.get("speaker") language = request.get("language") speed = request.get("speed", 1.0) output_format = request.get("output_format", "wav") return_base64 = request.get("return_base64", True) - if not text: - return {"error": "No text provided"} - - # Generate speech try: - # TTS.tts returns a numpy array of audio samples - wav = self.tts.tts( - text=text, - speaker=speaker, - language=language, - speed=speed, + audio_bytes, sample_rate, duration = self._synthesize( + text, speaker, language, speed ) + self._log(_start, duration, len(text)) - # Convert to numpy array if needed - if not isinstance(wav, np.ndarray): - wav = np.array(wav) - - # Normalize to int16 - wav_int16 = (wav * 32767).astype(np.int16) - - # Get sample rate from model config - sample_rate = ( - self.tts.synthesizer.output_sample_rate - if hasattr(self.tts, "synthesizer") - else 22050 - ) - - # Write to buffer - buffer = io.BytesIO() - wavfile.write(buffer, sample_rate, wav_int16) - audio_bytes = buffer.getvalue() - - duration = len(wav) / sample_rate - - # Log to MLflow - if self._mlflow: - self._mlflow.log_request( - latency_s=time.time() - _start, - audio_duration_s=duration, - text_chars=len(text), - realtime_factor=(time.time() - _start) / duration if duration > 0 else 0, - ) - - response = { + resp: dict[str, Any] = { "model": self.model_name, "sample_rate": sample_rate, "duration": duration, "format": output_format, } - if return_base64: - response["audio"] = base64.b64encode(audio_bytes).decode("utf-8") + resp["audio"] = base64.b64encode(audio_bytes).decode("utf-8") else: - response["audio_bytes"] = audio_bytes - - return response - + resp["audio_bytes"] = audio_bytes + return resp except Exception as e: - return { - "error": str(e), - "model": self.model_name, - } + return {"error": str(e), "model": self.model_name} + # ── GET /api/tts — Coqui-compatible raw WAV endpoint ───────────────── + + @_fastapi.get("/api/tts") + async def generate_raw( + self, + text: str = Query(..., description="Text to synthesize"), + language_id: str = Query("en", description="Language code"), + speaker_id: str | None = Query(None, description="Speaker name"), + ) -> Response: + """Coqui XTTS-compatible endpoint — returns raw WAV bytes.""" + _start = time.time() + if not text: + return Response(content="text parameter required", status_code=400) + + try: + audio_bytes, _sr, duration = self._synthesize( + text, speaker_id, language_id + ) + self._log(_start, duration, len(text)) + return Response(content=audio_bytes, media_type="audio/wav") + except Exception as e: + return Response(content=str(e), status_code=500) + + # ── GET /speakers — list available speakers ────────────────────────── + + @_fastapi.get("/speakers") def list_speakers(self) -> dict[str, Any]: """List available speakers for multi-speaker models.""" speakers = [] if hasattr(self.tts, "speakers") and self.tts.speakers: speakers = self.tts.speakers - return { "model": self.model_name, "speakers": speakers,