feat: FastAPI ingress for TTS — GET /api/tts returns raw WAV
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 2m5s

- Add FastAPI ingress to TTSDeployment with two routes:
  POST / — JSON API with base64 audio (backward compat)
  GET /api/tts?text=&language_id= — raw WAV bytes (zero overhead)
- GET /speakers endpoint for speaker listing
- Properly uses _fastapi naming to avoid collision with Ray binding
- app = TTSDeployment.bind() for rayservice.yaml compatibility
This commit is contained in:
2026-02-21 12:49:44 -05:00
parent 59655e3dcf
commit 0fb325fa05

View File

@@ -1,6 +1,10 @@
""" """
Ray Serve deployment for Coqui TTS. Ray Serve deployment for Coqui TTS.
Runs on: elminster (RTX 2070 8GB, CUDA) Runs on: elminster (RTX 2070 8GB, CUDA)
Provides two API styles:
POST /tts — JSON body → JSON response with base64 audio
GET /tts/api/tts — Coqui-compatible query params → raw WAV bytes
""" """
import base64 import base64
@@ -9,6 +13,8 @@ import os
import time
from typing import Any
from fastapi import FastAPI, Query
from fastapi.responses import Response
from ray import serve
try:
@@ -16,8 +22,11 @@ try:
except ImportError:
    InferenceLogger = None

_fastapi = FastAPI()

@serve.deployment(name="TTSDeployment", num_replicas=1)
@serve.ingress(_fastapi)
class TTSDeployment:
    def __init__(self):
        import torch
@@ -50,100 +59,109 @@ class TTSDeployment:
        else:
            self._mlflow = None
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: # ── internal synthesis helpers ────────────────────────────────────────
"""
Handle text-to-speech requests.
Expected request format: def _synthesize(self, text: str, speaker: str | None = None,
{ language: str | None = None, speed: float = 1.0):
"text": "Text to synthesize", """Return (wav_bytes: bytes, sample_rate: int, duration: float)."""
"speaker": "speaker_name",
"language": "en",
"speed": 1.0,
"output_format": "wav",
"return_base64": true
}
"""
import numpy as np import numpy as np
from scipy.io import wavfile from scipy.io import wavfile
wav = self.tts.tts(text=text, speaker=speaker, language=language, speed=speed)
if not isinstance(wav, np.ndarray):
wav = np.array(wav)
wav_int16 = (wav * 32767).astype(np.int16)
sample_rate = (
self.tts.synthesizer.output_sample_rate
if hasattr(self.tts, "synthesizer")
else 22050
)
buf = io.BytesIO()
wavfile.write(buf, sample_rate, wav_int16)
return buf.getvalue(), sample_rate, len(wav) / sample_rate
def _log(self, start: float, duration: float, text_len: int):
if self._mlflow:
elapsed = time.time() - start
self._mlflow.log_request(
latency_s=elapsed,
audio_duration_s=duration,
text_chars=text_len,
realtime_factor=elapsed / duration if duration > 0 else 0,
)
# ── POST / — JSON API (base64 audio in response) ────────────────────
@_fastapi.post("/")
async def generate_json(self, request: dict[str, Any]) -> dict[str, Any]:
"""
JSON API — POST body:
{"text": "...", "speaker": "...", "language": "en", "speed": 1.0,
"output_format": "wav", "return_base64": true}
"""
_start = time.time() _start = time.time()
text = request.get("text", "") text = request.get("text", "")
if not text:
return {"error": "No text provided"}
speaker = request.get("speaker") speaker = request.get("speaker")
language = request.get("language") language = request.get("language")
speed = request.get("speed", 1.0) speed = request.get("speed", 1.0)
output_format = request.get("output_format", "wav") output_format = request.get("output_format", "wav")
return_base64 = request.get("return_base64", True) return_base64 = request.get("return_base64", True)
if not text:
return {"error": "No text provided"}
# Generate speech
try: try:
# TTS.tts returns a numpy array of audio samples audio_bytes, sample_rate, duration = self._synthesize(
wav = self.tts.tts( text, speaker, language, speed
text=text,
speaker=speaker,
language=language,
speed=speed,
) )
self._log(_start, duration, len(text))
# Convert to numpy array if needed resp: dict[str, Any] = {
if not isinstance(wav, np.ndarray):
wav = np.array(wav)
# Normalize to int16
wav_int16 = (wav * 32767).astype(np.int16)
# Get sample rate from model config
sample_rate = (
self.tts.synthesizer.output_sample_rate
if hasattr(self.tts, "synthesizer")
else 22050
)
# Write to buffer
buffer = io.BytesIO()
wavfile.write(buffer, sample_rate, wav_int16)
audio_bytes = buffer.getvalue()
duration = len(wav) / sample_rate
# Log to MLflow
if self._mlflow:
self._mlflow.log_request(
latency_s=time.time() - _start,
audio_duration_s=duration,
text_chars=len(text),
realtime_factor=(time.time() - _start) / duration if duration > 0 else 0,
)
response = {
"model": self.model_name, "model": self.model_name,
"sample_rate": sample_rate, "sample_rate": sample_rate,
"duration": duration, "duration": duration,
"format": output_format, "format": output_format,
} }
if return_base64: if return_base64:
response["audio"] = base64.b64encode(audio_bytes).decode("utf-8") resp["audio"] = base64.b64encode(audio_bytes).decode("utf-8")
else: else:
response["audio_bytes"] = audio_bytes resp["audio_bytes"] = audio_bytes
return resp
return response
except Exception as e: except Exception as e:
return { return {"error": str(e), "model": self.model_name}
"error": str(e),
"model": self.model_name,
}
# ── GET /api/tts — Coqui-compatible raw WAV endpoint ─────────────────
@_fastapi.get("/api/tts")
async def generate_raw(
self,
text: str = Query(..., description="Text to synthesize"),
language_id: str = Query("en", description="Language code"),
speaker_id: str | None = Query(None, description="Speaker name"),
) -> Response:
"""Coqui XTTS-compatible endpoint — returns raw WAV bytes."""
_start = time.time()
if not text:
return Response(content="text parameter required", status_code=400)
try:
audio_bytes, _sr, duration = self._synthesize(
text, speaker_id, language_id
)
self._log(_start, duration, len(text))
return Response(content=audio_bytes, media_type="audio/wav")
except Exception as e:
return Response(content=str(e), status_code=500)
    # ── GET /speakers — list available speakers ──────────────────────────
    @_fastapi.get("/speakers")
    def list_speakers(self) -> dict[str, Any]:
        """List available speakers for multi-speaker models."""
        speakers = []
        if hasattr(self.tts, "speakers") and self.tts.speakers:
            speakers = self.tts.speakers
        return {
            "model": self.model_name,
            "speakers": speakers,