#!/usr/bin/env python3 """ TTS Demo - Gradio UI for testing Text-to-Speech service. Features: - Text input with language selection - Audio playback of synthesized speech - Voice/speaker selection (when available) - MLflow metrics logging - Multiple TTS backends support (Coqui XTTS, Piper, etc.) """ import os import time import logging import io import base64 import gradio as gr import httpx import soundfile as sf import numpy as np from theme import get_lab_theme, CUSTOM_CSS, create_footer # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger("tts-demo") # Configuration TTS_URL = os.environ.get( "TTS_URL", # Default: Ray Serve TTS endpoint "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/tts" ) MLFLOW_TRACKING_URI = os.environ.get( "MLFLOW_TRACKING_URI", "http://mlflow.mlflow.svc.cluster.local:80" ) # ─── MLflow experiment tracking ────────────────────────────────────────── try: import mlflow from mlflow.tracking import MlflowClient mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) _mlflow_client = MlflowClient() _experiment = _mlflow_client.get_experiment_by_name("gradio-tts-tuning") if _experiment is None: _experiment_id = _mlflow_client.create_experiment( "gradio-tts-tuning", artifact_location="/mlflow/artifacts/gradio-tts-tuning", ) else: _experiment_id = _experiment.experiment_id _mlflow_run = mlflow.start_run( experiment_id=_experiment_id, run_name=f"gradio-tts-{os.environ.get('HOSTNAME', 'local')}", tags={"service": "gradio-tts", "endpoint": TTS_URL}, ) _mlflow_run_id = _mlflow_run.info.run_id _mlflow_step = 0 MLFLOW_ENABLED = True logger.info("MLflow tracking enabled: experiment=%s run=%s", _experiment_id, _mlflow_run_id) except Exception as exc: logger.warning("MLflow tracking disabled: %s", exc) _mlflow_client = None _mlflow_run_id = None _mlflow_step = 0 MLFLOW_ENABLED = False def _log_tts_metrics( latency: float, audio_duration: float, text_chars: int, language: str, ) -> None: """Log TTS inference metrics to MLflow (non-blocking best-effort).""" global _mlflow_step if not MLFLOW_ENABLED or _mlflow_client is None: return try: _mlflow_step += 1 ts = int(time.time() * 1000) rtf = latency / audio_duration if audio_duration > 0 else 0 cps = text_chars / latency if latency > 0 else 0 _mlflow_client.log_batch( _mlflow_run_id, metrics=[ mlflow.entities.Metric("latency_s", latency, ts, _mlflow_step), mlflow.entities.Metric("audio_duration_s", audio_duration, ts, _mlflow_step), mlflow.entities.Metric("realtime_factor", rtf, ts, _mlflow_step), mlflow.entities.Metric("chars_per_second", cps, ts, _mlflow_step), mlflow.entities.Metric("text_chars", text_chars, ts, _mlflow_step), ], ) except Exception: logger.debug("MLflow log failed", exc_info=True) # HTTP client with longer timeout for audio generation client = httpx.Client(timeout=120.0) # Supported languages for XTTS LANGUAGES = { "English": "en", "Spanish": "es", "French": "fr", "German": "de", "Italian": "it", "Portuguese": "pt", "Polish": "pl", "Turkish": "tr", "Russian": "ru", "Dutch": "nl", "Czech": "cs", "Arabic": "ar", "Chinese": "zh-cn", "Japanese": "ja", "Korean": "ko", "Hungarian": "hu", } def synthesize_speech(text: str, language: str) -> tuple[str, tuple[int, np.ndarray] | None, str]: """Synthesize speech from text using the TTS service.""" if not text.strip(): return "❌ Please enter some text", None, "" lang_code = LANGUAGES.get(language, "en") try: start_time = time.time() # Call TTS service (Coqui XTTS API format) response = client.get( f"{TTS_URL}/api/tts", params={"text": text, "language_id": lang_code} ) response.raise_for_status() latency = time.time() - start_time audio_bytes = response.content # Parse audio data audio_io = io.BytesIO(audio_bytes) audio_data, sample_rate = sf.read(audio_io) # Calculate duration if len(audio_data.shape) == 1: duration = len(audio_data) / sample_rate else: duration = len(audio_data) / sample_rate # Status message status = f"✅ Generated {duration:.2f}s of audio in {latency*1000:.0f}ms" # Log to MLflow _log_tts_metrics( latency=latency, audio_duration=duration, text_chars=len(text), language=lang_code, ) # Metrics metrics = f""" **Audio Statistics:** - Duration: {duration:.2f} seconds - Sample Rate: {sample_rate} Hz - Size: {len(audio_bytes) / 1024:.1f} KB - Generation Time: {latency*1000:.0f}ms - Real-time Factor: {latency/duration:.2f}x - Language: {language} ({lang_code}) - Characters: {len(text)} - Chars/sec: {len(text)/latency:.1f} """ return status, (sample_rate, audio_data), metrics except httpx.HTTPStatusError as e: logger.exception("TTS request failed") return f"❌ TTS service error: {e.response.status_code}", None, "" except Exception as e: logger.exception("TTS synthesis failed") return f"❌ Error: {str(e)}", None, "" def check_service_health() -> str: """Check if the TTS service is healthy.""" try: # Try the health endpoint first response = client.get(f"{TTS_URL}/health", timeout=5.0) if response.status_code == 200: return "🟢 Service is healthy" # Fall back to root endpoint response = client.get(f"{TTS_URL}/", timeout=5.0) if response.status_code == 200: return "🟢 Service is responding" return f"🟡 Service returned status {response.status_code}" except Exception as e: return f"🔴 Service unavailable: {str(e)}" # Build the Gradio app with gr.Blocks(theme=get_lab_theme(), css=CUSTOM_CSS, title="TTS Demo") as demo: gr.Markdown(""" # 🔊 Text-to-Speech Demo Test the **Coqui XTTS** text-to-speech service. Convert text to natural-sounding speech in multiple languages. """) # Service status with gr.Row(): health_btn = gr.Button("🔄 Check Service", size="sm") health_status = gr.Textbox(label="Service Status", interactive=False) health_btn.click(fn=check_service_health, outputs=health_status) with gr.Tabs(): # Tab 1: Basic TTS with gr.TabItem("🎤 Text to Speech"): with gr.Row(): with gr.Column(scale=2): text_input = gr.Textbox( label="Text to Synthesize", placeholder="Enter text to convert to speech...", lines=5, max_lines=10 ) with gr.Row(): language = gr.Dropdown( choices=list(LANGUAGES.keys()), value="English", label="Language" ) synthesize_btn = gr.Button("🔊 Synthesize", variant="primary", scale=2) with gr.Column(scale=1): status_output = gr.Textbox(label="Status", interactive=False) metrics_output = gr.Markdown(label="Metrics") audio_output = gr.Audio(label="Generated Audio", type="numpy") synthesize_btn.click( fn=synthesize_speech, inputs=[text_input, language], outputs=[status_output, audio_output, metrics_output] ) # Example texts gr.Examples( examples=[ ["Hello! Welcome to Davies Tech Labs. This is a demonstration of our text-to-speech system.", "English"], ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.", "English"], ["Bonjour! Bienvenue au laboratoire technique de Davies.", "French"], ["Hola! Bienvenido al laboratorio de tecnología.", "Spanish"], ["Guten Tag! Willkommen im Techniklabor.", "German"], ], inputs=[text_input, language], ) # Tab 2: Comparison with gr.TabItem("🔄 Language Comparison"): gr.Markdown("Compare the same text in different languages.") compare_text = gr.Textbox( label="Text to Compare", value="Hello, how are you today?", lines=2 ) with gr.Row(): lang1 = gr.Dropdown(choices=list(LANGUAGES.keys()), value="English", label="Language 1") lang2 = gr.Dropdown(choices=list(LANGUAGES.keys()), value="Spanish", label="Language 2") compare_btn = gr.Button("Compare Languages", variant="primary") with gr.Row(): with gr.Column(): gr.Markdown("### Language 1") audio1 = gr.Audio(label="Audio 1", type="numpy") status1 = gr.Textbox(label="Status", interactive=False) with gr.Column(): gr.Markdown("### Language 2") audio2 = gr.Audio(label="Audio 2", type="numpy") status2 = gr.Textbox(label="Status", interactive=False) def compare_languages(text, l1, l2): s1, a1, _ = synthesize_speech(text, l1) s2, a2, _ = synthesize_speech(text, l2) return s1, a1, s2, a2 compare_btn.click( fn=compare_languages, inputs=[compare_text, lang1, lang2], outputs=[status1, audio1, status2, audio2] ) # Tab 3: Batch Processing with gr.TabItem("📚 Batch Synthesis"): gr.Markdown("Synthesize multiple texts at once (one per line).") batch_input = gr.Textbox( label="Texts (one per line)", placeholder="Enter multiple texts, one per line...", lines=6 ) batch_lang = gr.Dropdown( choices=list(LANGUAGES.keys()), value="English", label="Language" ) batch_btn = gr.Button("Synthesize All", variant="primary") batch_status = gr.Textbox(label="Status", interactive=False) batch_audios = gr.Dataset( components=[gr.Audio(type="numpy")], label="Generated Audio Files" ) # Note: Batch processing would need more complex handling # This is a simplified version gr.Markdown(""" *Note: For batch processing of many texts, consider using the API directly or the Kubeflow pipeline for better throughput.* """) create_footer() if __name__ == "__main__": demo.launch( server_name="0.0.0.0", server_port=7860, show_error=True )