feat: Add Gradio UI apps for AI services
- embeddings.py: BGE embeddings demo with similarity - stt.py: Whisper speech-to-text demo - tts.py: XTTS text-to-speech demo - theme.py: Shared DaviesTechLabs Gradio theme - K8s deployments for each app
This commit is contained in:
306
stt.py
Normal file
306
stt.py
Normal file
@@ -0,0 +1,306 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
STT Demo - Gradio UI for testing Speech-to-Text (Whisper) service.
|
||||
|
||||
Features:
|
||||
- Microphone recording input
|
||||
- Audio file upload support
|
||||
- Multiple language support
|
||||
- Translation mode
|
||||
- MLflow metrics logging
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import io
|
||||
import tempfile
|
||||
|
||||
import gradio as gr
|
||||
import httpx
|
||||
import soundfile as sf
|
||||
import numpy as np
|
||||
|
||||
from theme import get_lab_theme, CUSTOM_CSS, create_footer
|
||||
|
||||
# Configure logging for the demo app; logs go to stdout for container capture.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("stt-demo")

# Configuration — in-cluster service addresses, overridable via environment
# variables for local development.
STT_URL = os.environ.get(
    "STT_URL",
    "http://whisper-predictor.ai-ml.svc.cluster.local"
)
# NOTE(review): MLFLOW_TRACKING_URI is not referenced elsewhere in this file —
# presumably picked up by an MLflow client per the module docstring; verify.
MLFLOW_TRACKING_URI = os.environ.get(
    "MLFLOW_TRACKING_URI",
    "http://mlflow.mlflow.svc.cluster.local:80"
)

# HTTP client with longer timeout for transcription of long audio clips.
client = httpx.Client(timeout=180.0)

# Whisper supported languages: UI display name -> ISO 639-1 code.
# "Auto-detect" maps to None, meaning no language hint is sent to the service.
LANGUAGES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
}
|
||||
|
||||
|
||||
def transcribe_audio(
|
||||
audio_input: tuple[int, np.ndarray] | str | None,
|
||||
language: str,
|
||||
task: str
|
||||
) -> tuple[str, str, str]:
|
||||
"""Transcribe audio using the Whisper STT service."""
|
||||
if audio_input is None:
|
||||
return "❌ Please provide audio input", "", ""
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
|
||||
# Handle different input types
|
||||
if isinstance(audio_input, tuple):
|
||||
# Microphone input: (sample_rate, audio_data)
|
||||
sample_rate, audio_data = audio_input
|
||||
|
||||
# Convert to WAV bytes
|
||||
audio_buffer = io.BytesIO()
|
||||
sf.write(audio_buffer, audio_data, sample_rate, format='WAV')
|
||||
audio_bytes = audio_buffer.getvalue()
|
||||
audio_duration = len(audio_data) / sample_rate
|
||||
else:
|
||||
# File path
|
||||
with open(audio_input, 'rb') as f:
|
||||
audio_bytes = f.read()
|
||||
# Get duration
|
||||
audio_data, sample_rate = sf.read(audio_input)
|
||||
audio_duration = len(audio_data) / sample_rate
|
||||
|
||||
# Prepare request
|
||||
lang_code = LANGUAGES.get(language)
|
||||
|
||||
files = {"file": ("audio.wav", audio_bytes, "audio/wav")}
|
||||
data = {"response_format": "json"}
|
||||
|
||||
if lang_code:
|
||||
data["language"] = lang_code
|
||||
|
||||
# Choose endpoint based on task
|
||||
if task == "Translate to English":
|
||||
endpoint = f"{STT_URL}/v1/audio/translations"
|
||||
else:
|
||||
endpoint = f"{STT_URL}/v1/audio/transcriptions"
|
||||
|
||||
# Send request
|
||||
response = client.post(endpoint, files=files, data=data)
|
||||
response.raise_for_status()
|
||||
|
||||
latency = time.time() - start_time
|
||||
result = response.json()
|
||||
|
||||
text = result.get("text", "")
|
||||
detected_language = result.get("language", "unknown")
|
||||
|
||||
# Status message
|
||||
status = f"✅ Transcribed {audio_duration:.1f}s of audio in {latency*1000:.0f}ms"
|
||||
|
||||
# Metrics
|
||||
metrics = f"""
|
||||
**Transcription Statistics:**
|
||||
- Audio Duration: {audio_duration:.2f} seconds
|
||||
- Processing Time: {latency*1000:.0f}ms
|
||||
- Real-time Factor: {latency/audio_duration:.2f}x
|
||||
- Detected Language: {detected_language}
|
||||
- Task: {task}
|
||||
- Word Count: {len(text.split())}
|
||||
- Character Count: {len(text)}
|
||||
"""
|
||||
|
||||
return status, text, metrics
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
logger.exception("STT request failed")
|
||||
return f"❌ STT service error: {e.response.status_code}", "", ""
|
||||
except Exception as e:
|
||||
logger.exception("Transcription failed")
|
||||
return f"❌ Error: {str(e)}", "", ""
|
||||
|
||||
|
||||
def check_service_health() -> str:
    """Probe the STT service and return a human-readable status string.

    Tries the plain ``/health`` endpoint first; if that does not return
    200, falls back to the OpenAI-compatible ``/v1/models`` listing
    before reporting a warning with the last status code.
    """
    try:
        probe = client.get(f"{STT_URL}/health", timeout=5.0)
        if probe.status_code != 200:
            # Fall back to the OpenAI-compatible models endpoint.
            probe = client.get(f"{STT_URL}/v1/models", timeout=5.0)
        if probe.status_code == 200:
            return "🟢 Service is healthy"
        return f"🟡 Service returned status {probe.status_code}"
    except Exception as e:
        return f"🔴 Service unavailable: {str(e)}"
|
||||
|
||||
|
||||
# Build the Gradio app. Three tabs share the same transcribe_audio backend:
# microphone (numpy input), file upload (filepath input), and a translation
# shortcut that forces auto-detect + translate.
with gr.Blocks(theme=get_lab_theme(), css=CUSTOM_CSS, title="STT Demo") as demo:
    gr.Markdown("""
    # 🎙️ Speech-to-Text Demo

    Test the **Whisper** speech-to-text service. Transcribe audio from microphone
    or file upload with support for 100+ languages.
    """)

    # Service status row: manual health probe button + read-only result box.
    with gr.Row():
        health_btn = gr.Button("🔄 Check Service", size="sm")
        health_status = gr.Textbox(label="Service Status", interactive=False)

    health_btn.click(fn=check_service_health, outputs=health_status)

    with gr.Tabs():
        # Tab 1: Microphone Input — delivers (sample_rate, ndarray) tuples.
        with gr.TabItem("🎤 Microphone"):
            with gr.Row():
                with gr.Column():
                    mic_input = gr.Audio(
                        label="Record Audio",
                        sources=["microphone"],
                        type="numpy"
                    )

                    with gr.Row():
                        mic_language = gr.Dropdown(
                            choices=list(LANGUAGES.keys()),
                            value="Auto-detect",
                            label="Language"
                        )
                        mic_task = gr.Radio(
                            choices=["Transcribe", "Translate to English"],
                            value="Transcribe",
                            label="Task"
                        )

                    mic_btn = gr.Button("🎯 Transcribe", variant="primary")

                with gr.Column():
                    mic_status = gr.Textbox(label="Status", interactive=False)
                    mic_metrics = gr.Markdown(label="Metrics")

            mic_output = gr.Textbox(
                label="Transcription",
                lines=5
            )

            mic_btn.click(
                fn=transcribe_audio,
                inputs=[mic_input, mic_language, mic_task],
                outputs=[mic_status, mic_output, mic_metrics]
            )

        # Tab 2: File Upload — delivers a filesystem path to transcribe_audio.
        with gr.TabItem("📁 File Upload"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.Audio(
                        label="Upload Audio File",
                        sources=["upload"],
                        type="filepath"
                    )

                    with gr.Row():
                        file_language = gr.Dropdown(
                            choices=list(LANGUAGES.keys()),
                            value="Auto-detect",
                            label="Language"
                        )
                        file_task = gr.Radio(
                            choices=["Transcribe", "Translate to English"],
                            value="Transcribe",
                            label="Task"
                        )

                    file_btn = gr.Button("🎯 Transcribe", variant="primary")

                with gr.Column():
                    file_status = gr.Textbox(label="Status", interactive=False)
                    file_metrics = gr.Markdown(label="Metrics")

            file_output = gr.Textbox(
                label="Transcription",
                lines=5
            )

            file_btn.click(
                fn=transcribe_audio,
                inputs=[file_input, file_language, file_task],
                outputs=[file_status, file_output, file_metrics]
            )

            gr.Markdown("""
            **Supported formats:** WAV, MP3, FLAC, OGG, M4A, WEBM

            *For best results, use clear audio with minimal background noise.*
            """)

        # Tab 3: Translation — single-button flow that always auto-detects
        # the source language and translates to English.
        with gr.TabItem("🌍 Translation"):
            gr.Markdown("""
            ### Speech Translation

            Upload or record audio in any language and get English translation.
            Whisper will automatically detect the source language.
            """)

            with gr.Row():
                with gr.Column():
                    trans_input = gr.Audio(
                        label="Audio Input",
                        sources=["microphone", "upload"],
                        type="numpy"
                    )
                    trans_btn = gr.Button("🌍 Translate to English", variant="primary")

                with gr.Column():
                    trans_status = gr.Textbox(label="Status", interactive=False)
                    trans_metrics = gr.Markdown(label="Metrics")

            trans_output = gr.Textbox(
                label="English Translation",
                lines=5
            )

            def translate_audio(audio):
                """Wrap transcribe_audio with auto-detect + translate fixed."""
                return transcribe_audio(audio, "Auto-detect", "Translate to English")

            trans_btn.click(
                fn=translate_audio,
                inputs=trans_input,
                outputs=[trans_status, trans_output, trans_metrics]
            )

    # Shared DaviesTechLabs footer from theme.py.
    create_footer()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Bind to all interfaces so the containerized app is reachable from
    # outside the pod; show_error surfaces server-side errors in the UI.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
|
||||
Reference in New Issue
Block a user