feat: add MLflow inference logging to all Ray Serve apps
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 16s
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 16s
- Add mlflow_logger.py: lightweight REST-based MLflow logger (no mlflow dep)
- Instrument serve_llm.py with latency, token counts, tokens/sec metrics
- Instrument serve_embeddings.py with latency, batch_size, total_tokens
- Instrument serve_whisper.py with latency, audio_duration, realtime_factor
- Instrument serve_tts.py with latency, audio_duration, text_chars
- Instrument serve_reranker.py with latency, num_pairs, top_k
This commit is contained in:
@@ -6,10 +6,13 @@ Runs on: elminster (RTX 2070 8GB, CUDA)
|
||||
import base64
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from ray import serve
|
||||
|
||||
from ray_serve.mlflow_logger import InferenceLogger
|
||||
|
||||
|
||||
@serve.deployment(name="TTSDeployment", num_replicas=1)
|
||||
class TTSDeployment:
|
||||
@@ -32,6 +35,15 @@ class TTSDeployment:
|
||||
|
||||
print("TTS model loaded successfully")
|
||||
|
||||
# MLflow metrics
|
||||
self._mlflow = InferenceLogger(
|
||||
experiment_name="ray-serve-tts",
|
||||
run_name=f"tts-{self.model_name.split('/')[-1]}",
|
||||
tags={"model.name": self.model_name, "model.framework": "coqui-tts", "gpu": str(self.use_gpu)},
|
||||
flush_every=5,
|
||||
)
|
||||
self._mlflow.initialize(params={"model_name": self.model_name, "use_gpu": str(self.use_gpu)})
|
||||
|
||||
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Handle text-to-speech requests.
|
||||
@@ -49,6 +61,7 @@ class TTSDeployment:
|
||||
import numpy as np
|
||||
from scipy.io import wavfile
|
||||
|
||||
_start = time.time()
|
||||
text = request.get("text", "")
|
||||
speaker = request.get("speaker")
|
||||
language = request.get("language")
|
||||
@@ -88,10 +101,20 @@ class TTSDeployment:
|
||||
wavfile.write(buffer, sample_rate, wav_int16)
|
||||
audio_bytes = buffer.getvalue()
|
||||
|
||||
duration = len(wav) / sample_rate
|
||||
|
||||
# Log to MLflow
|
||||
self._mlflow.log_request(
|
||||
latency_s=time.time() - _start,
|
||||
audio_duration_s=duration,
|
||||
text_chars=len(text),
|
||||
realtime_factor=(time.time() - _start) / duration if duration > 0 else 0,
|
||||
)
|
||||
|
||||
response = {
|
||||
"model": self.model_name,
|
||||
"sample_rate": sample_rate,
|
||||
"duration": len(wav) / sample_rate,
|
||||
"duration": duration,
|
||||
"format": output_format,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user