feat: add MLflow inference logging to all Ray Serve apps
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 16s

- Add mlflow_logger.py: lightweight REST-based MLflow logger (no mlflow package dependency)
- Instrument serve_llm.py with latency, token counts, tokens/sec metrics
- Instrument serve_embeddings.py with latency, batch_size, total_tokens
- Instrument serve_whisper.py with latency, audio_duration, realtime_factor
- Instrument serve_tts.py with latency, audio_duration, text_chars
- Instrument serve_reranker.py with latency, num_pairs, top_k
This commit is contained in:
2026-02-12 06:14:30 -05:00
parent 2edafc33c0
commit 7ec2107e0c
6 changed files with 346 additions and 4 deletions

View File

@@ -4,10 +4,13 @@ Runs on: drizzt (Radeon 680M iGPU, ROCm)
"""
import os
import time
from typing import Any
from ray import serve
from ray_serve.mlflow_logger import InferenceLogger
@serve.deployment(name="EmbeddingsDeployment", num_replicas=1)
class EmbeddingsDeployment:
@@ -33,6 +36,17 @@ class EmbeddingsDeployment:
print(f"Model loaded. Embedding dimension: {self.embedding_dim}")
# MLflow metrics
self._mlflow = InferenceLogger(
experiment_name="ray-serve-embeddings",
run_name=f"embeddings-{self.model_id.split('/')[-1]}",
tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device},
flush_every=10,
)
self._mlflow.initialize(
params={"model_id": self.model_id, "embedding_dim": str(self.embedding_dim), "device": self.device}
)
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
"""
Handle OpenAI-compatible embedding requests.
@@ -46,6 +60,8 @@ class EmbeddingsDeployment:
"""
input_data = request.get("input", "")
_start = time.time()
# Handle both single string and list of strings
texts = [input_data] if isinstance(input_data, str) else input_data
@@ -69,6 +85,13 @@ class EmbeddingsDeployment:
)
total_tokens += len(text.split())
# Log to MLflow
self._mlflow.log_request(
latency_s=time.time() - _start,
batch_size=len(texts),
total_tokens=total_tokens,
)
# Return OpenAI-compatible response
return {
"object": "list",