feat: add MLflow inference logging to all Ray Serve apps
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 16s
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 16s
- Add mlflow_logger.py: lightweight REST-based MLflow logger (no mlflow dep)
- Instrument serve_llm.py with latency, token counts, tokens/sec metrics
- Instrument serve_embeddings.py with latency, batch_size, total_tokens
- Instrument serve_whisper.py with latency, audio_duration, realtime_factor
- Instrument serve_tts.py with latency, audio_duration, text_chars
- Instrument serve_reranker.py with latency, num_pairs, top_k
This commit is contained in:
@@ -4,10 +4,13 @@ Runs on: drizzt (Radeon 680M iGPU, ROCm) or danilo (Intel i915 iGPU, OpenVINO/IP
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from ray import serve
|
||||
|
||||
from ray_serve.mlflow_logger import InferenceLogger
|
||||
|
||||
|
||||
@serve.deployment(name="RerankerDeployment", num_replicas=1)
|
||||
class RerankerDeployment:
|
||||
@@ -58,6 +61,17 @@ class RerankerDeployment:
|
||||
|
||||
print("Reranker model loaded successfully")
|
||||
|
||||
# MLflow metrics
|
||||
self._mlflow = InferenceLogger(
|
||||
experiment_name="ray-serve-reranker",
|
||||
run_name=f"reranker-{self.model_id.split('/')[-1]}",
|
||||
tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device},
|
||||
flush_every=10,
|
||||
)
|
||||
self._mlflow.initialize(
|
||||
params={"model_id": self.model_id, "device": self.device, "use_ipex": str(self.use_ipex)}
|
||||
)
|
||||
|
||||
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Handle reranking requests.
|
||||
@@ -75,6 +89,8 @@ class RerankerDeployment:
|
||||
"pairs": [["query", "doc1"], ["query", "doc2"]]
|
||||
}
|
||||
"""
|
||||
_start = time.time()
|
||||
|
||||
# Handle pairs format
|
||||
if "pairs" in request:
|
||||
pairs = request["pairs"]
|
||||
@@ -89,6 +105,11 @@ class RerankerDeployment:
|
||||
}
|
||||
)
|
||||
|
||||
self._mlflow.log_request(
|
||||
latency_s=time.time() - _start,
|
||||
num_pairs=len(pairs),
|
||||
)
|
||||
|
||||
return {
|
||||
"object": "list",
|
||||
"results": results,
|
||||
@@ -131,6 +152,14 @@ class RerankerDeployment:
|
||||
# Apply top_k
|
||||
results = results[:top_k]
|
||||
|
||||
# Log to MLflow
|
||||
self._mlflow.log_request(
|
||||
latency_s=time.time() - _start,
|
||||
num_pairs=len(pairs),
|
||||
num_documents=len(documents),
|
||||
top_k=top_k,
|
||||
)
|
||||
|
||||
return {
|
||||
"object": "list",
|
||||
"results": results,
|
||||
|
||||
Reference in New Issue
Block a user