feat: add MLflow inference logging to all Ray Serve apps

- Add mlflow_logger.py: lightweight REST-based MLflow logger (no mlflow dep; see the sketch below the commit details)
- Instrument serve_llm.py with latency, token counts, tokens/sec metrics
- Instrument serve_embeddings.py with latency, batch_size, total_tokens
- Instrument serve_whisper.py with latency, audio_duration, realtime_factor
- Instrument serve_tts.py with latency, audio_duration, text_chars
- Instrument serve_reranker.py with latency, num_pairs, top_k
2026-02-12 06:14:30 -05:00
parent 2edafc33c0
commit 7ec2107e0c
6 changed files with 346 additions and 4 deletions
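
The contents of mlflow_logger.py are not shown in this view. The following is a minimal sketch of what a REST-only logger with the interface used in the serve_llm.py diff below could look like; the constructor, initialize(), and log_request() signatures are taken from the diff, while the MLFLOW_TRACKING_URI default, buffering scheme, and error handling are assumptions rather than the actual implementation.

```python
# Minimal sketch of a REST-only MLflow logger matching the InferenceLogger calls below.
# The MLFLOW_TRACKING_URI default, buffering scheme, and error handling are assumptions.
import json
import os
import time
import urllib.error
import urllib.parse
import urllib.request


class InferenceLogger:
    """Log per-request inference metrics to an MLflow tracking server over its
    REST API, without taking a dependency on the mlflow package."""

    def __init__(self, experiment_name, run_name, tags=None, flush_every=5):
        self.base = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000").rstrip("/")
        self.experiment_name = experiment_name
        self.run_name = run_name
        self.tags = tags or {}
        self.flush_every = flush_every
        self.run_id = None
        self._step = 0
        self._pending = []  # buffered metric dicts awaiting a runs/log-batch flush

    def _post(self, path, payload):
        req = urllib.request.Request(
            f"{self.base}/api/2.0/mlflow/{path}",
            data=json.dumps(payload).encode(),
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=5) as resp:
            return json.loads(resp.read() or b"{}")

    def _get(self, path, **query):
        url = f"{self.base}/api/2.0/mlflow/{path}?{urllib.parse.urlencode(query)}"
        with urllib.request.urlopen(url, timeout=5) as resp:
            return json.loads(resp.read())

    def initialize(self, params=None):
        """Create or reuse the experiment, start a run, and log static params once."""
        try:
            experiment_id = self._post("experiments/create",
                                       {"name": self.experiment_name})["experiment_id"]
        except urllib.error.HTTPError:
            # Experiment already exists: look it up by name instead.
            experiment_id = self._get("experiments/get-by-name",
                                      experiment_name=self.experiment_name)["experiment"]["experiment_id"]
        run = self._post("runs/create", {
            "experiment_id": experiment_id,
            "run_name": self.run_name,
            "start_time": int(time.time() * 1000),
            "tags": [{"key": k, "value": str(v)} for k, v in self.tags.items()],
        })
        self.run_id = run["run"]["info"]["run_id"]
        if params:
            self._post("runs/log-batch", {
                "run_id": self.run_id,
                "params": [{"key": k, "value": str(v)} for k, v in params.items()],
            })

    def log_request(self, **metrics):
        """Buffer one request's numeric metrics; flush every `flush_every` requests."""
        now = int(time.time() * 1000)
        self._step += 1
        self._pending.extend(
            {"key": k, "value": float(v), "timestamp": now, "step": self._step}
            for k, v in metrics.items()
        )
        if self._step % self.flush_every == 0:
            self._post("runs/log-batch", {"run_id": self.run_id, "metrics": self._pending})
            self._pending = []
```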

serve_llm.py

@@ -10,6 +10,8 @@ from typing import Any
 from ray import serve
+
+from ray_serve.mlflow_logger import InferenceLogger
 
 @serve.deployment(name="LLMDeployment", num_replicas=1)
 class LLMDeployment:
@@ -37,6 +39,21 @@ class LLMDeployment:
         self.SamplingParams = SamplingParams
         print(f"Model {self.model_id} async engine created")
 
+        # MLflow metrics
+        self._mlflow = InferenceLogger(
+            experiment_name="ray-serve-llm",
+            run_name=f"llm-{self.model_id.split('/')[-1]}",
+            tags={"model.name": self.model_id, "model.framework": "vllm", "gpu": "strixhalo"},
+            flush_every=5,
+        )
+        self._mlflow.initialize(
+            params={
+                "model_id": self.model_id,
+                "max_model_len": str(self.max_model_len),
+                "gpu_memory_utilization": str(self.gpu_memory_utilization),
+            }
+        )
+
     async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
         """
         Handle OpenAI-compatible chat completion requests.
@@ -67,11 +84,27 @@ class LLMDeployment:
             stop=stop,
         )
 
+        start_time = time.time()
         request_id = uuid.uuid4().hex
         final_result = None
         async for result in self.engine.generate(prompt, sampling_params, request_id):
             final_result = result
 
         generated_text = final_result.outputs[0].text
+        latency = time.time() - start_time
+        prompt_tokens = len(prompt.split())
+        completion_tokens = len(generated_text.split())
+
+        # Log to MLflow
+        self._mlflow.log_request(
+            latency_s=latency,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+            tokens_per_second=completion_tokens / latency if latency > 0 else 0,
+            temperature=temperature,
+            max_tokens_requested=max_tokens,
+        )
+
         # Return OpenAI-compatible response
         return {
@@ -90,9 +123,9 @@ class LLMDeployment:
                 }
             ],
             "usage": {
-                "prompt_tokens": len(prompt.split()),
-                "completion_tokens": len(generated_text.split()),
-                "total_tokens": len(prompt.split()) + len(generated_text.split()),
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
             },
         }
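
As a hedged smoke test for the instrumentation above: send one chat-completion request to the Serve app, then read the logged tokens_per_second history back via MLflow's REST API. The /llm route, localhost ports, and request payload shape are assumptions based on the OpenAI-compatible docstring, not part of this commit, and with flush_every=5 the metrics only reach MLflow after the fifth request has been buffered.

```python
# Hypothetical smoke test: one request to the deployment, then a readback of the
# tokens_per_second metric via MLflow's REST API. Route, ports, and payload are assumed.
import requests

resp = requests.post("http://localhost:8000/llm", json={
    "messages": [{"role": "user", "content": "Say hello in five words."}],
    "max_tokens": 32,
    "temperature": 0.2,
})
print(resp.json()["usage"])

mlflow = "http://localhost:5000/api/2.0/mlflow"
exp = requests.get(f"{mlflow}/experiments/get-by-name",
                   params={"experiment_name": "ray-serve-llm"}).json()
runs = requests.post(f"{mlflow}/runs/search",
                     json={"experiment_ids": [exp["experiment"]["experiment_id"]],
                           "max_results": 1}).json()
run_id = runs["runs"][0]["info"]["run_id"]
history = requests.get(f"{mlflow}/metrics/get-history",
                       params={"run_id": run_id, "metric_key": "tokens_per_second"}).json()
print(history["metrics"])
```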