feat: add MLflow inference logging to all Ray Serve apps
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 16s
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 16s
- Add mlflow_logger.py: lightweight REST-based MLflow logger (no mlflow dep)
- Instrument serve_llm.py with latency, token counts, tokens/sec metrics
- Instrument serve_embeddings.py with latency, batch_size, total_tokens
- Instrument serve_whisper.py with latency, audio_duration, realtime_factor
- Instrument serve_tts.py with latency, audio_duration, text_chars
- Instrument serve_reranker.py with latency, num_pairs, top_k
This commit is contained in:
@@ -10,6 +10,8 @@ from typing import Any
|
||||
|
||||
from ray import serve
|
||||
|
||||
from ray_serve.mlflow_logger import InferenceLogger
|
||||
|
||||
|
||||
@serve.deployment(name="LLMDeployment", num_replicas=1)
|
||||
class LLMDeployment:
|
||||
@@ -37,6 +39,21 @@ class LLMDeployment:
|
||||
self.SamplingParams = SamplingParams
|
||||
print(f"Model {self.model_id} async engine created")
|
||||
|
||||
# MLflow metrics
|
||||
self._mlflow = InferenceLogger(
|
||||
experiment_name="ray-serve-llm",
|
||||
run_name=f"llm-{self.model_id.split('/')[-1]}",
|
||||
tags={"model.name": self.model_id, "model.framework": "vllm", "gpu": "strixhalo"},
|
||||
flush_every=5,
|
||||
)
|
||||
self._mlflow.initialize(
|
||||
params={
|
||||
"model_id": self.model_id,
|
||||
"max_model_len": str(self.max_model_len),
|
||||
"gpu_memory_utilization": str(self.gpu_memory_utilization),
|
||||
}
|
||||
)
|
||||
|
||||
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Handle OpenAI-compatible chat completion requests.
|
||||
@@ -67,11 +84,27 @@ class LLMDeployment:
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
request_id = uuid.uuid4().hex
|
||||
final_result = None
|
||||
async for result in self.engine.generate(prompt, sampling_params, request_id):
|
||||
final_result = result
|
||||
generated_text = final_result.outputs[0].text
|
||||
latency = time.time() - start_time
|
||||
|
||||
prompt_tokens = len(prompt.split())
|
||||
completion_tokens = len(generated_text.split())
|
||||
|
||||
# Log to MLflow
|
||||
self._mlflow.log_request(
|
||||
latency_s=latency,
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens + completion_tokens,
|
||||
tokens_per_second=completion_tokens / latency if latency > 0 else 0,
|
||||
temperature=temperature,
|
||||
max_tokens_requested=max_tokens,
|
||||
)
|
||||
|
||||
# Return OpenAI-compatible response
|
||||
return {
|
||||
@@ -90,9 +123,9 @@ class LLMDeployment:
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": len(prompt.split()),
|
||||
"completion_tokens": len(generated_text.split()),
|
||||
"total_tokens": len(prompt.split()) + len(generated_text.split()),
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user