fix: make mlflow_logger import optional with no-op fallback

The strixhalo LLM worker uses py_executable pointing at the Docker
image's venv, which doesn't have the updated ray-serve-apps package.
Wrap all InferenceLogger imports in try/except and guard usage with
None checks so the apps degrade gracefully without MLflow logging.
2026-02-12 07:01:17 -05:00
parent 7ec2107e0c
commit 15e4b8afa3
5 changed files with 124 additions and 88 deletions
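
The same pattern is applied in all five apps. As a condensed sketch (SomeDeployment, handle(), and the placeholder experiment/run/tag names are hypothetical stand-ins; each real app supplies its own tags, params, and per-request metrics as shown in the diffs below):

# Condensed sketch of the guarded-import pattern used in every app below;
# the class, method, and placeholder names are illustrative only.
try:
    from ray_serve.mlflow_logger import InferenceLogger
except ImportError:
    InferenceLogger = None  # package missing from the py_executable venv


class SomeDeployment:
    def __init__(self) -> None:
        if InferenceLogger is not None:
            self._mlflow = InferenceLogger(
                experiment_name="ray-serve-example",
                run_name="example-run",
                tags={"model.name": "example"},
                flush_every=10,
            )
            self._mlflow.initialize(params={"model_id": "example"})
        else:
            self._mlflow = None  # every call site checks this sentinel

    def handle(self) -> None:
        if self._mlflow:  # no-op when MLflow logging is unavailable
            self._mlflow.log_request(latency_s=0.0)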

View File

@@ -9,7 +9,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="EmbeddingsDeployment", num_replicas=1)
@@ -37,15 +40,18 @@ class EmbeddingsDeployment:
print(f"Model loaded. Embedding dimension: {self.embedding_dim}") print(f"Model loaded. Embedding dimension: {self.embedding_dim}")
# MLflow metrics # MLflow metrics
self._mlflow = InferenceLogger( if InferenceLogger is not None:
experiment_name="ray-serve-embeddings", self._mlflow = InferenceLogger(
run_name=f"embeddings-{self.model_id.split('/')[-1]}", experiment_name="ray-serve-embeddings",
tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device}, run_name=f"embeddings-{self.model_id.split('/')[-1]}",
flush_every=10, tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device},
) flush_every=10,
self._mlflow.initialize( )
params={"model_id": self.model_id, "embedding_dim": str(self.embedding_dim), "device": self.device} self._mlflow.initialize(
) params={"model_id": self.model_id, "embedding_dim": str(self.embedding_dim), "device": self.device}
)
else:
self._mlflow = None
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
""" """
@@ -86,11 +92,12 @@ class EmbeddingsDeployment:
             total_tokens += len(text.split())
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            batch_size=len(texts),
-            total_tokens=total_tokens,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                batch_size=len(texts),
+                total_tokens=total_tokens,
+            )
 
         # Return OpenAI-compatible response
         return {

View File

@@ -10,7 +10,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="LLMDeployment", num_replicas=1)
@@ -40,19 +43,22 @@ class LLMDeployment:
print(f"Model {self.model_id} async engine created") print(f"Model {self.model_id} async engine created")
# MLflow metrics # MLflow metrics
self._mlflow = InferenceLogger( if InferenceLogger is not None:
experiment_name="ray-serve-llm", self._mlflow = InferenceLogger(
run_name=f"llm-{self.model_id.split('/')[-1]}", experiment_name="ray-serve-llm",
tags={"model.name": self.model_id, "model.framework": "vllm", "gpu": "strixhalo"}, run_name=f"llm-{self.model_id.split('/')[-1]}",
flush_every=5, tags={"model.name": self.model_id, "model.framework": "vllm", "gpu": "strixhalo"},
) flush_every=5,
self._mlflow.initialize( )
params={ self._mlflow.initialize(
"model_id": self.model_id, params={
"max_model_len": str(self.max_model_len), "model_id": self.model_id,
"gpu_memory_utilization": str(self.gpu_memory_utilization), "max_model_len": str(self.max_model_len),
} "gpu_memory_utilization": str(self.gpu_memory_utilization),
) }
)
else:
self._mlflow = None
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
""" """
@@ -96,15 +102,16 @@ class LLMDeployment:
         completion_tokens = len(generated_text.split())
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=latency,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-            tokens_per_second=completion_tokens / latency if latency > 0 else 0,
-            temperature=temperature,
-            max_tokens_requested=max_tokens,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=latency,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+                tokens_per_second=completion_tokens / latency if latency > 0 else 0,
+                temperature=temperature,
+                max_tokens_requested=max_tokens,
+            )
 
         # Return OpenAI-compatible response
         return {

View File

@@ -9,7 +9,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="RerankerDeployment", num_replicas=1)
@@ -62,15 +65,18 @@ class RerankerDeployment:
print("Reranker model loaded successfully") print("Reranker model loaded successfully")
# MLflow metrics # MLflow metrics
self._mlflow = InferenceLogger( if InferenceLogger is not None:
experiment_name="ray-serve-reranker", self._mlflow = InferenceLogger(
run_name=f"reranker-{self.model_id.split('/')[-1]}", experiment_name="ray-serve-reranker",
tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device}, run_name=f"reranker-{self.model_id.split('/')[-1]}",
flush_every=10, tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device},
) flush_every=10,
self._mlflow.initialize( )
params={"model_id": self.model_id, "device": self.device, "use_ipex": str(self.use_ipex)} self._mlflow.initialize(
) params={"model_id": self.model_id, "device": self.device, "use_ipex": str(self.use_ipex)}
)
else:
self._mlflow = None
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
""" """
@@ -105,10 +111,11 @@ class RerankerDeployment:
                 }
             )
 
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            num_pairs=len(pairs),
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                num_pairs=len(pairs),
+            )
 
         return {
             "object": "list",
@@ -153,12 +160,13 @@ class RerankerDeployment:
         results = results[:top_k]
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            num_pairs=len(pairs),
-            num_documents=len(documents),
-            top_k=top_k,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                num_pairs=len(pairs),
+                num_documents=len(documents),
+                top_k=top_k,
+            )
 
         return {
             "object": "list",

View File

@@ -11,7 +11,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="TTSDeployment", num_replicas=1)
@@ -36,13 +39,16 @@ class TTSDeployment:
print("TTS model loaded successfully") print("TTS model loaded successfully")
# MLflow metrics # MLflow metrics
self._mlflow = InferenceLogger( if InferenceLogger is not None:
experiment_name="ray-serve-tts", self._mlflow = InferenceLogger(
run_name=f"tts-{self.model_name.split('/')[-1]}", experiment_name="ray-serve-tts",
tags={"model.name": self.model_name, "model.framework": "coqui-tts", "gpu": str(self.use_gpu)}, run_name=f"tts-{self.model_name.split('/')[-1]}",
flush_every=5, tags={"model.name": self.model_name, "model.framework": "coqui-tts", "gpu": str(self.use_gpu)},
) flush_every=5,
self._mlflow.initialize(params={"model_name": self.model_name, "use_gpu": str(self.use_gpu)}) )
self._mlflow.initialize(params={"model_name": self.model_name, "use_gpu": str(self.use_gpu)})
else:
self._mlflow = None
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
""" """
@@ -104,12 +110,13 @@ class TTSDeployment:
         duration = len(wav) / sample_rate
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            audio_duration_s=duration,
-            text_chars=len(text),
-            realtime_factor=(time.time() - _start) / duration if duration > 0 else 0,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                audio_duration_s=duration,
+                text_chars=len(text),
+                realtime_factor=(time.time() - _start) / duration if duration > 0 else 0,
+            )
 
         response = {
             "model": self.model_name,

View File

@@ -11,7 +11,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="WhisperDeployment", num_replicas=1)
@@ -42,15 +45,18 @@ class WhisperDeployment:
print("Whisper model loaded successfully") print("Whisper model loaded successfully")
# MLflow metrics # MLflow metrics
self._mlflow = InferenceLogger( if InferenceLogger is not None:
experiment_name="ray-serve-whisper", self._mlflow = InferenceLogger(
run_name=f"whisper-{self.model_size}", experiment_name="ray-serve-whisper",
tags={"model.name": f"whisper-{self.model_size}", "model.framework": "faster-whisper", "device": self.device}, run_name=f"whisper-{self.model_size}",
flush_every=5, tags={"model.name": f"whisper-{self.model_size}", "model.framework": "faster-whisper", "device": self.device},
) flush_every=5,
self._mlflow.initialize( )
params={"model_size": self.model_size, "device": self.device, "compute_type": self.compute_type} self._mlflow.initialize(
) params={"model_size": self.model_size, "device": self.device, "compute_type": self.compute_type}
)
else:
self._mlflow = None
async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
""" """
@@ -146,12 +152,13 @@ class WhisperDeployment:
         }
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            audio_duration_s=info.duration,
-            segments=len(segment_list),
-            realtime_factor=(time.time() - _start) / info.duration if info.duration > 0 else 0,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                audio_duration_s=info.duration,
+                segments=len(segment_list),
+                realtime_factor=(time.time() - _start) / info.duration if info.duration > 0 else 0,
+            )
 
         # Default JSON format (OpenAI-compatible)
         return {