fix: make mlflow_logger import optional with no-op fallback
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 11s
The strixhalo LLM worker uses py_executable pointing at the Docker image's venv, which doesn't have the updated ray-serve-apps package. Wrap all InferenceLogger imports in try/except and guard usage with None checks so the apps degrade gracefully without MLflow logging.
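All five apps apply the same pattern: the import is wrapped in try/except with a None fallback, logger construction is guarded once at startup, and every log call is guarded at request time. A minimal standalone sketch of the pattern (ExampleDeployment and its arguments are illustrative, not code from this repo; only kwargs that appear in the diff below are used):

try:
    from ray_serve.mlflow_logger import InferenceLogger
except ImportError:
    # Package missing from the worker venv: run without MLflow logging.
    InferenceLogger = None


class ExampleDeployment:
    def __init__(self) -> None:
        # Construct the logger only if the import succeeded.
        if InferenceLogger is not None:
            self._mlflow = InferenceLogger(experiment_name="example", flush_every=10)
            self._mlflow.initialize(params={"model_id": "example-model"})
        else:
            self._mlflow = None

    def handle_request(self, latency_s: float) -> None:
        # Guard every use so requests succeed even when MLflow is unavailable.
        if self._mlflow:
            self._mlflow.log_request(latency_s=latency_s)

The full diff for the five apps follows.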
@@ -9,7 +9,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="EmbeddingsDeployment", num_replicas=1)
@@ -37,15 +40,18 @@ class EmbeddingsDeployment:
         print(f"Model loaded. Embedding dimension: {self.embedding_dim}")
 
         # MLflow metrics
-        self._mlflow = InferenceLogger(
-            experiment_name="ray-serve-embeddings",
-            run_name=f"embeddings-{self.model_id.split('/')[-1]}",
-            tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device},
-            flush_every=10,
-        )
-        self._mlflow.initialize(
-            params={"model_id": self.model_id, "embedding_dim": str(self.embedding_dim), "device": self.device}
-        )
+        if InferenceLogger is not None:
+            self._mlflow = InferenceLogger(
+                experiment_name="ray-serve-embeddings",
+                run_name=f"embeddings-{self.model_id.split('/')[-1]}",
+                tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device},
+                flush_every=10,
+            )
+            self._mlflow.initialize(
+                params={"model_id": self.model_id, "embedding_dim": str(self.embedding_dim), "device": self.device}
+            )
+        else:
+            self._mlflow = None
 
     async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
         """
@@ -86,11 +92,12 @@ class EmbeddingsDeployment:
             total_tokens += len(text.split())
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            batch_size=len(texts),
-            total_tokens=total_tokens,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                batch_size=len(texts),
+                total_tokens=total_tokens,
+            )
 
         # Return OpenAI-compatible response
         return {

@@ -10,7 +10,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="LLMDeployment", num_replicas=1)
@@ -40,19 +43,22 @@ class LLMDeployment:
         print(f"Model {self.model_id} async engine created")
 
         # MLflow metrics
-        self._mlflow = InferenceLogger(
-            experiment_name="ray-serve-llm",
-            run_name=f"llm-{self.model_id.split('/')[-1]}",
-            tags={"model.name": self.model_id, "model.framework": "vllm", "gpu": "strixhalo"},
-            flush_every=5,
-        )
-        self._mlflow.initialize(
-            params={
-                "model_id": self.model_id,
-                "max_model_len": str(self.max_model_len),
-                "gpu_memory_utilization": str(self.gpu_memory_utilization),
-            }
-        )
+        if InferenceLogger is not None:
+            self._mlflow = InferenceLogger(
+                experiment_name="ray-serve-llm",
+                run_name=f"llm-{self.model_id.split('/')[-1]}",
+                tags={"model.name": self.model_id, "model.framework": "vllm", "gpu": "strixhalo"},
+                flush_every=5,
+            )
+            self._mlflow.initialize(
+                params={
+                    "model_id": self.model_id,
+                    "max_model_len": str(self.max_model_len),
+                    "gpu_memory_utilization": str(self.gpu_memory_utilization),
+                }
+            )
+        else:
+            self._mlflow = None
 
     async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
         """
@@ -96,15 +102,16 @@ class LLMDeployment:
         completion_tokens = len(generated_text.split())
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=latency,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-            tokens_per_second=completion_tokens / latency if latency > 0 else 0,
-            temperature=temperature,
-            max_tokens_requested=max_tokens,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=latency,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+                tokens_per_second=completion_tokens / latency if latency > 0 else 0,
+                temperature=temperature,
+                max_tokens_requested=max_tokens,
+            )
 
         # Return OpenAI-compatible response
         return {

@@ -9,7 +9,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="RerankerDeployment", num_replicas=1)
@@ -62,15 +65,18 @@ class RerankerDeployment:
         print("Reranker model loaded successfully")
 
         # MLflow metrics
-        self._mlflow = InferenceLogger(
-            experiment_name="ray-serve-reranker",
-            run_name=f"reranker-{self.model_id.split('/')[-1]}",
-            tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device},
-            flush_every=10,
-        )
-        self._mlflow.initialize(
-            params={"model_id": self.model_id, "device": self.device, "use_ipex": str(self.use_ipex)}
-        )
+        if InferenceLogger is not None:
+            self._mlflow = InferenceLogger(
+                experiment_name="ray-serve-reranker",
+                run_name=f"reranker-{self.model_id.split('/')[-1]}",
+                tags={"model.name": self.model_id, "model.framework": "sentence-transformers", "device": self.device},
+                flush_every=10,
+            )
+            self._mlflow.initialize(
+                params={"model_id": self.model_id, "device": self.device, "use_ipex": str(self.use_ipex)}
+            )
+        else:
+            self._mlflow = None
 
     async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
         """
@@ -105,10 +111,11 @@ class RerankerDeployment:
             }
         )
 
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            num_pairs=len(pairs),
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                num_pairs=len(pairs),
+            )
 
         return {
             "object": "list",
@@ -153,12 +160,13 @@ class RerankerDeployment:
             results = results[:top_k]
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            num_pairs=len(pairs),
-            num_documents=len(documents),
-            top_k=top_k,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                num_pairs=len(pairs),
+                num_documents=len(documents),
+                top_k=top_k,
+            )
 
         return {
             "object": "list",

@@ -11,7 +11,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="TTSDeployment", num_replicas=1)
@@ -36,13 +39,16 @@ class TTSDeployment:
         print("TTS model loaded successfully")
 
         # MLflow metrics
-        self._mlflow = InferenceLogger(
-            experiment_name="ray-serve-tts",
-            run_name=f"tts-{self.model_name.split('/')[-1]}",
-            tags={"model.name": self.model_name, "model.framework": "coqui-tts", "gpu": str(self.use_gpu)},
-            flush_every=5,
-        )
-        self._mlflow.initialize(params={"model_name": self.model_name, "use_gpu": str(self.use_gpu)})
+        if InferenceLogger is not None:
+            self._mlflow = InferenceLogger(
+                experiment_name="ray-serve-tts",
+                run_name=f"tts-{self.model_name.split('/')[-1]}",
+                tags={"model.name": self.model_name, "model.framework": "coqui-tts", "gpu": str(self.use_gpu)},
+                flush_every=5,
+            )
+            self._mlflow.initialize(params={"model_name": self.model_name, "use_gpu": str(self.use_gpu)})
+        else:
+            self._mlflow = None
 
     async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
         """
@@ -104,12 +110,13 @@ class TTSDeployment:
         duration = len(wav) / sample_rate
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            audio_duration_s=duration,
-            text_chars=len(text),
-            realtime_factor=(time.time() - _start) / duration if duration > 0 else 0,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                audio_duration_s=duration,
+                text_chars=len(text),
+                realtime_factor=(time.time() - _start) / duration if duration > 0 else 0,
+            )
 
         response = {
             "model": self.model_name,

@@ -11,7 +11,10 @@ from typing import Any
 
 from ray import serve
 
-from ray_serve.mlflow_logger import InferenceLogger
+try:
+    from ray_serve.mlflow_logger import InferenceLogger
+except ImportError:
+    InferenceLogger = None
 
 
 @serve.deployment(name="WhisperDeployment", num_replicas=1)
@@ -42,15 +45,18 @@ class WhisperDeployment:
         print("Whisper model loaded successfully")
 
         # MLflow metrics
-        self._mlflow = InferenceLogger(
-            experiment_name="ray-serve-whisper",
-            run_name=f"whisper-{self.model_size}",
-            tags={"model.name": f"whisper-{self.model_size}", "model.framework": "faster-whisper", "device": self.device},
-            flush_every=5,
-        )
-        self._mlflow.initialize(
-            params={"model_size": self.model_size, "device": self.device, "compute_type": self.compute_type}
-        )
+        if InferenceLogger is not None:
+            self._mlflow = InferenceLogger(
+                experiment_name="ray-serve-whisper",
+                run_name=f"whisper-{self.model_size}",
+                tags={"model.name": f"whisper-{self.model_size}", "model.framework": "faster-whisper", "device": self.device},
+                flush_every=5,
+            )
+            self._mlflow.initialize(
+                params={"model_size": self.model_size, "device": self.device, "compute_type": self.compute_type}
+            )
+        else:
+            self._mlflow = None
 
     async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
         """
@@ -146,12 +152,13 @@ class WhisperDeployment:
         }
 
         # Log to MLflow
-        self._mlflow.log_request(
-            latency_s=time.time() - _start,
-            audio_duration_s=info.duration,
-            segments=len(segment_list),
-            realtime_factor=(time.time() - _start) / info.duration if info.duration > 0 else 0,
-        )
+        if self._mlflow:
+            self._mlflow.log_request(
+                latency_s=time.time() - _start,
+                audio_duration_s=info.duration,
+                segments=len(segment_list),
+                realtime_factor=(time.time() - _start) / info.duration if info.duration > 0 else 0,
+            )
 
         # Default JSON format (OpenAI-compatible)
         return {
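The commit title says "no-op fallback", but the implementation is a None sentinel plus a guard at every call site. A null-object logger would keep the call sites unconditional; here is a sketch under the assumption that InferenceLogger exposes only the constructor, initialize(), and log_request() calls seen in this diff (NoOpInferenceLogger is a hypothetical helper, not part of the package):

class NoOpInferenceLogger:
    """Accepts the same calls as InferenceLogger and discards them."""

    def __init__(self, *args: object, **kwargs: object) -> None:
        pass

    def initialize(self, params: dict[str, str] | None = None) -> None:
        pass

    def log_request(self, **metrics: object) -> None:
        pass


try:
    from ray_serve.mlflow_logger import InferenceLogger
except ImportError:
    InferenceLogger = NoOpInferenceLogger  # self._mlflow is then always safe to call

With this stub in place, each deployment could drop its "if self._mlflow:" guards and "else: self._mlflow = None" branches entirely.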