From 297b0d8ebdfb8b9ea0b6a96a3ed9235ebd805b71 Mon Sep 17 00:00:00 2001
From: "Billy D." <billy.davies.10@icloud.com>
Date: Thu, 12 Feb 2026 07:06:49 -0500
Subject: [PATCH] fix: move mlflow import inside __init__ to avoid cloudpickle
 serialization failure

The strixhalo LLM worker uses py_executable, which bypasses the pip
runtime_env. The module-level try/except still fails there: on the head
node the import succeeds, so cloudpickle resolves the real
InferenceLogger class and serializes a module reference that the worker
cannot import. Moving the import inside __init__ means it runs at actor
construction time on the worker, where ImportError is caught gracefully.
---
 ray_serve/serve_llm.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/ray_serve/serve_llm.py b/ray_serve/serve_llm.py
index 8034217..c067860 100644
--- a/ray_serve/serve_llm.py
+++ b/ray_serve/serve_llm.py
@@ -10,11 +10,6 @@ from typing import Any
 
 from ray import serve
 
-try:
-    from ray_serve.mlflow_logger import InferenceLogger
-except ImportError:
-    InferenceLogger = None
-
 
 @serve.deployment(name="LLMDeployment", num_replicas=1)
 class LLMDeployment:
@@ -42,8 +37,12 @@ class LLMDeployment:
         self.SamplingParams = SamplingParams
         print(f"Model {self.model_id} async engine created")
 
-        # MLflow metrics
-        if InferenceLogger is not None:
+        # MLflow metrics — import locally to avoid cloudpickle
+        # serializing a module reference that fails on the worker
+        # (strixhalo uses py_executable which bypasses pip runtime_env)
+        try:
+            from ray_serve.mlflow_logger import InferenceLogger
+
             self._mlflow = InferenceLogger(
                 experiment_name="ray-serve-llm",
                 run_name=f"llm-{self.model_id.split('/')[-1]}",
@@ -57,7 +56,7 @@ class LLMDeployment:
                     "gpu_memory_utilization": str(self.gpu_memory_utilization),
                 }
             )
-        else:
+        except ImportError:
             self._mlflow = None
 
     async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: