fix: add ENFORCE_EAGER env var to skip torch.compile on ROCm
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 12s

torch._dynamo.exc.Unsupported crashes EngineCore during graph tracing
of LlamaDecoderLayer on gfx1151.  ENFORCE_EAGER=true bypasses
torch.compile and CUDA graph capture entirely.
This commit is contained in:
2026-02-13 06:56:29 -05:00
parent 6a391147a6
commit f66de251eb

View File

@@ -54,6 +54,7 @@ class LLMDeployment:
self.enable_chunked_prefill = os.environ.get("ENABLE_CHUNKED_PREFILL", "true").lower() == "true"
self.num_speculative_tokens = int(os.environ.get("NUM_SPECULATIVE_TOKENS", "0"))
self.ngram_prompt_lookup_max = int(os.environ.get("NGRAM_PROMPT_LOOKUP_MAX", "0"))
self.enforce_eager = os.environ.get("ENFORCE_EAGER", "false").lower() == "true"
engine_kwargs: dict[str, Any] = {
    "model": self.model_id,
@@ -62,6 +63,7 @@ class LLMDeployment:
    "trust_remote_code": True,
    "enable_prefix_caching": self.enable_prefix_caching,
    "enable_chunked_prefill": self.enable_chunked_prefill,
"enforce_eager": self.enforce_eager,
}

# n-gram speculative decoding (no draft model needed)
@@ -75,6 +77,7 @@ class LLMDeployment:
print(f"Prefix caching: {self.enable_prefix_caching}")
print(f"Chunked prefill: {self.enable_chunked_prefill}")
print(f"Enforce eager (no torch.compile): {self.enforce_eager}")
engine_args = AsyncEngineArgs(**engine_kwargs)
self.engine = AsyncLLMEngine.from_engine_args(engine_args)
@@ -102,6 +105,7 @@ class LLMDeployment:
    "enable_chunked_prefill": str(self.enable_chunked_prefill),
    "num_speculative_tokens": str(self.num_speculative_tokens),
    "ngram_prompt_lookup_max": str(self.ngram_prompt_lookup_max),
"enforce_eager": str(self.enforce_eager),
    }
)
except ImportError: