fix: add ENFORCE_EAGER env var to skip torch.compile on ROCm
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 12s

torch._dynamo.exc.Unsupported crashes EngineCore during graph tracing
of LlamaDecoderLayer on gfx1151.  ENFORCE_EAGER=true bypasses
torch.compile and CUDA graph capture entirely.
This commit is contained in:
2026-02-13 06:56:29 -05:00
parent 6a391147a6
commit f66de251eb

View File

@@ -54,6 +54,7 @@ class LLMDeployment:
self.enable_chunked_prefill = os.environ.get("ENABLE_CHUNKED_PREFILL", "true").lower() == "true"
self.num_speculative_tokens = int(os.environ.get("NUM_SPECULATIVE_TOKENS", "0"))
self.ngram_prompt_lookup_max = int(os.environ.get("NGRAM_PROMPT_LOOKUP_MAX", "0"))
self.enforce_eager = os.environ.get("ENFORCE_EAGER", "false").lower() == "true"
engine_kwargs: dict[str, Any] = {
    "model": self.model_id,
@@ -62,6 +63,7 @@ class LLMDeployment:
    "trust_remote_code": True,
    "enable_prefix_caching": self.enable_prefix_caching,
    "enable_chunked_prefill": self.enable_chunked_prefill,
"enforce_eager": self.enforce_eager,
}

# n-gram speculative decoding (no draft model needed)
@@ -75,6 +77,7 @@ class LLMDeployment:
print(f"Prefix caching: {self.enable_prefix_caching}")
print(f"Chunked prefill: {self.enable_chunked_prefill}")
print(f"Enforce eager (no torch.compile): {self.enforce_eager}")
engine_args = AsyncEngineArgs(**engine_kwargs)
self.engine = AsyncLLMEngine.from_engine_args(engine_args)
@@ -102,6 +105,7 @@ class LLMDeployment:
    "enable_chunked_prefill": str(self.enable_chunked_prefill),
    "num_speculative_tokens": str(self.num_speculative_tokens),
    "ngram_prompt_lookup_max": str(self.ngram_prompt_lookup_max),
"enforce_eager": str(self.enforce_eager),
    }
)
except ImportError: