diff --git a/ray_serve/serve_llm.py b/ray_serve/serve_llm.py
index 4ae11a6..265c87e 100644
--- a/ray_serve/serve_llm.py
+++ b/ray_serve/serve_llm.py
@@ -54,6 +54,7 @@ class LLMDeployment:
         self.enable_chunked_prefill = os.environ.get("ENABLE_CHUNKED_PREFILL", "true").lower() == "true"
         self.num_speculative_tokens = int(os.environ.get("NUM_SPECULATIVE_TOKENS", "0"))
         self.ngram_prompt_lookup_max = int(os.environ.get("NGRAM_PROMPT_LOOKUP_MAX", "0"))
+        self.enforce_eager = os.environ.get("ENFORCE_EAGER", "false").lower() == "true"
 
         engine_kwargs: dict[str, Any] = {
             "model": self.model_id,
@@ -62,6 +63,7 @@ class LLMDeployment:
             "trust_remote_code": True,
             "enable_prefix_caching": self.enable_prefix_caching,
             "enable_chunked_prefill": self.enable_chunked_prefill,
+            "enforce_eager": self.enforce_eager,
         }
 
         # n-gram speculative decoding (no draft model needed)
@@ -75,6 +77,7 @@ class LLMDeployment:
         print(f"Prefix caching: {self.enable_prefix_caching}")
         print(f"Chunked prefill: {self.enable_chunked_prefill}")
+        print(f"Enforce eager (no torch.compile): {self.enforce_eager}")
 
         engine_args = AsyncEngineArgs(**engine_kwargs)
         self.engine = AsyncLLMEngine.from_engine_args(engine_args)
 
@@ -102,6 +105,7 @@ class LLMDeployment:
                     "enable_chunked_prefill": str(self.enable_chunked_prefill),
                     "num_speculative_tokens": str(self.num_speculative_tokens),
                     "ngram_prompt_lookup_max": str(self.ngram_prompt_lookup_max),
+                    "enforce_eager": str(self.enforce_eager),
                 }
             )
         except ImportError:
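
For context, the new flag follows the same boolean env-var pattern as the existing toggles: any casing of "true" enables it, and anything else (including an unset variable) falls back to the False default. Below is a minimal standalone sketch of that parsing plus a hypothetical deploy-time invocation; the `app` binding in the `serve run` target is an assumption, not something shown in this diff:

    # Standalone sketch of the boolean env-var parsing used in this diff.
    import os

    os.environ["ENFORCE_EAGER"] = "True"  # casing is normalized by .lower()
    assert os.environ.get("ENFORCE_EAGER", "false").lower() == "true"

    os.environ.pop("ENFORCE_EAGER", None)  # unset -> falls back to the default
    assert os.environ.get("ENFORCE_EAGER", "false").lower() != "true"

    # Hypothetical usage when launching the deployment (app target assumed):
    #   ENFORCE_EAGER=true serve run ray_serve.serve_llm:app

As a design note: in vLLM, `enforce_eager=True` skips CUDA graph capture (and compilation on newer engine versions), which typically trades some per-token latency for faster startup and lower memory overhead, so exposing it as an opt-in toggle that defaults to false keeps the existing behavior unchanged.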