From 96f7650b231f044287cb56723857894f3c96d96d Mon Sep 17 00:00:00 2001 From: "Billy D." Date: Fri, 13 Feb 2026 07:29:57 -0500 Subject: [PATCH] fix: respect VLLM_USE_TRITON_AWQ from runtime_env instead of hardcoding 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous code unconditionally set VLLM_USE_TRITON_AWQ=0, overriding the value from the RayService runtime_env env_vars. On gfx1151: - Triton AWQ kernels work (TRITON_AWQ=1) - C++ awq_dequantize op does NOT exist (TRITON_AWQ=0 → crash) Changed to os.environ.setdefault('VLLM_USE_TRITON_AWQ', '1') so the operator-configured value is preserved, defaulting to Triton AWQ. --- ray_serve/serve_llm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ray_serve/serve_llm.py b/ray_serve/serve_llm.py index 265c87e..1992083 100644 --- a/ray_serve/serve_llm.py +++ b/ray_serve/serve_llm.py @@ -16,10 +16,10 @@ class LLMDeployment: def __init__(self): # Workaround: vLLM's rocm.py:verify_quantization unconditionally # sets VLLM_USE_TRITON_AWQ=1 (a bug — the os.environ line is - # outside the if-block). Triton AWQ kernels can't compile for - # gfx1151, so we must keep it at 0. Monkey-patch before engine - # creation to prevent the override. - os.environ["VLLM_USE_TRITON_AWQ"] = "0" + # outside the if-block). Monkey-patch before engine creation to + # preserve whatever the operator set in runtime_env env_vars. + # On gfx1151 Triton AWQ works; the C++ awq_dequantize does NOT. + os.environ.setdefault("VLLM_USE_TRITON_AWQ", "1") try: from vllm.platforms.rocm import RocmPlatform @@ -34,7 +34,8 @@ class LLMDeployment: os.environ["VLLM_USE_TRITON_AWQ"] = saved RocmPlatform.verify_quantization = _patched_verify - print("Patched RocmPlatform.verify_quantization to preserve VLLM_USE_TRITON_AWQ=0") + triton_val = os.environ.get("VLLM_USE_TRITON_AWQ", "?") + print(f"Patched RocmPlatform.verify_quantization to preserve VLLM_USE_TRITON_AWQ={triton_val}") except Exception as exc: print(f"Could not patch RocmPlatform: {exc}")