diff --git a/ray_serve/serve_llm.py b/ray_serve/serve_llm.py
index 265c87e..1992083 100644
--- a/ray_serve/serve_llm.py
+++ b/ray_serve/serve_llm.py
@@ -16,10 +16,10 @@ class LLMDeployment:
     def __init__(self):
         # Workaround: vLLM's rocm.py:verify_quantization unconditionally
         # sets VLLM_USE_TRITON_AWQ=1 (a bug — the os.environ line is
-        # outside the if-block). Triton AWQ kernels can't compile for
-        # gfx1151, so we must keep it at 0. Monkey-patch before engine
-        # creation to prevent the override.
-        os.environ["VLLM_USE_TRITON_AWQ"] = "0"
+        # outside the if-block). Monkey-patch before engine creation to
+        # preserve whatever the operator set in runtime_env env_vars.
+        # On gfx1151 Triton AWQ works; the C++ awq_dequantize does NOT.
+        os.environ.setdefault("VLLM_USE_TRITON_AWQ", "1")
 
         try:
             from vllm.platforms.rocm import RocmPlatform
@@ -34,7 +34,8 @@ class LLMDeployment:
                 os.environ["VLLM_USE_TRITON_AWQ"] = saved
 
             RocmPlatform.verify_quantization = _patched_verify
-            print("Patched RocmPlatform.verify_quantization to preserve VLLM_USE_TRITON_AWQ=0")
+            triton_val = os.environ.get("VLLM_USE_TRITON_AWQ", "?")
+            print(f"Patched RocmPlatform.verify_quantization to preserve VLLM_USE_TRITON_AWQ={triton_val}")
         except Exception as exc:
             print(f"Could not patch RocmPlatform: {exc}")
 