From fcc9781d427232fd7bbb56c71c9828b86ac3a616 Mon Sep 17 00:00:00 2001
From: "Billy D."
Date: Mon, 9 Feb 2026 11:08:33 -0500
Subject: [PATCH] different rocm ; ;

---
 dockerfiles/Dockerfile.ray-worker-strixhalo | 31 ++++++---------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/dockerfiles/Dockerfile.ray-worker-strixhalo b/dockerfiles/Dockerfile.ray-worker-strixhalo
index 6d7de9b..bde7458 100644
--- a/dockerfiles/Dockerfile.ray-worker-strixhalo
+++ b/dockerfiles/Dockerfile.ray-worker-strixhalo
@@ -60,39 +60,26 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 
 USER ray
 
-# Install vLLM ROCm build and inference dependencies.
-# The vLLM ROCm wheel from wheels.vllm.ai includes HIP-compiled C-extensions
-# (vllm._C, vllm._rocm_C) that are ABI-compatible with ROCm PyTorch.
-# PyPI vLLM is CUDA-only and crashes with: libcudart.so.12 not found.
-# uv gives --extra-index-url higher priority than PyPI, so the ROCm wheel
-# is selected over the CUDA wheel.
+# Install vLLM ROCm build, torch ROCm 7.0, and inference dependencies.
+# IMPORTANT: vLLM ROCm wheel (0.15.1+rocm700) has C-extensions compiled against
+# the official PyTorch ROCm 7.0 ABI. We MUST use torch from the same ROCm 7.0
+# index — TheRock nightlies have an incompatible c10::hip ABI that causes
+# undefined symbol errors in vllm._C and vllm._rocm_C.
+# HSA_OVERRIDE_GFX_VERSION=11.0.0 makes gfx1151 appear as gfx1100 to this torch.
 RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
     uv pip install --system \
     --extra-index-url https://wheels.vllm.ai/rocm/ \
+    --extra-index-url https://download.pytorch.org/whl/rocm7.0 \
     vllm \
+    torch torchaudio torchvision \
     'transformers>=4.35.0,<5.0' \
     'accelerate>=0.25.0,<1.0' \
     'sentence-transformers>=2.3.0,<3.0' \
     'httpx>=0.27.0,<1.0' \
     'scipy>=1.11.0,<2.0'
 
-# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
-# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
-# TheRock gfx110X-all packages provide compatible Python 3.12 wheels.
-# Reinstall AFTER vLLM to override the standard ROCm torch it pulled in.
-# vLLM's ROCm C-extensions remain compatible (same HIP ABI, torch 2.10.x).
-RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
-    uv pip install --system --reinstall \
-    --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
-    torch torchaudio torchvision
-
-# FIX: Uninstall flash_attn — it was compiled against the vLLM ROCm wheel's
-# PyTorch, but the TheRock nightly above has a different c10::hip ABI.
-# vLLM ROCm uses its own Triton/CK attention backends, so flash_attn is not needed.
-RUN pip uninstall -y flash-attn 2>/dev/null || true
-
 # FIX: Ray base image has pandas 1.5.3 which is incompatible with numpy 2.x
-# The TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match.
+# PyTorch ROCm 7.0 requires numpy 2.x, so upgrade pandas to match.
 # Pin numpy <2.3 because numba (required by vLLM for speculative decoding)
 # does not yet support numpy 2.3+.
 RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \