overhaul image builds.

2026-02-06 07:47:37 -05:00
parent 38784f3a04
commit 5f1873908f
3 changed files with 23 additions and 21 deletions
--- a/dockerfiles/Dockerfile.ray-worker-strixhalo
+++ b/dockerfiles/Dockerfile.ray-worker-strixhalo
@@ -26,8 +26,8 @@ LABEL ray.version="2.53.0"

 WORKDIR /app

-# Copy ROCm stack from vendor image
-COPY --from=rocm-source /opt/rocm /opt/rocm
+# Copy ROCm stack from vendor image (--link makes this layer independent for better caching)
+COPY --link --from=rocm-source /opt/rocm /opt/rocm

 # ROCm environment variables - split to ensure ROCM_HOME is set first
 ENV ROCM_HOME=/opt/rocm
@@ -59,15 +59,8 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv

 USER ray

-# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
-# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
-# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
-RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
-    uv pip install --system \
-        --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
-        torch torchaudio torchvision
-
-# Install vLLM and inference dependencies (uv is 10-100x faster than pip)
+# Install vLLM and inference dependencies first.
+# vLLM pulls in CUDA torch as a dependency; the ROCm torch install that follows replaces it.
 RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
    uv pip install --system \
        'vllm>=0.5.0' \
@@ -77,6 +70,15 @@ RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
        'httpx>=0.27.0,<1.0' \
        'scipy>=1.11.0,<2.0'

+# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
+# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
+# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
+# Install AFTER vLLM to override the CUDA torch it pulled in.
+RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
+    uv pip install --system --reinstall \
+        --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
+        torch torchaudio torchvision
+
 # FIX: Ray base image has pandas 1.5.3 which is incompatible with numpy 2.x
 # The TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match
 RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \