overhaul image builds.
Some checks failed
Build and Push Images / determine-version (push) Successful in 5s
Build and Push Images / build-nvidia (push) Failing after 21s
Build and Push Images / build-rdna2 (push) Failing after 21s
Build and Push Images / build-strixhalo (push) Failing after 12s
Build and Push Images / build-intel (push) Failing after 19s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s

This commit is contained in:
2026-02-06 07:47:37 -05:00
parent 38784f3a04
commit 5f1873908f
3 changed files with 23 additions and 21 deletions

View File

@@ -26,8 +26,8 @@ LABEL ray.version="2.53.0"
WORKDIR /app
# Copy ROCm stack from vendor image
COPY --from=rocm-source /opt/rocm /opt/rocm
# Copy ROCm stack from vendor image (--link makes this layer independent for better caching)
COPY --link --from=rocm-source /opt/rocm /opt/rocm
# ROCm environment variables - split to ensure ROCM_HOME is set first
ENV ROCM_HOME=/opt/rocm
@@ -59,15 +59,8 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
USER ray
# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
uv pip install --system \
--index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
torch torchaudio torchvision
# Install vLLM and inference dependencies (uv is 10-100x faster than pip)
# Install vLLM and inference dependencies first, without listing torch explicitly.
# vLLM pulls in CUDA torch as a dependency; the ROCm torch install further down overrides it.
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
uv pip install --system \
'vllm>=0.5.0' \
@@ -77,6 +70,15 @@ RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
'httpx>=0.27.0,<1.0' \
'scipy>=1.11.0,<2.0'
# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause a segfault
# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
# Install AFTER vLLM to override the CUDA torch it pulled in; --reinstall
# makes uv replace the already-installed torch packages with the builds
# served from the index given by --index-url.
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
uv pip install --system --reinstall \
--index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
torch torchaudio torchvision
# FIX: The Ray base image ships pandas 1.5.3, which is incompatible with numpy 2.x.
# TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match.
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \