From 5f1873908fa3aa4ba3a5bbb63f6faa2e4feac63d Mon Sep 17 00:00:00 2001 From: "Billy D." Date: Fri, 6 Feb 2026 07:47:37 -0500 Subject: [PATCH] overhaul image builds. --- .gitea/workflows/build-push.yaml | 16 +++++++------- dockerfiles/Dockerfile.ray-worker-rdna2 | 4 ++-- dockerfiles/Dockerfile.ray-worker-strixhalo | 24 +++++++++++---------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/.gitea/workflows/build-push.yaml b/.gitea/workflows/build-push.yaml index 5dfe2c1..8f87282 100644 --- a/.gitea/workflows/build-push.yaml +++ b/.gitea/workflows/build-push.yaml @@ -148,8 +148,8 @@ jobs: push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache,mode=max,image-manifest=true,compression=zstd build-rdna2: needs: [determine-version] @@ -211,8 +211,8 @@ jobs: push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache,mode=max,image-manifest=true,compression=zstd build-strixhalo: needs: [determine-version] @@ -274,8 +274,8 @@ jobs: push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache,mode=max,image-manifest=true,compression=zstd build-intel: needs: [determine-version] @@ -337,8 +337,8 @@ jobs: push: ${{ github.event_name 
!= 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache,mode=max,image-manifest=true,compression=zstd release: name: Release diff --git a/dockerfiles/Dockerfile.ray-worker-rdna2 b/dockerfiles/Dockerfile.ray-worker-rdna2 index 6b7bbec..48d0923 100644 --- a/dockerfiles/Dockerfile.ray-worker-rdna2 +++ b/dockerfiles/Dockerfile.ray-worker-rdna2 @@ -25,8 +25,8 @@ LABEL ray.version="2.53.0" WORKDIR /app -# Copy ROCm stack from vendor image (single COPY layer) -COPY --from=rocm-source /opt/rocm /opt/rocm +# Copy ROCm stack from vendor image (--link makes this layer independent for better caching) +COPY --link --from=rocm-source /opt/rocm /opt/rocm # ROCm environment variables - split to ensure ROCM_HOME is set first ENV ROCM_HOME=/opt/rocm diff --git a/dockerfiles/Dockerfile.ray-worker-strixhalo b/dockerfiles/Dockerfile.ray-worker-strixhalo index b4f5c3c..94bd71a 100644 --- a/dockerfiles/Dockerfile.ray-worker-strixhalo +++ b/dockerfiles/Dockerfile.ray-worker-strixhalo @@ -26,8 +26,8 @@ LABEL ray.version="2.53.0" WORKDIR /app -# Copy ROCm stack from vendor image -COPY --from=rocm-source /opt/rocm /opt/rocm +# Copy ROCm stack from vendor image (--link makes this layer independent for better caching) +COPY --link --from=rocm-source /opt/rocm /opt/rocm # ROCm environment variables - split to ensure ROCM_HOME is set first ENV ROCM_HOME=/opt/rocm @@ -59,15 +59,8 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv USER ray -# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault -# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo). -# TheRock gfx110X-all packages provide compatible Python 3.11 wheels. 
-RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \ - uv pip install --system \ - --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \ - torch torchaudio torchvision - -# Install vLLM and inference dependencies (uv is 10-100x faster than pip) +# Install vLLM and inference dependencies first (torch is handled separately below) +# vLLM pulls in a CUDA build of torch as a dependency; the ROCm wheels installed below override it RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \ uv pip install --system \ 'vllm>=0.5.0' \ @@ -77,6 +70,15 @@ RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \ 'httpx>=0.27.0,<1.0' \ 'scipy>=1.11.0,<2.0' +# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault +# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo). +# TheRock gfx110X-all packages provide compatible Python 3.11 wheels. +# Install AFTER vLLM to override the CUDA torch it pulled in. +RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \ + uv pip install --system --reinstall \ + --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \ + torch torchaudio torchvision + # FIX: Ray base image has pandas 1.5.3 which is incompatible with numpy 2.x # The TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \