overhaul image builds.

2026-02-06 07:47:37 -05:00
parent 38784f3a04
commit 5f1873908f
3 changed files with 23 additions and 21 deletions
--- a/.gitea/workflows/build-push.yaml
+++ b/.gitea/workflows/build-push.yaml
@@ -148,8 +148,8 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache
-          cache-to: type=gha,mode=max
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache,mode=max,image-manifest=true,compression=zstd
  build-rdna2:
    needs: [determine-version]
@@ -211,8 +211,8 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache
-          cache-to: type=gha,mode=max
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache,mode=max,image-manifest=true,compression=zstd
  build-strixhalo:
    needs: [determine-version]
@@ -274,8 +274,8 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache
-          cache-to: type=gha,mode=max
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache,mode=max,image-manifest=true,compression=zstd
  build-intel:
    needs: [determine-version]
@@ -337,8 +337,8 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache
-          cache-to: type=gha,mode=max
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache,mode=max,image-manifest=true,compression=zstd
  release:
    name: Release
--- a/dockerfiles/Dockerfile.ray-worker-rdna2
+++ b/dockerfiles/Dockerfile.ray-worker-rdna2
@@ -25,8 +25,8 @@ LABEL ray.version="2.53.0"
 WORKDIR /app
-# Copy ROCm stack from vendor image (single COPY layer)
+# Copy ROCm stack from vendor image (--link makes this layer independent for better caching)
-COPY --from=rocm-source /opt/rocm /opt/rocm
+COPY --link --from=rocm-source /opt/rocm /opt/rocm
 # ROCm environment variables - split to ensure ROCM_HOME is set first
 ENV ROCM_HOME=/opt/rocm
--- a/dockerfiles/Dockerfile.ray-worker-strixhalo
+++ b/dockerfiles/Dockerfile.ray-worker-strixhalo
@@ -26,8 +26,8 @@ LABEL ray.version="2.53.0"
 WORKDIR /app
-# Copy ROCm stack from vendor image
+# Copy ROCm stack from vendor image (--link makes this layer independent for better caching)
-COPY --from=rocm-source /opt/rocm /opt/rocm
+COPY --link --from=rocm-source /opt/rocm /opt/rocm
 # ROCm environment variables - split to ensure ROCM_HOME is set first
 ENV ROCM_HOME=/opt/rocm
@@ -59,15 +59,8 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 USER ray
-# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
+# Install vLLM and inference dependencies first (without torch)
-# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
+# vLLM pulls in CUDA torch as a dependency; the ROCm torch wheels are reinstalled over it in a later step
 # TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
 RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
    uv pip install --system \
        --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
        torch torchaudio torchvision
 # Install vLLM and inference dependencies (uv is 10-100x faster than pip)
 RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
    uv pip install --system \
        'vllm>=0.5.0' \
@@ -77,6 +70,15 @@ RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
        'httpx>=0.27.0,<1.0' \
        'scipy>=1.11.0,<2.0'
 # WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
 # in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
 # TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
 # Install AFTER vLLM to override the CUDA torch it pulled in.
 RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
    uv pip install --system --reinstall \
        --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
        torch torchaudio torchvision
 # FIX: Ray base image has pandas 1.5.3 which is incompatible with numpy 2.x
 # The TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match
 RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \