From 5f1873908fa3aa4ba3a5bbb63f6faa2e4feac63d Mon Sep 17 00:00:00 2001 From: "Billy D." Date: Fri, 6 Feb 2026 07:47:37 -0500 Subject: [PATCH] overhaul image builds. --- .gitea/workflows/build-push.yaml | 16 +++++++------- dockerfiles/Dockerfile.ray-worker-rdna2 | 4 ++-- dockerfiles/Dockerfile.ray-worker-strixhalo | 24 +++++++++++---------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/.gitea/workflows/build-push.yaml b/.gitea/workflows/build-push.yaml index 5dfe2c1..8f87282 100644 --- a/.gitea/workflows/build-push.yaml +++ b/.gitea/workflows/build-push.yaml @@ -148,8 +148,8 @@ jobs: push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache,mode=max,image-manifest=true,compression=zstd build-rdna2: needs: [determine-version] @@ -211,8 +211,8 @@ jobs: push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache,mode=max,image-manifest=true,compression=zstd build-strixhalo: needs: [determine-version] @@ -274,8 +274,8 @@ jobs: push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache,mode=max,image-manifest=true,compression=zstd build-intel: needs: [determine-version] @@ -337,8 +337,8 @@ jobs: push: ${{ github.event_name 
!= 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max + cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache,mode=max,image-manifest=true,compression=zstd release: name: Release diff --git a/dockerfiles/Dockerfile.ray-worker-rdna2 b/dockerfiles/Dockerfile.ray-worker-rdna2 index 6b7bbec..48d0923 100644 --- a/dockerfiles/Dockerfile.ray-worker-rdna2 +++ b/dockerfiles/Dockerfile.ray-worker-rdna2 @@ -25,8 +25,8 @@ LABEL ray.version="2.53.0" WORKDIR /app -# Copy ROCm stack from vendor image (single COPY layer) -COPY --from=rocm-source /opt/rocm /opt/rocm +# Copy ROCm stack from vendor image (--link makes this layer independent for better caching) +COPY --link --from=rocm-source /opt/rocm /opt/rocm # ROCm environment variables - split to ensure ROCM_HOME is set first ENV ROCM_HOME=/opt/rocm diff --git a/dockerfiles/Dockerfile.ray-worker-strixhalo b/dockerfiles/Dockerfile.ray-worker-strixhalo index b4f5c3c..94bd71a 100644 --- a/dockerfiles/Dockerfile.ray-worker-strixhalo +++ b/dockerfiles/Dockerfile.ray-worker-strixhalo @@ -26,8 +26,8 @@ LABEL ray.version="2.53.0" WORKDIR /app -# Copy ROCm stack from vendor image -COPY --from=rocm-source /opt/rocm /opt/rocm +# Copy ROCm stack from vendor image (--link makes this layer independent for better caching) +COPY --link --from=rocm-source /opt/rocm /opt/rocm # ROCm environment variables - split to ensure ROCM_HOME is set first ENV ROCM_HOME=/opt/rocm @@ -59,15 +59,8 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv USER ray -# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault -# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo). -# TheRock gfx110X-all packages provide compatible Python 3.11 wheels. 
-RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \ - uv pip install --system \ - --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \ - torch torchaudio torchvision - -# Install vLLM and inference dependencies (uv is 10-100x faster than pip) +# Install vLLM and inference dependencies first (torch is handled separately below) +# vLLM pulls in a CUDA build of torch as a dependency; the ROCm wheels installed below override it RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \ uv pip install --system \ 'vllm>=0.5.0' \ @@ -77,6 +70,15 @@ RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \ 'httpx>=0.27.0,<1.0' \ 'scipy>=1.11.0,<2.0' +# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault +# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo). +# TheRock gfx110X-all packages provide compatible Python 3.11 wheels. +# Install AFTER vLLM to override the CUDA torch it pulled in. +RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \ + uv pip install --system --reinstall \ + --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \ + torch torchaudio torchvision + # FIX: Ray base image has pandas 1.5.3 which is incompatible with numpy 2.x # The TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \