Overhaul image builds.
Some checks failed
Build and Push Images / determine-version (push) Successful in 5s
Build and Push Images / build-nvidia (push) Failing after 21s
Build and Push Images / build-rdna2 (push) Failing after 21s
Build and Push Images / build-strixhalo (push) Failing after 12s
Build and Push Images / build-intel (push) Failing after 19s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s
Some checks failed
Build and Push Images / determine-version (push) Successful in 5s
Build and Push Images / build-nvidia (push) Failing after 21s
Build and Push Images / build-rdna2 (push) Failing after 21s
Build and Push Images / build-strixhalo (push) Failing after 12s
Build and Push Images / build-intel (push) Failing after 19s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s
This commit is contained in:
@@ -148,8 +148,8 @@ jobs:
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache,mode=max,image-manifest=true,compression=zstd
|
||||
|
||||
build-rdna2:
|
||||
needs: [determine-version]
|
||||
@@ -211,8 +211,8 @@ jobs:
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache,mode=max,image-manifest=true,compression=zstd
|
||||
|
||||
build-strixhalo:
|
||||
needs: [determine-version]
|
||||
@@ -274,8 +274,8 @@ jobs:
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache,mode=max,image-manifest=true,compression=zstd
|
||||
|
||||
build-intel:
|
||||
needs: [determine-version]
|
||||
@@ -337,8 +337,8 @@ jobs:
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache,mode=max,image-manifest=true,compression=zstd
|
||||
|
||||
release:
|
||||
name: Release
|
||||
|
||||
@@ -25,8 +25,8 @@ LABEL ray.version="2.53.0"
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy ROCm stack from vendor image (single COPY layer)
|
||||
COPY --from=rocm-source /opt/rocm /opt/rocm
|
||||
# Copy ROCm stack from vendor image (--link makes this layer independent for better caching)
|
||||
COPY --link --from=rocm-source /opt/rocm /opt/rocm
|
||||
|
||||
# ROCm environment variables - split to ensure ROCM_HOME is set first
|
||||
ENV ROCM_HOME=/opt/rocm
|
||||
|
||||
@@ -26,8 +26,8 @@ LABEL ray.version="2.53.0"
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy ROCm stack from vendor image
|
||||
COPY --from=rocm-source /opt/rocm /opt/rocm
|
||||
# Copy ROCm stack from vendor image (--link makes this layer independent for better caching)
|
||||
COPY --link --from=rocm-source /opt/rocm /opt/rocm
|
||||
|
||||
# ROCm environment variables - split to ensure ROCM_HOME is set first
|
||||
ENV ROCM_HOME=/opt/rocm
|
||||
@@ -59,15 +59,8 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||
|
||||
USER ray
|
||||
|
||||
# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
|
||||
# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
|
||||
# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
|
||||
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||
uv pip install --system \
|
||||
--index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
|
||||
torch torchaudio torchvision
|
||||
|
||||
# Install vLLM and inference dependencies (uv is 10-100x faster than pip)
|
||||
# Install vLLM and inference dependencies first (without torch)
|
||||
# vLLM will try to install CUDA torch as a dependency; we exclude it here
|
||||
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||
uv pip install --system \
|
||||
'vllm>=0.5.0' \
|
||||
@@ -77,6 +70,15 @@ RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||
'httpx>=0.27.0,<1.0' \
|
||||
'scipy>=1.11.0,<2.0'
|
||||
|
||||
# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause a segfault
|
||||
# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
|
||||
# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
|
||||
# Install AFTER vLLM to override the CUDA torch it pulled in.
|
||||
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||
uv pip install --system --reinstall \
|
||||
--index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
|
||||
torch torchaudio torchvision
|
||||
|
||||
# FIX: The Ray base image has pandas 1.5.3, which is incompatible with numpy 2.x
|
||||
# TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match
|
||||
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||
|
||||
Reference in New Issue
Block a user