overhaul image builds.
Some checks failed
Build and Push Images / determine-version (push) Successful in 5s
Build and Push Images / build-nvidia (push) Failing after 21s
Build and Push Images / build-rdna2 (push) Failing after 21s
Build and Push Images / build-strixhalo (push) Failing after 12s
Build and Push Images / build-intel (push) Failing after 19s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s
Some checks failed
Build and Push Images / determine-version (push) Successful in 5s
Build and Push Images / build-nvidia (push) Failing after 21s
Build and Push Images / build-rdna2 (push) Failing after 21s
Build and Push Images / build-strixhalo (push) Failing after 12s
Build and Push Images / build-intel (push) Failing after 19s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s
This commit is contained in:
@@ -148,8 +148,8 @@ jobs:
|
|||||||
push: ${{ github.event_name != 'pull_request' }}
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
cache-from: type=gha
|
cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache
|
||||||
cache-to: type=gha,mode=max
|
cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-nvidia:buildcache,mode=max,image-manifest=true,compression=zstd
|
||||||
|
|
||||||
build-rdna2:
|
build-rdna2:
|
||||||
needs: [determine-version]
|
needs: [determine-version]
|
||||||
@@ -211,8 +211,8 @@ jobs:
|
|||||||
push: ${{ github.event_name != 'pull_request' }}
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
cache-from: type=gha
|
cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache
|
||||||
cache-to: type=gha,mode=max
|
cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-rdna2:buildcache,mode=max,image-manifest=true,compression=zstd
|
||||||
|
|
||||||
build-strixhalo:
|
build-strixhalo:
|
||||||
needs: [determine-version]
|
needs: [determine-version]
|
||||||
@@ -274,8 +274,8 @@ jobs:
|
|||||||
push: ${{ github.event_name != 'pull_request' }}
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
cache-from: type=gha
|
cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache
|
||||||
cache-to: type=gha,mode=max
|
cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-strixhalo:buildcache,mode=max,image-manifest=true,compression=zstd
|
||||||
|
|
||||||
build-intel:
|
build-intel:
|
||||||
needs: [determine-version]
|
needs: [determine-version]
|
||||||
@@ -337,8 +337,8 @@ jobs:
|
|||||||
push: ${{ github.event_name != 'pull_request' }}
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
cache-from: type=gha
|
cache-from: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache
|
||||||
cache-to: type=gha,mode=max
|
cache-to: type=registry,ref=${{ env.REGISTRY }}/ray-worker-intel:buildcache,mode=max,image-manifest=true,compression=zstd
|
||||||
|
|
||||||
release:
|
release:
|
||||||
name: Release
|
name: Release
|
||||||
|
|||||||
@@ -25,8 +25,8 @@ LABEL ray.version="2.53.0"
|
|||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Copy ROCm stack from vendor image (single COPY layer)
|
# Copy ROCm stack from vendor image (--link makes this layer independent for better caching)
|
||||||
COPY --from=rocm-source /opt/rocm /opt/rocm
|
COPY --link --from=rocm-source /opt/rocm /opt/rocm
|
||||||
|
|
||||||
# ROCm environment variables - split to ensure ROCM_HOME is set first
|
# ROCm environment variables - split to ensure ROCM_HOME is set first
|
||||||
ENV ROCM_HOME=/opt/rocm
|
ENV ROCM_HOME=/opt/rocm
|
||||||
|
|||||||
@@ -26,8 +26,8 @@ LABEL ray.version="2.53.0"
|
|||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Copy ROCm stack from vendor image
|
# Copy ROCm stack from vendor image (--link makes this layer independent for better caching)
|
||||||
COPY --from=rocm-source /opt/rocm /opt/rocm
|
COPY --link --from=rocm-source /opt/rocm /opt/rocm
|
||||||
|
|
||||||
# ROCm environment variables - split to ensure ROCM_HOME is set first
|
# ROCm environment variables - split to ensure ROCM_HOME is set first
|
||||||
ENV ROCM_HOME=/opt/rocm
|
ENV ROCM_HOME=/opt/rocm
|
||||||
@@ -59,15 +59,8 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
|||||||
|
|
||||||
USER ray
|
USER ray
|
||||||
|
|
||||||
# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
|
# Install vLLM and inference dependencies first (without torch)
|
||||||
# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
|
# vLLM will try to install CUDA torch as dependency, we exclude it here
|
||||||
# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
|
|
||||||
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
|
||||||
uv pip install --system \
|
|
||||||
--index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
|
|
||||||
torch torchaudio torchvision
|
|
||||||
|
|
||||||
# Install vLLM and inference dependencies (uv is 10-100x faster than pip)
|
|
||||||
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
uv pip install --system \
|
uv pip install --system \
|
||||||
'vllm>=0.5.0' \
|
'vllm>=0.5.0' \
|
||||||
@@ -77,6 +70,15 @@ RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
|||||||
'httpx>=0.27.0,<1.0' \
|
'httpx>=0.27.0,<1.0' \
|
||||||
'scipy>=1.11.0,<2.0'
|
'scipy>=1.11.0,<2.0'
|
||||||
|
|
||||||
|
# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
|
||||||
|
# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
|
||||||
|
# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
|
||||||
|
# Install AFTER vLLM to override the CUDA torch it pulled in.
|
||||||
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
|
uv pip install --system --reinstall \
|
||||||
|
--index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
|
||||||
|
torch torchaudio torchvision
|
||||||
|
|
||||||
# FIX: Ray base image has pandas 1.5.3 which is incompatible with numpy 2.x
|
# FIX: Ray base image has pandas 1.5.3 which is incompatible with numpy 2.x
|
||||||
# The TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match
|
# The TheRock PyTorch wheels require numpy 2.x, so upgrade pandas to match
|
||||||
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
|
|||||||
Reference in New Issue
Block a user