fix(strixhalo): remove apt-get layer that corrupts vendor hipcc

The vendor image (rocm/pytorch:rocm7.0.2) ships all needed runtime packages. Any apt-get install triggers ROCm repo dependency resolution that "upgrades" the vendor hipcc package 1.1.1.70002 (the ROCm 7.0.2 hipcc) to Ubuntu's hipcc 5.7.1, whose hipconfig.pl reports HIP version 0.0.0 → cmake can't find HIP. Changes: - Remove the entire apt-get layer (git, ccache, runtime libs all pre-installed) - Keep only the ray user creation from that RUN block - Add detailed comments explaining why apt-get must never be used. Combined with cmake<4 (downgrade from 4.0.0) and HIP_ROOT_DIR=/opt/rocm from prior commits, this produces a successful build (attempt 5). Verified: torch 2.9.1+rocm7.0.2 (vendor), vllm 0.15.2.dev0 (source-built).
2026-02-09 18:24:50 -05:00
parent 2e3fbb8c60
commit 8adaef62a2
1 changed files with 32 additions and 22 deletions
--- a/dockerfiles/Dockerfile.ray-worker-strixhalo
+++ b/dockerfiles/Dockerfile.ray-worker-strixhalo
@@ -1,4 +1,3 @@
-# syntax=docker/dockerfile:1.7
 # AMD Strix Halo Ray Worker for khelben (gfx1151 / RDNA 3.5)
 # Used for: vLLM (Llama 3.1 70B)
 #
@@ -43,11 +42,12 @@ WORKDIR /app
 # The vendor image ships a venv at /opt/venv with Python 3.12 + torch 2.9.1.
 # All pip installs go into this venv.
 ENV ROCM_HOME=/opt/rocm \
-    VIRTUAL_ENV=/opt/venv
+    VIRTUAL_ENV=/opt/venv \
+    HIP_CLANG_PATH=/opt/rocm/llvm/bin
 ENV PATH="/opt/venv/bin:/opt/rocm/bin:/opt/rocm/llvm/bin:/home/ray/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \
    LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64:/opt/venv/lib" \
    HSA_PATH="/opt/rocm/hsa" \
-    HIP_PATH="/opt/rocm/hip" \
+    HIP_PATH="/opt/rocm" \
    # Strix Halo (gfx1151 / RDNA 3.5) runtime settings
    HIP_VISIBLE_DEVICES=0 \
    HSA_ENABLE_SDMA=0 \
@@ -55,18 +55,20 @@ ENV PATH="/opt/venv/bin:/opt/rocm/bin:/opt/rocm/llvm/bin:/home/ray/.local/bin:/u
    HSA_OVERRIDE_GFX_VERSION="11.0.0" \
    ROCM_TARGET_LST="gfx1151,gfx1100"

-# ── System dependencies + build tools ───────────────────────────────────
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get update && apt-get install -y --no-install-recommends \
-        # Runtime
-        libelf1 libnuma1 libdrm2 libdrm-amdgpu1 kmod libopenmpi3 \
-        # Build (for vLLM C++/HIP compilation)
-        git cmake ninja-build ccache \
-    && rm -rf /var/lib/apt/lists/* \
-    # Create ray user (uid 1000 / gid 100) for KubeRay.
-    # Vendor image may already have UID 1000 — rename or create.
-    && (groupadd -g 100 -o users 2>/dev/null || true) \
+# ── System setup ─────────────────────────────────────────────────────────
+# The vendor image already ships ALL needed system packages: hipcc 7.0.2,
+#   clang++ 20.0 (AMD ROCm LLVM), git, libelf, libnuma, libdrm, libopenmpi3,
+#   HIP dev headers/cmake configs, and cmake 4.0 (downgraded to 3.x via pip below).
+#
+# CRITICAL: Do NOT run apt-get upgrade or install ANY packages from apt.
+# Even installing ccache triggers a dependency cascade that pulls in
+# Ubuntu's hipcc 5.7.1 (which overwrites the vendor hipcc 7.0.2) and
+# a broken /usr/bin/hipconfig.pl that makes cmake find_package(hip)
+# report version 0.0.0 → "Can't find CUDA or HIP installation."
+#
+# Create ray user (uid 1000 / gid 100) for KubeRay.
+# Vendor image may already have UID 1000 — rename or create.
+RUN (groupadd -g 100 -o users 2>/dev/null || true) \
    && existing=$(getent passwd 1000 | cut -d: -f1) \
    && if [ -n "$existing" ] && [ "$existing" != "ray" ]; then \
           usermod -l ray -d /home/ray -m -s /bin/bash "$existing"; \
@@ -79,9 +81,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv

 # ── Python build dependencies ──────────────────────────────────────────
+# CRITICAL: vLLM requires cmake<4.  The vendor image ships cmake 4.0.0
+# which changed find_package(MODULE) behaviour and breaks FindHIP.cmake
+# (reports HIP version 0.0.0).  Downgrade to 3.x per vLLM's rocm-build.txt.
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --python /opt/venv/bin/python3 \
-        'cmake>=3.26.1' \
+        'cmake>=3.26.1,<4' \
        ninja \
        'packaging>=24.2' \
        'setuptools>=77.0.3,<80.0.0' \
@@ -108,17 +113,22 @@ RUN if [ -f use_existing_torch.py ]; then \
    fi

 # Compile C++/HIP extensions and install the vLLM Python package.
-#   --no-build-isolation : use vendor torch + our build deps directly
-#   --no-deps            : we install runtime deps ourselves (below)
+# vLLM's setup.py passes -DROCM_PATH=$ROCM_HOME to cmake automatically.
+# HIP_ROOT_DIR tells FindHIP.cmake where to look for hipconfig.
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} \
    VLLM_TARGET_DEVICE=rocm \
    MAX_JOBS=${MAX_JOBS} \
    CMAKE_BUILD_TYPE=Release \
+    HIP_ROOT_DIR=/opt/rocm \
+    CMAKE_PREFIX_PATH="/opt/rocm;/opt/rocm/lib/cmake" \
    CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/root/.cache/ccache \
-    uv pip install --python /opt/venv/bin/python3 \
-        --no-build-isolation --no-deps .
+
+# Build a wheel with setup.py bdist_wheel (same as vLLM CI in Dockerfile.rocm),
+# then install it.  Unlike a develop-mode install, this leaves no egg-link to
+# the build directory, so /tmp/vllm-build can safely be cleaned up afterwards.
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    python3 setup.py bdist_wheel --dist-dir=dist \
+    && uv pip install --python /opt/venv/bin/python3 --no-deps dist/*.whl

 # ── ROCm-specific Python wheels ────────────────────────────────────────
 # triton (ROCm HIP backend) and flash-attn (Triton AMD kernels for gfx11)