From ab2a7f486ef6b83e21cfef2b579e533fb2185859 Mon Sep 17 00:00:00 2001 From: "Billy D." Date: Mon, 9 Feb 2026 14:37:05 -0500 Subject: [PATCH] fix(strixhalo): switch base to ROCm 7.0.2 to fix libhsa segfault ROCm 7.1 system libraries (libhsa-runtime64.so.1.18.70100) are ABI-incompatible with the torch/vLLM ROCm 7.0 wheels from wheels.vllm.ai. This caused SIGSEGV at 0x34 in libhsa-runtime64 on every GPU operation. Switch to rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.9.1 which provides matching ROCm 7.0.2 system libraries while keeping Ubuntu 24.04 (glibc 2.38) and Python 3.12. --- dockerfiles/Dockerfile.ray-worker-strixhalo | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dockerfiles/Dockerfile.ray-worker-strixhalo b/dockerfiles/Dockerfile.ray-worker-strixhalo index 327015a..96ab0c9 100644 --- a/dockerfiles/Dockerfile.ray-worker-strixhalo +++ b/dockerfiles/Dockerfile.ray-worker-strixhalo @@ -6,12 +6,13 @@ # docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-strixhalo:latest \ # -f dockerfiles/Dockerfile.ray-worker-strixhalo . # -# Uses ROCm vendor image as base (Ubuntu 24.04 / glibc 2.38) so that all -# ROCm 7.1 shared libraries (libMIOpen, libhipblas, etc.) find a compatible -# glibc. Ray 2.53.0 is installed into the vendor venv via pip. +# Uses ROCm 7.0.2 vendor image (Ubuntu 24.04 / glibc 2.38) so system ROCm +# libraries (libhsa-runtime64, libhipblas, libMIOpen, etc.) are ABI-compatible +# with torch + vLLM wheels from wheels.vllm.ai/rocm/ (compiled for ROCm 7.0). +# ROCm 7.1 base causes segfault in libhsa-runtime64.so due to ABI mismatch.
# Note: Python 3.12 required — vLLM ROCm wheel (wheels.vllm.ai/rocm) is cp312 only -FROM docker.io/rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 +FROM docker.io/rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.9.1 # OCI Image Spec labels LABEL org.opencontainers.image.title="Ray Worker - AMD Strix Halo" @@ -19,7 +20,7 @@ LABEL org.opencontainers.image.description="Ray Serve worker for AMD Strix Halo LABEL org.opencontainers.image.vendor="DaviesTechLabs" LABEL org.opencontainers.image.source="https://git.daviestechlabs.io/daviestechlabs/kuberay-images" LABEL org.opencontainers.image.licenses="MIT" -LABEL gpu.target="amd-rocm-7.1-gfx1151" +LABEL gpu.target="amd-rocm-7.0-gfx1151" LABEL ray.version="2.53.0" WORKDIR /app @@ -63,10 +64,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ # Install uv for fast Python package management (ADR-0014) COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv -# Remove vendor torch — the vendor ships torch 2.9.1+rocm7.1 but vLLM -# ROCm wheel (0.15.1+rocm700) was compiled against the PyTorch ROCm 7.0 -# ABI. Installing from the ROCm 7.0 index avoids undefined-symbol errors -# in vllm._C / vllm._rocm_C (e.g. silu_and_mul). +# Remove vendor torch — replace with the exact torch from wheels.vllm.ai/rocm/ +# (2.9.1+git8907517) that vLLM 0.15.1+rocm700 was compiled against. +# The vendor torch is close but may differ in C++ ABI details. RUN uv pip uninstall --python /opt/venv/bin/python3 \ torch torchaudio torchvision 2>/dev/null || true