build: optimize Dockerfiles for production

- Use BuildKit syntax 1.7 with cache mounts for apt/uv
- Switch from pip to uv for 10-100x faster installs (ADR-0014)
- Add OCI Image Spec labels for container metadata
- Add HEALTHCHECK directives for orchestration
- Add .dockerignore to reduce context size
- Update Makefile with buildx and lint target
- Add retry logic to ray-entrypoint.sh

Refs: ADR-0012 (uv), ADR-0014 (Docker best practices)
2026-02-02 07:26:27 -05:00
parent a16ffff73f
commit cb80709d3d
8 changed files with 443 additions and 232 deletions
--- a/dockerfiles/Dockerfile.ray-worker-strixhalo
+++ b/dockerfiles/Dockerfile.ray-worker-strixhalo
@@ -1,72 +1,100 @@
-# Ray Worker for AMD Strix Halo (gfx1151 / RDNA 3.5)
-# Pre-bakes all dependencies for fast startup
+# syntax=docker/dockerfile:1.7
+# AMD Strix Halo Ray Worker for khelben (gfx1151 / RDNA 3.5)
+# Used for: vLLM (Llama 3.1 70B)
 #
-# Build from llm-workflows root:
-#   docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-strixhalo:latest -f dockerfiles/Dockerfile.ray-worker-strixhalo .
+# Build:
+#   docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-strixhalo:latest \
+#     -f dockerfiles/Dockerfile.ray-worker-strixhalo .
 #
-# Multi-stage build to ensure Python 3.11.11 matches Ray head node
+# Multi-stage build: Extract ROCm 7.1 from vendor image, use Ray base for Python 3.11
+# Note: Uses TheRock gfx110X wheels due to ROCm/ROCm#5853 segfault issue

-# Stage 1: Extract ROCm 7.1 libraries from vendor image
-FROM docker.io/rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 AS rocm-libs
+# Stage 1: ROCm 7.1 libraries from AMD vendor image
+FROM docker.io/rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 AS rocm-source

-# Stage 2: Build on Ray base with Python 3.11
-FROM rayproject/ray:2.53.0-py311 AS base
+# Stage 2: Production image
+FROM rayproject/ray:2.53.0-py311 AS production

-# Copy ROCm stack from vendor image
-COPY --from=rocm-libs /opt/rocm /opt/rocm
-
-# Set up ROCm environment
-ENV ROCM_HOME=/opt/rocm
-ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}"
-ENV LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}"
-ENV HSA_PATH="${ROCM_HOME}/hsa"
-ENV HIP_PATH="${ROCM_HOME}/hip"
-
-# ROCm environment for AMD Strix Halo (gfx1151 / RDNA 3.5)
-ENV HIP_VISIBLE_DEVICES=0
-ENV HSA_ENABLE_SDMA=0
-ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
-ENV HSA_OVERRIDE_GFX_VERSION=11.0.0
-ENV ROCM_TARGET_LST=gfx1151,gfx1100
-ENV PYTHONPATH=/app
+# Image metadata: OCI Image Spec annotations plus project-specific GPU/Ray keys
+LABEL org.opencontainers.image.title="Ray Worker - AMD Strix Halo" \
+      org.opencontainers.image.description="Ray Serve worker for AMD Strix Halo (vLLM LLM inference)" \
+      org.opencontainers.image.vendor="DaviesTechLabs" \
+      org.opencontainers.image.source="https://git.daviestechlabs.io/daviestechlabs/kuberay-images" \
+      org.opencontainers.image.licenses="MIT" \
+      gpu.target="amd-rocm-7.1-gfx1151" \
+      ray.version="2.53.0"

 WORKDIR /app

-# Install ROCm system dependencies
+# Copy ROCm stack from vendor image
+COPY --from=rocm-source /opt/rocm /opt/rocm
+
+# ROCm environment. ROCM_HOME gets its own ENV: an ENV cannot expand a variable it defines itself.
+ENV ROCM_HOME=/opt/rocm
+ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}" \
+    LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}" \
+    HSA_PATH="${ROCM_HOME}/hsa" \
+    HIP_PATH="${ROCM_HOME}/hip" \
+    # Strix Halo (gfx1151) specific settings
+    HIP_VISIBLE_DEVICES=0 \
+    HSA_ENABLE_SDMA=0 \
+    PYTORCH_HIP_ALLOC_CONF="expandable_segments:True,max_split_size_mb:512" \
+    HSA_OVERRIDE_GFX_VERSION="11.0.0" \
+    ROCM_TARGET_LST="gfx1151,gfx1100"
+
+# Install system dependencies
 USER root
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libelf1 \
-    libnuma1 \
-    libdrm2 \
-    libdrm-amdgpu1 \
-    kmod \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get update && apt-get install -y --no-install-recommends \
+        libelf1 \
+        libnuma1 \
+        libdrm2 \
+        libdrm-amdgpu1 \
+        kmod \
    && rm -rf /var/lib/apt/lists/*
+
+# Install uv for fast Python package management (ADR-0014); tag pinned for reproducible rebuilds
+COPY --from=ghcr.io/astral-sh/uv:0.9.0 /uv /usr/local/bin/uv
+
 USER ray

 # WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
 # in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
-# TheRock gfx110X-all packages provide Python 3.11 compatible wheels.
-RUN pip install --no-cache-dir \
-    --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
-    torch torchaudio torchvision
+# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
+RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
+    uv pip install --system \
+        --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
+        torch torchaudio torchvision

-# Install Ray Serve and AI inference dependencies
-RUN pip install --no-cache-dir \
-    vllm \
-    transformers \
-    accelerate \
-    sentence-transformers \
-    httpx \
-    numpy \
-    scipy
+# Install vLLM and inference dependencies (uv is 10-100x faster than pip)
+RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
+    uv pip install --system \
+        'vllm>=0.5.0' \
+        'transformers>=4.35.0,<5.0' \
+        'accelerate>=0.25.0,<1.0' \
+        'sentence-transformers>=2.3.0,<3.0' \
+        'httpx>=0.27.0,<1.0' \
+        'numpy>=1.26.0,<2.0' \
+        'scipy>=1.11.0,<2.0'

-# Pre-download common models for faster cold starts
-RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')" || true
+# Pre-download common models for faster cold starts (optional, increases image size)
+# RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')"

-# Copy Ray Serve Python code
-COPY ray-serve/ /app/ray_serve/
+# Copy application code
+COPY --chown=ray:ray ray-serve/ /app/ray_serve/
+COPY --chown=ray:ray --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh

-# Ray worker entrypoint
-COPY --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
+# Environment configuration
+ENV PYTHONPATH=/app \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    RAY_HEAD_SVC="ai-inference-raycluster-head-svc" \
+    GPU_RESOURCE="gpu_amd_strixhalo" \
+    NUM_GPUS="1"
+
+# Health check: query the head node's GCS via RAY_HEAD_SVC (a worker runs no GCS on localhost)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
+    CMD ray status --address="${RAY_HEAD_SVC}:6379" || exit 1

 ENTRYPOINT ["/app/ray-entrypoint.sh"]