build: optimize Dockerfiles for production
Some checks failed
Build and Push Images / build-rdna2 (push) Failing after 4m3s
Build and Push Images / build-nvidia (push) Failing after 4m6s
Build and Push Images / build-strixhalo (push) Failing after 18s
Build and Push Images / build-intel (push) Failing after 21s

- Use BuildKit syntax 1.7 with cache mounts for apt/uv
- Switch from pip to uv for 10-100x faster installs (ADR-0014)
- Add OCI Image Spec labels for container metadata
- Add HEALTHCHECK directives for orchestration
- Add .dockerignore to reduce context size
- Update Makefile with buildx and lint target
- Add retry logic to ray-entrypoint.sh

Refs: ADR-0012 (uv), ADR-0014 (Docker best practices)
This commit is contained in:
2026-02-02 07:26:27 -05:00
parent a16ffff73f
commit cb80709d3d
8 changed files with 443 additions and 232 deletions

View File

@@ -1,65 +1,94 @@
# syntax=docker/dockerfile:1.7
# AMD RDNA 2 Ray Worker for drizzt (Radeon 680M - gfx1035)
# Used for: BGE Embeddings
#
# Build:
#   docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-rdna2:latest \
#     -f dockerfiles/Dockerfile.ray-worker-rdna2 .
#
# Multi-stage build: extract ROCm from the AMD vendor image, then install on a
# Ray base image so the Python version (3.11) matches the Ray head node.

# Stage 1: ROCm libraries from AMD vendor image
FROM docker.io/rocm/pytorch:rocm6.4.4_ubuntu22.04_py3.10_pytorch_release_2.7.1 AS rocm-source

# Stage 2: Production image
FROM rayproject/ray:2.53.0-py311 AS production

# OCI Image Spec labels (single LABEL instruction = single metadata layer)
LABEL org.opencontainers.image.title="Ray Worker - AMD RDNA 2" \
      org.opencontainers.image.description="Ray Serve worker for AMD RDNA 2 GPUs (BGE Embeddings)" \
      org.opencontainers.image.vendor="DaviesTechLabs" \
      org.opencontainers.image.source="https://git.daviestechlabs.io/daviestechlabs/kuberay-images" \
      org.opencontainers.image.licenses="MIT" \
      gpu.target="amd-rocm-6.4-gfx1035" \
      ray.version="2.53.0"

WORKDIR /app

# Copy ROCm stack from vendor image (single COPY layer)
COPY --from=rocm-source /opt/rocm /opt/rocm

# ROCM_HOME must be its own ENV instruction: inside a single ENV, ${ROCM_HOME}
# would expand to its value from *before* the instruction (i.e. empty),
# silently producing broken PATH/LD_LIBRARY_PATH/HSA_PATH/HIP_PATH values.
ENV ROCM_HOME=/opt/rocm
# ${LD_LIBRARY_PATH:+:...} avoids a trailing ':' (empty search-path entry)
# when the base image does not define LD_LIBRARY_PATH.
ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}" \
    LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \
    HSA_PATH="${ROCM_HOME}/hsa" \
    HIP_PATH="${ROCM_HOME}/hip"

# RDNA 2 (gfx1035) specific settings
ENV HIP_VISIBLE_DEVICES=0 \
    HSA_ENABLE_SDMA=0 \
    PYTORCH_HIP_ALLOC_CONF=expandable_segments:True

# System dependencies (root only for apt; dropped back to ray below).
# Ubuntu bases ship docker-clean, which empties /var/cache/apt after every
# install and would defeat the cache mount — disable it and keep packages.
# Do NOT rm -rf /var/lib/apt/lists/* here: the lists live in the cache mount,
# not in the image layer, and deleting them would defeat list caching.
USER root
RUN rm -f /etc/apt/apt.conf.d/docker-clean && \
    echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update && apt-get install -y --no-install-recommends \
        kmod \
        libdrm-amdgpu1 \
        libdrm2 \
        libelf1 \
        libnuma1

# uv for fast Python package management (ADR-0014).
# Pinned to a minor tag rather than :latest for reproducible builds.
COPY --from=ghcr.io/astral-sh/uv:0.5 /uv /usr/local/bin/uv

USER ray

# PyTorch with ROCm 6.2 wheels for Python 3.11 (uv is 10-100x faster than pip).
# uid/gid 1000 = the ray user, so the cache mount is writable without root.
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
    uv pip install --system \
        torch==2.5.1 torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/rocm6.2

# Inference dependencies (upper bounds guard against surprise major bumps)
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
    uv pip install --system \
        'transformers>=4.35.0,<5.0' \
        'accelerate>=0.25.0,<1.0' \
        'sentence-transformers>=2.3.0,<3.0' \
        'httpx>=0.27.0,<1.0' \
        'numpy>=1.26.0,<2.0' \
        'scipy>=1.11.0,<2.0'

# Pre-download embedding model for faster cold starts
RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')"

# Application code (owned by the runtime user; entrypoint made executable)
COPY --chown=ray:ray ray-serve/ /app/ray_serve/
COPY --chown=ray:ray --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh

# Runtime configuration
ENV PYTHONPATH=/app \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    RAY_HEAD_SVC="ai-inference-raycluster-head-svc" \
    GPU_RESOURCE="gpu_amd_rdna2" \
    NUM_GPUS="1"

# Health check for orchestration.
# NOTE(review): `--address=localhost:6379` targets the GCS, which normally runs
# on the *head* node, not on this worker — confirm this probe actually succeeds
# on worker pods (a local raylet check may be the intended probe).
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD ray status --address=localhost:6379 || exit 1

ENTRYPOINT ["/app/ray-entrypoint.sh"]