build: optimize Dockerfiles for production
Some checks failed
Build and Push Images / build-rdna2 (push) Failing after 4m3s
Build and Push Images / build-nvidia (push) Failing after 4m6s
Build and Push Images / build-strixhalo (push) Failing after 18s
Build and Push Images / build-intel (push) Failing after 21s

- Use BuildKit syntax 1.7 with cache mounts for apt/uv
- Switch from pip to uv for 10-100x faster installs (ADR-0012)
- Add OCI Image Spec labels for container metadata
- Add HEALTHCHECK directives for orchestration
- Add .dockerignore to reduce context size
- Update Makefile with buildx and lint target
- Add retry logic to ray-entrypoint.sh

Refs: ADR-0012 (uv), ADR-0014 (Docker best practices)
This commit is contained in:
2026-02-02 07:26:27 -05:00
parent a16ffff73f
commit cb80709d3d
8 changed files with 443 additions and 232 deletions

View File

@@ -1,27 +1,64 @@
#!/bin/bash
# Ray Worker Entrypoint
# Connects to Ray head node and registers custom GPU resources.
#
# Environment variables:
#   RAY_HEAD_SVC            - Ray head service name (default: ray-head-svc)
#   GPU_RESOURCE            - Custom GPU resource name (default: gpu_amd)
#   NUM_GPUS                - Number of GPUs to register (default: 1)
#   RAY_OBJECT_STORE_MEMORY - Object store memory limit in bytes (optional)

set -euo pipefail

# Ensure Ray CLI is in PATH (works across all base images)
export PATH="/home/ray/.local/bin:/home/ray/anaconda3/bin:${PATH}"

# Configuration with defaults
RAY_HEAD_ADDRESS="${RAY_HEAD_SVC:-ray-head-svc}:6379"
GPU_RESOURCE="${GPU_RESOURCE:-gpu_amd}"
NUM_GPUS="${NUM_GPUS:-1}"

# Log startup info
echo "============================================="
echo "Ray Worker Starting"
echo "============================================="
echo "  Head address: ${RAY_HEAD_ADDRESS}"
echo "  GPU resource: ${GPU_RESOURCE}"
echo "  Num GPUs: ${NUM_GPUS}"
echo "  Python: $(python3 --version)"
echo "  Ray version: $(ray --version)"
echo "============================================="

# Wait for Ray head to be available (with retry).
# health-check exits non-zero until the head's GCS is reachable; stderr is
# suppressed intentionally to keep the retry loop's log output readable.
MAX_RETRIES=30
RETRY_INTERVAL=5
retry_count=0
echo "Waiting for Ray head node..."
until ray health-check --address="${RAY_HEAD_ADDRESS}" 2>/dev/null; do
  retry_count=$((retry_count + 1))
  if [ "${retry_count}" -ge "${MAX_RETRIES}" ]; then
    # Fatal: diagnostics belong on stderr so orchestrators can surface them.
    echo "ERROR: Ray head not available after ${MAX_RETRIES} attempts" >&2
    exit 1
  fi
  echo "  Attempt ${retry_count}/${MAX_RETRIES} - retrying in ${RETRY_INTERVAL}s..."
  sleep "${RETRY_INTERVAL}"
done
echo "Ray head is ready!"

# Build ray start command as an array so optional args stay correctly quoted
RAY_START_ARGS=(
  --address="${RAY_HEAD_ADDRESS}"
  --num-gpus="${NUM_GPUS}"
  --resources="{\"${GPU_RESOURCE}\": 1}"
  --block
)

# Add object store memory limit if specified
if [ -n "${RAY_OBJECT_STORE_MEMORY:-}" ]; then
  RAY_START_ARGS+=(--object-store-memory="${RAY_OBJECT_STORE_MEMORY}")
fi

# Start Ray worker; exec replaces this shell so Ray receives container signals
echo "Starting Ray worker with resources: {\"${GPU_RESOURCE}\": 1}"
exec ray start "${RAY_START_ARGS[@]}"