build: optimize Dockerfiles for production
Some checks failed
Build and Push Images / build-rdna2 (push) Failing after 4m3s
Build and Push Images / build-nvidia (push) Failing after 4m6s
Build and Push Images / build-strixhalo (push) Failing after 18s
Build and Push Images / build-intel (push) Failing after 21s

- Use BuildKit syntax 1.7 with cache mounts for apt/uv
- Switch from pip to uv for 10-100x faster installs (ADR-0012)
- Add OCI Image Spec labels for container metadata
- Add HEALTHCHECK directives for orchestration
- Add .dockerignore to reduce context size
- Update Makefile with buildx and lint target
- Add retry logic to ray-entrypoint.sh

Refs: ADR-0012 (uv), ADR-0014 (Docker best practices)
This commit is contained in:
2026-02-02 07:26:27 -05:00
parent a16ffff73f
commit cb80709d3d
8 changed files with 443 additions and 232 deletions

View File

@@ -1,27 +1,64 @@
#!/bin/bash
# Ray Worker Entrypoint
# Connects to Ray head node and registers custom GPU resources.
#
# Environment variables:
#   RAY_HEAD_SVC            - Ray head service name (default: ray-head-svc)
#   GPU_RESOURCE            - Custom GPU resource name (default: gpu_amd)
#   NUM_GPUS                - Number of GPUs to register (default: 1)
#   RAY_OBJECT_STORE_MEMORY - Object store memory limit in bytes (optional)

set -euo pipefail

# Ensure Ray CLI is in PATH (works across all base images)
export PATH="/home/ray/.local/bin:/home/ray/anaconda3/bin:${PATH}"

# Configuration with defaults
RAY_HEAD_ADDRESS="${RAY_HEAD_SVC:-ray-head-svc}:6379"
GPU_RESOURCE="${GPU_RESOURCE:-gpu_amd}"
NUM_GPUS="${NUM_GPUS:-1}"

# Log startup info
echo "============================================="
echo "Ray Worker Starting"
echo "============================================="
echo "  Head address: ${RAY_HEAD_ADDRESS}"
echo "  GPU resource: ${GPU_RESOURCE}"
echo "  Num GPUs: ${NUM_GPUS}"
echo "  Python: $(python3 --version)"
echo "  Ray version: $(ray --version)"
echo "============================================="

# Wait for Ray head to be available (with retry).
# health-check exits non-zero until the head's GCS is reachable; stderr is
# suppressed intentionally to keep the retry loop's log output readable.
MAX_RETRIES=30
RETRY_INTERVAL=5
retry_count=0
echo "Waiting for Ray head node..."
until ray health-check --address="${RAY_HEAD_ADDRESS}" 2>/dev/null; do
  retry_count=$((retry_count + 1))
  if [ "${retry_count}" -ge "${MAX_RETRIES}" ]; then
    # Fatal: diagnostics belong on stderr so orchestrators can surface them.
    echo "ERROR: Ray head not available after ${MAX_RETRIES} attempts" >&2
    exit 1
  fi
  echo "  Attempt ${retry_count}/${MAX_RETRIES} - retrying in ${RETRY_INTERVAL}s..."
  sleep "${RETRY_INTERVAL}"
done
echo "Ray head is ready!"

# Build ray start command as an array so optional args stay correctly quoted
RAY_START_ARGS=(
  --address="${RAY_HEAD_ADDRESS}"
  --num-gpus="${NUM_GPUS}"
  --resources="{\"${GPU_RESOURCE}\": 1}"
  --block
)

# Add object store memory limit if specified
if [ -n "${RAY_OBJECT_STORE_MEMORY:-}" ]; then
  RAY_START_ARGS+=(--object-store-memory="${RAY_OBJECT_STORE_MEMORY}")
fi

# Start Ray worker; exec replaces this shell so Ray receives container signals
echo "Starting Ray worker with resources: {\"${GPU_RESOURCE}\": 1}"
exec ray start "${RAY_START_ARGS[@]}"