build: optimize Dockerfiles for production
Some checks failed
Some checks failed
- Use BuildKit syntax 1.7 with cache mounts for apt/uv
- Switch from pip to uv for 10-100x faster installs (ADR-0014)
- Add OCI Image Spec labels for container metadata
- Add HEALTHCHECK directives for orchestration
- Add .dockerignore to reduce context size
- Update Makefile with buildx and lint target
- Add retry logic to ray-entrypoint.sh

Refs: ADR-0012 (uv), ADR-0014 (Docker best practices)
This commit is contained in:
@@ -1,27 +1,64 @@
|
||||
#!/bin/bash
|
||||
# Ray Worker Entrypoint
|
||||
# Connects to Ray head node and registers custom resources
|
||||
# Connects to Ray head node and registers custom GPU resources
|
||||
#
|
||||
# Environment variables:
|
||||
# RAY_HEAD_SVC - Ray head service name (default: ray-head-svc)
|
||||
# GPU_RESOURCE - Custom GPU resource name (default: gpu_amd)
|
||||
# NUM_GPUS - Number of GPUs to register (default: 1)
|
||||
# RAY_OBJECT_STORE_MEMORY - Object store memory limit (optional)
|
||||
|
||||
set -e
|
||||
set -euo pipefail
|
||||
|
||||
# Ensure Ray is in PATH (works across all base images)
|
||||
# Ensure Ray CLI is in PATH (works across all base images)
|
||||
export PATH="/home/ray/.local/bin:/home/ray/anaconda3/bin:${PATH}"
|
||||
|
||||
# Get Ray head address from environment or default
|
||||
# Configuration with defaults
|
||||
RAY_HEAD_ADDRESS="${RAY_HEAD_SVC:-ray-head-svc}:6379"
|
||||
|
||||
# Get custom resources from environment
|
||||
GPU_RESOURCE="${GPU_RESOURCE:-gpu_amd}"
|
||||
NUM_GPUS="${NUM_GPUS:-1}"
|
||||
|
||||
echo "Starting Ray worker..."
|
||||
echo " Head address: $RAY_HEAD_ADDRESS"
|
||||
echo " GPU resource: $GPU_RESOURCE"
|
||||
echo " Num GPUs: $NUM_GPUS"
|
||||
# Log startup info
|
||||
echo "============================================="
|
||||
echo "Ray Worker Starting"
|
||||
echo "============================================="
|
||||
echo " Head address: ${RAY_HEAD_ADDRESS}"
|
||||
echo " GPU resource: ${GPU_RESOURCE}"
|
||||
echo " Num GPUs: ${NUM_GPUS}"
|
||||
echo " Python: $(python3 --version)"
|
||||
echo " Ray version: $(ray --version)"
|
||||
echo "============================================="
|
||||
|
||||
# Start Ray worker with custom resources
|
||||
exec ray start \
|
||||
--address="$RAY_HEAD_ADDRESS" \
|
||||
--num-gpus="$NUM_GPUS" \
|
||||
--resources="{\"$GPU_RESOURCE\": 1}" \
|
||||
# Wait for the Ray head to answer a health check before starting the worker.
#
# Polls `ray health-check` against RAY_HEAD_ADDRESS. After MAX_RETRIES failed
# attempts the script reports the failure on stderr and exits with status 1.
#
# Tunables (may be overridden from the environment):
#   MAX_RETRIES    - number of attempts before giving up (default: 30)
#   RETRY_INTERVAL - seconds to sleep between attempts   (default: 5)
MAX_RETRIES="${MAX_RETRIES:-30}"
RETRY_INTERVAL="${RETRY_INTERVAL:-5}"
retry_count=0

echo "Waiting for Ray head node..."
# The probe's stderr is discarded on purpose: failures are expected while the
# head is still coming up, and progress is reported by the attempt counter.
until ray health-check --address="${RAY_HEAD_ADDRESS}" 2>/dev/null; do
  retry_count=$((retry_count + 1))
  if (( retry_count >= MAX_RETRIES )); then
    # Diagnostics belong on stderr so they are not mixed into normal output.
    echo "ERROR: Ray head not available after ${MAX_RETRIES} attempts" >&2
    exit 1
  fi
  echo " Attempt ${retry_count}/${MAX_RETRIES} - retrying in ${RETRY_INTERVAL}s..."
  sleep "${RETRY_INTERVAL}"
done
echo "Ray head is ready!"
|
||||
|
||||
# Build the `ray start` argument list.
#
# An array is used so that each option reaches the CLI as exactly one word,
# whatever characters the configured values contain.
#
# Reads:  RAY_HEAD_ADDRESS, NUM_GPUS, GPU_RESOURCE,
#         RAY_OBJECT_STORE_MEMORY (optional)
# Writes: RAY_START_ARGS
RAY_START_ARGS=(
  --address="${RAY_HEAD_ADDRESS}"
  --num-gpus="${NUM_GPUS}"
  # Custom resource, passed as a JSON object with a fixed quantity of 1.
  --resources="{\"${GPU_RESOURCE}\": 1}"
  --block
)

# Append the object store memory limit only when one was supplied; the
# ":-" default keeps this check safe when the variable is unset under `set -u`.
if [[ -n "${RAY_OBJECT_STORE_MEMORY:-}" ]]; then
  RAY_START_ARGS+=(--object-store-memory="${RAY_OBJECT_STORE_MEMORY}")
fi
|
||||
|
||||
# Start Ray worker
|
||||
echo "Starting Ray worker with resources: {\"${GPU_RESOURCE}\": 1}"
|
||||
exec ray start "${RAY_START_ARGS[@]}"
|
||||
|
||||
Reference in New Issue
Block a user