Some checks failed
- Use BuildKit syntax 1.7 with cache mounts for apt/uv
- Switch from pip to uv for 10-100x faster installs (ADR-0014)
- Add OCI Image Spec labels for container metadata
- Add HEALTHCHECK directives for orchestration
- Add .dockerignore to reduce context size
- Update Makefile with buildx and lint target
- Add retry logic to ray-entrypoint.sh

Refs: ADR-0012 (uv), ADR-0014 (Docker best practices)
65 lines
2.1 KiB
Bash
65 lines
2.1 KiB
Bash
#!/bin/bash
#
# Ray Worker Entrypoint
#
# Connects to the Ray head node and registers custom GPU resources.
#
# Environment variables:
#   RAY_HEAD_SVC            - Ray head service name (default: ray-head-svc);
#                             the head is always contacted on port 6379
#   GPU_RESOURCE            - Custom GPU resource name (default: gpu_amd)
#   NUM_GPUS                - Number of GPUs to register (default: 1)
#   RAY_OBJECT_STORE_MEMORY - Object store memory limit (optional)

# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Ensure Ray CLI is in PATH (works across all base images)
export PATH="/home/ray/.local/bin:/home/ray/anaconda3/bin:${PATH}"

# Configuration with defaults. An empty value is treated like an unset one
# (":-" expansion), so a blank env var still falls back to the default.
RAY_HEAD_ADDRESS="${RAY_HEAD_SVC:-ray-head-svc}:6379"
GPU_RESOURCE="${GPU_RESOURCE:-gpu_amd}"
NUM_GPUS="${NUM_GPUS:-1}"

# The resolved configuration must not change for the rest of the script.
readonly RAY_HEAD_ADDRESS GPU_RESOURCE NUM_GPUS

# Log startup info: the resolved configuration plus the interpreter and Ray
# versions, so a container log shows what this worker was started with.
# The version probes stay inside the echo arguments on purpose: a failing
# command substitution there does not abort the script under `set -e`.
banner_rule="============================================="

echo "${banner_rule}"
echo "Ray Worker Starting"
echo "${banner_rule}"
echo " Head address: ${RAY_HEAD_ADDRESS}"
echo " GPU resource: ${GPU_RESOURCE}"
echo " Num GPUs: ${NUM_GPUS}"
echo " Python: $(python3 --version)"
echo " Ray version: $(ray --version)"
echo "${banner_rule}"

# Wait for Ray head to be available (with retry).
# Probes the head with `ray health-check` up to MAX_RETRIES times, sleeping
# RETRY_INTERVAL seconds between attempts, and exits 1 if it never succeeds.
readonly MAX_RETRIES=30
readonly RETRY_INTERVAL=5
retry_count=0

# Fail fast if the CLI itself is missing. The probe below discards stderr, so
# a missing binary would otherwise look like an unreachable head and burn the
# whole retry budget before reporting a misleading error.
if ! command -v ray >/dev/null 2>&1; then
  echo "ERROR: ray CLI not found in PATH" >&2
  exit 1
fi

echo "Waiting for Ray head node..."
# stderr of the probe is intentionally silenced: failures are expected while
# the head is still starting, and each one is reported by the loop instead.
until ray health-check --address="${RAY_HEAD_ADDRESS}" 2>/dev/null; do
  retry_count=$((retry_count + 1))
  if ((retry_count >= MAX_RETRIES)); then
    echo "ERROR: Ray head not available after ${MAX_RETRIES} attempts" >&2
    exit 1
  fi
  echo " Attempt ${retry_count}/${MAX_RETRIES} - retrying in ${RETRY_INTERVAL}s..."
  sleep "${RETRY_INTERVAL}"
done
echo "Ray head is ready!"

# Build ray start command with optional args.
# The custom-resource JSON is built once so the log line and the value passed
# to `ray start --resources` can never drift apart.
# NOTE(review): the custom resource is always advertised with a count of 1,
# independent of NUM_GPUS - confirm this is intended.
resources_json="{\"${GPU_RESOURCE}\": 1}"

# An array keeps every argument a single word regardless of its content.
RAY_START_ARGS=(
  --address="${RAY_HEAD_ADDRESS}"
  --num-gpus="${NUM_GPUS}"
  --resources="${resources_json}"
  --block
)

# Add object store memory limit if specified
if [[ -n "${RAY_OBJECT_STORE_MEMORY:-}" ]]; then
  RAY_START_ARGS+=(--object-store-memory="${RAY_OBJECT_STORE_MEMORY}")
fi

# Start Ray worker. `exec` replaces this shell with the ray process, so
# nothing after this line runs.
echo "Starting Ray worker with resources: ${resources_json}"
exec ray start "${RAY_START_ARGS[@]}"