#!/bin/bash
# Ray Worker Entrypoint
# Connects to Ray head node and registers custom GPU resources
#
# Environment variables:
#   RAY_HEAD_SVC            - Ray head service name (default: ray-head-svc)
#   GPU_RESOURCE            - Custom GPU resource name (default: gpu_amd)
#   NUM_GPUS                - Number of GPUs to register (default: 1)
#   RAY_OBJECT_STORE_MEMORY - Object store memory limit (optional)

set -euo pipefail

# Ensure Ray CLI is in PATH (works across all base images)
export PATH="/home/ray/.local/bin:/home/ray/anaconda3/bin:${PATH}"

# Fail fast if the Ray CLI is missing. Without this guard, the
# `$(ray --version)` substitution below silently prints nothing —
# command-substitution failures inside echo arguments are not
# caught by `set -e`.
if ! command -v ray >/dev/null 2>&1; then
  echo "ERROR: 'ray' CLI not found in PATH" >&2
  exit 1
fi

# Configuration with defaults
RAY_HEAD_ADDRESS="${RAY_HEAD_SVC:-ray-head-svc}:6379"
GPU_RESOURCE="${GPU_RESOURCE:-gpu_amd}"
NUM_GPUS="${NUM_GPUS:-1}"

# Log startup info
echo "============================================="
echo "Ray Worker Starting"
echo "============================================="
echo " Head address: ${RAY_HEAD_ADDRESS}"
echo " GPU resource: ${GPU_RESOURCE}"
echo " Num GPUs: ${NUM_GPUS}"
echo " Python: $(python3 --version)"
echo " Ray version: $(ray --version)"
echo "============================================="

# Wait for Ray head to be available (with retry).
# `until` is a shell condition, so a failing health-check does not
# trip `set -e`; stderr of the probe is deliberately discarded since
# "not up yet" is the expected state while we poll.
MAX_RETRIES=30
RETRY_INTERVAL=5
retry_count=0

echo "Waiting for Ray head node..."
until ray health-check --address="${RAY_HEAD_ADDRESS}" 2>/dev/null; do
  retry_count=$((retry_count + 1))
  if (( retry_count >= MAX_RETRIES )); then
    echo "ERROR: Ray head not available after ${MAX_RETRIES} attempts" >&2
    exit 1
  fi
  echo " Attempt ${retry_count}/${MAX_RETRIES} - retrying in ${RETRY_INTERVAL}s..."
  sleep "${RETRY_INTERVAL}"
done
echo "Ray head is ready!"
# Build ray start command with optional args.
# NOTE(review): the custom resource count was previously hard-coded to 1,
# while --num-gpus tracked NUM_GPUS. With NUM_GPUS>1 only a single task
# could ever claim the custom resource, so register NUM_GPUS units instead
# (identical behavior for the default NUM_GPUS=1).
RAY_START_ARGS=(
  --address="${RAY_HEAD_ADDRESS}"
  --num-gpus="${NUM_GPUS}"
  --resources="{\"${GPU_RESOURCE}\": ${NUM_GPUS}}"
  --block
)

# Add object store memory limit if specified
if [ -n "${RAY_OBJECT_STORE_MEMORY:-}" ]; then
  RAY_START_ARGS+=(--object-store-memory="${RAY_OBJECT_STORE_MEMORY}")
fi

# Start Ray worker. `exec` replaces this shell so the ray process
# becomes PID 1's child and receives container signals directly.
echo "Starting Ray worker with resources: {\"${GPU_RESOURCE}\": ${NUM_GPUS}}"
exec ray start "${RAY_START_ARGS[@]}"