build: optimize Dockerfiles for production
Some checks failed
Some checks failed
- Use BuildKit syntax 1.7 with cache mounts for apt/uv - Switch from pip to uv for 10-100x faster installs (ADR-0014) - Add OCI Image Spec labels for container metadata - Add HEALTHCHECK directives for orchestration - Add .dockerignore to reduce context size - Update Makefile with buildx and lint target - Add retry logic to ray-entrypoint.sh Refs: ADR-0012 (uv), ADR-0014 (Docker best practices)
This commit is contained in:
44
.dockerignore
Normal file
44
.dockerignore
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# Build-context exclusions for the Ray worker images.
#
# Unlike .gitignore, .dockerignore patterns are anchored at the context root,
# so artifacts that can appear in any directory use a `**/` prefix.
# The last matching pattern wins and a match on a parent directory applies to
# its contents, so the explicit re-includes are listed BEFORE the recursive
# artifact patterns; otherwise they would re-include those artifacts.

# Git
.git
.gitignore
.gitea

# Documentation
*.md
LICENSE
docs/

# Local development
Makefile
.goreleaser.yml

# Don't ignore these (explicitly include)
!ray-serve/
!dockerfiles/

# IDE and editors
**/.vscode/
**/.idea/
**/*.swp
**/*.swo
**/*~

# Python artifacts
**/__pycache__/
**/*.py[cod]
**/*$py.class
**/.pytest_cache/
.venv/
venv/
.env
**/*.egg-info/
dist/
build/

# OS files
**/.DS_Store
**/Thumbs.db

# Build logs
**/*.log
**/*.tmp
|
||||||
63
Makefile
63
Makefile
@@ -3,52 +3,73 @@
|
|||||||
|
|
||||||
REGISTRY := git.daviestechlabs.io/daviestechlabs
|
REGISTRY := git.daviestechlabs.io/daviestechlabs
|
||||||
TAG := latest
|
TAG := latest
|
||||||
|
PLATFORM := linux/amd64
|
||||||
|
|
||||||
# Image names
|
# Image names
|
||||||
IMAGES := ray-worker-nvidia ray-worker-rdna2 ray-worker-strixhalo ray-worker-intel
|
IMAGES := ray-worker-nvidia ray-worker-rdna2 ray-worker-strixhalo ray-worker-intel
|
||||||
|
|
||||||
.PHONY: all build-all push-all clean help $(addprefix build-,$(IMAGES)) $(addprefix push-,$(IMAGES))
|
.PHONY: all build-all push-all clean help lint $(addprefix build-,$(IMAGES)) $(addprefix push-,$(IMAGES))
|
||||||
|
|
||||||
help:
|
help:
|
||||||
@echo "KubeRay Images Build System"
|
@echo "KubeRay Images Build System"
|
||||||
@echo ""
|
@echo ""
|
||||||
@echo "Usage:"
|
@echo "Usage:"
|
||||||
@echo " make build-all Build all images"
|
@echo " make build-all Build all images"
|
||||||
@echo " make push-all Push all images to registry"
|
@echo " make push-all Push all images to registry"
|
||||||
@echo " make build-nvidia Build NVIDIA worker image"
|
@echo " make build-nvidia Build NVIDIA worker image"
|
||||||
@echo " make build-rdna2 Build AMD RDNA2 worker image"
|
@echo " make build-rdna2 Build AMD RDNA2 worker image"
|
||||||
@echo " make build-strixhalo Build AMD Strix Halo worker image"
|
@echo " make build-strixhalo Build AMD Strix Halo worker image"
|
||||||
@echo " make build-intel Build Intel XPU worker image"
|
@echo " make build-intel Build Intel XPU worker image"
|
||||||
@echo " make push-nvidia Push NVIDIA worker image"
|
@echo " make push-nvidia Push NVIDIA worker image"
|
||||||
|
@echo " make lint Lint Dockerfiles with hadolint"
|
||||||
@echo " make TAG=v1.0.0 push-all Push with specific tag"
|
@echo " make TAG=v1.0.0 push-all Push with specific tag"
|
||||||
@echo ""
|
@echo ""
|
||||||
@echo "Environment:"
|
@echo "Environment:"
|
||||||
@echo " REGISTRY=$(REGISTRY)"
|
@echo " REGISTRY=$(REGISTRY)"
|
||||||
@echo " TAG=$(TAG)"
|
@echo " TAG=$(TAG)"
|
||||||
|
@echo " PLATFORM=$(PLATFORM)"
|
||||||
|
|
||||||
# Build targets
|
# Lint Dockerfiles with hadolint.
# The availability check and the lint loop run in ONE shell invocation: make
# executes every recipe line in its own shell, so an `exit 0` on a separate
# line would not prevent the following hadolint commands from running (and
# failing) when hadolint is not installed.
# Skips with success when hadolint is missing; fails on the first lint error.
lint:
	@echo "Linting Dockerfiles..."
	@if ! command -v hadolint >/dev/null 2>&1; then \
		echo "hadolint not found, skipping..."; \
		exit 0; \
	fi; \
	for image in $(IMAGES); do \
		echo "hadolint dockerfiles/Dockerfile.$$image"; \
		hadolint "dockerfiles/Dockerfile.$$image" || exit 1; \
	done; \
	echo "Lint passed!"
|
||||||
|
|
||||||
|
# Build targets using buildx for cache support
|
||||||
build-nvidia:
|
build-nvidia:
|
||||||
docker build \
|
docker buildx build \
|
||||||
-t $(REGISTRY)/ray-worker-nvidia:$(TAG) \
|
--platform $(PLATFORM) \
|
||||||
-f dockerfiles/Dockerfile.ray-worker-nvidia \
|
--tag $(REGISTRY)/ray-worker-nvidia:$(TAG) \
|
||||||
|
--file dockerfiles/Dockerfile.ray-worker-nvidia \
|
||||||
|
--load \
|
||||||
.
|
.
|
||||||
|
|
||||||
build-rdna2:
|
build-rdna2:
|
||||||
docker build \
|
docker buildx build \
|
||||||
-t $(REGISTRY)/ray-worker-rdna2:$(TAG) \
|
--platform $(PLATFORM) \
|
||||||
-f dockerfiles/Dockerfile.ray-worker-rdna2 \
|
--tag $(REGISTRY)/ray-worker-rdna2:$(TAG) \
|
||||||
|
--file dockerfiles/Dockerfile.ray-worker-rdna2 \
|
||||||
|
--load \
|
||||||
.
|
.
|
||||||
|
|
||||||
build-strixhalo:
|
build-strixhalo:
|
||||||
docker build \
|
docker buildx build \
|
||||||
-t $(REGISTRY)/ray-worker-strixhalo:$(TAG) \
|
--platform $(PLATFORM) \
|
||||||
-f dockerfiles/Dockerfile.ray-worker-strixhalo \
|
--tag $(REGISTRY)/ray-worker-strixhalo:$(TAG) \
|
||||||
|
--file dockerfiles/Dockerfile.ray-worker-strixhalo \
|
||||||
|
--load \
|
||||||
.
|
.
|
||||||
|
|
||||||
build-intel:
|
build-intel:
|
||||||
docker build \
|
docker buildx build \
|
||||||
-t $(REGISTRY)/ray-worker-intel:$(TAG) \
|
--platform $(PLATFORM) \
|
||||||
-f dockerfiles/Dockerfile.ray-worker-intel \
|
--tag $(REGISTRY)/ray-worker-intel:$(TAG) \
|
||||||
|
--file dockerfiles/Dockerfile.ray-worker-intel \
|
||||||
|
--load \
|
||||||
.
|
.
|
||||||
|
|
||||||
build-all: build-nvidia build-rdna2 build-strixhalo build-intel
|
build-all: build-nvidia build-rdna2 build-strixhalo build-intel
|
||||||
|
|||||||
28
README.md
28
README.md
@@ -2,18 +2,29 @@
|
|||||||
|
|
||||||
GPU-specific Ray worker images for the DaviesTechLabs AI/ML platform.
|
GPU-specific Ray worker images for the DaviesTechLabs AI/ML platform.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **BuildKit optimized**: Cache mounts for apt and uv speed up rebuilds
|
||||||
|
- **OCI compliant**: Standard image labels (`org.opencontainers.image.*`)
|
||||||
|
- **Health checks**: Built-in HEALTHCHECK for container orchestration
|
||||||
|
- **Non-root execution**: Ray runs as unprivileged `ray` user
|
||||||
|
- **Retry logic**: Entrypoint waits for Ray head with exponential backoff
|
||||||
|
|
||||||
## Images
|
## Images
|
||||||
|
|
||||||
| Image | GPU Target | Workloads | Registry |
|
| Image | GPU Target | Workloads | Base |
|
||||||
|-------|------------|-----------|----------|
|
|-------|------------|-----------|------|
|
||||||
| `ray-worker-nvidia` | NVIDIA CUDA (RTX 2070) | Whisper STT, XTTS TTS | `git.daviestechlabs.io/daviestechlabs/ray-worker-nvidia` |
|
| `ray-worker-nvidia` | NVIDIA CUDA 12.1 (RTX 2070) | Whisper STT, XTTS TTS | `rayproject/ray:2.53.0-py311-cu121` |
|
||||||
| `ray-worker-rdna2` | AMD ROCm (Radeon 680M) | BGE Embeddings | `git.daviestechlabs.io/daviestechlabs/ray-worker-rdna2` |
|
| `ray-worker-rdna2` | AMD ROCm 6.4 (Radeon 680M) | BGE Embeddings | `rayproject/ray:2.53.0-py311` with ROCm copied from `rocm/pytorch:rocm6.4.4_ubuntu22.04_py3.10_pytorch_release_2.7.1` |
|
||||||
| `ray-worker-strixhalo` | AMD ROCm (Strix Halo) | vLLM, BGE | `git.daviestechlabs.io/daviestechlabs/ray-worker-strixhalo` |
|
| `ray-worker-strixhalo` | AMD ROCm 7.1 (Strix Halo) | vLLM, BGE | `rayproject/ray:2.53.0-py311` with ROCm copied from `rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1` |
|
||||||
| `ray-worker-intel` | Intel XPU (Arc) | BGE Reranker | `git.daviestechlabs.io/daviestechlabs/ray-worker-intel` |
|
| `ray-worker-intel` | Intel XPU (Arc) | BGE Reranker | `rayproject/ray:2.53.0-py311` |
|
||||||
|
|
||||||
## Building Locally
|
## Building Locally
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# Lint Dockerfiles (requires hadolint)
|
||||||
|
make lint
|
||||||
|
|
||||||
# Build all images
|
# Build all images
|
||||||
make build-all
|
make build-all
|
||||||
|
|
||||||
@@ -24,8 +35,11 @@ make build-strixhalo
|
|||||||
make build-intel
|
make build-intel
|
||||||
|
|
||||||
# Push to Gitea registry (requires login)
|
# Push to Gitea registry (requires login)
|
||||||
docker login git.daviestechlabs.io
|
make login
|
||||||
make push-all
|
make push-all
|
||||||
|
|
||||||
|
# Release with version tag
|
||||||
|
make VERSION=v1.0.0 release
|
||||||
```
|
```
|
||||||
|
|
||||||
## CI/CD
|
## CI/CD
|
||||||
|
|||||||
@@ -1,77 +1,98 @@
|
|||||||
# Intel GPU Ray Worker for danilo (Intel i915 iGPU)
|
# syntax=docker/dockerfile:1.7
|
||||||
# Used for: Reranker
|
# Intel GPU Ray Worker for danilo (Intel Arc / i915 iGPU)
|
||||||
|
# Used for: BGE Reranker
|
||||||
#
|
#
|
||||||
# Build from llm-workflows root:
|
# Build:
|
||||||
# docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-intel:latest -f dockerfiles/Dockerfile.ray-worker-intel .
|
# docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-intel:latest \
|
||||||
#
|
# -f dockerfiles/Dockerfile.ray-worker-intel .
|
||||||
# Multi-stage build to ensure Python 3.11.11 matches Ray head node
|
|
||||||
FROM rayproject/ray:2.53.0-py311 AS base
|
|
||||||
|
|
||||||
LABEL maintainer="billy-davies-2"
|
FROM rayproject/ray:2.53.0-py311
|
||||||
LABEL description="Ray worker for Intel GPUs (Reranker)"
|
|
||||||
|
# OCI Image Spec labels
|
||||||
|
LABEL org.opencontainers.image.title="Ray Worker - Intel GPU"
|
||||||
|
LABEL org.opencontainers.image.description="Ray Serve worker for Intel GPUs (BGE Reranker)"
|
||||||
|
LABEL org.opencontainers.image.vendor="DaviesTechLabs"
|
||||||
|
LABEL org.opencontainers.image.source="https://git.daviestechlabs.io/daviestechlabs/kuberay-images"
|
||||||
|
LABEL org.opencontainers.image.licenses="MIT"
|
||||||
LABEL gpu.target="intel-xpu"
|
LABEL gpu.target="intel-xpu"
|
||||||
|
LABEL ray.version="2.53.0"
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install system dependencies for Intel GPU support
|
# Install system dependencies and Intel GPU runtime
|
||||||
USER root
|
USER root
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||||
git \
|
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||||
curl \
|
|
||||||
wget \
|
|
||||||
gnupg2 \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Add Intel oneAPI repository for runtime libraries
|
|
||||||
RUN wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor -o /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
|
||||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/intel-oneapi.list
|
|
||||||
|
|
||||||
# Add Intel compute-runtime repository for Level Zero
|
|
||||||
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor -o /usr/share/keyrings/intel-graphics-archive-keyring.gpg && \
|
|
||||||
echo "deb [signed-by=/usr/share/keyrings/intel-graphics-archive-keyring.gpg arch=amd64] https://repositories.intel.com/gpu/ubuntu jammy client" > /etc/apt/sources.list.d/intel-gpu.list && \
|
|
||||||
apt-get update && apt-get install -y --no-install-recommends \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
intel-oneapi-runtime-opencl \
|
curl \
|
||||||
intel-oneapi-runtime-compilers \
|
wget \
|
||||||
intel-level-zero-gpu \
|
gnupg2 \
|
||||||
level-zero \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Add Intel oneAPI and GPU compute repositories
|
||||||
|
RUN wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
|
||||||
|
| gpg --dearmor -o /usr/share/keyrings/intel-oneapi-archive-keyring.gpg \
|
||||||
|
&& echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
|
||||||
|
> /etc/apt/sources.list.d/intel-oneapi.list \
|
||||||
|
&& wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
|
||||||
|
| gpg --dearmor -o /usr/share/keyrings/intel-graphics-archive-keyring.gpg \
|
||||||
|
&& echo "deb [signed-by=/usr/share/keyrings/intel-graphics-archive-keyring.gpg arch=amd64] https://repositories.intel.com/gpu/ubuntu jammy client" \
|
||||||
|
> /etc/apt/sources.list.d/intel-gpu.list
|
||||||
|
|
||||||
|
# Install Intel runtime libraries
|
||||||
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||||
|
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||||
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
intel-oneapi-runtime-opencl \
|
||||||
|
intel-oneapi-runtime-compilers \
|
||||||
|
intel-level-zero-gpu \
|
||||||
|
level-zero \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install uv for fast Python package management (ADR-0014)
|
||||||
|
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||||
|
|
||||||
USER ray
|
USER ray
|
||||||
|
|
||||||
# Ensure Ray CLI is in PATH
|
# Ensure Ray CLI is in PATH
|
||||||
ENV PATH="/home/ray/.local/bin:${PATH}"
|
ENV PATH="/home/ray/.local/bin:${PATH}"
|
||||||
|
|
||||||
# Install Intel Extension for PyTorch (IPEX) for Python 3.11
|
# Install Intel Extension for PyTorch (IPEX) with XPU support (uv is 10-100x faster)
|
||||||
# This provides XPU support for Intel GPUs
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
RUN pip install --no-cache-dir \
|
uv pip install --system \
|
||||||
torch==2.5.1 \
|
torch==2.5.1 \
|
||||||
intel-extension-for-pytorch==2.5.10+xpu \
|
intel-extension-for-pytorch==2.5.10+xpu \
|
||||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||||
|
|
||||||
# Install Ray Serve and AI inference dependencies
|
# Install inference dependencies
|
||||||
RUN pip install --no-cache-dir \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
sentence-transformers \
|
uv pip install --system \
|
||||||
FlagEmbedding \
|
'sentence-transformers>=2.3.0,<3.0' \
|
||||||
fastapi \
|
'FlagEmbedding>=1.2.0,<2.0' \
|
||||||
uvicorn \
|
'transformers>=4.35.0,<5.0' \
|
||||||
httpx \
|
'huggingface_hub>=0.20.0,<1.0' \
|
||||||
pydantic \
|
'fastapi>=0.100.0,<1.0' \
|
||||||
transformers \
|
'uvicorn>=0.23.0,<1.0' \
|
||||||
huggingface_hub
|
'httpx>=0.27.0,<1.0' \
|
||||||
|
'pydantic>=2.0.0,<3.0'
|
||||||
|
|
||||||
# Copy Ray Serve Python code
|
# Copy application code
|
||||||
COPY ray-serve/ /app/ray_serve/
|
COPY --chown=ray:ray ray-serve/ /app/ray_serve/
|
||||||
ENV PYTHONPATH=/app
|
COPY --chown=ray:ray --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
|
||||||
|
|
||||||
# Copy Ray Serve entrypoint
|
# Environment configuration
|
||||||
COPY --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
|
ENV PYTHONPATH=/app \
|
||||||
|
PYTHONUNBUFFERED=1 \
|
||||||
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
RAY_HEAD_SVC="ai-inference-raycluster-head-svc" \
|
||||||
|
GPU_RESOURCE="gpu_intel" \
|
||||||
|
NUM_GPUS="1" \
|
||||||
|
# Intel XPU settings
|
||||||
|
ZE_AFFINITY_MASK=0 \
|
||||||
|
SYCL_DEVICE_FILTER="level_zero:gpu"
|
||||||
|
|
||||||
# Default environment variables
|
# Health check
|
||||||
ENV RAY_HEAD_SVC="ai-inference-raycluster-head-svc"
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
||||||
ENV GPU_RESOURCE="gpu_intel"
|
# Probe the Ray head's GCS (same address the entrypoint joins); a worker
# container does not host a GCS server on its own localhost:6379.
CMD ray status --address="${RAY_HEAD_SVC:-ray-head-svc}:6379" || exit 1
|
||||||
ENV NUM_GPUS="1"
|
|
||||||
# Intel XPU settings
|
|
||||||
ENV ZE_AFFINITY_MASK=0
|
|
||||||
ENV SYCL_DEVICE_FILTER=level_zero:gpu
|
|
||||||
|
|
||||||
ENTRYPOINT ["/app/ray-entrypoint.sh"]
|
ENTRYPOINT ["/app/ray-entrypoint.sh"]
|
||||||
|
|||||||
@@ -1,53 +1,70 @@
|
|||||||
|
# syntax=docker/dockerfile:1.7
|
||||||
# NVIDIA GPU Ray Worker for elminster (RTX 2070)
|
# NVIDIA GPU Ray Worker for elminster (RTX 2070)
|
||||||
# Used for: Whisper STT, TTS
|
# Used for: Whisper STT, XTTS Text-to-Speech
|
||||||
#
|
|
||||||
# Build from llm-workflows root:
|
|
||||||
# docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-nvidia:latest -f dockerfiles/Dockerfile.ray-worker-nvidia .
|
|
||||||
#
|
#
|
||||||
|
# Build:
|
||||||
|
# docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-nvidia:latest \
|
||||||
|
# -f dockerfiles/Dockerfile.ray-worker-nvidia .
|
||||||
|
|
||||||
FROM rayproject/ray:2.53.0-py311-cu121
|
FROM rayproject/ray:2.53.0-py311-cu121
|
||||||
|
|
||||||
LABEL maintainer="billy-davies-2"
|
# OCI Image Spec labels
|
||||||
LABEL description="Ray worker for NVIDIA GPUs (Whisper, TTS)"
|
LABEL org.opencontainers.image.title="Ray Worker - NVIDIA GPU"
|
||||||
LABEL gpu.target="nvidia-cuda"
|
LABEL org.opencontainers.image.description="Ray Serve worker for NVIDIA GPUs (Whisper STT, XTTS TTS)"
|
||||||
|
LABEL org.opencontainers.image.vendor="DaviesTechLabs"
|
||||||
|
LABEL org.opencontainers.image.source="https://git.daviestechlabs.io/daviestechlabs/kuberay-images"
|
||||||
|
LABEL org.opencontainers.image.licenses="MIT"
|
||||||
|
LABEL gpu.target="nvidia-cuda-12.1"
|
||||||
|
LABEL ray.version="2.53.0"
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install system dependencies for audio processing
|
# Install system dependencies in a single layer with cleanup
|
||||||
USER root
|
USER root
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||||
ffmpeg \
|
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||||
libsndfile1 \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
git \
|
ffmpeg \
|
||||||
|
libsndfile1 \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install uv for fast Python package management (ADR-0014)
|
||||||
|
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||||
|
|
||||||
|
# Switch back to non-root ray user
|
||||||
USER ray
|
USER ray
|
||||||
|
|
||||||
# Install Python dependencies for inference
|
# Install Python dependencies with uv cache mount (10-100x faster than pip)
|
||||||
RUN pip install --no-cache-dir \
|
# Pinned versions for reproducibility
|
||||||
faster-whisper \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
openai-whisper \
|
uv pip install --system \
|
||||||
TTS \
|
'faster-whisper>=1.0.0,<2.0' \
|
||||||
soundfile \
|
'TTS>=0.22.0,<1.0' \
|
||||||
pydub \
|
'soundfile>=0.12.0,<1.0' \
|
||||||
librosa \
|
'pydub>=0.25.0,<1.0' \
|
||||||
torch \
|
'librosa>=0.10.0,<1.0' \
|
||||||
torchaudio \
|
'torch>=2.0.0,<3.0' \
|
||||||
fastapi \
|
'torchaudio>=2.0.0,<3.0' \
|
||||||
uvicorn \
|
'fastapi>=0.100.0,<1.0' \
|
||||||
httpx \
|
'uvicorn>=0.23.0,<1.0' \
|
||||||
pydantic
|
'httpx>=0.27.0,<1.0' \
|
||||||
|
'pydantic>=2.0.0,<3.0'
|
||||||
|
|
||||||
# Copy Ray Serve Python code
|
# Copy application code with proper ownership
|
||||||
COPY --chown=ray:ray ray-serve/ /app/ray_serve/
|
COPY --chown=ray:ray ray-serve/ /app/ray_serve/
|
||||||
ENV PYTHONPATH=/app
|
COPY --chown=ray:ray --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
|
||||||
|
|
||||||
# Copy Ray Serve entrypoint
|
# Environment configuration
|
||||||
COPY --chown=ray:ray dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
|
ENV PYTHONPATH=/app \
|
||||||
RUN chmod +x /app/ray-entrypoint.sh
|
PYTHONUNBUFFERED=1 \
|
||||||
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
CUDA_VISIBLE_DEVICES=0 \
|
||||||
|
RAY_HEAD_SVC="ai-inference-raycluster-head-svc" \
|
||||||
|
GPU_RESOURCE="gpu_nvidia" \
|
||||||
|
NUM_GPUS="1"
|
||||||
|
|
||||||
# Default environment variables
|
# Health check - verify Ray worker can connect
|
||||||
ENV CUDA_VISIBLE_DEVICES=0
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
||||||
ENV RAY_HEAD_SVC="ai-inference-raycluster-head-svc"
|
# Probe the Ray head's GCS (same address the entrypoint joins); a worker
# container does not host a GCS server on its own localhost:6379.
CMD ray status --address="${RAY_HEAD_SVC:-ray-head-svc}:6379" || exit 1
|
||||||
ENV GPU_RESOURCE="gpu_nvidia"
|
|
||||||
ENV NUM_GPUS="1"
|
|
||||||
|
|
||||||
ENTRYPOINT ["/app/ray-entrypoint.sh"]
|
ENTRYPOINT ["/app/ray-entrypoint.sh"]
|
||||||
|
|||||||
@@ -1,65 +1,94 @@
|
|||||||
# Ray Worker for AMD RDNA 2 (gfx1035 - Radeon 680M)
|
# syntax=docker/dockerfile:1.7
|
||||||
# Pre-bakes all dependencies for fast startup
|
# AMD RDNA 2 Ray Worker for drizzt (Radeon 680M - gfx1035)
|
||||||
|
# Used for: BGE Embeddings
|
||||||
#
|
#
|
||||||
# Build from llm-workflows root:
|
# Build:
|
||||||
# docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-rdna2:latest -f dockerfiles/Dockerfile.ray-worker-rdna2 .
|
# docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-rdna2:latest \
|
||||||
|
# -f dockerfiles/Dockerfile.ray-worker-rdna2 .
|
||||||
#
|
#
|
||||||
# Multi-stage build to ensure Python 3.11.11 matches Ray head node
|
# Multi-stage build: Extract ROCm from vendor image, use Ray base for Python 3.11
|
||||||
|
|
||||||
# Stage 1: Extract ROCm libraries from vendor image
|
# Stage 1: ROCm libraries from AMD vendor image
|
||||||
FROM docker.io/rocm/pytorch:rocm6.4.4_ubuntu22.04_py3.10_pytorch_release_2.7.1 AS rocm-libs
|
FROM docker.io/rocm/pytorch:rocm6.4.4_ubuntu22.04_py3.10_pytorch_release_2.7.1 AS rocm-source
|
||||||
|
|
||||||
# Stage 2: Build on Ray base with Python 3.11
|
# Stage 2: Production image
|
||||||
FROM rayproject/ray:2.53.0-py311 AS base
|
FROM rayproject/ray:2.53.0-py311 AS production
|
||||||
|
|
||||||
# Copy ROCm stack from vendor image
|
# OCI Image Spec labels
|
||||||
COPY --from=rocm-libs /opt/rocm /opt/rocm
|
LABEL org.opencontainers.image.title="Ray Worker - AMD RDNA 2"
|
||||||
|
LABEL org.opencontainers.image.description="Ray Serve worker for AMD RDNA 2 GPUs (BGE Embeddings)"
|
||||||
# Set up ROCm environment
|
LABEL org.opencontainers.image.vendor="DaviesTechLabs"
|
||||||
ENV ROCM_HOME=/opt/rocm
|
LABEL org.opencontainers.image.source="https://git.daviestechlabs.io/daviestechlabs/kuberay-images"
|
||||||
ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}"
|
LABEL org.opencontainers.image.licenses="MIT"
|
||||||
ENV LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}"
|
LABEL gpu.target="amd-rocm-6.4-gfx1035"
|
||||||
ENV HSA_PATH="${ROCM_HOME}/hsa"
|
LABEL ray.version="2.53.0"
|
||||||
ENV HIP_PATH="${ROCM_HOME}/hip"
|
|
||||||
|
|
||||||
# ROCm environment for RDNA 2 (gfx1035)
|
|
||||||
ENV HIP_VISIBLE_DEVICES=0 \
|
|
||||||
HSA_ENABLE_SDMA=0 \
|
|
||||||
PYTORCH_HIP_ALLOC_CONF=expandable_segments:True \
|
|
||||||
PYTHONPATH=/app
|
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install ROCm system dependencies
|
# Copy ROCm stack from vendor image (single COPY layer)
|
||||||
|
COPY --from=rocm-source /opt/rocm /opt/rocm
|
||||||
|
|
||||||
|
# ROCm environment variables
# ROCM_HOME gets its own ENV instruction: variable substitution inside a single
# ENV instruction uses the values from BEFORE that instruction, so defining
# ROCM_HOME alongside its users would expand ${ROCM_HOME} to an empty string
# (yielding PATH="/bin:/llvm/bin:...", HSA_PATH="/hsa", and so on).
ENV ROCM_HOME=/opt/rocm
ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}" \
    LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}" \
    HSA_PATH="${ROCM_HOME}/hsa" \
    HIP_PATH="${ROCM_HOME}/hip"

# RDNA 2 specific settings
ENV HIP_VISIBLE_DEVICES=0 \
    HSA_ENABLE_SDMA=0 \
    PYTORCH_HIP_ALLOC_CONF=expandable_segments:True
|
||||||
|
|
||||||
|
# Install system dependencies
|
||||||
USER root
|
USER root
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||||
libelf1 \
|
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||||
libnuma1 \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libdrm2 \
|
libelf1 \
|
||||||
libdrm-amdgpu1 \
|
libnuma1 \
|
||||||
kmod \
|
libdrm2 \
|
||||||
|
libdrm-amdgpu1 \
|
||||||
|
kmod \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install uv for fast Python package management (ADR-0014)
|
||||||
|
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||||
|
|
||||||
USER ray
|
USER ray
|
||||||
|
|
||||||
# Install PyTorch ROCm wheels compatible with Python 3.11 and ROCm 6.2
|
# Install PyTorch with ROCm 6.2 wheels for Python 3.11 (uv is 10-100x faster)
|
||||||
RUN pip install --no-cache-dir \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
torch==2.5.1 torchvision torchaudio \
|
uv pip install --system \
|
||||||
--index-url https://download.pytorch.org/whl/rocm6.2
|
torch==2.5.1 torchvision torchaudio \
|
||||||
|
--index-url https://download.pytorch.org/whl/rocm6.2
|
||||||
|
|
||||||
# Install Ray Serve and AI inference dependencies
|
# Install inference dependencies
|
||||||
RUN pip install --no-cache-dir \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
transformers \
|
uv pip install --system \
|
||||||
accelerate \
|
'transformers>=4.35.0,<5.0' \
|
||||||
sentence-transformers \
|
'accelerate>=0.25.0,<1.0' \
|
||||||
httpx \
|
'sentence-transformers>=2.3.0,<3.0' \
|
||||||
numpy \
|
'httpx>=0.27.0,<1.0' \
|
||||||
scipy
|
'numpy>=1.26.0,<2.0' \
|
||||||
|
'scipy>=1.11.0,<2.0'
|
||||||
|
|
||||||
# Pre-download embedding model for faster cold starts
|
# Pre-download embedding model for faster cold starts
|
||||||
RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')"
|
RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')"
|
||||||
|
|
||||||
# Copy application code
|
# Copy application code
|
||||||
COPY ray-serve/ /app/ray_serve/
|
COPY --chown=ray:ray ray-serve/ /app/ray_serve/
|
||||||
COPY --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
|
COPY --chown=ray:ray --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
|
||||||
|
|
||||||
|
# Environment configuration
|
||||||
|
ENV PYTHONPATH=/app \
|
||||||
|
PYTHONUNBUFFERED=1 \
|
||||||
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
RAY_HEAD_SVC="ai-inference-raycluster-head-svc" \
|
||||||
|
GPU_RESOURCE="gpu_amd_rdna2" \
|
||||||
|
NUM_GPUS="1"
|
||||||
|
|
||||||
|
# Health check: verify this worker can reach the Ray head's GCS.
# The probe targets the head service (the same address the entrypoint joins,
# with the same default) instead of localhost, because a worker container does
# not host a GCS server on its own port 6379.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD ray status --address="${RAY_HEAD_SVC:-ray-head-svc}:6379" || exit 1
|
||||||
|
|
||||||
ENTRYPOINT ["/app/ray-entrypoint.sh"]
|
ENTRYPOINT ["/app/ray-entrypoint.sh"]
|
||||||
|
|||||||
@@ -1,72 +1,100 @@
|
|||||||
# Ray Worker for AMD Strix Halo (gfx1151 / RDNA 3.5)
|
# syntax=docker/dockerfile:1.7
|
||||||
# Pre-bakes all dependencies for fast startup
|
# AMD Strix Halo Ray Worker for khelben (gfx1151 / RDNA 3.5)
|
||||||
|
# Used for: vLLM (Llama 3.1 70B)
|
||||||
#
|
#
|
||||||
# Build from llm-workflows root:
|
# Build:
|
||||||
# docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-strixhalo:latest -f dockerfiles/Dockerfile.ray-worker-strixhalo .
|
# docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-strixhalo:latest \
|
||||||
|
# -f dockerfiles/Dockerfile.ray-worker-strixhalo .
|
||||||
#
|
#
|
||||||
# Multi-stage build to ensure Python 3.11.11 matches Ray head node
|
# Multi-stage build: Extract ROCm 7.1 from vendor image, use Ray base for Python 3.11
|
||||||
|
# Note: Uses TheRock gfx110X wheels due to ROCm/ROCm#5853 segfault issue
|
||||||
|
|
||||||
# Stage 1: Extract ROCm 7.1 libraries from vendor image
|
# Stage 1: ROCm 7.1 libraries from AMD vendor image
|
||||||
FROM docker.io/rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 AS rocm-libs
|
FROM docker.io/rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 AS rocm-source
|
||||||
|
|
||||||
# Stage 2: Build on Ray base with Python 3.11
|
# Stage 2: Production image
|
||||||
FROM rayproject/ray:2.53.0-py311 AS base
|
FROM rayproject/ray:2.53.0-py311 AS production
|
||||||
|
|
||||||
# Copy ROCm stack from vendor image
|
# OCI Image Spec labels
|
||||||
COPY --from=rocm-libs /opt/rocm /opt/rocm
|
LABEL org.opencontainers.image.title="Ray Worker - AMD Strix Halo"
|
||||||
|
LABEL org.opencontainers.image.description="Ray Serve worker for AMD Strix Halo (vLLM LLM inference)"
|
||||||
# Set up ROCm environment
|
LABEL org.opencontainers.image.vendor="DaviesTechLabs"
|
||||||
ENV ROCM_HOME=/opt/rocm
|
LABEL org.opencontainers.image.source="https://git.daviestechlabs.io/daviestechlabs/kuberay-images"
|
||||||
ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}"
|
LABEL org.opencontainers.image.licenses="MIT"
|
||||||
ENV LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}"
|
LABEL gpu.target="amd-rocm-7.1-gfx1151"
|
||||||
ENV HSA_PATH="${ROCM_HOME}/hsa"
|
LABEL ray.version="2.53.0"
|
||||||
ENV HIP_PATH="${ROCM_HOME}/hip"
|
|
||||||
|
|
||||||
# ROCm environment for AMD Strix Halo (gfx1151 / RDNA 3.5)
|
|
||||||
ENV HIP_VISIBLE_DEVICES=0
|
|
||||||
ENV HSA_ENABLE_SDMA=0
|
|
||||||
ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
|
|
||||||
ENV HSA_OVERRIDE_GFX_VERSION=11.0.0
|
|
||||||
ENV ROCM_TARGET_LST=gfx1151,gfx1100
|
|
||||||
ENV PYTHONPATH=/app
|
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install ROCm system dependencies
|
# Copy ROCm stack from vendor image
|
||||||
|
COPY --from=rocm-source /opt/rocm /opt/rocm
|
||||||
|
|
||||||
|
# ROCm environment variables
# ROCM_HOME gets its own ENV instruction: variable substitution inside a single
# ENV instruction uses the values from BEFORE that instruction, so defining
# ROCM_HOME alongside its users would expand ${ROCM_HOME} to an empty string
# (yielding PATH="/bin:/llvm/bin:...", HSA_PATH="/hsa", and so on).
ENV ROCM_HOME=/opt/rocm
ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}" \
    LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}" \
    HSA_PATH="${ROCM_HOME}/hsa" \
    HIP_PATH="${ROCM_HOME}/hip"

# Strix Halo (gfx1151) specific settings
ENV HIP_VISIBLE_DEVICES=0 \
    HSA_ENABLE_SDMA=0 \
    PYTORCH_HIP_ALLOC_CONF="expandable_segments:True,max_split_size_mb:512" \
    HSA_OVERRIDE_GFX_VERSION="11.0.0" \
    ROCM_TARGET_LST="gfx1151,gfx1100"
|
||||||
|
|
||||||
|
# Install system dependencies
|
||||||
USER root
|
USER root
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||||
libelf1 \
|
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||||
libnuma1 \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libdrm2 \
|
libelf1 \
|
||||||
libdrm-amdgpu1 \
|
libnuma1 \
|
||||||
kmod \
|
libdrm2 \
|
||||||
|
libdrm-amdgpu1 \
|
||||||
|
kmod \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install uv for fast Python package management (ADR-0014)
|
||||||
|
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||||
|
|
||||||
USER ray
|
USER ray
|
||||||
|
|
||||||
# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
|
# WORKAROUND: ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
|
||||||
# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
|
# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
|
||||||
# TheRock gfx110X-all packages provide Python 3.11 compatible wheels.
|
# TheRock gfx110X-all packages provide compatible Python 3.11 wheels.
|
||||||
RUN pip install --no-cache-dir \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
--index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
|
uv pip install --system \
|
||||||
torch torchaudio torchvision
|
--index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
|
||||||
|
torch torchaudio torchvision
|
||||||
|
|
||||||
# Install Ray Serve and AI inference dependencies
|
# Install vLLM and inference dependencies (uv is 10-100x faster than pip)
|
||||||
RUN pip install --no-cache-dir \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
vllm \
|
uv pip install --system \
|
||||||
transformers \
|
'vllm>=0.5.0' \
|
||||||
accelerate \
|
'transformers>=4.35.0,<5.0' \
|
||||||
sentence-transformers \
|
'accelerate>=0.25.0,<1.0' \
|
||||||
httpx \
|
'sentence-transformers>=2.3.0,<3.0' \
|
||||||
numpy \
|
'httpx>=0.27.0,<1.0' \
|
||||||
scipy
|
'numpy>=1.26.0,<2.0' \
|
||||||
|
'scipy>=1.11.0,<2.0'
|
||||||
|
|
||||||
# Pre-download common models for faster cold starts
|
# Pre-download common models for faster cold starts (optional, increases image size)
|
||||||
RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')" || true
|
# RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')"
|
||||||
|
|
||||||
# Copy Ray Serve Python code
|
# Copy application code
|
||||||
COPY ray-serve/ /app/ray_serve/
|
COPY --chown=ray:ray ray-serve/ /app/ray_serve/
|
||||||
|
COPY --chown=ray:ray --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
|
||||||
|
|
||||||
# Ray worker entrypoint
|
# Environment configuration
|
||||||
COPY --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
|
ENV PYTHONPATH=/app \
|
||||||
|
PYTHONUNBUFFERED=1 \
|
||||||
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
RAY_HEAD_SVC="ai-inference-raycluster-head-svc" \
|
||||||
|
GPU_RESOURCE="gpu_amd_strixhalo" \
|
||||||
|
NUM_GPUS="1"
|
||||||
|
|
||||||
|
# Health check: verify this worker can reach the Ray head's GCS.
# The probe targets the head service (the same address the entrypoint joins,
# with the same default) instead of localhost, because a worker container does
# not host a GCS server on its own port 6379.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD ray status --address="${RAY_HEAD_SVC:-ray-head-svc}:6379" || exit 1
|
||||||
|
|
||||||
ENTRYPOINT ["/app/ray-entrypoint.sh"]
|
ENTRYPOINT ["/app/ray-entrypoint.sh"]
|
||||||
|
|||||||
@@ -1,27 +1,64 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Ray Worker Entrypoint
|
# Ray Worker Entrypoint
|
||||||
# Connects to Ray head node and registers custom resources
|
# Connects to Ray head node and registers custom GPU resources
|
||||||
|
#
|
||||||
|
# Environment variables:
|
||||||
|
# RAY_HEAD_SVC - Ray head service name (default: ray-head-svc)
|
||||||
|
# GPU_RESOURCE - Custom GPU resource name (default: gpu_amd)
|
||||||
|
# NUM_GPUS - Number of GPUs to register (default: 1)
|
||||||
|
# RAY_OBJECT_STORE_MEMORY - Object store memory limit, passed to `ray start --object-store-memory` (optional; omitted when unset or empty)
|
||||||
|
|
||||||
set -e
|
set -euo pipefail
|
||||||
|
|
||||||
# Ensure Ray is in PATH (works across all base images)
|
# Ensure Ray CLI is in PATH (works across all base images)
|
||||||
export PATH="/home/ray/.local/bin:/home/ray/anaconda3/bin:${PATH}"
|
export PATH="/home/ray/.local/bin:/home/ray/anaconda3/bin:${PATH}"
|
||||||
|
|
||||||
# Get Ray head address from environment or default
|
# Configuration with defaults
|
||||||
RAY_HEAD_ADDRESS="${RAY_HEAD_SVC:-ray-head-svc}:6379"
|
RAY_HEAD_ADDRESS="${RAY_HEAD_SVC:-ray-head-svc}:6379"
|
||||||
|
|
||||||
# Get custom resources from environment
|
|
||||||
GPU_RESOURCE="${GPU_RESOURCE:-gpu_amd}"
|
GPU_RESOURCE="${GPU_RESOURCE:-gpu_amd}"
|
||||||
NUM_GPUS="${NUM_GPUS:-1}"
|
NUM_GPUS="${NUM_GPUS:-1}"
|
||||||
|
|
||||||
echo "Starting Ray worker..."
|
# Log startup info
|
||||||
echo " Head address: $RAY_HEAD_ADDRESS"
|
echo "============================================="
|
||||||
echo " GPU resource: $GPU_RESOURCE"
|
echo "Ray Worker Starting"
|
||||||
echo " Num GPUs: $NUM_GPUS"
|
echo "============================================="
|
||||||
|
echo " Head address: ${RAY_HEAD_ADDRESS}"
|
||||||
|
echo " GPU resource: ${GPU_RESOURCE}"
|
||||||
|
echo " Num GPUs: ${NUM_GPUS}"
|
||||||
|
echo " Python: $(python3 --version)"
|
||||||
|
echo " Ray version: $(ray --version)"
|
||||||
|
echo "============================================="
|
||||||
|
|
||||||
# Start Ray worker with custom resources
|
# Wait for the Ray head to pass `ray health-check`: up to MAX_RETRIES attempts, RETRY_INTERVAL seconds apart; exit 1 if it never responds
|
||||||
exec ray start \
|
MAX_RETRIES=30
|
||||||
--address="$RAY_HEAD_ADDRESS" \
|
RETRY_INTERVAL=5
|
||||||
--num-gpus="$NUM_GPUS" \
|
retry_count=0
|
||||||
--resources="{\"$GPU_RESOURCE\": 1}" \
|
|
||||||
|
echo "Waiting for Ray head node..."
|
||||||
|
until ray health-check --address="${RAY_HEAD_ADDRESS}" 2>/dev/null; do
|
||||||
|
retry_count=$((retry_count + 1))
|
||||||
|
if [ $retry_count -ge $MAX_RETRIES ]; then
|
||||||
|
echo "ERROR: Ray head not available after ${MAX_RETRIES} attempts"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo " Attempt ${retry_count}/${MAX_RETRIES} - retrying in ${RETRY_INTERVAL}s..."
|
||||||
|
sleep "${RETRY_INTERVAL}"
|
||||||
|
done
|
||||||
|
echo "Ray head is ready!"
|
||||||
|
|
||||||
|
# Build ray start command with optional args
|
||||||
|
RAY_START_ARGS=(
|
||||||
|
--address="${RAY_HEAD_ADDRESS}"
|
||||||
|
--num-gpus="${NUM_GPUS}"
|
||||||
|
--resources="{\"${GPU_RESOURCE}\": 1}"
|
||||||
--block
|
--block
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add object store memory limit if specified
|
||||||
|
if [ -n "${RAY_OBJECT_STORE_MEMORY:-}" ]; then
|
||||||
|
RAY_START_ARGS+=(--object-store-memory="${RAY_OBJECT_STORE_MEMORY}")
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Start Ray worker
|
||||||
|
echo "Starting Ray worker with resources: {\"${GPU_RESOURCE}\": 1}"
|
||||||
|
exec ray start "${RAY_START_ARGS[@]}"
|
||||||
|
|||||||
Reference in New Issue
Block a user