diff --git a/.gitea/workflows/build-push.yaml b/.gitea/workflows/build-push.yaml
new file mode 100644
index 0000000..45a7426
--- /dev/null
+++ b/.gitea/workflows/build-push.yaml
@@ -0,0 +1,195 @@
+# Builds the four GPU-specific Ray worker images. Pushes to `main` and `v*`
+# tags publish to the Gitea registry; pull requests only verify the build.
+name: Build and Push Images
+
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - 'v*'
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+    inputs:
+      image:
+        description: 'Image to build (all, nvidia, rdna2, strixhalo, intel)'
+        required: false
+        default: 'all'
+
+env:
+  REGISTRY: git.daviestechlabs.io/daviestechlabs
+
+jobs:
+  build-nvidia:
+    # Manual-dispatch filter; the input is empty on push/PR events, which build every image
+    if: github.event.inputs.image == 'all' || github.event.inputs.image == 'nvidia' || github.event.inputs.image == ''
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Gitea Registry
+        # PR builds never push, and fork PRs may not receive the registry secrets
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: git.daviestechlabs.io
+          username: ${{ secrets.REGISTRY_USER }}
+          password: ${{ secrets.REGISTRY_TOKEN }}
+
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/ray-worker-nvidia
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: dockerfiles/Dockerfile.ray-worker-nvidia
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+  build-rdna2:
+    # Manual-dispatch filter; the input is empty on push/PR events, which build every image
+    if: github.event.inputs.image == 'all' || github.event.inputs.image == 'rdna2' || github.event.inputs.image == ''
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Gitea Registry
+        # PR builds never push, and fork PRs may not receive the registry secrets
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: git.daviestechlabs.io
+          username: ${{ secrets.REGISTRY_USER }}
+          password: ${{ secrets.REGISTRY_TOKEN }}
+
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/ray-worker-rdna2
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: dockerfiles/Dockerfile.ray-worker-rdna2
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+  build-strixhalo:
+    # Manual-dispatch filter; the input is empty on push/PR events, which build every image
+    if: github.event.inputs.image == 'all' || github.event.inputs.image == 'strixhalo' || github.event.inputs.image == ''
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Gitea Registry
+        # PR builds never push, and fork PRs may not receive the registry secrets
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: git.daviestechlabs.io
+          username: ${{ secrets.REGISTRY_USER }}
+          password: ${{ secrets.REGISTRY_TOKEN }}
+
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/ray-worker-strixhalo
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: dockerfiles/Dockerfile.ray-worker-strixhalo
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+  build-intel:
+    # Manual-dispatch filter; the input is empty on push/PR events, which build every image
+    if: github.event.inputs.image == 'all' || github.event.inputs.image == 'intel' || github.event.inputs.image == ''
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Gitea Registry
+        # PR builds never push, and fork PRs may not receive the registry secrets
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: git.daviestechlabs.io
+          username: ${{ secrets.REGISTRY_USER }}
+          password: ${{ secrets.REGISTRY_TOKEN }}
+
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/ray-worker-intel
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: dockerfiles/Dockerfile.ray-worker-intel
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..517b3c2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+# Build artifacts
+*.log
+*.tmp
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+.pytest_cache/
+.venv/
+venv/
+.env
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Docker
+.docker/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8f8fe0f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,106 @@
+# KubeRay Images Makefile
+# Build and push GPU-specific Ray worker images
+
+REGISTRY := git.daviestechlabs.io/daviestechlabs
+TAG := latest
+
+# Registry host without the org path; `docker login` takes a server, not a repo
+REGISTRY_HOST := $(firstword $(subst /, ,$(REGISTRY)))
+
+# GPU flavours: each one has a build-<gpu> and a push-<gpu> target
+GPUS := nvidia rdna2 strixhalo intel
+
+# Image names
+IMAGES := $(addprefix ray-worker-,$(GPUS))
+
+# No target here produces a file of its own name, so all of them are phony
+.PHONY: all build-all push-all release login clean help \
+        $(addprefix build-,$(GPUS)) $(addprefix push-,$(GPUS))
+
+help:
+	@echo "KubeRay Images Build System"
+	@echo ""
+	@echo "Usage:"
+	@echo "  make build-all                 Build all images"
+	@echo "  make push-all                  Push all images to registry"
+	@echo "  make build-nvidia              Build NVIDIA worker image"
+	@echo "  make build-rdna2               Build AMD RDNA2 worker image"
+	@echo "  make build-strixhalo           Build AMD Strix Halo worker image"
+	@echo "  make build-intel               Build Intel XPU worker image"
+	@echo "  make push-nvidia               Push NVIDIA worker image"
+	@echo "  make TAG=v1.0.0 push-all       Push with specific tag"
+	@echo "  make VERSION=v1.0.0 release    Build, push and retag as latest"
+	@echo "  make login                     Log in to the registry"
+	@echo ""
+	@echo "Environment:"
+	@echo "  REGISTRY=$(REGISTRY)"
+	@echo "  TAG=$(TAG)"
+
+# Build targets
+build-nvidia:
+	docker build \
+		-t $(REGISTRY)/ray-worker-nvidia:$(TAG) \
+		-f dockerfiles/Dockerfile.ray-worker-nvidia \
+		.
+
+build-rdna2:
+	docker build \
+		-t $(REGISTRY)/ray-worker-rdna2:$(TAG) \
+		-f dockerfiles/Dockerfile.ray-worker-rdna2 \
+		.
+
+build-strixhalo:
+	docker build \
+		-t $(REGISTRY)/ray-worker-strixhalo:$(TAG) \
+		-f dockerfiles/Dockerfile.ray-worker-strixhalo \
+		.
+
+build-intel:
+	docker build \
+		-t $(REGISTRY)/ray-worker-intel:$(TAG) \
+		-f dockerfiles/Dockerfile.ray-worker-intel \
+		.
+
+build-all: build-nvidia build-rdna2 build-strixhalo build-intel
+	@echo "All images built successfully"
+
+# Push targets
+push-nvidia:
+	docker push $(REGISTRY)/ray-worker-nvidia:$(TAG)
+
+push-rdna2:
+	docker push $(REGISTRY)/ray-worker-rdna2:$(TAG)
+
+push-strixhalo:
+	docker push $(REGISTRY)/ray-worker-strixhalo:$(TAG)
+
+push-intel:
+	docker push $(REGISTRY)/ray-worker-intel:$(TAG)
+
+push-all: push-nvidia push-rdna2 push-strixhalo push-intel
+	@echo "All images pushed successfully"
+
+# Build and push VERSION, then publish the very same images as latest.
+# Retagging instead of rebuilding keeps both tags pointing at one image.
+release:
+ifndef VERSION
+	$(error VERSION is not set. Usage: make VERSION=v1.0.0 release)
+endif
+	@echo "Releasing version $(VERSION)"
+	$(MAKE) TAG=$(VERSION) build-all
+	$(MAKE) TAG=$(VERSION) push-all
+	@for image in $(IMAGES); do \
+		docker tag $(REGISTRY)/$$image:$(VERSION) $(REGISTRY)/$$image:latest || exit 1; \
+	done
+	$(MAKE) TAG=latest push-all
+
+# Login to registry
+login:
+	docker login $(REGISTRY_HOST)
+
+# Clean local images
+clean:
+	-docker rmi $(REGISTRY)/ray-worker-nvidia:$(TAG)
+	-docker rmi $(REGISTRY)/ray-worker-rdna2:$(TAG)
+	-docker rmi $(REGISTRY)/ray-worker-strixhalo:$(TAG)
+	-docker rmi $(REGISTRY)/ray-worker-intel:$(TAG)
diff --git a/README.md b/README.md
index ac377fc..39e8346 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,89 @@
-# kuberay-images
+# KubeRay Worker Images
 
-Where all my kuberay images will go
\ No newline at end of file
+GPU-specific Ray worker images for the DaviesTechLabs AI/ML platform.
+
+## Images
+
+| Image | GPU Target | Workloads | Registry |
+|-------|------------|-----------|----------|
+| `ray-worker-nvidia` | NVIDIA CUDA (RTX 2070) | Whisper STT, XTTS TTS | `git.daviestechlabs.io/daviestechlabs/ray-worker-nvidia` |
+| `ray-worker-rdna2` | AMD ROCm (Radeon 680M) | BGE Embeddings | `git.daviestechlabs.io/daviestechlabs/ray-worker-rdna2` |
+| `ray-worker-strixhalo` | AMD ROCm (Strix Halo) | vLLM, BGE | `git.daviestechlabs.io/daviestechlabs/ray-worker-strixhalo` |
+| `ray-worker-intel` | Intel XPU (Arc) | BGE Reranker | `git.daviestechlabs.io/daviestechlabs/ray-worker-intel` |
+
+## Building Locally
+
+```bash
+# Build all images
+make build-all
+
+# Build a specific image
+make build-nvidia
+make build-rdna2
+make build-strixhalo
+make build-intel
+
+# Push to Gitea registry (requires login)
+docker login git.daviestechlabs.io
+make push-all
+```
+
+## CI/CD
+
+Images are automatically built and pushed to the `git.daviestechlabs.io` package registry on the events below (pull requests against `main` are built but not pushed):
+- Push to `main` branch
+- Git tag creation (e.g., `v1.0.0`)
+
+### Gitea Actions Secrets Required
+
+Add these secrets in Gitea repo settings → Actions → Secrets:
+
+| Secret | Description |
+|--------|-------------|
+| `REGISTRY_USER` | Gitea username |
+| `REGISTRY_TOKEN` | Gitea access token with `package:write` scope |
+
+## Directory Structure
+
+```
+kuberay-images/
+├── dockerfiles/
+│   ├── Dockerfile.ray-worker-nvidia
+│   ├── Dockerfile.ray-worker-rdna2
+│   ├── Dockerfile.ray-worker-strixhalo
+│   ├── Dockerfile.ray-worker-intel
+│   └── ray-entrypoint.sh
+├── ray-serve/
+│   ├── serve_embeddings.py
+│   ├── serve_whisper.py
+│   ├── serve_tts.py
+│   ├── serve_llm.py
+│   ├── serve_reranker.py
+│   └── requirements.txt
+├── .gitea/workflows/
+│   └── build-push.yaml
+├── Makefile
+└── README.md
+```
+
+## Environment Variables
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `RAY_HEAD_SVC` | Ray head service name | `ai-inference-raycluster-head-svc` |
+| `GPU_RESOURCE` |
Custom Ray resource name | `gpu_nvidia`, `gpu_amd`, etc. |
+| `NUM_GPUS` | Number of GPUs to expose | `1` |
+
+## Node Allocation
+
+| Node | Image | GPU | Memory |
+|------|-------|-----|--------|
+| elminster | ray-worker-nvidia | RTX 2070 | 8GB VRAM |
+| khelben | ray-worker-strixhalo | Strix Halo | 64GB Unified |
+| drizzt | ray-worker-rdna2 | Radeon 680M | 12GB VRAM |
+| danilo | ray-worker-intel | Intel Arc | 16GB Shared |
+
+## Related
+
+- [homelab-design](https://git.daviestechlabs.io/daviestechlabs/homelab-design) - Architecture documentation
+- [homelab-k8s2](https://github.com/Billy-Davies-2/homelab-k8s2) - Kubernetes manifests
diff --git a/dockerfiles/Dockerfile.ray-worker-intel b/dockerfiles/Dockerfile.ray-worker-intel
new file mode 100644
index 0000000..0c664ea
--- /dev/null
+++ b/dockerfiles/Dockerfile.ray-worker-intel
@@ -0,0 +1,78 @@
+# Intel GPU Ray Worker for danilo (Intel i915 iGPU)
+# Used for: Reranker
+#
+# Build from the repository root (COPY paths below are relative to it):
+#   docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-intel:latest -f dockerfiles/Dockerfile.ray-worker-intel .
+#
+# Single-stage build on the Ray py311 base so Python matches the Ray head node
+FROM rayproject/ray:2.53.0-py311 AS base
+
+LABEL maintainer="billy-davies-2"
+LABEL description="Ray worker for Intel GPUs (Reranker)"
+LABEL gpu.target="intel-xpu"
+
+WORKDIR /app
+
+# Install system dependencies for Intel GPU support
+USER root
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    curl \
+    wget \
+    gnupg2 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Add Intel oneAPI repository for runtime libraries
+RUN wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor -o /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/intel-oneapi.list
+
+# Add Intel compute-runtime repository for Level Zero
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor -o /usr/share/keyrings/intel-graphics-archive-keyring.gpg && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-graphics-archive-keyring.gpg arch=amd64] https://repositories.intel.com/gpu/ubuntu jammy client" > /etc/apt/sources.list.d/intel-gpu.list && \
+    apt-get update && apt-get install -y --no-install-recommends \
+    intel-oneapi-runtime-opencl \
+    intel-oneapi-runtime-compilers \
+    intel-level-zero-gpu \
+    level-zero \
+    && rm -rf /var/lib/apt/lists/*
+
+USER ray
+
+# Ensure Ray CLI is in PATH
+ENV PATH="/home/ray/.local/bin:${PATH}"
+
+# Install Intel Extension for PyTorch (IPEX) for Python 3.11
+# This provides XPU support for Intel GPUs
+RUN pip install --no-cache-dir \
+    torch==2.5.1 \
+    intel-extension-for-pytorch==2.5.10+xpu \
+    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+
+# Install Ray Serve and AI inference dependencies
+RUN pip install --no-cache-dir \
+    sentence-transformers \
+    FlagEmbedding \
+    fastapi \
+    uvicorn \
+    httpx \
+    pydantic \
+    transformers \
+    huggingface_hub
+
+# Copy Ray Serve Python code (COPY ignores USER, so set the owner explicitly)
+COPY --chown=ray:ray ray-serve/ /app/ray_serve/
+ENV PYTHONPATH=/app
+
+# Copy Ray Serve entrypoint
+COPY --chown=ray:ray --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
+
+# Default environment variables
+ENV RAY_HEAD_SVC="ai-inference-raycluster-head-svc"
+ENV GPU_RESOURCE="gpu_intel"
+ENV NUM_GPUS="1"
+# Intel XPU settings
+ENV ZE_AFFINITY_MASK=0
+# ONEAPI_DEVICE_SELECTOR replaces the deprecated SYCL_DEVICE_FILTER
+ENV ONEAPI_DEVICE_SELECTOR=level_zero:gpu
+
+ENTRYPOINT ["/app/ray-entrypoint.sh"]
diff --git a/dockerfiles/Dockerfile.ray-worker-nvidia b/dockerfiles/Dockerfile.ray-worker-nvidia
new file mode 100644
index 0000000..ca4b1ad
--- /dev/null
+++ b/dockerfiles/Dockerfile.ray-worker-nvidia
@@ -0,0 +1,53 @@
+# NVIDIA GPU Ray Worker for elminster (RTX 2070)
+# Used for: Whisper STT, TTS
+#
+# Build from llm-workflows root:
+#   docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-nvidia:latest -f dockerfiles/Dockerfile.ray-worker-nvidia .
+#
+FROM rayproject/ray:2.53.0-py311-cu121
+
+LABEL maintainer="billy-davies-2"
+LABEL description="Ray worker for NVIDIA GPUs (Whisper, TTS)"
+LABEL gpu.target="nvidia-cuda"
+
+WORKDIR /app
+
+# Install system dependencies for audio processing
+USER root
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    libsndfile1 \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+USER ray
+
+# Install Python dependencies for inference
+RUN pip install --no-cache-dir \
+    faster-whisper \
+    openai-whisper \
+    TTS \
+    soundfile \
+    pydub \
+    librosa \
+    torch \
+    torchaudio \
+    fastapi \
+    uvicorn \
+    httpx \
+    pydantic
+
+# Copy Ray Serve Python code
+COPY --chown=ray:ray ray-serve/ /app/ray_serve/
+ENV PYTHONPATH=/app
+
+# Copy Ray Serve entrypoint
+COPY --chown=ray:ray dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
+RUN chmod +x /app/ray-entrypoint.sh
+
+# Default environment variables
+ENV CUDA_VISIBLE_DEVICES=0
+ENV RAY_HEAD_SVC="ai-inference-raycluster-head-svc"
+ENV GPU_RESOURCE="gpu_nvidia"
+ENV NUM_GPUS="1"
+
+ENTRYPOINT ["/app/ray-entrypoint.sh"]
diff --git a/dockerfiles/Dockerfile.ray-worker-rdna2 b/dockerfiles/Dockerfile.ray-worker-rdna2
new file mode 100644
index 0000000..8ef1ef4
--- /dev/null
+++ b/dockerfiles/Dockerfile.ray-worker-rdna2
@@ -0,0 +1,70 @@
+# Ray Worker for AMD RDNA 2 (gfx1035 - Radeon 680M)
+# Pre-bakes all dependencies for fast startup
+#
+# Build from the repository root (COPY paths below are relative to it):
+#   docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-rdna2:latest -f dockerfiles/Dockerfile.ray-worker-rdna2 .
+#
+# Multi-stage build to ensure Python 3.11.11 matches Ray head node
+
+# Stage 1: Extract ROCm libraries from vendor image
+FROM docker.io/rocm/pytorch:rocm6.4.4_ubuntu22.04_py3.10_pytorch_release_2.7.1 AS rocm-libs
+
+# Stage 2: Build on Ray base with Python 3.11
+FROM rayproject/ray:2.53.0-py311 AS base
+
+# Copy ROCm stack from vendor image
+COPY --from=rocm-libs /opt/rocm /opt/rocm
+
+# Set up ROCm environment
+ENV ROCM_HOME=/opt/rocm
+ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}"
+ENV LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}"
+ENV HSA_PATH="${ROCM_HOME}/hsa"
+ENV HIP_PATH="${ROCM_HOME}/hip"
+
+# ROCm environment for RDNA 2 (gfx1035)
+# gfx1035 is not a build target of the stock ROCm PyTorch wheels, so the iGPU
+# is presented as gfx1030 (10.3.0); override at runtime if that ever changes.
+ENV HIP_VISIBLE_DEVICES=0 \
+    HSA_ENABLE_SDMA=0 \
+    HSA_OVERRIDE_GFX_VERSION=10.3.0 \
+    PYTORCH_HIP_ALLOC_CONF=expandable_segments:True \
+    PYTHONPATH=/app
+
+WORKDIR /app
+
+# Install ROCm system dependencies
+USER root
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libelf1 \
+    libnuma1 \
+    libdrm2 \
+    libdrm-amdgpu1 \
+    kmod \
+    && rm -rf /var/lib/apt/lists/*
+USER ray
+
+# Install PyTorch ROCm wheels compatible with Python 3.11 and ROCm 6.2
+# NOTE(review): /opt/rocm above comes from a ROCm 6.4.4 image while these
+# wheels target ROCm 6.2 - confirm the version mix is intended.
+RUN pip install --no-cache-dir \
+    torch==2.5.1 torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/rocm6.2
+
+# Install Ray Serve and AI inference dependencies
+RUN pip install --no-cache-dir \
+    transformers \
+    accelerate \
+    sentence-transformers \
+    httpx \
+    numpy \
+    scipy
+
+# Pre-download embedding model for faster cold starts
+RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')"
+
+# Copy application code (COPY ignores USER, so set the owner explicitly)
+COPY --chown=ray:ray ray-serve/ /app/ray_serve/
+COPY --chown=ray:ray --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
+
+ENTRYPOINT ["/app/ray-entrypoint.sh"]
diff --git a/dockerfiles/Dockerfile.ray-worker-strixhalo b/dockerfiles/Dockerfile.ray-worker-strixhalo
new file mode 100644
index 0000000..e176cb4
--- /dev/null
+++ b/dockerfiles/Dockerfile.ray-worker-strixhalo
@@ -0,0 +1,72 @@
+# Ray Worker for AMD Strix Halo (gfx1151 / RDNA 3.5)
+# Pre-bakes all dependencies for fast startup
+#
+# Build from llm-workflows root:
+#   docker build -t git.daviestechlabs.io/daviestechlabs/ray-worker-strixhalo:latest -f dockerfiles/Dockerfile.ray-worker-strixhalo .
+#
+# Multi-stage build to ensure Python 3.11.11 matches Ray head node
+
+# Stage 1: Extract ROCm 7.1 libraries from vendor image
+FROM docker.io/rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.9.1 AS rocm-libs
+
+# Stage 2: Build on Ray base with Python 3.11
+FROM rayproject/ray:2.53.0-py311 AS base
+
+# Copy ROCm stack from vendor image
+COPY --from=rocm-libs /opt/rocm /opt/rocm
+
+# Set up ROCm environment
+ENV ROCM_HOME=/opt/rocm
+ENV PATH="${ROCM_HOME}/bin:${ROCM_HOME}/llvm/bin:${PATH}"
+ENV LD_LIBRARY_PATH="${ROCM_HOME}/lib:${ROCM_HOME}/lib64:${LD_LIBRARY_PATH}"
+ENV HSA_PATH="${ROCM_HOME}/hsa"
+ENV HIP_PATH="${ROCM_HOME}/hip"
+
+# ROCm environment for AMD Strix Halo (gfx1151 / RDNA 3.5)
+ENV HIP_VISIBLE_DEVICES=0
+ENV HSA_ENABLE_SDMA=0
+ENV PYTORCH_HIP_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
+ENV HSA_OVERRIDE_GFX_VERSION=11.0.0
+ENV ROCM_TARGET_LST=gfx1151,gfx1100
+ENV PYTHONPATH=/app
+
+WORKDIR /app
+
+# Install ROCm system dependencies
+USER root
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libelf1 \
+    libnuma1 \
+    libdrm2 \
+    libdrm-amdgpu1 \
+    kmod \
+    && rm -rf /var/lib/apt/lists/*
+USER ray
+
+# WORKAROUND:
ROCm/ROCm#5853 - Standard PyTorch ROCm wheels cause segfault
+# in libhsa-runtime64.so during VRAM allocation on gfx1151 (Strix Halo).
+# TheRock gfx110X-all packages provide Python 3.11 compatible wheels.
+RUN pip install --no-cache-dir \
+    --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ \
+    torch torchaudio torchvision
+
+# Install Ray Serve and AI inference dependencies
+RUN pip install --no-cache-dir \
+    vllm \
+    transformers \
+    accelerate \
+    sentence-transformers \
+    httpx \
+    numpy \
+    scipy
+
+# Pre-download common models for faster cold starts
+RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')" || true
+
+# Copy Ray Serve Python code
+COPY ray-serve/ /app/ray_serve/
+
+# Ray worker entrypoint
+COPY --chmod=755 dockerfiles/ray-entrypoint.sh /app/ray-entrypoint.sh
+
+ENTRYPOINT ["/app/ray-entrypoint.sh"]
diff --git a/dockerfiles/ray-entrypoint.sh b/dockerfiles/ray-entrypoint.sh
new file mode 100644
index 0000000..2f85ced
--- /dev/null
+++ b/dockerfiles/ray-entrypoint.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Ray Worker Entrypoint
+# Connects to Ray head node and registers custom resources
+#
+# Environment (all optional):
+#   RAY_HEAD_SVC   Ray head service name (default: ai-inference-raycluster-head-svc)
+#   RAY_HEAD_PORT  Ray head port (default: 6379)
+#   GPU_RESOURCE   Custom Ray resource advertised by this worker (default: gpu_amd)
+#   NUM_GPUS       Passed to `ray start --num-gpus` (default: 1)
+
+# -u catches misspelled variables, pipefail surfaces failures inside pipelines
+set -euo pipefail
+
+# Ensure Ray is in PATH (works across all base images)
+export PATH="/home/ray/.local/bin:/home/ray/anaconda3/bin:${PATH}"
+
+# Get Ray head address from environment or default. The fallback service name
+# is the one documented in the README and set by the NVIDIA/Intel images.
+RAY_HEAD_ADDRESS="${RAY_HEAD_SVC:-ai-inference-raycluster-head-svc}:${RAY_HEAD_PORT:-6379}"
+
+# Get custom resources from environment
+GPU_RESOURCE="${GPU_RESOURCE:-gpu_amd}"
+NUM_GPUS="${NUM_GPUS:-1}"
+
+echo "Starting Ray worker..."
+echo "  Head address: $RAY_HEAD_ADDRESS"
+echo "  GPU resource: $GPU_RESOURCE"
+echo "  Num GPUs: $NUM_GPUS"
+
+# Start Ray worker with custom resources; exec replaces this shell with ray
+exec ray start \
+    --address="$RAY_HEAD_ADDRESS" \
+    --num-gpus="$NUM_GPUS" \
+    --resources="{\"$GPU_RESOURCE\": 1}" \
+    --block
diff --git a/ray-serve/__init__.py b/ray-serve/__init__.py
new file mode 100644
index 0000000..3149c17
--- /dev/null
+++ b/ray-serve/__init__.py
@@ -0,0 +1 @@
+# Ray Serve deployments for GPU-shared AI inference
diff --git a/ray-serve/requirements.txt b/ray-serve/requirements.txt
new file mode 100644
index 0000000..c375f37
--- /dev/null
+++ b/ray-serve/requirements.txt
@@ -0,0 +1,24 @@
+# Ray Serve dependencies
+ray[serve]==2.53.0
+
+# LLM inference
+vllm
+
+# Embeddings and reranking
+sentence-transformers
+
+# Speech-to-text
+faster-whisper
+
+# Text-to-speech
+TTS
+
+# HTTP client
+httpx
+
+# Numerical computing
+numpy
+scipy
+
+# Optional: Intel GPU support (for danilo node)
+# intel-extension-for-pytorch
diff --git a/ray-serve/serve_embeddings.py b/ray-serve/serve_embeddings.py
new file mode 100644
index 0000000..3a13985
--- /dev/null
+++ b/ray-serve/serve_embeddings.py
@@ -0,0 +1,87 @@
+"""
+Ray Serve deployment for sentence-transformers BGE embeddings.
+Runs on: drizzt (Radeon 680M iGPU, ROCm) +""" + +import os +import time +import uuid +from typing import Any, Dict, List, Union + +from ray import serve + + +@serve.deployment(name="EmbeddingsDeployment", num_replicas=1) +class EmbeddingsDeployment: + def __init__(self): + from sentence_transformers import SentenceTransformer + import torch + + self.model_id = os.environ.get("MODEL_ID", "BAAI/bge-large-en-v1.5") + + # Detect device + if torch.cuda.is_available(): + self.device = "cuda" + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + self.device = "xpu" + else: + self.device = "cpu" + + print(f"Loading embeddings model: {self.model_id}") + print(f"Using device: {self.device}") + + self.model = SentenceTransformer(self.model_id, device=self.device) + self.embedding_dim = self.model.get_sentence_embedding_dimension() + + print(f"Model loaded. Embedding dimension: {self.embedding_dim}") + + async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Handle OpenAI-compatible embedding requests. 
+ + Expected request format: + { + "model": "model-name", + "input": "text to embed" or ["text1", "text2"], + "encoding_format": "float" + } + """ + input_data = request.get("input", "") + + # Handle both single string and list of strings + if isinstance(input_data, str): + texts = [input_data] + else: + texts = input_data + + # Generate embeddings + embeddings = self.model.encode( + texts, + normalize_embeddings=True, + show_progress_bar=False, + ) + + # Build response data + data = [] + total_tokens = 0 + for i, (text, embedding) in enumerate(zip(texts, embeddings)): + data.append({ + "object": "embedding", + "index": i, + "embedding": embedding.tolist(), + }) + total_tokens += len(text.split()) + + # Return OpenAI-compatible response + return { + "object": "list", + "data": data, + "model": self.model_id, + "usage": { + "prompt_tokens": total_tokens, + "total_tokens": total_tokens, + }, + } + + +app = EmbeddingsDeployment.bind() diff --git a/ray-serve/serve_llm.py b/ray-serve/serve_llm.py new file mode 100644 index 0000000..4e15965 --- /dev/null +++ b/ray-serve/serve_llm.py @@ -0,0 +1,108 @@ +""" +Ray Serve deployment for vLLM with OpenAI-compatible API. 
+Runs on: khelben (Strix Halo 64GB, ROCm) +""" + +import os +import time +import uuid +from typing import Any, Dict, List, Optional + +from ray import serve + + +@serve.deployment(name="LLMDeployment", num_replicas=1) +class LLMDeployment: + def __init__(self): + from vllm import LLM, SamplingParams + + self.model_id = os.environ.get("MODEL_ID", "meta-llama/Llama-3.1-70B-Instruct") + self.max_model_len = int(os.environ.get("MAX_MODEL_LEN", "8192")) + self.gpu_memory_utilization = float(os.environ.get("GPU_MEMORY_UTILIZATION", "0.9")) + + print(f"Loading vLLM model: {self.model_id}") + print(f"Max model length: {self.max_model_len}") + print(f"GPU memory utilization: {self.gpu_memory_utilization}") + + self.llm = LLM( + model=self.model_id, + max_model_len=self.max_model_len, + gpu_memory_utilization=self.gpu_memory_utilization, + trust_remote_code=True, + ) + self.SamplingParams = SamplingParams + print(f"Model {self.model_id} loaded successfully") + + async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Handle OpenAI-compatible chat completion requests. 
+ + Expected request format: + { + "model": "model-name", + "messages": [{"role": "user", "content": "Hello"}], + "temperature": 0.7, + "max_tokens": 256, + "top_p": 1.0, + "stream": false + } + """ + messages = request.get("messages", []) + temperature = request.get("temperature", 0.7) + max_tokens = request.get("max_tokens", 256) + top_p = request.get("top_p", 1.0) + stop = request.get("stop", None) + + # Convert messages to prompt + prompt = self._format_messages(messages) + + sampling_params = self.SamplingParams( + temperature=temperature, + max_tokens=max_tokens, + top_p=top_p, + stop=stop, + ) + + outputs = self.llm.generate([prompt], sampling_params) + generated_text = outputs[0].outputs[0].text + + # Return OpenAI-compatible response + return { + "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", + "object": "chat.completion", + "created": int(time.time()), + "model": self.model_id, + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": generated_text, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": len(prompt.split()), + "completion_tokens": len(generated_text.split()), + "total_tokens": len(prompt.split()) + len(generated_text.split()), + }, + } + + def _format_messages(self, messages: List[Dict[str, str]]) -> str: + """Format chat messages into a prompt string.""" + formatted = "" + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + if role == "system": + formatted += f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>" + elif role == "user": + formatted += f"<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>" + elif role == "assistant": + formatted += f"<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>" + formatted += "<|start_header_id|>assistant<|end_header_id|>\n\n" + return formatted + + +app = LLMDeployment.bind() diff --git a/ray-serve/serve_reranker.py b/ray-serve/serve_reranker.py new file mode 
100644 index 0000000..13ff876 --- /dev/null +++ b/ray-serve/serve_reranker.py @@ -0,0 +1,142 @@ +""" +Ray Serve deployment for sentence-transformers CrossEncoder reranking. +Runs on: drizzt (Radeon 680M iGPU, ROCm) or danilo (Intel i915 iGPU, OpenVINO/IPEX) +""" + +import os +import time +import uuid +from typing import Any, Dict, List, Tuple + +from ray import serve + + +@serve.deployment(name="RerankerDeployment", num_replicas=1) +class RerankerDeployment: + def __init__(self): + from sentence_transformers import CrossEncoder + import torch + + self.model_id = os.environ.get("MODEL_ID", "BAAI/bge-reranker-v2-m3") + self.use_ipex = False + self.device = "cpu" + + # Detect device - check for Intel GPU first via IPEX + try: + import intel_extension_for_pytorch as ipex + self.use_ipex = True + if hasattr(torch, "xpu") and torch.xpu.is_available(): + self.device = "xpu" + print("Intel GPU detected via IPEX, using XPU device") + else: + print("IPEX available, will use CPU optimization") + except ImportError: + print("IPEX not available, checking for other GPUs") + + # Check for CUDA/ROCm if not using Intel + if not self.use_ipex: + if torch.cuda.is_available(): + self.device = "cuda" + print(f"Using CUDA/ROCm device") + else: + print("No GPU detected, using CPU") + + print(f"Loading reranker model: {self.model_id}") + print(f"Using device: {self.device}") + + # Load model + self.model = CrossEncoder(self.model_id, device=self.device) + + # Apply IPEX optimization if available + if self.use_ipex and self.device == "cpu": + try: + import intel_extension_for_pytorch as ipex + self.model.model = ipex.optimize(self.model.model) + print("IPEX CPU optimization applied") + except Exception as e: + print(f"IPEX optimization failed: {e}") + + print(f"Reranker model loaded successfully") + + async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Handle reranking requests. 
+ + Expected request format: + { + "query": "search query", + "documents": ["doc1", "doc2", "doc3"], + "top_k": 3, + "return_documents": true + } + + Alternative format (pairs): + { + "pairs": [["query", "doc1"], ["query", "doc2"]] + } + """ + # Handle pairs format + if "pairs" in request: + pairs = request["pairs"] + scores = self.model.predict(pairs) + + results = [] + for i, (pair, score) in enumerate(zip(pairs, scores)): + results.append({ + "index": i, + "score": float(score), + }) + + return { + "object": "list", + "results": results, + "model": self.model_id, + } + + # Handle query + documents format + query = request.get("query", "") + documents = request.get("documents", []) + top_k = request.get("top_k", len(documents)) + return_documents = request.get("return_documents", True) + + if not documents: + return { + "object": "list", + "results": [], + "model": self.model_id, + } + + # Create query-document pairs + pairs = [[query, doc] for doc in documents] + + # Get scores + scores = self.model.predict(pairs) + + # Create results with indices and scores + results = [] + for i, (doc, score) in enumerate(zip(documents, scores)): + result = { + "index": i, + "score": float(score), + } + if return_documents: + result["document"] = doc + results.append(result) + + # Sort by score descending + results.sort(key=lambda x: x["score"], reverse=True) + + # Apply top_k + results = results[:top_k] + + return { + "object": "list", + "results": results, + "model": self.model_id, + "usage": { + "total_pairs": len(pairs), + }, + } + + +app = RerankerDeployment.bind() diff --git a/ray-serve/serve_tts.py b/ray-serve/serve_tts.py new file mode 100644 index 0000000..636895e --- /dev/null +++ b/ray-serve/serve_tts.py @@ -0,0 +1,122 @@ +""" +Ray Serve deployment for Coqui TTS. 
# Runs on: elminster (RTX 2070 8GB, CUDA)

import os
import io
import time
import uuid
import base64
from typing import Any, Dict, Optional

from ray import serve


@serve.deployment(name="TTSDeployment", num_replicas=1)
class TTSDeployment:
    """Ray Serve deployment that turns text into WAV audio with a Coqui ``TTS`` model.

    The model name is read from the ``MODEL_NAME`` environment variable and the
    model is moved to CUDA when ``torch.cuda.is_available()`` reports a GPU.
    """

    # Sample rate reported when the loaded model does not expose one.
    DEFAULT_SAMPLE_RATE = 22050

    def __init__(self):
        """Load the TTS model once per replica."""
        # Imported here rather than at module level, presumably so that only the
        # replica process needs these heavy packages installed.
        from TTS.api import TTS
        import torch

        self.model_name = os.environ.get("MODEL_NAME", "tts_models/en/ljspeech/tacotron2-DDC")

        # Detect device
        self.use_gpu = torch.cuda.is_available()

        print(f"Loading TTS model: {self.model_name}")
        print(f"Using GPU: {self.use_gpu}")

        self.tts = TTS(model_name=self.model_name, progress_bar=False)

        if self.use_gpu:
            self.tts = self.tts.to("cuda")

        print("TTS model loaded successfully")

    @staticmethod
    async def _read_payload(request: Any) -> Any:
        """Return the request payload as parsed data.

        A plain ``dict`` (a call made through a deployment handle) is returned
        unchanged. Any other object exposing ``json()`` (e.g. the Starlette
        request handed to an HTTP ingress deployment) is decoded through it.
        """
        import inspect

        if isinstance(request, dict):
            return request

        read_json = getattr(request, "json", None)
        if not callable(read_json):
            return request

        parsed = read_json()
        # Starlette's ``json()`` is a coroutine; tolerate synchronous variants too.
        if inspect.isawaitable(parsed):
            parsed = await parsed
        return parsed

    async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handle text-to-speech requests.

        Expected request format:
        {
            "text": "Text to synthesize",
            "speaker": "speaker_name",
            "language": "en",
            "speed": 1.0,
            "output_format": "wav",
            "return_base64": true
        }

        ``request`` may be that dict itself or an HTTP request object whose JSON
        body has the same shape. Audio is always encoded as 16-bit PCM WAV, and
        the ``format`` field of the response reports that actual encoding
        regardless of the requested ``output_format``.

        Returns:
            On success a dict with ``model``, ``sample_rate``, ``duration``
            (seconds), ``format`` and either ``audio`` (base64 text) or
            ``audio_bytes`` (raw bytes, when ``return_base64`` is false).
            On failure a dict with an ``error`` message.
        """
        import numpy as np
        from scipy.io import wavfile

        try:
            payload = await self._read_payload(request)
        except Exception as e:
            return {"error": f"Invalid request body: {e}", "model": self.model_name}

        if not isinstance(payload, dict):
            return {"error": "Request body must be a JSON object", "model": self.model_name}

        text = payload.get("text", "")
        speaker = payload.get("speaker", None)
        language = payload.get("language", None)
        speed = payload.get("speed", 1.0)
        return_base64 = payload.get("return_base64", True)

        # Reject missing, non-string and whitespace-only text before touching the model.
        if not isinstance(text, str) or not text.strip():
            return {"error": "No text provided"}

        # Generate speech
        try:
            # TTS.tts returns the audio samples (array or list of floats)
            samples = np.asarray(
                self.tts.tts(
                    text=text,
                    speaker=speaker,
                    language=language,
                    speed=speed,
                ),
                dtype=np.float32,
            )

            # Clip before scaling: samples outside [-1, 1] would otherwise wrap
            # around when cast to int16 and produce loud artifacts.
            pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

            # Get sample rate from the model, falling back when the synthesizer
            # attribute is missing or None.
            synthesizer = getattr(self.tts, "synthesizer", None)
            sample_rate = getattr(synthesizer, "output_sample_rate", None) or self.DEFAULT_SAMPLE_RATE

            # Write to buffer
            buffer = io.BytesIO()
            wavfile.write(buffer, sample_rate, pcm)
            audio_bytes = buffer.getvalue()

            response = {
                "model": self.model_name,
                "sample_rate": sample_rate,
                "duration": len(pcm) / sample_rate,
                # Only WAV is ever produced, so report the real encoding.
                "format": "wav",
            }

            if return_base64:
                response["audio"] = base64.b64encode(audio_bytes).decode("utf-8")
            else:
                response["audio_bytes"] = audio_bytes

            return response

        except Exception as e:
            # Deliberate best-effort: report synthesis failures to the caller
            # instead of raising inside the replica.
            return {
                "error": str(e),
                "model": self.model_name,
            }

    def list_speakers(self) -> Dict[str, Any]:
        """List available speakers for multi-speaker models."""
        speakers = getattr(self.tts, "speakers", None) or []

        return {
            "model": self.model_name,
            "speakers": speakers,
            "is_multi_speaker": len(speakers) > 0,
        }


# Application object for the TTS deployment.
app = TTSDeployment.bind()


# ---------------------------------------------------------------------------
# diff --git a/ray-serve/serve_whisper.py b/ray-serve/serve_whisper.py (new file)
# ---------------------------------------------------------------------------
"""
Ray Serve deployment for faster-whisper STT.
Runs on: elminster (RTX 2070 8GB, CUDA)
"""

import os
import io
import time
import uuid
import base64
from typing import Any, Dict, Optional

from ray import serve


@serve.deployment(name="WhisperDeployment", num_replicas=1)
class WhisperDeployment:
    """Ray Serve deployment that transcribes audio with a faster-whisper model.

    The model size is read from the ``MODEL_SIZE`` environment variable. CUDA
    with ``float16`` is used when available, otherwise CPU with ``int8``.
    """

    def __init__(self):
        """Load the Whisper model once per replica."""
        from faster_whisper import WhisperModel
        import torch

        self.model_size = os.environ.get("MODEL_SIZE", "large-v3")

        # Detect device and compute type
        if torch.cuda.is_available():
            self.device = "cuda"
            self.compute_type = "float16"
        else:
            self.device = "cpu"
            self.compute_type = "int8"

        print(f"Loading Whisper model: {self.model_size}")
        print(f"Using device: {self.device}, compute_type: {self.compute_type}")

        self.model = WhisperModel(
            self.model_size,
            device=self.device,
            compute_type=self.compute_type,
        )

        print("Whisper model loaded successfully")

    @staticmethod
    async def _read_payload(request: Any) -> Any:
        """Return the request payload as parsed data.

        A plain ``dict`` (a call made through a deployment handle) is returned
        unchanged. Any other object exposing ``json()`` (e.g. the Starlette
        request handed to an HTTP ingress deployment) is decoded through it.
        """
        import inspect

        if isinstance(request, dict):
            return request

        read_json = getattr(request, "json", None)
        if not callable(read_json):
            return request

        parsed = read_json()
        # Starlette's ``json()`` is a coroutine; tolerate synchronous variants too.
        if inspect.isawaitable(parsed):
            parsed = await parsed
        return parsed

    @staticmethod
    def _resolve_audio(payload: Dict[str, Any]) -> Optional[Any]:
        """Pick the audio source from the payload, or None when none is given.

        Precedence is ``audio`` (base64), then ``file`` (path), then
        ``audio_bytes`` (raw bytes).
        """
        if "audio" in payload:
            # Base64 encoded audio
            return io.BytesIO(base64.b64decode(payload["audio"]))
        if "file" in payload:
            # NOTE(review): the path is passed to the model exactly as received;
            # if requests can come from untrusted clients this lets them name
            # arbitrary server-side files - consider restricting or removing.
            return payload["file"]
        if "audio_bytes" in payload:
            # Raw bytes
            return io.BytesIO(payload["audio_bytes"])
        return None

    @staticmethod
    def _segment_to_dict(segment: Any, word_timestamps: bool) -> Dict[str, Any]:
        """Convert one transcription segment into a JSON-friendly dict."""
        seg_data = {
            "id": segment.id,
            "start": segment.start,
            "end": segment.end,
            "text": segment.text,
        }

        if word_timestamps and segment.words:
            seg_data["words"] = [
                {
                    "word": word.word,
                    "start": word.start,
                    "end": word.end,
                    "probability": word.probability,
                }
                for word in segment.words
            ]

        return seg_data

    async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handle transcription requests.

        Expected request format:
        {
            "audio": "base64_encoded_audio_data",
            "audio_format": "wav",
            "language": "en",
            "task": "transcribe",
            "response_format": "json",
            "word_timestamps": false
        }

        Alternative with file path:
        {
            "file": "/path/to/audio.wav",
            ...
        }

        ``request`` may be that dict itself or an HTTP request object whose JSON
        body has the same shape.

        Returns:
            A dict shaped by ``response_format``: ``text`` only for ``"text"``;
            task, language, duration, text and segments for ``"verbose_json"``;
            otherwise text, language, duration and model. On failure a dict with
            an ``error`` message.
        """
        try:
            payload = await self._read_payload(request)
        except Exception as e:
            return {"error": f"Invalid request body: {e}", "model": self.model_size}

        if not isinstance(payload, dict):
            return {"error": "Request body must be a JSON object", "model": self.model_size}

        language = payload.get("language", None)
        task = payload.get("task", "transcribe")  # transcribe or translate
        response_format = payload.get("response_format", "json")
        word_timestamps = payload.get("word_timestamps", False)

        try:
            # Get audio data
            audio_input = self._resolve_audio(payload)
            if audio_input is None:
                return {
                    "error": "No audio data provided. Use 'audio' (base64), 'file' (path), or 'audio_bytes'",
                }

            # Transcribe
            segments, info = self.model.transcribe(
                audio_input,
                language=language,
                task=task,
                word_timestamps=word_timestamps,
                vad_filter=True,
            )

            # Collect segments. This stays inside the try block so that errors
            # raised while iterating ``segments`` are reported like any other
            # failure (faster-whisper documents it as a lazy generator).
            segment_list = [
                self._segment_to_dict(segment, word_timestamps)
                for segment in segments
            ]
        except Exception as e:
            # Same error shape as the TTS deployment: report, do not raise.
            return {
                "error": str(e),
                "model": self.model_size,
            }

        full_text = "".join(seg["text"] for seg in segment_list).strip()

        # Build response based on format
        if response_format == "text":
            return {"text": full_text}

        if response_format == "verbose_json":
            return {
                "task": task,
                "language": info.language,
                "duration": info.duration,
                "text": full_text,
                "segments": segment_list,
            }

        # Default JSON format (OpenAI-compatible)
        return {
            "text": full_text,
            "language": info.language,
            "duration": info.duration,
            "model": self.model_size,
        }


# Application object for the Whisper deployment.
app = WhisperDeployment.bind()