feat: Add GPU-specific Ray worker images with CI/CD
Some checks failed
Build and Push Images / build-nvidia (push) Failing after 1s
Build and Push Images / build-rdna2 (push) Failing after 1s
Build and Push Images / build-strixhalo (push) Failing after 1s
Build and Push Images / build-intel (push) Failing after 1s

- Add Dockerfiles for nvidia, rdna2, strixhalo, and intel GPU targets
- Add ray-serve modules (embeddings, whisper, tts, llm, reranker)
- Add Gitea Actions workflow for automated builds
- Add Makefile for local development
- Update README with comprehensive documentation
This commit is contained in:
2026-02-01 15:04:31 -05:00
parent e68d5c1f0e
commit a16ffff73f
16 changed files with 1311 additions and 2 deletions

146
ray-serve/serve_whisper.py Normal file
View File

@@ -0,0 +1,146 @@
"""
Ray Serve deployment for faster-whisper STT.
Runs on: elminster (RTX 2070 8GB, CUDA)
"""
import os
import io
import time
import uuid
import base64
from typing import Any, Dict, Optional
from ray import serve
@serve.deployment(name="WhisperDeployment", num_replicas=1)
class WhisperDeployment:
    """Ray Serve deployment for faster-whisper speech-to-text.

    The model size is taken from the ``MODEL_SIZE`` env var (default
    ``"large-v3"``). Requests are plain dicts; audio may arrive
    base64-encoded, as a file path, or as raw bytes. Responses follow an
    OpenAI-compatible shape.
    """

    def __init__(self):
        # Heavy deps are imported lazily so the module can be imported
        # (e.g. for inspection) without torch/faster-whisper installed.
        from faster_whisper import WhisperModel
        import torch

        self.model_size = os.environ.get("MODEL_SIZE", "large-v3")
        # float16 on GPU for throughput; int8 on CPU to limit memory.
        if torch.cuda.is_available():
            self.device = "cuda"
            self.compute_type = "float16"
        else:
            self.device = "cpu"
            self.compute_type = "int8"
        print(f"Loading Whisper model: {self.model_size}")
        print(f"Using device: {self.device}, compute_type: {self.compute_type}")
        self.model = WhisperModel(
            self.model_size,
            device=self.device,
            compute_type=self.compute_type,
        )
        print("Whisper model loaded successfully")

    async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """Handle a transcription request.

        Expected request format:
        {
            "audio": "base64_encoded_audio_data",   # OR "file" / "audio_bytes"
            "audio_format": "wav",
            "language": "en",
            "task": "transcribe",                   # or "translate"
            "response_format": "json",              # "json" | "text" | "verbose_json"
            "word_timestamps": false
        }

        Alternative with file path:
        {
            "file": "/path/to/audio.wav",
            ...
        }

        Returns:
            A dict in the requested format, or ``{"error": ...}`` when no
            audio was supplied.
        """
        language = request.get("language", None)
        task = request.get("task", "transcribe")  # transcribe or translate
        response_format = request.get("response_format", "json")
        word_timestamps = request.get("word_timestamps", False)

        # Resolve the audio input from one of the three accepted keys.
        audio_input = None
        if "audio" in request:
            # Base64 encoded audio
            audio_bytes = base64.b64decode(request["audio"])
            audio_input = io.BytesIO(audio_bytes)
        elif "file" in request:
            # File path (faster-whisper accepts paths directly)
            audio_input = request["file"]
        elif "audio_bytes" in request:
            # Raw bytes
            audio_input = io.BytesIO(request["audio_bytes"])
        else:
            return {
                "error": "No audio data provided. Use 'audio' (base64), 'file' (path), or 'audio_bytes'",
            }

        # Transcribe; VAD filtering skips long silences before decoding.
        segments, info = self.model.transcribe(
            audio_input,
            language=language,
            task=task,
            word_timestamps=word_timestamps,
            vad_filter=True,
        )

        # Collect segments, accumulating text pieces for a single join
        # (avoids quadratic string concatenation on long audio).
        segment_list = []
        text_parts = []
        for segment in segments:
            seg_data = {
                "id": segment.id,
                "start": segment.start,
                "end": segment.end,
                "text": segment.text,
            }
            if word_timestamps and segment.words:
                seg_data["words"] = [
                    {
                        "word": word.word,
                        "start": word.start,
                        "end": word.end,
                        "probability": word.probability,
                    }
                    for word in segment.words
                ]
            segment_list.append(seg_data)
            text_parts.append(segment.text)
        full_text = "".join(text_parts)

        # Build response based on the requested format.
        if response_format == "text":
            return {"text": full_text.strip()}
        if response_format == "verbose_json":
            return {
                "task": task,
                "language": info.language,
                "duration": info.duration,
                "text": full_text.strip(),
                "segments": segment_list,
            }
        # Default JSON format (OpenAI-compatible)
        return {
            "text": full_text.strip(),
            "language": info.language,
            "duration": info.duration,
            "model": self.model_size,
        }
app = WhisperDeployment.bind()