feat: Add GPU-specific Ray worker images with CI/CD
Some checks failed
Build and Push Images / build-nvidia (push) Failing after 1s
Build and Push Images / build-rdna2 (push) Failing after 1s
Build and Push Images / build-strixhalo (push) Failing after 1s
Build and Push Images / build-intel (push) Failing after 1s

- Add Dockerfiles for nvidia, rdna2, strixhalo, and intel GPU targets
- Add ray-serve modules (embeddings, whisper, tts, llm, reranker)
- Add Gitea Actions workflow for automated builds
- Add Makefile for local development
- Update README with comprehensive documentation
This commit is contained in:
2026-02-01 15:04:31 -05:00
parent e68d5c1f0e
commit a16ffff73f
16 changed files with 1311 additions and 2 deletions

146
ray-serve/serve_whisper.py Normal file
View File

@@ -0,0 +1,146 @@
"""
Ray Serve deployment for faster-whisper STT.
Runs on: elminster (RTX 2070 8GB, CUDA)
"""
import os
import io
import time
import uuid
import base64
from typing import Any, Dict, Optional
from ray import serve
@serve.deployment(name="WhisperDeployment", num_replicas=1)
class WhisperDeployment:
    """Ray Serve deployment for faster-whisper speech-to-text.

    The model size is taken from the ``MODEL_SIZE`` env var (default
    ``"large-v3"``). Requests are plain dicts; audio may arrive
    base64-encoded, as a file path, or as raw bytes. Responses follow an
    OpenAI-compatible shape.
    """

    def __init__(self):
        # Heavy deps are imported lazily so the module can be imported
        # (e.g. for inspection) without torch/faster-whisper installed.
        from faster_whisper import WhisperModel
        import torch

        self.model_size = os.environ.get("MODEL_SIZE", "large-v3")
        # float16 on GPU for throughput; int8 on CPU to limit memory.
        if torch.cuda.is_available():
            self.device = "cuda"
            self.compute_type = "float16"
        else:
            self.device = "cpu"
            self.compute_type = "int8"
        print(f"Loading Whisper model: {self.model_size}")
        print(f"Using device: {self.device}, compute_type: {self.compute_type}")
        self.model = WhisperModel(
            self.model_size,
            device=self.device,
            compute_type=self.compute_type,
        )
        print("Whisper model loaded successfully")

    async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """Handle a transcription request.

        Expected request format:
        {
            "audio": "base64_encoded_audio_data",   # OR "file" / "audio_bytes"
            "audio_format": "wav",
            "language": "en",
            "task": "transcribe",                   # or "translate"
            "response_format": "json",              # "json" | "text" | "verbose_json"
            "word_timestamps": false
        }

        Alternative with file path:
        {
            "file": "/path/to/audio.wav",
            ...
        }

        Returns:
            A dict in the requested format, or ``{"error": ...}`` when no
            audio was supplied.
        """
        language = request.get("language", None)
        task = request.get("task", "transcribe")  # transcribe or translate
        response_format = request.get("response_format", "json")
        word_timestamps = request.get("word_timestamps", False)

        # Resolve the audio input from one of the three accepted keys.
        audio_input = None
        if "audio" in request:
            # Base64 encoded audio
            audio_bytes = base64.b64decode(request["audio"])
            audio_input = io.BytesIO(audio_bytes)
        elif "file" in request:
            # File path (faster-whisper accepts paths directly)
            audio_input = request["file"]
        elif "audio_bytes" in request:
            # Raw bytes
            audio_input = io.BytesIO(request["audio_bytes"])
        else:
            return {
                "error": "No audio data provided. Use 'audio' (base64), 'file' (path), or 'audio_bytes'",
            }

        # Transcribe; VAD filtering skips long silences before decoding.
        segments, info = self.model.transcribe(
            audio_input,
            language=language,
            task=task,
            word_timestamps=word_timestamps,
            vad_filter=True,
        )

        # Collect segments, accumulating text pieces for a single join
        # (avoids quadratic string concatenation on long audio).
        segment_list = []
        text_parts = []
        for segment in segments:
            seg_data = {
                "id": segment.id,
                "start": segment.start,
                "end": segment.end,
                "text": segment.text,
            }
            if word_timestamps and segment.words:
                seg_data["words"] = [
                    {
                        "word": word.word,
                        "start": word.start,
                        "end": word.end,
                        "probability": word.probability,
                    }
                    for word in segment.words
                ]
            segment_list.append(seg_data)
            text_parts.append(segment.text)
        full_text = "".join(text_parts)

        # Build response based on the requested format.
        if response_format == "text":
            return {"text": full_text.strip()}
        if response_format == "verbose_json":
            return {
                "task": task,
                "language": info.language,
                "duration": info.duration,
                "text": full_text.strip(),
                "segments": segment_list,
            }
        # Default JSON format (OpenAI-compatible)
        return {
            "text": full_text.strip(),
            "language": info.language,
            "duration": info.duration,
            "model": self.model_size,
        }
app = WhisperDeployment.bind()