feat: add pyproject.toml and CI for ray-serve-apps package
Some checks failed
Build and Push Images / build-nvidia (push) Failing after 7m25s
Build and Push Images / build-rdna2 (push) Failing after 7m29s
Build and Push Images / build-strixhalo (push) Failing after 6m45s
Build and Push Images / build-intel (push) Failing after 6m22s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s
Build and Publish ray-serve-apps / lint (push) Failing after 3m9s
Build and Publish ray-serve-apps / publish (push) Has been skipped
- Restructure ray-serve as a proper Python package (ray_serve/)
- Add pyproject.toml with the hatch build system
- Add CI workflow to publish to the Gitea PyPI registry
- Add py.typed for PEP 561 compliance
- Aligns with ADR-0019 handler deployment strategy
This commit is contained in:
146
ray-serve/ray_serve/serve_whisper.py
Normal file
146
ray-serve/ray_serve/serve_whisper.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""
|
||||
Ray Serve deployment for faster-whisper STT.
|
||||
Runs on: elminster (RTX 2070 8GB, CUDA)
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import time
|
||||
import uuid
|
||||
import base64
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from ray import serve
|
||||
|
||||
|
||||
@serve.deployment(name="WhisperDeployment", num_replicas=1)
class WhisperDeployment:
    """Ray Serve deployment wrapping a faster-whisper speech-to-text model.

    Loads the model once at replica startup and serves transcription
    requests as plain dicts (OpenAI-ish response shapes).
    """

    def __init__(self):
        # Heavy dependencies are imported lazily so the module itself can be
        # imported (e.g. by the Serve controller) without GPU libraries.
        from faster_whisper import WhisperModel
        import torch

        # Model size is configurable via the MODEL_SIZE env var.
        self.model_size = os.environ.get("MODEL_SIZE", "large-v3")

        # Prefer CUDA with float16; fall back to int8 on CPU, which trades
        # a little accuracy for much lower memory use and faster inference.
        if torch.cuda.is_available():
            self.device = "cuda"
            self.compute_type = "float16"
        else:
            self.device = "cpu"
            self.compute_type = "int8"

        print(f"Loading Whisper model: {self.model_size}")
        print(f"Using device: {self.device}, compute_type: {self.compute_type}")

        self.model = WhisperModel(
            self.model_size,
            device=self.device,
            compute_type=self.compute_type,
        )

        # Fix: was an f-string with no placeholders.
        print("Whisper model loaded successfully")

    async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """Handle a transcription request and return a response dict.

        Expected request format:
            {
                "audio": "base64_encoded_audio_data",
                "audio_format": "wav",
                "language": "en",
                "task": "transcribe",
                "response_format": "json",
                "word_timestamps": false
            }

        Alternative with file path:
            {
                "file": "/path/to/audio.wav",
                ...
            }

        Returns:
            A dict keyed by ``response_format`` ("text", "verbose_json",
            or the default OpenAI-compatible "json"), or an ``{"error": ...}``
            dict when no audio source was supplied.
        """
        # NOTE: removed unused `numpy` / `scipy.io.wavfile` imports — neither
        # was referenced anywhere in this method.
        language = request.get("language", None)
        task = request.get("task", "transcribe")  # transcribe or translate
        response_format = request.get("response_format", "json")
        word_timestamps = request.get("word_timestamps", False)

        # Resolve the audio source: base64 payload, file path, or raw bytes.
        audio_input = None

        if "audio" in request:
            # Base64 encoded audio
            audio_bytes = base64.b64decode(request["audio"])
            audio_input = io.BytesIO(audio_bytes)
        elif "file" in request:
            # File path
            audio_input = request["file"]
        elif "audio_bytes" in request:
            # Raw bytes
            audio_input = io.BytesIO(request["audio_bytes"])
        else:
            return {
                "error": "No audio data provided. Use 'audio' (base64), 'file' (path), or 'audio_bytes'",
            }

        # Transcribe. `segments` is a lazy generator; `info` has language/duration.
        segments, info = self.model.transcribe(
            audio_input,
            language=language,
            task=task,
            word_timestamps=word_timestamps,
            vad_filter=True,
        )

        # Collect segments; join text pieces at the end instead of repeated
        # string concatenation (quadratic in the original).
        segment_list = []
        text_parts = []

        for segment in segments:
            seg_data = {
                "id": segment.id,
                "start": segment.start,
                "end": segment.end,
                "text": segment.text,
            }

            if word_timestamps and segment.words:
                seg_data["words"] = [
                    {
                        "word": word.word,
                        "start": word.start,
                        "end": word.end,
                        "probability": word.probability,
                    }
                    for word in segment.words
                ]

            segment_list.append(seg_data)
            text_parts.append(segment.text)

        full_text = "".join(text_parts)

        # Build response based on format
        if response_format == "text":
            return {"text": full_text.strip()}

        if response_format == "verbose_json":
            return {
                "task": task,
                "language": info.language,
                "duration": info.duration,
                "text": full_text.strip(),
                "segments": segment_list,
            }

        # Default JSON format (OpenAI-compatible)
        return {
            "text": full_text.strip(),
            "language": info.language,
            "duration": info.duration,
            "model": self.model_size,
        }
||||
|
||||
app = WhisperDeployment.bind()
|
||||
Reference in New Issue
Block a user