""" Ray Serve deployment for sentence-transformers BGE embeddings. Runs on: drizzt (Radeon 680M iGPU, ROCm) """ import os from typing import Any from ray import serve @serve.deployment(name="EmbeddingsDeployment", num_replicas=1) class EmbeddingsDeployment: def __init__(self): import torch from sentence_transformers import SentenceTransformer self.model_id = os.environ.get("MODEL_ID", "BAAI/bge-large-en-v1.5") # Detect device if torch.cuda.is_available(): self.device = "cuda" elif hasattr(torch, "xpu") and torch.xpu.is_available(): self.device = "xpu" else: self.device = "cpu" print(f"Loading embeddings model: {self.model_id}") print(f"Using device: {self.device}") self.model = SentenceTransformer(self.model_id, device=self.device) self.embedding_dim = self.model.get_sentence_embedding_dimension() print(f"Model loaded. Embedding dimension: {self.embedding_dim}") async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: """ Handle OpenAI-compatible embedding requests. Expected request format: { "model": "model-name", "input": "text to embed" or ["text1", "text2"], "encoding_format": "float" } """ input_data = request.get("input", "") # Handle both single string and list of strings texts = [input_data] if isinstance(input_data, str) else input_data # Generate embeddings embeddings = self.model.encode( texts, normalize_embeddings=True, show_progress_bar=False, ) # Build response data data = [] total_tokens = 0 for i, (text, embedding) in enumerate(zip(texts, embeddings, strict=False)): data.append( { "object": "embedding", "index": i, "embedding": embedding.tolist(), } ) total_tokens += len(text.split()) # Return OpenAI-compatible response return { "object": "list", "data": data, "model": self.model_id, "usage": { "prompt_tokens": total_tokens, "total_tokens": total_tokens, }, } app = EmbeddingsDeployment.bind()