feat: add pyproject.toml and CI for ray-serve-apps package
Some checks failed
Build and Push Images / build-nvidia (push) Failing after 7m25s
Build and Push Images / build-rdna2 (push) Failing after 7m29s
Build and Push Images / build-strixhalo (push) Failing after 6m45s
Build and Push Images / build-intel (push) Failing after 6m22s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s
Build and Publish ray-serve-apps / lint (push) Failing after 3m9s
Build and Publish ray-serve-apps / publish (push) Has been skipped

- Restructure ray-serve as proper Python package (ray_serve/)
- Add pyproject.toml with hatch build system
- Add CI workflow to publish to Gitea PyPI
- Add py.typed for PEP 561 compliance
- Aligns with ADR-0019 handler deployment strategy
This commit is contained in:
2026-02-02 09:22:03 -05:00
parent 876188a150
commit 7efdcb059e
10 changed files with 172 additions and 1 deletions

View File

@@ -0,0 +1,87 @@
"""
Ray Serve deployment for sentence-transformers BGE embeddings.
Runs on: drizzt (Radeon 680M iGPU, ROCm)
"""
import os
import time
import uuid
from typing import Any, Dict, List, Union
from ray import serve
@serve.deployment(name="EmbeddingsDeployment", num_replicas=1)
class EmbeddingsDeployment:
    """Ray Serve deployment exposing an OpenAI-style embeddings endpoint.

    The model is a sentence-transformers model selected through the
    ``MODEL_ID`` environment variable (default ``BAAI/bge-large-en-v1.5``).
    """

    def __init__(self):
        """Load the embedding model onto the first available accelerator."""
        # Imported here rather than at module level; presumably so that only
        # the replica process needs these heavy packages -- confirm.
        from sentence_transformers import SentenceTransformer
        import torch

        self.model_id = os.environ.get("MODEL_ID", "BAAI/bge-large-en-v1.5")

        # Device preference order: CUDA-style GPU, then Intel XPU, then CPU.
        if torch.cuda.is_available():
            self.device = "cuda"
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            self.device = "xpu"
        else:
            self.device = "cpu"

        print(f"Loading embeddings model: {self.model_id}")
        print(f"Using device: {self.device}")
        self.model = SentenceTransformer(self.model_id, device=self.device)
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        print(f"Model loaded. Embedding dimension: {self.embedding_dim}")

    async def __call__(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """Handle an OpenAI-compatible embedding request.

        Expected request format::

            {
                "model": "model-name",
                "input": "text to embed" or ["text1", "text2"],
                "encoding_format": "float"
            }

        Args:
            request: Either the already-decoded JSON payload (a ``dict``, as
                passed through a deployment handle) or an HTTP request object
                exposing a ``json()`` method (as passed by the Serve HTTP
                proxy), whose body is decoded first.

        Returns:
            A dict with ``object``, ``data`` (one entry per input text, in
            input order), ``model`` and ``usage`` keys.

        Raises:
            ValueError: If the payload is not a JSON object, or ``input`` is
                neither a string nor a list of strings.
        """
        payload: Any = request
        if not isinstance(request, dict):
            # An HTTP request object also offers a mapping-style ``get``, but
            # it does not look into the body; calling ``get("input")`` on it
            # would silently yield the default. Decode the JSON body instead.
            load_json = getattr(request, "json", None)
            if callable(load_json):
                payload = load_json()
                if hasattr(payload, "__await__"):
                    payload = await payload
        if not isinstance(payload, dict):
            raise ValueError("Request body must be a JSON object")

        input_data = payload.get("input", "")

        # Accept a single string or a sequence of strings; reject everything
        # else up front instead of failing later inside the model or at
        # ``text.split()``.
        if isinstance(input_data, str):
            texts: List[str] = [input_data]
        elif isinstance(input_data, (list, tuple)) and all(
            isinstance(item, str) for item in input_data
        ):
            texts = list(input_data)
        else:
            raise ValueError('"input" must be a string or a list of strings')

        embeddings = self.model.encode(
            texts,
            normalize_embeddings=True,
            show_progress_bar=False,
        )

        data = [
            {
                "object": "embedding",
                "index": position,
                "embedding": vector.tolist(),
            }
            for position, vector in enumerate(embeddings)
        ]

        # NOTE: whitespace-separated word count, not a tokenizer count; it is
        # only an approximation of the real prompt token usage.
        total_tokens = sum(len(text.split()) for text in texts)

        return {
            "object": "list",
            "data": data,
            "model": self.model_id,
            "usage": {
                "prompt_tokens": total_tokens,
                "total_tokens": total_tokens,
            },
        }
# Module-level application object: the deployment bound with no constructor
# arguments. Presumably this is the import target for `serve run` / a Serve
# config file -- confirm against the deployment manifests.
app = EmbeddingsDeployment.bind()