feat: add py.typed, Ray handles for clients, and pre-commit config
- Add py.typed marker for PEP 561 type hint support
- Add ray_utils module for Ray handle detection and caching
- Update all clients (Embeddings, LLM, TTS, STT, Reranker) to use Ray
  handles when running inside a Ray cluster for faster internal calls
- Add .pre-commit-config.yaml with ruff and standard hooks
- Add pre-commit and ray[serve] to optional dependencies
- Bump ruff version to 0.4.0
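The diff below imports get_ray_handle from the new handler_base.ray_utils module, which is not itself part of this hunk. As a rough sketch of what that helper might look like, assuming Ray Serve 2.x (ray.is_initialized() and serve.get_deployment_handle() are real Ray APIs; the import guard, error handling, and logging are illustrative, not the actual module):

# Hedged sketch of handler_base/ray_utils.py -- not shown in this commit.
import logging
from typing import Any, Optional

logger = logging.getLogger(__name__)


def get_ray_handle(deployment_name: str, app_name: str) -> Optional[Any]:
    """Return a Ray Serve deployment handle, or None outside a Ray cluster."""
    try:
        import ray
        from ray import serve
    except ImportError:
        # ray[serve] is an optional dependency; callers fall back to HTTP.
        return None
    if not ray.is_initialized():
        return None
    try:
        return serve.get_deployment_handle(deployment_name, app_name=app_name)
    except Exception as exc:  # Serve not running or deployment not found
        logger.debug("No Ray handle for %s/%s: %s", app_name, deployment_name, exc)
        return None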
@@ -1,13 +1,16 @@
 """
 LLM service client (vLLM/OpenAI-compatible).
+
+Supports both HTTP (external) and Ray handles (internal) for optimal performance.
 """
 
 import logging
-from typing import AsyncIterator, Optional
+from typing import Any, AsyncIterator, Optional
 
 import httpx
 
 from handler_base.config import LLMSettings
+from handler_base.ray_utils import get_ray_handle
 from handler_base.telemetry import create_span
 
 logger = logging.getLogger(__name__)
@@ -17,6 +20,9 @@ class LLMClient:
     """
     Client for the LLM service (vLLM with OpenAI-compatible API).
 
+    When running inside Ray, automatically uses Ray handles for faster
+    internal communication. Falls back to HTTP for external calls.
+
     Usage:
         client = LLMClient()
         response = await client.generate("Hello, how are you?")
@@ -32,12 +38,25 @@ class LLMClient:
             print(chunk, end="")
     """
 
+    # Ray Serve deployment configuration
+    RAY_DEPLOYMENT_NAME = "VLLMDeployment"
+    RAY_APP_NAME = "llm"
+
     def __init__(self, settings: Optional[LLMSettings] = None):
         self.settings = settings or LLMSettings()
         self._client = httpx.AsyncClient(
             base_url=self.settings.llm_url,
             timeout=self.settings.http_timeout,
         )
+        self._ray_handle: Optional[Any] = None
+        self._ray_checked = False
+
+    def _get_ray_handle(self) -> Optional[Any]:
+        """Get Ray handle, checking only once."""
+        if not self._ray_checked:
+            self._ray_handle = get_ray_handle(self.RAY_DEPLOYMENT_NAME, self.RAY_APP_NAME)
+            self._ray_checked = True
+        return self._ray_handle
 
     async def close(self) -> None:
         """Close the HTTP client."""
@@ -87,6 +106,24 @@
         if stop:
             payload["stop"] = stop
 
+        # Try Ray handle first (faster internal path)
+        handle = self._get_ray_handle()
+        if handle:
+            try:
+                if span:
+                    span.set_attribute("llm.transport", "ray")
+                result = await handle.remote(payload)
+                content = result["choices"][0]["message"]["content"]
+                if span:
+                    span.set_attribute("llm.response_length", len(content))
+                return content
+            except Exception as e:
+                logger.warning(f"Ray handle failed, falling back to HTTP: {e}")
+
+        # HTTP fallback
+        if span:
+            span.set_attribute("llm.transport", "http")
+
         response = await self._client.post("/v1/chat/completions", json=payload)
         response.raise_for_status()
 
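For reference, a usage sketch of the resulting fallback behavior (the import path handler_base.llm and the script scaffolding are assumptions; generate() and close() are taken from the diff above):

import asyncio

from handler_base.llm import LLMClient  # import path assumed, not confirmed by this diff


async def main() -> None:
    client = LLMClient()
    try:
        # Inside a Ray Serve cluster this call goes over the cached Ray handle;
        # elsewhere _get_ray_handle() returns None and the client falls back
        # to HTTP POST /v1/chat/completions.
        print(await client.generate("Hello, how are you?"))
    finally:
        await client.close()


if __name__ == "__main__":
    asyncio.run(main())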