feat: add py.typed, Ray handles for clients, and pre-commit config
- Add py.typed marker for PEP 561 type hint support
- Add ray_utils module for Ray handle detection and caching
- Update all clients (Embeddings, LLM, TTS, STT, Reranker) to use Ray
  handles when running inside a Ray cluster for faster internal calls
- Add .pre-commit-config.yaml with ruff and standard hooks
- Add pre-commit and ray[serve] to optional dependencies
- Bump ruff version to 0.4.0
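The diff below imports get_ray_handle from the new handler_base.ray_utils module, which is not itself part of this hunk. As a rough sketch of what that helper might look like, assuming Ray Serve 2.x (ray.is_initialized() and serve.get_deployment_handle() are real Ray APIs; the import guard, error handling, and logging are illustrative, not the actual module):

# Hedged sketch of handler_base/ray_utils.py -- not shown in this commit.
import logging
from typing import Any, Optional

logger = logging.getLogger(__name__)


def get_ray_handle(deployment_name: str, app_name: str) -> Optional[Any]:
    """Return a Ray Serve deployment handle, or None outside a Ray cluster."""
    try:
        import ray
        from ray import serve
    except ImportError:
        # ray[serve] is an optional dependency; callers fall back to HTTP.
        return None
    if not ray.is_initialized():
        return None
    try:
        return serve.get_deployment_handle(deployment_name, app_name=app_name)
    except Exception as exc:  # Serve not running or deployment not found
        logger.debug("No Ray handle for %s/%s: %s", app_name, deployment_name, exc)
        return None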
@@ -1,13 +1,16 @@
 """
 LLM service client (vLLM/OpenAI-compatible).
+
+Supports both HTTP (external) and Ray handles (internal) for optimal performance.
 """
 
 import logging
-from typing import AsyncIterator, Optional
+from typing import Any, AsyncIterator, Optional
 
 import httpx
 
 from handler_base.config import LLMSettings
+from handler_base.ray_utils import get_ray_handle
 from handler_base.telemetry import create_span
 
 logger = logging.getLogger(__name__)
@@ -17,6 +20,9 @@ class LLMClient:
     """
     Client for the LLM service (vLLM with OpenAI-compatible API).
 
+    When running inside Ray, automatically uses Ray handles for faster
+    internal communication. Falls back to HTTP for external calls.
+
     Usage:
         client = LLMClient()
         response = await client.generate("Hello, how are you?")
@@ -32,12 +38,25 @@ class LLMClient:
             print(chunk, end="")
     """
 
+    # Ray Serve deployment configuration
+    RAY_DEPLOYMENT_NAME = "VLLMDeployment"
+    RAY_APP_NAME = "llm"
+
     def __init__(self, settings: Optional[LLMSettings] = None):
         self.settings = settings or LLMSettings()
         self._client = httpx.AsyncClient(
             base_url=self.settings.llm_url,
             timeout=self.settings.http_timeout,
         )
+        self._ray_handle: Optional[Any] = None
+        self._ray_checked = False
+
+    def _get_ray_handle(self) -> Optional[Any]:
+        """Get Ray handle, checking only once."""
+        if not self._ray_checked:
+            self._ray_handle = get_ray_handle(self.RAY_DEPLOYMENT_NAME, self.RAY_APP_NAME)
+            self._ray_checked = True
+        return self._ray_handle
 
     async def close(self) -> None:
         """Close the HTTP client."""
@@ -87,6 +106,24 @@
         if stop:
             payload["stop"] = stop
 
+        # Try Ray handle first (faster internal path)
+        handle = self._get_ray_handle()
+        if handle:
+            try:
+                if span:
+                    span.set_attribute("llm.transport", "ray")
+                result = await handle.remote(payload)
+                content = result["choices"][0]["message"]["content"]
+                if span:
+                    span.set_attribute("llm.response_length", len(content))
+                return content
+            except Exception as e:
+                logger.warning(f"Ray handle failed, falling back to HTTP: {e}")
+
+        # HTTP fallback
+        if span:
+            span.set_attribute("llm.transport", "http")
+
         response = await self._client.post("/v1/chat/completions", json=payload)
         response.raise_for_status()
 
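For reference, a usage sketch of the resulting fallback behavior (the import path handler_base.llm and the script scaffolding are assumptions; generate() and close() are taken from the diff above):

import asyncio

from handler_base.llm import LLMClient  # import path assumed, not confirmed by this diff


async def main() -> None:
    client = LLMClient()
    try:
        # Inside a Ray Serve cluster this call goes over the cached Ray handle;
        # elsewhere _get_ray_handle() returns None and the client falls back
        # to HTTP POST /v1/chat/completions.
        print(await client.generate("Hello, how are you?"))
    finally:
        await client.close()


if __name__ == "__main__":
    asyncio.run(main())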