diff --git a/ray_serve/serve_llm.py b/ray_serve/serve_llm.py
index 9b44b0c..bc27a20 100644
--- a/ray_serve/serve_llm.py
+++ b/ray_serve/serve_llm.py
@@ -120,7 +120,7 @@ class LLMDeployment:
         except ImportError:
             self._mlflow = None
 
-    async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
+    async def __call__(self, request) -> dict[str, Any]:
         """
         Handle OpenAI-compatible chat completion requests.
 
@@ -134,14 +134,23 @@ class LLMDeployment:
             "stream": false
         }
         """
-        messages = request.get("messages", [])
-        temperature = request.get("temperature", 0.7)
+        # Ray Serve passes a Starlette Request for HTTP calls —
+        # parse the JSON body so we actually read the user's payload.
+        from starlette.requests import Request
+
+        if isinstance(request, Request):
+            body = await request.json()
+        else:
+            body = request
+
+        messages = body.get("messages", [])
+        temperature = body.get("temperature", 0.7)
         max_tokens = min(
-            request.get("max_tokens", self.default_max_tokens),
+            body.get("max_tokens", self.default_max_tokens),
             self.max_model_len,
         )
-        top_p = request.get("top_p", 1.0)
-        stop = request.get("stop")
+        top_p = body.get("top_p", 1.0)
+        stop = body.get("stop")
 
         # Convert messages to prompt
         prompt = self._format_messages(messages)
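
A quick way to exercise both request paths this patch handles. This is a minimal sketch, not part of the repo: the route, port, and app name below are assumptions, and any HTTP client would do.

    # HTTP path: Ray Serve hands __call__ a starlette.requests.Request,
    # so the body must be read with `await request.json()`.
    import requests  # assumed available; swap in your preferred client

    payload = {
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.2,
        "max_tokens": 64,
        "stream": False,
    }
    resp = requests.post("http://127.0.0.1:8000/", json=payload, timeout=60)
    print(resp.json())

    # In-process path: calling the deployment through a Serve handle passes the
    # dict straight through, which is why the isinstance() branch keeps
    # `body = request`. (Commented out; the app name depends on how serve.run()
    # binds the deployment.)
    # handle = serve.get_app_handle("default")
    # print(await handle.remote(payload))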