From a973768aeea92e5395182c115e0bc3669b4c37d1 Mon Sep 17 00:00:00 2001 From: "Billy D." Date: Wed, 18 Feb 2026 07:30:00 -0500 Subject: [PATCH] ray_serve: parse Starlette Request body in LLMDeployment.__call__ --- ray_serve/serve_llm.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/ray_serve/serve_llm.py b/ray_serve/serve_llm.py index 9b44b0c..bc27a20 100644 --- a/ray_serve/serve_llm.py +++ b/ray_serve/serve_llm.py @@ -120,7 +120,7 @@ class LLMDeployment: except ImportError: self._mlflow = None - async def __call__(self, request: dict[str, Any]) -> dict[str, Any]: + async def __call__(self, request) -> dict[str, Any]: """ Handle OpenAI-compatible chat completion requests. @@ -134,14 +134,23 @@ class LLMDeployment: "stream": false } """ - messages = request.get("messages", []) - temperature = request.get("temperature", 0.7) + # Ray Serve passes a Starlette Request for HTTP calls — + # parse the JSON body so we actually read the user's payload. + from starlette.requests import Request + + if isinstance(request, Request): + body = await request.json() + else: + body = request + + messages = body.get("messages", []) + temperature = body.get("temperature", 0.7) max_tokens = min( - request.get("max_tokens", self.default_max_tokens), + body.get("max_tokens", self.default_max_tokens), self.max_model_len, ) - top_p = request.get("top_p", 1.0) - stop = request.get("stop") + top_p = body.get("top_p", 1.0) + stop = body.get("stop") # Convert messages to prompt prompt = self._format_messages(messages)