fix serve-llm: parse the Starlette Request body in LLMDeployment.__call__ instead of treating it as a dict.
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 12s
@@ -120,7 +120,7 @@ class LLMDeployment:
         except ImportError:
             self._mlflow = None
 
-    async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
+    async def __call__(self, request) -> dict[str, Any]:
         """
         Handle OpenAI-compatible chat completion requests.
 
@@ -134,14 +134,23 @@ class LLMDeployment:
             "stream": false
         }
         """
-        messages = request.get("messages", [])
-        temperature = request.get("temperature", 0.7)
+        # Ray Serve passes a Starlette Request for HTTP calls —
+        # parse the JSON body so we actually read the user's payload.
+        from starlette.requests import Request
+
+        if isinstance(request, Request):
+            body = await request.json()
+        else:
+            body = request
+
+        messages = body.get("messages", [])
+        temperature = body.get("temperature", 0.7)
         max_tokens = min(
-            request.get("max_tokens", self.default_max_tokens),
+            body.get("max_tokens", self.default_max_tokens),
             self.max_model_len,
         )
-        top_p = request.get("top_p", 1.0)
-        stop = request.get("stop")
+        top_p = body.get("top_p", 1.0)
+        stop = body.get("stop")
 
         # Convert messages to prompt
         prompt = self._format_messages(messages)
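The change, restated outside the diff: __call__ now accepts either a Starlette Request (HTTP ingress) or a plain dict (direct deployment-handle calls) and normalizes both into a `body` dict before reading the OpenAI-style fields. Below is a minimal, self-contained sketch of that pattern; the class name, constructor defaults, and the echo-style return value are placeholders for illustration, not taken from this repo, and the actual generation step is elided.

    from typing import Any

    from ray import serve
    from starlette.requests import Request


    @serve.deployment
    class EchoLLMDeployment:
        def __init__(self, default_max_tokens: int = 256, max_model_len: int = 4096) -> None:
            self.default_max_tokens = default_max_tokens
            self.max_model_len = max_model_len

        async def __call__(self, request) -> dict[str, Any]:
            # HTTP ingress hands the deployment a Starlette Request; direct
            # handle calls may still pass a dict, so normalize both into `body`.
            if isinstance(request, Request):
                body = await request.json()
            else:
                body = request

            messages = body.get("messages", [])
            max_tokens = min(body.get("max_tokens", self.default_max_tokens), self.max_model_len)
            prompt = "\n".join(f"{m.get('role')}: {m.get('content')}" for m in messages)

            # A real deployment would run the model here; this sketch just
            # echoes the parsed inputs so request handling can be verified.
            return {"prompt": prompt, "max_tokens": max_tokens}


    app = EchoLLMDeployment.bind()
    # serve.run(app)  # serves on Ray Serve's default HTTP port (8000)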
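Once deployed, a quick way to confirm the fix is to POST a chat payload over HTTP: per the diff's own comment, the old code read fields off the Request object itself rather than its JSON body, so user-supplied values never reached the handler. The URL below assumes Ray Serve's defaults (port 8000, root route prefix), which this repo's config may override.

    import requests

    resp = requests.post(
        "http://localhost:8000/",
        json={
            "messages": [{"role": "user", "content": "Say hello."}],
            "temperature": 0.2,
            "max_tokens": 64,
            "stream": False,
        },
        timeout=60,
    )
    resp.raise_for_status()
    print(resp.json())  # should reflect the posted messages, not the defaults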