Use vLLM's AsyncLLMEngine instead of the blocking LLM class for non-blocking generation.
All checks were successful
Build and Publish ray-serve-apps / build-and-publish (push) Successful in 1m3s

This commit is contained in:
2026-02-11 06:05:50 -05:00
parent c9d7a2b5b7
commit 2edafc33c0

View File

@@ -14,7 +14,9 @@ from ray import serve
 @serve.deployment(name="LLMDeployment", num_replicas=1)
 class LLMDeployment:
     def __init__(self):
-        from vllm import LLM, SamplingParams
+        from vllm import SamplingParams
+        from vllm.engine.arg_utils import AsyncEngineArgs
+        from vllm.engine.async_llm_engine import AsyncLLMEngine
         self.model_id = os.environ.get("MODEL_ID", "meta-llama/Llama-3.1-70B-Instruct")
         self.max_model_len = int(os.environ.get("MAX_MODEL_LEN", "8192"))
@@ -24,14 +26,16 @@ class LLMDeployment:
         print(f"Max model length: {self.max_model_len}")
         print(f"GPU memory utilization: {self.gpu_memory_utilization}")
-        self.llm = LLM(
+        engine_args = AsyncEngineArgs(
             model=self.model_id,
             max_model_len=self.max_model_len,
             gpu_memory_utilization=self.gpu_memory_utilization,
             trust_remote_code=True,
+            disable_log_stats=True,
         )
+        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
         self.SamplingParams = SamplingParams
-        print(f"Model {self.model_id} loaded successfully")
+        print(f"Model {self.model_id} async engine created")
     async def __call__(self, request: dict[str, Any]) -> dict[str, Any]:
         """
@@ -63,8 +67,11 @@ class LLMDeployment:
             stop=stop,
         )
-        outputs = self.llm.generate([prompt], sampling_params)
-        generated_text = outputs[0].outputs[0].text
+        request_id = uuid.uuid4().hex
+        final_result = None
+        async for result in self.engine.generate(prompt, sampling_params, request_id):
+            final_result = result
+        generated_text = final_result.outputs[0].text
         # Return OpenAI-compatible response
         return {