Fix LLM readiness check
 llm.py | 11 ++++++++++-

--- a/llm.py
+++ b/llm.py
@@ -115,7 +115,7 @@ DEFAULT_SYSTEM_PROMPT = (
 
 # Use async client for streaming
 async_client = httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=30.0))
-sync_client = httpx.Client(timeout=10.0)
+sync_client = httpx.Client(timeout=httpx.Timeout(60.0, connect=10.0))
 
 
 async def chat_stream(
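
Note on the timeout change: when httpx.Client is given a bare number, that single budget applies to connect, read, write, and pool operations alike, so the old 10-second limit also capped the whole health-check round trip. The new form keeps a fast connect failure while allowing a slower response. A minimal sketch of the difference, assuming nothing beyond an installed httpx:

import httpx

# A bare number caps connect, read, write, and pool with one 10s budget:
old_style = httpx.Client(timeout=10.0)

# A 60s default for read/write/pool, but still fail within 10s if the
# server cannot be reached at all:
new_style = httpx.Client(timeout=httpx.Timeout(60.0, connect=10.0))
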
@@ -199,6 +199,13 @@ async def chat_stream(
 def check_service_health() -> str:
     """Check if the LLM service is reachable."""
     try:
+        # Try a lightweight GET against the Ray Serve base first.
+        # This avoids burning GPU time on a full inference round-trip.
+        base_url = LLM_URL.rsplit("/", 1)[0]  # strip /llm path
+        response = sync_client.get(f"{base_url}/-/routes")
+        if response.status_code == 200:
+            return "🟢 LLM service is healthy"
+        # Fall back to a minimal inference probe
         response = sync_client.post(
             LLM_URL,
             json={
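
The /-/routes path added above is the route-table endpoint the Ray Serve HTTP proxy serves alongside the application routes, so a 200 from it confirms the proxy is up without invoking any deployment. A standalone sketch of the probe, assuming a placeholder LLM_URL of http://localhost:8000/llm in place of the module's real value:

import httpx

LLM_URL = "http://localhost:8000/llm"  # placeholder; the module defines the real value

def ray_serve_is_up(url: str = LLM_URL) -> bool:
    """True if the Ray Serve proxy answers its route-table endpoint."""
    base_url = url.rsplit("/", 1)[0]  # strip the /llm route prefix
    try:
        response = httpx.get(f"{base_url}/-/routes", timeout=5.0)
    except httpx.HTTPError:
        return False
    return response.status_code == 200
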
@@ -212,6 +219,8 @@ def check_service_health() -> str:
         return f"🟡 LLM responded with status {response.status_code}"
     except httpx.ConnectError:
         return "🔴 Cannot connect to LLM service"
+    except httpx.TimeoutException:
+        return "🟡 LLM service is reachable but slow to respond"
     except Exception as e:
         return f"🔴 Service unavailable: {e}"
 
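
What the new TimeoutException branch buys: in httpx, TimeoutException is the common base of ConnectTimeout, ReadTimeout, WriteTimeout, and PoolTimeout, so any of those now reads as "slow but reachable" rather than falling through to the generic 🔴 branch. One subtlety: a connect timeout raises ConnectTimeout (a TimeoutException), not ConnectError, so a blackholed host reports yellow while a refused connection reports red. A hedged sketch with a hypothetical classify_failure helper mirroring the except ordering:

import httpx

def classify_failure(exc: Exception) -> str:
    """Mirror the except ordering used in check_service_health."""
    if isinstance(exc, httpx.ConnectError):
        return "🔴 Cannot connect to LLM service"
    if isinstance(exc, httpx.TimeoutException):
        return "🟡 LLM service is reachable but slow to respond"
    return f"🔴 Service unavailable: {exc}"

# A refused connection (nothing listening on the port) raises ConnectError:
try:
    httpx.get("http://127.0.0.1:9", timeout=httpx.Timeout(2.0, connect=1.0))
except Exception as exc:
    print(classify_failure(exc))  # 🔴 Cannot connect to LLM service
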