diff --git a/llm.py b/llm.py
index d49b300..3d0fe3b 100644
--- a/llm.py
+++ b/llm.py
@@ -115,7 +115,7 @@ DEFAULT_SYSTEM_PROMPT = (
 
 # Use async client for streaming
 async_client = httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=30.0))
-sync_client = httpx.Client(timeout=10.0)
+sync_client = httpx.Client(timeout=httpx.Timeout(60.0, connect=10.0))
 
 
 async def chat_stream(
@@ -199,6 +199,13 @@ async def chat_stream(
 def check_service_health() -> str:
     """Check if the LLM service is reachable."""
     try:
+        # Try a lightweight GET against the Ray Serve base first.
+        # This avoids burning GPU time on a full inference round-trip.
+        # URL-join instead of rsplit so this works whether or not LLM_URL has a path.
+        probe_url = str(httpx.URL(LLM_URL).join("/-/routes"))
+        if sync_client.get(probe_url).status_code == 200:
+            return "🟢 LLM service is healthy"
+        # Fall back to a minimal inference probe
         response = sync_client.post(
             LLM_URL,
             json={
@@ -212,6 +219,8 @@ def check_service_health() -> str:
 
         return f"🟡 LLM responded with status {response.status_code}"
     except httpx.ConnectError:
         return "🔴 Cannot connect to LLM service"
+    except httpx.TimeoutException:
+        return "🟡 LLM service is reachable but slow to respond"
     except Exception as e:
         return f"🔴 Service unavailable: {e}"