diff --git a/llm.py b/llm.py
index d49b300..3d0fe3b 100644
--- a/llm.py
+++ b/llm.py
@@ -115,7 +115,7 @@ DEFAULT_SYSTEM_PROMPT = (
 
 # Use async client for streaming
 async_client = httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=30.0))
-sync_client = httpx.Client(timeout=10.0)
+sync_client = httpx.Client(timeout=httpx.Timeout(60.0, connect=10.0))
 
 
 async def chat_stream(
@@ -199,6 +199,13 @@ async def chat_stream(
 def check_service_health() -> str:
     """Check if the LLM service is reachable."""
     try:
+        # Try a lightweight GET against the Ray Serve base first.
+        # This avoids burning GPU time on a full inference round-trip.
+        # URL-join instead of rsplit so this works whether or not LLM_URL has a path.
+        probe_url = str(httpx.URL(LLM_URL).join("/-/routes"))
+        if sync_client.get(probe_url).status_code == 200:
+            return "🟢 LLM service is healthy"
+        # Fall back to a minimal inference probe
         response = sync_client.post(
             LLM_URL,
             json={
@@ -212,6 +219,8 @@ def check_service_health() -> str:
 
         return f"🟡 LLM responded with status {response.status_code}"
     except httpx.ConnectError:
         return "🔴 Cannot connect to LLM service"
+    except httpx.TimeoutException:
+        return "🟡 LLM service is reachable but slow to respond"
     except Exception as e:
         return f"🔴 Service unavailable: {e}"