From af6798473739c164eb321e94f4a5ef6edad862ae Mon Sep 17 00:00:00 2001
From: "Billy D."
Date: Thu, 12 Feb 2026 05:27:13 -0500
Subject: [PATCH] fixing up themes, adding in an endpoint for llm.

---
 kustomization.yaml |   1 +
 llm.py             | 279 +++++++++++++++++++++++++++++++++++++++++++++
 llm.yaml           |  96 ++++++++++++++++
 theme.py           | 176 +++++++++++++++++++++++++++++
 4 files changed, 552 insertions(+)
 create mode 100644 llm.py
 create mode 100644 llm.yaml

diff --git a/kustomization.yaml b/kustomization.yaml
index 318ff53..2bc42eb 100644
--- a/kustomization.yaml
+++ b/kustomization.yaml
@@ -5,5 +5,6 @@ namespace: ai-ml
 
 resources:
   - embeddings.yaml
+  - llm.yaml
   - tts.yaml
   - stt.yaml
diff --git a/llm.py b/llm.py
new file mode 100644
index 0000000..455e960
--- /dev/null
+++ b/llm.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+"""
+LLM Chat Demo - Gradio UI for testing vLLM inference service.
+
+Features:
+- Multi-turn chat with streaming responses
+- Configurable temperature, max tokens, top-p
+- System prompt customisation
+- Token usage and latency metrics
+- Chat history management
+"""
+import os
+import time
+import logging
+import json
+
+import gradio as gr
+import httpx
+
+from theme import get_lab_theme, CUSTOM_CSS, create_footer
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("llm-demo")
+
+# Configuration
+LLM_URL = os.environ.get(
+    "LLM_URL",
+    # Default: Ray Serve LLM endpoint
+    "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm",
+)
+
+DEFAULT_SYSTEM_PROMPT = (
+    "You are a helpful AI assistant running on Davies Tech Labs homelab infrastructure. "
+    "You are powered by Llama 3.1 70B served via vLLM on AMD Strix Halo (ROCm). "
+    "Be concise and helpful."
+)
+
+# Use async client for streaming
+async_client = httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=30.0))
+sync_client = httpx.Client(timeout=10.0)
+
+
+async def chat_stream(
+    message: str,
+    history: list[dict[str, str]],
+    system_prompt: str,
+    temperature: float,
+    max_tokens: int,
+    top_p: float,
+):
+    """Send the conversation to the LLM endpoint and yield the reply in chunks.
+
+    The endpoint is called once with a regular (non-streaming) POST; the
+    finished reply is then re-yielded as a growing string, a few words at a
+    time, so the Gradio chat window appears to stream.
+    """
+    if not message.strip():
+        yield ""
+        return
+
+    # Build message list from history
+    messages = []
+    if system_prompt.strip():
+        messages.append({"role": "system", "content": system_prompt})
+
+    for entry in history:
+        messages.append({"role": entry["role"], "content": entry["content"]})
+
+    messages.append({"role": "user", "content": message})
+
+    payload = {
+        "messages": messages,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+        "top_p": top_p,
+    }
+
+    start_time = time.perf_counter()
+
+    try:
+        response = await async_client.post(LLM_URL, json=payload)
+        response.raise_for_status()
+
+        result = response.json()
+        text = result["choices"][0]["message"]["content"]
+        latency = time.perf_counter() - start_time
+        usage = result.get("usage", {})
+
+        logger.info(
+            "LLM response: %d tokens in %.1fs (prompt=%d, completion=%d)",
+            usage.get("total_tokens", 0),
+            latency,
+            usage.get("prompt_tokens", 0),
+            usage.get("completion_tokens", 0),
+        )
+
+        # Yield text progressively for a nicer streaming feel
+        chunk_size = 4
+        words = text.split(" ")
+        partial = ""
+        for i, word in enumerate(words):
+            partial += ("" if i == 0 else " ") + word
+            if i % chunk_size == 0 or i == len(words) - 1:
+                yield partial
+
+    except httpx.HTTPStatusError as e:
+        logger.exception("LLM request failed")
+        yield f"❌ LLM service error: {e.response.status_code} — {e.response.text[:200]}"
+    except httpx.ConnectError:
+        yield "❌ Cannot connect to LLM service. Is the Ray Serve cluster running?"
+    except Exception as e:
+        logger.exception("LLM chat failed")
+        yield f"❌ Error: {e}"
+
+
+def check_service_health() -> str:
+    """Check if the LLM service is reachable."""
+    try:
+        response = sync_client.post(
+            LLM_URL,
+            json={
+                "messages": [{"role": "user", "content": "ping"}],
+                "max_tokens": 1,
+                "temperature": 0.0,
+            },
+        )
+        if response.status_code == 200:
+            return "🟢 LLM service is healthy"
+        return f"🟡 LLM responded with status {response.status_code}"
+    except httpx.ConnectError:
+        return "🔴 Cannot connect to LLM service"
+    except Exception as e:
+        return f"🔴 Service unavailable: {e}"
+
+
+def single_prompt(
+    prompt: str,
+    system_prompt: str,
+    temperature: float,
+    max_tokens: int,
+    top_p: float,
+) -> tuple[str, str]:
+    """Send a single prompt (non-chat mode) and return output + metrics."""
+    if not prompt.strip():
+        return "❌ Please enter a prompt", ""
+
+    messages = []
+    if system_prompt.strip():
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": prompt})
+
+    payload = {
+        "messages": messages,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+        "top_p": top_p,
+    }
+
+    start_time = time.perf_counter()
+
+    try:
+        # Scope the client so its connection pool is closed after the call
+        # instead of leaking one pool per button click.
+        with httpx.Client(timeout=300.0) as client:
+            response = client.post(LLM_URL, json=payload)
+        response.raise_for_status()
+        result = response.json()
+        latency = time.perf_counter() - start_time
+
+        text = result["choices"][0]["message"]["content"]
+        usage = result.get("usage", {})
+
+        metrics = f"""
+**Generation Metrics:**
+- Latency: {latency:.1f}s
+- Prompt tokens: {usage.get('prompt_tokens', 'N/A')}
+- Completion tokens: {usage.get('completion_tokens', 'N/A')}
+- Total tokens: {usage.get('total_tokens', 'N/A')}
+- Model: {result.get('model', 'N/A')}
+"""
+        return text, metrics
+
+    except httpx.HTTPStatusError as e:
+        return f"❌ Error {e.response.status_code}: {e.response.text[:300]}", ""
+    except httpx.ConnectError:
+        return "❌ Cannot connect to LLM service", ""
+    except Exception as e:
+        return f"❌ {e}", ""
+
+
+# ─── Build the Gradio app ────────────────────────────────────────────────
+
+with gr.Blocks(theme=get_lab_theme(), css=CUSTOM_CSS, title="LLM Chat Demo") as demo:
+    gr.Markdown(
+        """
+# 🧠 LLM Chat Demo
+
+Chat with **Llama 3.1 70B** (AWQ INT4) served via vLLM on AMD Strix Halo (ROCm).
+"""
+    )
+
+    # Service status
+    with gr.Row():
+        health_btn = gr.Button("🔄 Check Service", size="sm")
+        health_status = gr.Textbox(label="Service Status", interactive=False)
+
+    health_btn.click(fn=check_service_health, outputs=health_status)
+
+    # Shared parameters
+    with gr.Accordion("⚙️ Parameters", open=False):
+        system_prompt = gr.Textbox(
+            label="System Prompt",
+            value=DEFAULT_SYSTEM_PROMPT,
+            lines=3,
+            max_lines=6,
+        )
+        with gr.Row():
+            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
+            max_tokens = gr.Slider(16, 4096, value=512, step=16, label="Max Tokens")
+            top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-p")
+
+    with gr.Tabs():
+        # Tab 1: Multi-turn Chat
+        with gr.TabItem("💬 Chat"):
+            chatbot = gr.ChatInterface(
+                fn=chat_stream,
+                type="messages",
+                additional_inputs=[system_prompt, temperature, max_tokens, top_p],
+                examples=[
+                    "Hello! What can you tell me about yourself?",
+                    "Explain how a GPU executes a matrix multiplication.",
+                    "Write a Python function to compute the Fibonacci sequence.",
+                    "What are the pros and cons of running LLMs on AMD GPUs?",
+                ],
+                chatbot=gr.Chatbot(
+                    height=520,
+                    type="messages",
+                    show_copy_button=True,
+                    placeholder="Type a message to start chatting...",
+                ),
+            )
+
+        # Tab 2: Single Prompt
+        with gr.TabItem("📝 Single Prompt"):
+            gr.Markdown("Send a one-shot prompt without conversation history.")
+
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                placeholder="Enter your prompt...",
+                lines=4,
+                max_lines=10,
+            )
+            generate_btn = gr.Button("🚀 Generate", variant="primary")
+
+            output_text = gr.Textbox(label="Response", lines=12, interactive=False)
+            output_metrics = gr.Markdown(label="Metrics")
+
+            generate_btn.click(
+                fn=single_prompt,
+                inputs=[prompt_input, system_prompt, temperature, max_tokens, top_p],
+                outputs=[output_text, output_metrics],
+            )
+
+            gr.Examples(
+                examples=[
+                    ["Summarise the key differences between CUDA and ROCm for ML workloads."],
+                    ["Write a haiku about Kubernetes."],
+                    ["Explain Ray Serve in one paragraph for someone new to ML serving."],
+                    ["List 5 creative uses for a homelab GPU cluster."],
+                ],
+                inputs=[prompt_input],
+            )
+
+    create_footer()
+
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
diff --git a/llm.yaml b/llm.yaml
new file mode 100644
index 0000000..451232f
--- /dev/null
+++ b/llm.yaml
@@ -0,0 +1,96 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llm-ui
+  namespace: ai-ml
+  labels:
+    app: llm
+    component: demo-ui
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: llm
+  template:
+    metadata:
+      labels:
+        app: llm
+        component: demo-ui
+    spec:
+      containers:
+        - name: gradio
+          image: ghcr.io/billy-davies-2/llm-apps:v2-202601271655
+          imagePullPolicy: Always
+          command: ["python", "llm.py"]
+          ports:
+            - containerPort: 7860
+              name: http
+              protocol: TCP
+          env:
+            - name: LLM_URL
+              # Ray Serve endpoint - routes to /llm prefix
+              value: "http://ai-inference-serve-svc.ai-ml.svc.cluster.local:8000/llm"
+            - name: MLFLOW_TRACKING_URI
+              value: "http://mlflow.mlflow.svc.cluster.local:80"
+          resources:
+            requests:
+              cpu: "100m"
+              memory: "256Mi"
+            limits:
+              cpu: "500m"
+              memory: "512Mi"
+          livenessProbe:
+            httpGet:
+              path: /
+              port: 7860
+            initialDelaySeconds: 10
+            periodSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /
+              port: 7860
+            initialDelaySeconds: 5
+            periodSeconds: 10
+      imagePullSecrets:
+        - name: ghcr-registry
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: llm-ui
+  namespace: ai-ml
+  labels:
+    app: llm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 7860
+      protocol: TCP
+      name: http
+  selector:
+    app: llm
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-ui
+  namespace: ai-ml
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: llm-ui.lab.daviestechlabs.io
+spec:
+  parentRefs:
+    - name: envoy-internal
+      namespace: network
+      sectionName: https-lab
+  hostnames:
+    - llm-ui.lab.daviestechlabs.io
+  rules:
+    - matches:
+        - path:
+            type: PathPrefix
+            value: /
+      backendRefs:
+        - name: llm-ui
+          port: 80
diff --git a/theme.py b/theme.py
index c53befb..ddbdec6 100644
--- a/theme.py
+++ b/theme.py
@@ -75,10 +75,38 @@ def get_lab_theme() -> gr.Theme:
         block_background_fill_dark=CYBER_GRAY,
         block_border_color="#2a2a2a",
         block_border_color_dark="#2a2a2a",
+        block_label_background_fill="#1a1a00",
+        block_label_background_fill_dark="#1a1a00",
         block_label_text_color=CYBER_YELLOW,
         block_label_text_color_dark=CYBER_YELLOW,
+        block_label_border_color=CYBER_YELLOW,
+        block_label_border_color_dark=CYBER_YELLOW,
         block_title_text_color=CYBER_TEXT,
         block_title_text_color_dark=CYBER_TEXT,
+        # Table / Dataframe
+        table_border_color="#2a2a2a",
+        table_even_background_fill="#111111",
+        table_even_background_fill_dark="#111111",
+        table_odd_background_fill=CYBER_GRAY,
+        table_odd_background_fill_dark=CYBER_GRAY,
+        table_row_focus="#1f1a00",
+        table_row_focus_dark="#1f1a00",
+        # Panel / accordion
+        panel_background_fill=CYBER_DARK,
+        panel_background_fill_dark=CYBER_DARK,
+        panel_border_color="#2a2a2a",
+        panel_border_color_dark="#2a2a2a",
+        # Checkbox / radio
+        checkbox_background_color=CYBER_DARKER,
+        checkbox_background_color_dark=CYBER_DARKER,
+        checkbox_label_background_fill=CYBER_GRAY,
+        checkbox_label_background_fill_dark=CYBER_GRAY,
+        checkbox_label_text_color=CYBER_TEXT,
+        checkbox_label_text_color_dark=CYBER_TEXT,
+        # Colors
+        color_accent=CYBER_YELLOW,
+        color_accent_soft="#1f1a00",
+        color_accent_soft_dark="#1f1a00",
     )
 
 
@@ -338,6 +366,154 @@ label, .gr-label {
 .glow-text {
     text-shadow: 0 0 10px var(--cyber-yellow), 0 0 20px var(--cyber-yellow);
 }
+
+/* ── Examples table / Dataframe overrides ── */
+/* Gradio renders Examples as a <table> inside a Dataset component.
+   The default styles inject white / light-gray rows that blow out the
+   cyberpunk palette. Force them dark here. */
+.gr-samples-table,
+.gr-sample-textbox,
+table.table,
+.gr-examples table,
+div[class*="dataset"] table {
+    background: var(--cyber-dark) !important;
+    color: var(--cyber-text) !important;
+}
+
+.gr-samples-table tr,
+.gr-examples table tr,
+div[class*="dataset"] table tr {
+    background: #111111 !important;
+    border-bottom: 1px solid #222 !important;
+}
+
+.gr-samples-table tr:nth-child(even),
+.gr-examples table tr:nth-child(even),
+div[class*="dataset"] table tr:nth-child(even) {
+    background: #0d0d0d !important;
+}
+
+.gr-samples-table tr:hover,
+.gr-examples table tr:hover,
+div[class*="dataset"] table tr:hover {
+    background: #1f1a00 !important;
+    cursor: pointer;
+}
+
+.gr-samples-table th,
+.gr-examples table th,
+div[class*="dataset"] table th {
+    background: var(--cyber-gray) !important;
+    color: var(--cyber-yellow) !important;
+    text-transform: uppercase !important;
+    font-size: 0.75rem !important;
+    letter-spacing: 0.1em !important;
+    border-bottom: 2px solid var(--cyber-yellow) !important;
+    padding: 10px 16px !important;
+}
+
+.gr-samples-table td,
+.gr-examples table td,
+div[class*="dataset"] table td {
+    color: #999 !important;
+    border-bottom: 1px solid #1a1a1a !important;
+    padding: 10px 16px !important;
+    font-family: 'JetBrains Mono', monospace !important;
+    font-size: 0.85rem !important;
+}
+
+/* ── Block label pill (e.g. "GENERATED AUDIO", "STATUS") ── */
+/* These are the small floating labels above each component block */
+span[class*="label-wrap"],
+.gr-block-label,
+.label-wrap {
+    background: #1a1a00 !important;
+    border: 1px solid var(--cyber-yellow) !important;
+    color: var(--cyber-yellow) !important;
+}
+
+/* ── Dropdown / select menus ── */
+.gr-dropdown,
+select,
+ul[role="listbox"],
+div[class*="dropdown"],
+.secondary-wrap {
+    background: #0a0a0a !important;
+    color: var(--cyber-text) !important;
+    border-color: #333 !important;
+}
+
+ul[role="listbox"] li,
+div[class*="dropdown"] li {
+    background: #0a0a0a !important;
+    color: var(--cyber-text) !important;
+}
+
+ul[role="listbox"] li:hover,
+ul[role="listbox"] li[aria-selected="true"],
+div[class*="dropdown"] li:hover {
+    background: #1f1a00 !important;
+    color: var(--cyber-yellow) !important;
+}
+
+/* ── Audio player ── */
+.gr-audio,
+audio {
+    background: var(--cyber-dark) !important;
+    border: 1px solid #2a2a2a !important;
+}
+
+/* Audio waveform container */
+div[data-testid="waveform-container"],
+div[class*="audio"] {
+    background: #0a0a0a !important;
+}
+
+/* ── Markdown inside blocks ── */
+.gr-markdown,
+.gr-markdown p,
+.prose {
+    color: var(--cyber-text) !important;
+}
+
+.gr-markdown h3,
+.gr-markdown h2 {
+    color: var(--cyber-yellow) !important;
+    letter-spacing: 0.05em !important;
+}
+
+.gr-markdown strong {
+    color: var(--cyber-gold) !important;
+}
+
+/* ── Examples accordion header ("Examples" label) ── */
+.gr-examples .label-wrap,
+div[id*="examples"] .label-wrap,
+span[data-testid="block-label"] {
+    background: #1a1a00 !important;
+    color: var(--cyber-yellow) !important;
+    border: 1px solid var(--cyber-yellow) !important;
+    font-size: 0.7rem !important;
+    text-transform: uppercase !important;
+    letter-spacing: 0.1em !important;
+}
+
+/* ── Misc: tooltip, info text ── */
+.gr-info,
+.gr-description {
+    color: #666 !important;
+}
+
+/* ── Svelte internal: make sure no white backgrounds leak ── */
+.contain > div,
+.wrap > div {
+    background: inherit !important;
+}
+
+/* ── Tab content panels ── */
+.tabitem {
+    background: var(--cyber-dark) !important;
+}
 """
 
 