fix: remove unused imports and apply ruff formatting
Some checks failed
CI / Docker Build & Push (push) Failing after 2m37s
CI / Deploy to Kubernetes (push) Has been skipped
CI / Notify (push) Successful in 1s
CI / Lint (push) Successful in 10s
CI / Release (push) Successful in 4s

- Remove unused imports: json (llm.py), tempfile (stt.py), base64 (tts.py)
- Apply ruff format to all Python files
This commit is contained in:
2026-02-18 18:36:16 -05:00
parent 0cc03aa145
commit faa5dc0d9d
5 changed files with 283 additions and 246 deletions

35
llm.py
View File

@@ -9,10 +9,10 @@ Features:
- Token usage and latency metrics
- Chat history management
"""
import os
import time
import logging
import json
import gradio as gr
import httpx
@@ -65,7 +65,9 @@ try:
_mlflow_run_id = _mlflow_run.info.run_id
_mlflow_step = 0
MLFLOW_ENABLED = True
logger.info("MLflow tracking enabled: experiment=%s run=%s", _experiment_id, _mlflow_run_id)
logger.info(
"MLflow tracking enabled: experiment=%s run=%s", _experiment_id, _mlflow_run_id
)
except Exception as exc:
logger.warning("MLflow tracking disabled: %s", exc)
_mlflow_client = None
@@ -95,18 +97,25 @@ def _log_llm_metrics(
_mlflow_run_id,
metrics=[
mlflow.entities.Metric("latency_s", latency, ts, _mlflow_step),
mlflow.entities.Metric("prompt_tokens", prompt_tokens, ts, _mlflow_step),
mlflow.entities.Metric("completion_tokens", completion_tokens, ts, _mlflow_step),
mlflow.entities.Metric(
"prompt_tokens", prompt_tokens, ts, _mlflow_step
),
mlflow.entities.Metric(
"completion_tokens", completion_tokens, ts, _mlflow_step
),
mlflow.entities.Metric("total_tokens", total_tokens, ts, _mlflow_step),
mlflow.entities.Metric("tokens_per_second", tps, ts, _mlflow_step),
mlflow.entities.Metric("temperature", temperature, ts, _mlflow_step),
mlflow.entities.Metric("max_tokens_requested", max_tokens, ts, _mlflow_step),
mlflow.entities.Metric(
"max_tokens_requested", max_tokens, ts, _mlflow_step
),
mlflow.entities.Metric("top_p", top_p, ts, _mlflow_step),
],
)
except Exception:
logger.debug("MLflow log failed", exc_info=True)
DEFAULT_SYSTEM_PROMPT = (
"You are a helpful AI assistant running on Davies Tech Labs homelab infrastructure. "
"You are powered by Llama 3.1 70B served via vLLM on AMD Strix Halo (ROCm). "
@@ -273,10 +282,10 @@ def single_prompt(
metrics = f"""
**Generation Metrics:**
- Latency: {latency:.1f}s
- Prompt tokens: {usage.get('prompt_tokens', 'N/A')}
- Completion tokens: {usage.get('completion_tokens', 'N/A')}
- Total tokens: {usage.get('total_tokens', 'N/A')}
- Model: {result.get('model', 'N/A')}
- Prompt tokens: {usage.get("prompt_tokens", "N/A")}
- Completion tokens: {usage.get("completion_tokens", "N/A")}
- Total tokens: {usage.get("total_tokens", "N/A")}
- Model: {result.get("model", "N/A")}
"""
return text, metrics
@@ -360,9 +369,13 @@ Chat with **Llama 3.1 70B** (AWQ INT4) served via vLLM on AMD Strix Halo (ROCm).
gr.Examples(
examples=[
["Summarise the key differences between CUDA and ROCm for ML workloads."],
[
"Summarise the key differences between CUDA and ROCm for ML workloads."
],
["Write a haiku about Kubernetes."],
["Explain Ray Serve in one paragraph for someone new to ML serving."],
[
"Explain Ray Serve in one paragraph for someone new to ML serving."
],
["List 5 creative uses for a homelab GPU cluster."],
],
inputs=[prompt_input],