fix: remove unused imports and apply ruff formatting

- Remove unused imports: json (llm.py), tempfile (stt.py), base64 (tts.py)
- Apply ruff format to all Python files
2026-02-18 18:36:16 -05:00
parent 0cc03aa145
commit faa5dc0d9d
5 changed files with 283 additions and 246 deletions
--- a/llm.py
+++ b/llm.py
@@ -9,10 +9,10 @@ Features:
 - Token usage and latency metrics
 - Chat history management
 """
+
 import os
 import time
 import logging
-import json

 import gradio as gr
 import httpx
@@ -65,7 +65,9 @@ try:
    _mlflow_run_id = _mlflow_run.info.run_id
    _mlflow_step = 0
    MLFLOW_ENABLED = True
-    logger.info("MLflow tracking enabled: experiment=%s run=%s", _experiment_id, _mlflow_run_id)
+    logger.info(
+        "MLflow tracking enabled: experiment=%s run=%s", _experiment_id, _mlflow_run_id
+    )
 except Exception as exc:
    logger.warning("MLflow tracking disabled: %s", exc)
    _mlflow_client = None
@@ -95,18 +97,25 @@ def _log_llm_metrics(
            _mlflow_run_id,
            metrics=[
                mlflow.entities.Metric("latency_s", latency, ts, _mlflow_step),
-                mlflow.entities.Metric("prompt_tokens", prompt_tokens, ts, _mlflow_step),
-                mlflow.entities.Metric("completion_tokens", completion_tokens, ts, _mlflow_step),
+                mlflow.entities.Metric(
+                    "prompt_tokens", prompt_tokens, ts, _mlflow_step
+                ),
+                mlflow.entities.Metric(
+                    "completion_tokens", completion_tokens, ts, _mlflow_step
+                ),
                mlflow.entities.Metric("total_tokens", total_tokens, ts, _mlflow_step),
                mlflow.entities.Metric("tokens_per_second", tps, ts, _mlflow_step),
                mlflow.entities.Metric("temperature", temperature, ts, _mlflow_step),
-                mlflow.entities.Metric("max_tokens_requested", max_tokens, ts, _mlflow_step),
+                mlflow.entities.Metric(
+                    "max_tokens_requested", max_tokens, ts, _mlflow_step
+                ),
                mlflow.entities.Metric("top_p", top_p, ts, _mlflow_step),
            ],
        )
    except Exception:
        logger.debug("MLflow log failed", exc_info=True)

+
 DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful AI assistant running on Davies Tech Labs homelab infrastructure. "
    "You are powered by Llama 3.1 70B served via vLLM on AMD Strix Halo (ROCm). "
@@ -273,10 +282,10 @@ def single_prompt(
        metrics = f"""
 **Generation Metrics:**
 - Latency: {latency:.1f}s
- Prompt tokens: {usage.get('prompt_tokens', 'N/A')}
- Completion tokens: {usage.get('completion_tokens', 'N/A')}
- Total tokens: {usage.get('total_tokens', 'N/A')}
- Model: {result.get('model', 'N/A')}
+- Prompt tokens: {usage.get("prompt_tokens", "N/A")}
+- Completion tokens: {usage.get("completion_tokens", "N/A")}
+- Total tokens: {usage.get("total_tokens", "N/A")}
+- Model: {result.get("model", "N/A")}
 """
        return text, metrics

@@ -360,9 +369,13 @@ Chat with **Llama 3.1 70B** (AWQ INT4) served via vLLM on AMD Strix Halo (ROCm).

            gr.Examples(
                examples=[
-                    ["Summarise the key differences between CUDA and ROCm for ML workloads."],
+                    [
+                        "Summarise the key differences between CUDA and ROCm for ML workloads."
+                    ],
                    ["Write a haiku about Kubernetes."],
-                    ["Explain Ray Serve in one paragraph for someone new to ML serving."],
+                    [
+                        "Explain Ray Serve in one paragraph for someone new to ML serving."
+                    ],
                    ["List 5 creative uses for a homelab GPU cluster."],
                ],
                inputs=[prompt_input],