fix: remove unused imports and apply ruff formatting
- Remove unused imports: json (llm.py), tempfile (stt.py), base64 (tts.py)
- Apply ruff format to all Python files
This commit is contained in:
35
llm.py
35
llm.py
@@ -9,10 +9,10 @@ Features:
|
||||
- Token usage and latency metrics
|
||||
- Chat history management
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
|
||||
import gradio as gr
|
||||
import httpx
|
||||
@@ -65,7 +65,9 @@ try:
|
||||
_mlflow_run_id = _mlflow_run.info.run_id
|
||||
_mlflow_step = 0
|
||||
MLFLOW_ENABLED = True
|
||||
logger.info("MLflow tracking enabled: experiment=%s run=%s", _experiment_id, _mlflow_run_id)
|
||||
logger.info(
|
||||
"MLflow tracking enabled: experiment=%s run=%s", _experiment_id, _mlflow_run_id
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("MLflow tracking disabled: %s", exc)
|
||||
_mlflow_client = None
|
||||
@@ -95,18 +97,25 @@ def _log_llm_metrics(
|
||||
_mlflow_run_id,
|
||||
metrics=[
|
||||
mlflow.entities.Metric("latency_s", latency, ts, _mlflow_step),
|
||||
mlflow.entities.Metric("prompt_tokens", prompt_tokens, ts, _mlflow_step),
|
||||
mlflow.entities.Metric("completion_tokens", completion_tokens, ts, _mlflow_step),
|
||||
mlflow.entities.Metric(
|
||||
"prompt_tokens", prompt_tokens, ts, _mlflow_step
|
||||
),
|
||||
mlflow.entities.Metric(
|
||||
"completion_tokens", completion_tokens, ts, _mlflow_step
|
||||
),
|
||||
mlflow.entities.Metric("total_tokens", total_tokens, ts, _mlflow_step),
|
||||
mlflow.entities.Metric("tokens_per_second", tps, ts, _mlflow_step),
|
||||
mlflow.entities.Metric("temperature", temperature, ts, _mlflow_step),
|
||||
mlflow.entities.Metric("max_tokens_requested", max_tokens, ts, _mlflow_step),
|
||||
mlflow.entities.Metric(
|
||||
"max_tokens_requested", max_tokens, ts, _mlflow_step
|
||||
),
|
||||
mlflow.entities.Metric("top_p", top_p, ts, _mlflow_step),
|
||||
],
|
||||
)
|
||||
except Exception:
|
||||
logger.debug("MLflow log failed", exc_info=True)
|
||||
|
||||
|
||||
DEFAULT_SYSTEM_PROMPT = (
|
||||
"You are a helpful AI assistant running on Davies Tech Labs homelab infrastructure. "
|
||||
"You are powered by Llama 3.1 70B served via vLLM on AMD Strix Halo (ROCm). "
|
||||
@@ -273,10 +282,10 @@ def single_prompt(
|
||||
metrics = f"""
|
||||
**Generation Metrics:**
|
||||
- Latency: {latency:.1f}s
|
||||
- Prompt tokens: {usage.get('prompt_tokens', 'N/A')}
|
||||
- Completion tokens: {usage.get('completion_tokens', 'N/A')}
|
||||
- Total tokens: {usage.get('total_tokens', 'N/A')}
|
||||
- Model: {result.get('model', 'N/A')}
|
||||
- Prompt tokens: {usage.get("prompt_tokens", "N/A")}
|
||||
- Completion tokens: {usage.get("completion_tokens", "N/A")}
|
||||
- Total tokens: {usage.get("total_tokens", "N/A")}
|
||||
- Model: {result.get("model", "N/A")}
|
||||
"""
|
||||
return text, metrics
|
||||
|
||||
@@ -360,9 +369,13 @@ Chat with **Llama 3.1 70B** (AWQ INT4) served via vLLM on AMD Strix Halo (ROCm).
|
||||
|
||||
gr.Examples(
|
||||
examples=[
|
||||
["Summarise the key differences between CUDA and ROCm for ML workloads."],
|
||||
[
|
||||
"Summarise the key differences between CUDA and ROCm for ML workloads."
|
||||
],
|
||||
["Write a haiku about Kubernetes."],
|
||||
["Explain Ray Serve in one paragraph for someone new to ML serving."],
|
||||
[
|
||||
"Explain Ray Serve in one paragraph for someone new to ML serving."
|
||||
],
|
||||
["List 5 creative uses for a homelab GPU cluster."],
|
||||
],
|
||||
inputs=[prompt_input],
|
||||
|
||||
Reference in New Issue
Block a user