feat(strixhalo): add amdsmi sysfs shim to bypass glibc 2.38 requirement
Some checks failed
Build and Push Images / determine-version (push) Successful in 58s
Build and Push Images / Release (push) Has been cancelled
Build and Push Images / Notify (push) Has been cancelled
Build and Push Images / build-strixhalo (push) Has been cancelled
Build and Push Images / build-intel (push) Has been cancelled
Build and Push Images / build-nvidia (push) Has been cancelled
Build and Push Images / build-rdna2 (push) Has been cancelled
Some checks failed
Build and Push Images / determine-version (push) Successful in 58s
Build and Push Images / Release (push) Has been cancelled
Build and Push Images / Notify (push) Has been cancelled
Build and Push Images / build-strixhalo (push) Has been cancelled
Build and Push Images / build-intel (push) Has been cancelled
Build and Push Images / build-nvidia (push) Has been cancelled
Build and Push Images / build-rdna2 (push) Has been cancelled
The native amdsmi from ROCm 7.1 requires libamd_smi.so linked against glibc 2.38 (Ubuntu 24.04), but the Ray base image is Ubuntu 22.04 (glibc 2.35). This caused vLLM to fail ROCm platform detection with 'No module named amdsmi' / GLIBC_2.38 not found errors. Solution: Pure-Python amdsmi shim that reads GPU info from sysfs (/sys/class/drm/*) instead of the native library. Provides the full API surface used by both vLLM (platform detection, device info) and PyTorch (device counting, memory/power/temp monitoring). Tested in-container: vLLM detects RocmPlatform, PyTorch sees GPU (Radeon 8060S, 128GB, HIP 7.3), DeviceConfig resolves to 'cuda'. Changes: - Add amdsmi-shim/ package with sysfs-backed implementation - Update Dockerfile to install shim after vLLM/torch - Add amdsmi-shim/ to .dockerignore explicit includes
This commit is contained in:
@@ -42,3 +42,4 @@ Makefile
|
|||||||
# Don't ignore these (explicitly include)
|
# Don't ignore these (explicitly include)
|
||||||
!ray-serve/
|
!ray-serve/
|
||||||
!dockerfiles/
|
!dockerfiles/
|
||||||
|
!amdsmi-shim/
|
||||||
|
|||||||
335
amdsmi-shim/amdsmi/__init__.py
Normal file
335
amdsmi-shim/amdsmi/__init__.py
Normal file
@@ -0,0 +1,335 @@
|
|||||||
|
"""
|
||||||
|
Pure-Python amdsmi shim using sysfs for ROCm platform detection.
|
||||||
|
|
||||||
|
Replaces the native amdsmi package (which requires libamd_smi.so linked
|
||||||
|
against glibc 2.38) with a lightweight implementation that reads GPU
|
||||||
|
information from sysfs (/sys/class/drm/*).
|
||||||
|
|
||||||
|
This is needed because the Ray base image (Ubuntu 22.04, glibc 2.35) cannot
|
||||||
|
load the ROCm 7.1 vendor libamd_smi.so (built for Ubuntu 24.04, glibc 2.38).
|
||||||
|
|
||||||
|
Provides the subset of the amdsmi API used by:
|
||||||
|
- vLLM (platform detection, device info, topology)
|
||||||
|
- PyTorch (device counting, memory/power/temp monitoring)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from enum import IntEnum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Exception
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class AmdSmiException(Exception):
|
||||||
|
"""Shim for amdsmi.AmdSmiException."""
|
||||||
|
|
||||||
|
def __init__(self, message: str = "", err_code: int = -1):
|
||||||
|
super().__init__(message)
|
||||||
|
self.err_code = err_code
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Enums used by PyTorch monitoring helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class AmdSmiTemperatureType(IntEnum):
|
||||||
|
EDGE = 0
|
||||||
|
JUNCTION = 1
|
||||||
|
VRAM = 2
|
||||||
|
|
||||||
|
|
||||||
|
class AmdSmiTemperatureMetric(IntEnum):
|
||||||
|
CURRENT = 0
|
||||||
|
MAX = 1
|
||||||
|
MIN = 2
|
||||||
|
|
||||||
|
|
||||||
|
class AmdSmiClkType(IntEnum):
|
||||||
|
GFX = 0
|
||||||
|
MEM = 1
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# sysfs helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_DRM_BASE = "/sys/class/drm"
|
||||||
|
|
||||||
|
|
||||||
|
def _read_sysfs(path: str, default: str = "") -> str:
|
||||||
|
"""Read a sysfs file, returning *default* on any error."""
|
||||||
|
try:
|
||||||
|
return Path(path).read_text().strip()
|
||||||
|
except (OSError, IOError):
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _read_sysfs_int(path: str, default: int = 0) -> int:
|
||||||
|
val = _read_sysfs(path)
|
||||||
|
try:
|
||||||
|
if val.startswith("0x"):
|
||||||
|
return int(val, 16)
|
||||||
|
return int(val)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _discover_cards() -> list[str]:
|
||||||
|
"""Return sorted list of card directory names, e.g. ['card0', 'card1']."""
|
||||||
|
cards = []
|
||||||
|
for entry in sorted(glob.glob(os.path.join(_DRM_BASE, "card[0-9]*"))):
|
||||||
|
name = os.path.basename(entry)
|
||||||
|
# Exclude render nodes and connector entries like card0-DP-1
|
||||||
|
if name.startswith("card") and "-" not in name:
|
||||||
|
vendor = _read_sysfs(os.path.join(entry, "device", "vendor"))
|
||||||
|
# 0x1002 = AMD
|
||||||
|
if vendor == "0x1002":
|
||||||
|
cards.append(name)
|
||||||
|
return cards
|
||||||
|
|
||||||
|
|
||||||
|
def _find_hwmon(card: str) -> str | None:
|
||||||
|
"""Return the first hwmon directory path for a card, or None."""
|
||||||
|
hwmon_base = os.path.join(_DRM_BASE, card, "device", "hwmon")
|
||||||
|
entries = sorted(glob.glob(os.path.join(hwmon_base, "hwmon*")))
|
||||||
|
return entries[0] if entries else None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Handle management
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Each "handle" is just a dict with cached paths
|
||||||
|
_handles: list[dict[str, Any]] = []
|
||||||
|
_initialized = False
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_init() -> None:
|
||||||
|
"""Initialise the shim by discovering AMD GPUs via sysfs."""
|
||||||
|
global _handles, _initialized
|
||||||
|
if _initialized:
|
||||||
|
return
|
||||||
|
cards = _discover_cards()
|
||||||
|
_handles = []
|
||||||
|
for card in cards:
|
||||||
|
dev = os.path.join(_DRM_BASE, card, "device")
|
||||||
|
_handles.append(
|
||||||
|
{
|
||||||
|
"card": card,
|
||||||
|
"dev": dev,
|
||||||
|
"hwmon": _find_hwmon(card),
|
||||||
|
"vendor": _read_sysfs(os.path.join(dev, "vendor")),
|
||||||
|
"device_id": _read_sysfs(os.path.join(dev, "device")),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
_initialized = True
|
||||||
|
logger.debug("amdsmi-shim: discovered %d AMD GPU(s) via sysfs", len(_handles))
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_shut_down() -> None:
|
||||||
|
"""Reset internal state."""
|
||||||
|
global _handles, _initialized
|
||||||
|
_handles = []
|
||||||
|
_initialized = False
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_get_processor_handles() -> list[dict[str, Any]]:
|
||||||
|
"""Return a list of opaque handles (one per GPU)."""
|
||||||
|
if not _initialized:
|
||||||
|
raise AmdSmiException("amdsmi not initialised", err_code=1)
|
||||||
|
return list(_handles)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Device info (used by vLLM get_device_name + PyTorch UUID helpers)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_get_gpu_asic_info(handle: dict[str, Any]) -> dict[str, str]:
|
||||||
|
"""
|
||||||
|
Return ASIC information for a GPU handle.
|
||||||
|
|
||||||
|
Keys returned: device_id, market_name, asic_serial, vendor_id, rev_id
|
||||||
|
"""
|
||||||
|
dev = handle["dev"]
|
||||||
|
device_id_hex = _read_sysfs(os.path.join(dev, "device"), "0x0000")
|
||||||
|
# Read marketing / product name (may not exist on all kernels)
|
||||||
|
market = (
|
||||||
|
_read_sysfs(os.path.join(dev, "product_name"))
|
||||||
|
or _read_sysfs(os.path.join(dev, "marketing_name"))
|
||||||
|
or f"AMD GPU {device_id_hex}"
|
||||||
|
)
|
||||||
|
# Unique ID for serial (not always available)
|
||||||
|
serial = _read_sysfs(os.path.join(dev, "unique_id"), "0000000000000000")
|
||||||
|
return {
|
||||||
|
"device_id": device_id_hex,
|
||||||
|
"market_name": market,
|
||||||
|
"asic_serial": serial,
|
||||||
|
"vendor_id": _read_sysfs(os.path.join(dev, "vendor"), "0x1002"),
|
||||||
|
"rev_id": _read_sysfs(os.path.join(dev, "revision"), "0x00"),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Topology (used by vLLM is_fully_connected)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_topo_get_link_type(
|
||||||
|
handle: dict[str, Any], peer_handle: dict[str, Any]
|
||||||
|
) -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Return link topology between two GPUs.
|
||||||
|
|
||||||
|
For single-GPU systems this is essentially a no-op.
|
||||||
|
type: 2 = XGMI, 1 = PCIe; hops = number of hops.
|
||||||
|
"""
|
||||||
|
if handle is peer_handle or handle["card"] == peer_handle["card"]:
|
||||||
|
return {"type": 2, "hops": 0} # same device
|
||||||
|
# Default: not XGMI-connected
|
||||||
|
return {"type": 1, "hops": 2}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Memory (used by PyTorch _get_amdsmi_device_memory_used)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_get_gpu_vram_usage(handle: dict[str, Any]) -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Return VRAM usage in megabytes.
|
||||||
|
|
||||||
|
Keys: vram_used, vram_total
|
||||||
|
"""
|
||||||
|
dev = handle["dev"]
|
||||||
|
total_bytes = _read_sysfs_int(os.path.join(dev, "mem_info_vram_total"))
|
||||||
|
used_bytes = _read_sysfs_int(os.path.join(dev, "mem_info_vram_used"))
|
||||||
|
return {
|
||||||
|
"vram_used": used_bytes // (1024 * 1024),
|
||||||
|
"vram_total": total_bytes // (1024 * 1024),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Activity (used by PyTorch _get_amdsmi_utilization / memory_usage)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_get_gpu_activity(handle: dict[str, Any]) -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Return GPU and memory controller activity percentages.
|
||||||
|
|
||||||
|
Keys: gfx_activity, umc_activity, mm_activity
|
||||||
|
"""
|
||||||
|
dev = handle["dev"]
|
||||||
|
gfx = _read_sysfs_int(os.path.join(dev, "gpu_busy_percent"))
|
||||||
|
# mem_busy_percent is not always available
|
||||||
|
umc = _read_sysfs_int(os.path.join(dev, "mem_busy_percent"))
|
||||||
|
return {"gfx_activity": gfx, "umc_activity": umc, "mm_activity": 0}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Power (used by PyTorch _get_amdsmi_power_draw)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_get_power_info(handle: dict[str, Any]) -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Return power info in watts.
|
||||||
|
|
||||||
|
Keys: average_socket_power, current_socket_power
|
||||||
|
"""
|
||||||
|
hwmon = handle.get("hwmon")
|
||||||
|
if not hwmon:
|
||||||
|
return {"average_socket_power": 0, "current_socket_power": 0}
|
||||||
|
# sysfs reports in microwatts
|
||||||
|
avg_uw = _read_sysfs_int(os.path.join(hwmon, "power1_average"))
|
||||||
|
cur_uw = _read_sysfs_int(os.path.join(hwmon, "power1_input"), avg_uw)
|
||||||
|
return {
|
||||||
|
"average_socket_power": avg_uw // 1_000_000,
|
||||||
|
"current_socket_power": cur_uw // 1_000_000,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Temperature (used by PyTorch _get_amdsmi_temperature)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_get_temp_metric(
|
||||||
|
handle: dict[str, Any],
|
||||||
|
sensor_type: AmdSmiTemperatureType = AmdSmiTemperatureType.JUNCTION,
|
||||||
|
metric: AmdSmiTemperatureMetric = AmdSmiTemperatureMetric.CURRENT,
|
||||||
|
) -> int:
|
||||||
|
"""
|
||||||
|
Return temperature in millidegrees Celsius.
|
||||||
|
|
||||||
|
PyTorch uses JUNCTION / CURRENT and the result is divided by 1000.
|
||||||
|
"""
|
||||||
|
hwmon = handle.get("hwmon")
|
||||||
|
if not hwmon:
|
||||||
|
return 0
|
||||||
|
# Map sensor type to sysfs file: temp1=edge, temp2=junction, temp3=vram
|
||||||
|
idx = {
|
||||||
|
AmdSmiTemperatureType.EDGE: 1,
|
||||||
|
AmdSmiTemperatureType.JUNCTION: 2,
|
||||||
|
AmdSmiTemperatureType.VRAM: 3,
|
||||||
|
}.get(sensor_type, 1)
|
||||||
|
suffix = {
|
||||||
|
AmdSmiTemperatureMetric.CURRENT: "input",
|
||||||
|
AmdSmiTemperatureMetric.MAX: "max",
|
||||||
|
AmdSmiTemperatureMetric.MIN: "min",
|
||||||
|
}.get(metric, "input")
|
||||||
|
# sysfs already reports millidegrees
|
||||||
|
return _read_sysfs_int(os.path.join(hwmon, f"temp{idx}_{suffix}"))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Clock (used by PyTorch _get_amdsmi_clock_rate via vLLM reference)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def amdsmi_get_clock_info(
|
||||||
|
handle: dict[str, Any],
|
||||||
|
clk_type: AmdSmiClkType = AmdSmiClkType.GFX,
|
||||||
|
) -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Return clock info in MHz.
|
||||||
|
|
||||||
|
Keys: clk (current), max_clk, min_clk
|
||||||
|
"""
|
||||||
|
dev = handle["dev"]
|
||||||
|
sysfs_file = {
|
||||||
|
AmdSmiClkType.GFX: "pp_dpm_sclk",
|
||||||
|
AmdSmiClkType.MEM: "pp_dpm_mclk",
|
||||||
|
}.get(clk_type, "pp_dpm_sclk")
|
||||||
|
|
||||||
|
raw = _read_sysfs(os.path.join(dev, sysfs_file))
|
||||||
|
current_mhz = 0
|
||||||
|
min_mhz = 0
|
||||||
|
max_mhz = 0
|
||||||
|
for line in raw.splitlines():
|
||||||
|
# Lines look like: "0: 600Mhz *"
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 2:
|
||||||
|
try:
|
||||||
|
freq = int(parts[1].replace("Mhz", "").replace("MHz", ""))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
if min_mhz == 0 or freq < min_mhz:
|
||||||
|
min_mhz = freq
|
||||||
|
if freq > max_mhz:
|
||||||
|
max_mhz = freq
|
||||||
|
if "*" in line:
|
||||||
|
current_mhz = freq
|
||||||
|
return {"clk": current_mhz, "max_clk": max_mhz, "min_clk": min_mhz}
|
||||||
13
amdsmi-shim/pyproject.toml
Normal file
13
amdsmi-shim/pyproject.toml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=64"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "amdsmi"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Lightweight sysfs-based amdsmi shim for ROCm platform detection without libamd_smi.so"
|
||||||
|
requires-python = ">=3.8"
|
||||||
|
license = "MIT"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
include = ["amdsmi*"]
|
||||||
@@ -32,7 +32,7 @@ COPY --link --from=rocm-source /opt/rocm /opt/rocm
|
|||||||
# ROCm environment variables - split to ensure ROCM_HOME is set first
|
# ROCm environment variables - split to ensure ROCM_HOME is set first
|
||||||
ENV ROCM_HOME=/opt/rocm
|
ENV ROCM_HOME=/opt/rocm
|
||||||
ENV PATH="/opt/rocm/bin:/opt/rocm/llvm/bin:/home/ray/anaconda3/bin:/home/ray/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \
|
ENV PATH="/opt/rocm/bin:/opt/rocm/llvm/bin:/home/ray/anaconda3/bin:/home/ray/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \
|
||||||
LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64" \
|
LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64:/home/ray/anaconda3/lib" \
|
||||||
HSA_PATH="/opt/rocm/hsa" \
|
HSA_PATH="/opt/rocm/hsa" \
|
||||||
HIP_PATH="/opt/rocm/hip" \
|
HIP_PATH="/opt/rocm/hip" \
|
||||||
# Strix Halo (gfx1151) specific settings
|
# Strix Halo (gfx1151) specific settings
|
||||||
@@ -84,6 +84,15 @@ RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
|||||||
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
uv pip install --system 'pandas>=2.0.0,<3.0'
|
uv pip install --system 'pandas>=2.0.0,<3.0'
|
||||||
|
|
||||||
|
# Install amdsmi sysfs shim LAST (required for vLLM ROCm platform detection).
|
||||||
|
# The native amdsmi from ROCm 7.1 requires glibc 2.38 (Ubuntu 24.04),
|
||||||
|
# but the Ray base image is Ubuntu 22.04 (glibc 2.35). This pure-Python
|
||||||
|
# shim reads GPU info from /sys/class/drm/* instead of libamd_smi.so.
|
||||||
|
# Must be installed after vLLM/torch to prevent PyPI amdsmi from overwriting it.
|
||||||
|
COPY --chown=1000:100 amdsmi-shim /tmp/amdsmi-shim
|
||||||
|
RUN --mount=type=cache,target=/home/ray/.cache/uv,uid=1000,gid=1000 \
|
||||||
|
uv pip install --system /tmp/amdsmi-shim && rm -rf /tmp/amdsmi-shim
|
||||||
|
|
||||||
# Pre-download common models for faster cold starts (optional, increases image size)
|
# Pre-download common models for faster cold starts (optional, increases image size)
|
||||||
# RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')"
|
# RUN python3 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-large-en-v1.5')"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user