fix(strixhalo): skip VRAM patch in low-memory init containers
Some checks failed
Build and Push Images / determine-version (push) Successful in 5s
Build and Push Images / build (Dockerfile.ray-worker-nvidia, nvidia) (push) Failing after 24s
Build and Push Images / build (Dockerfile.ray-worker-intel, intel) (push) Failing after 27s
Build and Push Images / build (Dockerfile.ray-worker-strixhalo, strixhalo) (push) Failing after 22s
Build and Push Images / build (Dockerfile.ray-worker-rdna2, rdna2) (push) Failing after 24s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s
Some checks failed
Build and Push Images / determine-version (push) Successful in 5s
Build and Push Images / build (Dockerfile.ray-worker-nvidia, nvidia) (push) Failing after 24s
Build and Push Images / build (Dockerfile.ray-worker-intel, intel) (push) Failing after 27s
Build and Push Images / build (Dockerfile.ray-worker-strixhalo, strixhalo) (push) Failing after 22s
Build and Push Images / build (Dockerfile.ray-worker-rdna2, rdna2) (push) Failing after 24s
Build and Push Images / Release (push) Has been skipped
Build and Push Images / Notify (push) Successful in 1s
KubeRay's auto-injected wait-gcs-ready init container has only a 256Mi memory limit. The .pth hook was unconditionally importing torch+ROCm, which requires more than 256Mi, causing an OOMKill. It now checks the cgroup memory limit first — if it is under 512Mi, it skips the expensive torch import entirely. The VRAM patch is only needed by the main Ray worker process, not by health-check init containers.
This commit is contained in:
@@ -55,8 +55,33 @@ def _get_real_vram() -> tuple[int, int] | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _should_skip(
    *,
    threshold_bytes: int = 512 * 1024 * 1024,
    cgroup_paths: tuple[str, ...] = (
        "/sys/fs/cgroup/memory.max",  # cgroup v2
        "/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroup v1
    ),
) -> bool:
    """Check if we should skip the patch (lightweight/init containers).

    Reads the container's cgroup memory limit and reports whether it is
    too small to afford the expensive torch/ROCm import. KubeRay's
    wait-gcs-ready init container has only 256Mi, and importing
    torch+ROCm there would OOMKill it.

    Args:
        threshold_bytes: Limits strictly below this many bytes cause a
            skip. Defaults to 512 MiB.
        cgroup_paths: Files to probe for a memory limit, in order.
            Defaults to the cgroup v2 and cgroup v1 locations.

    Returns:
        True if any readable limit file holds an integer below
        ``threshold_bytes``; False otherwise, including when no file is
        readable, the limit is ``"max"`` (unlimited), or the content is
        not an integer.
    """
    for cgroup_mem_path in cgroup_paths:
        # Keep the try bodies minimal: only the read and the parse can fail.
        try:
            with open(cgroup_mem_path) as f:
                val = f.read().strip()
        except OSError:
            # File missing or unreadable: this cgroup flavour is not in use.
            continue

        if val == "max":
            # cgroup v2 spelling of "no limit".
            continue

        try:
            limit = int(val)
        except ValueError:
            # Unexpected content: treat as "unknown" and try the next path.
            continue

        if limit < threshold_bytes:
            return True

    return False
|
||||||
|
|
||||||
|
|
||||||
def _apply_patch() -> None:
|
def _apply_patch() -> None:
|
||||||
"""Patch torch.cuda.mem_get_info if we detect unified memory mis-reporting."""
|
"""Patch torch.cuda.mem_get_info if we detect unified memory mis-reporting."""
|
||||||
|
if _should_skip():
|
||||||
|
return
|
||||||
|
|
||||||
|
if _get_real_vram() is None:
|
||||||
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import torch
|
import torch
|
||||||
if not hasattr(torch, "cuda") or not torch.cuda.is_available():
|
if not hasattr(torch, "cuda") or not torch.cuda.is_available():
|
||||||
|
|||||||
Reference in New Issue
Block a user