From 475c53a55d9928acffe03801d4a23c1946520431 Mon Sep 17 00:00:00 2001 From: crimist Date: Mon, 5 Jan 2026 12:26:59 -0800 Subject: [PATCH] nvml: add rtd3 memory workaround, fix slog imports (#1587) * NVML: only read memory usage if utilization > 0% to allow rtd3, #1522 * logging: /x/exp/slog -> log/slog everywhere, fixes log instance inconsistencies --- agent/gpu.go | 2 +- agent/gpu_nvml.go | 42 ++++++++++++++++++++++++++--------------- agent/gpu_nvml_linux.go | 2 +- agent/handlers.go | 2 +- agent/smart.go | 2 +- 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/agent/gpu.go b/agent/gpu.go index e9c7c54b..dc5a6530 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -15,7 +15,7 @@ import ( "github.com/henrygd/beszel/internal/entities/system" - "golang.org/x/exp/slog" + "log/slog" ) const ( diff --git a/agent/gpu_nvml.go b/agent/gpu_nvml.go index 2615d80b..6f5951e1 100644 --- a/agent/gpu_nvml.go +++ b/agent/gpu_nvml.go @@ -8,7 +8,7 @@ import ( "github.com/ebitengine/purego" "github.com/henrygd/beszel/internal/entities/system" - "golang.org/x/exp/slog" + "log/slog" ) // NVML constants and types @@ -180,19 +180,33 @@ func (c *nvmlCollector) collect() { var temp uint32 nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU - // Memory - var usedMem, totalMem uint64 - if c.isV2 { - var memory nvmlMemoryV2 - memory.Version = 0x02000028 // (2 << 24) | 40 bytes - nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory))) - usedMem = memory.Used - totalMem = memory.Total + // only poll memory if GPU is active to avoid resetting 21 second suspend timer + if utilization.Gpu > 0 { + var usedMem, totalMem uint64 + if c.isV2 { + var memory nvmlMemoryV2 + memory.Version = 0x02000028 // (2 << 24) | 40 bytes + if ret := nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory))); ret != nvmlReturn(nvmlSuccess) { + slog.Debug("NVML: MemoryInfo_v2 failed", "bdf", bdf, "ret", ret) + } else { + usedMem = memory.Used + totalMem = memory.Total + } + } else { + var memory nvmlMemoryV1 + if ret := nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory))); ret != nvmlReturn(nvmlSuccess) { + slog.Debug("NVML: MemoryInfo failed", "bdf", bdf, "ret", ret) + } else { + usedMem = memory.Used + totalMem = memory.Total + } + } + if totalMem > 0 { + gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte + gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte + } } else { - var memory nvmlMemoryV1 - nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory))) - usedMem = memory.Used - totalMem = memory.Total + slog.Debug("NVML: Skipping memory info (utilization=0)", "bdf", bdf) } // Power @@ -200,8 +214,6 @@ func (c *nvmlCollector) collect() { nvmlDeviceGetPowerUsage(device, &power) gpu.Temperature = float64(temp) - gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte - gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte gpu.Usage += float64(utilization.Gpu) gpu.Power += float64(power) / 1000.0 gpu.Count++ diff --git a/agent/gpu_nvml_linux.go b/agent/gpu_nvml_linux.go index 67468c12..a9bf4573 100644 --- a/agent/gpu_nvml_linux.go +++ b/agent/gpu_nvml_linux.go @@ -8,7 +8,7 @@ import ( "strings" "github.com/ebitengine/purego" - "golang.org/x/exp/slog" + "log/slog" ) func openLibrary(name string) (uintptr, error) { diff --git a/agent/handlers.go b/agent/handlers.go index c6f8f9c3..2db07d31 100644 --- a/agent/handlers.go +++ b/agent/handlers.go @@ -9,7 +9,7 @@ import ( "github.com/henrygd/beszel/internal/common" "github.com/henrygd/beszel/internal/entities/smart" - "golang.org/x/exp/slog" + "log/slog" ) // HandlerContext provides context for request handlers diff --git a/agent/smart.go b/agent/smart.go index d1e003d6..3989d6ac 100644 --- a/agent/smart.go +++ b/agent/smart.go @@ -19,7 +19,7 @@ import ( "github.com/henrygd/beszel/internal/entities/smart" - "golang.org/x/exp/slog" + "log/slog" ) // SmartManager manages data collection for SMART devices