diff --git a/agent/gpu.go b/agent/gpu.go index 18838d11..e9c7c54b 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -44,6 +44,7 @@ type GPUManager struct { rocmSmi bool tegrastats bool intelGpuStats bool + nvml bool GpuDataMap map[string]*system.GPUData // lastAvgData stores the last calculated averages for each GPU // Used when a collection happens before new data arrives (Count == 0) @@ -297,8 +298,13 @@ func (gm *GPUManager) calculateGPUAverage(id string, gpu *system.GPUData, cacheK currentCount := uint32(gpu.Count) deltaCount := gm.calculateDeltaCount(currentCount, lastSnapshot) - // If no new data arrived, use last known average + // If no new data arrived if deltaCount == 0 { + // If GPU appears suspended (instantaneous values are 0), return zero values + // Otherwise return last known average for temporary collection gaps + if gpu.Temperature == 0 && gpu.MemoryUsed == 0 { + return system.GPUData{Name: gpu.Name} + } return gm.lastAvgData[id] // zero value if not found } @@ -396,7 +402,7 @@ func (gm *GPUManager) detectGPUs() error { if _, err := exec.LookPath(intelGpuStatsCmd); err == nil { gm.intelGpuStats = true } - if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats { + if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats || gm.nvml { return nil } return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top") @@ -467,7 +473,20 @@ func NewGPUManager() (*GPUManager, error) { gm.GpuDataMap = make(map[string]*system.GPUData) if gm.nvidiaSmi { - gm.startCollector(nvidiaSmiCmd) + if nvml, _ := GetEnv("NVML"); nvml == "true" { + gm.nvml = true + gm.nvidiaSmi = false + collector := &nvmlCollector{gm: &gm} + if err := collector.init(); err == nil { + go collector.start() + } else { + slog.Warn("Failed to initialize NVML, falling back to nvidia-smi", "err", err) + gm.nvidiaSmi = true + gm.startCollector(nvidiaSmiCmd) + } + } else { + gm.startCollector(nvidiaSmiCmd) + } } if gm.rocmSmi { gm.startCollector(rocmSmiCmd) diff --git a/agent/gpu_nvml.go b/agent/gpu_nvml.go new file mode 100644 index 00000000..2615d80b --- /dev/null +++ b/agent/gpu_nvml.go @@ -0,0 +1,210 @@ +package agent + +import ( + "fmt" + "strings" + "time" + "unsafe" + + "github.com/ebitengine/purego" + "github.com/henrygd/beszel/internal/entities/system" + "golang.org/x/exp/slog" +) + +// NVML constants and types +const ( + nvmlSuccess int = 0 +) + +type nvmlDevice uintptr + +type nvmlReturn int + +type nvmlMemoryV1 struct { + Total uint64 + Free uint64 + Used uint64 +} + +type nvmlMemoryV2 struct { + Version uint32 + Total uint64 + Reserved uint64 + Free uint64 + Used uint64 +} + +type nvmlUtilization struct { + Gpu uint32 + Memory uint32 +} + +type nvmlPciInfo struct { + BusId [16]byte + Domain uint32 + Bus uint32 + Device uint32 + PciDeviceId uint32 + PciSubSystemId uint32 +} + +// NVML function signatures +var ( + nvmlInit func() nvmlReturn + nvmlShutdown func() nvmlReturn + nvmlDeviceGetCount func(count *uint32) nvmlReturn + nvmlDeviceGetHandleByIndex func(index uint32, device *nvmlDevice) nvmlReturn + nvmlDeviceGetName func(device nvmlDevice, name *byte, length uint32) nvmlReturn + nvmlDeviceGetMemoryInfo func(device nvmlDevice, memory uintptr) nvmlReturn + nvmlDeviceGetUtilizationRates func(device nvmlDevice, utilization *nvmlUtilization) nvmlReturn + nvmlDeviceGetTemperature func(device nvmlDevice, sensorType int, temp *uint32) nvmlReturn + nvmlDeviceGetPowerUsage func(device nvmlDevice, power *uint32) nvmlReturn + nvmlDeviceGetPciInfo func(device nvmlDevice, pci *nvmlPciInfo) nvmlReturn + nvmlErrorString func(result nvmlReturn) string +) + +type nvmlCollector struct { + gm *GPUManager + lib uintptr + devices []nvmlDevice + bdfs []string + isV2 bool +} + +func (c *nvmlCollector) init() error { + slog.Debug("NVML: Initializing") + libPath := getNVMLPath() + + lib, err := openLibrary(libPath) + if err != nil { + return fmt.Errorf("failed to load %s: %w", libPath, err) + } + c.lib = lib + + purego.RegisterLibFunc(&nvmlInit, lib, "nvmlInit") + purego.RegisterLibFunc(&nvmlShutdown, lib, "nvmlShutdown") + purego.RegisterLibFunc(&nvmlDeviceGetCount, lib, "nvmlDeviceGetCount") + purego.RegisterLibFunc(&nvmlDeviceGetHandleByIndex, lib, "nvmlDeviceGetHandleByIndex") + purego.RegisterLibFunc(&nvmlDeviceGetName, lib, "nvmlDeviceGetName") + // Try to get v2 memory info, fallback to v1 if not available + if hasSymbol(lib, "nvmlDeviceGetMemoryInfo_v2") { + c.isV2 = true + purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo_v2") + } else { + purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo") + } + purego.RegisterLibFunc(&nvmlDeviceGetUtilizationRates, lib, "nvmlDeviceGetUtilizationRates") + purego.RegisterLibFunc(&nvmlDeviceGetTemperature, lib, "nvmlDeviceGetTemperature") + purego.RegisterLibFunc(&nvmlDeviceGetPowerUsage, lib, "nvmlDeviceGetPowerUsage") + purego.RegisterLibFunc(&nvmlDeviceGetPciInfo, lib, "nvmlDeviceGetPciInfo") + purego.RegisterLibFunc(&nvmlErrorString, lib, "nvmlErrorString") + + if ret := nvmlInit(); ret != nvmlReturn(nvmlSuccess) { + return fmt.Errorf("nvmlInit failed: %v", ret) + } + + var count uint32 + if ret := nvmlDeviceGetCount(&count); ret != nvmlReturn(nvmlSuccess) { + return fmt.Errorf("nvmlDeviceGetCount failed: %v", ret) + } + + for i := uint32(0); i < count; i++ { + var device nvmlDevice + if ret := nvmlDeviceGetHandleByIndex(i, &device); ret == nvmlReturn(nvmlSuccess) { + c.devices = append(c.devices, device) + // Get BDF for power state check + var pci nvmlPciInfo + if ret := nvmlDeviceGetPciInfo(device, &pci); ret == nvmlReturn(nvmlSuccess) { + busID := string(pci.BusId[:]) + if idx := strings.Index(busID, "\x00"); idx != -1 { + busID = busID[:idx] + } + c.bdfs = append(c.bdfs, strings.ToLower(busID)) + } else { + c.bdfs = append(c.bdfs, "") + } + } + } + + return nil +} + +func (c *nvmlCollector) start() { + defer nvmlShutdown() + ticker := time.Tick(3 * time.Second) + + for range ticker { + c.collect() + } +} + +func (c *nvmlCollector) collect() { + c.gm.Lock() + defer c.gm.Unlock() + + for i, device := range c.devices { + id := fmt.Sprintf("%d", i) + bdf := c.bdfs[i] + + // Update GPUDataMap + if _, ok := c.gm.GpuDataMap[id]; !ok { + var nameBuf [64]byte + if ret := nvmlDeviceGetName(device, &nameBuf[0], 64); ret != nvmlReturn(nvmlSuccess) { + continue + } + name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")]) + name = strings.TrimPrefix(name, "NVIDIA ") + c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} + } + gpu := c.gm.GpuDataMap[id] + + if bdf != "" && !c.isGPUActive(bdf) { + slog.Debug("NVML: GPU is suspended, skipping", "bdf", bdf) + gpu.Temperature = 0 + gpu.MemoryUsed = 0 + continue + } + + // Utilization + var utilization nvmlUtilization + if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) { + slog.Debug("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret) + gpu.Temperature = 0 + gpu.MemoryUsed = 0 + continue + } + + slog.Debug("NVML: Collecting data for GPU", "bdf", bdf) + + // Temperature + var temp uint32 + nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU + + // Memory + var usedMem, totalMem uint64 + if c.isV2 { + var memory nvmlMemoryV2 + memory.Version = 0x02000028 // (2 << 24) | 40 bytes + nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory))) + usedMem = memory.Used + totalMem = memory.Total + } else { + var memory nvmlMemoryV1 + nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory))) + usedMem = memory.Used + totalMem = memory.Total + } + + // Power + var power uint32 + nvmlDeviceGetPowerUsage(device, &power) + + gpu.Temperature = float64(temp) + gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte + gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte + gpu.Usage += float64(utilization.Gpu) + gpu.Power += float64(power) / 1000.0 + gpu.Count++ + slog.Debug("NVML: Collected data", "gpu", gpu) + } +} diff --git a/agent/gpu_nvml_linux.go b/agent/gpu_nvml_linux.go new file mode 100644 index 00000000..67468c12 --- /dev/null +++ b/agent/gpu_nvml_linux.go @@ -0,0 +1,57 @@ +//go:build linux + +package agent + +import ( + "os" + "path/filepath" + "strings" + + "github.com/ebitengine/purego" + "golang.org/x/exp/slog" +) + +func openLibrary(name string) (uintptr, error) { + return purego.Dlopen(name, purego.RTLD_NOW|purego.RTLD_GLOBAL) +} + +func getNVMLPath() string { + return "libnvidia-ml.so.1" +} + +func hasSymbol(lib uintptr, symbol string) bool { + _, err := purego.Dlsym(lib, symbol) + return err == nil +} + +func (c *nvmlCollector) isGPUActive(bdf string) bool { + // runtime_status + statusPath := filepath.Join("/sys/bus/pci/devices", bdf, "power/runtime_status") + status, err := os.ReadFile(statusPath) + if err != nil { + slog.Debug("NVML: Can't read runtime_status", "bdf", bdf, "err", err) + return true // Assume active if we can't read status + } + statusStr := strings.TrimSpace(string(status)) + if statusStr != "active" && statusStr != "resuming" { + slog.Debug("NVML: GPU not active", "bdf", bdf, "status", statusStr) + return false + } + + // power_state (D0 check) + // Find any drm card device power_state + pstatePathPattern := filepath.Join("/sys/bus/pci/devices", bdf, "drm/card*/device/power_state") + matches, _ := filepath.Glob(pstatePathPattern) + if len(matches) > 0 { + pstate, err := os.ReadFile(matches[0]) + if err == nil { + pstateStr := strings.TrimSpace(string(pstate)) + if pstateStr != "D0" { + slog.Debug("NVML: GPU not in D0 state", "bdf", bdf, "pstate", pstateStr) + return false + } + } + } + + return true +} diff --git a/agent/gpu_nvml_unsupported.go b/agent/gpu_nvml_unsupported.go new file mode 100644 index 00000000..ac5502a3 --- /dev/null +++ b/agent/gpu_nvml_unsupported.go @@ -0,0 +1,21 @@ +//go:build !linux && !windows + +package agent + +import "fmt" + +func openLibrary(name string) (uintptr, error) { + return 0, fmt.Errorf("nvml not supported on this platform") +} + +func getNVMLPath() string { + return "" +} + +func hasSymbol(lib uintptr, symbol string) bool { + return false +} + +func (c *nvmlCollector) isGPUActive(bdf string) bool { + return true +} diff --git a/agent/gpu_nvml_windows.go b/agent/gpu_nvml_windows.go new file mode 100644 index 00000000..67318ec9 --- /dev/null +++ b/agent/gpu_nvml_windows.go @@ -0,0 +1,25 @@ +//go:build windows + +package agent + +import ( + "golang.org/x/sys/windows" +) + +func openLibrary(name string) (uintptr, error) { + handle, err := windows.LoadLibrary(name) + return uintptr(handle), err +} + +func getNVMLPath() string { + return "nvml.dll" +} + +func hasSymbol(lib uintptr, symbol string) bool { + _, err := windows.GetProcAddress(windows.Handle(lib), symbol) + return err == nil +} + +func (c *nvmlCollector) isGPUActive(bdf string) bool { + return true +} diff --git a/go.mod b/go.mod index 2b38fda3..d19d4ff9 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/blang/semver v3.5.1+incompatible github.com/coreos/go-systemd/v22 v22.6.0 github.com/distatus/battery v0.11.0 + github.com/ebitengine/purego v0.9.1 github.com/fxamacker/cbor/v2 v2.9.0 github.com/gliderlabs/ssh v0.3.8 github.com/google/uuid v1.6.0 @@ -20,6 +21,7 @@ require ( github.com/stretchr/testify v1.11.1 golang.org/x/crypto v0.45.0 golang.org/x/exp v0.0.0-20251125195548-87e1e737ad39 + golang.org/x/sys v0.38.0 gopkg.in/yaml.v3 v3.0.1 ) @@ -31,7 +33,6 @@ require ( github.com/dolthub/maphash v0.1.0 // indirect github.com/domodwyer/mailyak/v3 v3.6.2 // indirect github.com/dustin/go-humanize v1.0.1 // indirect - github.com/ebitengine/purego v0.9.1 // indirect github.com/fatih/color v1.18.0 // indirect github.com/gabriel-vasile/mimetype v1.4.11 // indirect github.com/ganigeorgiev/fexpr v0.5.0 // indirect @@ -57,7 +58,6 @@ require ( golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.33.0 // indirect golang.org/x/sync v0.18.0 // indirect - golang.org/x/sys v0.38.0 // indirect golang.org/x/term v0.37.0 // indirect golang.org/x/text v0.31.0 // indirect howett.net/plist v1.0.1 // indirect diff --git a/go.sum b/go.sum index a7697a0b..b5dc5a17 100644 --- a/go.sum +++ b/go.sum @@ -62,8 +62,6 @@ github.com/google/pprof v0.0.0-20251114195745-4902fdda35c8 h1:3DsUAV+VNEQa2CUVLx github.com/google/pprof v0.0.0-20251114195745-4902fdda35c8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= -github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jarcoal/httpmock v1.4.1 h1:0Ju+VCFuARfFlhVXFc2HxlcQkfB+Xq12/EotHko+x2A= @@ -173,22 +171,18 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= howett.net/plist v1.0.1 h1:37GdZ8tP09Q35o9ych3ehygcsL+HqKSwzctveSlarvM= howett.net/plist v1.0.1/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= -modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis= -modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= -modernc.org/ccgo/v4 v4.30.1 h1:4r4U1J6Fhj98NKfSjnPUN7Ze2c6MnAdL0hWw6+LrJpc= -modernc.org/ccgo/v4 v4.30.1/go.mod h1:bIOeI1JL54Utlxn+LwrFyjCx2n2RDiYEaJVSrgdrRfM= +modernc.org/cc/v4 v4.26.5 h1:xM3bX7Mve6G8K8b+T11ReenJOT+BmVqQj0FY5T4+5Y4= +modernc.org/cc/v4 v4.26.5/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= +modernc.org/ccgo/v4 v4.28.1 h1:wPKYn5EC/mYTqBO373jKjvX2n+3+aK7+sICCv4Fjy1A= +modernc.org/ccgo/v4 v4.28.1/go.mod h1:uD+4RnfrVgE6ec9NGguUNdhqzNIeeomeXf6CL0GTE5Q= modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA= modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc= modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= -modernc.org/gc/v3 v3.1.1 h1:k8T3gkXWY9sEiytKhcgyiZ2L0DTyCQ/nvX+LoCljoRE= -modernc.org/gc/v3 v3.1.1/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= modernc.org/libc v1.66.10 h1:yZkb3YeLx4oynyR+iUsXsybsX4Ubx7MQlSYEw4yj59A= modernc.org/libc v1.66.10/go.mod h1:8vGSEwvoUoltr4dlywvHqjtAqHBaw0j1jI7iFBTAr2I= -modernc.org/libc v1.67.1 h1:bFaqOaa5/zbWYJo8aW0tXPX21hXsngG2M7mckCnFSVk= -modernc.org/libc v1.67.1/go.mod h1:QvvnnJ5P7aitu0ReNpVIEyesuhmDLQ8kaEoyMjIFZJA= modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=