package agent

import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"
	"unsafe"

	"github.com/ebitengine/purego"
	"github.com/henrygd/beszel/internal/entities/system"
	"golang.org/x/exp/slog"
)

// NVML constants and types
const (
	// nvmlSuccess is the return code the NVML calls in this file are compared
	// against to detect success.
	nvmlSuccess int = 0
)

// nvmlDevice is an opaque device handle filled in by nvmlDeviceGetHandleByIndex.
type nvmlDevice uintptr

// nvmlReturn is the status code returned by every NVML call.
type nvmlReturn int

// nvmlMemoryV1 is the buffer passed to nvmlDeviceGetMemoryInfo.
type nvmlMemoryV1 struct {
	Total uint64
	Free  uint64
	Used  uint64
}

// nvmlMemoryV2 is the buffer passed to nvmlDeviceGetMemoryInfo_v2. Version
// must be set by the caller before the call (see collect).
type nvmlMemoryV2 struct {
	Version  uint32
	Total    uint64
	Reserved uint64
	Free     uint64
	Used     uint64
}

// nvmlUtilization is the buffer passed to nvmlDeviceGetUtilizationRates.
type nvmlUtilization struct {
	Gpu    uint32
	Memory uint32
}

// nvmlPciInfo is the buffer passed to nvmlDeviceGetPciInfo. Only BusId is read
// by this file.
type nvmlPciInfo struct {
	BusId          [16]byte
	Domain         uint32
	Bus            uint32
	Device         uint32
	PciDeviceId    uint32
	PciSubSystemId uint32
	// NOTE(review): nvml.h's nvmlPciInfo_t appears to carry extra trailing
	// members (reserved words in the legacy layout, a 32-byte bus id in newer
	// ones) — confirm against the header. The padding guarantees the library
	// can never write past the end of this struct either way.
	_ [32]byte
}

// NVML function signatures
//
// These are bound to symbols of the loaded library by nvmlCollector.init and
// must not be called before init has registered them.
var (
	nvmlInit                      func() nvmlReturn
	nvmlShutdown                  func() nvmlReturn
	nvmlDeviceGetCount            func(count *uint32) nvmlReturn
	nvmlDeviceGetHandleByIndex    func(index uint32, device *nvmlDevice) nvmlReturn
	nvmlDeviceGetName             func(device nvmlDevice, name *byte, length uint32) nvmlReturn
	nvmlDeviceGetMemoryInfo       func(device nvmlDevice, memory uintptr) nvmlReturn
	nvmlDeviceGetUtilizationRates func(device nvmlDevice, utilization *nvmlUtilization) nvmlReturn
	nvmlDeviceGetTemperature      func(device nvmlDevice, sensorType int, temp *uint32) nvmlReturn
	nvmlDeviceGetPowerUsage       func(device nvmlDevice, power *uint32) nvmlReturn
	nvmlDeviceGetPciInfo          func(device nvmlDevice, pci *nvmlPciInfo) nvmlReturn
	nvmlErrorString               func(result nvmlReturn) string
)

// nvmlCollector polls NVML for per-GPU metrics and accumulates them into the
// GPUManager's GpuDataMap.
type nvmlCollector struct {
	gm      *GPUManager
	lib     uintptr      // handle returned by purego.Dlopen; 0 when not loaded
	devices []nvmlDevice // handles collected by init
	bdfs    []string     // lower-cased PCI bus id per device ("" if unknown); parallel to devices
	isV2    bool         // true when nvmlDeviceGetMemoryInfo is bound to the _v2 symbol
}

// nvmlCString returns buf up to, but not including, its first NUL byte. When
// buf contains no NUL the whole buffer is returned instead of panicking on a
// negative slice bound.
func nvmlCString(buf []byte) string {
	s := string(buf)
	if i := strings.IndexByte(s, 0); i >= 0 {
		return s[:i]
	}
	return s
}

// init loads libnvidia-ml, binds the NVML entry points used by this collector,
// initialises NVML and records a handle plus PCI bus id for every device whose
// handle can be obtained.
//
// It returns an error when the library cannot be opened, or when nvmlInit or
// nvmlDeviceGetCount report failure; in the latter two cases the library is
// released again before returning.
func (c *nvmlCollector) init() error {
	slog.Info("NVML: Initializing")
	libPath := "libnvidia-ml.so.1"

	// Check for standard locations if necessary, but purego/dlopen usually handles this
	lib, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
	if err != nil {
		return fmt.Errorf("failed to load %s: %w", libPath, err)
	}
	c.lib = lib

	purego.RegisterLibFunc(&nvmlInit, lib, "nvmlInit")
	purego.RegisterLibFunc(&nvmlShutdown, lib, "nvmlShutdown")
	purego.RegisterLibFunc(&nvmlDeviceGetCount, lib, "nvmlDeviceGetCount")
	purego.RegisterLibFunc(&nvmlDeviceGetHandleByIndex, lib, "nvmlDeviceGetHandleByIndex")
	purego.RegisterLibFunc(&nvmlDeviceGetName, lib, "nvmlDeviceGetName")

	// Try to get v2 memory info, fallback to v1 if not available
	if _, err = purego.Dlsym(lib, "nvmlDeviceGetMemoryInfo_v2"); err == nil {
		c.isV2 = true
		purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo_v2")
	} else {
		purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo")
	}

	purego.RegisterLibFunc(&nvmlDeviceGetUtilizationRates, lib, "nvmlDeviceGetUtilizationRates")
	purego.RegisterLibFunc(&nvmlDeviceGetTemperature, lib, "nvmlDeviceGetTemperature")
	purego.RegisterLibFunc(&nvmlDeviceGetPowerUsage, lib, "nvmlDeviceGetPowerUsage")
	purego.RegisterLibFunc(&nvmlDeviceGetPciInfo, lib, "nvmlDeviceGetPciInfo")
	purego.RegisterLibFunc(&nvmlErrorString, lib, "nvmlErrorString")

	if ret := nvmlInit(); ret != nvmlReturn(nvmlSuccess) {
		c.unload()
		return fmt.Errorf("nvmlInit failed: %v", ret)
	}

	var count uint32
	if ret := nvmlDeviceGetCount(&count); ret != nvmlReturn(nvmlSuccess) {
		// NVML was initialised successfully above, so pair it with a shutdown
		// before dropping the library.
		nvmlShutdown()
		c.unload()
		return fmt.Errorf("nvmlDeviceGetCount failed: %v", ret)
	}

	for i := uint32(0); i < count; i++ {
		var device nvmlDevice
		if ret := nvmlDeviceGetHandleByIndex(i, &device); ret != nvmlReturn(nvmlSuccess) {
			// Devices without a usable handle are skipped entirely.
			continue
		}
		c.devices = append(c.devices, device)

		// Get BDF for power state check. An entry is appended even on failure
		// so that bdfs stays index-aligned with devices.
		var pci nvmlPciInfo
		if ret := nvmlDeviceGetPciInfo(device, &pci); ret == nvmlReturn(nvmlSuccess) {
			c.bdfs = append(c.bdfs, strings.ToLower(nvmlCString(pci.BusId[:])))
		} else {
			c.bdfs = append(c.bdfs, "")
		}
	}

	return nil
}

// unload closes the library handle opened by init, if any, and clears c.lib.
// The package-level NVML function variables must not be called afterwards
// until init has registered them again.
func (c *nvmlCollector) unload() {
	if c.lib == 0 {
		return
	}
	if err := purego.Dlclose(c.lib); err != nil {
		slog.Info("NVML: Failed to unload library", "err", err)
	}
	c.lib = 0
}

// start runs collect on every tick of a 3-second ticker. It blocks the calling
// goroutine.
//
// The channel returned by time.Tick is never closed, so the loop does not end
// on its own; the deferred nvmlShutdown only runs if the function is left some
// other way (for example a panic inside collect).
func (c *nvmlCollector) start() {
	defer nvmlShutdown()
	ticker := time.Tick(3 * time.Second)
	for range ticker {
		c.collect()
	}
}

// collect takes one sample from every known device and folds it into
// c.gm.GpuDataMap while holding the GPUManager lock. Usage and Power are
// accumulated and Count is incremented per sample; Temperature and the memory
// fields are overwritten with the latest reading.
//
// Devices whose PCI device is reported as not active by isGPUActive are
// skipped, as are devices whose utilization query fails.
func (c *nvmlCollector) collect() {
	c.gm.Lock()
	defer c.gm.Unlock()

	for i, device := range c.devices {
		bdf := c.bdfs[i]
		if bdf != "" && !c.isGPUActive(bdf) {
			slog.Info("NVML: GPU is suspended, skipping", "bdf", bdf)
			continue
		}
		slog.Info("NVML: Collecting data for GPU", "bdf", bdf)

		// The map key is the device's index within c.devices.
		id := strconv.Itoa(i)

		// Utilization
		var utilization nvmlUtilization
		if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) {
			continue
		}

		// Temperature. Best effort: the return code is not checked, so a
		// failed query leaves temp at zero.
		var temp uint32
		nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU

		// Memory. Best effort as well; which struct layout is passed depends
		// on the symbol init bound nvmlDeviceGetMemoryInfo to.
		var usedMem, totalMem uint64
		if c.isV2 {
			var memory nvmlMemoryV2
			memory.Version = 0x02000028 // (2 << 24) | 40 bytes
			nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
			usedMem = memory.Used
			totalMem = memory.Total
		} else {
			var memory nvmlMemoryV1
			nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
			usedMem = memory.Used
			totalMem = memory.Total
		}

		// Power. Best effort; a failed query contributes zero to the sum.
		var power uint32
		nvmlDeviceGetPowerUsage(device, &power)

		// Update GPUDataMap, creating the entry (and resolving the device
		// name) the first time this id is seen.
		gpu, ok := c.gm.GpuDataMap[id]
		if !ok {
			gpu = &system.GPUData{Name: nvmlDisplayName(device)}
			c.gm.GpuDataMap[id] = gpu
		}

		gpu.Temperature = float64(temp)
		// presumably bytes -> MiB -> MB via the package-level
		// mebibytesInAMegabyte factor; verify against its definition.
		gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte
		gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte
		gpu.Usage += float64(utilization.Gpu)
		// NOTE(review): the /1000 assumes NVML reports milliwatts — confirm.
		gpu.Power += float64(power) / 1000.0
		gpu.Count++
		slog.Info("NVML: Collected data", "gpu", gpu)
	}
}

// nvmlDisplayName queries the device name through nvmlDeviceGetName and strips
// the "NVIDIA " prefix and " Laptop GPU" suffix. The return code of the query
// is not checked; a buffer left untouched by the library yields "".
func nvmlDisplayName(device nvmlDevice) string {
	var nameBuf [64]byte
	nvmlDeviceGetName(device, &nameBuf[0], uint32(len(nameBuf)))
	name := strings.TrimPrefix(nvmlCString(nameBuf[:]), "NVIDIA ")
	return strings.TrimSuffix(name, " Laptop GPU")
}

// isGPUActive reports whether the PCI device identified by bdf looks awake
// according to sysfs, so that callers can avoid querying it otherwise.
//
// It returns false when power/runtime_status is readable and is neither
// "active" nor "resuming", or when a drm card's device/power_state is readable
// and is not "D0". Anything that cannot be read is treated as active.
func (c *nvmlCollector) isGPUActive(bdf string) bool {
	// runtime_status
	statusPath := filepath.Join("/sys/bus/pci/devices", bdf, "power/runtime_status")
	status, err := os.ReadFile(statusPath)
	if err != nil {
		slog.Info("NVML: Can't read runtime_status", "bdf", bdf, "err", err)
		return true // Assume active if we can't read status
	}
	statusStr := strings.TrimSpace(string(status))
	if statusStr != "active" && statusStr != "resuming" {
		slog.Info("NVML: GPU is not active", "bdf", bdf, "status", statusStr)
		return false
	}

	// power_state (D0 check)
	// Find any drm card device power_state
	pstatePathPattern := filepath.Join("/sys/bus/pci/devices", bdf, "drm/card*/device/power_state")
	// A Glob error is deliberately ignored: it leaves matches empty, which
	// falls through to "active".
	matches, _ := filepath.Glob(pstatePathPattern)
	if len(matches) == 0 {
		return true
	}
	pstate, err := os.ReadFile(matches[0])
	if err != nil {
		return true
	}
	return strings.TrimSpace(string(pstate)) == "D0"
}