Files
beszel-ipv6/agent/gpu_nvml.go
2025-12-21 17:10:42 -05:00

211 lines
5.9 KiB
Go

package agent
import (
"fmt"
"strings"
"time"
"unsafe"
"github.com/ebitengine/purego"
"github.com/henrygd/beszel/internal/entities/system"
"golang.org/x/exp/slog"
)
// NVML constants and types
const (
nvmlSuccess int = 0
)
type nvmlDevice uintptr
type nvmlReturn int
type nvmlMemoryV1 struct {
Total uint64
Free uint64
Used uint64
}
type nvmlMemoryV2 struct {
Version uint32
Total uint64
Reserved uint64
Free uint64
Used uint64
}
type nvmlUtilization struct {
Gpu uint32
Memory uint32
}
type nvmlPciInfo struct {
BusId [16]byte
Domain uint32
Bus uint32
Device uint32
PciDeviceId uint32
PciSubSystemId uint32
}
// NVML function signatures
var (
nvmlInit func() nvmlReturn
nvmlShutdown func() nvmlReturn
nvmlDeviceGetCount func(count *uint32) nvmlReturn
nvmlDeviceGetHandleByIndex func(index uint32, device *nvmlDevice) nvmlReturn
nvmlDeviceGetName func(device nvmlDevice, name *byte, length uint32) nvmlReturn
nvmlDeviceGetMemoryInfo func(device nvmlDevice, memory uintptr) nvmlReturn
nvmlDeviceGetUtilizationRates func(device nvmlDevice, utilization *nvmlUtilization) nvmlReturn
nvmlDeviceGetTemperature func(device nvmlDevice, sensorType int, temp *uint32) nvmlReturn
nvmlDeviceGetPowerUsage func(device nvmlDevice, power *uint32) nvmlReturn
nvmlDeviceGetPciInfo func(device nvmlDevice, pci *nvmlPciInfo) nvmlReturn
nvmlErrorString func(result nvmlReturn) string
)
type nvmlCollector struct {
gm *GPUManager
lib uintptr
devices []nvmlDevice
bdfs []string
isV2 bool
}
func (c *nvmlCollector) init() error {
slog.Debug("NVML: Initializing")
libPath := getNVMLPath()
lib, err := openLibrary(libPath)
if err != nil {
return fmt.Errorf("failed to load %s: %w", libPath, err)
}
c.lib = lib
purego.RegisterLibFunc(&nvmlInit, lib, "nvmlInit")
purego.RegisterLibFunc(&nvmlShutdown, lib, "nvmlShutdown")
purego.RegisterLibFunc(&nvmlDeviceGetCount, lib, "nvmlDeviceGetCount")
purego.RegisterLibFunc(&nvmlDeviceGetHandleByIndex, lib, "nvmlDeviceGetHandleByIndex")
purego.RegisterLibFunc(&nvmlDeviceGetName, lib, "nvmlDeviceGetName")
// Try to get v2 memory info, fallback to v1 if not available
if hasSymbol(lib, "nvmlDeviceGetMemoryInfo_v2") {
c.isV2 = true
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo_v2")
} else {
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo")
}
purego.RegisterLibFunc(&nvmlDeviceGetUtilizationRates, lib, "nvmlDeviceGetUtilizationRates")
purego.RegisterLibFunc(&nvmlDeviceGetTemperature, lib, "nvmlDeviceGetTemperature")
purego.RegisterLibFunc(&nvmlDeviceGetPowerUsage, lib, "nvmlDeviceGetPowerUsage")
purego.RegisterLibFunc(&nvmlDeviceGetPciInfo, lib, "nvmlDeviceGetPciInfo")
purego.RegisterLibFunc(&nvmlErrorString, lib, "nvmlErrorString")
if ret := nvmlInit(); ret != nvmlReturn(nvmlSuccess) {
return fmt.Errorf("nvmlInit failed: %v", ret)
}
var count uint32
if ret := nvmlDeviceGetCount(&count); ret != nvmlReturn(nvmlSuccess) {
return fmt.Errorf("nvmlDeviceGetCount failed: %v", ret)
}
for i := uint32(0); i < count; i++ {
var device nvmlDevice
if ret := nvmlDeviceGetHandleByIndex(i, &device); ret == nvmlReturn(nvmlSuccess) {
c.devices = append(c.devices, device)
// Get BDF for power state check
var pci nvmlPciInfo
if ret := nvmlDeviceGetPciInfo(device, &pci); ret == nvmlReturn(nvmlSuccess) {
busID := string(pci.BusId[:])
if idx := strings.Index(busID, "\x00"); idx != -1 {
busID = busID[:idx]
}
c.bdfs = append(c.bdfs, strings.ToLower(busID))
} else {
c.bdfs = append(c.bdfs, "")
}
}
}
return nil
}
func (c *nvmlCollector) start() {
defer nvmlShutdown()
ticker := time.Tick(3 * time.Second)
for range ticker {
c.collect()
}
}
func (c *nvmlCollector) collect() {
c.gm.Lock()
defer c.gm.Unlock()
for i, device := range c.devices {
id := fmt.Sprintf("%d", i)
bdf := c.bdfs[i]
// Update GPUDataMap
if _, ok := c.gm.GpuDataMap[id]; !ok {
var nameBuf [64]byte
if ret := nvmlDeviceGetName(device, &nameBuf[0], 64); ret != nvmlReturn(nvmlSuccess) {
continue
}
name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")])
name = strings.TrimPrefix(name, "NVIDIA ")
c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
}
gpu := c.gm.GpuDataMap[id]
if bdf != "" && !c.isGPUActive(bdf) {
slog.Debug("NVML: GPU is suspended, skipping", "bdf", bdf)
gpu.Temperature = 0
gpu.MemoryUsed = 0
continue
}
// Utilization
var utilization nvmlUtilization
if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) {
slog.Debug("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret)
gpu.Temperature = 0
gpu.MemoryUsed = 0
continue
}
slog.Debug("NVML: Collecting data for GPU", "bdf", bdf)
// Temperature
var temp uint32
nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU
// Memory
var usedMem, totalMem uint64
if c.isV2 {
var memory nvmlMemoryV2
memory.Version = 0x02000028 // (2 << 24) | 40 bytes
nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
usedMem = memory.Used
totalMem = memory.Total
} else {
var memory nvmlMemoryV1
nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
usedMem = memory.Used
totalMem = memory.Total
}
// Power
var power uint32
nvmlDeviceGetPowerUsage(device, &power)
gpu.Temperature = float64(temp)
gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte
gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte
gpu.Usage += float64(utilization.Gpu)
gpu.Power += float64(power) / 1000.0
gpu.Count++
slog.Debug("NVML: Collected data", "gpu", gpu)
}
}