mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-23 05:56:17 +01:00
add experimental nvml gpu collector (#1522)
This commit is contained in:
210
agent/gpu_nvml.go
Normal file
210
agent/gpu_nvml.go
Normal file
@@ -0,0 +1,210 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ebitengine/purego"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
"golang.org/x/exp/slog"
|
||||
)
|
||||
|
||||
// NVML constants and types
|
||||
const (
|
||||
nvmlSuccess int = 0
|
||||
)
|
||||
|
||||
type nvmlDevice uintptr
|
||||
|
||||
type nvmlReturn int
|
||||
|
||||
type nvmlMemoryV1 struct {
|
||||
Total uint64
|
||||
Free uint64
|
||||
Used uint64
|
||||
}
|
||||
|
||||
type nvmlMemoryV2 struct {
|
||||
Version uint32
|
||||
Total uint64
|
||||
Reserved uint64
|
||||
Free uint64
|
||||
Used uint64
|
||||
}
|
||||
|
||||
type nvmlUtilization struct {
|
||||
Gpu uint32
|
||||
Memory uint32
|
||||
}
|
||||
|
||||
type nvmlPciInfo struct {
|
||||
BusId [16]byte
|
||||
Domain uint32
|
||||
Bus uint32
|
||||
Device uint32
|
||||
PciDeviceId uint32
|
||||
PciSubSystemId uint32
|
||||
}
|
||||
|
||||
// NVML function signatures
|
||||
var (
|
||||
nvmlInit func() nvmlReturn
|
||||
nvmlShutdown func() nvmlReturn
|
||||
nvmlDeviceGetCount func(count *uint32) nvmlReturn
|
||||
nvmlDeviceGetHandleByIndex func(index uint32, device *nvmlDevice) nvmlReturn
|
||||
nvmlDeviceGetName func(device nvmlDevice, name *byte, length uint32) nvmlReturn
|
||||
nvmlDeviceGetMemoryInfo func(device nvmlDevice, memory uintptr) nvmlReturn
|
||||
nvmlDeviceGetUtilizationRates func(device nvmlDevice, utilization *nvmlUtilization) nvmlReturn
|
||||
nvmlDeviceGetTemperature func(device nvmlDevice, sensorType int, temp *uint32) nvmlReturn
|
||||
nvmlDeviceGetPowerUsage func(device nvmlDevice, power *uint32) nvmlReturn
|
||||
nvmlDeviceGetPciInfo func(device nvmlDevice, pci *nvmlPciInfo) nvmlReturn
|
||||
nvmlErrorString func(result nvmlReturn) string
|
||||
)
|
||||
|
||||
type nvmlCollector struct {
|
||||
gm *GPUManager
|
||||
lib uintptr
|
||||
devices []nvmlDevice
|
||||
bdfs []string
|
||||
isV2 bool
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) init() error {
|
||||
slog.Debug("NVML: Initializing")
|
||||
libPath := getNVMLPath()
|
||||
|
||||
lib, err := openLibrary(libPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load %s: %w", libPath, err)
|
||||
}
|
||||
c.lib = lib
|
||||
|
||||
purego.RegisterLibFunc(&nvmlInit, lib, "nvmlInit")
|
||||
purego.RegisterLibFunc(&nvmlShutdown, lib, "nvmlShutdown")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetCount, lib, "nvmlDeviceGetCount")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetHandleByIndex, lib, "nvmlDeviceGetHandleByIndex")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetName, lib, "nvmlDeviceGetName")
|
||||
// Try to get v2 memory info, fallback to v1 if not available
|
||||
if hasSymbol(lib, "nvmlDeviceGetMemoryInfo_v2") {
|
||||
c.isV2 = true
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo_v2")
|
||||
} else {
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo")
|
||||
}
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetUtilizationRates, lib, "nvmlDeviceGetUtilizationRates")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetTemperature, lib, "nvmlDeviceGetTemperature")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetPowerUsage, lib, "nvmlDeviceGetPowerUsage")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetPciInfo, lib, "nvmlDeviceGetPciInfo")
|
||||
purego.RegisterLibFunc(&nvmlErrorString, lib, "nvmlErrorString")
|
||||
|
||||
if ret := nvmlInit(); ret != nvmlReturn(nvmlSuccess) {
|
||||
return fmt.Errorf("nvmlInit failed: %v", ret)
|
||||
}
|
||||
|
||||
var count uint32
|
||||
if ret := nvmlDeviceGetCount(&count); ret != nvmlReturn(nvmlSuccess) {
|
||||
return fmt.Errorf("nvmlDeviceGetCount failed: %v", ret)
|
||||
}
|
||||
|
||||
for i := uint32(0); i < count; i++ {
|
||||
var device nvmlDevice
|
||||
if ret := nvmlDeviceGetHandleByIndex(i, &device); ret == nvmlReturn(nvmlSuccess) {
|
||||
c.devices = append(c.devices, device)
|
||||
// Get BDF for power state check
|
||||
var pci nvmlPciInfo
|
||||
if ret := nvmlDeviceGetPciInfo(device, &pci); ret == nvmlReturn(nvmlSuccess) {
|
||||
busID := string(pci.BusId[:])
|
||||
if idx := strings.Index(busID, "\x00"); idx != -1 {
|
||||
busID = busID[:idx]
|
||||
}
|
||||
c.bdfs = append(c.bdfs, strings.ToLower(busID))
|
||||
} else {
|
||||
c.bdfs = append(c.bdfs, "")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) start() {
|
||||
defer nvmlShutdown()
|
||||
ticker := time.Tick(3 * time.Second)
|
||||
|
||||
for range ticker {
|
||||
c.collect()
|
||||
}
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) collect() {
|
||||
c.gm.Lock()
|
||||
defer c.gm.Unlock()
|
||||
|
||||
for i, device := range c.devices {
|
||||
id := fmt.Sprintf("%d", i)
|
||||
bdf := c.bdfs[i]
|
||||
|
||||
// Update GPUDataMap
|
||||
if _, ok := c.gm.GpuDataMap[id]; !ok {
|
||||
var nameBuf [64]byte
|
||||
if ret := nvmlDeviceGetName(device, &nameBuf[0], 64); ret != nvmlReturn(nvmlSuccess) {
|
||||
continue
|
||||
}
|
||||
name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")])
|
||||
name = strings.TrimPrefix(name, "NVIDIA ")
|
||||
c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
|
||||
}
|
||||
gpu := c.gm.GpuDataMap[id]
|
||||
|
||||
if bdf != "" && !c.isGPUActive(bdf) {
|
||||
slog.Debug("NVML: GPU is suspended, skipping", "bdf", bdf)
|
||||
gpu.Temperature = 0
|
||||
gpu.MemoryUsed = 0
|
||||
continue
|
||||
}
|
||||
|
||||
// Utilization
|
||||
var utilization nvmlUtilization
|
||||
if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) {
|
||||
slog.Debug("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret)
|
||||
gpu.Temperature = 0
|
||||
gpu.MemoryUsed = 0
|
||||
continue
|
||||
}
|
||||
|
||||
slog.Debug("NVML: Collecting data for GPU", "bdf", bdf)
|
||||
|
||||
// Temperature
|
||||
var temp uint32
|
||||
nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU
|
||||
|
||||
// Memory
|
||||
var usedMem, totalMem uint64
|
||||
if c.isV2 {
|
||||
var memory nvmlMemoryV2
|
||||
memory.Version = 0x02000028 // (2 << 24) | 40 bytes
|
||||
nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
|
||||
usedMem = memory.Used
|
||||
totalMem = memory.Total
|
||||
} else {
|
||||
var memory nvmlMemoryV1
|
||||
nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
|
||||
usedMem = memory.Used
|
||||
totalMem = memory.Total
|
||||
}
|
||||
|
||||
// Power
|
||||
var power uint32
|
||||
nvmlDeviceGetPowerUsage(device, &power)
|
||||
|
||||
gpu.Temperature = float64(temp)
|
||||
gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte
|
||||
gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte
|
||||
gpu.Usage += float64(utilization.Gpu)
|
||||
gpu.Power += float64(power) / 1000.0
|
||||
gpu.Count++
|
||||
slog.Debug("NVML: Collected data", "gpu", gpu)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user