mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-21 21:26:16 +01:00
225 lines
6.4 KiB
Go
225 lines
6.4 KiB
Go
//go:build (linux || windows) && (amd64 || arm64)
|
|
|
|
package agent
|
|
|
|
import (
|
|
"fmt"
|
|
"log/slog"
|
|
"strings"
|
|
"time"
|
|
"unsafe"
|
|
|
|
"github.com/ebitengine/purego"
|
|
"github.com/henrygd/beszel/internal/entities/system"
|
|
)
|
|
|
|
// NVML constants and types
|
|
const (
|
|
nvmlSuccess int = 0
|
|
)
|
|
|
|
type nvmlDevice uintptr
|
|
|
|
type nvmlReturn int
|
|
|
|
type nvmlMemoryV1 struct {
|
|
Total uint64
|
|
Free uint64
|
|
Used uint64
|
|
}
|
|
|
|
type nvmlMemoryV2 struct {
|
|
Version uint32
|
|
Total uint64
|
|
Reserved uint64
|
|
Free uint64
|
|
Used uint64
|
|
}
|
|
|
|
type nvmlUtilization struct {
|
|
Gpu uint32
|
|
Memory uint32
|
|
}
|
|
|
|
type nvmlPciInfo struct {
|
|
BusId [16]byte
|
|
Domain uint32
|
|
Bus uint32
|
|
Device uint32
|
|
PciDeviceId uint32
|
|
PciSubSystemId uint32
|
|
}
|
|
|
|
// NVML function signatures
|
|
var (
|
|
nvmlInit func() nvmlReturn
|
|
nvmlShutdown func() nvmlReturn
|
|
nvmlDeviceGetCount func(count *uint32) nvmlReturn
|
|
nvmlDeviceGetHandleByIndex func(index uint32, device *nvmlDevice) nvmlReturn
|
|
nvmlDeviceGetName func(device nvmlDevice, name *byte, length uint32) nvmlReturn
|
|
nvmlDeviceGetMemoryInfo func(device nvmlDevice, memory uintptr) nvmlReturn
|
|
nvmlDeviceGetUtilizationRates func(device nvmlDevice, utilization *nvmlUtilization) nvmlReturn
|
|
nvmlDeviceGetTemperature func(device nvmlDevice, sensorType int, temp *uint32) nvmlReturn
|
|
nvmlDeviceGetPowerUsage func(device nvmlDevice, power *uint32) nvmlReturn
|
|
nvmlDeviceGetPciInfo func(device nvmlDevice, pci *nvmlPciInfo) nvmlReturn
|
|
nvmlErrorString func(result nvmlReturn) string
|
|
)
|
|
|
|
type nvmlCollector struct {
|
|
gm *GPUManager
|
|
lib uintptr
|
|
devices []nvmlDevice
|
|
bdfs []string
|
|
isV2 bool
|
|
}
|
|
|
|
func (c *nvmlCollector) init() error {
|
|
slog.Debug("NVML: Initializing")
|
|
libPath := getNVMLPath()
|
|
|
|
lib, err := openLibrary(libPath)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to load %s: %w", libPath, err)
|
|
}
|
|
c.lib = lib
|
|
|
|
purego.RegisterLibFunc(&nvmlInit, lib, "nvmlInit")
|
|
purego.RegisterLibFunc(&nvmlShutdown, lib, "nvmlShutdown")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetCount, lib, "nvmlDeviceGetCount")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetHandleByIndex, lib, "nvmlDeviceGetHandleByIndex")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetName, lib, "nvmlDeviceGetName")
|
|
// Try to get v2 memory info, fallback to v1 if not available
|
|
if hasSymbol(lib, "nvmlDeviceGetMemoryInfo_v2") {
|
|
c.isV2 = true
|
|
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo_v2")
|
|
} else {
|
|
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo")
|
|
}
|
|
purego.RegisterLibFunc(&nvmlDeviceGetUtilizationRates, lib, "nvmlDeviceGetUtilizationRates")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetTemperature, lib, "nvmlDeviceGetTemperature")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetPowerUsage, lib, "nvmlDeviceGetPowerUsage")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetPciInfo, lib, "nvmlDeviceGetPciInfo")
|
|
purego.RegisterLibFunc(&nvmlErrorString, lib, "nvmlErrorString")
|
|
|
|
if ret := nvmlInit(); ret != nvmlReturn(nvmlSuccess) {
|
|
return fmt.Errorf("nvmlInit failed: %v", ret)
|
|
}
|
|
|
|
var count uint32
|
|
if ret := nvmlDeviceGetCount(&count); ret != nvmlReturn(nvmlSuccess) {
|
|
return fmt.Errorf("nvmlDeviceGetCount failed: %v", ret)
|
|
}
|
|
|
|
for i := uint32(0); i < count; i++ {
|
|
var device nvmlDevice
|
|
if ret := nvmlDeviceGetHandleByIndex(i, &device); ret == nvmlReturn(nvmlSuccess) {
|
|
c.devices = append(c.devices, device)
|
|
// Get BDF for power state check
|
|
var pci nvmlPciInfo
|
|
if ret := nvmlDeviceGetPciInfo(device, &pci); ret == nvmlReturn(nvmlSuccess) {
|
|
busID := string(pci.BusId[:])
|
|
if idx := strings.Index(busID, "\x00"); idx != -1 {
|
|
busID = busID[:idx]
|
|
}
|
|
c.bdfs = append(c.bdfs, strings.ToLower(busID))
|
|
} else {
|
|
c.bdfs = append(c.bdfs, "")
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *nvmlCollector) start() {
|
|
defer nvmlShutdown()
|
|
ticker := time.Tick(3 * time.Second)
|
|
|
|
for range ticker {
|
|
c.collect()
|
|
}
|
|
}
|
|
|
|
func (c *nvmlCollector) collect() {
|
|
c.gm.Lock()
|
|
defer c.gm.Unlock()
|
|
|
|
for i, device := range c.devices {
|
|
id := fmt.Sprintf("%d", i)
|
|
bdf := c.bdfs[i]
|
|
|
|
// Update GPUDataMap
|
|
if _, ok := c.gm.GpuDataMap[id]; !ok {
|
|
var nameBuf [64]byte
|
|
if ret := nvmlDeviceGetName(device, &nameBuf[0], 64); ret != nvmlReturn(nvmlSuccess) {
|
|
continue
|
|
}
|
|
name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")])
|
|
name = strings.TrimPrefix(name, "NVIDIA ")
|
|
c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
|
|
}
|
|
gpu := c.gm.GpuDataMap[id]
|
|
|
|
if bdf != "" && !c.isGPUActive(bdf) {
|
|
slog.Debug("NVML: GPU is suspended, skipping", "bdf", bdf)
|
|
gpu.Temperature = 0
|
|
gpu.MemoryUsed = 0
|
|
continue
|
|
}
|
|
|
|
// Utilization
|
|
var utilization nvmlUtilization
|
|
if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) {
|
|
slog.Debug("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret)
|
|
gpu.Temperature = 0
|
|
gpu.MemoryUsed = 0
|
|
continue
|
|
}
|
|
|
|
slog.Debug("NVML: Collecting data for GPU", "bdf", bdf)
|
|
|
|
// Temperature
|
|
var temp uint32
|
|
nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU
|
|
|
|
// Memory: only poll if GPU is active to avoid leaving D3cold state (#1522)
|
|
if utilization.Gpu > 0 {
|
|
var usedMem, totalMem uint64
|
|
if c.isV2 {
|
|
var memory nvmlMemoryV2
|
|
memory.Version = 0x02000028 // (2 << 24) | 40 bytes
|
|
if ret := nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory))); ret != nvmlReturn(nvmlSuccess) {
|
|
slog.Debug("NVML: MemoryInfo_v2 failed", "bdf", bdf, "ret", ret)
|
|
} else {
|
|
usedMem = memory.Used
|
|
totalMem = memory.Total
|
|
}
|
|
} else {
|
|
var memory nvmlMemoryV1
|
|
if ret := nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory))); ret != nvmlReturn(nvmlSuccess) {
|
|
slog.Debug("NVML: MemoryInfo failed", "bdf", bdf, "ret", ret)
|
|
} else {
|
|
usedMem = memory.Used
|
|
totalMem = memory.Total
|
|
}
|
|
}
|
|
if totalMem > 0 {
|
|
gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte
|
|
gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte
|
|
}
|
|
} else {
|
|
slog.Debug("NVML: Skipping memory info (utilization=0)", "bdf", bdf)
|
|
}
|
|
|
|
// Power
|
|
var power uint32
|
|
nvmlDeviceGetPowerUsage(device, &power)
|
|
|
|
gpu.Temperature = float64(temp)
|
|
gpu.Usage += float64(utilization.Gpu)
|
|
gpu.Power += float64(power) / 1000.0
|
|
gpu.Count++
|
|
slog.Debug("NVML: Collected data", "gpu", gpu)
|
|
}
|
|
}
|