mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-23 22:16:18 +01:00
238 lines
6.7 KiB
Go
238 lines
6.7 KiB
Go
package agent
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
"unsafe"
|
|
|
|
"github.com/ebitengine/purego"
|
|
"github.com/henrygd/beszel/internal/entities/system"
|
|
"golang.org/x/exp/slog"
|
|
)
|
|
|
|
// NVML constants and types
|
|
// nvmlSuccess is the NVML return code that signals a successful call; every
// NVML result in this file is compared against it.
const nvmlSuccess int = 0
|
|
|
|
type (
	// nvmlDevice is an opaque per-GPU handle filled in by
	// nvmlDeviceGetHandleByIndex and passed back to the other device calls.
	nvmlDevice uintptr

	// nvmlReturn is the status code returned by every bound NVML function.
	nvmlReturn int
)
|
|
|
|
// nvmlMemoryV1 is the buffer handed to the legacy nvmlDeviceGetMemoryInfo
// symbol. Field order and widths define the binary layout the library writes
// into, so they must not be rearranged.
type nvmlMemoryV1 struct {
	Total, Free, Used uint64
}
|
|
|
|
// nvmlMemoryV2 is the buffer handed to nvmlDeviceGetMemoryInfo_v2. The caller
// sets Version before the call; the remaining fields are filled in by the
// library. Field order and widths define the binary layout and must not be
// rearranged.
type nvmlMemoryV2 struct {
	Version uint32

	Total, Reserved, Free, Used uint64
}
|
|
|
|
// nvmlUtilization receives the result of nvmlDeviceGetUtilizationRates.
// collect only reads Gpu; Memory is part of the layout the library writes.
type nvmlUtilization struct {
	Gpu, Memory uint32
}
|
|
|
|
// nvmlPciInfo receives the result of nvmlDeviceGetPciInfo. Only BusId is read
// by this file (as a NUL-terminated string); the other fields exist to keep
// the binary layout the library writes into.
//
// NOTE(review): the struct must be at least as large as what the bound
// nvmlDeviceGetPciInfo symbol writes — confirm against nvml.h for the symbol
// version in use.
type nvmlPciInfo struct {
	BusId [16]byte

	Domain, Bus, Device         uint32
	PciDeviceId, PciSubSystemId uint32
}
|
|
|
|
// NVML function signatures
|
|
var (
|
|
nvmlInit func() nvmlReturn
|
|
nvmlShutdown func() nvmlReturn
|
|
nvmlDeviceGetCount func(count *uint32) nvmlReturn
|
|
nvmlDeviceGetHandleByIndex func(index uint32, device *nvmlDevice) nvmlReturn
|
|
nvmlDeviceGetName func(device nvmlDevice, name *byte, length uint32) nvmlReturn
|
|
nvmlDeviceGetMemoryInfo func(device nvmlDevice, memory uintptr) nvmlReturn
|
|
nvmlDeviceGetUtilizationRates func(device nvmlDevice, utilization *nvmlUtilization) nvmlReturn
|
|
nvmlDeviceGetTemperature func(device nvmlDevice, sensorType int, temp *uint32) nvmlReturn
|
|
nvmlDeviceGetPowerUsage func(device nvmlDevice, power *uint32) nvmlReturn
|
|
nvmlDeviceGetPciInfo func(device nvmlDevice, pci *nvmlPciInfo) nvmlReturn
|
|
nvmlErrorString func(result nvmlReturn) string
|
|
)
|
|
|
|
type nvmlCollector struct {
|
|
gm *GPUManager
|
|
lib uintptr
|
|
devices []nvmlDevice
|
|
bdfs []string
|
|
isV2 bool
|
|
}
|
|
|
|
func (c *nvmlCollector) init() error {
|
|
slog.Info("NVML: Initializing")
|
|
libPath := "libnvidia-ml.so.1"
|
|
|
|
// Check for standard locations if necessary, but purego/dlopen usually handles this
|
|
lib, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to load %s: %w", libPath, err)
|
|
}
|
|
c.lib = lib
|
|
|
|
purego.RegisterLibFunc(&nvmlInit, lib, "nvmlInit")
|
|
purego.RegisterLibFunc(&nvmlShutdown, lib, "nvmlShutdown")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetCount, lib, "nvmlDeviceGetCount")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetHandleByIndex, lib, "nvmlDeviceGetHandleByIndex")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetName, lib, "nvmlDeviceGetName")
|
|
// Try to get v2 memory info, fallback to v1 if not available
|
|
_, err = purego.Dlsym(lib, "nvmlDeviceGetMemoryInfo_v2")
|
|
if err == nil {
|
|
c.isV2 = true
|
|
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo_v2")
|
|
} else {
|
|
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo")
|
|
}
|
|
purego.RegisterLibFunc(&nvmlDeviceGetUtilizationRates, lib, "nvmlDeviceGetUtilizationRates")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetTemperature, lib, "nvmlDeviceGetTemperature")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetPowerUsage, lib, "nvmlDeviceGetPowerUsage")
|
|
purego.RegisterLibFunc(&nvmlDeviceGetPciInfo, lib, "nvmlDeviceGetPciInfo")
|
|
purego.RegisterLibFunc(&nvmlErrorString, lib, "nvmlErrorString")
|
|
|
|
if ret := nvmlInit(); ret != nvmlReturn(nvmlSuccess) {
|
|
return fmt.Errorf("nvmlInit failed: %v", ret)
|
|
}
|
|
|
|
var count uint32
|
|
if ret := nvmlDeviceGetCount(&count); ret != nvmlReturn(nvmlSuccess) {
|
|
return fmt.Errorf("nvmlDeviceGetCount failed: %v", ret)
|
|
}
|
|
|
|
for i := uint32(0); i < count; i++ {
|
|
var device nvmlDevice
|
|
if ret := nvmlDeviceGetHandleByIndex(i, &device); ret == nvmlReturn(nvmlSuccess) {
|
|
c.devices = append(c.devices, device)
|
|
// Get BDF for power state check
|
|
var pci nvmlPciInfo
|
|
if ret := nvmlDeviceGetPciInfo(device, &pci); ret == nvmlReturn(nvmlSuccess) {
|
|
busID := string(pci.BusId[:])
|
|
if idx := strings.Index(busID, "\x00"); idx != -1 {
|
|
busID = busID[:idx]
|
|
}
|
|
c.bdfs = append(c.bdfs, strings.ToLower(busID))
|
|
} else {
|
|
c.bdfs = append(c.bdfs, "")
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *nvmlCollector) start() {
|
|
defer nvmlShutdown()
|
|
ticker := time.Tick(3 * time.Second)
|
|
|
|
for range ticker {
|
|
c.collect()
|
|
}
|
|
}
|
|
|
|
func (c *nvmlCollector) collect() {
|
|
c.gm.Lock()
|
|
defer c.gm.Unlock()
|
|
|
|
for i, device := range c.devices {
|
|
bdf := c.bdfs[i]
|
|
if bdf != "" && !c.isGPUActive(bdf) {
|
|
slog.Info("NVML: GPU is suspended, skipping", "bdf", bdf)
|
|
continue
|
|
}
|
|
slog.Info("NVML: Collecting data for GPU", "bdf", bdf)
|
|
|
|
id := fmt.Sprintf("%d", i)
|
|
|
|
// Utilization
|
|
var utilization nvmlUtilization
|
|
if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) {
|
|
continue
|
|
}
|
|
|
|
// Temperature
|
|
var temp uint32
|
|
nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU
|
|
|
|
// Memory
|
|
var usedMem, totalMem uint64
|
|
if c.isV2 {
|
|
var memory nvmlMemoryV2
|
|
memory.Version = 0x02000028 // (2 << 24) | 40 bytes
|
|
nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
|
|
usedMem = memory.Used
|
|
totalMem = memory.Total
|
|
} else {
|
|
var memory nvmlMemoryV1
|
|
nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
|
|
usedMem = memory.Used
|
|
totalMem = memory.Total
|
|
}
|
|
|
|
// Power
|
|
var power uint32
|
|
nvmlDeviceGetPowerUsage(device, &power)
|
|
|
|
// Update GPUDataMap
|
|
if _, ok := c.gm.GpuDataMap[id]; !ok {
|
|
var nameBuf [64]byte
|
|
nvmlDeviceGetName(device, &nameBuf[0], 64)
|
|
name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")])
|
|
name = strings.TrimPrefix(name, "NVIDIA ")
|
|
c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
|
|
}
|
|
|
|
gpu := c.gm.GpuDataMap[id]
|
|
gpu.Temperature = float64(temp)
|
|
gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte
|
|
gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte
|
|
gpu.Usage += float64(utilization.Gpu)
|
|
gpu.Power += float64(power) / 1000.0
|
|
gpu.Count++
|
|
slog.Info("NVML: Collected data", "gpu", gpu)
|
|
}
|
|
}
|
|
|
|
func (c *nvmlCollector) isGPUActive(bdf string) bool {
|
|
// runtime_status
|
|
statusPath := filepath.Join("/sys/bus/pci/devices", bdf, "power/runtime_status")
|
|
status, err := os.ReadFile(statusPath)
|
|
if err != nil {
|
|
slog.Info("NVML: Can't read runtime_status", "bdf", bdf, "err", err)
|
|
return true // Assume active if we can't read status
|
|
}
|
|
statusStr := strings.TrimSpace(string(status))
|
|
if statusStr != "active" && statusStr != "resuming" {
|
|
slog.Info("NVML: GPU is not active", "bdf", bdf, "status", statusStr)
|
|
return false
|
|
}
|
|
|
|
// power_state (D0 check)
|
|
// Find any drm card device power_state
|
|
pstatePathPattern := filepath.Join("/sys/bus/pci/devices", bdf, "drm/card*/device/power_state")
|
|
matches, _ := filepath.Glob(pstatePathPattern)
|
|
if len(matches) > 0 {
|
|
pstate, err := os.ReadFile(matches[0])
|
|
if err == nil {
|
|
pstateStr := strings.TrimSpace(string(pstate))
|
|
if pstateStr != "D0" {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|