mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-22 05:36:15 +01:00
add experimental nvml gpu collector (#1522)
This commit is contained in:
25
agent/gpu.go
25
agent/gpu.go
@@ -44,6 +44,7 @@ type GPUManager struct {
|
||||
rocmSmi bool
|
||||
tegrastats bool
|
||||
intelGpuStats bool
|
||||
nvml bool
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
// lastAvgData stores the last calculated averages for each GPU
|
||||
// Used when a collection happens before new data arrives (Count == 0)
|
||||
@@ -297,8 +298,13 @@ func (gm *GPUManager) calculateGPUAverage(id string, gpu *system.GPUData, cacheK
|
||||
currentCount := uint32(gpu.Count)
|
||||
deltaCount := gm.calculateDeltaCount(currentCount, lastSnapshot)
|
||||
|
||||
// If no new data arrived, use last known average
|
||||
// If no new data arrived
|
||||
if deltaCount == 0 {
|
||||
// If GPU appears suspended (instantaneous values are 0), return zero values
|
||||
// Otherwise return last known average for temporary collection gaps
|
||||
if gpu.Temperature == 0 && gpu.MemoryUsed == 0 {
|
||||
return system.GPUData{Name: gpu.Name}
|
||||
}
|
||||
return gm.lastAvgData[id] // zero value if not found
|
||||
}
|
||||
|
||||
@@ -396,7 +402,7 @@ func (gm *GPUManager) detectGPUs() error {
|
||||
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
|
||||
gm.intelGpuStats = true
|
||||
}
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats {
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats || gm.nvml {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top")
|
||||
@@ -467,7 +473,20 @@ func NewGPUManager() (*GPUManager, error) {
|
||||
gm.GpuDataMap = make(map[string]*system.GPUData)
|
||||
|
||||
if gm.nvidiaSmi {
|
||||
gm.startCollector(nvidiaSmiCmd)
|
||||
if nvml, _ := GetEnv("NVML"); nvml == "true" {
|
||||
gm.nvml = true
|
||||
gm.nvidiaSmi = false
|
||||
collector := &nvmlCollector{gm: &gm}
|
||||
if err := collector.init(); err == nil {
|
||||
go collector.start()
|
||||
} else {
|
||||
slog.Warn("Failed to initialize NVML, falling back to nvidia-smi", "err", err)
|
||||
gm.nvidiaSmi = true
|
||||
gm.startCollector(nvidiaSmiCmd)
|
||||
}
|
||||
} else {
|
||||
gm.startCollector(nvidiaSmiCmd)
|
||||
}
|
||||
}
|
||||
if gm.rocmSmi {
|
||||
gm.startCollector(rocmSmiCmd)
|
||||
|
||||
210
agent/gpu_nvml.go
Normal file
210
agent/gpu_nvml.go
Normal file
@@ -0,0 +1,210 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ebitengine/purego"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
"golang.org/x/exp/slog"
|
||||
)
|
||||
|
||||
// NVML constants and types
|
||||
// nvmlSuccess is the status value that the NVML wrappers in this file compare
// every return code against to decide whether a call succeeded.
const nvmlSuccess int = 0
|
||||
|
||||
// nvmlDevice is a device handle: it is filled in by nvmlDeviceGetHandleByIndex
// and passed back by value to the per-device query functions declared below.
type nvmlDevice uintptr
|
||||
|
||||
// nvmlReturn is the status code returned by every NVML entry point bound in
// this file; callers compare it against nvmlSuccess.
//
// It is declared as int32 because the C type (nvmlReturn_t) is an enum, i.e. a
// 32-bit value. With a 64-bit Go int the FFI layer copies the whole return
// register, whose upper half the C ABI leaves undefined for 32-bit results, so
// a successful call could compare unequal to nvmlSuccess.
type nvmlReturn int32
|
||||
|
||||
// nvmlMemoryV1 is the out-parameter handed to the unversioned
// nvmlDeviceGetMemoryInfo entry point (used when the _v2 symbol is absent).
// The field order is part of the C layout and must not be changed.
// collect reads Total and Used and scales them by 1024*1024.
type nvmlMemoryV1 struct {
	Total uint64
	Free  uint64
	Used  uint64
}
|
||||
|
||||
// nvmlMemoryV2 is the out-parameter handed to nvmlDeviceGetMemoryInfo_v2.
// The field order is part of the C layout and must not be changed.
//
// Version must be set by the caller before the call; collect writes
// 0x02000028, i.e. (2 << 24) | 40, where 40 is the struct size in bytes.
// NOTE(review): that size relies on 8-byte alignment of uint64 (4 bytes of
// padding after Version); confirm before building for 32-bit targets.
type nvmlMemoryV2 struct {
	Version  uint32
	Total    uint64
	Reserved uint64
	Free     uint64
	Used     uint64
}
|
||||
|
||||
// nvmlUtilization is the out-parameter of nvmlDeviceGetUtilizationRates.
// The field order is part of the C layout and must not be changed.
// collect adds Gpu to the running usage total; Memory is not read here.
type nvmlUtilization struct {
	Gpu    uint32
	Memory uint32
}
|
||||
|
||||
// nvmlPciInfo is the out-parameter of nvmlDeviceGetPciInfo. init reads BusId
// (a NUL-terminated "domain:bus:device.function" string) to locate the device
// under /sys/bus/pci/devices.
//
// The leading fields mirror the C layout and must keep their order. Reserved
// is tail padding: the C structure published in nvml.h is larger than the
// five integers that follow BusId, and a callee that fills its whole structure
// would otherwise write past the end of the Go value.
// NOTE(review): 32 bytes covers the largest layout known to the author
// (a trailing 32-byte bus ID); confirm against the nvml.h of the oldest
// supported driver.
type nvmlPciInfo struct {
	BusId          [16]byte
	Domain         uint32
	Bus            uint32
	Device         uint32
	PciDeviceId    uint32
	PciSubSystemId uint32
	Reserved       [32]byte
}
|
||||
|
||||
// NVML function signatures
|
||||
var (
|
||||
nvmlInit func() nvmlReturn
|
||||
nvmlShutdown func() nvmlReturn
|
||||
nvmlDeviceGetCount func(count *uint32) nvmlReturn
|
||||
nvmlDeviceGetHandleByIndex func(index uint32, device *nvmlDevice) nvmlReturn
|
||||
nvmlDeviceGetName func(device nvmlDevice, name *byte, length uint32) nvmlReturn
|
||||
nvmlDeviceGetMemoryInfo func(device nvmlDevice, memory uintptr) nvmlReturn
|
||||
nvmlDeviceGetUtilizationRates func(device nvmlDevice, utilization *nvmlUtilization) nvmlReturn
|
||||
nvmlDeviceGetTemperature func(device nvmlDevice, sensorType int, temp *uint32) nvmlReturn
|
||||
nvmlDeviceGetPowerUsage func(device nvmlDevice, power *uint32) nvmlReturn
|
||||
nvmlDeviceGetPciInfo func(device nvmlDevice, pci *nvmlPciInfo) nvmlReturn
|
||||
nvmlErrorString func(result nvmlReturn) string
|
||||
)
|
||||
|
||||
type nvmlCollector struct {
|
||||
gm *GPUManager
|
||||
lib uintptr
|
||||
devices []nvmlDevice
|
||||
bdfs []string
|
||||
isV2 bool
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) init() error {
|
||||
slog.Debug("NVML: Initializing")
|
||||
libPath := getNVMLPath()
|
||||
|
||||
lib, err := openLibrary(libPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load %s: %w", libPath, err)
|
||||
}
|
||||
c.lib = lib
|
||||
|
||||
purego.RegisterLibFunc(&nvmlInit, lib, "nvmlInit")
|
||||
purego.RegisterLibFunc(&nvmlShutdown, lib, "nvmlShutdown")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetCount, lib, "nvmlDeviceGetCount")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetHandleByIndex, lib, "nvmlDeviceGetHandleByIndex")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetName, lib, "nvmlDeviceGetName")
|
||||
// Try to get v2 memory info, fallback to v1 if not available
|
||||
if hasSymbol(lib, "nvmlDeviceGetMemoryInfo_v2") {
|
||||
c.isV2 = true
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo_v2")
|
||||
} else {
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo")
|
||||
}
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetUtilizationRates, lib, "nvmlDeviceGetUtilizationRates")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetTemperature, lib, "nvmlDeviceGetTemperature")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetPowerUsage, lib, "nvmlDeviceGetPowerUsage")
|
||||
purego.RegisterLibFunc(&nvmlDeviceGetPciInfo, lib, "nvmlDeviceGetPciInfo")
|
||||
purego.RegisterLibFunc(&nvmlErrorString, lib, "nvmlErrorString")
|
||||
|
||||
if ret := nvmlInit(); ret != nvmlReturn(nvmlSuccess) {
|
||||
return fmt.Errorf("nvmlInit failed: %v", ret)
|
||||
}
|
||||
|
||||
var count uint32
|
||||
if ret := nvmlDeviceGetCount(&count); ret != nvmlReturn(nvmlSuccess) {
|
||||
return fmt.Errorf("nvmlDeviceGetCount failed: %v", ret)
|
||||
}
|
||||
|
||||
for i := uint32(0); i < count; i++ {
|
||||
var device nvmlDevice
|
||||
if ret := nvmlDeviceGetHandleByIndex(i, &device); ret == nvmlReturn(nvmlSuccess) {
|
||||
c.devices = append(c.devices, device)
|
||||
// Get BDF for power state check
|
||||
var pci nvmlPciInfo
|
||||
if ret := nvmlDeviceGetPciInfo(device, &pci); ret == nvmlReturn(nvmlSuccess) {
|
||||
busID := string(pci.BusId[:])
|
||||
if idx := strings.Index(busID, "\x00"); idx != -1 {
|
||||
busID = busID[:idx]
|
||||
}
|
||||
c.bdfs = append(c.bdfs, strings.ToLower(busID))
|
||||
} else {
|
||||
c.bdfs = append(c.bdfs, "")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) start() {
|
||||
defer nvmlShutdown()
|
||||
ticker := time.Tick(3 * time.Second)
|
||||
|
||||
for range ticker {
|
||||
c.collect()
|
||||
}
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) collect() {
|
||||
c.gm.Lock()
|
||||
defer c.gm.Unlock()
|
||||
|
||||
for i, device := range c.devices {
|
||||
id := fmt.Sprintf("%d", i)
|
||||
bdf := c.bdfs[i]
|
||||
|
||||
// Update GPUDataMap
|
||||
if _, ok := c.gm.GpuDataMap[id]; !ok {
|
||||
var nameBuf [64]byte
|
||||
if ret := nvmlDeviceGetName(device, &nameBuf[0], 64); ret != nvmlReturn(nvmlSuccess) {
|
||||
continue
|
||||
}
|
||||
name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")])
|
||||
name = strings.TrimPrefix(name, "NVIDIA ")
|
||||
c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
|
||||
}
|
||||
gpu := c.gm.GpuDataMap[id]
|
||||
|
||||
if bdf != "" && !c.isGPUActive(bdf) {
|
||||
slog.Debug("NVML: GPU is suspended, skipping", "bdf", bdf)
|
||||
gpu.Temperature = 0
|
||||
gpu.MemoryUsed = 0
|
||||
continue
|
||||
}
|
||||
|
||||
// Utilization
|
||||
var utilization nvmlUtilization
|
||||
if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) {
|
||||
slog.Debug("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret)
|
||||
gpu.Temperature = 0
|
||||
gpu.MemoryUsed = 0
|
||||
continue
|
||||
}
|
||||
|
||||
slog.Debug("NVML: Collecting data for GPU", "bdf", bdf)
|
||||
|
||||
// Temperature
|
||||
var temp uint32
|
||||
nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU
|
||||
|
||||
// Memory
|
||||
var usedMem, totalMem uint64
|
||||
if c.isV2 {
|
||||
var memory nvmlMemoryV2
|
||||
memory.Version = 0x02000028 // (2 << 24) | 40 bytes
|
||||
nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
|
||||
usedMem = memory.Used
|
||||
totalMem = memory.Total
|
||||
} else {
|
||||
var memory nvmlMemoryV1
|
||||
nvmlDeviceGetMemoryInfo(device, uintptr(unsafe.Pointer(&memory)))
|
||||
usedMem = memory.Used
|
||||
totalMem = memory.Total
|
||||
}
|
||||
|
||||
// Power
|
||||
var power uint32
|
||||
nvmlDeviceGetPowerUsage(device, &power)
|
||||
|
||||
gpu.Temperature = float64(temp)
|
||||
gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte
|
||||
gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte
|
||||
gpu.Usage += float64(utilization.Gpu)
|
||||
gpu.Power += float64(power) / 1000.0
|
||||
gpu.Count++
|
||||
slog.Debug("NVML: Collected data", "gpu", gpu)
|
||||
}
|
||||
}
|
||||
57
agent/gpu_nvml_linux.go
Normal file
57
agent/gpu_nvml_linux.go
Normal file
@@ -0,0 +1,57 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/ebitengine/purego"
|
||||
"golang.org/x/exp/slog"
|
||||
)
|
||||
|
||||
func openLibrary(name string) (uintptr, error) {
|
||||
return purego.Dlopen(name, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||
}
|
||||
|
||||
// getNVMLPath returns the name under which the NVML shared library is opened
// on Linux. It is a bare file name, so locating it is left to the loader.
func getNVMLPath() string {
	const soname = "libnvidia-ml.so.1"
	return soname
}
|
||||
|
||||
func hasSymbol(lib uintptr, symbol string) bool {
|
||||
_, err := purego.Dlsym(lib, symbol)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) isGPUActive(bdf string) bool {
|
||||
// runtime_status
|
||||
statusPath := filepath.Join("/sys/bus/pci/devices", bdf, "power/runtime_status")
|
||||
status, err := os.ReadFile(statusPath)
|
||||
if err != nil {
|
||||
slog.Debug("NVML: Can't read runtime_status", "bdf", bdf, "err", err)
|
||||
return true // Assume active if we can't read status
|
||||
}
|
||||
statusStr := strings.TrimSpace(string(status))
|
||||
if statusStr != "active" && statusStr != "resuming" {
|
||||
slog.Debug("NVML: GPU not active", "bdf", bdf, "status", statusStr)
|
||||
return false
|
||||
}
|
||||
|
||||
// power_state (D0 check)
|
||||
// Find any drm card device power_state
|
||||
pstatePathPattern := filepath.Join("/sys/bus/pci/devices", bdf, "drm/card*/device/power_state")
|
||||
matches, _ := filepath.Glob(pstatePathPattern)
|
||||
if len(matches) > 0 {
|
||||
pstate, err := os.ReadFile(matches[0])
|
||||
if err == nil {
|
||||
pstateStr := strings.TrimSpace(string(pstate))
|
||||
if pstateStr != "D0" {
|
||||
slog.Debug("NVML: GPU not in D0 state", "bdf", bdf, "pstate", pstateStr)
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
21
agent/gpu_nvml_unsupported.go
Normal file
21
agent/gpu_nvml_unsupported.go
Normal file
@@ -0,0 +1,21 @@
|
||||
//go:build !linux && !windows
|
||||
|
||||
package agent
|
||||
|
||||
import "fmt"
|
||||
|
||||
// openLibrary is the stub used on platforms that are neither Linux nor
// Windows: it never loads anything and always returns a zero handle together
// with an error.
func openLibrary(name string) (uintptr, error) {
	unsupported := fmt.Errorf("nvml not supported on this platform")
	return 0, unsupported
}
|
||||
|
||||
// getNVMLPath returns the empty string on platforms without NVML support.
func getNVMLPath() string {
	const noLibrary = ""
	return noLibrary
}
|
||||
|
||||
// hasSymbol always reports false on platforms without NVML support; both
// arguments are ignored.
func hasSymbol(lib uintptr, symbol string) bool {
	const found = false
	return found
}
|
||||
|
||||
func (c *nvmlCollector) isGPUActive(bdf string) bool {
|
||||
return true
|
||||
}
|
||||
25
agent/gpu_nvml_windows.go
Normal file
25
agent/gpu_nvml_windows.go
Normal file
@@ -0,0 +1,25 @@
|
||||
//go:build windows
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"golang.org/x/sys/windows"
|
||||
)
|
||||
|
||||
func openLibrary(name string) (uintptr, error) {
|
||||
handle, err := windows.LoadLibrary(name)
|
||||
return uintptr(handle), err
|
||||
}
|
||||
|
||||
// getNVMLPath returns the name under which the NVML library is opened on
// Windows. It is a bare file name, so locating it is left to the loader.
func getNVMLPath() string {
	const dllName = "nvml.dll"
	return dllName
}
|
||||
|
||||
func hasSymbol(lib uintptr, symbol string) bool {
|
||||
_, err := windows.GetProcAddress(windows.Handle(lib), symbol)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) isGPUActive(bdf string) bool {
|
||||
return true
|
||||
}
|
||||
Reference in New Issue
Block a user