From 14ecb1b06902dfea1ff3cca58a9288d9ed05d047 Mon Sep 17 00:00:00 2001 From: henrygd Date: Fri, 13 Feb 2026 17:10:16 -0500 Subject: [PATCH] add nvtop integration and introduce GPU_COLLECTOR env var --- agent/gpu.go | 412 +++++++++++++++++++++++++--------- agent/gpu_amd_linux.go | 3 +- agent/gpu_nvml_unsupported.go | 18 -- agent/gpu_nvtop.go | 159 +++++++++++++ agent/gpu_test.go | 372 +++++++++++++++++++++++++++--- agent/test-data/nvtop.json | 34 +++ 6 files changed, 834 insertions(+), 164 deletions(-) create mode 100644 agent/gpu_nvtop.go create mode 100644 agent/test-data/nvtop.json diff --git a/agent/gpu.go b/agent/gpu.go index 75c96526..b05a3619 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -21,13 +21,10 @@ const ( // Commands nvidiaSmiCmd string = "nvidia-smi" rocmSmiCmd string = "rocm-smi" - amdgpuCmd string = "amdgpu" // internal cmd for sysfs collection tegraStatsCmd string = "tegrastats" + nvtopCmd string = "nvtop" + noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu" - // Polling intervals - nvidiaSmiInterval string = "4" // in seconds - tegraStatsInterval string = "3700" // in milliseconds - rocmSmiInterval time.Duration = 4300 * time.Millisecond // Command retry and timeout constants retryWaitTime time.Duration = 5 * time.Second maxFailureRetries int = 5 @@ -40,13 +37,7 @@ const ( // GPUManager manages data collection for GPUs (either Nvidia or AMD) type GPUManager struct { sync.Mutex - nvidiaSmi bool - rocmSmi bool - amdgpu bool - tegrastats bool - intelGpuStats bool - nvml bool - GpuDataMap map[string]*system.GPUData + GpuDataMap map[string]*system.GPUData // lastAvgData stores the last calculated averages for each GPU // Used when a collection happens before new data arrives (Count == 0) lastAvgData map[string]system.GPUData @@ -87,6 +78,51 @@ type gpuCollector struct { var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data +// collectorSource identifies a selectable GPU collector in GPU_COLLECTOR. 
+type collectorSource string + +const ( + collectorSourceNVTop collectorSource = collectorSource(nvtopCmd) + collectorSourceNVML collectorSource = "nvml" + collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd) + collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd) + collectorSourceAmdSysfs collectorSource = "amd_sysfs" + collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd) + collectorGroupNvidia string = "nvidia" + collectorGroupIntel string = "intel" + collectorGroupAmd string = "amd" +) + +func isValidCollectorSource(source collectorSource) bool { + switch source { + case collectorSourceNVTop, + collectorSourceNVML, + collectorSourceNvidiaSMI, + collectorSourceIntelGpuTop, + collectorSourceAmdSysfs, + collectorSourceRocmSMI: + return true + } + return false +} + +// gpuCapabilities describes detected GPU tooling and sysfs support on the host. +type gpuCapabilities struct { + hasNvidiaSmi bool + hasRocmSmi bool + hasAmdSysfs bool + hasTegrastats bool + hasIntelGpuTop bool + hasNvtop bool +} + +type collectorDefinition struct { + group string + available bool + start func(onFailure func()) bool + deprecationWarning string +} + // starts and manages the ongoing collection of GPU data for the specified GPU management utility func (c *gpuCollector) start() { for { @@ -392,93 +428,257 @@ func (gm *GPUManager) storeSnapshot(id string, gpu *system.GPUData, cacheKey uin gm.lastSnapshots[cacheKey][id] = snapshot } -// detectGPUs checks for the presence of GPU management tools (nvidia-smi, rocm-smi, tegrastats) -// in the system path. It sets the corresponding flags in the GPUManager struct if any of these -// tools are found. If none of the tools are found, it returns an error indicating that no GPU -// management tools are available. -func (gm *GPUManager) detectGPUs() error { +// discoverGpuCapabilities checks for available GPU tooling and sysfs support. 
+// It only reports capability presence and does not apply policy decisions. +func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities { + caps := gpuCapabilities{ + hasAmdSysfs: gm.hasAmdSysfs(), + } if _, err := exec.LookPath(nvidiaSmiCmd); err == nil { - gm.nvidiaSmi = true + caps.hasNvidiaSmi = true } if _, err := exec.LookPath(rocmSmiCmd); err == nil { - if val, _ := GetEnv("AMD_SYSFS"); val == "true" { - gm.amdgpu = true - } else { - gm.rocmSmi = true - } - } else if gm.hasAmdSysfs() { - gm.amdgpu = true + caps.hasRocmSmi = true } if _, err := exec.LookPath(tegraStatsCmd); err == nil { - gm.tegrastats = true - gm.nvidiaSmi = false + caps.hasTegrastats = true } if _, err := exec.LookPath(intelGpuStatsCmd); err == nil { - gm.intelGpuStats = true + caps.hasIntelGpuTop = true } - if gm.nvidiaSmi || gm.rocmSmi || gm.amdgpu || gm.tegrastats || gm.intelGpuStats || gm.nvml { - return nil + if _, err := exec.LookPath(nvtopCmd); err == nil { + caps.hasNvtop = true } - return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or intel_gpu_top") + return caps } -// startCollector starts the appropriate GPU data collector based on the command -func (gm *GPUManager) startCollector(command string) { - collector := gpuCollector{ - name: command, - bufSize: 10 * 1024, - } - switch command { - case intelGpuStatsCmd: - go func() { - failures := 0 - for { - if err := gm.collectIntelStats(); err != nil { - failures++ - if failures > maxFailureRetries { - break - } - slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err) - time.Sleep(retryWaitTime) - continue +func hasAnyGpuCollector(caps gpuCapabilities) bool { + return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop +} + +func (gm *GPUManager) startIntelCollector() { + go func() { + failures := 0 + for { + if err := gm.collectIntelStats(); err != nil { + failures++ + if failures > maxFailureRetries { + break } 
+ slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err) + time.Sleep(retryWaitTime) + continue } - }() - case nvidiaSmiCmd: - collector.cmdArgs = []string{ - "-l", nvidiaSmiInterval, + } + }() +} + +func (gm *GPUManager) startNvidiaSmiCollector(intervalSeconds string) { + collector := gpuCollector{ + name: nvidiaSmiCmd, + bufSize: 10 * 1024, + cmdArgs: []string{ + "-l", intervalSeconds, "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", "--format=csv,noheader,nounits", - } - collector.parse = gm.parseNvidiaData - go collector.start() - case tegraStatsCmd: - collector.cmdArgs = []string{"--interval", tegraStatsInterval} - collector.parse = gm.getJetsonParser() - go collector.start() - case amdgpuCmd: - go func() { - if err := gm.collectAmdStats(); err != nil { - slog.Warn("Error collecting AMD GPU data via sysfs", "err", err) - } - }() - case rocmSmiCmd: - collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"} - collector.parse = gm.parseAmdData - go func() { - failures := 0 - for { - if err := collector.collect(); err != nil { - failures++ - if failures > maxFailureRetries { - break - } - slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err) - } - time.Sleep(rocmSmiInterval) - } - }() + }, + parse: gm.parseNvidiaData, } + go collector.start() +} + +func (gm *GPUManager) startTegraStatsCollector(intervalMilliseconds string) { + collector := gpuCollector{ + name: tegraStatsCmd, + bufSize: 10 * 1024, + cmdArgs: []string{"--interval", intervalMilliseconds}, + parse: gm.getJetsonParser(), + } + go collector.start() +} + +func (gm *GPUManager) startRocmSmiCollector(pollInterval time.Duration) { + collector := gpuCollector{ + name: rocmSmiCmd, + bufSize: 10 * 1024, + cmdArgs: []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}, + 
parse: gm.parseAmdData, + } + go func() { + failures := 0 + for { + if err := collector.collect(); err != nil { + failures++ + if failures > maxFailureRetries { + break + } + slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err) + } + time.Sleep(pollInterval) + } + }() +} + +func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSource]collectorDefinition { + return map[collectorSource]collectorDefinition{ + collectorSourceNVML: { + group: collectorGroupNvidia, + available: caps.hasNvidiaSmi, + start: func(_ func()) bool { + return gm.startNvmlCollector() + }, + }, + collectorSourceNvidiaSMI: { + group: collectorGroupNvidia, + available: caps.hasNvidiaSmi, + start: func(_ func()) bool { + gm.startNvidiaSmiCollector("4") // seconds + return true + }, + }, + collectorSourceIntelGpuTop: { + group: collectorGroupIntel, + available: caps.hasIntelGpuTop, + start: func(_ func()) bool { + gm.startIntelCollector() + return true + }, + }, + collectorSourceAmdSysfs: { + group: collectorGroupAmd, + available: caps.hasAmdSysfs, + start: func(_ func()) bool { + return gm.startAmdSysfsCollector() + }, + }, + collectorSourceRocmSMI: { + group: collectorGroupAmd, + available: caps.hasRocmSmi, + deprecationWarning: "rocm-smi is deprecated and may be removed in a future release", + start: func(_ func()) bool { + gm.startRocmSmiCollector(4300 * time.Millisecond) + return true + }, + }, + collectorSourceNVTop: { + available: caps.hasNvtop, + start: func(onFailure func()) bool { + gm.startNvtopCollector("30", onFailure) // tenths of a second + return true + }, + }, + } +} + +// parseCollectorPriority parses GPU_COLLECTOR and returns valid ordered entries. 
+func parseCollectorPriority(value string) []collectorSource { + parts := strings.Split(value, ",") + priorities := make([]collectorSource, 0, len(parts)) + for _, raw := range parts { + name := collectorSource(strings.TrimSpace(strings.ToLower(raw))) + if !isValidCollectorSource(name) { + if name != "" { + slog.Warn("Ignoring unknown GPU collector", "collector", name) + } + continue + } + priorities = append(priorities, name) + } + return priorities +} + +// startNvmlCollector initializes NVML and starts its polling loop. +func (gm *GPUManager) startNvmlCollector() bool { + collector := &nvmlCollector{gm: gm} + if err := collector.init(); err != nil { + slog.Warn("Failed to initialize NVML", "err", err) + return false + } + go collector.start() + return true +} + +// startAmdSysfsCollector starts AMD GPU collection via sysfs. +func (gm *GPUManager) startAmdSysfsCollector() bool { + go func() { + if err := gm.collectAmdStats(); err != nil { + slog.Warn("Error collecting AMD GPU data via sysfs", "err", err) + } + }() + return true +} + +// startCollectorsByPriority starts collectors in order with one source per vendor group. +func (gm *GPUManager) startCollectorsByPriority(priorities []collectorSource, caps gpuCapabilities) int { + definitions := gm.collectorDefinitions(caps) + selectedGroups := make(map[string]bool, 3) + started := 0 + for i, source := range priorities { + definition, ok := definitions[source] + if !ok || !definition.available { + continue + } + // nvtop is not a vendor-specific collector, so should only be used if no other collectors are selected or it is first in GPU_COLLECTOR. + if source == collectorSourceNVTop { + if len(selectedGroups) > 0 { + slog.Warn("Skipping nvtop because other collectors are selected") + continue + } + // if nvtop fails, fall back to remaining collectors. + remaining := append([]collectorSource(nil), priorities[i+1:]...) 
+ if definition.start(func() { + gm.startCollectorsByPriority(remaining, caps) + }) { + started++ + return started + } + } + group := definition.group + if group == "" || selectedGroups[group] { + continue + } + if definition.deprecationWarning != "" { + slog.Warn(definition.deprecationWarning) + } + if definition.start(nil) { + selectedGroups[group] = true + started++ + } + } + return started +} + +// resolveLegacyCollectorPriority builds the default collector order when GPU_COLLECTOR is unset. +func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []collectorSource { + priorities := make([]collectorSource, 0, 4) + + if caps.hasNvidiaSmi && !caps.hasTegrastats { + if nvml, _ := GetEnv("NVML"); nvml == "true" { + priorities = append(priorities, collectorSourceNVML, collectorSourceNvidiaSMI) + } else { + priorities = append(priorities, collectorSourceNvidiaSMI) + } + } + + if caps.hasRocmSmi { + if val, _ := GetEnv("AMD_SYSFS"); val == "true" { + priorities = append(priorities, collectorSourceAmdSysfs) + } else { + priorities = append(priorities, collectorSourceRocmSMI) + } + } else if caps.hasAmdSysfs { + priorities = append(priorities, collectorSourceAmdSysfs) + } + + if caps.hasIntelGpuTop { + priorities = append(priorities, collectorSourceIntelGpuTop) + } + + // Keep nvtop as a legacy last resort only when no vendor collector exists. 
+ if len(priorities) == 0 && caps.hasNvtop { + priorities = append(priorities, collectorSourceNVTop) + } + return priorities } // NewGPUManager creates and initializes a new GPUManager @@ -487,38 +687,30 @@ func NewGPUManager() (*GPUManager, error) { return nil, nil } var gm GPUManager - if err := gm.detectGPUs(); err != nil { - return nil, err + caps := gm.discoverGpuCapabilities() + if !hasAnyGpuCollector(caps) { + return nil, fmt.Errorf(noGPUFoundMsg) } gm.GpuDataMap = make(map[string]*system.GPUData) - if gm.nvidiaSmi { - if nvml, _ := GetEnv("NVML"); nvml == "true" { - gm.nvml = true - gm.nvidiaSmi = false - collector := &nvmlCollector{gm: &gm} - if err := collector.init(); err == nil { - go collector.start() - } else { - slog.Warn("Failed to initialize NVML, falling back to nvidia-smi", "err", err) - gm.nvidiaSmi = true - gm.startCollector(nvidiaSmiCmd) - } - } else { - gm.startCollector(nvidiaSmiCmd) + // Jetson devices should always use tegrastats (ignore GPU_COLLECTOR). + if caps.hasTegrastats { + gm.startTegraStatsCollector("3700") + return &gm, nil + } + + // if GPU_COLLECTOR is set, start user-defined collectors. + if collectorConfig, ok := GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" { + priorities := parseCollectorPriority(collectorConfig) + if gm.startCollectorsByPriority(priorities, caps) == 0 { + return nil, fmt.Errorf("no configured GPU collectors are available") } + return &gm, nil } - if gm.rocmSmi { - gm.startCollector(rocmSmiCmd) - } - if gm.amdgpu { - gm.startCollector(amdgpuCmd) - } - if gm.tegrastats { - gm.startCollector(tegraStatsCmd) - } - if gm.intelGpuStats { - gm.startCollector(intelGpuStatsCmd) + + // auto-detect and start collectors when GPU_COLLECTOR is unset. 
+ if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 { + return nil, fmt.Errorf(noGPUFoundMsg) } return &gm, nil diff --git a/agent/gpu_amd_linux.go b/agent/gpu_amd_linux.go index ac265b24..cba8fe5e 100644 --- a/agent/gpu_amd_linux.go +++ b/agent/gpu_amd_linux.go @@ -32,6 +32,7 @@ func (gm *GPUManager) hasAmdSysfs() bool { // collectAmdStats collects AMD GPU metrics directly from sysfs to avoid the overhead of rocm-smi func (gm *GPUManager) collectAmdStats() error { + sysfsPollInterval := 3000 * time.Millisecond cards, err := filepath.Glob("/sys/class/drm/card*") if err != nil { return err @@ -70,7 +71,7 @@ func (gm *GPUManager) collectAmdStats() error { continue } failures = 0 - time.Sleep(rocmSmiInterval) + time.Sleep(sysfsPollInterval) } } diff --git a/agent/gpu_nvml_unsupported.go b/agent/gpu_nvml_unsupported.go index b8a6d40a..48097436 100644 --- a/agent/gpu_nvml_unsupported.go +++ b/agent/gpu_nvml_unsupported.go @@ -13,21 +13,3 @@ func (c *nvmlCollector) init() error { } func (c *nvmlCollector) start() {} - -func (c *nvmlCollector) collect() {} - -func openLibrary(name string) (uintptr, error) { - return 0, fmt.Errorf("nvml not supported on this platform") -} - -func getNVMLPath() string { - return "" -} - -func hasSymbol(lib uintptr, symbol string) bool { - return false -} - -func (c *nvmlCollector) isGPUActive(bdf string) bool { - return true -} diff --git a/agent/gpu_nvtop.go b/agent/gpu_nvtop.go new file mode 100644 index 00000000..36efb42e --- /dev/null +++ b/agent/gpu_nvtop.go @@ -0,0 +1,159 @@ +package agent + +import ( + "encoding/json" + "io" + "log/slog" + "os/exec" + "strconv" + "strings" + "time" + + "github.com/henrygd/beszel/internal/entities/system" +) + +type nvtopSnapshot struct { + DeviceName string `json:"device_name"` + Temp *string `json:"temp"` + PowerDraw *string `json:"power_draw"` + GpuUtil *string `json:"gpu_util"` + MemTotal *string `json:"mem_total"` + MemUsed *string `json:"mem_used"` +} + +// 
parseNvtopNumber parses nvtop numeric strings with units (C/W/%). +func parseNvtopNumber(raw string) float64 { + cleaned := strings.TrimSpace(raw) + cleaned = strings.TrimSuffix(cleaned, "C") + cleaned = strings.TrimSuffix(cleaned, "W") + cleaned = strings.TrimSuffix(cleaned, "%") + val, _ := strconv.ParseFloat(cleaned, 64) + return val +} + +// parseNvtopData parses a single nvtop JSON snapshot payload. +func (gm *GPUManager) parseNvtopData(output []byte) bool { + var snapshots []nvtopSnapshot + if err := json.Unmarshal(output, &snapshots); err != nil || len(snapshots) == 0 { + return false + } + return gm.updateNvtopSnapshots(snapshots) +} + +// updateNvtopSnapshots applies one decoded nvtop snapshot batch to GPU accumulators. +func (gm *GPUManager) updateNvtopSnapshots(snapshots []nvtopSnapshot) bool { + gm.Lock() + defer gm.Unlock() + + valid := false + usedIDs := make(map[string]struct{}, len(snapshots)) + for i, sample := range snapshots { + if sample.DeviceName == "" { + continue + } + indexID := "n" + strconv.Itoa(i) + id := indexID + + // nvtop ordering can change, so prefer reusing an existing slot with matching device name. 
+ if existingByIndex, ok := gm.GpuDataMap[indexID]; ok && existingByIndex.Name != "" && existingByIndex.Name != sample.DeviceName { + for existingID, gpu := range gm.GpuDataMap { + if !strings.HasPrefix(existingID, "n") { + continue + } + if _, taken := usedIDs[existingID]; taken { + continue + } + if gpu.Name == sample.DeviceName { + id = existingID + break + } + } + } + + if _, ok := gm.GpuDataMap[id]; !ok { + gm.GpuDataMap[id] = &system.GPUData{Name: sample.DeviceName} + } + gpu := gm.GpuDataMap[id] + gpu.Name = sample.DeviceName + + if sample.Temp != nil { + gpu.Temperature = parseNvtopNumber(*sample.Temp) + } + if sample.MemUsed != nil { + gpu.MemoryUsed = bytesToMegabytes(parseNvtopNumber(*sample.MemUsed)) + } + if sample.MemTotal != nil { + gpu.MemoryTotal = bytesToMegabytes(parseNvtopNumber(*sample.MemTotal)) + } + if sample.GpuUtil != nil { + gpu.Usage += parseNvtopNumber(*sample.GpuUtil) + } + if sample.PowerDraw != nil { + gpu.Power += parseNvtopNumber(*sample.PowerDraw) + } + gpu.Count++ + usedIDs[id] = struct{}{} + valid = true + } + return valid +} + +// collectNvtopStats runs nvtop loop mode and continuously decodes JSON snapshots. +func (gm *GPUManager) collectNvtopStats(interval string) error { + cmd := exec.Command(nvtopCmd, "-lP", "-d", interval) + stdout, err := cmd.StdoutPipe() + if err != nil { + return err + } + if err := cmd.Start(); err != nil { + return err + } + defer func() { + _ = stdout.Close() + if cmd.ProcessState == nil || !cmd.ProcessState.Exited() { + _ = cmd.Process.Kill() + } + _ = cmd.Wait() + }() + + decoder := json.NewDecoder(stdout) + foundValid := false + for { + var snapshots []nvtopSnapshot + if err := decoder.Decode(&snapshots); err != nil { + if err == io.EOF { + if foundValid { + return nil + } + return errNoValidData + } + return err + } + if gm.updateNvtopSnapshots(snapshots) { + foundValid = true + } + } +} + +// startNvtopCollector starts nvtop collection with retry or fallback callback handling. 
+func (gm *GPUManager) startNvtopCollector(interval string, onFailure func()) { + go func() { + failures := 0 + for { + if err := gm.collectNvtopStats(interval); err != nil { + if onFailure != nil { + slog.Warn("Error collecting GPU data via nvtop", "err", err) + onFailure() + return + } + failures++ + if failures > maxFailureRetries { + break + } + slog.Warn("Error collecting GPU data via nvtop", "err", err) + time.Sleep(retryWaitTime) + continue + } + } + }() +} diff --git a/agent/gpu_test.go b/agent/gpu_test.go index cfcab53d..7f4febfe 100644 --- a/agent/gpu_test.go +++ b/agent/gpu_test.go @@ -250,6 +250,100 @@ func TestParseAmdData(t *testing.T) { } } +func TestParseNvtopData(t *testing.T) { + input, err := os.ReadFile("test-data/nvtop.json") + require.NoError(t, err) + + gm := &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), + } + valid := gm.parseNvtopData(input) + require.True(t, valid) + + g0, ok := gm.GpuDataMap["n0"] + require.True(t, ok) + assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", g0.Name) + assert.Equal(t, 48.0, g0.Temperature) + assert.Equal(t, 5.0, g0.Usage) + assert.Equal(t, 13.0, g0.Power) + assert.Equal(t, bytesToMegabytes(349372416), g0.MemoryUsed) + assert.Equal(t, bytesToMegabytes(4294967296), g0.MemoryTotal) + assert.Equal(t, 1.0, g0.Count) + + g1, ok := gm.GpuDataMap["n1"] + require.True(t, ok) + assert.Equal(t, "AMD Radeon 680M", g1.Name) + assert.Equal(t, 48.0, g1.Temperature) + assert.Equal(t, 12.0, g1.Usage) + assert.Equal(t, 9.0, g1.Power) + assert.Equal(t, bytesToMegabytes(1213784064), g1.MemoryUsed) + assert.Equal(t, bytesToMegabytes(16929173504), g1.MemoryTotal) + assert.Equal(t, 1.0, g1.Count) +} + +func TestUpdateNvtopSnapshotsKeepsDeviceAssociationWhenOrderChanges(t *testing.T) { + strPtr := func(s string) *string { return &s } + + gm := &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), + } + + firstBatch := []nvtopSnapshot{ + { + DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU", + GpuUtil: 
strPtr("20%"), + PowerDraw: strPtr("10W"), + }, + { + DeviceName: "AMD Radeon 680M", + GpuUtil: strPtr("30%"), + PowerDraw: strPtr("20W"), + }, + } + secondBatchSwapped := []nvtopSnapshot{ + { + DeviceName: "AMD Radeon 680M", + GpuUtil: strPtr("40%"), + PowerDraw: strPtr("25W"), + }, + { + DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU", + GpuUtil: strPtr("50%"), + PowerDraw: strPtr("15W"), + }, + } + + require.True(t, gm.updateNvtopSnapshots(firstBatch)) + require.True(t, gm.updateNvtopSnapshots(secondBatchSwapped)) + + nvidia := gm.GpuDataMap["n0"] + require.NotNil(t, nvidia) + assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", nvidia.Name) + assert.Equal(t, 70.0, nvidia.Usage) + assert.Equal(t, 25.0, nvidia.Power) + assert.Equal(t, 2.0, nvidia.Count) + + amd := gm.GpuDataMap["n1"] + require.NotNil(t, amd) + assert.Equal(t, "AMD Radeon 680M", amd.Name) + assert.Equal(t, 70.0, amd.Usage) + assert.Equal(t, 45.0, amd.Power) + assert.Equal(t, 2.0, amd.Count) +} + +func TestParseCollectorPriority(t *testing.T) { + got := parseCollectorPriority(" nvml, nvidia-smi, intel_gpu_top, amd_sysfs, nvtop, rocm-smi, bad ") + want := []collectorSource{ + collectorSourceNVML, + collectorSourceNvidiaSMI, + collectorSourceIntelGpuTop, + collectorSourceAmdSysfs, + collectorSourceNVTop, + collectorSourceRocmSMI, + } + assert.Equal(t, want, got) +} + func TestParseJetsonData(t *testing.T) { tests := []struct { name string @@ -987,36 +1081,35 @@ func TestCalculateGPUAverage(t *testing.T) { }) } -func TestDetectGPUs(t *testing.T) { +func TestGPUCapabilitiesAndLegacyPriority(t *testing.T) { // Save original PATH origPath := os.Getenv("PATH") defer os.Setenv("PATH", origPath) - - // Set up temp dir with the commands - tempDir := t.TempDir() - os.Setenv("PATH", tempDir) + hasAmdSysfs := (&GPUManager{}).hasAmdSysfs() tests := []struct { name string - setupCommands func() error + setupCommands func(string) error wantNvidiaSmi bool wantRocmSmi bool wantTegrastats bool + wantNvtop 
bool wantErr bool }{ { name: "nvidia-smi not available", - setupCommands: func() error { + setupCommands: func(_ string) error { return nil }, wantNvidiaSmi: false, wantRocmSmi: false, wantTegrastats: false, + wantNvtop: false, wantErr: true, }, { name: "nvidia-smi available", - setupCommands: func() error { + setupCommands: func(tempDir string) error { path := filepath.Join(tempDir, "nvidia-smi") script := `#!/bin/sh echo "test"` @@ -1028,29 +1121,14 @@ echo "test"` wantNvidiaSmi: true, wantTegrastats: false, wantRocmSmi: false, + wantNvtop: false, wantErr: false, }, { name: "rocm-smi available", - setupCommands: func() error { + setupCommands: func(tempDir string) error { path := filepath.Join(tempDir, "rocm-smi") script := `#!/bin/sh -echo "test"` - if err := os.WriteFile(path, []byte(script), 0755); err != nil { - return err - } - return nil - }, - wantNvidiaSmi: true, - wantRocmSmi: true, - wantTegrastats: false, - wantErr: false, - }, - { - name: "tegrastats available", - setupCommands: func() error { - path := filepath.Join(tempDir, "tegrastats") - script := `#!/bin/sh echo "test"` if err := os.WriteFile(path, []byte(script), 0755); err != nil { return err @@ -1059,12 +1137,47 @@ echo "test"` }, wantNvidiaSmi: false, wantRocmSmi: true, + wantTegrastats: false, + wantNvtop: false, + wantErr: false, + }, + { + name: "tegrastats available", + setupCommands: func(tempDir string) error { + path := filepath.Join(tempDir, "tegrastats") + script := `#!/bin/sh +echo "test"` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + wantNvidiaSmi: false, + wantRocmSmi: false, wantTegrastats: true, + wantNvtop: false, + wantErr: false, + }, + { + name: "nvtop available", + setupCommands: func(tempDir string) error { + path := filepath.Join(tempDir, "nvtop") + script := `#!/bin/sh +echo "[]"` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + wantNvidiaSmi: false, + 
wantRocmSmi: false, + wantTegrastats: false, + wantNvtop: true, wantErr: false, }, { name: "no gpu tools available", - setupCommands: func() error { + setupCommands: func(_ string) error { os.Setenv("PATH", "") return nil }, @@ -1074,29 +1187,53 @@ echo "test"` for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if err := tt.setupCommands(); err != nil { + tempDir := t.TempDir() + os.Setenv("PATH", tempDir) + if err := tt.setupCommands(tempDir); err != nil { t.Fatal(err) } gm := &GPUManager{} - err := gm.detectGPUs() + caps := gm.discoverGpuCapabilities() + var err error + if !hasAnyGpuCollector(caps) { + err = fmt.Errorf(noGPUFoundMsg) + } + priorities := gm.resolveLegacyCollectorPriority(caps) + hasPriority := func(source collectorSource) bool { + for _, s := range priorities { + if s == source { + return true + } + } + return false + } + gotNvidiaSmi := hasPriority(collectorSourceNvidiaSMI) + gotRocmSmi := hasPriority(collectorSourceRocmSMI) + gotTegrastats := caps.hasTegrastats + gotNvtop := caps.hasNvtop - t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats) + t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gotNvidiaSmi, gotRocmSmi, gotTegrastats) - if tt.wantErr { + wantErr := tt.wantErr + if hasAmdSysfs && (tt.name == "nvidia-smi not available" || tt.name == "no gpu tools available") { + wantErr = false + } + if wantErr { assert.Error(t, err) return } assert.NoError(t, err) - assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi) - assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi) - assert.Equal(t, tt.wantTegrastats, gm.tegrastats) + assert.Equal(t, tt.wantNvidiaSmi, gotNvidiaSmi) + assert.Equal(t, tt.wantRocmSmi, gotRocmSmi) + assert.Equal(t, tt.wantTegrastats, gotTegrastats) + assert.Equal(t, tt.wantNvtop, gotNvtop) }) } } -func TestStartCollector(t *testing.T) { +func TestCollectorStartHelpers(t *testing.T) { // Save original PATH origPath := os.Getenv("PATH") defer os.Setenv("PATH", origPath) @@ -1181,6 
+1318,27 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m }, }, }, + { + name: "nvtop collector", + command: "nvtop", + setup: func(t *testing.T) error { + path := filepath.Join(dir, "nvtop") + script := `#!/bin/sh +echo '[{"device_name":"NVIDIA Test GPU","temp":"52C","power_draw":"31W","gpu_util":"37%","mem_total":"4294967296","mem_used":"536870912","processes":[]}]'` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + validate: func(t *testing.T, gm *GPUManager) { + gpu, exists := gm.GpuDataMap["n0"] + assert.True(t, exists) + if exists { + assert.Equal(t, "NVIDIA Test GPU", gpu.Name) + assert.Equal(t, 52.0, gpu.Temperature) + } + }, + }, } for _, tt := range tests { @@ -1193,13 +1351,157 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m GpuDataMap: make(map[string]*system.GPUData), } } - tt.gm.startCollector(tt.command) + switch tt.command { + case nvidiaSmiCmd: + tt.gm.startNvidiaSmiCollector("4") + case rocmSmiCmd: + tt.gm.startRocmSmiCollector(4300 * time.Millisecond) + case tegraStatsCmd: + tt.gm.startTegraStatsCollector("3700") + case nvtopCmd: + tt.gm.startNvtopCollector("30", nil) + default: + t.Fatalf("unknown test command %q", tt.command) + } time.Sleep(50 * time.Millisecond) // Give collector time to run tt.validate(t, tt.gm) }) } } +func TestNewGPUManagerPriorityNvtopFallback(t *testing.T) { + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + dir := t.TempDir() + os.Setenv("PATH", dir) + t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvtop,nvidia-smi") + + nvtopPath := filepath.Join(dir, "nvtop") + nvtopScript := `#!/bin/sh +echo 'not-json'` + require.NoError(t, os.WriteFile(nvtopPath, []byte(nvtopScript), 0755)) + + nvidiaPath := filepath.Join(dir, "nvidia-smi") + nvidiaScript := `#!/bin/sh +echo "0, NVIDIA Priority GPU, 45, 512, 2048, 12, 25"` + require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755)) 
+ + gm, err := NewGPUManager() + require.NoError(t, err) + require.NotNil(t, gm) + + time.Sleep(150 * time.Millisecond) + gpu, ok := gm.GpuDataMap["0"] + require.True(t, ok) + assert.Equal(t, "Priority GPU", gpu.Name) + assert.Equal(t, 45.0, gpu.Temperature) +} + +func TestNewGPUManagerPriorityMixedCollectors(t *testing.T) { + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + dir := t.TempDir() + os.Setenv("PATH", dir) + t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "intel_gpu_top,rocm-smi") + + intelPath := filepath.Join(dir, "intel_gpu_top") + intelScript := `#!/bin/sh +echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS VCS" +echo " req act /s % gpu pkg rd wr % se wa % se wa" +echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0" +echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0" +` + require.NoError(t, os.WriteFile(intelPath, []byte(intelScript), 0755)) + + rocmPath := filepath.Join(dir, "rocm-smi") + rocmScript := `#!/bin/sh +echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "GUID": "34756"}}' +` + require.NoError(t, os.WriteFile(rocmPath, []byte(rocmScript), 0755)) + + gm, err := NewGPUManager() + require.NoError(t, err) + require.NotNil(t, gm) + + time.Sleep(150 * time.Millisecond) + _, intelOk := gm.GpuDataMap["i0"] + _, amdOk := gm.GpuDataMap["34756"] + assert.True(t, intelOk) + assert.True(t, amdOk) +} + +func TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi(t *testing.T) { + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + dir := t.TempDir() + os.Setenv("PATH", dir) + t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml,nvidia-smi") + + nvidiaPath := filepath.Join(dir, "nvidia-smi") + nvidiaScript := `#!/bin/sh +echo "0, NVIDIA Fallback GPU, 41, 256, 1024, 8, 14"` + require.NoError(t, os.WriteFile(nvidiaPath, 
[]byte(nvidiaScript), 0755)) + + gm, err := NewGPUManager() + require.NoError(t, err) + require.NotNil(t, gm) + + time.Sleep(150 * time.Millisecond) + gpu, ok := gm.GpuDataMap["0"] + require.True(t, ok) + assert.Equal(t, "Fallback GPU", gpu.Name) +} + +func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) { + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + dir := t.TempDir() + os.Setenv("PATH", dir) + + t.Run("configured valid collector unavailable", func(t *testing.T) { + t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi") + gm, err := NewGPUManager() + require.Nil(t, gm) + require.Error(t, err) + assert.Contains(t, err.Error(), "no configured GPU collectors are available") + }) + + t.Run("configured collector list has only unknown entries", func(t *testing.T) { + t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "bad,unknown") + gm, err := NewGPUManager() + require.Nil(t, gm) + require.Error(t, err) + assert.Contains(t, err.Error(), "no configured GPU collectors are available") + }) +} + +func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) { + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + dir := t.TempDir() + os.Setenv("PATH", dir) + t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi") + + tegraPath := filepath.Join(dir, "tegrastats") + tegraScript := `#!/bin/sh +echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"` + require.NoError(t, os.WriteFile(tegraPath, []byte(tegraScript), 0755)) + + gm, err := NewGPUManager() + require.NoError(t, err) + require.NotNil(t, gm) + + time.Sleep(100 * time.Millisecond) + gpu, ok := gm.GpuDataMap["0"] + require.True(t, ok) + assert.Equal(t, "GPU", gpu.Name) +} + // TestAccumulationTableDriven tests the accumulation behavior for all three GPU types func TestAccumulation(t *testing.T) { type expectedGPUValues struct { diff --git a/agent/test-data/nvtop.json b/agent/test-data/nvtop.json new file mode 100644 index 00000000..22074286 --- 
/dev/null +++ b/agent/test-data/nvtop.json @@ -0,0 +1,34 @@ +[ + { + "device_name": "NVIDIA GeForce RTX 3050 Ti Laptop GPU", + "gpu_clock": "1485MHz", + "mem_clock": "6001MHz", + "temp": "48C", + "fan_speed": null, + "power_draw": "13W", + "gpu_util": "5%", + "encode": "0%", + "decode": "0%", + "mem_util": "8%", + "mem_total": "4294967296", + "mem_used": "349372416", + "mem_free": "3945594880", + "processes" : [] + }, + { + "device_name": "AMD Radeon 680M", + "gpu_clock": "2200MHz", + "mem_clock": "2400MHz", + "temp": "48C", + "fan_speed": "CPU Fan", + "power_draw": "9W", + "gpu_util": "12%", + "encode": null, + "decode": "0%", + "mem_util": "7%", + "mem_total": "16929173504", + "mem_used": "1213784064", + "mem_free": "15715389440", + "processes" : [] + } +]