add nvtop integration and introduce GPU_COLLECTOR env var

This commit is contained in:
henrygd
2026-02-13 17:10:16 -05:00
parent 1f1a448aef
commit 14ecb1b069
6 changed files with 834 additions and 164 deletions

View File

@@ -250,6 +250,100 @@ func TestParseAmdData(t *testing.T) {
}
}
func TestParseNvtopData(t *testing.T) {
input, err := os.ReadFile("test-data/nvtop.json")
require.NoError(t, err)
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
valid := gm.parseNvtopData(input)
require.True(t, valid)
g0, ok := gm.GpuDataMap["n0"]
require.True(t, ok)
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", g0.Name)
assert.Equal(t, 48.0, g0.Temperature)
assert.Equal(t, 5.0, g0.Usage)
assert.Equal(t, 13.0, g0.Power)
assert.Equal(t, bytesToMegabytes(349372416), g0.MemoryUsed)
assert.Equal(t, bytesToMegabytes(4294967296), g0.MemoryTotal)
assert.Equal(t, 1.0, g0.Count)
g1, ok := gm.GpuDataMap["n1"]
require.True(t, ok)
assert.Equal(t, "AMD Radeon 680M", g1.Name)
assert.Equal(t, 48.0, g1.Temperature)
assert.Equal(t, 12.0, g1.Usage)
assert.Equal(t, 9.0, g1.Power)
assert.Equal(t, bytesToMegabytes(1213784064), g1.MemoryUsed)
assert.Equal(t, bytesToMegabytes(16929173504), g1.MemoryTotal)
assert.Equal(t, 1.0, g1.Count)
}
func TestUpdateNvtopSnapshotsKeepsDeviceAssociationWhenOrderChanges(t *testing.T) {
strPtr := func(s string) *string { return &s }
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
firstBatch := []nvtopSnapshot{
{
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
GpuUtil: strPtr("20%"),
PowerDraw: strPtr("10W"),
},
{
DeviceName: "AMD Radeon 680M",
GpuUtil: strPtr("30%"),
PowerDraw: strPtr("20W"),
},
}
secondBatchSwapped := []nvtopSnapshot{
{
DeviceName: "AMD Radeon 680M",
GpuUtil: strPtr("40%"),
PowerDraw: strPtr("25W"),
},
{
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
GpuUtil: strPtr("50%"),
PowerDraw: strPtr("15W"),
},
}
require.True(t, gm.updateNvtopSnapshots(firstBatch))
require.True(t, gm.updateNvtopSnapshots(secondBatchSwapped))
nvidia := gm.GpuDataMap["n0"]
require.NotNil(t, nvidia)
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", nvidia.Name)
assert.Equal(t, 70.0, nvidia.Usage)
assert.Equal(t, 25.0, nvidia.Power)
assert.Equal(t, 2.0, nvidia.Count)
amd := gm.GpuDataMap["n1"]
require.NotNil(t, amd)
assert.Equal(t, "AMD Radeon 680M", amd.Name)
assert.Equal(t, 70.0, amd.Usage)
assert.Equal(t, 45.0, amd.Power)
assert.Equal(t, 2.0, amd.Count)
}
func TestParseCollectorPriority(t *testing.T) {
got := parseCollectorPriority(" nvml, nvidia-smi, intel_gpu_top, amd_sysfs, nvtop, rocm-smi, bad ")
want := []collectorSource{
collectorSourceNVML,
collectorSourceNvidiaSMI,
collectorSourceIntelGpuTop,
collectorSourceAmdSysfs,
collectorSourceNVTop,
collectorSourceRocmSMI,
}
assert.Equal(t, want, got)
}
func TestParseJetsonData(t *testing.T) {
tests := []struct {
name string
@@ -987,36 +1081,35 @@ func TestCalculateGPUAverage(t *testing.T) {
})
}
func TestDetectGPUs(t *testing.T) {
func TestGPUCapabilitiesAndLegacyPriority(t *testing.T) {
// Save original PATH
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
// Set up temp dir with the commands
tempDir := t.TempDir()
os.Setenv("PATH", tempDir)
hasAmdSysfs := (&GPUManager{}).hasAmdSysfs()
tests := []struct {
name string
setupCommands func() error
setupCommands func(string) error
wantNvidiaSmi bool
wantRocmSmi bool
wantTegrastats bool
wantNvtop bool
wantErr bool
}{
{
name: "nvidia-smi not available",
setupCommands: func() error {
setupCommands: func(_ string) error {
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: false,
wantNvtop: false,
wantErr: true,
},
{
name: "nvidia-smi available",
setupCommands: func() error {
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "nvidia-smi")
script := `#!/bin/sh
echo "test"`
@@ -1028,29 +1121,14 @@ echo "test"`
wantNvidiaSmi: true,
wantTegrastats: false,
wantRocmSmi: false,
wantNvtop: false,
wantErr: false,
},
{
name: "rocm-smi available",
setupCommands: func() error {
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "rocm-smi")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: true,
wantRocmSmi: true,
wantTegrastats: false,
wantErr: false,
},
{
name: "tegrastats available",
setupCommands: func() error {
path := filepath.Join(tempDir, "tegrastats")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
@@ -1059,12 +1137,47 @@ echo "test"`
},
wantNvidiaSmi: false,
wantRocmSmi: true,
wantTegrastats: false,
wantNvtop: false,
wantErr: false,
},
{
name: "tegrastats available",
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "tegrastats")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: true,
wantNvtop: false,
wantErr: false,
},
{
name: "nvtop available",
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "nvtop")
script := `#!/bin/sh
echo "[]"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: false,
wantNvtop: true,
wantErr: false,
},
{
name: "no gpu tools available",
setupCommands: func() error {
setupCommands: func(_ string) error {
os.Setenv("PATH", "")
return nil
},
@@ -1074,29 +1187,53 @@ echo "test"`
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if err := tt.setupCommands(); err != nil {
tempDir := t.TempDir()
os.Setenv("PATH", tempDir)
if err := tt.setupCommands(tempDir); err != nil {
t.Fatal(err)
}
gm := &GPUManager{}
err := gm.detectGPUs()
caps := gm.discoverGpuCapabilities()
var err error
if !hasAnyGpuCollector(caps) {
err = fmt.Errorf(noGPUFoundMsg)
}
priorities := gm.resolveLegacyCollectorPriority(caps)
hasPriority := func(source collectorSource) bool {
for _, s := range priorities {
if s == source {
return true
}
}
return false
}
gotNvidiaSmi := hasPriority(collectorSourceNvidiaSMI)
gotRocmSmi := hasPriority(collectorSourceRocmSMI)
gotTegrastats := caps.hasTegrastats
gotNvtop := caps.hasNvtop
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats)
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gotNvidiaSmi, gotRocmSmi, gotTegrastats)
if tt.wantErr {
wantErr := tt.wantErr
if hasAmdSysfs && (tt.name == "nvidia-smi not available" || tt.name == "no gpu tools available") {
wantErr = false
}
if wantErr {
assert.Error(t, err)
return
}
assert.NoError(t, err)
assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi)
assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi)
assert.Equal(t, tt.wantTegrastats, gm.tegrastats)
assert.Equal(t, tt.wantNvidiaSmi, gotNvidiaSmi)
assert.Equal(t, tt.wantRocmSmi, gotRocmSmi)
assert.Equal(t, tt.wantTegrastats, gotTegrastats)
assert.Equal(t, tt.wantNvtop, gotNvtop)
})
}
}
func TestStartCollector(t *testing.T) {
func TestCollectorStartHelpers(t *testing.T) {
// Save original PATH
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
@@ -1181,6 +1318,27 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
},
},
},
{
name: "nvtop collector",
command: "nvtop",
setup: func(t *testing.T) error {
path := filepath.Join(dir, "nvtop")
script := `#!/bin/sh
echo '[{"device_name":"NVIDIA Test GPU","temp":"52C","power_draw":"31W","gpu_util":"37%","mem_total":"4294967296","mem_used":"536870912","processes":[]}]'`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
validate: func(t *testing.T, gm *GPUManager) {
gpu, exists := gm.GpuDataMap["n0"]
assert.True(t, exists)
if exists {
assert.Equal(t, "NVIDIA Test GPU", gpu.Name)
assert.Equal(t, 52.0, gpu.Temperature)
}
},
},
}
for _, tt := range tests {
@@ -1193,13 +1351,157 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
GpuDataMap: make(map[string]*system.GPUData),
}
}
tt.gm.startCollector(tt.command)
switch tt.command {
case nvidiaSmiCmd:
tt.gm.startNvidiaSmiCollector("4")
case rocmSmiCmd:
tt.gm.startRocmSmiCollector(4300 * time.Millisecond)
case tegraStatsCmd:
tt.gm.startTegraStatsCollector("3700")
case nvtopCmd:
tt.gm.startNvtopCollector("30", nil)
default:
t.Fatalf("unknown test command %q", tt.command)
}
time.Sleep(50 * time.Millisecond) // Give collector time to run
tt.validate(t, tt.gm)
})
}
}
func TestNewGPUManagerPriorityNvtopFallback(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvtop,nvidia-smi")
nvtopPath := filepath.Join(dir, "nvtop")
nvtopScript := `#!/bin/sh
echo 'not-json'`
require.NoError(t, os.WriteFile(nvtopPath, []byte(nvtopScript), 0755))
nvidiaPath := filepath.Join(dir, "nvidia-smi")
nvidiaScript := `#!/bin/sh
echo "0, NVIDIA Priority GPU, 45, 512, 2048, 12, 25"`
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
gm, err := NewGPUManager()
require.NoError(t, err)
require.NotNil(t, gm)
time.Sleep(150 * time.Millisecond)
gpu, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "Priority GPU", gpu.Name)
assert.Equal(t, 45.0, gpu.Temperature)
}
func TestNewGPUManagerPriorityMixedCollectors(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "intel_gpu_top,rocm-smi")
intelPath := filepath.Join(dir, "intel_gpu_top")
intelScript := `#!/bin/sh
echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS VCS"
echo " req act /s % gpu pkg rd wr % se wa % se wa"
echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0"
echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0"
`
require.NoError(t, os.WriteFile(intelPath, []byte(intelScript), 0755))
rocmPath := filepath.Join(dir, "rocm-smi")
rocmScript := `#!/bin/sh
echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "GUID": "34756"}}'
`
require.NoError(t, os.WriteFile(rocmPath, []byte(rocmScript), 0755))
gm, err := NewGPUManager()
require.NoError(t, err)
require.NotNil(t, gm)
time.Sleep(150 * time.Millisecond)
_, intelOk := gm.GpuDataMap["i0"]
_, amdOk := gm.GpuDataMap["34756"]
assert.True(t, intelOk)
assert.True(t, amdOk)
}
func TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml,nvidia-smi")
nvidiaPath := filepath.Join(dir, "nvidia-smi")
nvidiaScript := `#!/bin/sh
echo "0, NVIDIA Fallback GPU, 41, 256, 1024, 8, 14"`
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
gm, err := NewGPUManager()
require.NoError(t, err)
require.NotNil(t, gm)
time.Sleep(150 * time.Millisecond)
gpu, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "Fallback GPU", gpu.Name)
}
func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Run("configured valid collector unavailable", func(t *testing.T) {
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
gm, err := NewGPUManager()
require.Nil(t, gm)
require.Error(t, err)
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
})
t.Run("configured collector list has only unknown entries", func(t *testing.T) {
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "bad,unknown")
gm, err := NewGPUManager()
require.Nil(t, gm)
require.Error(t, err)
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
})
}
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
tegraPath := filepath.Join(dir, "tegrastats")
tegraScript := `#!/bin/sh
echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"`
require.NoError(t, os.WriteFile(tegraPath, []byte(tegraScript), 0755))
gm, err := NewGPUManager()
require.NoError(t, err)
require.NotNil(t, gm)
time.Sleep(100 * time.Millisecond)
gpu, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "GPU", gpu.Name)
}
// TestAccumulationTableDriven tests the accumulation behavior for all three GPU types
func TestAccumulation(t *testing.T) {
type expectedGPUValues struct {