mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-23 22:16:18 +01:00
add nvtop integration and introduce GPU_COLLECTOR env var
This commit is contained in:
@@ -250,6 +250,100 @@ func TestParseAmdData(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNvtopData(t *testing.T) {
|
||||
input, err := os.ReadFile("test-data/nvtop.json")
|
||||
require.NoError(t, err)
|
||||
|
||||
gm := &GPUManager{
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
valid := gm.parseNvtopData(input)
|
||||
require.True(t, valid)
|
||||
|
||||
g0, ok := gm.GpuDataMap["n0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", g0.Name)
|
||||
assert.Equal(t, 48.0, g0.Temperature)
|
||||
assert.Equal(t, 5.0, g0.Usage)
|
||||
assert.Equal(t, 13.0, g0.Power)
|
||||
assert.Equal(t, bytesToMegabytes(349372416), g0.MemoryUsed)
|
||||
assert.Equal(t, bytesToMegabytes(4294967296), g0.MemoryTotal)
|
||||
assert.Equal(t, 1.0, g0.Count)
|
||||
|
||||
g1, ok := gm.GpuDataMap["n1"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "AMD Radeon 680M", g1.Name)
|
||||
assert.Equal(t, 48.0, g1.Temperature)
|
||||
assert.Equal(t, 12.0, g1.Usage)
|
||||
assert.Equal(t, 9.0, g1.Power)
|
||||
assert.Equal(t, bytesToMegabytes(1213784064), g1.MemoryUsed)
|
||||
assert.Equal(t, bytesToMegabytes(16929173504), g1.MemoryTotal)
|
||||
assert.Equal(t, 1.0, g1.Count)
|
||||
}
|
||||
|
||||
func TestUpdateNvtopSnapshotsKeepsDeviceAssociationWhenOrderChanges(t *testing.T) {
|
||||
strPtr := func(s string) *string { return &s }
|
||||
|
||||
gm := &GPUManager{
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
|
||||
firstBatch := []nvtopSnapshot{
|
||||
{
|
||||
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||
GpuUtil: strPtr("20%"),
|
||||
PowerDraw: strPtr("10W"),
|
||||
},
|
||||
{
|
||||
DeviceName: "AMD Radeon 680M",
|
||||
GpuUtil: strPtr("30%"),
|
||||
PowerDraw: strPtr("20W"),
|
||||
},
|
||||
}
|
||||
secondBatchSwapped := []nvtopSnapshot{
|
||||
{
|
||||
DeviceName: "AMD Radeon 680M",
|
||||
GpuUtil: strPtr("40%"),
|
||||
PowerDraw: strPtr("25W"),
|
||||
},
|
||||
{
|
||||
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||
GpuUtil: strPtr("50%"),
|
||||
PowerDraw: strPtr("15W"),
|
||||
},
|
||||
}
|
||||
|
||||
require.True(t, gm.updateNvtopSnapshots(firstBatch))
|
||||
require.True(t, gm.updateNvtopSnapshots(secondBatchSwapped))
|
||||
|
||||
nvidia := gm.GpuDataMap["n0"]
|
||||
require.NotNil(t, nvidia)
|
||||
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", nvidia.Name)
|
||||
assert.Equal(t, 70.0, nvidia.Usage)
|
||||
assert.Equal(t, 25.0, nvidia.Power)
|
||||
assert.Equal(t, 2.0, nvidia.Count)
|
||||
|
||||
amd := gm.GpuDataMap["n1"]
|
||||
require.NotNil(t, amd)
|
||||
assert.Equal(t, "AMD Radeon 680M", amd.Name)
|
||||
assert.Equal(t, 70.0, amd.Usage)
|
||||
assert.Equal(t, 45.0, amd.Power)
|
||||
assert.Equal(t, 2.0, amd.Count)
|
||||
}
|
||||
|
||||
func TestParseCollectorPriority(t *testing.T) {
|
||||
got := parseCollectorPriority(" nvml, nvidia-smi, intel_gpu_top, amd_sysfs, nvtop, rocm-smi, bad ")
|
||||
want := []collectorSource{
|
||||
collectorSourceNVML,
|
||||
collectorSourceNvidiaSMI,
|
||||
collectorSourceIntelGpuTop,
|
||||
collectorSourceAmdSysfs,
|
||||
collectorSourceNVTop,
|
||||
collectorSourceRocmSMI,
|
||||
}
|
||||
assert.Equal(t, want, got)
|
||||
}
|
||||
|
||||
func TestParseJetsonData(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -987,36 +1081,35 @@ func TestCalculateGPUAverage(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestDetectGPUs(t *testing.T) {
|
||||
func TestGPUCapabilitiesAndLegacyPriority(t *testing.T) {
|
||||
// Save original PATH
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
// Set up temp dir with the commands
|
||||
tempDir := t.TempDir()
|
||||
os.Setenv("PATH", tempDir)
|
||||
hasAmdSysfs := (&GPUManager{}).hasAmdSysfs()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
setupCommands func() error
|
||||
setupCommands func(string) error
|
||||
wantNvidiaSmi bool
|
||||
wantRocmSmi bool
|
||||
wantTegrastats bool
|
||||
wantNvtop bool
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "nvidia-smi not available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(_ string) error {
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: false,
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "nvidia-smi available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "nvidia-smi")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
@@ -1028,29 +1121,14 @@ echo "test"`
|
||||
wantNvidiaSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantRocmSmi: false,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "rocm-smi available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "rocm-smi")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: true,
|
||||
wantRocmSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "tegrastats available",
|
||||
setupCommands: func() error {
|
||||
path := filepath.Join(tempDir, "tegrastats")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
@@ -1059,12 +1137,47 @@ echo "test"`
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "tegrastats available",
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "tegrastats")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: true,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "nvtop available",
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "nvtop")
|
||||
script := `#!/bin/sh
|
||||
echo "[]"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: true,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "no gpu tools available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(_ string) error {
|
||||
os.Setenv("PATH", "")
|
||||
return nil
|
||||
},
|
||||
@@ -1074,29 +1187,53 @@ echo "test"`
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if err := tt.setupCommands(); err != nil {
|
||||
tempDir := t.TempDir()
|
||||
os.Setenv("PATH", tempDir)
|
||||
if err := tt.setupCommands(tempDir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
gm := &GPUManager{}
|
||||
err := gm.detectGPUs()
|
||||
caps := gm.discoverGpuCapabilities()
|
||||
var err error
|
||||
if !hasAnyGpuCollector(caps) {
|
||||
err = fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
priorities := gm.resolveLegacyCollectorPriority(caps)
|
||||
hasPriority := func(source collectorSource) bool {
|
||||
for _, s := range priorities {
|
||||
if s == source {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
gotNvidiaSmi := hasPriority(collectorSourceNvidiaSMI)
|
||||
gotRocmSmi := hasPriority(collectorSourceRocmSMI)
|
||||
gotTegrastats := caps.hasTegrastats
|
||||
gotNvtop := caps.hasNvtop
|
||||
|
||||
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats)
|
||||
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gotNvidiaSmi, gotRocmSmi, gotTegrastats)
|
||||
|
||||
if tt.wantErr {
|
||||
wantErr := tt.wantErr
|
||||
if hasAmdSysfs && (tt.name == "nvidia-smi not available" || tt.name == "no gpu tools available") {
|
||||
wantErr = false
|
||||
}
|
||||
if wantErr {
|
||||
assert.Error(t, err)
|
||||
return
|
||||
}
|
||||
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi)
|
||||
assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi)
|
||||
assert.Equal(t, tt.wantTegrastats, gm.tegrastats)
|
||||
assert.Equal(t, tt.wantNvidiaSmi, gotNvidiaSmi)
|
||||
assert.Equal(t, tt.wantRocmSmi, gotRocmSmi)
|
||||
assert.Equal(t, tt.wantTegrastats, gotTegrastats)
|
||||
assert.Equal(t, tt.wantNvtop, gotNvtop)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStartCollector(t *testing.T) {
|
||||
func TestCollectorStartHelpers(t *testing.T) {
|
||||
// Save original PATH
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
@@ -1181,6 +1318,27 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "nvtop collector",
|
||||
command: "nvtop",
|
||||
setup: func(t *testing.T) error {
|
||||
path := filepath.Join(dir, "nvtop")
|
||||
script := `#!/bin/sh
|
||||
echo '[{"device_name":"NVIDIA Test GPU","temp":"52C","power_draw":"31W","gpu_util":"37%","mem_total":"4294967296","mem_used":"536870912","processes":[]}]'`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
validate: func(t *testing.T, gm *GPUManager) {
|
||||
gpu, exists := gm.GpuDataMap["n0"]
|
||||
assert.True(t, exists)
|
||||
if exists {
|
||||
assert.Equal(t, "NVIDIA Test GPU", gpu.Name)
|
||||
assert.Equal(t, 52.0, gpu.Temperature)
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -1193,13 +1351,157 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
}
|
||||
tt.gm.startCollector(tt.command)
|
||||
switch tt.command {
|
||||
case nvidiaSmiCmd:
|
||||
tt.gm.startNvidiaSmiCollector("4")
|
||||
case rocmSmiCmd:
|
||||
tt.gm.startRocmSmiCollector(4300 * time.Millisecond)
|
||||
case tegraStatsCmd:
|
||||
tt.gm.startTegraStatsCollector("3700")
|
||||
case nvtopCmd:
|
||||
tt.gm.startNvtopCollector("30", nil)
|
||||
default:
|
||||
t.Fatalf("unknown test command %q", tt.command)
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond) // Give collector time to run
|
||||
tt.validate(t, tt.gm)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityNvtopFallback(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvtop,nvidia-smi")
|
||||
|
||||
nvtopPath := filepath.Join(dir, "nvtop")
|
||||
nvtopScript := `#!/bin/sh
|
||||
echo 'not-json'`
|
||||
require.NoError(t, os.WriteFile(nvtopPath, []byte(nvtopScript), 0755))
|
||||
|
||||
nvidiaPath := filepath.Join(dir, "nvidia-smi")
|
||||
nvidiaScript := `#!/bin/sh
|
||||
echo "0, NVIDIA Priority GPU, 45, 512, 2048, 12, 25"`
|
||||
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "Priority GPU", gpu.Name)
|
||||
assert.Equal(t, 45.0, gpu.Temperature)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityMixedCollectors(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "intel_gpu_top,rocm-smi")
|
||||
|
||||
intelPath := filepath.Join(dir, "intel_gpu_top")
|
||||
intelScript := `#!/bin/sh
|
||||
echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS VCS"
|
||||
echo " req act /s % gpu pkg rd wr % se wa % se wa"
|
||||
echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0"
|
||||
echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0"
|
||||
`
|
||||
require.NoError(t, os.WriteFile(intelPath, []byte(intelScript), 0755))
|
||||
|
||||
rocmPath := filepath.Join(dir, "rocm-smi")
|
||||
rocmScript := `#!/bin/sh
|
||||
echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "GUID": "34756"}}'
|
||||
`
|
||||
require.NoError(t, os.WriteFile(rocmPath, []byte(rocmScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
_, intelOk := gm.GpuDataMap["i0"]
|
||||
_, amdOk := gm.GpuDataMap["34756"]
|
||||
assert.True(t, intelOk)
|
||||
assert.True(t, amdOk)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml,nvidia-smi")
|
||||
|
||||
nvidiaPath := filepath.Join(dir, "nvidia-smi")
|
||||
nvidiaScript := `#!/bin/sh
|
||||
echo "0, NVIDIA Fallback GPU, 41, 256, 1024, 8, 14"`
|
||||
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "Fallback GPU", gpu.Name)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
|
||||
t.Run("configured valid collector unavailable", func(t *testing.T) {
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
|
||||
gm, err := NewGPUManager()
|
||||
require.Nil(t, gm)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||
})
|
||||
|
||||
t.Run("configured collector list has only unknown entries", func(t *testing.T) {
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "bad,unknown")
|
||||
gm, err := NewGPUManager()
|
||||
require.Nil(t, gm)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||
})
|
||||
}
|
||||
|
||||
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
|
||||
|
||||
tegraPath := filepath.Join(dir, "tegrastats")
|
||||
tegraScript := `#!/bin/sh
|
||||
echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"`
|
||||
require.NoError(t, os.WriteFile(tegraPath, []byte(tegraScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "GPU", gpu.Name)
|
||||
}
|
||||
|
||||
// TestAccumulationTableDriven tests the accumulation behavior for all three GPU types
|
||||
func TestAccumulation(t *testing.T) {
|
||||
type expectedGPUValues struct {
|
||||
|
||||
Reference in New Issue
Block a user