From 283fa9d5c222937737e637be811ed8a4b364340e Mon Sep 17 00:00:00 2001 From: henrygd Date: Fri, 13 Feb 2026 20:06:37 -0500 Subject: [PATCH] include GTT memory in AMD GPU metrics (#1569) --- agent/gpu_amd_linux.go | 13 +++++- agent/gpu_amd_linux_test.go | 88 ++++++++++++++++++++++++------------- 2 files changed, 69 insertions(+), 32 deletions(-) diff --git a/agent/gpu_amd_linux.go b/agent/gpu_amd_linux.go index 0b1398b2..260fbe45 100644 --- a/agent/gpu_amd_linux.go +++ b/agent/gpu_amd_linux.go @@ -103,8 +103,17 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool { // Read all sysfs values first (no lock needed - these can be slow) usage, usageErr := readSysfsFloat(filepath.Join(devicePath, "gpu_busy_percent")) - memUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used")) - memTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total")) + vramUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used")) + vramTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total")) + memUsed := vramUsed + memTotal := vramTotal + // if gtt is present, add it to the memory used and total (https://github.com/henrygd/beszel/issues/1569#issuecomment-3837640484) + if gttUsed, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_used")); err == nil && gttUsed > 0 { + if gttTotal, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_total")); err == nil { + memUsed += gttUsed + memTotal += gttTotal + } + } var temp, power float64 hwmons, _ := filepath.Glob(filepath.Join(devicePath, "hwmon/hwmon*")) diff --git a/agent/gpu_amd_linux_test.go b/agent/gpu_amd_linux_test.go index 4423dd7f..a52d8e45 100644 --- a/agent/gpu_amd_linux_test.go +++ b/agent/gpu_amd_linux_test.go @@ -119,40 +119,68 @@ func TestAmdgpuNameCacheRoundTrip(t *testing.T) { } func TestUpdateAmdGpuDataWithFakeSysfs(t *testing.T) { - dir := t.TempDir() - cardPath := filepath.Join(dir, "card0") - devicePath := filepath.Join(cardPath, "device") - hwmonPath := filepath.Join(devicePath, "hwmon", "hwmon0") - require.NoError(t, os.MkdirAll(hwmonPath, 0o755)) - - write := func(name, content string) { - require.NoError(t, os.WriteFile(filepath.Join(devicePath, name), []byte(content), 0o644)) + tests := []struct { + name string + writeGTT bool + wantMemoryUsed float64 + wantMemoryTotal float64 + }{ + { + name: "sums vram and gtt when gtt is present", + writeGTT: true, + wantMemoryUsed: bytesToMegabytes(1073741824 + 536870912), + wantMemoryTotal: bytesToMegabytes(2147483648 + 4294967296), + }, + { + name: "falls back to vram when gtt is missing", + writeGTT: false, + wantMemoryUsed: bytesToMegabytes(1073741824), + wantMemoryTotal: bytesToMegabytes(2147483648), + }, } - write("vendor", "0x1002") - write("device", "0x1506") - write("revision", "0xc1") - write("gpu_busy_percent", "25") - write("mem_info_vram_used", "1073741824") - write("mem_info_vram_total", "2147483648") - require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "temp1_input"), []byte("45000"), 0o644)) - require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "power1_input"), []byte("20000000"), 0o644)) - // Pre-cache name so getAmdGpuName returns a known value (it uses system amdgpu.ids path) - cacheAmdgpuName("1506", "c1", "AMD Radeon 610M Graphics", true) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + cardPath := filepath.Join(dir, "card0") + devicePath := filepath.Join(cardPath, "device") + hwmonPath := filepath.Join(devicePath, "hwmon", "hwmon0") + require.NoError(t, os.MkdirAll(hwmonPath, 0o755)) - gm := &GPUManager{GpuDataMap: make(map[string]*system.GPUData)} - ok := gm.updateAmdGpuData(cardPath) - require.True(t, ok) + write := func(name, content string) { + require.NoError(t, os.WriteFile(filepath.Join(devicePath, name), []byte(content), 0o644)) + } + write("vendor", "0x1002") + write("device", "0x1506") + write("revision", "0xc1") + write("gpu_busy_percent", "25") + write("mem_info_vram_used", "1073741824") + write("mem_info_vram_total", "2147483648") + if tt.writeGTT { + write("mem_info_gtt_used", "536870912") + write("mem_info_gtt_total", "4294967296") + } + require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "temp1_input"), []byte("45000"), 0o644)) + require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "power1_input"), []byte("20000000"), 0o644)) - gpu, ok := gm.GpuDataMap["card0"] - require.True(t, ok) - assert.Equal(t, "AMD Radeon 610M", gpu.Name) - assert.Equal(t, 25.0, gpu.Usage) - assert.Equal(t, bytesToMegabytes(1073741824), gpu.MemoryUsed) - assert.Equal(t, bytesToMegabytes(2147483648), gpu.MemoryTotal) - assert.Equal(t, 45.0, gpu.Temperature) - assert.Equal(t, 20.0, gpu.Power) - assert.Equal(t, 1.0, gpu.Count) + // Pre-cache name so getAmdGpuName returns a known value (it uses system amdgpu.ids path) + cacheAmdgpuName("1506", "c1", "AMD Radeon 610M Graphics", true) + + gm := &GPUManager{GpuDataMap: make(map[string]*system.GPUData)} + ok := gm.updateAmdGpuData(cardPath) + require.True(t, ok) + + gpu, ok := gm.GpuDataMap["card0"] + require.True(t, ok) + assert.Equal(t, "AMD Radeon 610M", gpu.Name) + assert.Equal(t, 25.0, gpu.Usage) + assert.Equal(t, tt.wantMemoryUsed, gpu.MemoryUsed) + assert.Equal(t, tt.wantMemoryTotal, gpu.MemoryTotal) + assert.Equal(t, 45.0, gpu.Temperature) + assert.Equal(t, 20.0, gpu.Power) + assert.Equal(t, 1.0, gpu.Count) + }) + } } func TestLookupAmdgpuNameInFile(t *testing.T) {