diff --git a/agent/gpu.go b/agent/gpu.go
index b2efca49..75c96526 100644
--- a/agent/gpu.go
+++ b/agent/gpu.go
@@ -21,6 +21,7 @@ const (
 	// Commands
 	nvidiaSmiCmd  string = "nvidia-smi"
 	rocmSmiCmd    string = "rocm-smi"
+	amdgpuCmd     string = "amdgpu" // internal cmd for sysfs collection
 	tegraStatsCmd string = "tegrastats"
 
 	// Polling intervals
@@ -41,6 +42,7 @@ type GPUManager struct {
 	sync.Mutex
 	nvidiaSmi     bool
 	rocmSmi       bool
+	amdgpu        bool
 	tegrastats    bool
 	intelGpuStats bool
 	nvml          bool
@@ -399,7 +401,13 @@ func (gm *GPUManager) detectGPUs() error {
 		gm.nvidiaSmi = true
 	}
 	if _, err := exec.LookPath(rocmSmiCmd); err == nil {
-		gm.rocmSmi = true
+		if val, _ := GetEnv("AMD_SYSFS"); val == "true" {
+			gm.amdgpu = true
+		} else {
+			gm.rocmSmi = true
+		}
+	} else if gm.hasAmdSysfs() {
+		gm.amdgpu = true
 	}
 	if _, err := exec.LookPath(tegraStatsCmd); err == nil {
 		gm.tegrastats = true
@@ -408,10 +416,10 @@ func (gm *GPUManager) detectGPUs() error {
 	if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
 		gm.intelGpuStats = true
 	}
-	if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats || gm.nvml {
+	if gm.nvidiaSmi || gm.rocmSmi || gm.amdgpu || gm.tegrastats || gm.intelGpuStats || gm.nvml {
 		return nil
 	}
-	return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top")
+	return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or intel_gpu_top")
 }
 
 // startCollector starts the appropriate GPU data collector based on the command
@@ -448,6 +456,12 @@ func (gm *GPUManager) startCollector(command string) {
 		collector.cmdArgs = []string{"--interval", tegraStatsInterval}
 		collector.parse = gm.getJetsonParser()
 		go collector.start()
+	case amdgpuCmd:
+		go func() {
+			if err := gm.collectAmdStats(); err != nil {
+				slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
+			}
+		}()
 	case rocmSmiCmd:
 		collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
 		collector.parse = gm.parseAmdData
@@ -459,7 +473,7 @@ func (gm *GPUManager) startCollector(command string) {
 					if failures > maxFailureRetries {
 						break
 					}
-					slog.Warn("Error collecting AMD GPU data", "err", err)
+					slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
 				}
 				time.Sleep(rocmSmiInterval)
 			}
@@ -497,6 +511,9 @@ func NewGPUManager() (*GPUManager, error) {
 	if gm.rocmSmi {
 		gm.startCollector(rocmSmiCmd)
 	}
+	if gm.amdgpu {
+		gm.startCollector(amdgpuCmd)
+	}
 	if gm.tegrastats {
 		gm.startCollector(tegraStatsCmd)
 	}
diff --git a/agent/gpu_amd_linux.go b/agent/gpu_amd_linux.go
new file mode 100644
--- /dev/null
+++ b/agent/gpu_amd_linux.go
@@ -0,0 +1,210 @@
+//go:build linux
+
+package agent
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/henrygd/beszel/internal/entities/system"
+)
+
+// hasAmdSysfs reports whether any DRM card under /sys/class/drm has the AMD
+// PCI vendor ID, i.e. whether sysfs collection has a device to read from.
+func (gm *GPUManager) hasAmdSysfs() bool {
+	cards, err := filepath.Glob("/sys/class/drm/card*")
+	if err != nil {
+		return false
+	}
+	for _, card := range cards {
+		if isAmdGpu(card) {
+			return true
+		}
+	}
+	return false
+}
+
+// collectAmdStats polls AMD GPU metrics directly from sysfs to avoid the
+// overhead of rocm-smi. It returns errNoValidData when no AMD card is found
+// or after more than maxFailureRetries consecutive polls yield no data;
+// otherwise it loops forever.
+func (gm *GPUManager) collectAmdStats() error {
+	cards, err := filepath.Glob("/sys/class/drm/card*")
+	if err != nil {
+		return err
+	}
+
+	var amdGpuPaths []string
+	for _, card := range cards {
+		// Skip entries whose name contains "-" (e.g. connector nodes such as
+		// card0-DP-1) and cards that are not AMD GPUs.
+		if strings.Contains(filepath.Base(card), "-") || !isAmdGpu(card) {
+			continue
+		}
+		amdGpuPaths = append(amdGpuPaths, card)
+	}
+
+	if len(amdGpuPaths) == 0 {
+		return errNoValidData
+	}
+
+	slog.Debug("Using sysfs for AMD GPU data collection")
+
+	failures := 0
+	for {
+		hasData := false
+		for _, cardPath := range amdGpuPaths {
+			if gm.updateAmdGpuData(cardPath) {
+				hasData = true
+			}
+		}
+		if !hasData {
+			failures++
+			if failures > maxFailureRetries {
+				return errNoValidData
+			}
+			slog.Warn("No AMD GPU data from sysfs", "failures", failures)
+			time.Sleep(retryWaitTime)
+			continue
+		}
+		failures = 0
+		time.Sleep(rocmSmiInterval)
+	}
+}
+
+// isAmdGpu reports whether the device/vendor file under cardPath contains the
+// AMD PCI vendor ID (0x1002).
+func isAmdGpu(cardPath string) bool {
+	vendor, err := os.ReadFile(filepath.Join(cardPath, "device/vendor"))
+	if err != nil {
+		return false
+	}
+	return strings.TrimSpace(string(vendor)) == "0x1002"
+}
+
+// updateAmdGpuData reads GPU metrics from sysfs and updates the GPU data map.
+// Usage and power are added to the running totals and Count is incremented;
+// memory and temperature keep the most recent successfully read value.
+// Returns true if at least some data was successfully read.
+func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
+	devicePath := filepath.Join(cardPath, "device")
+	id := filepath.Base(cardPath)
+
+	// Read all sysfs values first (no lock needed - these can be slow)
+	usage, usageErr := readSysfsFloat(filepath.Join(devicePath, "gpu_busy_percent"))
+	memUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
+	memTotal, memTotalErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
+
+	var temp, power float64
+	hasTemp := false
+	// Glob only fails on a malformed pattern; treat that as "no hwmon nodes".
+	hwmons, _ := filepath.Glob(filepath.Join(devicePath, "hwmon/hwmon*"))
+	for _, hwmonDir := range hwmons {
+		// temp1_input is divided by 1000 (hwmon reports millidegrees Celsius).
+		if t, err := readSysfsFloat(filepath.Join(hwmonDir, "temp1_input")); err == nil {
+			temp = t / 1000.0
+			hasTemp = true
+		}
+		// Power is divided by 1e6 (hwmon reports microwatts). Prefer the
+		// averaged reading and fall back to the instantaneous one.
+		if p, err := readSysfsFloat(filepath.Join(hwmonDir, "power1_average")); err == nil {
+			power += p / 1000000.0
+		} else if p, err := readSysfsFloat(filepath.Join(hwmonDir, "power1_input")); err == nil {
+			power += p / 1000000.0
+		}
+	}
+
+	// Check if we got any meaningful data
+	if usageErr != nil && memUsedErr != nil && !hasTemp {
+		return false
+	}
+
+	// Single lock to update all values atomically
+	gm.Lock()
+	defer gm.Unlock()
+
+	gpu, ok := gm.GpuDataMap[id]
+	if !ok {
+		gpu = &system.GPUData{Name: getAmdGpuName(devicePath)}
+		gm.GpuDataMap[id] = gpu
+	}
+
+	if usageErr == nil {
+		gpu.Usage += usage
+	}
+	// Only overwrite point-in-time values when the read succeeded, so a failed
+	// read does not clobber the last known value with zero.
+	if memUsedErr == nil {
+		gpu.MemoryUsed = bytesToMegabytes(memUsed)
+	}
+	if memTotalErr == nil {
+		gpu.MemoryTotal = bytesToMegabytes(memTotal)
+	}
+	if hasTemp {
+		gpu.Temperature = temp
+	}
+	gpu.Power += power
+	gpu.Count++
+	return true
+}
+
+// readSysfsFloat reads the file at path and parses its whitespace-trimmed
+// contents as a float64.
+func readSysfsFloat(path string) (float64, error) {
+	val, err := os.ReadFile(path)
+	if err != nil {
+		return 0, err
+	}
+	return strconv.ParseFloat(strings.TrimSpace(string(val)), 64)
+}
+
+// getAmdGpuName attempts to get a descriptive GPU name.
+// First tries product_name (rarely available), then looks up the PCI device ID.
+// Falls back to showing the raw device ID if not found in the lookup table.
+func getAmdGpuName(devicePath string) string {
+	// Try product_name first (works for some enterprise GPUs). An empty value
+	// is treated as missing so the GPU never ends up with a blank name.
+	if prod, err := os.ReadFile(filepath.Join(devicePath, "product_name")); err == nil {
+		if name := strings.TrimSpace(string(prod)); name != "" {
+			return name
+		}
+	}
+
+	// Read PCI device ID and look it up
+	if deviceID, err := os.ReadFile(filepath.Join(devicePath, "device")); err == nil {
+		id := strings.TrimPrefix(strings.ToLower(strings.TrimSpace(string(deviceID))), "0x")
+		if name, ok := getRadeonNames()[id]; ok {
+			return fmt.Sprintf("Radeon %s", name)
+		}
+		return fmt.Sprintf("AMD GPU (%s)", id)
+	}
+
+	return "AMD GPU"
+}
+
+// getRadeonNames returns the AMD GPU name lookup table
+// Device IDs from https://pci-ids.ucw.cz/read/PC/1002
+var getRadeonNames = sync.OnceValue(func() map[string]string {
+	return map[string]string{
+		"7550": "RX 9070",
+		"7590": "RX 9060 XT",
+		"7551": "AI PRO R9700",
+
+		"744c": "RX 7900",
+
+		"1681": "680M",
+
+		"7448": "PRO W7900",
+		"745e": "PRO W7800",
+		"7470": "PRO W7700",
+		"73e3": "PRO W6600",
+		"7422": "PRO W6400",
+		"7341": "PRO W5500",
+	}
+})
diff --git a/agent/gpu_amd_unsupported.go b/agent/gpu_amd_unsupported.go
new file mode 100644
index 00000000..ccfc1063
--- /dev/null
+++ b/agent/gpu_amd_unsupported.go
@@ -0,0 +1,15 @@
+//go:build !linux
+
+package agent
+
+import (
+	"errors"
+)
+
+func (gm *GPUManager) hasAmdSysfs() bool {
+	return false
+}
+
+func (gm *GPUManager) collectAmdStats() error {
+	return errors.ErrUnsupported
+}