diff --git a/agent/gpu.go b/agent/gpu.go
index b05a3619..1cb6cc42 100644
--- a/agent/gpu.go
+++ b/agent/gpu.go
@@ -9,6 +9,7 @@ import (
 	"maps"
 	"os/exec"
 	"regexp"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -19,11 +20,13 @@ import (
 
 const (
 	// Commands
-	nvidiaSmiCmd  string = "nvidia-smi"
-	rocmSmiCmd    string = "rocm-smi"
-	tegraStatsCmd string = "tegrastats"
-	nvtopCmd      string = "nvtop"
-	noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
+	nvidiaSmiCmd    string = "nvidia-smi"
+	rocmSmiCmd      string = "rocm-smi"
+	tegraStatsCmd   string = "tegrastats"
+	nvtopCmd        string = "nvtop"
+	powermetricsCmd string = "powermetrics"
+	macmonCmd       string = "macmon"
+	noGPUFoundMsg   string = "no GPU found - see https://beszel.dev/guide/gpu"
 
 	// Command retry and timeout constants
 	retryWaitTime time.Duration = 5 * time.Second
@@ -82,15 +85,18 @@ var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing
 type collectorSource string
 
 const (
-	collectorSourceNVTop       collectorSource = collectorSource(nvtopCmd)
-	collectorSourceNVML        collectorSource = "nvml"
-	collectorSourceNvidiaSMI   collectorSource = collectorSource(nvidiaSmiCmd)
-	collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
-	collectorSourceAmdSysfs    collectorSource = "amd_sysfs"
-	collectorSourceRocmSMI     collectorSource = collectorSource(rocmSmiCmd)
-	collectorGroupNvidia       string          = "nvidia"
-	collectorGroupIntel        string          = "intel"
-	collectorGroupAmd          string          = "amd"
+	collectorSourceNVTop        collectorSource = collectorSource(nvtopCmd)
+	collectorSourceNVML         collectorSource = "nvml"
+	collectorSourceNvidiaSMI    collectorSource = collectorSource(nvidiaSmiCmd)
+	collectorSourceIntelGpuTop  collectorSource = collectorSource(intelGpuStatsCmd)
+	collectorSourceAmdSysfs     collectorSource = "amd_sysfs"
+	collectorSourceRocmSMI      collectorSource = collectorSource(rocmSmiCmd)
+	collectorSourceMacmon       collectorSource = collectorSource(macmonCmd)
+	collectorSourcePowermetrics collectorSource = collectorSource(powermetricsCmd)
+	collectorGroupNvidia        string          = "nvidia"
+	collectorGroupIntel         string          = "intel"
+	collectorGroupAmd           string          = "amd"
+	collectorGroupApple         string          = "apple"
 )
 
 func isValidCollectorSource(source collectorSource) bool {
@@ -100,7 +106,9 @@ func isValidCollectorSource(source collectorSource) bool {
 		collectorSourceNvidiaSMI,
 		collectorSourceIntelGpuTop,
 		collectorSourceAmdSysfs,
-		collectorSourceRocmSMI:
+		collectorSourceRocmSMI,
+		collectorSourceMacmon,
+		collectorSourcePowermetrics:
 		return true
 	}
 	return false
@@ -108,12 +116,14 @@ func isValidCollectorSource(source collectorSource) bool {
 
 // gpuCapabilities describes detected GPU tooling and sysfs support on the host.
 type gpuCapabilities struct {
-	hasNvidiaSmi   bool
-	hasRocmSmi     bool
-	hasAmdSysfs    bool
-	hasTegrastats  bool
-	hasIntelGpuTop bool
-	hasNvtop       bool
+	hasNvidiaSmi    bool
+	hasRocmSmi      bool
+	hasAmdSysfs     bool
+	hasTegrastats   bool
+	hasIntelGpuTop  bool
+	hasNvtop        bool
+	hasMacmon       bool
+	hasPowermetrics bool
 }
 
 type collectorDefinition struct {
@@ -449,11 +459,19 @@ func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
 	if _, err := exec.LookPath(nvtopCmd); err == nil {
 		caps.hasNvtop = true
 	}
+	if runtime.GOOS == "darwin" {
+		if _, err := exec.LookPath(macmonCmd); err == nil {
+			caps.hasMacmon = true
+		}
+		if _, err := exec.LookPath(powermetricsCmd); err == nil {
+			caps.hasPowermetrics = true
+		}
+	}
 	return caps
 }
 
 func hasAnyGpuCollector(caps gpuCapabilities) bool {
-	return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop
+	return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop || caps.hasMacmon || caps.hasPowermetrics
 }
 
 func (gm *GPUManager) startIntelCollector() {
@@ -567,6 +585,22 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo
 				return true
 			},
 		},
+		collectorSourceMacmon: {
+			group:     collectorGroupApple,
+			available: caps.hasMacmon,
+			start: func(_ func()) bool {
+				gm.startMacmonCollector()
+				return true
+			},
+		},
+		collectorSourcePowermetrics: {
+			group:     collectorGroupApple,
+			available: caps.hasPowermetrics,
+			start: func(_ func()) bool {
+				gm.startPowermetricsCollector()
+				return true
+			},
+		},
 	}
 }
 
@@ -674,6 +708,12 @@ func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []col
 		priorities = append(priorities, collectorSourceIntelGpuTop)
 	}
 
+	// Prefer macmon on macOS (no sudo). Fall back to powermetrics if present.
+	if caps.hasMacmon {
+		priorities = append(priorities, collectorSourceMacmon)
+	} else if caps.hasPowermetrics {
+		priorities = append(priorities, collectorSourcePowermetrics)
+	}
 	// Keep nvtop as a legacy last resort only when no vendor collector exists.
 	if len(priorities) == 0 && caps.hasNvtop {
 		priorities = append(priorities, collectorSourceNVTop)
diff --git a/agent/gpu_apple_unsupported.go b/agent/gpu_apple_unsupported.go
new file mode 100644
index 00000000..4ff5125f
--- /dev/null
+++ b/agent/gpu_apple_unsupported.go
@@ -0,0 +1,9 @@
+//go:build !darwin
+
+package agent
+
+// startPowermetricsCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
+func (gm *GPUManager) startPowermetricsCollector() {}
+
+// startMacmonCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
+func (gm *GPUManager) startMacmonCollector() {}
diff --git a/agent/gpu_darwin.go b/agent/gpu_darwin.go
new file mode 100644
index 00000000..a3ba1e1f
--- /dev/null
+++ b/agent/gpu_darwin.go
@@ -0,0 +1,291 @@
+//go:build darwin
+
+package agent
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"os/exec"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/henrygd/beszel/internal/entities/system"
+)
+
+const (
+	// powermetricsSampleIntervalMs is the sampling interval passed to powermetrics (-i).
+	powermetricsSampleIntervalMs = 500
+	// powermetricsPollInterval is how often we run powermetrics to collect a new sample.
+	powermetricsPollInterval = 2 * time.Second
+	// powermetricsTimeout bounds a single powermetrics run so that a hung child
+	// process cannot stall the collector loop indefinitely.
+	powermetricsTimeout = 10 * time.Second
+	// macmonIntervalMs is the sampling interval passed to macmon pipe (-i), in milliseconds.
+	macmonIntervalMs = 2500
+)
+
+const (
+	// appleGPUID is the GpuDataMap key of the single logical Apple GPU.
+	appleGPUID = "0"
+	// appleGPUName is the display name stored on the Apple GPU entry.
+	appleGPUName = "Apple GPU"
+)
+
+// appleGPU returns the GpuDataMap entry for the Apple GPU, creating it on first use.
+// It does no locking of its own.
+func (gm *GPUManager) appleGPU() *system.GPUData {
+	gpu, ok := gm.GpuDataMap[appleGPUID]
+	if !ok {
+		gpu = &system.GPUData{Name: appleGPUName}
+		gm.GpuDataMap[appleGPUID] = gpu
+	}
+	return gpu
+}
+
+// runAppleCollector calls collect in a loop. After a successful call it sleeps for
+// pause; after a failed call it logs warnMsg and sleeps for retryWaitTime. Once more
+// than maxFailureRetries calls have failed in a row it logs stopMsg and returns.
+func runAppleCollector(collect func() error, pause time.Duration, warnMsg, stopMsg string) {
+	failures := 0
+	for {
+		err := collect()
+		if err == nil {
+			failures = 0
+			time.Sleep(pause)
+			continue
+		}
+		failures++
+		if failures > maxFailureRetries {
+			slog.Warn(stopMsg, "err", err)
+			return
+		}
+		slog.Warn(warnMsg, "err", err)
+		time.Sleep(retryWaitTime)
+	}
+}
+
+// startPowermetricsCollector runs powermetrics --samplers gpu_power in a loop and updates
+// GPU usage and power. Requires root (sudo) on macOS. A single logical GPU is reported as id "0".
+func (gm *GPUManager) startPowermetricsCollector() {
+	// Register the Apple GPU entry up front.
+	// NOTE(review): this touches GpuDataMap without taking gm's lock, unlike the parse
+	// functions below — confirm no other collector goroutine is running at this point.
+	gm.appleGPU()
+
+	go runAppleCollector(
+		gm.collectPowermetrics,
+		powermetricsPollInterval,
+		"Error collecting macOS GPU data via powermetrics (may require sudo)",
+		"powermetrics GPU collector failed repeatedly, stopping",
+	)
+}
+
+// collectPowermetrics runs powermetrics once and parses GPU usage and power from its output.
+// The run is aborted after powermetricsTimeout. When powermetrics exits unsuccessfully,
+// whatever it wrote to stderr is appended to the returned error.
+func (gm *GPUManager) collectPowermetrics() error {
+	ctx, cancel := context.WithTimeout(context.Background(), powermetricsTimeout)
+	defer cancel()
+
+	interval := strconv.Itoa(powermetricsSampleIntervalMs)
+	cmd := exec.CommandContext(ctx, powermetricsCmd, "--samplers", "gpu_power", "-i", interval, "-n", "1")
+	// cmd.Stderr stays nil so that Output captures stderr into ExitError.Stderr.
+	out, err := cmd.Output()
+	if err != nil {
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			return fmt.Errorf("powermetrics did not finish within %s: %w", powermetricsTimeout, ctxErr)
+		}
+		var exitErr *exec.ExitError
+		if errors.As(err, &exitErr) {
+			if msg := bytes.TrimSpace(exitErr.Stderr); len(msg) > 0 {
+				return fmt.Errorf("%w: %s", err, msg)
+			}
+		}
+		return err
+	}
+	if !gm.parsePowermetricsData(out) {
+		return errNoValidData
+	}
+	return nil
+}
+
+// leadingFloat parses the first whitespace-separated field of s as a float64 after
+// removing suffix from it. It reports false if s has no fields or the field is not a number.
+func leadingFloat(s, suffix string) (float64, bool) {
+	fields := strings.Fields(s)
+	if len(fields) == 0 {
+		return 0, false
+	}
+	v, err := strconv.ParseFloat(strings.TrimSuffix(fields[0], suffix), 64)
+	return v, err == nil
+}
+
+// parsePowermetricsData parses powermetrics gpu_power output and updates GpuDataMap["0"].
+// It returns false if the output could not be scanned or contained neither an idle
+// residency nor a power line. Example output:
+//
+//	**** GPU usage ****
+//	GPU HW active frequency: 444 MHz
+//	GPU HW active residency: 0.97% (444 MHz: .97% ...
+//	GPU idle residency: 99.03%
+//	GPU Power: 4 mW
+func (gm *GPUManager) parsePowermetricsData(output []byte) bool {
+	var idleResidency, powerMW float64
+	var gotIdle, gotPower bool
+
+	scanner := bufio.NewScanner(bytes.NewReader(output))
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if rest, ok := strings.CutPrefix(line, "GPU idle residency:"); ok {
+			// "GPU idle residency: 99.03%"
+			if v, ok := leadingFloat(rest, "%"); ok {
+				idleResidency, gotIdle = v, true
+			}
+			continue
+		}
+		if rest, ok := strings.CutPrefix(line, "GPU Power:"); ok {
+			// "GPU Power: 4 mW"
+			if v, ok := leadingFloat(rest, ""); ok {
+				powerMW, gotPower = v, true
+			}
+		}
+	}
+	if scanner.Err() != nil || (!gotIdle && !gotPower) {
+		return false
+	}
+
+	gm.Lock()
+	defer gm.Unlock()
+
+	gpu := gm.appleGPU()
+	if gotIdle {
+		// Usage = 100 - idle residency (e.g. 100 - 99.03 = 0.97%)
+		gpu.Usage += 100 - idleResidency
+	}
+	if gotPower {
+		// mW -> W
+		gpu.Power += powerMW / milliwattsInAWatt
+	}
+	gpu.Count++
+	return true
+}
+
+// startMacmonCollector runs `macmon pipe` in a loop and parses one JSON object per line.
+// This collector does not require sudo. A single logical GPU is reported as id "0".
+func (gm *GPUManager) startMacmonCollector() {
+	// NOTE(review): as in startPowermetricsCollector, GpuDataMap is touched here without
+	// gm's lock — confirm no other collector goroutine is running at this point.
+	gm.appleGPU()
+
+	// `macmon pipe` is long-running; if it returns, wait a bit before restarting.
+	go runAppleCollector(
+		gm.collectMacmonPipe,
+		retryWaitTime,
+		"Error collecting macOS GPU data via macmon",
+		"macmon GPU collector failed repeatedly, stopping",
+	)
+}
+
+// macmonTemp is the "temp" object of a macmon sample.
+type macmonTemp struct {
+	GPUTempAvg float64 `json:"gpu_temp_avg"`
+}
+
+// macmonSample holds the GPU-related fields of one JSON line emitted by `macmon pipe`.
+type macmonSample struct {
+	GPUPower    float64    `json:"gpu_power"`     // watts (macmon reports fractional values)
+	GPURAMPower float64    `json:"gpu_ram_power"` // watts
+	GPUUsage    []float64  `json:"gpu_usage"`     // [freq_mhz, usage] where usage is typically 0..1
+	Temp        macmonTemp `json:"temp"`
+}
+
+// collectMacmonPipe starts `macmon pipe` and feeds every non-empty stdout line to
+// parseMacmonLine until the stream ends. It returns any pipe, start, or scanner error,
+// errNoValidData if no line produced a sample, and otherwise the error (if any) from
+// waiting on the child process.
+func (gm *GPUManager) collectMacmonPipe() (err error) {
+	cmd := exec.Command(macmonCmd, "pipe", "-i", strconv.Itoa(macmonIntervalMs))
+	// Avoid blocking if macmon writes to stderr.
+	cmd.Stderr = io.Discard
+	stdout, err := cmd.StdoutPipe()
+	if err != nil {
+		return err
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+
+	// Always reap the child to avoid zombies on any return path, and propagate its
+	// wait error if no other error was set.
+	defer func() {
+		_ = stdout.Close()
+		// cmd.ProcessState is only populated by Wait, which has not run yet, so the
+		// child may still be alive: kill it unconditionally. Kill's error (for
+		// example when the child has already exited) is deliberately ignored.
+		_ = cmd.Process.Kill()
+		if waitErr := cmd.Wait(); err == nil && waitErr != nil {
+			err = waitErr
+		}
+	}()
+
+	scanner := bufio.NewScanner(stdout)
+	hadSample := false
+	for scanner.Scan() {
+		line := bytes.TrimSpace(scanner.Bytes())
+		if len(line) == 0 {
+			continue
+		}
+		if gm.parseMacmonLine(line) {
+			hadSample = true
+		}
+	}
+	if scanErr := scanner.Err(); scanErr != nil {
+		return scanErr
+	}
+	if !hadSample {
+		return errNoValidData
+	}
+	return nil
+}
+
+// parseMacmonLine parses a single macmon JSON line and updates Apple GPU metrics.
+// It returns false, leaving the metrics untouched, if the line cannot be decoded or
+// carries no non-zero GPU usage, power, or temperature.
+func (gm *GPUManager) parseMacmonLine(line []byte) bool {
+	var sample macmonSample
+	if err := json.Unmarshal(line, &sample); err != nil {
+		return false
+	}
+
+	usage := 0.0
+	if len(sample.GPUUsage) >= 2 {
+		usage = sample.GPUUsage[1]
+		// Heuristic: macmon typically reports 0..1; convert to percentage.
+		if usage <= 1.0 {
+			usage *= 100
+		}
+	}
+
+	// Consider the line valid if it contains at least one GPU metric.
+	if usage == 0 && sample.GPUPower == 0 && sample.Temp.GPUTempAvg == 0 {
+		return false
+	}
+
+	gm.Lock()
+	defer gm.Unlock()
+
+	gpu := gm.appleGPU()
+	gpu.Temperature = sample.Temp.GPUTempAvg
+	gpu.Usage += usage
+	// macmon reports power in watts; include VRAM power if present.
+	gpu.Power += sample.GPUPower + sample.GPURAMPower
+	gpu.Count++
+	return true
+}
diff --git a/agent/gpu_darwin_test.go b/agent/gpu_darwin_test.go
new file mode 100644
index 00000000..0f82a2d7
--- /dev/null
+++ b/agent/gpu_darwin_test.go
@@ -0,0 +1,81 @@
+//go:build darwin
+
+package agent
+
+import (
+	"testing"
+
+	"github.com/henrygd/beszel/internal/entities/system"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestParsePowermetricsData(t *testing.T) {
+	input := `
+Machine model: Mac14,10
+OS version: 25D125
+
+*** Sampled system activity (Sat Feb 14 00:42:06 2026 -0500) (503.05ms elapsed) ***
+
+**** GPU usage ****
+
+GPU HW active frequency: 444 MHz
+GPU HW active residency: 0.97% (444 MHz: .97% 612 MHz: 0% 808 MHz: 0% 968 MHz: 0% 1110 MHz: 0% 1236 MHz: 0% 1338 MHz: 0% 1398 MHz: 0%)
+GPU SW requested state: (P1 : 100% P2 : 0% P3 : 0% P4 : 0% P5 : 0% P6 : 0% P7 : 0% P8 : 0%)
+GPU idle residency: 99.03%
+GPU Power: 4 mW
+`
+	gm := &GPUManager{
+		GpuDataMap: make(map[string]*system.GPUData),
+	}
+	valid := gm.parsePowermetricsData([]byte(input))
+	require.True(t, valid)
+
+	g0, ok := gm.GpuDataMap["0"]
+	require.True(t, ok)
+	assert.Equal(t, "Apple GPU", g0.Name)
+	// Usage = 100 - 99.03 = 0.97
+	assert.InDelta(t, 0.97, g0.Usage, 0.01)
+	// 4 mW -> 0.004 W
+	assert.InDelta(t, 0.004, g0.Power, 0.0001)
+	assert.Equal(t, 1.0, g0.Count)
+}
+
+func TestParsePowermetricsDataPartial(t *testing.T) {
+	// Only power line (e.g. older macOS or different sampler output)
+	input := `
+**** GPU usage ****
+GPU Power: 120 mW
+`
+	gm := &GPUManager{
+		GpuDataMap: make(map[string]*system.GPUData),
+	}
+	valid := gm.parsePowermetricsData([]byte(input))
+	require.True(t, valid)
+
+	g0, ok := gm.GpuDataMap["0"]
+	require.True(t, ok)
+	assert.Equal(t, "Apple GPU", g0.Name)
+	assert.InDelta(t, 0.12, g0.Power, 0.001)
+	assert.Equal(t, 1.0, g0.Count)
+}
+
+func TestParseMacmonLine(t *testing.T) {
+	input := `{"all_power":0.6468324661254883,"ane_power":0.0,"cpu_power":0.6359732151031494,"ecpu_usage":[2061,0.1726151406764984],"gpu_power":0.010859241709113121,"gpu_ram_power":0.000965250947047025,"gpu_usage":[503,0.013633215799927711],"memory":{"ram_total":17179869184,"ram_usage":12322914304,"swap_total":0,"swap_usage":0},"pcpu_usage":[1248,0.11792058497667313],"ram_power":0.14885640144348145,"sys_power":10.4955415725708,"temp":{"cpu_temp_avg":23.041261672973633,"gpu_temp_avg":29.44516944885254},"timestamp":"2026-02-17T19:34:27.942556+00:00"}`
+
+	gm := &GPUManager{
+		GpuDataMap: make(map[string]*system.GPUData),
+	}
+	valid := gm.parseMacmonLine([]byte(input))
+	require.True(t, valid)
+
+	g0, ok := gm.GpuDataMap["0"]
+	require.True(t, ok)
+	assert.Equal(t, "Apple GPU", g0.Name)
+	// macmon reports usage fraction 0..1; expect percent conversion.
+	assert.InDelta(t, 1.3633, g0.Usage, 0.05)
+	// power includes gpu_power + gpu_ram_power
+	assert.InDelta(t, 0.011824, g0.Power, 0.0005)
+	assert.InDelta(t, 29.445, g0.Temperature, 0.01)
+	assert.Equal(t, 1.0, g0.Count)
+}
diff --git a/readme.md b/readme.md
index 6b9740fc..a524b474 100644
--- a/readme.md
+++ b/readme.md
@@ -48,7 +48,7 @@ The [quick start guide](https://beszel.dev/guide/getting-started) and other docu
 - **Network usage** - Host system and containers.
 - **Load average** - Host system.
 - **Temperature** - Host system sensors.
-- **GPU usage / power draw** - Nvidia, AMD, and Intel.
+- **GPU usage / power draw** - Nvidia, AMD, Intel, and Apple (macOS via macmon or powermetrics).
 - **Battery** - Host system battery charge.
 - **Containers** - Status and metrics of all running Docker / Podman containers.
 - **S.M.A.R.T.** - Host system disk health (includes eMMC wear/EOL via Linux sysfs when available).