mirror of
https://github.com/henrygd/beszel.git
synced 2026-04-10 23:11:50 +02:00
Compare commits
4 Commits
10d853c004
...
apple-gpu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c561aef409 | ||
|
|
f792f9b102 | ||
|
|
1def7d8d3a | ||
|
|
ef92b254bf |
@@ -72,6 +72,7 @@ type dockerManager struct {
|
|||||||
// cacheTimeMs -> DeltaTracker for network bytes sent/received
|
// cacheTimeMs -> DeltaTracker for network bytes sent/received
|
||||||
networkSentTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
networkSentTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
||||||
networkRecvTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
networkRecvTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
||||||
|
retrySleep func(time.Duration)
|
||||||
}
|
}
|
||||||
|
|
||||||
// userAgentRoundTripper is a custom http.RoundTripper that adds a User-Agent header to all requests
|
// userAgentRoundTripper is a custom http.RoundTripper that adds a User-Agent header to all requests
|
||||||
@@ -565,6 +566,7 @@ func newDockerManager() *dockerManager {
|
|||||||
lastCpuReadTime: make(map[uint16]map[string]time.Time),
|
lastCpuReadTime: make(map[uint16]map[string]time.Time),
|
||||||
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
||||||
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
||||||
|
retrySleep: time.Sleep,
|
||||||
}
|
}
|
||||||
|
|
||||||
// If using podman, return client
|
// If using podman, return client
|
||||||
@@ -574,7 +576,7 @@ func newDockerManager() *dockerManager {
|
|||||||
return manager
|
return manager
|
||||||
}
|
}
|
||||||
|
|
||||||
// this can take up to 5 seconds with retry, so run in goroutine
|
// run version check in goroutine to avoid blocking (server may not be ready and requires retries)
|
||||||
go manager.checkDockerVersion()
|
go manager.checkDockerVersion()
|
||||||
|
|
||||||
// give version check a chance to complete before returning
|
// give version check a chance to complete before returning
|
||||||
@@ -594,18 +596,18 @@ func (dm *dockerManager) checkDockerVersion() {
|
|||||||
const versionMaxTries = 2
|
const versionMaxTries = 2
|
||||||
for i := 1; i <= versionMaxTries; i++ {
|
for i := 1; i <= versionMaxTries; i++ {
|
||||||
resp, err = dm.client.Get("http://localhost/version")
|
resp, err = dm.client.Get("http://localhost/version")
|
||||||
if err == nil {
|
if err == nil && resp.StatusCode == http.StatusOK {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if resp != nil {
|
if resp != nil {
|
||||||
resp.Body.Close()
|
resp.Body.Close()
|
||||||
}
|
}
|
||||||
if i < versionMaxTries {
|
if i < versionMaxTries {
|
||||||
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "error", err)
|
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "err", err, "response", resp)
|
||||||
time.Sleep(5 * time.Second)
|
dm.retrySleep(5 * time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil || resp.StatusCode != http.StatusOK {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if err := dm.decode(resp, &versionInfo); err != nil {
|
if err := dm.decode(resp, &versionInfo); err != nil {
|
||||||
|
|||||||
@@ -5,7 +5,13 @@ package agent
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -379,6 +385,117 @@ func TestDockerManagerCreation(t *testing.T) {
|
|||||||
assert.NotNil(t, dm.networkRecvTrackers)
|
assert.NotNil(t, dm.networkRecvTrackers)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCheckDockerVersion(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
responses []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}
|
||||||
|
expectedGood bool
|
||||||
|
expectedRequests int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "200 with good version on first try",
|
||||||
|
responses: []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}{
|
||||||
|
{http.StatusOK, `{"Version":"25.0.1"}`},
|
||||||
|
},
|
||||||
|
expectedGood: true,
|
||||||
|
expectedRequests: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "200 with old version on first try",
|
||||||
|
responses: []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}{
|
||||||
|
{http.StatusOK, `{"Version":"24.0.7"}`},
|
||||||
|
},
|
||||||
|
expectedGood: false,
|
||||||
|
expectedRequests: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "non-200 then 200 with good version",
|
||||||
|
responses: []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}{
|
||||||
|
{http.StatusServiceUnavailable, `"not ready"`},
|
||||||
|
{http.StatusOK, `{"Version":"25.1.0"}`},
|
||||||
|
},
|
||||||
|
expectedGood: true,
|
||||||
|
expectedRequests: 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "non-200 on all retries",
|
||||||
|
responses: []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}{
|
||||||
|
{http.StatusInternalServerError, `"error"`},
|
||||||
|
{http.StatusUnauthorized, `"error"`},
|
||||||
|
},
|
||||||
|
expectedGood: false,
|
||||||
|
expectedRequests: 2,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
requestCount := 0
|
||||||
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
idx := requestCount
|
||||||
|
requestCount++
|
||||||
|
if idx >= len(tt.responses) {
|
||||||
|
idx = len(tt.responses) - 1
|
||||||
|
}
|
||||||
|
w.WriteHeader(tt.responses[idx].statusCode)
|
||||||
|
fmt.Fprint(w, tt.responses[idx].body)
|
||||||
|
}))
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
dm := &dockerManager{
|
||||||
|
client: &http.Client{
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: func(_ context.Context, network, _ string) (net.Conn, error) {
|
||||||
|
return net.Dial(network, server.Listener.Addr().String())
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
retrySleep: func(time.Duration) {},
|
||||||
|
}
|
||||||
|
|
||||||
|
dm.checkDockerVersion()
|
||||||
|
|
||||||
|
assert.Equal(t, tt.expectedGood, dm.goodDockerVersion)
|
||||||
|
assert.Equal(t, tt.expectedRequests, requestCount)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("request error on all retries", func(t *testing.T) {
|
||||||
|
requestCount := 0
|
||||||
|
dm := &dockerManager{
|
||||||
|
client: &http.Client{
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
|
||||||
|
requestCount++
|
||||||
|
return nil, errors.New("connection refused")
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
retrySleep: func(time.Duration) {},
|
||||||
|
}
|
||||||
|
|
||||||
|
dm.checkDockerVersion()
|
||||||
|
|
||||||
|
assert.False(t, dm.goodDockerVersion)
|
||||||
|
assert.Equal(t, 2, requestCount)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func TestCycleCpuDeltas(t *testing.T) {
|
func TestCycleCpuDeltas(t *testing.T) {
|
||||||
dm := &dockerManager{
|
dm := &dockerManager{
|
||||||
lastCpuContainer: map[uint16]map[string]uint64{
|
lastCpuContainer: map[uint16]map[string]uint64{
|
||||||
|
|||||||
83
agent/gpu.go
83
agent/gpu.go
@@ -9,6 +9,7 @@ import (
|
|||||||
"maps"
|
"maps"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"runtime"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -19,11 +20,13 @@ import (
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
// Commands
|
// Commands
|
||||||
nvidiaSmiCmd string = "nvidia-smi"
|
nvidiaSmiCmd string = "nvidia-smi"
|
||||||
rocmSmiCmd string = "rocm-smi"
|
rocmSmiCmd string = "rocm-smi"
|
||||||
tegraStatsCmd string = "tegrastats"
|
tegraStatsCmd string = "tegrastats"
|
||||||
nvtopCmd string = "nvtop"
|
nvtopCmd string = "nvtop"
|
||||||
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
|
powermetricsCmd string = "powermetrics"
|
||||||
|
macmonCmd string = "macmon"
|
||||||
|
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
|
||||||
|
|
||||||
// Command retry and timeout constants
|
// Command retry and timeout constants
|
||||||
retryWaitTime time.Duration = 5 * time.Second
|
retryWaitTime time.Duration = 5 * time.Second
|
||||||
@@ -82,15 +85,18 @@ var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing
|
|||||||
type collectorSource string
|
type collectorSource string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
|
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
|
||||||
collectorSourceNVML collectorSource = "nvml"
|
collectorSourceNVML collectorSource = "nvml"
|
||||||
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
|
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
|
||||||
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
|
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
|
||||||
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
|
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
|
||||||
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
|
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
|
||||||
collectorGroupNvidia string = "nvidia"
|
collectorSourceMacmon collectorSource = collectorSource(macmonCmd)
|
||||||
collectorGroupIntel string = "intel"
|
collectorSourcePowermetrics collectorSource = collectorSource(powermetricsCmd)
|
||||||
collectorGroupAmd string = "amd"
|
collectorGroupNvidia string = "nvidia"
|
||||||
|
collectorGroupIntel string = "intel"
|
||||||
|
collectorGroupAmd string = "amd"
|
||||||
|
collectorGroupApple string = "apple"
|
||||||
)
|
)
|
||||||
|
|
||||||
func isValidCollectorSource(source collectorSource) bool {
|
func isValidCollectorSource(source collectorSource) bool {
|
||||||
@@ -100,7 +106,9 @@ func isValidCollectorSource(source collectorSource) bool {
|
|||||||
collectorSourceNvidiaSMI,
|
collectorSourceNvidiaSMI,
|
||||||
collectorSourceIntelGpuTop,
|
collectorSourceIntelGpuTop,
|
||||||
collectorSourceAmdSysfs,
|
collectorSourceAmdSysfs,
|
||||||
collectorSourceRocmSMI:
|
collectorSourceRocmSMI,
|
||||||
|
collectorSourceMacmon,
|
||||||
|
collectorSourcePowermetrics:
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
@@ -108,12 +116,14 @@ func isValidCollectorSource(source collectorSource) bool {
|
|||||||
|
|
||||||
// gpuCapabilities describes detected GPU tooling and sysfs support on the host.
|
// gpuCapabilities describes detected GPU tooling and sysfs support on the host.
|
||||||
type gpuCapabilities struct {
|
type gpuCapabilities struct {
|
||||||
hasNvidiaSmi bool
|
hasNvidiaSmi bool
|
||||||
hasRocmSmi bool
|
hasRocmSmi bool
|
||||||
hasAmdSysfs bool
|
hasAmdSysfs bool
|
||||||
hasTegrastats bool
|
hasTegrastats bool
|
||||||
hasIntelGpuTop bool
|
hasIntelGpuTop bool
|
||||||
hasNvtop bool
|
hasNvtop bool
|
||||||
|
hasMacmon bool
|
||||||
|
hasPowermetrics bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type collectorDefinition struct {
|
type collectorDefinition struct {
|
||||||
@@ -449,11 +459,19 @@ func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
|
|||||||
if _, err := exec.LookPath(nvtopCmd); err == nil {
|
if _, err := exec.LookPath(nvtopCmd); err == nil {
|
||||||
caps.hasNvtop = true
|
caps.hasNvtop = true
|
||||||
}
|
}
|
||||||
|
if runtime.GOOS == "darwin" {
|
||||||
|
if _, err := exec.LookPath(macmonCmd); err == nil {
|
||||||
|
caps.hasMacmon = true
|
||||||
|
}
|
||||||
|
if _, err := exec.LookPath(powermetricsCmd); err == nil {
|
||||||
|
caps.hasPowermetrics = true
|
||||||
|
}
|
||||||
|
}
|
||||||
return caps
|
return caps
|
||||||
}
|
}
|
||||||
|
|
||||||
func hasAnyGpuCollector(caps gpuCapabilities) bool {
|
func hasAnyGpuCollector(caps gpuCapabilities) bool {
|
||||||
return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop
|
return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop || caps.hasMacmon || caps.hasPowermetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
func (gm *GPUManager) startIntelCollector() {
|
func (gm *GPUManager) startIntelCollector() {
|
||||||
@@ -567,6 +585,22 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo
|
|||||||
return true
|
return true
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
collectorSourceMacmon: {
|
||||||
|
group: collectorGroupApple,
|
||||||
|
available: caps.hasMacmon,
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
gm.startMacmonCollector()
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
collectorSourcePowermetrics: {
|
||||||
|
group: collectorGroupApple,
|
||||||
|
available: caps.hasPowermetrics,
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
gm.startPowermetricsCollector()
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -674,7 +708,10 @@ func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []col
|
|||||||
priorities = append(priorities, collectorSourceIntelGpuTop)
|
priorities = append(priorities, collectorSourceIntelGpuTop)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keep nvtop as a legacy last resort only when no vendor collector exists.
|
// Apple collectors are currently opt-in only.
|
||||||
|
// Enable them with GPU_COLLECTOR=macmon or GPU_COLLECTOR=powermetrics.
|
||||||
|
|
||||||
|
// Keep nvtop as a last resort only when no vendor collector exists.
|
||||||
if len(priorities) == 0 && caps.hasNvtop {
|
if len(priorities) == 0 && caps.hasNvtop {
|
||||||
priorities = append(priorities, collectorSourceNVTop)
|
priorities = append(priorities, collectorSourceNVTop)
|
||||||
}
|
}
|
||||||
|
|||||||
9
agent/gpu_apple_unsupported.go
Normal file
9
agent/gpu_apple_unsupported.go
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
//go:build !darwin
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
// startPowermetricsCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
|
||||||
|
func (gm *GPUManager) startPowermetricsCollector() {}
|
||||||
|
|
||||||
|
// startMacmonCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
|
||||||
|
func (gm *GPUManager) startMacmonCollector() {}
|
||||||
252
agent/gpu_darwin.go
Normal file
252
agent/gpu_darwin.go
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
//go:build darwin
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/entities/system"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Tunables and identifiers for the macOS GPU collectors.
const (
	// powermetricsSampleIntervalMs is handed to powermetrics via -i (milliseconds).
	powermetricsSampleIntervalMs = 500

	// powermetricsPollInterval is the pause between two successful powermetrics runs.
	powermetricsPollInterval = 2000 * time.Millisecond

	// macmonIntervalMs is handed to `macmon pipe` via -i (milliseconds).
	macmonIntervalMs = 2500

	// appleGPUID is the GpuDataMap key under which the Apple GPU is reported.
	appleGPUID = "0"
)
|
||||||
|
|
||||||
|
// startPowermetricsCollector runs powermetrics --samplers gpu_power in a loop and updates
|
||||||
|
// GPU usage and power. Requires root (sudo) on macOS. A single logical GPU is reported as id "0".
|
||||||
|
func (gm *GPUManager) startPowermetricsCollector() {
|
||||||
|
// Ensure single GPU entry for Apple GPU
|
||||||
|
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||||
|
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
failures := 0
|
||||||
|
for {
|
||||||
|
if err := gm.collectPowermetrics(); err != nil {
|
||||||
|
failures++
|
||||||
|
if failures > maxFailureRetries {
|
||||||
|
slog.Warn("powermetrics GPU collector failed repeatedly, stopping", "err", err)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
slog.Warn("Error collecting macOS GPU data via powermetrics (may require sudo)", "err", err)
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
failures = 0
|
||||||
|
time.Sleep(powermetricsPollInterval)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectPowermetrics runs powermetrics once and parses GPU usage and power from its output.
|
||||||
|
func (gm *GPUManager) collectPowermetrics() error {
|
||||||
|
interval := strconv.Itoa(powermetricsSampleIntervalMs)
|
||||||
|
cmd := exec.Command(powermetricsCmd, "--samplers", "gpu_power", "-i", interval, "-n", "1")
|
||||||
|
cmd.Stderr = nil
|
||||||
|
out, err := cmd.Output()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !gm.parsePowermetricsData(out) {
|
||||||
|
return errNoValidData
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parsePowermetricsData parses powermetrics gpu_power output and updates GpuDataMap["0"].
|
||||||
|
// Example output:
|
||||||
|
//
|
||||||
|
// **** GPU usage ****
|
||||||
|
// GPU HW active frequency: 444 MHz
|
||||||
|
// GPU HW active residency: 0.97% (444 MHz: .97% ...
|
||||||
|
// GPU idle residency: 99.03%
|
||||||
|
// GPU Power: 4 mW
|
||||||
|
func (gm *GPUManager) parsePowermetricsData(output []byte) bool {
|
||||||
|
var idleResidency, powerMW float64
|
||||||
|
var gotIdle, gotPower bool
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(bytes.NewReader(output))
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := strings.TrimSpace(scanner.Text())
|
||||||
|
if strings.HasPrefix(line, "GPU idle residency:") {
|
||||||
|
// "GPU idle residency: 99.03%"
|
||||||
|
fields := strings.Fields(strings.TrimPrefix(line, "GPU idle residency:"))
|
||||||
|
if len(fields) >= 1 {
|
||||||
|
pct := strings.TrimSuffix(fields[0], "%")
|
||||||
|
if v, err := strconv.ParseFloat(pct, 64); err == nil {
|
||||||
|
idleResidency = v
|
||||||
|
gotIdle = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if strings.HasPrefix(line, "GPU Power:") {
|
||||||
|
// "GPU Power: 4 mW"
|
||||||
|
fields := strings.Fields(strings.TrimPrefix(line, "GPU Power:"))
|
||||||
|
if len(fields) >= 1 {
|
||||||
|
if v, err := strconv.ParseFloat(fields[0], 64); err == nil {
|
||||||
|
powerMW = v
|
||||||
|
gotPower = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !gotIdle && !gotPower {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
gm.Lock()
|
||||||
|
defer gm.Unlock()
|
||||||
|
|
||||||
|
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||||
|
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||||
|
}
|
||||||
|
gpu := gm.GpuDataMap[appleGPUID]
|
||||||
|
|
||||||
|
if gotIdle {
|
||||||
|
// Usage = 100 - idle residency (e.g. 100 - 99.03 = 0.97%)
|
||||||
|
gpu.Usage += 100 - idleResidency
|
||||||
|
}
|
||||||
|
if gotPower {
|
||||||
|
// mW -> W
|
||||||
|
gpu.Power += powerMW / milliwattsInAWatt
|
||||||
|
}
|
||||||
|
gpu.Count++
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// startMacmonCollector runs `macmon pipe` in a loop and parses one JSON object per line.
|
||||||
|
// This collector does not require sudo. A single logical GPU is reported as id "0".
|
||||||
|
func (gm *GPUManager) startMacmonCollector() {
|
||||||
|
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||||
|
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
failures := 0
|
||||||
|
for {
|
||||||
|
if err := gm.collectMacmonPipe(); err != nil {
|
||||||
|
failures++
|
||||||
|
if failures > maxFailureRetries {
|
||||||
|
slog.Warn("macmon GPU collector failed repeatedly, stopping", "err", err)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
slog.Warn("Error collecting macOS GPU data via macmon", "err", err)
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
failures = 0
|
||||||
|
// `macmon pipe` is long-running; if it returns, wait a bit before restarting.
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// JSON shapes decoded from each line emitted by `macmon pipe`. Only the
// GPU-related fields are mapped; any other keys in the line are ignored by the
// decoder.
type (
	// macmonTemp is the nested "temp" object of a macmon sample.
	macmonTemp struct {
		GPUTempAvg float64 `json:"gpu_temp_avg"`
	}

	// macmonSample is one macmon reading.
	macmonSample struct {
		// GPUPower is the GPU power draw in watts (macmon reports fractional values).
		GPUPower float64 `json:"gpu_power"`
		// GPURAMPower is the GPU memory power draw in watts.
		GPURAMPower float64 `json:"gpu_ram_power"`
		// GPUUsage is [freq_mhz, usage]; usage is typically in the range 0..1.
		GPUUsage []float64 `json:"gpu_usage"`
		// Temp carries the temperature readings.
		Temp macmonTemp `json:"temp"`
	}
)
|
||||||
|
|
||||||
|
func (gm *GPUManager) collectMacmonPipe() (err error) {
|
||||||
|
cmd := exec.Command(macmonCmd, "pipe", "-i", strconv.Itoa(macmonIntervalMs))
|
||||||
|
// Avoid blocking if macmon writes to stderr.
|
||||||
|
cmd.Stderr = io.Discard
|
||||||
|
stdout, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure we always reap the child to avoid zombies on any return path and
|
||||||
|
// propagate a non-zero exit code if no other error was set.
|
||||||
|
defer func() {
|
||||||
|
_ = stdout.Close()
|
||||||
|
if cmd.ProcessState == nil || !cmd.ProcessState.Exited() {
|
||||||
|
_ = cmd.Process.Kill()
|
||||||
|
}
|
||||||
|
if waitErr := cmd.Wait(); err == nil && waitErr != nil {
|
||||||
|
err = waitErr
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(stdout)
|
||||||
|
var hadSample bool
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := bytes.TrimSpace(scanner.Bytes())
|
||||||
|
if len(line) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if gm.parseMacmonLine(line) {
|
||||||
|
hadSample = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if scanErr := scanner.Err(); scanErr != nil {
|
||||||
|
return scanErr
|
||||||
|
}
|
||||||
|
if !hadSample {
|
||||||
|
return errNoValidData
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseMacmonLine parses a single macmon JSON line and updates Apple GPU metrics.
|
||||||
|
func (gm *GPUManager) parseMacmonLine(line []byte) bool {
|
||||||
|
var sample macmonSample
|
||||||
|
if err := json.Unmarshal(line, &sample); err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
usage := 0.0
|
||||||
|
if len(sample.GPUUsage) >= 2 {
|
||||||
|
usage = sample.GPUUsage[1]
|
||||||
|
// Heuristic: macmon typically reports 0..1; convert to percentage.
|
||||||
|
if usage <= 1.0 {
|
||||||
|
usage *= 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consider the line valid if it contains at least one GPU metric.
|
||||||
|
if usage == 0 && sample.GPUPower == 0 && sample.Temp.GPUTempAvg == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
gm.Lock()
|
||||||
|
defer gm.Unlock()
|
||||||
|
|
||||||
|
gpu, ok := gm.GpuDataMap[appleGPUID]
|
||||||
|
if !ok {
|
||||||
|
gpu = &system.GPUData{Name: "Apple GPU"}
|
||||||
|
gm.GpuDataMap[appleGPUID] = gpu
|
||||||
|
}
|
||||||
|
gpu.Temperature = sample.Temp.GPUTempAvg
|
||||||
|
gpu.Usage += usage
|
||||||
|
// macmon reports power in watts; include VRAM power if present.
|
||||||
|
gpu.Power += sample.GPUPower + sample.GPURAMPower
|
||||||
|
gpu.Count++
|
||||||
|
return true
|
||||||
|
}
|
||||||
81
agent/gpu_darwin_test.go
Normal file
81
agent/gpu_darwin_test.go
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
//go:build darwin
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/entities/system"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParsePowermetricsData(t *testing.T) {
|
||||||
|
input := `
|
||||||
|
Machine model: Mac14,10
|
||||||
|
OS version: 25D125
|
||||||
|
|
||||||
|
*** Sampled system activity (Sat Feb 14 00:42:06 2026 -0500) (503.05ms elapsed) ***
|
||||||
|
|
||||||
|
**** GPU usage ****
|
||||||
|
|
||||||
|
GPU HW active frequency: 444 MHz
|
||||||
|
GPU HW active residency: 0.97% (444 MHz: .97% 612 MHz: 0% 808 MHz: 0% 968 MHz: 0% 1110 MHz: 0% 1236 MHz: 0% 1338 MHz: 0% 1398 MHz: 0%)
|
||||||
|
GPU SW requested state: (P1 : 100% P2 : 0% P3 : 0% P4 : 0% P5 : 0% P6 : 0% P7 : 0% P8 : 0%)
|
||||||
|
GPU idle residency: 99.03%
|
||||||
|
GPU Power: 4 mW
|
||||||
|
`
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
valid := gm.parsePowermetricsData([]byte(input))
|
||||||
|
require.True(t, valid)
|
||||||
|
|
||||||
|
g0, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "Apple GPU", g0.Name)
|
||||||
|
// Usage = 100 - 99.03 = 0.97
|
||||||
|
assert.InDelta(t, 0.97, g0.Usage, 0.01)
|
||||||
|
// 4 mW -> 0.004 W
|
||||||
|
assert.InDelta(t, 0.004, g0.Power, 0.0001)
|
||||||
|
assert.Equal(t, 1.0, g0.Count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParsePowermetricsDataPartial(t *testing.T) {
|
||||||
|
// Only power line (e.g. older macOS or different sampler output)
|
||||||
|
input := `
|
||||||
|
**** GPU usage ****
|
||||||
|
GPU Power: 120 mW
|
||||||
|
`
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
valid := gm.parsePowermetricsData([]byte(input))
|
||||||
|
require.True(t, valid)
|
||||||
|
|
||||||
|
g0, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "Apple GPU", g0.Name)
|
||||||
|
assert.InDelta(t, 0.12, g0.Power, 0.001)
|
||||||
|
assert.Equal(t, 1.0, g0.Count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseMacmonLine(t *testing.T) {
|
||||||
|
input := `{"all_power":0.6468324661254883,"ane_power":0.0,"cpu_power":0.6359732151031494,"ecpu_usage":[2061,0.1726151406764984],"gpu_power":0.010859241709113121,"gpu_ram_power":0.000965250947047025,"gpu_usage":[503,0.013633215799927711],"memory":{"ram_total":17179869184,"ram_usage":12322914304,"swap_total":0,"swap_usage":0},"pcpu_usage":[1248,0.11792058497667313],"ram_power":0.14885640144348145,"sys_power":10.4955415725708,"temp":{"cpu_temp_avg":23.041261672973633,"gpu_temp_avg":29.44516944885254},"timestamp":"2026-02-17T19:34:27.942556+00:00"}`
|
||||||
|
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
valid := gm.parseMacmonLine([]byte(input))
|
||||||
|
require.True(t, valid)
|
||||||
|
|
||||||
|
g0, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "Apple GPU", g0.Name)
|
||||||
|
// macmon reports usage fraction 0..1; expect percent conversion.
|
||||||
|
assert.InDelta(t, 1.3633, g0.Usage, 0.05)
|
||||||
|
// power includes gpu_power + gpu_ram_power
|
||||||
|
assert.InDelta(t, 0.011824, g0.Power, 0.0005)
|
||||||
|
assert.InDelta(t, 29.445, g0.Temperature, 0.01)
|
||||||
|
assert.Equal(t, 1.0, g0.Count)
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user