initial support for one intel gpu with intel_gpu_top

This commit is contained in:
henrygd
2025-09-22 16:36:10 -04:00
parent 240e75f025
commit 3abb7c213b
8 changed files with 355 additions and 71 deletions

View File

@@ -27,13 +27,10 @@ const (
nvidiaSmiInterval string = "4" // in seconds
tegraStatsInterval string = "3700" // in milliseconds
rocmSmiInterval time.Duration = 4300 * time.Millisecond
// Command retry and timeout constants
retryWaitTime time.Duration = 5 * time.Second
maxFailureRetries int = 5
cmdBufferSize uint16 = 10 * 1024
// Unit Conversions
mebibytesInAMegabyte float64 = 1.024 // nvidia-smi reports memory in MiB
milliwattsInAWatt float64 = 1000.0 // tegrastats reports power in mW
@@ -42,10 +39,11 @@ const (
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
type GPUManager struct {
sync.Mutex
nvidiaSmi bool
rocmSmi bool
tegrastats bool
GpuDataMap map[string]*system.GPUData
nvidiaSmi bool
rocmSmi bool
tegrastats bool
intelGpuStats bool
GpuDataMap map[string]*system.GPUData
}
// RocmSmiJson represents the JSON structure of rocm-smi output
@@ -66,6 +64,7 @@ type gpuCollector struct {
cmdArgs []string
parse func([]byte) bool // returns true if valid data was found
buf []byte
bufSize uint16
}
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
@@ -99,7 +98,7 @@ func (c *gpuCollector) collect() error {
scanner := bufio.NewScanner(stdout)
if c.buf == nil {
c.buf = make([]byte, 0, cmdBufferSize)
c.buf = make([]byte, 0, c.bufSize)
}
scanner.Buffer(c.buf, bufio.MaxScanTokenSize)
@@ -244,20 +243,31 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
// copy / reset the data
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
for id, gpu := range gm.GpuDataMap {
gpuAvg := *gpu
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
// avoid division by zero
if gpu.Count > 0 {
gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count)
gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count)
count := max(gpu.Count, 1)
// average the data
gpuAvg := *gpu
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
gpuAvg.Power = twoDecimals(gpu.Power / count)
// intel gpu stats doesn't provide usage, memory used, or memory total
if gpu.Engines != nil {
maxEngineUsage := 0.0
for name, engine := range gpu.Engines {
gpuAvg.Engines[name] = twoDecimals(engine / count)
maxEngineUsage = max(maxEngineUsage, engine/count)
}
gpuAvg.Usage = twoDecimals(maxEngineUsage)
} else {
gpuAvg.Usage = twoDecimals(gpu.Usage / count)
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
}
// reset accumulators in the original
gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0
// reset accumulators in the original gpu data for next collection
gpu.Usage, gpu.Power, gpu.Count = gpuAvg.Usage, gpuAvg.Power, 1
gpu.Engines = gpuAvg.Engines
// append id to the name if there are multiple GPUs with the same name
if nameCounts[gpu.Name] > 1 {
@@ -284,18 +294,37 @@ func (gm *GPUManager) detectGPUs() error {
gm.tegrastats = true
gm.nvidiaSmi = false
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
gm.intelGpuStats = true
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats {
return nil
}
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top")
}
// startCollector starts the appropriate GPU data collector based on the command
func (gm *GPUManager) startCollector(command string) {
collector := gpuCollector{
name: command,
name: command,
bufSize: 10 * 1024,
}
switch command {
case intelGpuStatsCmd:
go func() {
failures := 0
for {
if err := gm.collectIntelStats(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
time.Sleep(retryWaitTime)
continue
}
}
}()
case nvidiaSmiCmd:
collector.cmdArgs = []string{
"-l", nvidiaSmiInterval,
@@ -344,6 +373,9 @@ func NewGPUManager() (*GPUManager, error) {
if gm.tegrastats {
gm.startCollector(tegraStatsCmd)
}
if gm.intelGpuStats {
gm.startCollector(intelGpuStatsCmd)
}
return &gm, nil
}

102
agent/gpu_intel.go Normal file
View File

@@ -0,0 +1,102 @@
package agent
import (
"encoding/json"
"fmt"
"os/exec"
"github.com/henrygd/beszel/internal/entities/system"
)
const (
intelGpuStatsCmd string = "intel_gpu_top"
intelGpuStatsInterval string = "3300" // in milliseconds
)
type intelGpuStats struct {
Power struct {
GPU float64 `json:"GPU"`
} `json:"power"`
Engines map[string]struct {
Busy float64 `json:"busy"`
} `json:"engines"`
}
// updateIntelFromStats updates aggregated GPU data from a single intelGpuStats sample
func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool {
gm.Lock()
defer gm.Unlock()
// only one gpu for now - cmd doesn't provide all by default
gpuData, ok := gm.GpuDataMap["0"]
if !ok {
gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)}
gm.GpuDataMap["0"] = gpuData
}
if sample.Power.GPU > 0 {
gpuData.Power += sample.Power.GPU
}
if gpuData.Engines == nil {
gpuData.Engines = make(map[string]float64, len(sample.Engines))
}
for name, engine := range sample.Engines {
gpuData.Engines[name] += engine.Busy
}
gpuData.Count++
return true
}
// collectIntelStats executes intel_gpu_top in JSON mode and stream-decodes the array of samples
func (gm *GPUManager) collectIntelStats() error {
cmd := exec.Command(intelGpuStatsCmd, "-s", intelGpuStatsInterval, "-J")
stdout, err := cmd.StdoutPipe()
if err != nil {
return err
}
if err := cmd.Start(); err != nil {
return err
}
dec := json.NewDecoder(stdout)
// Expect a JSON array stream: [ { ... }, { ... }, ... ]
tok, err := dec.Token()
if err != nil {
return err
}
if delim, ok := tok.(json.Delim); !ok || delim != '[' {
return fmt.Errorf("unexpected JSON start token: %v", tok)
}
var sample intelGpuStats
for {
if dec.More() {
// Clear the engines map before decoding
if sample.Engines != nil {
for k := range sample.Engines {
delete(sample.Engines, k)
}
}
if err := dec.Decode(&sample); err != nil {
return fmt.Errorf("decode intel gpu: %w", err)
}
gm.updateIntelFromStats(&sample)
continue
}
// Attempt to read closing bracket (will only be present when process exits)
tok, err = dec.Token()
if err != nil {
// When the process is still running, decoder will block in More/Decode; any error here is terminal
return err
}
if delim, ok := tok.(json.Delim); ok && delim == ']' {
break
}
}
return cmd.Wait()
}

View File

@@ -379,12 +379,12 @@ func TestGetCurrentData(t *testing.T) {
assert.InDelta(t, 60.0, result["1"].Power, 0.01)
// Verify that accumulators in the original map are reset
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset")
assert.EqualValues(t, float64(1), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset")
assert.EqualValues(t, float64(50.0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset")
assert.Equal(t, float64(100.0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset")
assert.Equal(t, float64(1), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset")
assert.Equal(t, float64(30), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset")
assert.Equal(t, float64(60), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset")
})
t.Run("handles zero count without panicking", func(t *testing.T) {
@@ -409,7 +409,7 @@ func TestGetCurrentData(t *testing.T) {
assert.Equal(t, 0.0, result["0"].Power)
// Verify reset count
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count)
assert.EqualValues(t, 1, gm.GpuDataMap["0"].Count)
})
}
@@ -779,16 +779,109 @@ func TestAccumulation(t *testing.T) {
}
// Verify that accumulators in the original map are reset
for id := range tt.expectedValues {
for id, expected := range tt.expectedValues {
gpu, exists := gm.GpuDataMap[id]
assert.True(t, exists, "GPU with ID %s should still exist after GetCurrentData", id)
if !exists {
continue
}
assert.Equal(t, float64(0), gpu.Count, "Count should be reset for GPU ID %s", id)
assert.Equal(t, float64(0), gpu.Usage, "Usage should be reset for GPU ID %s", id)
assert.Equal(t, float64(0), gpu.Power, "Power should be reset for GPU ID %s", id)
assert.EqualValues(t, 1, gpu.Count, "Count should be reset for GPU ID %s", id)
assert.EqualValues(t, expected.avgUsage, gpu.Usage, "Usage should be reset for GPU ID %s", id)
assert.EqualValues(t, expected.avgPower, gpu.Power, "Power should be reset for GPU ID %s", id)
}
})
}
}
func TestIntelUpdateFromStats(t *testing.T) {
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
// First sample with power and two engines
sample1 := intelGpuStats{
Engines: map[string]struct {
Busy float64 `json:"busy"`
}{
"Render/3D": {Busy: 20.0},
"Video": {Busy: 5.0},
},
}
sample1.Power.GPU = 10.5
ok := gm.updateIntelFromStats(&sample1)
assert.True(t, ok)
gpu := gm.GpuDataMap["0"]
require.NotNil(t, gpu)
assert.Equal(t, "GPU", gpu.Name)
assert.InDelta(t, 10.5, gpu.Power, 0.001)
assert.InDelta(t, 20.0, gpu.Engines["Render/3D"], 0.001)
assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001)
assert.Equal(t, float64(1), gpu.Count)
// Second sample with zero power (should not add) and additional engine busy
sample2 := intelGpuStats{
Engines: map[string]struct {
Busy float64 `json:"busy"`
}{
"Render/3D": {Busy: 10.0},
"Video": {Busy: 2.5},
"Blitter": {Busy: 1.0},
},
}
// zero power should not increment power accumulator
sample2.Power.GPU = 0.0
ok = gm.updateIntelFromStats(&sample2)
assert.True(t, ok)
gpu = gm.GpuDataMap["0"]
require.NotNil(t, gpu)
assert.InDelta(t, 10.5, gpu.Power, 0.001)
assert.InDelta(t, 30.0, gpu.Engines["Render/3D"], 0.001) // 20 + 10
assert.InDelta(t, 7.5, gpu.Engines["Video"], 0.001) // 5 + 2.5
assert.InDelta(t, 1.0, gpu.Engines["Blitter"], 0.001)
assert.Equal(t, float64(2), gpu.Count)
}
func TestIntelCollectorStreaming(t *testing.T) {
// Save and override PATH
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
// Create a fake intel_gpu_top that prints a JSON array with two samples and exits
scriptPath := filepath.Join(dir, "intel_gpu_top")
script := `#!/bin/sh
# Ignore args -s and -J
# Emit a JSON array with two objects, separated by a comma, then exit
(echo '['; \
echo '{"power":{"GPU":1.5},"engines":{"Render/3D":{"busy":12.34}}},'; \
echo '{"power":{"GPU":2.0},"engines":{"Video":{"busy":5}}}'; \
echo ']')`
if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil {
t.Fatal(err)
}
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
// Run the collector once; it should read two samples and return
if err := gm.collectIntelStats(); err != nil {
t.Fatalf("collectIntelStats error: %v", err)
}
gpu := gm.GpuDataMap["0"]
require.NotNil(t, gpu)
// Power should be sum of non-zero samples: 1.5 + 2.0 = 3.5
assert.InDelta(t, 3.5, gpu.Power, 0.001)
// Engines aggregated
assert.InDelta(t, 12.34, gpu.Engines["Render/3D"], 0.001)
assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001)
// Count should be 2 samples
assert.Equal(t, float64(2), gpu.Count)
}