diff --git a/agent/gpu.go b/agent/gpu.go index 5ba70adb..5af8e930 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -27,13 +27,10 @@ const ( nvidiaSmiInterval string = "4" // in seconds tegraStatsInterval string = "3700" // in milliseconds rocmSmiInterval time.Duration = 4300 * time.Millisecond - // Command retry and timeout constants retryWaitTime time.Duration = 5 * time.Second maxFailureRetries int = 5 - cmdBufferSize uint16 = 10 * 1024 - // Unit Conversions mebibytesInAMegabyte float64 = 1.024 // nvidia-smi reports memory in MiB milliwattsInAWatt float64 = 1000.0 // tegrastats reports power in mW @@ -42,10 +39,11 @@ const ( // GPUManager manages data collection for GPUs (either Nvidia or AMD) type GPUManager struct { sync.Mutex - nvidiaSmi bool - rocmSmi bool - tegrastats bool - GpuDataMap map[string]*system.GPUData + nvidiaSmi bool + rocmSmi bool + tegrastats bool + intelGpuStats bool + GpuDataMap map[string]*system.GPUData } // RocmSmiJson represents the JSON structure of rocm-smi output @@ -66,6 +64,7 @@ type gpuCollector struct { cmdArgs []string parse func([]byte) bool // returns true if valid data was found buf []byte + bufSize uint16 } var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data @@ -99,7 +98,7 @@ func (c *gpuCollector) collect() error { scanner := bufio.NewScanner(stdout) if c.buf == nil { - c.buf = make([]byte, 0, cmdBufferSize) + c.buf = make([]byte, 0, c.bufSize) } scanner.Buffer(c.buf, bufio.MaxScanTokenSize) @@ -244,20 +243,31 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { // copy / reset the data gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap)) for id, gpu := range gm.GpuDataMap { - gpuAvg := *gpu - - gpuAvg.Temperature = twoDecimals(gpu.Temperature) - gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed) - gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal) - // avoid division by zero - if gpu.Count > 0 { - gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count) - gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count) + count := max(gpu.Count, 1) + + // average the data + gpuAvg := *gpu + gpuAvg.Temperature = twoDecimals(gpu.Temperature) + gpuAvg.Power = twoDecimals(gpu.Power / count) + + // intel gpu stats doesn't provide usage, memory used, or memory total + if gpu.Engines != nil { + maxEngineUsage := 0.0 + for name, engine := range gpu.Engines { + gpuAvg.Engines[name] = twoDecimals(engine / count) + maxEngineUsage = max(maxEngineUsage, engine/count) + } + gpuAvg.Usage = twoDecimals(maxEngineUsage) + } else { + gpuAvg.Usage = twoDecimals(gpu.Usage / count) + gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed) + gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal) } - // reset accumulators in the original - gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0 + // reset accumulators in the original gpu data for next collection + gpu.Usage, gpu.Power, gpu.Count = gpuAvg.Usage, gpuAvg.Power, 1 + gpu.Engines = gpuAvg.Engines // append id to the name if there are multiple GPUs with the same name if nameCounts[gpu.Name] > 1 { @@ -284,18 +294,37 @@ func (gm *GPUManager) detectGPUs() error { gm.tegrastats = true gm.nvidiaSmi = false } - if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats { + if _, err := exec.LookPath(intelGpuStatsCmd); err == nil { + gm.intelGpuStats = true + } + if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats { return nil } - return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats") + return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top") } // startCollector starts the appropriate GPU data collector based on the command func (gm *GPUManager) startCollector(command string) { collector := gpuCollector{ - name: command, + name: command, + bufSize: 10 * 1024, } switch command { + case intelGpuStatsCmd: + go func() { + failures := 0 + for { + if err := gm.collectIntelStats(); err != nil { + failures++ + if failures > maxFailureRetries { + break + } + slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err) + time.Sleep(retryWaitTime) + continue + } + } + }() case nvidiaSmiCmd: collector.cmdArgs = []string{ "-l", nvidiaSmiInterval, @@ -344,6 +373,9 @@ func NewGPUManager() (*GPUManager, error) { if gm.tegrastats { gm.startCollector(tegraStatsCmd) } + if gm.intelGpuStats { + gm.startCollector(intelGpuStatsCmd) + } return &gm, nil } diff --git a/agent/gpu_intel.go b/agent/gpu_intel.go new file mode 100644 index 00000000..48c6f756 --- /dev/null +++ b/agent/gpu_intel.go @@ -0,0 +1,102 @@ +package agent + +import ( + "encoding/json" + "fmt" + "os/exec" + + "github.com/henrygd/beszel/internal/entities/system" +) + +const ( + intelGpuStatsCmd string = "intel_gpu_top" + intelGpuStatsInterval string = "3300" // in milliseconds +) + +type intelGpuStats struct { + Power struct { + GPU float64 `json:"GPU"` + } `json:"power"` + Engines map[string]struct { + Busy float64 `json:"busy"` + } `json:"engines"` +} + +// updateIntelFromStats updates aggregated GPU data from a single intelGpuStats sample +func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool { + gm.Lock() + defer gm.Unlock() + + // only one gpu for now - cmd doesn't provide all by default + gpuData, ok := gm.GpuDataMap["0"] + if !ok { + gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)} + gm.GpuDataMap["0"] = gpuData + } + + if sample.Power.GPU > 0 { + gpuData.Power += sample.Power.GPU + } + + if gpuData.Engines == nil { + gpuData.Engines = make(map[string]float64, len(sample.Engines)) + } + for name, engine := range sample.Engines { + gpuData.Engines[name] += engine.Busy + } + + gpuData.Count++ + return true +} + +// collectIntelStats executes intel_gpu_top in JSON mode and stream-decodes the array of samples +func (gm *GPUManager) collectIntelStats() error { + cmd := exec.Command(intelGpuStatsCmd, "-s", intelGpuStatsInterval, "-J") + stdout, err := cmd.StdoutPipe() + if err != nil { + return err + } + if err := cmd.Start(); err != nil { + return err + } + + dec := json.NewDecoder(stdout) + + // Expect a JSON array stream: [ { ... }, { ... }, ... ] + tok, err := dec.Token() + if err != nil { + return err + } + if delim, ok := tok.(json.Delim); !ok || delim != '[' { + return fmt.Errorf("unexpected JSON start token: %v", tok) + } + + var sample intelGpuStats + for { + if dec.More() { + // Clear the engines map before decoding + if sample.Engines != nil { + for k := range sample.Engines { + delete(sample.Engines, k) + } + } + + if err := dec.Decode(&sample); err != nil { + return fmt.Errorf("decode intel gpu: %w", err) + } + gm.updateIntelFromStats(&sample) + continue + } + // Attempt to read closing bracket (will only be present when process exits) + tok, err = dec.Token() + if err != nil { + // When the process is still running, decoder will block in More/Decode; any error here is terminal + return err + } + if delim, ok := tok.(json.Delim); ok && delim == ']' { + break + } + } + + return cmd.Wait() +} diff --git a/agent/gpu_test.go b/agent/gpu_test.go index 6bf070ec..c2497275 100644 --- a/agent/gpu_test.go +++ b/agent/gpu_test.go @@ -379,12 +379,12 @@ func TestGetCurrentData(t *testing.T) { assert.InDelta(t, 60.0, result["1"].Power, 0.01) // Verify that accumulators in the original map are reset - assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset") - assert.Equal(t, float64(0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset") - assert.Equal(t, float64(0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset") - assert.Equal(t, float64(0), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset") - assert.Equal(t, float64(0), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset") - assert.Equal(t, float64(0), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset") + assert.EqualValues(t, float64(1), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset") + assert.EqualValues(t, float64(50.0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset") + assert.Equal(t, float64(100.0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset") + assert.Equal(t, float64(1), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset") + assert.Equal(t, float64(30), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset") + assert.Equal(t, float64(60), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset") }) t.Run("handles zero count without panicking", func(t *testing.T) { @@ -409,7 +409,7 @@ func TestGetCurrentData(t *testing.T) { assert.Equal(t, 0.0, result["0"].Power) // Verify reset count - assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count) + assert.EqualValues(t, 1, gm.GpuDataMap["0"].Count) }) } @@ -779,16 +779,109 @@ func TestAccumulation(t *testing.T) { } // Verify that accumulators in the original map are reset - for id := range tt.expectedValues { + for id, expected := range tt.expectedValues { gpu, exists := gm.GpuDataMap[id] assert.True(t, exists, "GPU with ID %s should still exist after GetCurrentData", id) if !exists { continue } - assert.Equal(t, float64(0), gpu.Count, "Count should be reset for GPU ID %s", id) - assert.Equal(t, float64(0), gpu.Usage, "Usage should be reset for GPU ID %s", id) - assert.Equal(t, float64(0), gpu.Power, "Power should be reset for GPU ID %s", id) + assert.EqualValues(t, 1, gpu.Count, "Count should be reset for GPU ID %s", id) + assert.EqualValues(t, expected.avgUsage, gpu.Usage, "Usage should be reset for GPU ID %s", id) + assert.EqualValues(t, expected.avgPower, gpu.Power, "Power should be reset for GPU ID %s", id) } }) } } + +func TestIntelUpdateFromStats(t *testing.T) { + gm := &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), + } + + // First sample with power and two engines + sample1 := intelGpuStats{ + Engines: map[string]struct { + Busy float64 `json:"busy"` + }{ + "Render/3D": {Busy: 20.0}, + "Video": {Busy: 5.0}, + }, + } + sample1.Power.GPU = 10.5 + + ok := gm.updateIntelFromStats(&sample1) + assert.True(t, ok) + + gpu := gm.GpuDataMap["0"] + require.NotNil(t, gpu) + assert.Equal(t, "GPU", gpu.Name) + assert.InDelta(t, 10.5, gpu.Power, 0.001) + assert.InDelta(t, 20.0, gpu.Engines["Render/3D"], 0.001) + assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001) + assert.Equal(t, float64(1), gpu.Count) + + // Second sample with zero power (should not add) and additional engine busy + sample2 := intelGpuStats{ + Engines: map[string]struct { + Busy float64 `json:"busy"` + }{ + "Render/3D": {Busy: 10.0}, + "Video": {Busy: 2.5}, + "Blitter": {Busy: 1.0}, + }, + } + // zero power should not increment power accumulator + sample2.Power.GPU = 0.0 + + ok = gm.updateIntelFromStats(&sample2) + assert.True(t, ok) + + gpu = gm.GpuDataMap["0"] + require.NotNil(t, gpu) + assert.InDelta(t, 10.5, gpu.Power, 0.001) + assert.InDelta(t, 30.0, gpu.Engines["Render/3D"], 0.001) // 20 + 10 + assert.InDelta(t, 7.5, gpu.Engines["Video"], 0.001) // 5 + 2.5 + assert.InDelta(t, 1.0, gpu.Engines["Blitter"], 0.001) + assert.Equal(t, float64(2), gpu.Count) +} + +func TestIntelCollectorStreaming(t *testing.T) { + // Save and override PATH + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + dir := t.TempDir() + os.Setenv("PATH", dir) + + // Create a fake intel_gpu_top that prints a JSON array with two samples and exits + scriptPath := filepath.Join(dir, "intel_gpu_top") + script := `#!/bin/sh +# Ignore args -s and -J +# Emit a JSON array with two objects, separated by a comma, then exit +(echo '['; \ + echo '{"power":{"GPU":1.5},"engines":{"Render/3D":{"busy":12.34}}},'; \ + echo '{"power":{"GPU":2.0},"engines":{"Video":{"busy":5}}}'; \ + echo ']')` + if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil { + t.Fatal(err) + } + + gm := &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), + } + + // Run the collector once; it should read two samples and return + if err := gm.collectIntelStats(); err != nil { + t.Fatalf("collectIntelStats error: %v", err) + } + + gpu := gm.GpuDataMap["0"] + require.NotNil(t, gpu) + // Power should be sum of non-zero samples: 1.5 + 2.0 = 3.5 + assert.InDelta(t, 3.5, gpu.Power, 0.001) + // Engines aggregated + assert.InDelta(t, 12.34, gpu.Engines["Render/3D"], 0.001) + assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001) + // Count should be 2 samples + assert.Equal(t, float64(2), gpu.Count) +} diff --git a/internal/entities/system/system.go b/internal/entities/system/system.go index 47c2f0ab..16d96004 100644 --- a/internal/entities/system/system.go +++ b/internal/entities/system/system.go @@ -45,13 +45,14 @@ type Stats struct { } type GPUData struct { - Name string `json:"n" cbor:"0,keyasint"` - Temperature float64 `json:"-"` - MemoryUsed float64 `json:"mu,omitempty" cbor:"1,keyasint,omitempty"` - MemoryTotal float64 `json:"mt,omitempty" cbor:"2,keyasint,omitempty"` - Usage float64 `json:"u" cbor:"3,keyasint"` - Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"` - Count float64 `json:"-"` + Name string `json:"n" cbor:"0,keyasint"` + Temperature float64 `json:"-"` + MemoryUsed float64 `json:"mu,omitempty,omitzero" cbor:"1,keyasint,omitempty,omitzero"` + MemoryTotal float64 `json:"mt,omitempty,omitzero" cbor:"2,keyasint,omitempty,omitzero"` + Usage float64 `json:"u" cbor:"3,keyasint,omitempty"` + Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"` + Count float64 `json:"-"` + Engines map[string]float64 `json:"e,omitempty" cbor:"5,keyasint,omitempty"` } type FsStats struct { diff --git a/internal/records/records.go b/internal/records/records.go index 428bba0a..416ecb24 100644 --- a/internal/records/records.go +++ b/internal/records/records.go @@ -284,6 +284,16 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) * gpu.Usage += value.Usage gpu.Power += value.Power gpu.Count += value.Count + + if value.Engines != nil { + if gpu.Engines == nil { + gpu.Engines = make(map[string]float64, len(value.Engines)) + } + for engineKey, engineValue := range value.Engines { + gpu.Engines[engineKey] += engineValue + } + } + sum.GPUData[id] = gpu } } @@ -353,6 +363,13 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) * gpu.Usage = twoDecimals(gpu.Usage / count) gpu.Power = twoDecimals(gpu.Power / count) gpu.Count = twoDecimals(gpu.Count / count) + + if gpu.Engines != nil { + for engineKey := range gpu.Engines { + gpu.Engines[engineKey] = twoDecimals(gpu.Engines[engineKey] / count) + } + } + sum.GPUData[id] = gpu } } diff --git a/internal/site/src/components/charts/hooks.ts b/internal/site/src/components/charts/hooks.ts index a5946026..8bb75076 100644 --- a/internal/site/src/components/charts/hooks.ts +++ b/internal/site/src/components/charts/hooks.ts @@ -115,11 +115,11 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) { data: (index = 3) => { return sortedKeys.map((key) => ({ label: key, - dataKey: (stats: SystemStatsRecord) => stats.stats?.ni?.[key]?.[index], + dataKey: ({ stats }: SystemStatsRecord) => stats?.ni?.[key]?.[index], color: `hsl(${220 + (((sortedKeys.indexOf(key) * 360) / sortedKeys.length) % 360)}, 70%, 50%)`, opacity: 0.3, })) }, } -} +} \ No newline at end of file diff --git a/internal/site/src/components/routes/system.tsx b/internal/site/src/components/routes/system.tsx index 345c7f57..3699db24 100644 --- a/internal/site/src/components/routes/system.tsx +++ b/internal/site/src/components/routes/system.tsx @@ -61,6 +61,7 @@ import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from ". import { Separator } from "../ui/separator" import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "../ui/tooltip" import NetworkSheet from "./system/network-sheet" +import LineChartDefault from "../charts/line-chart" type ChartTimeData = { time: number @@ -398,6 +399,7 @@ export default memo(function SystemDetail({ name }: { name: string }) { const lastGpuVals = Object.values(systemStats.at(-1)?.stats.g ?? {}) const hasGpuData = lastGpuVals.length > 0 const hasGpuPowerData = lastGpuVals.some((gpu) => gpu.p !== undefined) + const hasGpuEnginesData = lastGpuVals.some((gpu) => gpu.e !== undefined) let translatedStatus: string = system.status if (system.status === SystemStatus.Up) { @@ -758,7 +760,6 @@ export default memo(function SystemDetail({ name }: { name: string }) { /> )} - {/* GPU power draw chart */} {hasGpuPowerData && ( - {/* GPU charts */} + {/* Non-power GPU charts */} {hasGpuData && (
+ {hasGpuEnginesData && ( + + + + )} {Object.keys(systemStats.at(-1)?.stats.g ?? {}).map((id) => { const gpu = systemStats.at(-1)?.stats.g?.[id] as GPUData return ( @@ -799,33 +811,36 @@ export default memo(function SystemDetail({ name }: { name: string }) { contentFormatter={({ value }) => `${decimalString(value)}%`} /> - - stats?.g?.[id]?.mu ?? 0, - color: 2, - opacity: 0.25, - }, - ]} - max={gpu.mt} - tickFormatter={(val) => { - const { value, unit } = formatBytes(val, false, Unit.Bytes, true) - return `${toFixedFloat(value, value >= 10 ? 0 : 1)} ${unit}` - }} - contentFormatter={({ value }) => { - const { value: convertedValue, unit } = formatBytes(value, false, Unit.Bytes, true) - return `${decimalString(convertedValue)} ${unit}` - }} - /> - + + {(gpu.mt ?? 0) > 0 && ( + + stats?.g?.[id]?.mu ?? 0, + color: 2, + opacity: 0.25, + }, + ]} + max={gpu.mt} + tickFormatter={(val) => { + const { value, unit } = formatBytes(val, false, Unit.Bytes, true) + return `${toFixedFloat(value, value >= 10 ? 0 : 1)} ${unit}` + }} + contentFormatter={({ value }) => { + const { value: convertedValue, unit } = formatBytes(value, false, Unit.Bytes, true) + return `${decimalString(convertedValue)} ${unit}` + }} + /> + + )}
) })} @@ -897,6 +912,28 @@ export default memo(function SystemDetail({ name }: { name: string }) { ) }) +function GpuEnginesChart({ chartData }: { chartData: ChartData }) { + const dataPoints = [] + const engines = Object.keys(chartData.systemStats?.at(-1)?.stats.g?.[0]?.e ?? {}).sort() + for (const engine of engines) { + dataPoints.push({ + label: engine, + dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0, + color: `hsl(${140 + (((engines.indexOf(engine) * 360) / engines.length) % 360)}, 65%, 52%)`, + opacity: 0.35, + }) + } + return ( + `${toFixedFloat(val, 2)}%`} + contentFormatter={({ value }) => `${decimalString(value)}%`} + /> + ) +} + function FilterBar({ store = $containerFilter }: { store?: typeof $containerFilter }) { const containerFilter = useStore(store) const { t } = useLingui() diff --git a/internal/site/src/types.d.ts b/internal/site/src/types.d.ts index 5d8d25f7..4925ebbf 100644 --- a/internal/site/src/types.d.ts +++ b/internal/site/src/types.d.ts @@ -158,6 +158,8 @@ export interface GPUData { u: number /** power (w) */ p?: number + /** engines */ + e?: Record } export interface ExtraFsStats {