diff --git a/agent/gpu.go b/agent/gpu.go index 205819f7..97e7567d 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -249,13 +249,20 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { // average the data gpuAvg := *gpu gpuAvg.Temperature = twoDecimals(gpu.Temperature) - gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed) - gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal) - gpuAvg.Usage = twoDecimals(gpu.Usage / count) gpuAvg.Power = twoDecimals(gpu.Power / count) - gpuAvg.Engines = make(map[string]float64, len(gpu.Engines)) - for name, engine := range gpu.Engines { - gpuAvg.Engines[name] = twoDecimals(engine / count) + + // intel gpu stats doesn't provide usage, memory used, or memory total + if gm.intelGpuStats { + maxEngineUsage := 0.0 + for name, engine := range gpu.Engines { + gpuAvg.Engines[name] = twoDecimals(engine / count) + maxEngineUsage = max(maxEngineUsage, engine/count) + } + gpuAvg.Usage = twoDecimals(maxEngineUsage) + } else { + gpuAvg.Usage = twoDecimals(gpu.Usage / count) + gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed) + gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal) } // reset accumulators in the original gpu data for next collection @@ -288,7 +295,6 @@ func (gm *GPUManager) detectGPUs() error { gm.nvidiaSmi = false } if _, err := exec.LookPath(intelGpuStatsCmd); err == nil { - slog.Info("Intel GPU stats found") gm.intelGpuStats = true } if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats { @@ -305,10 +311,20 @@ func (gm *GPUManager) startCollector(command string) { } switch command { case intelGpuStatsCmd: - slog.Info("Starting Intel GPU stats collector") - collector.cmdArgs = []string{"-s", intelGpuStatsInterval, "-J"} - collector.parse = gm.parseIntelData - go collector.start() + go func() { + failures := 0 + for { + if err := gm.collectIntelStats(); err != nil { + failures++ + if failures > maxFailureRetries { + break + } + slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", 
err) + time.Sleep(retryWaitTime) + continue + } + } + }() case nvidiaSmiCmd: collector.cmdArgs = []string{ "-l", nvidiaSmiInterval, diff --git a/agent/gpu_intel.go b/agent/gpu_intel.go index f4df97c8..48c6f756 100644 --- a/agent/gpu_intel.go +++ b/agent/gpu_intel.go @@ -2,52 +2,101 @@ package agent import ( "encoding/json" - "log/slog" + "fmt" + "os/exec" "github.com/henrygd/beszel/internal/entities/system" ) const ( intelGpuStatsCmd string = "intel_gpu_top" - intelGpuStatsInterval string = "3800" // in milliseconds + intelGpuStatsInterval string = "3300" // in milliseconds ) type intelGpuStats struct { Power struct { - GPU float64 `json:"gpu"` + GPU float64 `json:"GPU"` } `json:"power"` Engines map[string]struct { Busy float64 `json:"busy"` } `json:"engines"` } -func (gm *GPUManager) parseIntelData(output []byte) bool { - slog.Info("Parsing Intel GPU stats") - var intelGpuStats intelGpuStats - if err := json.Unmarshal(output, &intelGpuStats); err != nil { - slog.Error("Error parsing Intel GPU stats", "err", err) - return false - } +// updateIntelFromStats updates aggregated GPU data from a single intelGpuStats sample +func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool { gm.Lock() defer gm.Unlock() // only one gpu for now - cmd doesn't provide all by default gpuData, ok := gm.GpuDataMap["0"] if !ok { - gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64, len(intelGpuStats.Engines))} + gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)} gm.GpuDataMap["0"] = gpuData } - if intelGpuStats.Power.GPU > 0 { - gpuData.Power += intelGpuStats.Power.GPU + if sample.Power.GPU > 0 { + gpuData.Power += sample.Power.GPU } - for name, engine := range intelGpuStats.Engines { + if gpuData.Engines == nil { + gpuData.Engines = make(map[string]float64, len(sample.Engines)) + } + for name, engine := range sample.Engines { gpuData.Engines[name] += engine.Busy } gpuData.Count++ - - slog.Info("GPU Data", "gpuData", gpuData) 
return true }
+
+// collectIntelStats runs intel_gpu_top in JSON mode and stream-decodes the array of
+// samples it writes to stdout, passing every decoded sample to updateIntelFromStats.
+//
+// The call blocks while the process keeps emitting samples. It returns the result of
+// cmd.Wait once the closing ']' of the array has been read, or the first pipe, start,
+// or decode error. On every error path after a successful start the child process is
+// killed and reaped, so a failed attempt does not leave an intel_gpu_top process or
+// its stdout pipe behind.
+func (gm *GPUManager) collectIntelStats() error {
+	cmd := exec.Command(intelGpuStatsCmd, "-s", intelGpuStatsInterval, "-J")
+	stdout, err := cmd.StdoutPipe()
+	if err != nil {
+		return err
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+
+	// From here on the child is running. Unless the normal cmd.Wait at the end is
+	// reached, kill and reap it before returning.
+	reaped := false
+	defer func() {
+		if reaped {
+			return
+		}
+		_ = cmd.Process.Kill() // best effort: the process may already have exited
+		_ = cmd.Wait()         // reap the child and release the stdout pipe
+	}()
+
+	dec := json.NewDecoder(stdout)
+
+	// Expect a JSON array stream: [ { ... }, { ... }, ... ]
+	tok, err := dec.Token()
+	if err != nil {
+		return err
+	}
+	if delim, ok := tok.(json.Delim); !ok || delim != '[' {
+		return fmt.Errorf("unexpected JSON start token: %v", tok)
+	}
+
+	// A single sample value is reused for every array element.
+	var sample intelGpuStats
+	for dec.More() {
+		// Decode merges into the existing value: map entries are kept and fields
+		// missing from the input are left untouched, so reset both per sample.
+		clear(sample.Engines)
+		sample.Power.GPU = 0
+
+		if err := dec.Decode(&sample); err != nil {
+			return fmt.Errorf("decode intel gpu: %w", err)
+		}
+		gm.updateIntelFromStats(&sample)
+	}
+
+	// More returned false: the next token is either the closing bracket (only present
+	// when the process exits) or a read/syntax error, which is terminal.
+	tok, err = dec.Token()
+	if err != nil {
+		return err
+	}
+	if delim, ok := tok.(json.Delim); !ok || delim != ']' {
+		return fmt.Errorf("unexpected JSON end token: %v", tok)
+	}
+
+	reaped = true
+	return cmd.Wait()
+}
diff --git a/agent/gpu_test.go b/agent/gpu_test.go
index 6bf070ec..1f2fcc99 100644
--- a/agent/gpu_test.go
+++ b/agent/gpu_test.go
@@ -792,3 +792,96 @@ func TestAccumulation(t *testing.T) {
 		})
 	}
 }
+
+func TestIntelUpdateFromStats(t *testing.T) {
+	gm := &GPUManager{
+		GpuDataMap: make(map[string]*system.GPUData),
+	}
+
+	// First sample with power and two engines
+	sample1 := intelGpuStats{
+		Engines: map[string]struct {
+			Busy float64 `json:"busy"`
+		}{
+			"Render/3D": {Busy: 20.0},
+			"Video":     {Busy: 5.0},
+		},
+	}
+	sample1.Power.GPU = 10.5
+
+	ok := gm.updateIntelFromStats(&sample1)
+	assert.True(t, ok)
+
+	gpu := gm.GpuDataMap["0"]
+	require.NotNil(t, gpu)
+
assert.Equal(t, "GPU", gpu.Name) + assert.InDelta(t, 10.5, gpu.Power, 0.001) + assert.InDelta(t, 20.0, gpu.Engines["Render/3D"], 0.001) + assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001) + assert.Equal(t, float64(1), gpu.Count) + + // Second sample with zero power (should not add) and additional engine busy + sample2 := intelGpuStats{ + Engines: map[string]struct { + Busy float64 `json:"busy"` + }{ + "Render/3D": {Busy: 10.0}, + "Video": {Busy: 2.5}, + "Blitter": {Busy: 1.0}, + }, + } + // zero power should not increment power accumulator + sample2.Power.GPU = 0.0 + + ok = gm.updateIntelFromStats(&sample2) + assert.True(t, ok) + + gpu = gm.GpuDataMap["0"] + require.NotNil(t, gpu) + assert.InDelta(t, 10.5, gpu.Power, 0.001) + assert.InDelta(t, 30.0, gpu.Engines["Render/3D"], 0.001) // 20 + 10 + assert.InDelta(t, 7.5, gpu.Engines["Video"], 0.001) // 5 + 2.5 + assert.InDelta(t, 1.0, gpu.Engines["Blitter"], 0.001) + assert.Equal(t, float64(2), gpu.Count) +} + +func TestIntelCollectorStreaming(t *testing.T) { + // Save and override PATH + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + dir := t.TempDir() + os.Setenv("PATH", dir) + + // Create a fake intel_gpu_top that prints a JSON array with two samples and exits + scriptPath := filepath.Join(dir, "intel_gpu_top") + script := `#!/bin/sh +# Ignore args -s and -J +# Emit a JSON array with two objects, separated by a comma, then exit +(echo '['; \ + echo '{"power":{"GPU":1.5},"engines":{"Render/3D":{"busy":12.34}}},'; \ + echo '{"power":{"GPU":2.0},"engines":{"Video":{"busy":5}}}'; \ + echo ']')` + if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil { + t.Fatal(err) + } + + gm := &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), + } + + // Run the collector once; it should read two samples and return + if err := gm.collectIntelStats(); err != nil { + t.Fatalf("collectIntelStats error: %v", err) + } + + gpu := gm.GpuDataMap["0"] + require.NotNil(t, gpu) + // Power should 
be sum of non-zero samples: 1.5 + 2.0 = 3.5 + assert.InDelta(t, 3.5, gpu.Power, 0.001) + // Engines aggregated + assert.InDelta(t, 12.34, gpu.Engines["Render/3D"], 0.001) + assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001) + // Count should be 2 samples + assert.Equal(t, float64(2), gpu.Count) +} diff --git a/internal/entities/system/system.go b/internal/entities/system/system.go index 7011b127..16d96004 100644 --- a/internal/entities/system/system.go +++ b/internal/entities/system/system.go @@ -47,9 +47,9 @@ type Stats struct { type GPUData struct { Name string `json:"n" cbor:"0,keyasint"` Temperature float64 `json:"-"` - MemoryUsed float64 `json:"mu,omitempty" cbor:"1,keyasint,omitempty"` - MemoryTotal float64 `json:"mt,omitempty" cbor:"2,keyasint,omitempty"` - Usage float64 `json:"u" cbor:"3,keyasint"` + MemoryUsed float64 `json:"mu,omitempty,omitzero" cbor:"1,keyasint,omitempty,omitzero"` + MemoryTotal float64 `json:"mt,omitempty,omitzero" cbor:"2,keyasint,omitempty,omitzero"` + Usage float64 `json:"u" cbor:"3,keyasint,omitempty"` Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"` Count float64 `json:"-"` Engines map[string]float64 `json:"e,omitempty" cbor:"5,keyasint,omitempty"` diff --git a/internal/records/records.go b/internal/records/records.go index 428bba0a..416ecb24 100644 --- a/internal/records/records.go +++ b/internal/records/records.go @@ -284,6 +284,16 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) * gpu.Usage += value.Usage gpu.Power += value.Power gpu.Count += value.Count + + if value.Engines != nil { + if gpu.Engines == nil { + gpu.Engines = make(map[string]float64, len(value.Engines)) + } + for engineKey, engineValue := range value.Engines { + gpu.Engines[engineKey] += engineValue + } + } + sum.GPUData[id] = gpu } } @@ -353,6 +363,13 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) * gpu.Usage = twoDecimals(gpu.Usage / count) gpu.Power = twoDecimals(gpu.Power / 
count) gpu.Count = twoDecimals(gpu.Count / count) + + if gpu.Engines != nil { + for engineKey := range gpu.Engines { + gpu.Engines[engineKey] = twoDecimals(gpu.Engines[engineKey] / count) + } + } + sum.GPUData[id] = gpu } } diff --git a/internal/site/src/components/charts/hooks.ts b/internal/site/src/components/charts/hooks.ts index a72d9639..8bb75076 100644 --- a/internal/site/src/components/charts/hooks.ts +++ b/internal/site/src/components/charts/hooks.ts @@ -122,16 +122,4 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) { })) }, } -} - -/** Generates chart configurations for GPU engines */ -export function useGpuEngines(systemStats?: SystemStatsRecord) { - const keys = Object.keys(systemStats?.stats.g?.[0]?.e ?? {}) - const sortedKeys = keys.sort() - return sortedKeys.map((engine) => ({ - label: engine, - dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0, - color: `hsl(${220 + ((sortedKeys.indexOf(engine) * 360) / sortedKeys.length) % 360}, 65%, 52%)`, - opacity: 0.35, - })) -} +} \ No newline at end of file diff --git a/internal/site/src/components/routes/system.tsx b/internal/site/src/components/routes/system.tsx index 0a47fadf..73aa25be 100644 --- a/internal/site/src/components/routes/system.tsx +++ b/internal/site/src/components/routes/system.tsx @@ -18,7 +18,7 @@ import AreaChartDefault from "@/components/charts/area-chart" import ContainerChart from "@/components/charts/container-chart" import DiskChart from "@/components/charts/disk-chart" import GpuPowerChart from "@/components/charts/gpu-power-chart" -import { useContainerChartConfigs, useGpuEngines } from "@/components/charts/hooks" +import { useContainerChartConfigs } from "@/components/charts/hooks" import LoadAverageChart from "@/components/charts/load-average-chart" import MemChart from "@/components/charts/mem-chart" import SwapChart from "@/components/charts/swap-chart" @@ -761,6 +761,12 @@ export default memo(function SystemDetail({ name }: { name: 
string }) { )} + + + + {/* Non-power GPU charts */} + {hasGpuData && ( +
{/* GPU power draw chart */} {hasGpuPowerData && ( )} - - {hasGpuEnginesData && ( - - - - )} -
- - {/* GPU charts */} - {hasGpuData && ( -
+ {hasGpuEnginesData && ( + + + + )} {Object.keys(systemStats.at(-1)?.stats.g ?? {}).map((id) => { const gpu = systemStats.at(-1)?.stats.g?.[id] as GPUData return ( @@ -812,6 +812,8 @@ export default memo(function SystemDetail({ name }: { name: string }) { contentFormatter={({ value }) => `${decimalString(value)}%`} /> + + {(gpu.mt ?? 0) > 0 && ( + )}
+ ) })} @@ -911,9 +915,18 @@ export default memo(function SystemDetail({ name }: { name: string }) { }) function GpuEnginesChart({ chartData }: { chartData: ChartData }) { - const engineData = useGpuEngines(chartData.systemStats.at(-1)) + const dataPoints = [] + const engines = Object.keys(chartData.systemStats?.at(-1)?.stats.g?.[0]?.e ?? {}).sort() + for (const engine of engines) { + dataPoints.push({ + label: engine, + dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0, + color: `hsl(${140 + ((engines.indexOf(engine) * 360) / engines.length) % 360}, 65%, 52%)`, + opacity: 0.35, + }) + } return ( - `${toFixedFloat(val, 2)}%`} contentFormatter={({ value }) => `${decimalString(value)}%`} /> + `${toFixedFloat(val, 2)}%`} contentFormatter={({ value }) => `${decimalString(value)}%`} /> ) }