working intel gpu stats for one gpu

This commit is contained in:
henrygd
2025-09-22 16:13:04 -04:00
parent 6a406e5206
commit 9d87102e0c
7 changed files with 238 additions and 62 deletions

View File

@@ -249,13 +249,20 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
// average the data
gpuAvg := *gpu
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
gpuAvg.Usage = twoDecimals(gpu.Usage / count)
gpuAvg.Power = twoDecimals(gpu.Power / count)
gpuAvg.Engines = make(map[string]float64, len(gpu.Engines))
for name, engine := range gpu.Engines {
gpuAvg.Engines[name] = twoDecimals(engine / count)
// intel gpu stats doesn't provide usage, memory used, or memory total
if gm.intelGpuStats {
maxEngineUsage := 0.0
for name, engine := range gpu.Engines {
gpuAvg.Engines[name] = twoDecimals(engine / count)
maxEngineUsage = max(maxEngineUsage, engine/count)
}
gpuAvg.Usage = twoDecimals(maxEngineUsage)
} else {
gpuAvg.Usage = twoDecimals(gpu.Usage / count)
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
}
// reset accumulators in the original gpu data for next collection
@@ -288,7 +295,6 @@ func (gm *GPUManager) detectGPUs() error {
gm.nvidiaSmi = false
}
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
slog.Info("Intel GPU stats found")
gm.intelGpuStats = true
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats {
@@ -305,10 +311,20 @@ func (gm *GPUManager) startCollector(command string) {
}
switch command {
case intelGpuStatsCmd:
slog.Info("Starting Intel GPU stats collector")
collector.cmdArgs = []string{"-s", intelGpuStatsInterval, "-J"}
collector.parse = gm.parseIntelData
go collector.start()
go func() {
failures := 0
for {
if err := gm.collectIntelStats(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
time.Sleep(retryWaitTime)
continue
}
}
}()
case nvidiaSmiCmd:
collector.cmdArgs = []string{
"-l", nvidiaSmiInterval,

View File

@@ -2,52 +2,101 @@ package agent
import (
"encoding/json"
"log/slog"
"fmt"
"os/exec"
"github.com/henrygd/beszel/internal/entities/system"
)
const (
intelGpuStatsCmd string = "intel_gpu_top"
intelGpuStatsInterval string = "3800" // in milliseconds
intelGpuStatsInterval string = "3300" // in milliseconds
)
type intelGpuStats struct {
Power struct {
GPU float64 `json:"gpu"`
GPU float64 `json:"GPU"`
} `json:"power"`
Engines map[string]struct {
Busy float64 `json:"busy"`
} `json:"engines"`
}
func (gm *GPUManager) parseIntelData(output []byte) bool {
slog.Info("Parsing Intel GPU stats")
var intelGpuStats intelGpuStats
if err := json.Unmarshal(output, &intelGpuStats); err != nil {
slog.Error("Error parsing Intel GPU stats", "err", err)
return false
}
// updateIntelFromStats updates aggregated GPU data from a single intelGpuStats sample
func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool {
gm.Lock()
defer gm.Unlock()
// only one gpu for now - cmd doesn't provide all by default
gpuData, ok := gm.GpuDataMap["0"]
if !ok {
gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64, len(intelGpuStats.Engines))}
gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)}
gm.GpuDataMap["0"] = gpuData
}
if intelGpuStats.Power.GPU > 0 {
gpuData.Power += intelGpuStats.Power.GPU
if sample.Power.GPU > 0 {
gpuData.Power += sample.Power.GPU
}
for name, engine := range intelGpuStats.Engines {
if gpuData.Engines == nil {
gpuData.Engines = make(map[string]float64, len(sample.Engines))
}
for name, engine := range sample.Engines {
gpuData.Engines[name] += engine.Busy
}
gpuData.Count++
slog.Info("GPU Data", "gpuData", gpuData)
return true
}
// collectIntelStats executes intel_gpu_top in JSON mode and stream-decodes the array of samples
func (gm *GPUManager) collectIntelStats() error {
cmd := exec.Command(intelGpuStatsCmd, "-s", intelGpuStatsInterval, "-J")
stdout, err := cmd.StdoutPipe()
if err != nil {
return err
}
if err := cmd.Start(); err != nil {
return err
}
dec := json.NewDecoder(stdout)
// Expect a JSON array stream: [ { ... }, { ... }, ... ]
tok, err := dec.Token()
if err != nil {
return err
}
if delim, ok := tok.(json.Delim); !ok || delim != '[' {
return fmt.Errorf("unexpected JSON start token: %v", tok)
}
var sample intelGpuStats
for {
if dec.More() {
// Clear the engines map before decoding
if sample.Engines != nil {
for k := range sample.Engines {
delete(sample.Engines, k)
}
}
if err := dec.Decode(&sample); err != nil {
return fmt.Errorf("decode intel gpu: %w", err)
}
gm.updateIntelFromStats(&sample)
continue
}
// Attempt to read closing bracket (will only be present when process exits)
tok, err = dec.Token()
if err != nil {
// When the process is still running, decoder will block in More/Decode; any error here is terminal
return err
}
if delim, ok := tok.(json.Delim); ok && delim == ']' {
break
}
}
return cmd.Wait()
}

View File

@@ -792,3 +792,96 @@ func TestAccumulation(t *testing.T) {
})
}
}
func TestIntelUpdateFromStats(t *testing.T) {
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
// First sample with power and two engines
sample1 := intelGpuStats{
Engines: map[string]struct {
Busy float64 `json:"busy"`
}{
"Render/3D": {Busy: 20.0},
"Video": {Busy: 5.0},
},
}
sample1.Power.GPU = 10.5
ok := gm.updateIntelFromStats(&sample1)
assert.True(t, ok)
gpu := gm.GpuDataMap["0"]
require.NotNil(t, gpu)
assert.Equal(t, "GPU", gpu.Name)
assert.InDelta(t, 10.5, gpu.Power, 0.001)
assert.InDelta(t, 20.0, gpu.Engines["Render/3D"], 0.001)
assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001)
assert.Equal(t, float64(1), gpu.Count)
// Second sample with zero power (should not add) and additional engine busy
sample2 := intelGpuStats{
Engines: map[string]struct {
Busy float64 `json:"busy"`
}{
"Render/3D": {Busy: 10.0},
"Video": {Busy: 2.5},
"Blitter": {Busy: 1.0},
},
}
// zero power should not increment power accumulator
sample2.Power.GPU = 0.0
ok = gm.updateIntelFromStats(&sample2)
assert.True(t, ok)
gpu = gm.GpuDataMap["0"]
require.NotNil(t, gpu)
assert.InDelta(t, 10.5, gpu.Power, 0.001)
assert.InDelta(t, 30.0, gpu.Engines["Render/3D"], 0.001) // 20 + 10
assert.InDelta(t, 7.5, gpu.Engines["Video"], 0.001) // 5 + 2.5
assert.InDelta(t, 1.0, gpu.Engines["Blitter"], 0.001)
assert.Equal(t, float64(2), gpu.Count)
}
func TestIntelCollectorStreaming(t *testing.T) {
// Save and override PATH
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
// Create a fake intel_gpu_top that prints a JSON array with two samples and exits
scriptPath := filepath.Join(dir, "intel_gpu_top")
script := `#!/bin/sh
# Ignore args -s and -J
# Emit a JSON array with two objects, separated by a comma, then exit
(echo '['; \
echo '{"power":{"GPU":1.5},"engines":{"Render/3D":{"busy":12.34}}},'; \
echo '{"power":{"GPU":2.0},"engines":{"Video":{"busy":5}}}'; \
echo ']')`
if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil {
t.Fatal(err)
}
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
// Run the collector once; it should read two samples and return
if err := gm.collectIntelStats(); err != nil {
t.Fatalf("collectIntelStats error: %v", err)
}
gpu := gm.GpuDataMap["0"]
require.NotNil(t, gpu)
// Power should be sum of non-zero samples: 1.5 + 2.0 = 3.5
assert.InDelta(t, 3.5, gpu.Power, 0.001)
// Engines aggregated
assert.InDelta(t, 12.34, gpu.Engines["Render/3D"], 0.001)
assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001)
// Count should be 2 samples
assert.Equal(t, float64(2), gpu.Count)
}

View File

@@ -47,9 +47,9 @@ type Stats struct {
type GPUData struct {
Name string `json:"n" cbor:"0,keyasint"`
Temperature float64 `json:"-"`
MemoryUsed float64 `json:"mu,omitempty" cbor:"1,keyasint,omitempty"`
MemoryTotal float64 `json:"mt,omitempty" cbor:"2,keyasint,omitempty"`
Usage float64 `json:"u" cbor:"3,keyasint"`
MemoryUsed float64 `json:"mu,omitempty,omitzero" cbor:"1,keyasint,omitempty,omitzero"`
MemoryTotal float64 `json:"mt,omitempty,omitzero" cbor:"2,keyasint,omitempty,omitzero"`
Usage float64 `json:"u" cbor:"3,keyasint,omitempty"`
Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"`
Count float64 `json:"-"`
Engines map[string]float64 `json:"e,omitempty" cbor:"5,keyasint,omitempty"`

View File

@@ -284,6 +284,16 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) *
gpu.Usage += value.Usage
gpu.Power += value.Power
gpu.Count += value.Count
if value.Engines != nil {
if gpu.Engines == nil {
gpu.Engines = make(map[string]float64, len(value.Engines))
}
for engineKey, engineValue := range value.Engines {
gpu.Engines[engineKey] += engineValue
}
}
sum.GPUData[id] = gpu
}
}
@@ -353,6 +363,13 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) *
gpu.Usage = twoDecimals(gpu.Usage / count)
gpu.Power = twoDecimals(gpu.Power / count)
gpu.Count = twoDecimals(gpu.Count / count)
if gpu.Engines != nil {
for engineKey := range gpu.Engines {
gpu.Engines[engineKey] = twoDecimals(gpu.Engines[engineKey] / count)
}
}
sum.GPUData[id] = gpu
}
}

View File

@@ -122,16 +122,4 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) {
}))
},
}
}
/** Generates chart configurations for GPU engines */
export function useGpuEngines(systemStats?: SystemStatsRecord) {
const keys = Object.keys(systemStats?.stats.g?.[0]?.e ?? {})
const sortedKeys = keys.sort()
return sortedKeys.map((engine) => ({
label: engine,
dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0,
color: `hsl(${220 + ((sortedKeys.indexOf(engine) * 360) / sortedKeys.length) % 360}, 65%, 52%)`,
opacity: 0.35,
}))
}
}

View File

@@ -18,7 +18,7 @@ import AreaChartDefault from "@/components/charts/area-chart"
import ContainerChart from "@/components/charts/container-chart"
import DiskChart from "@/components/charts/disk-chart"
import GpuPowerChart from "@/components/charts/gpu-power-chart"
import { useContainerChartConfigs, useGpuEngines } from "@/components/charts/hooks"
import { useContainerChartConfigs } from "@/components/charts/hooks"
import LoadAverageChart from "@/components/charts/load-average-chart"
import MemChart from "@/components/charts/mem-chart"
import SwapChart from "@/components/charts/swap-chart"
@@ -761,6 +761,12 @@ export default memo(function SystemDetail({ name }: { name: string }) {
</ChartCard>
)}
</div>
{/* Non-power GPU charts */}
{hasGpuData && (
<div className="grid xl:grid-cols-2 gap-4">
{/* GPU power draw chart */}
{hasGpuPowerData && (
<ChartCard
@@ -772,22 +778,16 @@ export default memo(function SystemDetail({ name }: { name: string }) {
<GpuPowerChart chartData={chartData} />
</ChartCard>
)}
{hasGpuEnginesData && (
<ChartCard
empty={dataEmpty}
grid={grid}
title={t`GPU Engines`}
description={t`Average utilization of GPU engines`}
>
<GpuEnginesChart chartData={chartData} />
</ChartCard>
)}
</div>
{/* GPU charts */}
{hasGpuData && (
<div className="grid xl:grid-cols-2 gap-4">
{hasGpuEnginesData && (
<ChartCard
empty={dataEmpty}
grid={grid}
title={t`GPU Engines`}
description={t`Average utilization of GPU engines`}
>
<GpuEnginesChart chartData={chartData} />
</ChartCard>
)}
{Object.keys(systemStats.at(-1)?.stats.g ?? {}).map((id) => {
const gpu = systemStats.at(-1)?.stats.g?.[id] as GPUData
return (
@@ -812,6 +812,8 @@ export default memo(function SystemDetail({ name }: { name: string }) {
contentFormatter={({ value }) => `${decimalString(value)}%`}
/>
</ChartCard>
{(gpu.mt ?? 0) > 0 && (
<ChartCard
empty={dataEmpty}
grid={grid}
@@ -839,7 +841,9 @@ export default memo(function SystemDetail({ name }: { name: string }) {
}}
/>
</ChartCard>
)}
</div>
)
})}
</div>
@@ -911,9 +915,18 @@ export default memo(function SystemDetail({ name }: { name: string }) {
})
function GpuEnginesChart({ chartData }: { chartData: ChartData }) {
const engineData = useGpuEngines(chartData.systemStats.at(-1))
const dataPoints = []
const engines = Object.keys(chartData.systemStats?.at(-1)?.stats.g?.[0]?.e ?? {}).sort()
for (const engine of engines) {
dataPoints.push({
label: engine,
dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0,
color: `hsl(${140 + ((engines.indexOf(engine) * 360) / engines.length) % 360}, 65%, 52%)`,
opacity: 0.35,
})
}
return (
<LineChartDefault legend={true} chartData={chartData} dataPoints={engineData} tickFormatter={(val) => `${toFixedFloat(val, 2)}%`} contentFormatter={({ value }) => `${decimalString(value)}%`} />
<LineChartDefault legend={true} chartData={chartData} dataPoints={dataPoints} tickFormatter={(val) => `${toFixedFloat(val, 2)}%`} contentFormatter={({ value }) => `${decimalString(value)}%`} />
)
}