From 6a406e520672ec6d1b69a181df4f520f900a6324 Mon Sep 17 00:00:00 2001 From: henrygd Date: Tue, 16 Sep 2025 16:37:44 -0400 Subject: [PATCH] intel_gpu_top testing --- agent/gpu.go | 54 ++++++++++++------- agent/gpu_intel.go | 53 ++++++++++++++++++ internal/entities/system/system.go | 15 +++--- internal/site/src/components/charts/hooks.ts | 14 ++++- .../site/src/components/routes/system.tsx | 22 +++++++- internal/site/src/types.d.ts | 2 + 6 files changed, 132 insertions(+), 28 deletions(-) create mode 100644 agent/gpu_intel.go diff --git a/agent/gpu.go b/agent/gpu.go index 5ba70adb..205819f7 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -27,13 +27,10 @@ const ( nvidiaSmiInterval string = "4" // in seconds tegraStatsInterval string = "3700" // in milliseconds rocmSmiInterval time.Duration = 4300 * time.Millisecond - // Command retry and timeout constants retryWaitTime time.Duration = 5 * time.Second maxFailureRetries int = 5 - cmdBufferSize uint16 = 10 * 1024 - // Unit Conversions mebibytesInAMegabyte float64 = 1.024 // nvidia-smi reports memory in MiB milliwattsInAWatt float64 = 1000.0 // tegrastats reports power in mW @@ -42,10 +39,11 @@ const ( // GPUManager manages data collection for GPUs (either Nvidia or AMD) type GPUManager struct { sync.Mutex - nvidiaSmi bool - rocmSmi bool - tegrastats bool - GpuDataMap map[string]*system.GPUData + nvidiaSmi bool + rocmSmi bool + tegrastats bool + intelGpuStats bool + GpuDataMap map[string]*system.GPUData } // RocmSmiJson represents the JSON structure of rocm-smi output @@ -66,6 +64,7 @@ type gpuCollector struct { cmdArgs []string parse func([]byte) bool // returns true if valid data was found buf []byte + bufSize uint16 } var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data @@ -99,7 +98,7 @@ func (c *gpuCollector) collect() error { scanner := bufio.NewScanner(stdout) if c.buf == nil { - c.buf = make([]byte, 0, cmdBufferSize) + c.buf = make([]byte, 0, c.bufSize) } scanner.Buffer(c.buf, 
bufio.MaxScanTokenSize) @@ -244,20 +243,24 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { // copy / reset the data gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap)) for id, gpu := range gm.GpuDataMap { - gpuAvg := *gpu + // avoid division by zero + count := max(gpu.Count, 1) + // average the data + gpuAvg := *gpu gpuAvg.Temperature = twoDecimals(gpu.Temperature) gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed) gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal) - - // avoid division by zero - if gpu.Count > 0 { - gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count) - gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count) + gpuAvg.Usage = twoDecimals(gpu.Usage / count) + gpuAvg.Power = twoDecimals(gpu.Power / count) + gpuAvg.Engines = make(map[string]float64, len(gpu.Engines)) + for name, engine := range gpu.Engines { + gpuAvg.Engines[name] = twoDecimals(engine / count) } - // reset accumulators in the original - gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0 + // reset accumulators in the original gpu data for next collection + gpu.Usage, gpu.Power, gpu.Count = gpuAvg.Usage, gpuAvg.Power, 1 + gpu.Engines = gpuAvg.Engines // append id to the name if there are multiple GPUs with the same name if nameCounts[gpu.Name] > 1 { @@ -284,18 +287,28 @@ func (gm *GPUManager) detectGPUs() error { gm.tegrastats = true gm.nvidiaSmi = false } - if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats { + if _, err := exec.LookPath(intelGpuStatsCmd); err == nil { + slog.Info("Intel GPU stats found") + gm.intelGpuStats = true + } + if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats { return nil } - return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats") + return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top") } // startCollector starts the appropriate GPU data collector based on the command func (gm *GPUManager) startCollector(command string) { collector := gpuCollector{ - name: command, + 
name: command, + bufSize: 10 * 1024, } switch command { + case intelGpuStatsCmd: + slog.Info("Starting Intel GPU stats collector") + collector.cmdArgs = []string{"-s", intelGpuStatsInterval, "-J"} + collector.parse = gm.parseIntelData + go collector.start() case nvidiaSmiCmd: collector.cmdArgs = []string{ "-l", nvidiaSmiInterval, @@ -344,6 +357,9 @@ func NewGPUManager() (*GPUManager, error) { if gm.tegrastats { gm.startCollector(tegraStatsCmd) } + if gm.intelGpuStats { + gm.startCollector(intelGpuStatsCmd) + } return &gm, nil } diff --git a/agent/gpu_intel.go b/agent/gpu_intel.go new file mode 100644 index 00000000..f4df97c8 --- /dev/null +++ b/agent/gpu_intel.go @@ -0,0 +1,53 @@ +package agent + +import ( + "encoding/json" + "log/slog" + + "github.com/henrygd/beszel/internal/entities/system" +) + +const ( + intelGpuStatsCmd string = "intel_gpu_top" + intelGpuStatsInterval string = "3800" // in milliseconds +) + +type intelGpuStats struct { + Power struct { + GPU float64 `json:"gpu"` + } `json:"power"` + Engines map[string]struct { + Busy float64 `json:"busy"` + } `json:"engines"` +} + +func (gm *GPUManager) parseIntelData(output []byte) bool { + slog.Info("Parsing Intel GPU stats") + var intelGpuStats intelGpuStats + if err := json.Unmarshal(output, &intelGpuStats); err != nil { + slog.Error("Error parsing Intel GPU stats", "err", err) + return false + } + gm.Lock() + defer gm.Unlock() + + // only one gpu for now - cmd doesn't provide all by default + gpuData, ok := gm.GpuDataMap["0"] + if !ok { + gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64, len(intelGpuStats.Engines))} + gm.GpuDataMap["0"] = gpuData + } + + if intelGpuStats.Power.GPU > 0 { + gpuData.Power += intelGpuStats.Power.GPU + } + + for name, engine := range intelGpuStats.Engines { + gpuData.Engines[name] += engine.Busy + } + + gpuData.Count++ + + slog.Info("GPU Data", "gpuData", gpuData) + return true +} diff --git a/internal/entities/system/system.go 
b/internal/entities/system/system.go index 47c2f0ab..7011b127 100644 --- a/internal/entities/system/system.go +++ b/internal/entities/system/system.go @@ -45,13 +45,14 @@ type Stats struct { } type GPUData struct { - Name string `json:"n" cbor:"0,keyasint"` - Temperature float64 `json:"-"` - MemoryUsed float64 `json:"mu,omitempty" cbor:"1,keyasint,omitempty"` - MemoryTotal float64 `json:"mt,omitempty" cbor:"2,keyasint,omitempty"` - Usage float64 `json:"u" cbor:"3,keyasint"` - Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"` - Count float64 `json:"-"` + Name string `json:"n" cbor:"0,keyasint"` + Temperature float64 `json:"-"` + MemoryUsed float64 `json:"mu,omitempty" cbor:"1,keyasint,omitempty"` + MemoryTotal float64 `json:"mt,omitempty" cbor:"2,keyasint,omitempty"` + Usage float64 `json:"u" cbor:"3,keyasint"` + Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"` + Count float64 `json:"-"` + Engines map[string]float64 `json:"e,omitempty" cbor:"5,keyasint,omitempty"` } type FsStats struct { diff --git a/internal/site/src/components/charts/hooks.ts b/internal/site/src/components/charts/hooks.ts index a5946026..a72d9639 100644 --- a/internal/site/src/components/charts/hooks.ts +++ b/internal/site/src/components/charts/hooks.ts @@ -115,7 +115,7 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) { data: (index = 3) => { return sortedKeys.map((key) => ({ label: key, - dataKey: (stats: SystemStatsRecord) => stats.stats?.ni?.[key]?.[index], + dataKey: ({ stats }: SystemStatsRecord) => stats?.ni?.[key]?.[index], color: `hsl(${220 + (((sortedKeys.indexOf(key) * 360) / sortedKeys.length) % 360)}, 70%, 50%)`, opacity: 0.3, @@ -123,3 +123,15 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) { }, } } + +/** Generates chart configurations for GPU engines */ +export function useGpuEngines(systemStats?: SystemStatsRecord) { + const keys = Object.keys(systemStats?.stats.g?.[0]?.e ?? 
{}) + const sortedKeys = keys.sort() + return sortedKeys.map((engine) => ({ + label: engine, + dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0, + color: `hsl(${220 + ((sortedKeys.indexOf(engine) * 360) / sortedKeys.length) % 360}, 65%, 52%)`, + opacity: 0.35, + })) +} diff --git a/internal/site/src/components/routes/system.tsx b/internal/site/src/components/routes/system.tsx index 345c7f57..0a47fadf 100644 --- a/internal/site/src/components/routes/system.tsx +++ b/internal/site/src/components/routes/system.tsx @@ -18,7 +18,7 @@ import AreaChartDefault from "@/components/charts/area-chart" import ContainerChart from "@/components/charts/container-chart" import DiskChart from "@/components/charts/disk-chart" import GpuPowerChart from "@/components/charts/gpu-power-chart" -import { useContainerChartConfigs } from "@/components/charts/hooks" +import { useContainerChartConfigs, useGpuEngines } from "@/components/charts/hooks" import LoadAverageChart from "@/components/charts/load-average-chart" import MemChart from "@/components/charts/mem-chart" import SwapChart from "@/components/charts/swap-chart" @@ -61,6 +61,7 @@ import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from ". import { Separator } from "../ui/separator" import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "../ui/tooltip" import NetworkSheet from "./system/network-sheet" +import LineChartDefault from "../charts/line-chart" type ChartTimeData = { time: number @@ -398,6 +399,7 @@ export default memo(function SystemDetail({ name }: { name: string }) { const lastGpuVals = Object.values(systemStats.at(-1)?.stats.g ?? 
{}) const hasGpuData = lastGpuVals.length > 0 const hasGpuPowerData = lastGpuVals.some((gpu) => gpu.p !== undefined) + const hasGpuEnginesData = lastGpuVals.some((gpu) => gpu.e !== undefined) let translatedStatus: string = system.status if (system.status === SystemStatus.Up) { @@ -770,6 +772,17 @@ export default memo(function SystemDetail({ name }: { name: string }) { )} + + {hasGpuEnginesData && ( + + + + )} {/* GPU charts */} @@ -897,6 +910,13 @@ export default memo(function SystemDetail({ name }: { name: string }) { ) }) +function GpuEnginesChart({ chartData }: { chartData: ChartData }) { + const engineData = useGpuEngines(chartData.systemStats.at(-1)) + return ( + `${toFixedFloat(val, 2)}%`} contentFormatter={({ value }) => `${decimalString(value)}%`} /> + ) +} + function FilterBar({ store = $containerFilter }: { store?: typeof $containerFilter }) { const containerFilter = useStore(store) const { t } = useLingui() diff --git a/internal/site/src/types.d.ts b/internal/site/src/types.d.ts index 5d8d25f7..4925ebbf 100644 --- a/internal/site/src/types.d.ts +++ b/internal/site/src/types.d.ts @@ -158,6 +158,8 @@ export interface GPUData { u: number /** power (w) */ p?: number + /** engines */ + e?: Record<string, number> } export interface ExtraFsStats {