From 960cac40600d6b42a04a2c287b78a9fa44da6c5d Mon Sep 17 00:00:00 2001 From: henrygd Date: Thu, 25 Sep 2025 19:15:36 -0400 Subject: [PATCH] fix `intel_gpu_top` restart loop and add intel gpu pkg power (#1203) --- agent/gpu.go | 3 +- agent/gpu_intel.go | 48 +++++++--- agent/gpu_test.go | 31 ++++--- internal/entities/system/system.go | 1 + .../src/components/charts/gpu-power-chart.tsx | 88 +++++++++++-------- .../site/src/components/routes/system.tsx | 2 +- internal/site/src/types.d.ts | 2 + 7 files changed, 109 insertions(+), 66 deletions(-) diff --git a/agent/gpu.go b/agent/gpu.go index af965dcc..d6862fc8 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -258,6 +258,7 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { gpuAvg.Engines[name] = twoDecimals(engine / count) maxEngineUsage = max(maxEngineUsage, engine/count) } + gpuAvg.PowerPkg = twoDecimals(gpu.PowerPkg / count) gpuAvg.Usage = twoDecimals(maxEngineUsage) } else { gpuAvg.Usage = twoDecimals(gpu.Usage / count) @@ -266,7 +267,7 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { } // reset accumulators in the original gpu data for next collection - gpu.Usage, gpu.Power, gpu.Count = gpuAvg.Usage, gpuAvg.Power, 1 + gpu.Usage, gpu.Power, gpu.PowerPkg, gpu.Count = gpuAvg.Usage, gpuAvg.Power, gpuAvg.PowerPkg, 1 gpu.Engines = gpuAvg.Engines // append id to the name if there are multiple GPUs with the same name diff --git a/agent/gpu_intel.go b/agent/gpu_intel.go index 172b0759..2a718a74 100644 --- a/agent/gpu_intel.go +++ b/agent/gpu_intel.go @@ -17,6 +17,7 @@ const ( type intelGpuStats struct { PowerGPU float64 + PowerPkg float64 Engines map[string]float64 } @@ -33,6 +34,7 @@ func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool { } gpuData.Power += sample.PowerGPU + gpuData.PowerPkg += sample.PowerPkg if gpuData.Engines == nil { gpuData.Engines = make(map[string]float64, len(sample.Engines)) @@ -46,7 +48,7 @@ func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool { } // collectIntelStats executes intel_gpu_top in text mode (-l) and parses the output -func (gm *GPUManager) collectIntelStats() error { +func (gm *GPUManager) collectIntelStats() (err error) { cmd := exec.Command(intelGpuStatsCmd, "-s", intelGpuStatsInterval, "-l") // Avoid blocking if intel_gpu_top writes to stderr cmd.Stderr = io.Discard @@ -58,23 +60,28 @@ func (gm *GPUManager) collectIntelStats() error { return err } - // Ensure we always reap the child to avoid zombies on any return path. + // Ensure we always reap the child to avoid zombies on any return path and + // propagate a non-zero exit code if no other error was set. defer func() { // Best-effort close of the pipe (unblock the child if it writes) _ = stdout.Close() if cmd.ProcessState == nil || !cmd.ProcessState.Exited() { _ = cmd.Process.Kill() } - _ = cmd.Wait() + if waitErr := cmd.Wait(); err == nil && waitErr != nil { + err = waitErr + } }() scanner := bufio.NewScanner(stdout) var header1 string - var header2 string var engineNames []string var friendlyNames []string var preEngineCols int var powerIndex int + var hadDataRow bool + // skip first data row because it sometimes has erroneous data + var skippedFirstDataRow bool for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) @@ -83,24 +90,34 @@ func (gm *GPUManager) collectIntelStats() error { } // first header line - if header1 == "" { + if strings.HasPrefix(line, "Freq") { header1 = line continue } // second header line - if header2 == "" { + if strings.HasPrefix(line, "req") { engineNames, friendlyNames, powerIndex, preEngineCols = gm.parseIntelHeaders(header1, line) - header1, header2 = "x", "x" // don't need these anymore continue } // Data row - sample := gm.parseIntelData(line, engineNames, friendlyNames, powerIndex, preEngineCols) + if !skippedFirstDataRow { + skippedFirstDataRow = true + continue + } + sample, err := gm.parseIntelData(line, engineNames, friendlyNames, powerIndex, preEngineCols) + if err != nil { + return err + } + hadDataRow = true gm.updateIntelFromStats(&sample) } - if err := scanner.Err(); err != nil { - return err + if scanErr := scanner.Err(); scanErr != nil { + return scanErr + } + if !hadDataRow { + return errNoValidData } return nil } @@ -145,19 +162,22 @@ func (gm *GPUManager) parseIntelHeaders(header1 string, header2 string) (engineN return engineNames, friendlyNames, powerIndex, preEngineCols } -func (gm *GPUManager) parseIntelData(line string, engineNames []string, friendlyNames []string, powerIndex int, preEngineCols int) (sample intelGpuStats) { +func (gm *GPUManager) parseIntelData(line string, engineNames []string, friendlyNames []string, powerIndex int, preEngineCols int) (sample intelGpuStats, err error) { fields := strings.Fields(line) if len(fields) == 0 { - return sample + return sample, errNoValidData } // Make sure row has enough columns for engines if need := preEngineCols + 3*len(engineNames); len(fields) < need { - return sample + return sample, errNoValidData } if powerIndex >= 0 && powerIndex < len(fields) { if v, perr := strconv.ParseFloat(fields[powerIndex], 64); perr == nil { sample.PowerGPU = v } + if v, perr := strconv.ParseFloat(fields[powerIndex+1], 64); perr == nil { + sample.PowerPkg = v + } } if len(engineNames) > 0 { sample.Engines = make(map[string]float64, len(engineNames)) @@ -175,5 +195,5 @@ func (gm *GPUManager) parseIntelData(line string, engineNames []string, friendly } } } - return sample + return sample, nil } diff --git a/agent/gpu_test.go b/agent/gpu_test.go index 1b5f129c..92ec531a 100644 --- a/agent/gpu_test.go +++ b/agent/gpu_test.go @@ -849,13 +849,15 @@ func TestIntelCollectorStreaming(t *testing.T) { dir := t.TempDir() os.Setenv("PATH", dir) - // Create a fake intel_gpu_top that prints -l format with two samples and exits + // Create a fake intel_gpu_top that prints -l format with four samples (first will be skipped) and exits scriptPath := filepath.Join(dir, "intel_gpu_top") script := `#!/bin/sh echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS BCS VCS" echo " req act /s % gpu pkg rd wr % se wa % se wa % se wa" echo "373 373 224 45 1.50 4.13 2554 714 12.34 0 0 0.00 0 0 5.00 0 0" -echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0 0.00 0 0"` +echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0 0.00 0 0" +echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0 22.00 0 1" +echo "298 295 278 51 2.20 3.12 1675 942 5.75 1 2 9.50 3 1 12.00 1 0"` if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil { t.Fatal(err) } @@ -864,21 +866,22 @@ echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 GpuDataMap: make(map[string]*system.GPUData), } - // Run the collector once; it should read two samples and return + // Run the collector once; it should read four samples but skip the first and return if err := gm.collectIntelStats(); err != nil { t.Fatalf("collectIntelStats error: %v", err) } gpu := gm.GpuDataMap["0"] require.NotNil(t, gpu) - // Power should be sum of samples: 1.5 + 2.0 = 3.5 - assert.EqualValues(t, 3.5, gpu.Power) - // Engines aggregated - assert.EqualValues(t, 12.34, gpu.Engines["Render/3D"]) - assert.EqualValues(t, 5.0, gpu.Engines["Video"]) - assert.EqualValues(t, 0.0, gpu.Engines["Blitter"]) // BCS is zero in both samples - // Count should be 2 samples - assert.Equal(t, float64(2), gpu.Count) + // Power should be sum of samples 2-4 (first is skipped): 2.0 + 1.8 + 2.2 = 6.0 + assert.EqualValues(t, 6.0, gpu.Power) + assert.InDelta(t, 8.26, gpu.PowerPkg, 0.01) // Allow small floating point differences + // Engines aggregated from samples 2-4 + assert.EqualValues(t, 14.25, gpu.Engines["Render/3D"]) // 0.00 + 8.50 + 5.75 + assert.EqualValues(t, 34.0, gpu.Engines["Video"]) // 0.00 + 22.00 + 12.00 + assert.EqualValues(t, 24.5, gpu.Engines["Blitter"]) // 0.00 + 15.00 + 9.50 + // Count should be 3 samples (first is skipped) + assert.Equal(t, float64(3), gpu.Count) } func TestParseIntelHeaders(t *testing.T) { @@ -970,6 +973,7 @@ func TestParseIntelData(t *testing.T) { preEngineCols int wantPowerGPU float64 wantEngines map[string]float64 + wantErr error }{ { name: "basic data with power and engines", @@ -1022,6 +1026,7 @@ func TestParseIntelData(t *testing.T) { preEngineCols: 8, wantPowerGPU: 0.0, wantEngines: nil, // empty sample returned + wantErr: errNoValidData, }, { name: "empty line", @@ -1032,6 +1037,7 @@ func TestParseIntelData(t *testing.T) { preEngineCols: 8, wantPowerGPU: 0.0, wantEngines: nil, + wantErr: errNoValidData, }, { name: "data with invalid power value", @@ -1076,7 +1082,8 @@ func TestParseIntelData(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { gm := &GPUManager{} - sample := gm.parseIntelData(tt.line, tt.engineNames, tt.friendlyNames, tt.powerIndex, tt.preEngineCols) + sample, err := gm.parseIntelData(tt.line, tt.engineNames, tt.friendlyNames, tt.powerIndex, tt.preEngineCols) + assert.Equal(t, tt.wantErr, err) assert.Equal(t, tt.wantPowerGPU, sample.PowerGPU) assert.Equal(t, tt.wantEngines, sample.Engines) diff --git a/internal/entities/system/system.go b/internal/entities/system/system.go index 16d96004..c04e0404 100644 --- a/internal/entities/system/system.go +++ b/internal/entities/system/system.go @@ -53,6 +53,7 @@ type GPUData struct { Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"` Count float64 `json:"-"` Engines map[string]float64 `json:"e,omitempty" cbor:"5,keyasint,omitempty"` + PowerPkg float64 `json:"pp,omitempty" cbor:"6,keyasint,omitempty"` } type FsStats struct { diff --git a/internal/site/src/components/charts/gpu-power-chart.tsx b/internal/site/src/components/charts/gpu-power-chart.tsx index d5bff44a..c09287e2 100644 --- a/internal/site/src/components/charts/gpu-power-chart.tsx +++ b/internal/site/src/components/charts/gpu-power-chart.tsx @@ -9,46 +9,58 @@ import { xAxis, } from "@/components/ui/chart" import { chartMargin, cn, decimalString, formatShortDate, toFixedFloat } from "@/lib/utils" -import type { ChartData } from "@/types" +import type { ChartData, GPUData } from "@/types" import { useYAxisWidth } from "./hooks" +import type { DataPoint } from "./line-chart" export default memo(function GpuPowerChart({ chartData }: { chartData: ChartData }) { const { yAxisWidth, updateYAxisWidth } = useYAxisWidth() + const packageKey = " package" + + const { gpuData, dataPoints } = useMemo(() => { + const dataPoints = [] as DataPoint[] + const gpuData = [] as Record[] + const addedKeys = new Map() + + const addKey = (key: string, value: number) => { + addedKeys.set(key, (addedKeys.get(key) ?? 0) + value) + } + + for (const stats of chartData.systemStats) { + const gpus = stats.stats?.g ?? {} + const data = { created: stats.created } as Record + for (const id in gpus) { + const gpu = gpus[id] as GPUData + data[gpu.n] = gpu + addKey(gpu.n, gpu.p ?? 0) + if (gpu.pp) { + data[`${gpu.n}${packageKey}`] = gpu + addKey(`${gpu.n}${packageKey}`, gpu.pp ?? 0) + } + } + gpuData.push(data) + } + const sortedKeys = Array.from(addedKeys.entries()) + .sort(([, a], [, b]) => b - a) + .map(([key]) => key) + + for (let i = 0; i < sortedKeys.length; i++) { + const id = sortedKeys[i] + dataPoints.push({ + label: id, + dataKey: (gpuData: Record) => { + return id.endsWith(packageKey) ? (gpuData[id]?.pp ?? 0) : (gpuData[id]?.p ?? 0) + }, + color: `hsl(${226 + (((i * 360) / addedKeys.size) % 360)}, 65%, 52%)`, + }) + } + return { gpuData, dataPoints } + }, [chartData]) if (chartData.systemStats.length === 0) { return null } - /** Format temperature data for chart and assign colors */ - const newChartData = useMemo(() => { - const newChartData = { data: [], colors: {} } as { - data: Record[] - colors: Record - } - const powerSums = {} as Record - for (const data of chartData.systemStats) { - const newData = { created: data.created } as Record - - for (const gpu of Object.values(data.stats?.g ?? {})) { - if (gpu.p) { - const name = gpu.n - newData[name] = gpu.p - powerSums[name] = (powerSums[name] ?? 0) + newData[name] - } - } - newChartData.data.push(newData) - } - const keys = Object.keys(powerSums).sort((a, b) => powerSums[b] - powerSums[a]) - for (const key of keys) { - newChartData.colors[key] = `hsl(${(226 + (keys.indexOf(key) * 360) / keys.length) % 360}, 60%, 55%)` - } - return newChartData - }, [chartData]) - - const colors = Object.keys(newChartData.colors) - - // console.log('rendered at', new Date()) - return (
- + } /> - {colors.map((key) => ( + {dataPoints.map((dataPoint) => ( ))} - {colors.length > 1 && } />} + {dataPoints.length > 1 && } />}
diff --git a/internal/site/src/components/routes/system.tsx b/internal/site/src/components/routes/system.tsx index 889cf7c0..671871b8 100644 --- a/internal/site/src/components/routes/system.tsx +++ b/internal/site/src/components/routes/system.tsx @@ -398,7 +398,7 @@ export default memo(function SystemDetail({ name }: { name: string }) { const dataEmpty = !chartLoading && chartData.systemStats.length === 0 const lastGpuVals = Object.values(systemStats.at(-1)?.stats.g ?? {}) const hasGpuData = lastGpuVals.length > 0 - const hasGpuPowerData = lastGpuVals.some((gpu) => gpu.p !== undefined) + const hasGpuPowerData = lastGpuVals.some((gpu) => gpu.p !== undefined || gpu.pp !== undefined) const hasGpuEnginesData = lastGpuVals.some((gpu) => gpu.e !== undefined) let translatedStatus: string = system.status diff --git a/internal/site/src/types.d.ts b/internal/site/src/types.d.ts index 4925ebbf..8656fb81 100644 --- a/internal/site/src/types.d.ts +++ b/internal/site/src/types.d.ts @@ -158,6 +158,8 @@ export interface GPUData { u: number /** power (w) */ p?: number + /** power package (w) */ + pp?: number /** engines */ e?: Record }