From 9ad3cd0ab9ac20176bd704e17537d18285c14107 Mon Sep 17 00:00:00 2001 From: henrygd Date: Mon, 12 Jan 2026 17:27:35 -0500 Subject: [PATCH] fix: GPU ID collision between Intel and NVIDIA collectors (#1522) - Prefix Intel GPU ID as i0 to avoid NVML/NVIDIA index IDs like 0 - Update frontend GPU engines chart to select a GPU by id instead of assuming g[0] - Adjust tests to use the new Intel GPU id --- agent/gpu.go | 7 +-- agent/gpu_intel.go | 5 +- agent/gpu_test.go | 6 +-- .../site/src/components/routes/system.tsx | 50 +++++++++++-------- 4 files changed, 38 insertions(+), 30 deletions(-) diff --git a/agent/gpu.go b/agent/gpu.go index 80e8bae4..b2efca49 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -237,10 +237,11 @@ func (gm *GPUManager) parseAmdData(output []byte) bool { totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64) usage, _ := strconv.ParseFloat(v.Usage, 64) - if _, ok := gm.GpuDataMap[v.ID]; !ok { - gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name} + id := v.ID + if _, ok := gm.GpuDataMap[id]; !ok { + gm.GpuDataMap[id] = &system.GPUData{Name: v.Name} } - gpu := gm.GpuDataMap[v.ID] + gpu := gm.GpuDataMap[id] gpu.Temperature, _ = strconv.ParseFloat(v.Temperature, 64) gpu.MemoryUsed = bytesToMegabytes(memoryUsage) gpu.MemoryTotal = bytesToMegabytes(totalMemory) diff --git a/agent/gpu_intel.go b/agent/gpu_intel.go index 2d5f9370..1eaeb11d 100644 --- a/agent/gpu_intel.go +++ b/agent/gpu_intel.go @@ -27,10 +27,11 @@ func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool { defer gm.Unlock() // only one gpu for now - cmd doesn't provide all by default - gpuData, ok := gm.GpuDataMap["0"] + id := "i0" // prefix with i to avoid conflicts with nvidia card ids + gpuData, ok := gm.GpuDataMap[id] if !ok { gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)} - gm.GpuDataMap["0"] = gpuData + gm.GpuDataMap[id] = gpuData } gpuData.Power += sample.PowerGPU diff --git a/agent/gpu_test.go b/agent/gpu_test.go index 65520fb2..cfcab53d 100644 --- a/agent/gpu_test.go +++ b/agent/gpu_test.go @@ -1385,7 +1385,7 @@ func TestIntelUpdateFromStats(t *testing.T) { ok := gm.updateIntelFromStats(&sample1) assert.True(t, ok) - gpu := gm.GpuDataMap["0"] + gpu := gm.GpuDataMap["i0"] require.NotNil(t, gpu) assert.Equal(t, "GPU", gpu.Name) assert.EqualValues(t, 10.5, gpu.Power) @@ -1407,7 +1407,7 @@ func TestIntelUpdateFromStats(t *testing.T) { ok = gm.updateIntelFromStats(&sample2) assert.True(t, ok) - gpu = gm.GpuDataMap["0"] + gpu = gm.GpuDataMap["i0"] require.NotNil(t, gpu) assert.EqualValues(t, 10.5, gpu.Power) assert.EqualValues(t, 30.0, gpu.Engines["Render/3D"]) // 20 + 10 @@ -1446,7 +1446,7 @@ echo "298 295 278 51 2.20 3.12 1675 942 5.75 1 2 9.50 t.Fatalf("collectIntelStats error: %v", err) } - gpu := gm.GpuDataMap["0"] + gpu := gm.GpuDataMap["i0"] require.NotNil(t, gpu) // Power should be sum of samples 2-4 (first is skipped): 2.0 + 1.8 + 2.2 = 6.0 assert.EqualValues(t, 6.0, gpu.Power) diff --git a/internal/site/src/components/routes/system.tsx b/internal/site/src/components/routes/system.tsx index 6242864f..7b183148 100644 --- a/internal/site/src/components/routes/system.tsx +++ b/internal/site/src/components/routes/system.tsx @@ -409,26 +409,18 @@ export default memo(function SystemDetail({ id }: { id: string }) { if (lastGpus) { // check if there are any GPUs at all hasGpuData = Object.keys(lastGpus).length > 0 - // check if there are any GPUs with engines - for (let i = 0; i < systemStats.length && !hasGpuEnginesData; i++) { + // check if there are any GPUs with engines or power data + for (let i = 0; i < systemStats.length && (!hasGpuEnginesData || !hasGpuPowerData); i++) { const gpus = systemStats[i].stats?.g if (!gpus) continue for (const id in gpus) { - if (gpus[id].e !== undefined) { + if (!hasGpuEnginesData && gpus[id].e !== undefined) { hasGpuEnginesData = true - break } - } - } - // check if there are any GPUs with power data - for (let i = 0; i < systemStats.length && !hasGpuPowerData; i++) { - const gpus = systemStats[i].stats?.g - if (!gpus) continue - for (const id in gpus) { - if (gpus[id].p !== undefined || gpus[id].pp !== undefined) { + if (!hasGpuPowerData && (gpus[id].p !== undefined || gpus[id].pp !== undefined)) { hasGpuPowerData = true - break } + if (hasGpuEnginesData && hasGpuPowerData) break } } } @@ -896,16 +888,30 @@ export default memo(function SystemDetail({ id }: { id: string }) { }) function GpuEnginesChart({ chartData }: { chartData: ChartData }) { - const dataPoints: DataPoint[] = [] - const engines = Object.keys(chartData.systemStats?.at(-1)?.stats.g?.[0]?.e ?? {}).sort() - for (const engine of engines) { - dataPoints.push({ - label: engine, - dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0, - color: `hsl(${140 + (((engines.indexOf(engine) * 360) / engines.length) % 360)}, 65%, 52%)`, - opacity: 0.35, - }) + const { gpuId, engines } = useMemo(() => { + for (let i = chartData.systemStats.length - 1; i >= 0; i--) { + const gpus = chartData.systemStats[i].stats?.g + if (!gpus) continue + for (const id in gpus) { + if (gpus[id].e) { + return { gpuId: id, engines: Object.keys(gpus[id].e).sort() } + } + } + } + return { gpuId: null, engines: [] } + }, [chartData.systemStats]) + + if (!gpuId) { + return null } + + const dataPoints: DataPoint[] = engines.map((engine, i) => ({ + label: engine, + dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[gpuId]?.e?.[engine] ?? 0, + color: `hsl(${140 + (((i * 360) / engines.length) % 360)}, 65%, 52%)`, + opacity: 0.35, + })) + return (