fix: GPU ID collision between Intel and NVIDIA collectors (#1522)

- Prefix the Intel GPU ID as i0 to avoid colliding with NVML/NVIDIA index IDs like 0
- Update frontend GPU engines chart to find the GPU that reports engine data
and select it by id instead of assuming g[0]
- Adjust tests to use the new Intel GPU id
This commit is contained in:
henrygd
2026-01-12 17:27:35 -05:00
parent 00def272b0
commit 9ad3cd0ab9
4 changed files with 38 additions and 30 deletions

View File

@@ -237,10 +237,11 @@ func (gm *GPUManager) parseAmdData(output []byte) bool {
totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64) totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64)
usage, _ := strconv.ParseFloat(v.Usage, 64) usage, _ := strconv.ParseFloat(v.Usage, 64)
if _, ok := gm.GpuDataMap[v.ID]; !ok { id := v.ID
gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name} if _, ok := gm.GpuDataMap[id]; !ok {
gm.GpuDataMap[id] = &system.GPUData{Name: v.Name}
} }
gpu := gm.GpuDataMap[v.ID] gpu := gm.GpuDataMap[id]
gpu.Temperature, _ = strconv.ParseFloat(v.Temperature, 64) gpu.Temperature, _ = strconv.ParseFloat(v.Temperature, 64)
gpu.MemoryUsed = bytesToMegabytes(memoryUsage) gpu.MemoryUsed = bytesToMegabytes(memoryUsage)
gpu.MemoryTotal = bytesToMegabytes(totalMemory) gpu.MemoryTotal = bytesToMegabytes(totalMemory)

View File

@@ -27,10 +27,11 @@ func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool {
defer gm.Unlock() defer gm.Unlock()
// only one gpu for now - cmd doesn't provide all by default // only one gpu for now - cmd doesn't provide all by default
gpuData, ok := gm.GpuDataMap["0"] id := "i0" // prefix with i to avoid conflicts with nvidia card ids
gpuData, ok := gm.GpuDataMap[id]
if !ok { if !ok {
gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)} gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)}
gm.GpuDataMap["0"] = gpuData gm.GpuDataMap[id] = gpuData
} }
gpuData.Power += sample.PowerGPU gpuData.Power += sample.PowerGPU

View File

@@ -1385,7 +1385,7 @@ func TestIntelUpdateFromStats(t *testing.T) {
ok := gm.updateIntelFromStats(&sample1) ok := gm.updateIntelFromStats(&sample1)
assert.True(t, ok) assert.True(t, ok)
gpu := gm.GpuDataMap["0"] gpu := gm.GpuDataMap["i0"]
require.NotNil(t, gpu) require.NotNil(t, gpu)
assert.Equal(t, "GPU", gpu.Name) assert.Equal(t, "GPU", gpu.Name)
assert.EqualValues(t, 10.5, gpu.Power) assert.EqualValues(t, 10.5, gpu.Power)
@@ -1407,7 +1407,7 @@ func TestIntelUpdateFromStats(t *testing.T) {
ok = gm.updateIntelFromStats(&sample2) ok = gm.updateIntelFromStats(&sample2)
assert.True(t, ok) assert.True(t, ok)
gpu = gm.GpuDataMap["0"] gpu = gm.GpuDataMap["i0"]
require.NotNil(t, gpu) require.NotNil(t, gpu)
assert.EqualValues(t, 10.5, gpu.Power) assert.EqualValues(t, 10.5, gpu.Power)
assert.EqualValues(t, 30.0, gpu.Engines["Render/3D"]) // 20 + 10 assert.EqualValues(t, 30.0, gpu.Engines["Render/3D"]) // 20 + 10
@@ -1446,7 +1446,7 @@ echo "298 295 278 51 2.20 3.12 1675 942 5.75 1 2 9.50
t.Fatalf("collectIntelStats error: %v", err) t.Fatalf("collectIntelStats error: %v", err)
} }
gpu := gm.GpuDataMap["0"] gpu := gm.GpuDataMap["i0"]
require.NotNil(t, gpu) require.NotNil(t, gpu)
// Power should be sum of samples 2-4 (first is skipped): 2.0 + 1.8 + 2.2 = 6.0 // Power should be sum of samples 2-4 (first is skipped): 2.0 + 1.8 + 2.2 = 6.0
assert.EqualValues(t, 6.0, gpu.Power) assert.EqualValues(t, 6.0, gpu.Power)

View File

@@ -409,26 +409,18 @@ export default memo(function SystemDetail({ id }: { id: string }) {
if (lastGpus) { if (lastGpus) {
// check if there are any GPUs at all // check if there are any GPUs at all
hasGpuData = Object.keys(lastGpus).length > 0 hasGpuData = Object.keys(lastGpus).length > 0
// check if there are any GPUs with engines // check if there are any GPUs with engines or power data
for (let i = 0; i < systemStats.length && !hasGpuEnginesData; i++) { for (let i = 0; i < systemStats.length && (!hasGpuEnginesData || !hasGpuPowerData); i++) {
const gpus = systemStats[i].stats?.g const gpus = systemStats[i].stats?.g
if (!gpus) continue if (!gpus) continue
for (const id in gpus) { for (const id in gpus) {
if (gpus[id].e !== undefined) { if (!hasGpuEnginesData && gpus[id].e !== undefined) {
hasGpuEnginesData = true hasGpuEnginesData = true
break
} }
} if (!hasGpuPowerData && (gpus[id].p !== undefined || gpus[id].pp !== undefined)) {
}
// check if there are any GPUs with power data
for (let i = 0; i < systemStats.length && !hasGpuPowerData; i++) {
const gpus = systemStats[i].stats?.g
if (!gpus) continue
for (const id in gpus) {
if (gpus[id].p !== undefined || gpus[id].pp !== undefined) {
hasGpuPowerData = true hasGpuPowerData = true
break
} }
if (hasGpuEnginesData && hasGpuPowerData) break
} }
} }
} }
@@ -896,16 +888,30 @@ export default memo(function SystemDetail({ id }: { id: string }) {
}) })
function GpuEnginesChart({ chartData }: { chartData: ChartData }) { function GpuEnginesChart({ chartData }: { chartData: ChartData }) {
const dataPoints: DataPoint[] = [] const { gpuId, engines } = useMemo(() => {
const engines = Object.keys(chartData.systemStats?.at(-1)?.stats.g?.[0]?.e ?? {}).sort() for (let i = chartData.systemStats.length - 1; i >= 0; i--) {
for (const engine of engines) { const gpus = chartData.systemStats[i].stats?.g
dataPoints.push({ if (!gpus) continue
label: engine, for (const id in gpus) {
dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0, if (gpus[id].e) {
color: `hsl(${140 + (((engines.indexOf(engine) * 360) / engines.length) % 360)}, 65%, 52%)`, return { gpuId: id, engines: Object.keys(gpus[id].e).sort() }
opacity: 0.35, }
}) }
}
return { gpuId: null, engines: [] }
}, [chartData.systemStats])
if (!gpuId) {
return null
} }
const dataPoints: DataPoint[] = engines.map((engine, i) => ({
label: engine,
dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[gpuId]?.e?.[engine] ?? 0,
color: `hsl(${140 + (((i * 360) / engines.length) % 360)}, 65%, 52%)`,
opacity: 0.35,
}))
return ( return (
<LineChartDefault <LineChartDefault
legend={true} legend={true}