This commit is contained in:
henrygd
2025-12-21 16:44:20 -05:00
parent ea354ec030
commit 5f74438bd7
2 changed files with 27 additions and 14 deletions

View File

@@ -298,8 +298,13 @@ func (gm *GPUManager) calculateGPUAverage(id string, gpu *system.GPUData, cacheK
currentCount := uint32(gpu.Count)
deltaCount := gm.calculateDeltaCount(currentCount, lastSnapshot)
// If no new data arrived, use last known average
// If no new data arrived
if deltaCount == 0 {
// If GPU appears suspended (instantaneous values are 0), return zero values
// Otherwise return last known average for temporary collection gaps
if gpu.Temperature == 0 && gpu.MemoryUsed == 0 {
return system.GPUData{Name: gpu.Name}
}
return gm.lastAvgData[id] // zero value if not found
}

View File

@@ -143,21 +143,39 @@ func (c *nvmlCollector) collect() {
defer c.gm.Unlock()
for i, device := range c.devices {
id := fmt.Sprintf("%d", i)
bdf := c.bdfs[i]
// Update GPUDataMap
if _, ok := c.gm.GpuDataMap[id]; !ok {
var nameBuf [64]byte
if ret := nvmlDeviceGetName(device, &nameBuf[0], 64); ret != nvmlReturn(nvmlSuccess) {
continue
}
name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")])
name = strings.TrimPrefix(name, "NVIDIA ")
c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
}
gpu := c.gm.GpuDataMap[id]
if bdf != "" && !c.isGPUActive(bdf) {
slog.Info("NVML: GPU is suspended, skipping", "bdf", bdf)
gpu.Temperature = 0
gpu.MemoryUsed = 0
continue
}
slog.Info("NVML: Collecting data for GPU", "bdf", bdf)
id := fmt.Sprintf("%d", i)
// Utilization
var utilization nvmlUtilization
if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) {
slog.Info("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret)
gpu.Temperature = 0
gpu.MemoryUsed = 0
continue
}
slog.Info("NVML: Collecting data for GPU", "bdf", bdf)
// Temperature
var temp uint32
nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU
@@ -181,16 +199,6 @@ func (c *nvmlCollector) collect() {
var power uint32
nvmlDeviceGetPowerUsage(device, &power)
// Update GPUDataMap
if _, ok := c.gm.GpuDataMap[id]; !ok {
var nameBuf [64]byte
nvmlDeviceGetName(device, &nameBuf[0], 64)
name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")])
name = strings.TrimPrefix(name, "NVIDIA ")
c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
}
gpu := c.gm.GpuDataMap[id]
gpu.Temperature = float64(temp)
gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte
gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte