From 5f74438bd7025dff36a96354cffe7ca3f0ef8584 Mon Sep 17 00:00:00 2001 From: henrygd Date: Sun, 21 Dec 2025 16:44:20 -0500 Subject: [PATCH] update --- agent/gpu.go | 7 ++++++- agent/gpu_nvml.go | 34 +++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/agent/gpu.go b/agent/gpu.go index 96a5a41e..e9c7c54b 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -298,8 +298,13 @@ func (gm *GPUManager) calculateGPUAverage(id string, gpu *system.GPUData, cacheK currentCount := uint32(gpu.Count) deltaCount := gm.calculateDeltaCount(currentCount, lastSnapshot) - // If no new data arrived, use last known average + // If no new data arrived if deltaCount == 0 { + // If GPU appears suspended (instantaneous values are 0), return zero values + // Otherwise return last known average for temporary collection gaps + if gpu.Temperature == 0 && gpu.MemoryUsed == 0 { + return system.GPUData{Name: gpu.Name} + } return gm.lastAvgData[id] // zero value if not found } diff --git a/agent/gpu_nvml.go b/agent/gpu_nvml.go index 5917a522..b9fb951a 100644 --- a/agent/gpu_nvml.go +++ b/agent/gpu_nvml.go @@ -143,21 +143,39 @@ func (c *nvmlCollector) collect() { defer c.gm.Unlock() for i, device := range c.devices { + id := fmt.Sprintf("%d", i) bdf := c.bdfs[i] + + // Update GPUDataMap + if _, ok := c.gm.GpuDataMap[id]; !ok { + var nameBuf [64]byte + if ret := nvmlDeviceGetName(device, &nameBuf[0], 64); ret != nvmlReturn(nvmlSuccess) { + continue + } + name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")]) + name = strings.TrimPrefix(name, "NVIDIA ") + c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} + } + gpu := c.gm.GpuDataMap[id] + if bdf != "" && !c.isGPUActive(bdf) { slog.Info("NVML: GPU is suspended, skipping", "bdf", bdf) + gpu.Temperature = 0 + gpu.MemoryUsed = 0 continue } - slog.Info("NVML: Collecting data for GPU", "bdf", bdf) - - id := fmt.Sprintf("%d", i) // Utilization var utilization nvmlUtilization if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) { + slog.Info("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret) + gpu.Temperature = 0 + gpu.MemoryUsed = 0 continue } + slog.Info("NVML: Collecting data for GPU", "bdf", bdf) + // Temperature var temp uint32 nvmlDeviceGetTemperature(device, 0, &temp) // 0 is NVML_TEMPERATURE_GPU @@ -181,16 +199,6 @@ func (c *nvmlCollector) collect() { var power uint32 nvmlDeviceGetPowerUsage(device, &power) - // Update GPUDataMap - if _, ok := c.gm.GpuDataMap[id]; !ok { - var nameBuf [64]byte - nvmlDeviceGetName(device, &nameBuf[0], 64) - name := string(nameBuf[:strings.Index(string(nameBuf[:]), "\x00")]) - name = strings.TrimPrefix(name, "NVIDIA ") - c.gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} - } - - gpu := c.gm.GpuDataMap[id] gpu.Temperature = float64(temp) gpu.MemoryUsed = float64(usedMem) / 1024 / 1024 / mebibytesInAMegabyte gpu.MemoryTotal = float64(totalMem) / 1024 / 1024 / mebibytesInAMegabyte