intel_gpu_top testing

This commit is contained in:
henrygd
2025-09-16 16:37:44 -04:00
parent 240e75f025
commit 6a406e5206
6 changed files with 132 additions and 28 deletions

View File

@@ -27,13 +27,10 @@ const (
nvidiaSmiInterval string = "4" // in seconds
tegraStatsInterval string = "3700" // in milliseconds
rocmSmiInterval time.Duration = 4300 * time.Millisecond
// Command retry and timeout constants
retryWaitTime time.Duration = 5 * time.Second
maxFailureRetries int = 5
cmdBufferSize uint16 = 10 * 1024
// Unit Conversions
mebibytesInAMegabyte float64 = 1.024 // nvidia-smi reports memory in MiB
milliwattsInAWatt float64 = 1000.0 // tegrastats reports power in mW
@@ -42,10 +39,11 @@ const (
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
type GPUManager struct {
sync.Mutex
nvidiaSmi bool
rocmSmi bool
tegrastats bool
GpuDataMap map[string]*system.GPUData
nvidiaSmi bool
rocmSmi bool
tegrastats bool
intelGpuStats bool
GpuDataMap map[string]*system.GPUData
}
// RocmSmiJson represents the JSON structure of rocm-smi output
@@ -66,6 +64,7 @@ type gpuCollector struct {
cmdArgs []string
parse func([]byte) bool // returns true if valid data was found
buf []byte
bufSize uint16
}
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
@@ -99,7 +98,7 @@ func (c *gpuCollector) collect() error {
scanner := bufio.NewScanner(stdout)
if c.buf == nil {
c.buf = make([]byte, 0, cmdBufferSize)
c.buf = make([]byte, 0, c.bufSize)
}
scanner.Buffer(c.buf, bufio.MaxScanTokenSize)
@@ -244,20 +243,24 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
// copy / reset the data
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
for id, gpu := range gm.GpuDataMap {
gpuAvg := *gpu
// avoid division by zero
count := max(gpu.Count, 1)
// average the data
gpuAvg := *gpu
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
// avoid division by zero
if gpu.Count > 0 {
gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count)
gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count)
gpuAvg.Usage = twoDecimals(gpu.Usage / count)
gpuAvg.Power = twoDecimals(gpu.Power / count)
gpuAvg.Engines = make(map[string]float64, len(gpu.Engines))
for name, engine := range gpu.Engines {
gpuAvg.Engines[name] = twoDecimals(engine / count)
}
// reset accumulators in the original
gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0
// reset accumulators in the original gpu data for next collection
gpu.Usage, gpu.Power, gpu.Count = gpuAvg.Usage, gpuAvg.Power, 1
gpu.Engines = gpuAvg.Engines
// append id to the name if there are multiple GPUs with the same name
if nameCounts[gpu.Name] > 1 {
@@ -284,18 +287,28 @@ func (gm *GPUManager) detectGPUs() error {
gm.tegrastats = true
gm.nvidiaSmi = false
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
slog.Info("Intel GPU stats found")
gm.intelGpuStats = true
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats {
return nil
}
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top")
}
// startCollector starts the appropriate GPU data collector based on the command
func (gm *GPUManager) startCollector(command string) {
collector := gpuCollector{
name: command,
name: command,
bufSize: 10 * 1024,
}
switch command {
case intelGpuStatsCmd:
slog.Info("Starting Intel GPU stats collector")
collector.cmdArgs = []string{"-s", intelGpuStatsInterval, "-J"}
collector.parse = gm.parseIntelData
go collector.start()
case nvidiaSmiCmd:
collector.cmdArgs = []string{
"-l", nvidiaSmiInterval,
@@ -344,6 +357,9 @@ func NewGPUManager() (*GPUManager, error) {
if gm.tegrastats {
gm.startCollector(tegraStatsCmd)
}
if gm.intelGpuStats {
gm.startCollector(intelGpuStatsCmd)
}
return &gm, nil
}

53
agent/gpu_intel.go Normal file
View File

@@ -0,0 +1,53 @@
package agent
import (
"encoding/json"
"log/slog"
"github.com/henrygd/beszel/internal/entities/system"
)
const (
intelGpuStatsCmd string = "intel_gpu_top"
intelGpuStatsInterval string = "3800" // in milliseconds
)
type intelGpuStats struct {
Power struct {
GPU float64 `json:"gpu"`
} `json:"power"`
Engines map[string]struct {
Busy float64 `json:"busy"`
} `json:"engines"`
}
func (gm *GPUManager) parseIntelData(output []byte) bool {
slog.Info("Parsing Intel GPU stats")
var intelGpuStats intelGpuStats
if err := json.Unmarshal(output, &intelGpuStats); err != nil {
slog.Error("Error parsing Intel GPU stats", "err", err)
return false
}
gm.Lock()
defer gm.Unlock()
// only one gpu for now - cmd doesn't provide all by default
gpuData, ok := gm.GpuDataMap["0"]
if !ok {
gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64, len(intelGpuStats.Engines))}
gm.GpuDataMap["0"] = gpuData
}
if intelGpuStats.Power.GPU > 0 {
gpuData.Power += intelGpuStats.Power.GPU
}
for name, engine := range intelGpuStats.Engines {
gpuData.Engines[name] += engine.Busy
}
gpuData.Count++
slog.Info("GPU Data", "gpuData", gpuData)
return true
}