support xpu-smi for intel stats (#755)

This commit is contained in:
Yorick
2025-11-14 16:58:28 +01:00
committed by GitHub
parent aab5725d82
commit 14f7480915

View File

@@ -4,6 +4,7 @@ import (
"beszel/internal/entities/system" "beszel/internal/entities/system"
"bufio" "bufio"
"bytes" "bytes"
"encoding/csv"
"encoding/json" "encoding/json"
"fmt" "fmt"
"os/exec" "os/exec"
@@ -21,11 +22,13 @@ const (
nvidiaSmiCmd = "nvidia-smi" nvidiaSmiCmd = "nvidia-smi"
rocmSmiCmd = "rocm-smi" rocmSmiCmd = "rocm-smi"
tegraStatsCmd = "tegrastats" tegraStatsCmd = "tegrastats"
xpuSmiCmd = "xpu-smi"
// Polling intervals // Polling intervals
nvidiaSmiInterval = "4" // in seconds nvidiaSmiInterval = "4" // in seconds
tegraStatsInterval = "3700" // in milliseconds tegraStatsInterval = "3700" // in milliseconds
rocmSmiInterval = 4300 * time.Millisecond rocmSmiInterval = 4300 * time.Millisecond
xpuSmiInterval = 4
// Command retry and timeout constants // Command retry and timeout constants
retryWaitTime = 5 * time.Second retryWaitTime = 5 * time.Second
@@ -44,6 +47,7 @@ type GPUManager struct {
nvidiaSmi bool nvidiaSmi bool
rocmSmi bool rocmSmi bool
tegrastats bool tegrastats bool
intelXpuSmi bool
GpuDataMap map[string]*system.GPUData GpuDataMap map[string]*system.GPUData
} }
@@ -160,6 +164,59 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
} }
} }
// parseIntelData parses CSV output from `xpu-smi dump` and stores the result
// in gm.GpuDataMap under the key "0".
//
// Every call replaces the "0" entry with a fresh GPUData named "GPU". Power
// and frequency values of all data rows are summed into that entry; memory
// figures are taken from the last row that provides them. Rows matching the
// expected header are skipped, rows with too few columns are skipped with a
// warning, and individual fields that are not numeric (e.g. placeholders for
// unavailable metrics) are logged and ignored.
//
// It returns false only when output is not valid CSV, true otherwise.
func (gm *GPUManager) parseIntelData(output []byte) bool {
	gm.Lock()
	defer gm.Unlock()

	records, err := csv.NewReader(bytes.NewReader(output)).ReadAll()
	if err != nil {
		slog.Warn("Failed to parse Intel GPU data", "err", err)
		return false
	}

	// Column layout expected for the metrics requested from xpu-smi.
	header := []string{"Timestamp", "DeviceId", "GPU Power (W)", "GPU Frequency (MHz)", "GPU Memory Utilization (%)", "GPU Memory Used (MiB)"}
	headerLine := strings.Join(header, ",")

	gpuData := &system.GPUData{Name: "GPU"}
	gm.GpuDataMap["0"] = gpuData

	for _, record := range records {
		// Fields may be padded with spaces after the comma; normalize them
		// once so both the header comparison and number parsing see clean text.
		for i := range record {
			record[i] = strings.TrimSpace(record[i])
		}
		if strings.Join(record, ",") == headerLine {
			slog.Debug("Skipping header", "header", record)
			continue
		}
		// Guard against truncated rows; indexing them below would panic.
		if len(record) < len(header) {
			slog.Warn("Skipping short Intel GPU record", "fields", len(record), "expected", len(header))
			continue
		}

		var (
			memoryUtilization, memoryUsed       float64
			hasMemoryUtilization, hasMemoryUsed bool
		)
		for i, field := range header {
			switch field {
			case "Timestamp", "DeviceId":
				// Not numeric metrics; nothing to accumulate.
				continue
			}
			value, err := strconv.ParseFloat(record[i], 64)
			if err != nil {
				slog.Warn("Failed to parse field", "field", field, "value", record[i], "err", err)
				continue
			}
			switch field {
			case "GPU Power (W)":
				gpuData.Power += value
			case "GPU Frequency (MHz)":
				// NOTE(review): frequency in MHz is stored in Usage — confirm
				// this is intended rather than a utilization percentage.
				gpuData.Usage += value
			case "GPU Memory Utilization (%)":
				memoryUtilization, hasMemoryUtilization = value, true
			case "GPU Memory Used (MiB)":
				memoryUsed, hasMemoryUsed = value, true
			}
		}

		if hasMemoryUtilization && hasMemoryUsed {
			gpuData.MemoryUsed = memoryUsed
			// Derive total memory from used/utilization. Skip when utilization
			// is zero to avoid producing Inf or NaN.
			if memoryUtilization > 0 {
				gpuData.MemoryTotal = (memoryUsed / memoryUtilization) * 100
			}
		}
	}

	gpuData.Count++
	return true
}
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map // parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
func (gm *GPUManager) parseNvidiaData(output []byte) bool { func (gm *GPUManager) parseNvidiaData(output []byte) bool {
gm.Lock() gm.Lock()
@@ -278,10 +335,14 @@ func (gm *GPUManager) detectGPUs() error {
gm.tegrastats = true gm.tegrastats = true
gm.nvidiaSmi = false gm.nvidiaSmi = false
} }
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats { fmt.Println("Looking for gpus")
if _, err := exec.LookPath(xpuSmiCmd); err == nil {
gm.intelXpuSmi = true
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelXpuSmi {
return nil return nil
} }
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats") return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, intel_gpu_top, or tegrastats")
} }
// startCollector starts the appropriate GPU data collector based on the command // startCollector starts the appropriate GPU data collector based on the command
@@ -318,6 +379,10 @@ func (gm *GPUManager) startCollector(command string) {
time.Sleep(rocmSmiInterval) time.Sleep(rocmSmiInterval)
} }
}() }()
case xpuSmiCmd:
collector.cmdArgs = []string{"dump", "-d", "-1", "-m", "1,2,5,18", "-i", strconv.Itoa(xpuSmiInterval)}
collector.parse = gm.parseIntelData
go collector.start()
} }
} }
@@ -338,6 +403,9 @@ func NewGPUManager() (*GPUManager, error) {
if gm.tegrastats { if gm.tegrastats {
gm.startCollector(tegraStatsCmd) gm.startCollector(tegraStatsCmd)
} }
if gm.intelXpuSmi {
gm.startCollector(xpuSmiCmd)
}
return &gm, nil return &gm, nil
} }