mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-22 05:36:15 +01:00
Compare commits
1 Commits
v0.11.1
...
755-xpu-sm
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
14f7480915 |
@@ -4,6 +4,7 @@ import (
|
|||||||
"beszel/internal/entities/system"
|
"beszel/internal/entities/system"
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"encoding/csv"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -21,11 +22,13 @@ const (
|
|||||||
nvidiaSmiCmd = "nvidia-smi"
|
nvidiaSmiCmd = "nvidia-smi"
|
||||||
rocmSmiCmd = "rocm-smi"
|
rocmSmiCmd = "rocm-smi"
|
||||||
tegraStatsCmd = "tegrastats"
|
tegraStatsCmd = "tegrastats"
|
||||||
|
xpuSmiCmd = "xpu-smi"
|
||||||
|
|
||||||
// Polling intervals
|
// Polling intervals
|
||||||
nvidiaSmiInterval = "4" // in seconds
|
nvidiaSmiInterval = "4" // in seconds
|
||||||
tegraStatsInterval = "3700" // in milliseconds
|
tegraStatsInterval = "3700" // in milliseconds
|
||||||
rocmSmiInterval = 4300 * time.Millisecond
|
rocmSmiInterval = 4300 * time.Millisecond
|
||||||
|
xpuSmiInterval = 4
|
||||||
|
|
||||||
// Command retry and timeout constants
|
// Command retry and timeout constants
|
||||||
retryWaitTime = 5 * time.Second
|
retryWaitTime = 5 * time.Second
|
||||||
@@ -41,10 +44,11 @@ const (
|
|||||||
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
// GPUManager manages data collection for GPUs (Nvidia, AMD, Nvidia Jetson, or Intel)
|
||||||
type GPUManager struct {
|
type GPUManager struct {
|
||||||
sync.Mutex
|
sync.Mutex
|
||||||
nvidiaSmi bool
|
nvidiaSmi bool
|
||||||
rocmSmi bool
|
rocmSmi bool
|
||||||
tegrastats bool
|
tegrastats bool
|
||||||
GpuDataMap map[string]*system.GPUData
|
intelXpuSmi bool
|
||||||
|
GpuDataMap map[string]*system.GPUData
|
||||||
}
|
}
|
||||||
|
|
||||||
// RocmSmiJson represents the JSON structure of rocm-smi output
|
// RocmSmiJson represents the JSON structure of rocm-smi output
|
||||||
@@ -160,6 +164,59 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (gm *GPUManager) parseIntelData(output []byte) bool {
|
||||||
|
gm.Lock()
|
||||||
|
defer gm.Unlock()
|
||||||
|
reader := csv.NewReader(bytes.NewReader(output))
|
||||||
|
records, err := reader.ReadAll()
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("Failed to parse Intel GPU data", "err", err)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
header := []string{"Timestamp", "DeviceId", "GPU Power (W)", "GPU Frequency (MHz)", "GPU Memory Utilization (%)", "GPU Memory Used (MiB)"}
|
||||||
|
gpuData := &system.GPUData{Name: "GPU"}
|
||||||
|
gm.GpuDataMap["0"] = gpuData
|
||||||
|
|
||||||
|
for _, record := range records {
|
||||||
|
if strings.Join(record, ",") == strings.Join(header, ",") {
|
||||||
|
slog.Debug("Skipping header", "header", record)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var memoryUtilization *float64
|
||||||
|
var memoryUsed *float64
|
||||||
|
for i, field := range header {
|
||||||
|
if field == "Timestamp" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
stripped := strings.TrimSpace(record[i])
|
||||||
|
value, err := strconv.ParseFloat(stripped, 64)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("Failed to parse field", "field", field, "value", stripped, "err", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
switch field {
|
||||||
|
case "GPU Power (W)":
|
||||||
|
gpuData.Power += value
|
||||||
|
case "GPU Frequency (MHz)":
|
||||||
|
gpuData.Usage += value
|
||||||
|
case "GPU Memory Utilization (%)":
|
||||||
|
memoryUtilization = &value
|
||||||
|
case "GPU Memory Used (MiB)":
|
||||||
|
memoryUsed = &value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if memoryUtilization != nil && memoryUsed != nil {
|
||||||
|
gpuData.MemoryUsed = *memoryUsed
|
||||||
|
gpuData.MemoryTotal = (*memoryUsed / *memoryUtilization) * 100 // convert to total memory
|
||||||
|
}
|
||||||
|
}
|
||||||
|
gpuData.Count++
|
||||||
|
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
|
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
|
||||||
func (gm *GPUManager) parseNvidiaData(output []byte) bool {
|
func (gm *GPUManager) parseNvidiaData(output []byte) bool {
|
||||||
gm.Lock()
|
gm.Lock()
|
||||||
@@ -278,10 +335,14 @@ func (gm *GPUManager) detectGPUs() error {
|
|||||||
gm.tegrastats = true
|
gm.tegrastats = true
|
||||||
gm.nvidiaSmi = false
|
gm.nvidiaSmi = false
|
||||||
}
|
}
|
||||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
|
fmt.Println("Looking for gpus")
|
||||||
|
if _, err := exec.LookPath(xpuSmiCmd); err == nil {
|
||||||
|
gm.intelXpuSmi = true
|
||||||
|
}
|
||||||
|
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelXpuSmi {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
|
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, intel_gpu_top, or tegrastats")
|
||||||
}
|
}
|
||||||
|
|
||||||
// startCollector starts the appropriate GPU data collector based on the command
|
// startCollector starts the appropriate GPU data collector based on the command
|
||||||
@@ -318,6 +379,10 @@ func (gm *GPUManager) startCollector(command string) {
|
|||||||
time.Sleep(rocmSmiInterval)
|
time.Sleep(rocmSmiInterval)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
case xpuSmiCmd:
|
||||||
|
collector.cmdArgs = []string{"dump", "-d", "-1", "-m", "1,2,5,18", "-i", strconv.Itoa(xpuSmiInterval)}
|
||||||
|
collector.parse = gm.parseIntelData
|
||||||
|
go collector.start()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -338,6 +403,9 @@ func NewGPUManager() (*GPUManager, error) {
|
|||||||
if gm.tegrastats {
|
if gm.tegrastats {
|
||||||
gm.startCollector(tegraStatsCmd)
|
gm.startCollector(tegraStatsCmd)
|
||||||
}
|
}
|
||||||
|
if gm.intelXpuSmi {
|
||||||
|
gm.startCollector(xpuSmiCmd)
|
||||||
|
}
|
||||||
|
|
||||||
return &gm, nil
|
return &gm, nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user