mirror of
https://github.com/henrygd/beszel.git
synced 2025-12-17 10:46:16 +01:00
235 lines
6.7 KiB
Go
235 lines
6.7 KiB
Go
package agent
|
|
|
|
import (
|
|
"beszel/internal/entities/system"
|
|
"bufio"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os/exec"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/exp/slog"
|
|
)
|
|
|
|
type GPUManager struct {
|
|
nvidiaSmi bool
|
|
rocmSmi bool
|
|
GpuDataMap map[string]*system.GPUData
|
|
mutex sync.Mutex
|
|
}
|
|
|
|
// RocmSmiJson is the per-card object found in the JSON document printed by
// rocm-smi. Every value is decoded as a string; parseAmdData converts the
// numeric ones with strconv.ParseFloat.
type RocmSmiJson struct {
	ID          string `json:"Device ID"`                                 // used as the GpuDataMap key
	Name        string `json:"Card series"`                               // used as the GPU display name
	Temperature string `json:"Temperature (Sensor edge) (C)"`             // edge sensor temperature
	MemoryUsed  string `json:"VRAM Total Used Memory (B)"`                // used VRAM
	MemoryTotal string `json:"VRAM Total Memory (B)"`                     // total VRAM
	Usage       string `json:"GPU use (%)"`                               // GPU utilization
	Power       string `json:"Current Socket Graphics Package Power (W)"` // package power draw
}
|
|
|
|
// startNvidiaCollector oversees collectNvidiaStats and restarts nvidia-smi if it fails
|
|
func (gm *GPUManager) startNvidiaCollector() error {
|
|
for {
|
|
if err := gm.collectNvidiaStats(); err != nil {
|
|
slog.Warn("Restarting nvidia-smi", "err", err)
|
|
time.Sleep(time.Second) // Wait before retrying
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
|
|
// collectNvidiaStats runs nvidia-smi in a loop and passes the output to parseNvidiaData
|
|
func (gm *GPUManager) collectNvidiaStats() error {
|
|
// Set up the command
|
|
cmd := exec.Command("nvidia-smi", "-l", "4", "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", "--format=csv,noheader,nounits")
|
|
// Set up a pipe to capture stdout
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Start the command
|
|
if err := cmd.Start(); err != nil {
|
|
return err
|
|
}
|
|
// Use a scanner to read each line of output
|
|
scanner := bufio.NewScanner(stdout)
|
|
buf := make([]byte, 0, 64*1024) // 64KB buffer
|
|
scanner.Buffer(buf, bufio.MaxScanTokenSize)
|
|
for scanner.Scan() {
|
|
line := scanner.Bytes()
|
|
gm.parseNvidiaData(line) // Run your function on each new line
|
|
}
|
|
// Check for any errors encountered during scanning
|
|
if err := scanner.Err(); err != nil {
|
|
return err
|
|
}
|
|
// Wait for the command to complete
|
|
return cmd.Wait()
|
|
}
|
|
|
|
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
|
|
func (gm *GPUManager) parseNvidiaData(output []byte) {
|
|
gm.mutex.Lock()
|
|
defer gm.mutex.Unlock()
|
|
lines := strings.Split(string(output), "\n")
|
|
for _, line := range lines {
|
|
if line != "" {
|
|
fields := strings.Split(line, ", ")
|
|
if len(fields) >= 7 {
|
|
id := fields[0]
|
|
temp, _ := strconv.ParseFloat(fields[2], 64)
|
|
memoryUsage, _ := strconv.ParseFloat(fields[3], 64)
|
|
totalMemory, _ := strconv.ParseFloat(fields[4], 64)
|
|
usage, _ := strconv.ParseFloat(fields[5], 64)
|
|
power, _ := strconv.ParseFloat(fields[6], 64)
|
|
// add gpu if not exists
|
|
if _, ok := gm.GpuDataMap[id]; !ok {
|
|
name := strings.TrimPrefix(fields[1], "NVIDIA ")
|
|
gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
|
|
}
|
|
// update gpu data
|
|
gpu := gm.GpuDataMap[id]
|
|
gpu.Temperature += temp
|
|
gpu.MemoryUsed += memoryUsage / 1.024
|
|
gpu.MemoryTotal += totalMemory / 1.024
|
|
gpu.Usage += usage
|
|
gpu.Power += power
|
|
gpu.Count++
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// startAmdCollector oversees collectAmdStats and restarts rocm-smi if it fails
|
|
func (gm *GPUManager) startAmdCollector() {
|
|
for {
|
|
if err := gm.collectAmdStats(); err != nil {
|
|
slog.Warn("Restarting rocm-smi", "err", err)
|
|
time.Sleep(time.Second) // Wait before retrying
|
|
continue
|
|
} else {
|
|
// break if no error (command runs but no card found)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// collectAmdStats runs rocm-smi in a loop and passes the output to parseAmdData
|
|
func (gm *GPUManager) collectAmdStats() error {
|
|
cmd := exec.Command("/bin/sh", "-c", "while true; do rocm-smi --showid --showtemp --showuse --showpower --showproductname --showmeminfo vram --json; sleep 4.7; done")
|
|
// Set up a pipe to capture stdout
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Start the command
|
|
if err := cmd.Start(); err != nil {
|
|
return err
|
|
}
|
|
// Use a scanner to read each line of output
|
|
scanner := bufio.NewScanner(stdout)
|
|
buf := make([]byte, 0, 64*1024) // 64KB buffer
|
|
scanner.Buffer(buf, bufio.MaxScanTokenSize)
|
|
for scanner.Scan() {
|
|
var rocmSmiInfo map[string]RocmSmiJson
|
|
if err := json.Unmarshal(scanner.Bytes(), &rocmSmiInfo); err != nil {
|
|
return err
|
|
}
|
|
if len(rocmSmiInfo) > 0 {
|
|
// slog.Info("rocm-smi", "data", rocmSmiInfo)
|
|
gm.parseAmdData(&rocmSmiInfo)
|
|
} else {
|
|
slog.Warn("rocm-smi returned no GPU")
|
|
return nil
|
|
}
|
|
}
|
|
if err := scanner.Err(); err != nil {
|
|
return err
|
|
}
|
|
return cmd.Wait()
|
|
}
|
|
|
|
// parseAmdData parses the output of rocm-smi and updates the GPUData map
|
|
func (gm *GPUManager) parseAmdData(rocmSmiInfo *map[string]RocmSmiJson) {
|
|
for _, v := range *rocmSmiInfo {
|
|
temp, _ := strconv.ParseFloat(v.Temperature, 64)
|
|
memoryUsage, _ := strconv.ParseFloat(v.MemoryUsed, 64)
|
|
totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64)
|
|
usage, _ := strconv.ParseFloat(v.Usage, 64)
|
|
power, _ := strconv.ParseFloat(v.Power, 64)
|
|
memoryUsage = bytesToMegabytes(memoryUsage)
|
|
totalMemory = bytesToMegabytes(totalMemory)
|
|
|
|
if _, ok := gm.GpuDataMap[v.ID]; !ok {
|
|
gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name}
|
|
}
|
|
gpu := gm.GpuDataMap[v.ID]
|
|
gpu.Temperature += temp
|
|
gpu.MemoryUsed += memoryUsage
|
|
gpu.MemoryTotal += totalMemory
|
|
gpu.Usage += usage
|
|
gpu.Power += power
|
|
gpu.Count++
|
|
}
|
|
}
|
|
|
|
// sums and resets the current GPU utilization data since the last update
|
|
func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
|
|
gm.mutex.Lock()
|
|
defer gm.mutex.Unlock()
|
|
// copy / reset the data
|
|
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
|
|
for id, gpu := range gm.GpuDataMap {
|
|
// sum the data
|
|
gpu.Temperature = twoDecimals(gpu.Temperature / gpu.Count)
|
|
gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed / gpu.Count)
|
|
gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal / gpu.Count)
|
|
gpu.Usage = twoDecimals(gpu.Usage / gpu.Count)
|
|
gpu.Power = twoDecimals(gpu.Power / gpu.Count)
|
|
gpuData[id] = *gpu
|
|
// reset the data
|
|
gpu.Temperature = 0
|
|
gpu.MemoryUsed = 0
|
|
gpu.MemoryTotal = 0
|
|
gpu.Usage = 0
|
|
gpu.Power = 0
|
|
gpu.Count = 0
|
|
}
|
|
return gpuData
|
|
}
|
|
|
|
// detectGPU returns the GPU brand (nvidia or amd) or an error if none is found
|
|
// todo: make sure there's actually a GPU, not just if the command exists
|
|
func (gm *GPUManager) detectGPU() error {
|
|
if err := exec.Command("nvidia-smi").Run(); err == nil {
|
|
gm.nvidiaSmi = true
|
|
}
|
|
if err := exec.Command("rocm-smi").Run(); err == nil {
|
|
gm.rocmSmi = true
|
|
}
|
|
if gm.nvidiaSmi || gm.rocmSmi {
|
|
return nil
|
|
}
|
|
return fmt.Errorf("no GPU found - install nvidia-smi or rocm-smi")
|
|
}
|
|
|
|
// NewGPUManager returns a new GPUManager
|
|
func NewGPUManager() (*GPUManager, error) {
|
|
var gm GPUManager
|
|
if err := gm.detectGPU(); err != nil {
|
|
return nil, err
|
|
}
|
|
gm.GpuDataMap = make(map[string]*system.GPUData, 1)
|
|
if gm.nvidiaSmi {
|
|
go gm.startNvidiaCollector()
|
|
}
|
|
if gm.rocmSmi {
|
|
go gm.startAmdCollector()
|
|
}
|
|
return &gm, nil
|
|
}
|