mirror of
https://github.com/henrygd/beszel.git
synced 2026-04-08 05:51:50 +02:00
Compare commits
4 Commits
0526c88ce0
...
755-xpu-sm
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
14f7480915 | ||
|
|
aab5725d82 | ||
|
|
e94a1cd421 | ||
|
|
73c1a1b208 |
@@ -4,6 +4,7 @@ import (
|
||||
"beszel/internal/entities/system"
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/csv"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
@@ -21,11 +22,13 @@ const (
|
||||
nvidiaSmiCmd = "nvidia-smi"
|
||||
rocmSmiCmd = "rocm-smi"
|
||||
tegraStatsCmd = "tegrastats"
|
||||
xpuSmiCmd = "xpu-smi"
|
||||
|
||||
// Polling intervals
|
||||
nvidiaSmiInterval = "4" // in seconds
|
||||
tegraStatsInterval = "3700" // in milliseconds
|
||||
rocmSmiInterval = 4300 * time.Millisecond
|
||||
xpuSmiInterval = 4
|
||||
|
||||
// Command retry and timeout constants
|
||||
retryWaitTime = 5 * time.Second
|
||||
@@ -41,10 +44,11 @@ const (
|
||||
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
||||
type GPUManager struct {
|
||||
sync.Mutex
|
||||
nvidiaSmi bool
|
||||
rocmSmi bool
|
||||
tegrastats bool
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
nvidiaSmi bool
|
||||
rocmSmi bool
|
||||
tegrastats bool
|
||||
intelXpuSmi bool
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
}
|
||||
|
||||
// RocmSmiJson represents the JSON structure of rocm-smi output
|
||||
@@ -160,6 +164,59 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func (gm *GPUManager) parseIntelData(output []byte) bool {
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
reader := csv.NewReader(bytes.NewReader(output))
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
slog.Warn("Failed to parse Intel GPU data", "err", err)
|
||||
return false
|
||||
}
|
||||
|
||||
header := []string{"Timestamp", "DeviceId", "GPU Power (W)", "GPU Frequency (MHz)", "GPU Memory Utilization (%)", "GPU Memory Used (MiB)"}
|
||||
gpuData := &system.GPUData{Name: "GPU"}
|
||||
gm.GpuDataMap["0"] = gpuData
|
||||
|
||||
for _, record := range records {
|
||||
if strings.Join(record, ",") == strings.Join(header, ",") {
|
||||
slog.Debug("Skipping header", "header", record)
|
||||
continue
|
||||
}
|
||||
var memoryUtilization *float64
|
||||
var memoryUsed *float64
|
||||
for i, field := range header {
|
||||
if field == "Timestamp" {
|
||||
continue
|
||||
}
|
||||
stripped := strings.TrimSpace(record[i])
|
||||
value, err := strconv.ParseFloat(stripped, 64)
|
||||
if err != nil {
|
||||
slog.Warn("Failed to parse field", "field", field, "value", stripped, "err", err)
|
||||
continue
|
||||
}
|
||||
|
||||
switch field {
|
||||
case "GPU Power (W)":
|
||||
gpuData.Power += value
|
||||
case "GPU Frequency (MHz)":
|
||||
gpuData.Usage += value
|
||||
case "GPU Memory Utilization (%)":
|
||||
memoryUtilization = &value
|
||||
case "GPU Memory Used (MiB)":
|
||||
memoryUsed = &value
|
||||
}
|
||||
}
|
||||
if memoryUtilization != nil && memoryUsed != nil {
|
||||
gpuData.MemoryUsed = *memoryUsed
|
||||
gpuData.MemoryTotal = (*memoryUsed / *memoryUtilization) * 100 // convert to total memory
|
||||
}
|
||||
}
|
||||
gpuData.Count++
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
|
||||
func (gm *GPUManager) parseNvidiaData(output []byte) bool {
|
||||
gm.Lock()
|
||||
@@ -278,10 +335,14 @@ func (gm *GPUManager) detectGPUs() error {
|
||||
gm.tegrastats = true
|
||||
gm.nvidiaSmi = false
|
||||
}
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
|
||||
fmt.Println("Looking for gpus")
|
||||
if _, err := exec.LookPath(xpuSmiCmd); err == nil {
|
||||
gm.intelXpuSmi = true
|
||||
}
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelXpuSmi {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
|
||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, intel_gpu_top, or tegrastats")
|
||||
}
|
||||
|
||||
// startCollector starts the appropriate GPU data collector based on the command
|
||||
@@ -318,6 +379,10 @@ func (gm *GPUManager) startCollector(command string) {
|
||||
time.Sleep(rocmSmiInterval)
|
||||
}
|
||||
}()
|
||||
case xpuSmiCmd:
|
||||
collector.cmdArgs = []string{"dump", "-d", "-1", "-m", "1,2,5,18", "-i", strconv.Itoa(xpuSmiInterval)}
|
||||
collector.parse = gm.parseIntelData
|
||||
go collector.start()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -338,6 +403,9 @@ func NewGPUManager() (*GPUManager, error) {
|
||||
if gm.tegrastats {
|
||||
gm.startCollector(tegraStatsCmd)
|
||||
}
|
||||
if gm.intelXpuSmi {
|
||||
gm.startCollector(xpuSmiCmd)
|
||||
}
|
||||
|
||||
return &gm, nil
|
||||
}
|
||||
|
||||
@@ -13,26 +13,31 @@ import (
|
||||
)
|
||||
|
||||
type SensorConfig struct {
|
||||
context context.Context
|
||||
sensors map[string]struct{}
|
||||
primarySensor string
|
||||
isBlacklist bool
|
||||
hasWildcards bool
|
||||
context context.Context
|
||||
sensors map[string]struct{}
|
||||
primarySensor string
|
||||
isBlacklist bool
|
||||
hasWildcards bool
|
||||
skipCollection bool
|
||||
}
|
||||
|
||||
func (a *Agent) newSensorConfig() *SensorConfig {
|
||||
primarySensor, _ := GetEnv("PRIMARY_SENSOR")
|
||||
sysSensors, _ := GetEnv("SYS_SENSORS")
|
||||
sensors, _ := GetEnv("SENSORS")
|
||||
sensorsEnvVal, sensorsSet := GetEnv("SENSORS")
|
||||
skipCollection := sensorsSet && sensorsEnvVal == ""
|
||||
|
||||
return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensors)
|
||||
return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal, skipCollection)
|
||||
}
|
||||
|
||||
// newSensorConfigWithEnv creates a SensorConfig with the provided environment variables
|
||||
func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensors string) *SensorConfig {
|
||||
// sensorsSet indicates if the SENSORS environment variable was explicitly set (even to empty string)
|
||||
func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal string, skipCollection bool) *SensorConfig {
|
||||
config := &SensorConfig{
|
||||
context: context.Background(),
|
||||
primarySensor: primarySensor,
|
||||
context: context.Background(),
|
||||
primarySensor: primarySensor,
|
||||
skipCollection: skipCollection,
|
||||
sensors: make(map[string]struct{}),
|
||||
}
|
||||
|
||||
// Set sensors context (allows overriding sys location for sensors)
|
||||
@@ -43,22 +48,18 @@ func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensors string
|
||||
)
|
||||
}
|
||||
|
||||
// Set sensors whitelist
|
||||
if sensors != "" {
|
||||
// handle blacklist
|
||||
if strings.HasPrefix(sensors, "-") {
|
||||
config.isBlacklist = true
|
||||
sensors = sensors[1:]
|
||||
}
|
||||
// handle blacklist
|
||||
if strings.HasPrefix(sensorsEnvVal, "-") {
|
||||
config.isBlacklist = true
|
||||
sensorsEnvVal = sensorsEnvVal[1:]
|
||||
}
|
||||
|
||||
config.sensors = make(map[string]struct{})
|
||||
for sensor := range strings.SplitSeq(sensors, ",") {
|
||||
sensor = strings.TrimSpace(sensor)
|
||||
if sensor != "" {
|
||||
config.sensors[sensor] = struct{}{}
|
||||
if strings.Contains(sensor, "*") {
|
||||
config.hasWildcards = true
|
||||
}
|
||||
for sensor := range strings.SplitSeq(sensorsEnvVal, ",") {
|
||||
sensor = strings.TrimSpace(sensor)
|
||||
if sensor != "" {
|
||||
config.sensors[sensor] = struct{}{}
|
||||
if strings.Contains(sensor, "*") {
|
||||
config.hasWildcards = true
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -69,7 +70,7 @@ func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensors string
|
||||
// updateTemperatures updates the agent with the latest sensor temperatures
|
||||
func (a *Agent) updateTemperatures(systemStats *system.Stats) {
|
||||
// skip if sensors whitelist is set to empty string
|
||||
if a.sensorConfig.sensors != nil && len(a.sensorConfig.sensors) == 0 {
|
||||
if a.sensorConfig.skipCollection {
|
||||
slog.Debug("Skipping temperature collection")
|
||||
return
|
||||
}
|
||||
@@ -113,8 +114,8 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
|
||||
|
||||
// isValidSensor checks if a sensor is valid based on the sensor name and the sensor config
|
||||
func isValidSensor(sensorName string, config *SensorConfig) bool {
|
||||
// If no sensors configuration, everything is valid
|
||||
if config.sensors == nil {
|
||||
// if no sensors configured, everything is valid
|
||||
if len(config.sensors) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -123,7 +124,7 @@ func isValidSensor(sensorName string, config *SensorConfig) bool {
|
||||
return !config.isBlacklist
|
||||
}
|
||||
|
||||
// If no wildcards, return false if blacklist, true if whitelist
|
||||
// If no wildcards, return true if blacklist, false if whitelist
|
||||
if !config.hasWildcards {
|
||||
return config.isBlacklist
|
||||
}
|
||||
|
||||
@@ -97,10 +97,13 @@ func TestIsValidSensor(t *testing.T) {
|
||||
expectedValid: true,
|
||||
},
|
||||
{
|
||||
name: "Nil sensor config",
|
||||
name: "No sensors configured",
|
||||
sensorName: "any_temp",
|
||||
config: &SensorConfig{
|
||||
sensors: nil,
|
||||
sensors: map[string]struct{}{},
|
||||
isBlacklist: false,
|
||||
hasWildcards: false,
|
||||
skipCollection: false,
|
||||
},
|
||||
expectedValid: true,
|
||||
},
|
||||
@@ -162,6 +165,7 @@ func TestNewSensorConfigWithEnv(t *testing.T) {
|
||||
primarySensor string
|
||||
sysSensors string
|
||||
sensors string
|
||||
skipCollection bool
|
||||
expectedConfig *SensorConfig
|
||||
}{
|
||||
{
|
||||
@@ -170,22 +174,38 @@ func TestNewSensorConfigWithEnv(t *testing.T) {
|
||||
sysSensors: "",
|
||||
sensors: "",
|
||||
expectedConfig: &SensorConfig{
|
||||
context: context.Background(),
|
||||
primarySensor: "",
|
||||
sensors: nil,
|
||||
isBlacklist: false,
|
||||
hasWildcards: false,
|
||||
context: context.Background(),
|
||||
primarySensor: "",
|
||||
sensors: map[string]struct{}{},
|
||||
isBlacklist: false,
|
||||
hasWildcards: false,
|
||||
skipCollection: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Primary sensor only",
|
||||
name: "Explicitly set to empty string",
|
||||
primarySensor: "",
|
||||
sysSensors: "",
|
||||
sensors: "",
|
||||
skipCollection: true,
|
||||
expectedConfig: &SensorConfig{
|
||||
context: context.Background(),
|
||||
primarySensor: "",
|
||||
sensors: map[string]struct{}{},
|
||||
isBlacklist: false,
|
||||
hasWildcards: false,
|
||||
skipCollection: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Primary sensor only - should create sensor map",
|
||||
primarySensor: "cpu_temp",
|
||||
sysSensors: "",
|
||||
sensors: "",
|
||||
expectedConfig: &SensorConfig{
|
||||
context: context.Background(),
|
||||
primarySensor: "cpu_temp",
|
||||
sensors: nil,
|
||||
sensors: map[string]struct{}{},
|
||||
isBlacklist: false,
|
||||
hasWildcards: false,
|
||||
},
|
||||
@@ -238,6 +258,22 @@ func TestNewSensorConfigWithEnv(t *testing.T) {
|
||||
hasWildcards: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Sensors with whitespace",
|
||||
primarySensor: "cpu_temp",
|
||||
sysSensors: "",
|
||||
sensors: "cpu_*, gpu_temp",
|
||||
expectedConfig: &SensorConfig{
|
||||
context: context.Background(),
|
||||
primarySensor: "cpu_temp",
|
||||
sensors: map[string]struct{}{
|
||||
"cpu_*": {},
|
||||
"gpu_temp": {},
|
||||
},
|
||||
isBlacklist: false,
|
||||
hasWildcards: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "With SYS_SENSORS path",
|
||||
primarySensor: "cpu_temp",
|
||||
@@ -256,7 +292,7 @@ func TestNewSensorConfigWithEnv(t *testing.T) {
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := agent.newSensorConfigWithEnv(tt.primarySensor, tt.sysSensors, tt.sensors)
|
||||
result := agent.newSensorConfigWithEnv(tt.primarySensor, tt.sysSensors, tt.sensors, tt.skipCollection)
|
||||
|
||||
// Check primary sensor
|
||||
assert.Equal(t, tt.expectedConfig.primarySensor, result.primarySensor)
|
||||
|
||||
@@ -199,16 +199,24 @@ func (a *Agent) getSystemStats() system.Stats {
|
||||
if systemStats.Temperatures == nil {
|
||||
systemStats.Temperatures = make(map[string]float64, len(gpuData))
|
||||
}
|
||||
highestTemp := 0.0
|
||||
for _, gpu := range gpuData {
|
||||
if gpu.Temperature > 0 {
|
||||
systemStats.Temperatures[gpu.Name] = gpu.Temperature
|
||||
if a.sensorConfig.primarySensor == gpu.Name {
|
||||
a.systemInfo.DashboardTemp = gpu.Temperature
|
||||
}
|
||||
if gpu.Temperature > highestTemp {
|
||||
highestTemp = gpu.Temperature
|
||||
}
|
||||
}
|
||||
// update high gpu percent for dashboard
|
||||
a.systemInfo.GpuPct = max(a.systemInfo.GpuPct, gpu.Usage)
|
||||
}
|
||||
// use highest temp for dashboard temp if dashboard temp is unset
|
||||
if a.systemInfo.DashboardTemp == 0 {
|
||||
a.systemInfo.DashboardTemp = highestTemp
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ fi
|
||||
mkdir -p ~/.config/beszel ~/.cache/beszel
|
||||
|
||||
echo "KEY=\"$KEY\"" >~/.config/beszel/beszel-agent.env
|
||||
echo "PORT=$PORT" >>~/.config/beszel/beszel-agent.env
|
||||
echo "LISTEN=$PORT" >>~/.config/beszel/beszel-agent.env
|
||||
|
||||
brew tap henrygd/beszel
|
||||
brew install beszel-agent
|
||||
|
||||
Reference in New Issue
Block a user