Compare commits

...

4 Commits

Author SHA1 Message Date
Yorick
14f7480915 support xpu-smi for intel stats (#755) 2025-11-14 10:58:28 -05:00
henrygd
aab5725d82 Use gpu temp as primary sensor if no other sensors 2025-04-18 18:00:39 -04:00
henrygd
e94a1cd421 brew install - change env var from PORT to LISTEN 2025-04-18 17:59:59 -04:00
henrygd
73c1a1b208 Refactor sensor configuration handling in tests and implementation
- Add skipCollection property
- Ensure that sensors are initialized as an empty map
2025-04-18 17:59:25 -04:00
5 changed files with 159 additions and 46 deletions

View File

@@ -4,6 +4,7 @@ import (
"beszel/internal/entities/system"
"bufio"
"bytes"
"encoding/csv"
"encoding/json"
"fmt"
"os/exec"
@@ -21,11 +22,13 @@ const (
nvidiaSmiCmd = "nvidia-smi"
rocmSmiCmd = "rocm-smi"
tegraStatsCmd = "tegrastats"
xpuSmiCmd = "xpu-smi"
// Polling intervals
nvidiaSmiInterval = "4" // in seconds
tegraStatsInterval = "3700" // in milliseconds
rocmSmiInterval = 4300 * time.Millisecond
xpuSmiInterval = 4
// Command retry and timeout constants
retryWaitTime = 5 * time.Second
@@ -41,10 +44,11 @@ const (
// GPUManager manages data collection for GPUs (Nvidia, AMD, or Intel)
type GPUManager struct {
sync.Mutex
nvidiaSmi bool
rocmSmi bool
tegrastats bool
GpuDataMap map[string]*system.GPUData
nvidiaSmi bool
rocmSmi bool
tegrastats bool
intelXpuSmi bool
GpuDataMap map[string]*system.GPUData
}
// RocmSmiJson represents the JSON structure of rocm-smi output
@@ -160,6 +164,59 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
}
}
// parseIntelData parses CSV output captured from the xpu-smi collector and
// stores the result in gm.GpuDataMap under key "0".
//
// Each data row is expected to carry the columns listed in header, in that
// order. The header row is skipped, rows with fewer columns than header are
// skipped with a warning, and individual fields that fail to parse as floats
// are logged and ignored. Power and Usage are summed over all rows of one
// call; MemoryUsed and MemoryTotal come from the last row that reported both
// memory columns. Count is incremented once per call.
//
// It returns false only when output is not valid CSV.
func (gm *GPUManager) parseIntelData(output []byte) bool {
	gm.Lock()
	defer gm.Unlock()

	reader := csv.NewReader(bytes.NewReader(output))
	records, err := reader.ReadAll()
	if err != nil {
		slog.Warn("Failed to parse Intel GPU data", "err", err)
		return false
	}

	header := []string{"Timestamp", "DeviceId", "GPU Power (W)", "GPU Frequency (MHz)", "GPU Memory Utilization (%)", "GPU Memory Used (MiB)"}
	headerLine := strings.Join(header, ",")

	// A fresh entry replaces whatever was stored for device "0" before.
	gpuData := &system.GPUData{Name: "GPU"}
	gm.GpuDataMap["0"] = gpuData

	for _, record := range records {
		// Trim once up front so header detection and value parsing agree on
		// how surrounding whitespace is treated.
		fields := make([]string, len(record))
		for i, raw := range record {
			fields[i] = strings.TrimSpace(raw)
		}

		if strings.Join(fields, ",") == headerLine {
			slog.Debug("Skipping header", "header", record)
			continue
		}

		// Columns are addressed by header position below; a short row would
		// otherwise index out of range and panic.
		if len(fields) < len(header) {
			slog.Warn("Skipping short Intel GPU record", "record", record, "expected", len(header))
			continue
		}

		var memoryUtilization, memoryUsed *float64
		for i, field := range header {
			if field == "Timestamp" {
				continue
			}
			value, err := strconv.ParseFloat(fields[i], 64)
			if err != nil {
				slog.Warn("Failed to parse field", "field", field, "value", fields[i], "err", err)
				continue
			}
			switch field {
			case "GPU Power (W)":
				gpuData.Power += value
			case "GPU Frequency (MHz)":
				// NOTE(review): the frequency column (MHz) is accumulated into
				// Usage — confirm this is the intended mapping.
				gpuData.Usage += value
			case "GPU Memory Utilization (%)":
				memoryUtilization = &value
			case "GPU Memory Used (MiB)":
				memoryUsed = &value
			}
		}

		if memoryUtilization != nil && memoryUsed != nil {
			gpuData.MemoryUsed = *memoryUsed
			// Total is derived as used / (utilization / 100). Skip it when
			// utilization is zero (or negative) to avoid writing +Inf or NaN.
			if *memoryUtilization > 0 {
				gpuData.MemoryTotal = (*memoryUsed / *memoryUtilization) * 100
			}
		}
	}

	gpuData.Count++
	return true
}
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
func (gm *GPUManager) parseNvidiaData(output []byte) bool {
gm.Lock()
@@ -278,10 +335,14 @@ func (gm *GPUManager) detectGPUs() error {
gm.tegrastats = true
gm.nvidiaSmi = false
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
fmt.Println("Looking for gpus")
if _, err := exec.LookPath(xpuSmiCmd); err == nil {
gm.intelXpuSmi = true
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelXpuSmi {
return nil
}
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, intel_gpu_top, or tegrastats")
}
// startCollector starts the appropriate GPU data collector based on the command
@@ -318,6 +379,10 @@ func (gm *GPUManager) startCollector(command string) {
time.Sleep(rocmSmiInterval)
}
}()
case xpuSmiCmd:
collector.cmdArgs = []string{"dump", "-d", "-1", "-m", "1,2,5,18", "-i", strconv.Itoa(xpuSmiInterval)}
collector.parse = gm.parseIntelData
go collector.start()
}
}
@@ -338,6 +403,9 @@ func NewGPUManager() (*GPUManager, error) {
if gm.tegrastats {
gm.startCollector(tegraStatsCmd)
}
if gm.intelXpuSmi {
gm.startCollector(xpuSmiCmd)
}
return &gm, nil
}

View File

@@ -13,26 +13,31 @@ import (
)
type SensorConfig struct {
context context.Context
sensors map[string]struct{}
primarySensor string
isBlacklist bool
hasWildcards bool
context context.Context
sensors map[string]struct{}
primarySensor string
isBlacklist bool
hasWildcards bool
skipCollection bool
}
func (a *Agent) newSensorConfig() *SensorConfig {
primarySensor, _ := GetEnv("PRIMARY_SENSOR")
sysSensors, _ := GetEnv("SYS_SENSORS")
sensors, _ := GetEnv("SENSORS")
sensorsEnvVal, sensorsSet := GetEnv("SENSORS")
skipCollection := sensorsSet && sensorsEnvVal == ""
return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensors)
return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal, skipCollection)
}
// newSensorConfigWithEnv creates a SensorConfig with the provided environment variables
func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensors string) *SensorConfig {
// skipCollection indicates that the SENSORS environment variable was explicitly set to an empty string, which disables temperature collection
func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal string, skipCollection bool) *SensorConfig {
config := &SensorConfig{
context: context.Background(),
primarySensor: primarySensor,
context: context.Background(),
primarySensor: primarySensor,
skipCollection: skipCollection,
sensors: make(map[string]struct{}),
}
// Set sensors context (allows overriding sys location for sensors)
@@ -43,22 +48,18 @@ func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensors string
)
}
// Set sensors whitelist
if sensors != "" {
// handle blacklist
if strings.HasPrefix(sensors, "-") {
config.isBlacklist = true
sensors = sensors[1:]
}
// handle blacklist
if strings.HasPrefix(sensorsEnvVal, "-") {
config.isBlacklist = true
sensorsEnvVal = sensorsEnvVal[1:]
}
config.sensors = make(map[string]struct{})
for sensor := range strings.SplitSeq(sensors, ",") {
sensor = strings.TrimSpace(sensor)
if sensor != "" {
config.sensors[sensor] = struct{}{}
if strings.Contains(sensor, "*") {
config.hasWildcards = true
}
for sensor := range strings.SplitSeq(sensorsEnvVal, ",") {
sensor = strings.TrimSpace(sensor)
if sensor != "" {
config.sensors[sensor] = struct{}{}
if strings.Contains(sensor, "*") {
config.hasWildcards = true
}
}
}
@@ -69,7 +70,7 @@ func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensors string
// updateTemperatures updates the agent with the latest sensor temperatures
func (a *Agent) updateTemperatures(systemStats *system.Stats) {
// skip if sensors whitelist is set to empty string
if a.sensorConfig.sensors != nil && len(a.sensorConfig.sensors) == 0 {
if a.sensorConfig.skipCollection {
slog.Debug("Skipping temperature collection")
return
}
@@ -113,8 +114,8 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
// isValidSensor checks if a sensor is valid based on the sensor name and the sensor config
func isValidSensor(sensorName string, config *SensorConfig) bool {
// If no sensors configuration, everything is valid
if config.sensors == nil {
// if no sensors configured, everything is valid
if len(config.sensors) == 0 {
return true
}
@@ -123,7 +124,7 @@ func isValidSensor(sensorName string, config *SensorConfig) bool {
return !config.isBlacklist
}
// If no wildcards, return false if blacklist, true if whitelist
// If no wildcards, return true if blacklist, false if whitelist
if !config.hasWildcards {
return config.isBlacklist
}

View File

@@ -97,10 +97,13 @@ func TestIsValidSensor(t *testing.T) {
expectedValid: true,
},
{
name: "Nil sensor config",
name: "No sensors configured",
sensorName: "any_temp",
config: &SensorConfig{
sensors: nil,
sensors: map[string]struct{}{},
isBlacklist: false,
hasWildcards: false,
skipCollection: false,
},
expectedValid: true,
},
@@ -162,6 +165,7 @@ func TestNewSensorConfigWithEnv(t *testing.T) {
primarySensor string
sysSensors string
sensors string
skipCollection bool
expectedConfig *SensorConfig
}{
{
@@ -170,22 +174,38 @@ func TestNewSensorConfigWithEnv(t *testing.T) {
sysSensors: "",
sensors: "",
expectedConfig: &SensorConfig{
context: context.Background(),
primarySensor: "",
sensors: nil,
isBlacklist: false,
hasWildcards: false,
context: context.Background(),
primarySensor: "",
sensors: map[string]struct{}{},
isBlacklist: false,
hasWildcards: false,
skipCollection: false,
},
},
{
name: "Primary sensor only",
name: "Explicitly set to empty string",
primarySensor: "",
sysSensors: "",
sensors: "",
skipCollection: true,
expectedConfig: &SensorConfig{
context: context.Background(),
primarySensor: "",
sensors: map[string]struct{}{},
isBlacklist: false,
hasWildcards: false,
skipCollection: true,
},
},
{
name: "Primary sensor only - should create sensor map",
primarySensor: "cpu_temp",
sysSensors: "",
sensors: "",
expectedConfig: &SensorConfig{
context: context.Background(),
primarySensor: "cpu_temp",
sensors: nil,
sensors: map[string]struct{}{},
isBlacklist: false,
hasWildcards: false,
},
@@ -238,6 +258,22 @@ func TestNewSensorConfigWithEnv(t *testing.T) {
hasWildcards: true,
},
},
{
name: "Sensors with whitespace",
primarySensor: "cpu_temp",
sysSensors: "",
sensors: "cpu_*, gpu_temp",
expectedConfig: &SensorConfig{
context: context.Background(),
primarySensor: "cpu_temp",
sensors: map[string]struct{}{
"cpu_*": {},
"gpu_temp": {},
},
isBlacklist: false,
hasWildcards: true,
},
},
{
name: "With SYS_SENSORS path",
primarySensor: "cpu_temp",
@@ -256,7 +292,7 @@ func TestNewSensorConfigWithEnv(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := agent.newSensorConfigWithEnv(tt.primarySensor, tt.sysSensors, tt.sensors)
result := agent.newSensorConfigWithEnv(tt.primarySensor, tt.sysSensors, tt.sensors, tt.skipCollection)
// Check primary sensor
assert.Equal(t, tt.expectedConfig.primarySensor, result.primarySensor)

View File

@@ -199,16 +199,24 @@ func (a *Agent) getSystemStats() system.Stats {
if systemStats.Temperatures == nil {
systemStats.Temperatures = make(map[string]float64, len(gpuData))
}
highestTemp := 0.0
for _, gpu := range gpuData {
if gpu.Temperature > 0 {
systemStats.Temperatures[gpu.Name] = gpu.Temperature
if a.sensorConfig.primarySensor == gpu.Name {
a.systemInfo.DashboardTemp = gpu.Temperature
}
if gpu.Temperature > highestTemp {
highestTemp = gpu.Temperature
}
}
// update high gpu percent for dashboard
a.systemInfo.GpuPct = max(a.systemInfo.GpuPct, gpu.Usage)
}
// use highest temp for dashboard temp if dashboard temp is unset
if a.systemInfo.DashboardTemp == 0 {
a.systemInfo.DashboardTemp = highestTemp
}
}
}

View File

@@ -67,7 +67,7 @@ fi
mkdir -p ~/.config/beszel ~/.cache/beszel
echo "KEY=\"$KEY\"" >~/.config/beszel/beszel-agent.env
echo "PORT=$PORT" >>~/.config/beszel/beszel-agent.env
echo "LISTEN=$PORT" >>~/.config/beszel/beszel-agent.env
brew tap henrygd/beszel
brew install beszel-agent