Compare commits

...

5 Commits

Author SHA1 Message Date
henrygd
4395520a28 Probable fix for Jetson gpu issue (#895) 2025-06-26 22:11:48 -04:00
Alexander Mnich
8c52f30a71 add GITHUB_TOKEN fallback for goreleaser (#925)
adding the fallback to the GITHUB_TOKEN allows execution of goreleaser in a fork without additional configuration
2025-06-26 21:03:19 -04:00
SSU
46316ebffa fix(install): suppress scoop output to avoid nssm path pollution (#918)
Suppressed the output of “scoop install beszel-agent” to ensure the NSSM service path
contains only the executable location.

Closes #915

Co-authored-by: suseol <suseol@geosr.com>
2025-06-25 13:52:45 -04:00
henrygd
0b04f60b6c Add panic recovery for sensors.TemperaturesWithContext (#796) 2025-06-23 19:50:11 -04:00
HansAndreManfredson
20b822d072 Fix missing groups #892 (#893) 2025-06-17 16:08:32 -04:00
7 changed files with 250 additions and 56 deletions

View File

@@ -39,4 +39,4 @@ jobs:
version: latest
args: release --clean
env:
GITHUB_TOKEN: ${{ secrets.TOKEN }}
GITHUB_TOKEN: ${{ secrets.TOKEN || secrets.GITHUB_TOKEN }}

View File

@@ -243,21 +243,26 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
// copy / reset the data
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
for id, gpu := range gm.GpuDataMap {
// sum the data
gpu.Temperature = twoDecimals(gpu.Temperature)
gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal)
gpu.Usage = twoDecimals(gpu.Usage / gpu.Count)
gpu.Power = twoDecimals(gpu.Power / gpu.Count)
// reset the count
gpu.Count = 1
// dereference to avoid overwriting anything else
gpuCopy := *gpu
var gpuAvg system.GPUData
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
// avoid division by zero
if gpu.Count > 0 {
gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count)
gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count)
}
// reset accumulators in the original
gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0
// append id to the name if there are multiple GPUs with the same name
if nameCounts[gpu.Name] > 1 {
gpuCopy.Name = fmt.Sprintf("%s %s", gpu.Name, id)
gpuAvg.Name = fmt.Sprintf("%s %s", gpu.Name, id)
}
gpuData[id] = gpuCopy
gpuData[id] = gpuAvg
}
slog.Debug("GPU", "data", gpuData)
return gpuData

View File

@@ -279,6 +279,19 @@ func TestParseJetsonData(t *testing.T) {
Count: 1,
},
},
{
name: "orin nano",
input: "06-18-2025 11:25:24 RAM 3452/7620MB (lfb 25x4MB) SWAP 1518/16384MB (cached 174MB) CPU [1%@1420,2%@1420,0%@1420,2%@1420,2%@729,1%@729] GR3D_FREQ 0% cpu@50.031C soc2@49.031C soc0@50C gpu@49.031C tj@50.25C soc1@50.25C VDD_IN 4824mW/4824mW VDD_CPU_GPU_CV 518mW/518mW VDD_SOC 1475mW/1475mW",
wantMetrics: &system.GPUData{
Name: "GPU",
MemoryUsed: 3452.0,
MemoryTotal: 7620.0,
Usage: 0.0,
Temperature: 50.25,
Power: 0.518,
Count: 1,
},
},
{
name: "missing temperature",
input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW",
@@ -318,44 +331,75 @@ func TestParseJetsonData(t *testing.T) {
}
func TestGetCurrentData(t *testing.T) {
gm := &GPUManager{
GpuDataMap: map[string]*system.GPUData{
"0": {
Name: "GPU1",
Temperature: 50,
MemoryUsed: 2048,
MemoryTotal: 4096,
Usage: 100, // 100 over 2 counts = 50 avg
Power: 200, // 200 over 2 counts = 100 avg
Count: 2,
t.Run("calculates averages and resets accumulators", func(t *testing.T) {
gm := &GPUManager{
GpuDataMap: map[string]*system.GPUData{
"0": {
Name: "GPU1",
Temperature: 50,
MemoryUsed: 2048,
MemoryTotal: 4096,
Usage: 100, // 100 over 2 counts = 50 avg
Power: 200, // 200 over 2 counts = 100 avg
Count: 2,
},
"1": {
Name: "GPU1",
Temperature: 60,
MemoryUsed: 3072,
MemoryTotal: 8192,
Usage: 30,
Power: 60,
Count: 1,
},
},
"1": {
Name: "GPU1",
Temperature: 60,
MemoryUsed: 3072,
MemoryTotal: 8192,
Usage: 30,
Power: 60,
Count: 1,
}
result := gm.GetCurrentData()
// Verify name disambiguation
assert.Equal(t, "GPU1 0", result["0"].Name)
assert.Equal(t, "GPU1 1", result["1"].Name)
// Check averaged values in the result
assert.InDelta(t, 50.0, result["0"].Usage, 0.01)
assert.InDelta(t, 100.0, result["0"].Power, 0.01)
assert.InDelta(t, 30.0, result["1"].Usage, 0.01)
assert.InDelta(t, 60.0, result["1"].Power, 0.01)
// Verify that accumulators in the original map are reset
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset")
})
t.Run("handles zero count without panicking", func(t *testing.T) {
gm := &GPUManager{
GpuDataMap: map[string]*system.GPUData{
"0": {
Name: "TestGPU",
Count: 0,
Usage: 0,
Power: 0,
},
},
},
}
}
result := gm.GetCurrentData()
var result map[string]system.GPUData
assert.NotPanics(t, func() {
result = gm.GetCurrentData()
})
// Verify name disambiguation
assert.Equal(t, "GPU1 0", result["0"].Name)
assert.Equal(t, "GPU1 1", result["1"].Name)
// Check that usage and power are 0
assert.Equal(t, 0.0, result["0"].Usage)
assert.Equal(t, 0.0, result["0"].Power)
// Check averaged values
assert.InDelta(t, 50.0, result["0"].Usage, 0.01)
assert.InDelta(t, 100.0, result["0"].Power, 0.01)
assert.InDelta(t, 30.0, result["1"].Usage, 0.01)
assert.InDelta(t, 60.0, result["1"].Power, 0.01)
// Verify reset counts
assert.Equal(t, float64(1), gm.GpuDataMap["0"].Count)
assert.Equal(t, float64(1), gm.GpuDataMap["1"].Count)
// Verify reset count
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count)
})
}
func TestDetectGPUs(t *testing.T) {
@@ -722,6 +766,18 @@ func TestAccumulation(t *testing.T) {
assert.InDelta(t, expected.avgUsage, gpu.Usage, 0.01, "Average usage in GetCurrentData should match")
assert.InDelta(t, expected.avgPower, gpu.Power, 0.01, "Average power in GetCurrentData should match")
}
// Verify that accumulators in the original map are reset
for id := range tt.expectedValues {
gpu, exists := gm.GpuDataMap[id]
assert.True(t, exists, "GPU with ID %s should still exist after GetCurrentData", id)
if !exists {
continue
}
assert.Equal(t, float64(0), gpu.Count, "Count should be reset for GPU ID %s", id)
assert.Equal(t, float64(0), gpu.Usage, "Usage should be reset for GPU ID %s", id)
assert.Equal(t, float64(0), gpu.Power, "Power should be reset for GPU ID %s", id)
}
})
}
}

View File

@@ -3,6 +3,7 @@ package agent
import (
"beszel/internal/entities/system"
"context"
"fmt"
"log/slog"
"path"
"strconv"
@@ -30,6 +31,9 @@ func (a *Agent) newSensorConfig() *SensorConfig {
return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal, skipCollection)
}
// Matches sensors.TemperaturesWithContext to allow for panic recovery (gopsutil/issues/1832)
type getTempsFn func(ctx context.Context) ([]sensors.TemperatureStat, error)
// newSensorConfigWithEnv creates a SensorConfig with the provided environment variables
// sensorsSet indicates if the SENSORS environment variable was explicitly set (even to empty string)
func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal string, skipCollection bool) *SensorConfig {
@@ -78,8 +82,18 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
// reset high temp
a.systemInfo.DashboardTemp = 0
// get sensor data
temps, _ := sensors.TemperaturesWithContext(a.sensorConfig.context)
temps, err := a.getTempsWithPanicRecovery(sensors.TemperaturesWithContext)
if err != nil {
// retry once on panic (gopsutil/issues/1832)
temps, err = a.getTempsWithPanicRecovery(sensors.TemperaturesWithContext)
if err != nil {
slog.Warn("Error updating temperatures", "err", err)
if len(systemStats.Temperatures) > 0 {
systemStats.Temperatures = make(map[string]float64)
}
return
}
}
slog.Debug("Temperature", "sensors", temps)
// return if no sensors
@@ -107,15 +121,28 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
continue
}
// set dashboard temperature
if a.sensorConfig.primarySensor == "" {
switch a.sensorConfig.primarySensor {
case "":
a.systemInfo.DashboardTemp = max(a.systemInfo.DashboardTemp, sensor.Temperature)
} else if a.sensorConfig.primarySensor == sensorName {
case sensorName:
a.systemInfo.DashboardTemp = sensor.Temperature
}
systemStats.Temperatures[sensorName] = twoDecimals(sensor.Temperature)
}
}
// getTempsWithPanicRecovery wraps sensors.TemperaturesWithContext to recover from panics (gopsutil/issues/1832)
func (a *Agent) getTempsWithPanicRecovery(getTemps getTempsFn) (temps []sensors.TemperatureStat, err error) {
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("panic: %v", r)
}
}()
// get sensor data (error ignored intentionally as it may be only with one sensor)
temps, _ = getTemps(a.sensorConfig.context)
return
}
// isValidSensor checks if a sensor is valid based on the sensor name and the sensor config
func isValidSensor(sensorName string, config *SensorConfig) bool {
// if no sensors configured, everything is valid

View File

@@ -4,11 +4,14 @@
package agent
import (
"beszel/internal/entities/system"
"context"
"fmt"
"os"
"testing"
"github.com/shirou/gopsutil/v4/common"
"github.com/shirou/gopsutil/v4/sensors"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -454,3 +457,97 @@ func TestScaleTemperatureLogic(t *testing.T) {
result, expected)
})
}
func TestGetTempsWithPanicRecovery(t *testing.T) {
agent := &Agent{
systemInfo: system.Info{},
sensorConfig: &SensorConfig{
context: context.Background(),
},
}
tests := []struct {
name string
getTempsFn getTempsFn
expectError bool
errorMsg string
}{
{
name: "successful_function_call",
getTempsFn: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
return []sensors.TemperatureStat{
{SensorKey: "test_sensor", Temperature: 45.0},
}, nil
},
expectError: false,
},
{
name: "function_returns_error",
getTempsFn: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
return []sensors.TemperatureStat{
{SensorKey: "test_sensor", Temperature: 45.0},
}, fmt.Errorf("sensor error")
},
expectError: false, // getTempsWithPanicRecovery ignores errors from the function
},
{
name: "function_panics_with_string",
getTempsFn: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
panic("test panic")
},
expectError: true,
errorMsg: "panic: test panic",
},
{
name: "function_panics_with_error",
getTempsFn: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
panic(fmt.Errorf("panic error"))
},
expectError: true,
errorMsg: "panic:",
},
{
name: "function_panics_with_index_out_of_bounds",
getTempsFn: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
slice := []int{1, 2, 3}
_ = slice[10] // out of bounds panic
return nil, nil
},
expectError: true,
errorMsg: "panic:",
},
{
name: "function_panics_with_any_conversion",
getTempsFn: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
var i any = "string"
_ = i.(int) // type assertion panic
return nil, nil
},
expectError: true,
errorMsg: "panic:",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var temps []sensors.TemperatureStat
var err error
// The function should not panic, regardless of what the injected function does
assert.NotPanics(t, func() {
temps, err = agent.getTempsWithPanicRecovery(tt.getTempsFn)
}, "getTempsWithPanicRecovery should not panic")
if tt.expectError {
assert.Error(t, err, "Expected an error to be returned")
if tt.errorMsg != "" {
assert.Contains(t, err.Error(), tt.errorMsg,
"Error message should contain expected text")
}
assert.Nil(t, temps, "Temps should be nil when panic occurs")
} else {
assert.NoError(t, err, "Should not return error for successful calls")
}
})
}
}

View File

@@ -182,7 +182,7 @@ function Install-BeszelAgentWithScoop {
scoop bucket add beszel https://github.com/henrygd/beszel-scoops | Out-Null
Write-Host "Installing / updating beszel-agent..."
scoop install beszel-agent
scoop install beszel-agent | Out-Null
if (-not (Test-CommandExists "beszel-agent")) {
throw "Failed to install beszel-agent"

View File

@@ -316,18 +316,27 @@ fi
# Create a dedicated user for the service if it doesn't exist
if is_alpine; then
if ! id -u beszel >/dev/null 2>&1; then
echo "Creating a dedicated group for the Beszel Agent service..."
addgroup beszel
echo "Creating a dedicated user for the Beszel Agent service..."
adduser -S -D -H -s /sbin/nologin beszel
adduser -S -D -H -s /sbin/nologin -G beszel beszel
fi
# Add the user to the docker group to allow access to the Docker socket
addgroup beszel docker
# Add the user to the docker group to allow access to the Docker socket if group docker exists
if getent group docker; then
echo "Adding besel to docker group"
usermod -aG docker beszel
fi
else
if ! id -u beszel >/dev/null 2>&1; then
echo "Creating a dedicated user for the Beszel Agent service..."
useradd --system --home-dir /nonexistent --shell /bin/false beszel
fi
# Add the user to the docker group to allow access to the Docker socket
usermod -aG docker beszel
# Add the user to the docker group to allow access to the Docker socket if group docker exists
if getent group docker; then
echo "Adding besel to docker group"
usermod -aG docker beszel
fi
fi
# Create the directory for the Beszel Agent