Compare commits

...

5 Commits

Author SHA1 Message Date
henrygd
4395520a28 Probable fix for Jetson gpu issue (#895) 2025-06-26 22:11:48 -04:00
Alexander Mnich
8c52f30a71 add GITHUB_TOKEN fallback for goreleaser (#925)
Adding the fallback to GITHUB_TOKEN allows goreleaser to run in a fork without additional configuration.
2025-06-26 21:03:19 -04:00
SSU
46316ebffa fix(install): suppress scoop output to avoid nssm path pollution (#918)
Suppressed the output of “scoop install beszel-agent” to ensure the NSSM service path
contains only the executable location.

Closes #915

Co-authored-by: suseol <suseol@geosr.com>
2025-06-25 13:52:45 -04:00
henrygd
0b04f60b6c Add panic recovery for sensors.TemperaturesWithContext (#796) 2025-06-23 19:50:11 -04:00
HansAndreManfredson
20b822d072 Fix missing groups #892 (#893) 2025-06-17 16:08:32 -04:00
7 changed files with 250 additions and 56 deletions

View File

@@ -39,4 +39,4 @@ jobs:
version: latest version: latest
args: release --clean args: release --clean
env: env:
GITHUB_TOKEN: ${{ secrets.TOKEN }} GITHUB_TOKEN: ${{ secrets.TOKEN || secrets.GITHUB_TOKEN }}

View File

@@ -243,21 +243,26 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
// copy / reset the data // copy / reset the data
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap)) gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
for id, gpu := range gm.GpuDataMap { for id, gpu := range gm.GpuDataMap {
// sum the data var gpuAvg system.GPUData
gpu.Temperature = twoDecimals(gpu.Temperature)
gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed) gpuAvg.Temperature = twoDecimals(gpu.Temperature)
gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal) gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpu.Usage = twoDecimals(gpu.Usage / gpu.Count) gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
gpu.Power = twoDecimals(gpu.Power / gpu.Count)
// reset the count // avoid division by zero
gpu.Count = 1 if gpu.Count > 0 {
// dereference to avoid overwriting anything else gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count)
gpuCopy := *gpu gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count)
}
// reset accumulators in the original
gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0
// append id to the name if there are multiple GPUs with the same name // append id to the name if there are multiple GPUs with the same name
if nameCounts[gpu.Name] > 1 { if nameCounts[gpu.Name] > 1 {
gpuCopy.Name = fmt.Sprintf("%s %s", gpu.Name, id) gpuAvg.Name = fmt.Sprintf("%s %s", gpu.Name, id)
} }
gpuData[id] = gpuCopy gpuData[id] = gpuAvg
} }
slog.Debug("GPU", "data", gpuData) slog.Debug("GPU", "data", gpuData)
return gpuData return gpuData

View File

@@ -279,6 +279,19 @@ func TestParseJetsonData(t *testing.T) {
Count: 1, Count: 1,
}, },
}, },
{
name: "orin nano",
input: "06-18-2025 11:25:24 RAM 3452/7620MB (lfb 25x4MB) SWAP 1518/16384MB (cached 174MB) CPU [1%@1420,2%@1420,0%@1420,2%@1420,2%@729,1%@729] GR3D_FREQ 0% cpu@50.031C soc2@49.031C soc0@50C gpu@49.031C tj@50.25C soc1@50.25C VDD_IN 4824mW/4824mW VDD_CPU_GPU_CV 518mW/518mW VDD_SOC 1475mW/1475mW",
wantMetrics: &system.GPUData{
Name: "GPU",
MemoryUsed: 3452.0,
MemoryTotal: 7620.0,
Usage: 0.0,
Temperature: 50.25,
Power: 0.518,
Count: 1,
},
},
{ {
name: "missing temperature", name: "missing temperature",
input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW",
@@ -318,44 +331,75 @@ func TestParseJetsonData(t *testing.T) {
} }
func TestGetCurrentData(t *testing.T) { func TestGetCurrentData(t *testing.T) {
gm := &GPUManager{ t.Run("calculates averages and resets accumulators", func(t *testing.T) {
GpuDataMap: map[string]*system.GPUData{ gm := &GPUManager{
"0": { GpuDataMap: map[string]*system.GPUData{
Name: "GPU1", "0": {
Temperature: 50, Name: "GPU1",
MemoryUsed: 2048, Temperature: 50,
MemoryTotal: 4096, MemoryUsed: 2048,
Usage: 100, // 100 over 2 counts = 50 avg MemoryTotal: 4096,
Power: 200, // 200 over 2 counts = 100 avg Usage: 100, // 100 over 2 counts = 50 avg
Count: 2, Power: 200, // 200 over 2 counts = 100 avg
Count: 2,
},
"1": {
Name: "GPU1",
Temperature: 60,
MemoryUsed: 3072,
MemoryTotal: 8192,
Usage: 30,
Power: 60,
Count: 1,
},
}, },
"1": { }
Name: "GPU1",
Temperature: 60, result := gm.GetCurrentData()
MemoryUsed: 3072,
MemoryTotal: 8192, // Verify name disambiguation
Usage: 30, assert.Equal(t, "GPU1 0", result["0"].Name)
Power: 60, assert.Equal(t, "GPU1 1", result["1"].Name)
Count: 1,
// Check averaged values in the result
assert.InDelta(t, 50.0, result["0"].Usage, 0.01)
assert.InDelta(t, 100.0, result["0"].Power, 0.01)
assert.InDelta(t, 30.0, result["1"].Usage, 0.01)
assert.InDelta(t, 60.0, result["1"].Power, 0.01)
// Verify that accumulators in the original map are reset
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset")
})
t.Run("handles zero count without panicking", func(t *testing.T) {
gm := &GPUManager{
GpuDataMap: map[string]*system.GPUData{
"0": {
Name: "TestGPU",
Count: 0,
Usage: 0,
Power: 0,
},
}, },
}, }
}
result := gm.GetCurrentData() var result map[string]system.GPUData
assert.NotPanics(t, func() {
result = gm.GetCurrentData()
})
// Verify name disambiguation // Check that usage and power are 0
assert.Equal(t, "GPU1 0", result["0"].Name) assert.Equal(t, 0.0, result["0"].Usage)
assert.Equal(t, "GPU1 1", result["1"].Name) assert.Equal(t, 0.0, result["0"].Power)
// Check averaged values // Verify reset count
assert.InDelta(t, 50.0, result["0"].Usage, 0.01) assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count)
assert.InDelta(t, 100.0, result["0"].Power, 0.01) })
assert.InDelta(t, 30.0, result["1"].Usage, 0.01)
assert.InDelta(t, 60.0, result["1"].Power, 0.01)
// Verify reset counts
assert.Equal(t, float64(1), gm.GpuDataMap["0"].Count)
assert.Equal(t, float64(1), gm.GpuDataMap["1"].Count)
} }
func TestDetectGPUs(t *testing.T) { func TestDetectGPUs(t *testing.T) {
@@ -722,6 +766,18 @@ func TestAccumulation(t *testing.T) {
assert.InDelta(t, expected.avgUsage, gpu.Usage, 0.01, "Average usage in GetCurrentData should match") assert.InDelta(t, expected.avgUsage, gpu.Usage, 0.01, "Average usage in GetCurrentData should match")
assert.InDelta(t, expected.avgPower, gpu.Power, 0.01, "Average power in GetCurrentData should match") assert.InDelta(t, expected.avgPower, gpu.Power, 0.01, "Average power in GetCurrentData should match")
} }
// Verify that accumulators in the original map are reset
for id := range tt.expectedValues {
gpu, exists := gm.GpuDataMap[id]
assert.True(t, exists, "GPU with ID %s should still exist after GetCurrentData", id)
if !exists {
continue
}
assert.Equal(t, float64(0), gpu.Count, "Count should be reset for GPU ID %s", id)
assert.Equal(t, float64(0), gpu.Usage, "Usage should be reset for GPU ID %s", id)
assert.Equal(t, float64(0), gpu.Power, "Power should be reset for GPU ID %s", id)
}
}) })
} }
} }

View File

@@ -3,6 +3,7 @@ package agent
import ( import (
"beszel/internal/entities/system" "beszel/internal/entities/system"
"context" "context"
"fmt"
"log/slog" "log/slog"
"path" "path"
"strconv" "strconv"
@@ -30,6 +31,9 @@ func (a *Agent) newSensorConfig() *SensorConfig {
return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal, skipCollection) return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal, skipCollection)
} }
// getTempsFn matches the signature of sensors.TemperaturesWithContext so the call
// can be passed to a wrapper that recovers from panics (gopsutil/issues/1832).
type getTempsFn func(ctx context.Context) ([]sensors.TemperatureStat, error)
// newSensorConfigWithEnv creates a SensorConfig with the provided environment variables // newSensorConfigWithEnv creates a SensorConfig with the provided environment variables
// sensorsSet indicates if the SENSORS environment variable was explicitly set (even to empty string) // sensorsSet indicates if the SENSORS environment variable was explicitly set (even to empty string)
func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal string, skipCollection bool) *SensorConfig { func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal string, skipCollection bool) *SensorConfig {
@@ -78,8 +82,18 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
// reset high temp // reset high temp
a.systemInfo.DashboardTemp = 0 a.systemInfo.DashboardTemp = 0
// get sensor data temps, err := a.getTempsWithPanicRecovery(sensors.TemperaturesWithContext)
temps, _ := sensors.TemperaturesWithContext(a.sensorConfig.context) if err != nil {
// retry once on panic (gopsutil/issues/1832)
temps, err = a.getTempsWithPanicRecovery(sensors.TemperaturesWithContext)
if err != nil {
slog.Warn("Error updating temperatures", "err", err)
if len(systemStats.Temperatures) > 0 {
systemStats.Temperatures = make(map[string]float64)
}
return
}
}
slog.Debug("Temperature", "sensors", temps) slog.Debug("Temperature", "sensors", temps)
// return if no sensors // return if no sensors
@@ -107,15 +121,28 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
continue continue
} }
// set dashboard temperature // set dashboard temperature
if a.sensorConfig.primarySensor == "" { switch a.sensorConfig.primarySensor {
case "":
a.systemInfo.DashboardTemp = max(a.systemInfo.DashboardTemp, sensor.Temperature) a.systemInfo.DashboardTemp = max(a.systemInfo.DashboardTemp, sensor.Temperature)
} else if a.sensorConfig.primarySensor == sensorName { case sensorName:
a.systemInfo.DashboardTemp = sensor.Temperature a.systemInfo.DashboardTemp = sensor.Temperature
} }
systemStats.Temperatures[sensorName] = twoDecimals(sensor.Temperature) systemStats.Temperatures[sensorName] = twoDecimals(sensor.Temperature)
} }
} }
// getTempsWithPanicRecovery calls getTemps with the agent's sensor context and
// turns any panic raised during that call into a returned error
// (gopsutil/issues/1832).
//
// The error returned by getTemps itself is discarded on purpose, since it may
// concern only a single sensor. When a panic is recovered, temps is left nil
// and err carries the recovered value prefixed with "panic: ".
func (a *Agent) getTempsWithPanicRecovery(getTemps getTempsFn) (temps []sensors.TemperatureStat, err error) {
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		err = fmt.Errorf("panic: %v", recovered)
	}()
	// error ignored intentionally: it may be reported for just one sensor
	temps, _ = getTemps(a.sensorConfig.context)
	return temps, err
}
// isValidSensor checks if a sensor is valid based on the sensor name and the sensor config // isValidSensor checks if a sensor is valid based on the sensor name and the sensor config
func isValidSensor(sensorName string, config *SensorConfig) bool { func isValidSensor(sensorName string, config *SensorConfig) bool {
// if no sensors configured, everything is valid // if no sensors configured, everything is valid

View File

@@ -4,11 +4,14 @@
package agent package agent
import ( import (
"beszel/internal/entities/system"
"context" "context"
"fmt"
"os" "os"
"testing" "testing"
"github.com/shirou/gopsutil/v4/common" "github.com/shirou/gopsutil/v4/common"
"github.com/shirou/gopsutil/v4/sensors"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
@@ -454,3 +457,97 @@ func TestScaleTemperatureLogic(t *testing.T) {
result, expected) result, expected)
}) })
} }
// TestGetTempsWithPanicRecovery checks that getTempsWithPanicRecovery never
// propagates a panic from the injected temperature function, reports panics as
// errors, and ignores ordinary errors returned by that function.
func TestGetTempsWithPanicRecovery(t *testing.T) {
	a := &Agent{
		systemInfo: system.Info{},
		sensorConfig: &SensorConfig{
			context: context.Background(),
		},
	}

	cases := []struct {
		name        string
		getTemps    getTempsFn
		expectError bool
		errorMsg    string
	}{
		{
			name: "successful_function_call",
			getTemps: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
				return []sensors.TemperatureStat{
					{SensorKey: "test_sensor", Temperature: 45.0},
				}, nil
			},
			expectError: false,
		},
		{
			name: "function_returns_error",
			getTemps: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
				return []sensors.TemperatureStat{
					{SensorKey: "test_sensor", Temperature: 45.0},
				}, fmt.Errorf("sensor error")
			},
			// errors returned by the injected function are ignored by the wrapper
			expectError: false,
		},
		{
			name: "function_panics_with_string",
			getTemps: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
				panic("test panic")
			},
			expectError: true,
			errorMsg:    "panic: test panic",
		},
		{
			name: "function_panics_with_error",
			getTemps: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
				panic(fmt.Errorf("panic error"))
			},
			expectError: true,
			errorMsg:    "panic:",
		},
		{
			name: "function_panics_with_index_out_of_bounds",
			getTemps: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
				slice := []int{1, 2, 3}
				_ = slice[10] // out of bounds panic
				return nil, nil
			},
			expectError: true,
			errorMsg:    "panic:",
		},
		{
			name: "function_panics_with_any_conversion",
			getTemps: func(ctx context.Context) ([]sensors.TemperatureStat, error) {
				var i any = "string"
				_ = i.(int) // type assertion panic
				return nil, nil
			},
			expectError: true,
			errorMsg:    "panic:",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			var (
				temps []sensors.TemperatureStat
				err   error
			)

			// whatever the injected function does, the wrapper itself must not panic
			assert.NotPanics(t, func() {
				temps, err = a.getTempsWithPanicRecovery(tc.getTemps)
			}, "getTempsWithPanicRecovery should not panic")

			if !tc.expectError {
				assert.NoError(t, err, "Should not return error for successful calls")
				return
			}

			assert.Error(t, err, "Expected an error to be returned")
			if tc.errorMsg != "" {
				assert.Contains(t, err.Error(), tc.errorMsg,
					"Error message should contain expected text")
			}
			assert.Nil(t, temps, "Temps should be nil when panic occurs")
		})
	}
}

View File

@@ -182,7 +182,7 @@ function Install-BeszelAgentWithScoop {
scoop bucket add beszel https://github.com/henrygd/beszel-scoops | Out-Null scoop bucket add beszel https://github.com/henrygd/beszel-scoops | Out-Null
Write-Host "Installing / updating beszel-agent..." Write-Host "Installing / updating beszel-agent..."
scoop install beszel-agent scoop install beszel-agent | Out-Null
if (-not (Test-CommandExists "beszel-agent")) { if (-not (Test-CommandExists "beszel-agent")) {
throw "Failed to install beszel-agent" throw "Failed to install beszel-agent"

View File

@@ -316,18 +316,27 @@ fi
# Create a dedicated user for the service if it doesn't exist # Create a dedicated user for the service if it doesn't exist
if is_alpine; then if is_alpine; then
if ! id -u beszel >/dev/null 2>&1; then if ! id -u beszel >/dev/null 2>&1; then
echo "Creating a dedicated group for the Beszel Agent service..."
addgroup beszel
echo "Creating a dedicated user for the Beszel Agent service..." echo "Creating a dedicated user for the Beszel Agent service..."
adduser -S -D -H -s /sbin/nologin beszel adduser -S -D -H -s /sbin/nologin -G beszel beszel
fi fi
# Add the user to the docker group to allow access to the Docker socket # Add the user to the docker group to allow access to the Docker socket if group docker exists
addgroup beszel docker if getent group docker; then
echo "Adding besel to docker group"
usermod -aG docker beszel
fi
else else
if ! id -u beszel >/dev/null 2>&1; then if ! id -u beszel >/dev/null 2>&1; then
echo "Creating a dedicated user for the Beszel Agent service..." echo "Creating a dedicated user for the Beszel Agent service..."
useradd --system --home-dir /nonexistent --shell /bin/false beszel useradd --system --home-dir /nonexistent --shell /bin/false beszel
fi fi
# Add the user to the docker group to allow access to the Docker socket # Add the user to the docker group to allow access to the Docker socket if group docker exists
usermod -aG docker beszel if getent group docker; then
echo "Adding besel to docker group"
usermod -aG docker beszel
fi
fi fi
# Create the directory for the Beszel Agent # Create the directory for the Beszel Agent