From ad21cab457ef963543f519636b44a3c978380ef5 Mon Sep 17 00:00:00 2001 From: Jim Haff Date: Thu, 26 Mar 2026 19:03:51 -0500 Subject: [PATCH] Prevent temperature collection from blocking agent stats (#1839) --- agent/sensors.go | 33 ++++++++++++++++++++-- agent/sensors_test.go | 64 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/agent/sensors.go b/agent/sensors.go index 08cd29dd..e4c0e7c9 100644 --- a/agent/sensors.go +++ b/agent/sensors.go @@ -2,12 +2,14 @@ package agent import ( "context" + "errors" "fmt" "log/slog" "path" "runtime" "strconv" "strings" + "time" "unicode/utf8" "github.com/henrygd/beszel/agent/utils" @@ -38,6 +40,11 @@ func (a *Agent) newSensorConfig() *SensorConfig { // Matches sensors.TemperaturesWithContext to allow for panic recovery (gopsutil/issues/1832) type getTempsFn func(ctx context.Context) ([]sensors.TemperatureStat, error) +var ( + errTemperatureFetchTimeout = errors.New("temperature collection timed out") + temperatureFetchTimeout = 2 * time.Second +) + // newSensorConfigWithEnv creates a SensorConfig with the provided environment variables // sensorsSet indicates if the SENSORS environment variable was explicitly set (even to empty string) func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal string, skipCollection bool) *SensorConfig { @@ -86,10 +93,12 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) { // reset high temp a.systemInfo.DashboardTemp = 0 - temps, err := a.getTempsWithPanicRecovery(getSensorTemps) + temps, err := a.getTempsWithTimeout(getSensorTemps) if err != nil { // retry once on panic (gopsutil/issues/1832) - temps, err = a.getTempsWithPanicRecovery(getSensorTemps) + if !errors.Is(err, errTemperatureFetchTimeout) { + temps, err = a.getTempsWithTimeout(getSensorTemps) + } if err != nil { slog.Warn("Error updating temperatures", "err", err) if len(systemStats.Temperatures) > 0 { @@ -152,6 +161,26 @@ func (a *Agent) getTempsWithPanicRecovery(getTemps getTempsFn) (temps []sensors. return } +func (a *Agent) getTempsWithTimeout(getTemps getTempsFn) ([]sensors.TemperatureStat, error) { + type result struct { + temps []sensors.TemperatureStat + err error + } + + resultCh := make(chan result, 1) + go func() { + temps, err := a.getTempsWithPanicRecovery(getTemps) + resultCh <- result{temps: temps, err: err} + }() + + select { + case res := <-resultCh: + return res.temps, res.err + case <-time.After(temperatureFetchTimeout): + return nil, errTemperatureFetchTimeout + } +} + // isValidSensor checks if a sensor is valid based on the sensor name and the sensor config func isValidSensor(sensorName string, config *SensorConfig) bool { // if no sensors configured, everything is valid diff --git a/agent/sensors_test.go b/agent/sensors_test.go index de27ca35..23e2db6a 100644 --- a/agent/sensors_test.go +++ b/agent/sensors_test.go @@ -6,6 +6,7 @@ import ( "context" "fmt" "testing" + "time" "github.com/henrygd/beszel/internal/entities/system" @@ -526,3 +527,66 @@ func TestGetTempsWithPanicRecovery(t *testing.T) { }) } } + +func TestGetTempsWithTimeout(t *testing.T) { + agent := &Agent{ + sensorConfig: &SensorConfig{ + context: context.Background(), + }, + } + + originalTimeout := temperatureFetchTimeout + t.Cleanup(func() { + temperatureFetchTimeout = originalTimeout + }) + temperatureFetchTimeout = 10 * time.Millisecond + + t.Run("returns temperatures before timeout", func(t *testing.T) { + temps, err := agent.getTempsWithTimeout(func(ctx context.Context) ([]sensors.TemperatureStat, error) { + return []sensors.TemperatureStat{{SensorKey: "cpu_temp", Temperature: 42}}, nil + }) + + require.NoError(t, err) + require.Len(t, temps, 1) + assert.Equal(t, "cpu_temp", temps[0].SensorKey) + }) + + t.Run("returns timeout error when collector hangs", func(t *testing.T) { + temps, err := agent.getTempsWithTimeout(func(ctx context.Context) ([]sensors.TemperatureStat, error) { + time.Sleep(50 * time.Millisecond) + return []sensors.TemperatureStat{{SensorKey: "cpu_temp", Temperature: 42}}, nil + }) + + assert.Nil(t, temps) + assert.ErrorIs(t, err, errTemperatureFetchTimeout) + }) +} + +func TestUpdateTemperaturesSkipsOnTimeout(t *testing.T) { + agent := &Agent{ + systemInfo: system.Info{DashboardTemp: 99}, + sensorConfig: &SensorConfig{ + context: context.Background(), + }, + } + + originalTimeout := temperatureFetchTimeout + t.Cleanup(func() { + temperatureFetchTimeout = originalTimeout + getSensorTemps = sensors.TemperaturesWithContext + }) + temperatureFetchTimeout = 10 * time.Millisecond + getSensorTemps = func(ctx context.Context) ([]sensors.TemperatureStat, error) { + time.Sleep(50 * time.Millisecond) + return nil, nil + } + + stats := &system.Stats{ + Temperatures: map[string]float64{"stale": 50}, + } + + agent.updateTemperatures(stats) + + assert.Equal(t, 0.0, agent.systemInfo.DashboardTemp) + assert.Equal(t, map[string]float64{}, stats.Temperatures) +}