Prevent temperature collection from blocking agent stats (#1839)

This commit is contained in:
Jim Haff
2026-03-26 19:03:51 -05:00
committed by GitHub
parent f04684b30a
commit ad21cab457
2 changed files with 95 additions and 2 deletions

View File

@@ -2,12 +2,14 @@ package agent
import (
"context"
"errors"
"fmt"
"log/slog"
"path"
"runtime"
"strconv"
"strings"
"time"
"unicode/utf8"
"github.com/henrygd/beszel/agent/utils"
@@ -38,6 +40,11 @@ func (a *Agent) newSensorConfig() *SensorConfig {
// Matches sensors.TemperaturesWithContext to allow for panic recovery (gopsutil/issues/1832)
type getTempsFn func(ctx context.Context) ([]sensors.TemperatureStat, error)
var (
errTemperatureFetchTimeout = errors.New("temperature collection timed out")
temperatureFetchTimeout = 2 * time.Second
)
// newSensorConfigWithEnv creates a SensorConfig with the provided environment variables
// sensorsSet indicates if the SENSORS environment variable was explicitly set (even to empty string)
func (a *Agent) newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal string, skipCollection bool) *SensorConfig {
@@ -86,10 +93,12 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
// reset high temp
a.systemInfo.DashboardTemp = 0
temps, err := a.getTempsWithPanicRecovery(getSensorTemps)
temps, err := a.getTempsWithTimeout(getSensorTemps)
if err != nil {
// retry once on panic (gopsutil/issues/1832)
temps, err = a.getTempsWithPanicRecovery(getSensorTemps)
if !errors.Is(err, errTemperatureFetchTimeout) {
temps, err = a.getTempsWithTimeout(getSensorTemps)
}
if err != nil {
slog.Warn("Error updating temperatures", "err", err)
if len(systemStats.Temperatures) > 0 {
@@ -152,6 +161,26 @@ func (a *Agent) getTempsWithPanicRecovery(getTemps getTempsFn) (temps []sensors.
return
}
func (a *Agent) getTempsWithTimeout(getTemps getTempsFn) ([]sensors.TemperatureStat, error) {
type result struct {
temps []sensors.TemperatureStat
err error
}
resultCh := make(chan result, 1)
go func() {
temps, err := a.getTempsWithPanicRecovery(getTemps)
resultCh <- result{temps: temps, err: err}
}()
select {
case res := <-resultCh:
return res.temps, res.err
case <-time.After(temperatureFetchTimeout):
return nil, errTemperatureFetchTimeout
}
}
// isValidSensor checks if a sensor is valid based on the sensor name and the sensor config
func isValidSensor(sensorName string, config *SensorConfig) bool {
// if no sensors configured, everything is valid

View File

@@ -6,6 +6,7 @@ import (
"context"
"fmt"
"testing"
"time"
"github.com/henrygd/beszel/internal/entities/system"
@@ -526,3 +527,66 @@ func TestGetTempsWithPanicRecovery(t *testing.T) {
})
}
}
func TestGetTempsWithTimeout(t *testing.T) {
agent := &Agent{
sensorConfig: &SensorConfig{
context: context.Background(),
},
}
originalTimeout := temperatureFetchTimeout
t.Cleanup(func() {
temperatureFetchTimeout = originalTimeout
})
temperatureFetchTimeout = 10 * time.Millisecond
t.Run("returns temperatures before timeout", func(t *testing.T) {
temps, err := agent.getTempsWithTimeout(func(ctx context.Context) ([]sensors.TemperatureStat, error) {
return []sensors.TemperatureStat{{SensorKey: "cpu_temp", Temperature: 42}}, nil
})
require.NoError(t, err)
require.Len(t, temps, 1)
assert.Equal(t, "cpu_temp", temps[0].SensorKey)
})
t.Run("returns timeout error when collector hangs", func(t *testing.T) {
temps, err := agent.getTempsWithTimeout(func(ctx context.Context) ([]sensors.TemperatureStat, error) {
time.Sleep(50 * time.Millisecond)
return []sensors.TemperatureStat{{SensorKey: "cpu_temp", Temperature: 42}}, nil
})
assert.Nil(t, temps)
assert.ErrorIs(t, err, errTemperatureFetchTimeout)
})
}
func TestUpdateTemperaturesSkipsOnTimeout(t *testing.T) {
agent := &Agent{
systemInfo: system.Info{DashboardTemp: 99},
sensorConfig: &SensorConfig{
context: context.Background(),
},
}
originalTimeout := temperatureFetchTimeout
t.Cleanup(func() {
temperatureFetchTimeout = originalTimeout
getSensorTemps = sensors.TemperaturesWithContext
})
temperatureFetchTimeout = 10 * time.Millisecond
getSensorTemps = func(ctx context.Context) ([]sensors.TemperatureStat, error) {
time.Sleep(50 * time.Millisecond)
return nil, nil
}
stats := &system.Stats{
Temperatures: map[string]float64{"stale": 50},
}
agent.updateTemperatures(stats)
assert.Equal(t, 0.0, agent.systemInfo.DashboardTemp)
assert.Equal(t, map[string]float64{}, stats.Temperatures)
}