From 5bd43ed4614129c8ea53dac50bf5e65bf344a34f Mon Sep 17 00:00:00 2001 From: henrygd Date: Sat, 28 Mar 2026 20:47:16 -0400 Subject: [PATCH] hub: reset smart interval on agent reconnect if agent hasn't successfully saved smart devices this is so people trying to get smart working can see the config changes immediately. no need to wait for the smart interval. --- internal/hub/systems/system.go | 5 +- internal/hub/systems/system_manager.go | 14 ++++- internal/hub/systems/system_smart.go | 34 +++++++++- internal/hub/systems/system_smart_test.go | 75 +++++++++++++++++++++++ 4 files changed, 121 insertions(+), 7 deletions(-) create mode 100644 internal/hub/systems/system_smart_test.go diff --git a/internal/hub/systems/system.go b/internal/hub/systems/system.go index d285bab7..5991db4b 100644 --- a/internal/hub/systems/system.go +++ b/internal/hub/systems/system.go @@ -156,11 +156,9 @@ func (sys *System) update() error { if sys.smartInterval <= 0 { sys.smartInterval = time.Hour } - lastFetch, _ := sys.manager.smartFetchMap.GetOk(sys.Id) - if time.Since(time.UnixMilli(lastFetch-1e4)) >= sys.smartInterval && sys.smartFetching.CompareAndSwap(false, true) { + if sys.shouldFetchSmart() && sys.smartFetching.CompareAndSwap(false, true) { go func() { defer sys.smartFetching.Store(false) - sys.manager.smartFetchMap.Set(sys.Id, time.Now().UnixMilli(), sys.smartInterval+time.Minute) _ = sys.FetchAndSaveSmartDevices() }() } @@ -643,6 +641,7 @@ func (s *System) createSSHClient() error { return err } s.agentVersion, _ = extractAgentVersion(string(s.client.Conn.ServerVersion())) + s.manager.resetFailedSmartFetchState(s.Id) return nil } diff --git a/internal/hub/systems/system_manager.go b/internal/hub/systems/system_manager.go index ca94b52c..f76293de 100644 --- a/internal/hub/systems/system_manager.go +++ b/internal/hub/systems/system_manager.go @@ -44,7 +44,7 @@ type SystemManager struct { hub hubLike // Hub interface for database and alert operations systems *store.Store[string, 
*System] // Thread-safe store of active systems sshConfig *ssh.ClientConfig // SSH client configuration for system connections - smartFetchMap *expirymap.ExpiryMap[int64] // Stores last SMART fetch time per system ID + smartFetchMap *expirymap.ExpiryMap[bool] // Stores whether the last SMART fetch succeeded while entry TTL enforces fetch interval } // hubLike defines the interface requirements for the hub dependency. @@ -62,7 +62,7 @@ func NewSystemManager(hub hubLike) *SystemManager { return &SystemManager{ systems: store.New(map[string]*System{}), hub: hub, - smartFetchMap: expirymap.New[int64](time.Hour), + smartFetchMap: expirymap.New[bool](time.Hour), } } @@ -306,6 +306,7 @@ func (sm *SystemManager) AddWebSocketSystem(systemId string, agentVersion semver if err != nil { return err } + sm.resetFailedSmartFetchState(systemId) system := sm.NewSystem(systemId) system.WsConn = wsConn @@ -317,6 +318,15 @@ func (sm *SystemManager) AddWebSocketSystem(systemId string, agentVersion semver return nil } +// resetFailedSmartFetchState clears only failed SMART cooldown entries so a fresh +// agent reconnect retries SMART discovery immediately after configuration changes. 
+func (sm *SystemManager) resetFailedSmartFetchState(systemID string) { + succeeded, ok := sm.smartFetchMap.GetOk(systemID) + if ok && !succeeded { + sm.smartFetchMap.Remove(systemID) + } +} + // createSSHClientConfig initializes the SSH client configuration for connecting to an agent's server func (sm *SystemManager) createSSHClientConfig() error { privateKey, err := sm.hub.GetSSHKey("") diff --git a/internal/hub/systems/system_smart.go b/internal/hub/systems/system_smart.go index ca38f5b9..bd6c573b 100644 --- a/internal/hub/systems/system_smart.go +++ b/internal/hub/systems/system_smart.go @@ -4,6 +4,7 @@ import ( "database/sql" "errors" "strings" + "time" "github.com/henrygd/beszel/internal/entities/smart" "github.com/pocketbase/pocketbase/core" @@ -12,10 +13,39 @@ import ( // FetchAndSaveSmartDevices fetches SMART data from the agent and saves it to the database func (sys *System) FetchAndSaveSmartDevices() error { smartData, err := sys.FetchSmartDataFromAgent() - if err != nil || len(smartData) == 0 { + if err != nil { + sys.recordSmartFetchResult(err, 0) return err } - return sys.saveSmartDevices(smartData) + err = sys.saveSmartDevices(smartData) + sys.recordSmartFetchResult(err, len(smartData)) + return err +} + +// recordSmartFetchResult stores a cooldown entry for the SMART interval and marks +// whether the last fetch produced any devices, so failed setup can retry on reconnect. +func (sys *System) recordSmartFetchResult(err error, deviceCount int) { + if sys.manager == nil { + return + } + sys.manager.smartFetchMap.Set(sys.Id, err == nil && deviceCount > 0, sys.smartFetchInterval()+time.Minute) +} + +// shouldFetchSmart returns true when there is no active SMART cooldown entry for this system. +func (sys *System) shouldFetchSmart() bool { + if sys.manager == nil { + return true + } + _, ok := sys.manager.smartFetchMap.GetOk(sys.Id) + return !ok +} + +// smartFetchInterval returns the agent-provided SMART interval or the default when unset. 
+func (sys *System) smartFetchInterval() time.Duration { + if sys.smartInterval > 0 { + return sys.smartInterval + } + return time.Hour } // saveSmartDevices saves SMART device data to the smart_devices collection diff --git a/internal/hub/systems/system_smart_test.go b/internal/hub/systems/system_smart_test.go new file mode 100644 index 00000000..bf1985ff --- /dev/null +++ b/internal/hub/systems/system_smart_test.go @@ -0,0 +1,75 @@ +//go:build testing + +package systems + +import ( + "errors" + "testing" + "time" + + "github.com/henrygd/beszel/internal/hub/expirymap" + "github.com/stretchr/testify/assert" +) + +func TestRecordSmartFetchResult(t *testing.T) { + sm := &SystemManager{smartFetchMap: expirymap.New[bool](time.Hour)} + t.Cleanup(sm.smartFetchMap.StopCleaner) + + sys := &System{ + Id: "system-1", + manager: sm, + smartInterval: time.Hour, + } + + // Successful fetch with devices + sys.recordSmartFetchResult(nil, 5) + succeeded, ok := sm.smartFetchMap.GetOk(sys.Id) + assert.True(t, ok, "expected smart fetch result to be stored") + assert.True(t, succeeded, "expected successful fetch state to be recorded") + + // Failed fetch + sys.recordSmartFetchResult(errors.New("failed"), 0) + succeeded, ok = sm.smartFetchMap.GetOk(sys.Id) + assert.True(t, ok, "expected failed smart fetch state to be stored") + assert.False(t, succeeded, "expected failed smart fetch state to be marked unsuccessful") + + // Successful fetch but no devices + sys.recordSmartFetchResult(nil, 0) + succeeded, ok = sm.smartFetchMap.GetOk(sys.Id) + assert.True(t, ok, "expected fetch with zero devices to be stored") + assert.False(t, succeeded, "expected fetch with zero devices to be marked unsuccessful") +} + +func TestShouldFetchSmart(t *testing.T) { + sm := &SystemManager{smartFetchMap: expirymap.New[bool](time.Hour)} + t.Cleanup(sm.smartFetchMap.StopCleaner) + + sys := &System{ + Id: "system-1", + manager: sm, + smartInterval: time.Hour, + } + + assert.True(t, sys.shouldFetchSmart(), 
"expected initial smart fetch to be allowed") + + sys.recordSmartFetchResult(errors.New("failed"), 0) + assert.False(t, sys.shouldFetchSmart(), "expected smart fetch to be blocked while interval entry exists") + + sm.smartFetchMap.Remove(sys.Id) + assert.True(t, sys.shouldFetchSmart(), "expected smart fetch to be allowed after interval entry is cleared") +} + +func TestResetFailedSmartFetchState(t *testing.T) { + sm := &SystemManager{smartFetchMap: expirymap.New[bool](time.Hour)} + t.Cleanup(sm.smartFetchMap.StopCleaner) + + sm.smartFetchMap.Set("system-1", false, time.Hour) + sm.resetFailedSmartFetchState("system-1") + _, ok := sm.smartFetchMap.GetOk("system-1") + assert.False(t, ok, "expected failed smart fetch state to be cleared on reconnect") + + sm.smartFetchMap.Set("system-1", true, time.Hour) + sm.resetFailedSmartFetchState("system-1") + _, ok = sm.smartFetchMap.GetOk("system-1") + assert.True(t, ok, "expected successful smart fetch state to be preserved") +}