diff --git a/internal/hub/systems/system.go b/internal/hub/systems/system.go
index d285bab7..5991db4b 100644
--- a/internal/hub/systems/system.go
+++ b/internal/hub/systems/system.go
@@ -156,11 +156,9 @@ func (sys *System) update() error {
 		if sys.smartInterval <= 0 {
 			sys.smartInterval = time.Hour
 		}
-		lastFetch, _ := sys.manager.smartFetchMap.GetOk(sys.Id)
-		if time.Since(time.UnixMilli(lastFetch-1e4)) >= sys.smartInterval && sys.smartFetching.CompareAndSwap(false, true) {
+		if sys.shouldFetchSmart() && sys.smartFetching.CompareAndSwap(false, true) {
 			go func() {
 				defer sys.smartFetching.Store(false)
-				sys.manager.smartFetchMap.Set(sys.Id, time.Now().UnixMilli(), sys.smartInterval+time.Minute)
 				_ = sys.FetchAndSaveSmartDevices()
 			}()
 		}
@@ -643,6 +641,7 @@ func (s *System) createSSHClient() error {
 		return err
 	}
 	s.agentVersion, _ = extractAgentVersion(string(s.client.Conn.ServerVersion()))
+	s.manager.resetFailedSmartFetchState(s.Id) // drop a failed-SMART cooldown entry so SMART is retried on this new connection; NOTE(review): s.manager is used without a nil check here — confirm it is always set
 	return nil
 }
 
diff --git a/internal/hub/systems/system_manager.go b/internal/hub/systems/system_manager.go
index ca94b52c..f76293de 100644
--- a/internal/hub/systems/system_manager.go
+++ b/internal/hub/systems/system_manager.go
@@ -44,7 +44,7 @@ type SystemManager struct {
 	hub           hubLike                       // Hub interface for database and alert operations
 	systems       *store.Store[string, *System] // Thread-safe store of active systems
 	sshConfig     *ssh.ClientConfig             // SSH client configuration for system connections
-	smartFetchMap *expirymap.ExpiryMap[int64]   // Stores last SMART fetch time per system ID
+	smartFetchMap *expirymap.ExpiryMap[bool]    // Per system ID: true when the last SMART fetch returned devices without error; an entry's presence blocks refetching until it is removed or its TTL lapses
 }
 
 // hubLike defines the interface requirements for the hub dependency.
@@ -62,7 +62,7 @@ func NewSystemManager(hub hubLike) *SystemManager {
 	return &SystemManager{
 		systems:       store.New(map[string]*System{}),
 		hub:           hub,
-		smartFetchMap: expirymap.New[int64](time.Hour),
+		smartFetchMap: expirymap.New[bool](time.Hour),
 	}
 }
 
@@ -306,6 +306,7 @@ func (sm *SystemManager) AddWebSocketSystem(systemId string, agentVersion semver
 	if err != nil {
 		return err
 	}
+	sm.resetFailedSmartFetchState(systemId) // a new WebSocket connection clears a failed-SMART cooldown entry, if any
 
 	system := sm.NewSystem(systemId)
 	system.WsConn = wsConn
@@ -317,6 +318,15 @@ func (sm *SystemManager) AddWebSocketSystem(systemId string, agentVersion semver
 	return nil
 }
 
+// resetFailedSmartFetchState removes the SMART cooldown entry for systemID only when that entry is marked
+// unsuccessful (false), so a reconnecting agent is retried at once; entries marked successful, or absent, are left untouched.
+func (sm *SystemManager) resetFailedSmartFetchState(systemID string) {
+	succeeded, ok := sm.smartFetchMap.GetOk(systemID)
+	if ok && !succeeded {
+		sm.smartFetchMap.Remove(systemID)
+	}
+}
+
 // createSSHClientConfig initializes the SSH client configuration for connecting to an agent's server
 func (sm *SystemManager) createSSHClientConfig() error {
 	privateKey, err := sm.hub.GetSSHKey("")
diff --git a/internal/hub/systems/system_smart.go b/internal/hub/systems/system_smart.go
index ca38f5b9..bd6c573b 100644
--- a/internal/hub/systems/system_smart.go
+++ b/internal/hub/systems/system_smart.go
@@ -4,6 +4,7 @@ import (
 	"database/sql"
 	"errors"
 	"strings"
+	"time"
 
 	"github.com/henrygd/beszel/internal/entities/smart"
 	"github.com/pocketbase/pocketbase/core"
@@ -12,10 +13,39 @@ import (
 // FetchAndSaveSmartDevices fetches SMART data from the agent and saves it to the database
 func (sys *System) FetchAndSaveSmartDevices() error {
 	smartData, err := sys.FetchSmartDataFromAgent()
-	if err != nil || len(smartData) == 0 {
+	if err != nil {
+		sys.recordSmartFetchResult(err, 0) // fetch failed: record an unsuccessful cooldown entry before returning
 		return err
 	}
-	return sys.saveSmartDevices(smartData)
+	err = sys.saveSmartDevices(smartData)
+	sys.recordSmartFetchResult(err, len(smartData)) // an empty result is now passed to saveSmartDevices and recorded as unsuccessful
+	return err
+}
+
+// recordSmartFetchResult stores a cooldown entry for this system whose value is true only when err is nil
+// and deviceCount is positive; the entry's TTL is the SMART interval plus one minute. It does nothing when sys.manager is nil.
+func (sys *System) recordSmartFetchResult(err error, deviceCount int) {
+	if sys.manager == nil {
+		return
+	}
+	sys.manager.smartFetchMap.Set(sys.Id, err == nil && deviceCount > 0, sys.smartFetchInterval()+time.Minute) // NOTE(review): shouldFetchSmart only checks entry presence, so the extra minute appears to lengthen the cooldown beyond the configured interval — confirm intended
+}
+
+// shouldFetchSmart reports whether no SMART cooldown entry is currently stored for this system; it also returns true when sys.manager is nil.
+func (sys *System) shouldFetchSmart() bool {
+	if sys.manager == nil {
+		return true
+	}
+	_, ok := sys.manager.smartFetchMap.GetOk(sys.Id)
+	return !ok
+}
+
+// smartFetchInterval returns sys.smartInterval when it is positive and time.Hour otherwise.
+func (sys *System) smartFetchInterval() time.Duration {
+	if sys.smartInterval > 0 {
+		return sys.smartInterval
+	}
+	return time.Hour
 }
 
 // saveSmartDevices saves SMART device data to the smart_devices collection
diff --git a/internal/hub/systems/system_smart_test.go b/internal/hub/systems/system_smart_test.go
new file mode 100644
index 00000000..bf1985ff
--- /dev/null
+++ b/internal/hub/systems/system_smart_test.go
@@ -0,0 +1,75 @@
+//go:build testing
+
+package systems
+
+import (
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/henrygd/beszel/internal/hub/expirymap"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestRecordSmartFetchResult(t *testing.T) {
+	sm := &SystemManager{smartFetchMap: expirymap.New[bool](time.Hour)}
+	t.Cleanup(sm.smartFetchMap.StopCleaner)
+
+	sys := &System{
+		Id:            "system-1",
+		manager:       sm,
+		smartInterval: time.Hour,
+	}
+
+	// No error and a positive device count: the entry is stored as true
+	sys.recordSmartFetchResult(nil, 5)
+	succeeded, ok := sm.smartFetchMap.GetOk(sys.Id)
+	assert.True(t, ok, "expected smart fetch result to be stored")
+	assert.True(t, succeeded, "expected successful fetch state to be recorded")
+
+	// An error overwrites the earlier entry with false
+	sys.recordSmartFetchResult(errors.New("failed"), 0)
+	succeeded, ok = sm.smartFetchMap.GetOk(sys.Id)
+	assert.True(t, ok, "expected failed smart fetch state to be stored")
+	assert.False(t, succeeded, "expected failed smart fetch state to be marked unsuccessful")
+
+	// No error but zero devices is also stored as false
+	sys.recordSmartFetchResult(nil, 0)
+	succeeded, ok = sm.smartFetchMap.GetOk(sys.Id)
+	assert.True(t, ok, "expected fetch with zero devices to be stored")
+	assert.False(t, succeeded, "expected fetch with zero devices to be marked unsuccessful")
+}
+
+func TestShouldFetchSmart(t *testing.T) {
+	sm := &SystemManager{smartFetchMap: expirymap.New[bool](time.Hour)}
+	t.Cleanup(sm.smartFetchMap.StopCleaner)
+
+	sys := &System{
+		Id:            "system-1",
+		manager:       sm,
+		smartInterval: time.Hour,
+	}
+
+	assert.True(t, sys.shouldFetchSmart(), "expected initial smart fetch to be allowed")
+
+	sys.recordSmartFetchResult(errors.New("failed"), 0)
+	assert.False(t, sys.shouldFetchSmart(), "expected smart fetch to be blocked while interval entry exists")
+
+	sm.smartFetchMap.Remove(sys.Id)
+	assert.True(t, sys.shouldFetchSmart(), "expected smart fetch to be allowed after interval entry is cleared")
+}
+
+func TestResetFailedSmartFetchState(t *testing.T) {
+	sm := &SystemManager{smartFetchMap: expirymap.New[bool](time.Hour)}
+	t.Cleanup(sm.smartFetchMap.StopCleaner)
+
+	sm.smartFetchMap.Set("system-1", false, time.Hour)
+	sm.resetFailedSmartFetchState("system-1")
+	_, ok := sm.smartFetchMap.GetOk("system-1")
+	assert.False(t, ok, "expected failed smart fetch state to be cleared on reconnect")
+
+	sm.smartFetchMap.Set("system-1", true, time.Hour)
+	sm.resetFailedSmartFetchState("system-1")
+	_, ok = sm.smartFetchMap.GetOk("system-1")
+	assert.True(t, ok, "expected successful smart fetch state to be preserved")
+}