From 59057a2ba445b19c0efdeab43f41102e345c3e26 Mon Sep 17 00:00:00 2001 From: henrygd Date: Wed, 17 Sep 2025 13:31:49 -0400 Subject: [PATCH] add check for status alerts which are not properly resolved (#1052) --- agent/deltatracker/deltatracker.go | 1 + agent/deltatracker/deltatracker_test.go | 1 - internal/alerts/alerts_status.go | 43 +++++++- internal/alerts/alerts_test.go | 128 ++++++++++++++++++----- internal/alerts/alerts_test_helpers.go | 7 ++ internal/records/records_test.go | 10 +- internal/records/records_test_helpers.go | 12 +-- internal/tests/hub.go | 25 +++++ 8 files changed, 187 insertions(+), 40 deletions(-) diff --git a/agent/deltatracker/deltatracker.go b/agent/deltatracker/deltatracker.go index 08f86c38..c57ff24e 100644 --- a/agent/deltatracker/deltatracker.go +++ b/agent/deltatracker/deltatracker.go @@ -1,3 +1,4 @@ +// Package deltatracker provides a tracker for calculating differences in numeric values over time. package deltatracker import ( diff --git a/agent/deltatracker/deltatracker_test.go b/agent/deltatracker/deltatracker_test.go index 50059ad1..2b2fd5a9 100644 --- a/agent/deltatracker/deltatracker_test.go +++ b/agent/deltatracker/deltatracker_test.go @@ -1,4 +1,3 @@ -// Package deltatracker provides a tracker for calculating differences in numeric values over time. package deltatracker import ( diff --git a/internal/alerts/alerts_status.go b/internal/alerts/alerts_status.go index 47e5a778..8e7df5a7 100644 --- a/internal/alerts/alerts_status.go +++ b/internal/alerts/alerts_status.go @@ -25,7 +25,12 @@ type alertInfo struct { // startWorker is a long-running goroutine that processes alert tasks // every x seconds. It must be running to process status alerts. func (am *AlertManager) startWorker() { - tick := time.Tick(15 * time.Second) + processPendingAlerts := time.Tick(15 * time.Second) + + // check for status alerts that are not resolved when system comes up + // (can be removed if we figure out core bug in #1052) + checkStatusAlerts := time.Tick(561 * time.Second) + for { select { case <-am.stopChan: @@ -41,7 +46,9 @@ func (am *AlertManager) startWorker() { case "cancel": am.pendingAlerts.Delete(task.alertRecord.Id) } - case <-tick: + case <-checkStatusAlerts: + resolveStatusAlerts(am.hub) + case <-processPendingAlerts: // Check for expired alerts every tick now := time.Now() for key, value := range am.pendingAlerts.Range { @@ -170,3 +177,35 @@ func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, a LinkText: "View " + systemName, }) } + +// resolveStatusAlerts resolves any status alerts that weren't resolved +// when system came up (https://github.com/henrygd/beszel/issues/1052) +func resolveStatusAlerts(app core.App) error { + db := app.DB() + // Find all active status alerts where the system is actually up + var alertIds []string + err := db.NewQuery(` + SELECT a.id + FROM alerts a + JOIN systems s ON a.system = s.id + WHERE a.name = 'Status' + AND a.triggered = true + AND s.status = 'up' + `).Column(&alertIds) + if err != nil { + return err + } + // resolve all matching alert records + for _, alertId := range alertIds { + alert, err := app.FindRecordById("alerts", alertId) + if err != nil { + return err + } + alert.Set("triggered", false) + err = app.Save(alert) + if err != nil { + return err + } + } + return nil +} diff --git a/internal/alerts/alerts_test.go b/internal/alerts/alerts_test.go index d14722fa..ee1b5bf5 100644 --- a/internal/alerts/alerts_test.go +++ b/internal/alerts/alerts_test.go @@ -13,6 +13,7 @@ import ( "testing/synctest" "time" + "github.com/henrygd/beszel/internal/alerts" beszelTests "github.com/henrygd/beszel/internal/tests" "github.com/pocketbase/dbx" @@ -369,33 +370,9 @@ func TestUserAlertsApi(t *testing.T) { } } -func getHubWithUser(t *testing.T) (*beszelTests.TestHub, *core.Record) { - hub, err := beszelTests.NewTestHub(t.TempDir()) - assert.NoError(t, err) - hub.StartHub() - - // Manually initialize the system manager to bind event hooks - err = hub.GetSystemManager().Initialize() - assert.NoError(t, err) - - // Create a test user - user, err := beszelTests.CreateUser(hub, "test@example.com", "password") - assert.NoError(t, err) - - // Create user settings for the test user (required for alert notifications) - userSettingsData := map[string]any{ - "user": user.Id, - "settings": `{"emails":[test@example.com],"webhooks":[]}`, - } - _, err = beszelTests.CreateRecord(hub, "user_settings", userSettingsData) - assert.NoError(t, err) - - return hub, user -} - func TestStatusAlerts(t *testing.T) { synctest.Test(t, func(t *testing.T) { - hub, user := getHubWithUser(t) + hub, user := beszelTests.GetHubWithUser(t) defer hub.Cleanup() systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused") @@ -476,7 +453,7 @@ func TestStatusAlerts(t *testing.T) { func TestAlertsHistory(t *testing.T) { synctest.Test(t, func(t *testing.T) { - hub, user := getHubWithUser(t) + hub, user := beszelTests.GetHubWithUser(t) defer hub.Cleanup() // Create systems and alerts @@ -602,3 +579,102 @@ func TestAlertsHistory(t *testing.T) { assert.EqualValues(t, 2, totalHistoryCount, "Should have 2 total alert history records") }) } +func TestResolveStatusAlerts(t *testing.T) { + hub, user := beszelTests.GetHubWithUser(t) + defer hub.Cleanup() + + // Create a systemUp + systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{ + "name": "test-system", + "users": []string{user.Id}, + "host": "127.0.0.1", + "status": "up", + }) + assert.NoError(t, err) + + systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{ + "name": "test-system-2", + "users": []string{user.Id}, + "host": "127.0.0.2", + "status": "up", + }) + assert.NoError(t, err) + + // Create a status alertUp for the system + alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{ + "name": "Status", + "system": systemUp.Id, + "user": user.Id, + "min": 1, + }) + assert.NoError(t, err) + + alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{ + "name": "Status", + "system": systemDown.Id, + "user": user.Id, + "min": 1, + }) + assert.NoError(t, err) + + // Verify alert is not triggered initially + assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially") + + // Set the system to 'up' (this should not trigger the alert) + systemUp.Set("status", "up") + err = hub.SaveNoValidate(systemUp) + assert.NoError(t, err) + + systemDown.Set("status", "down") + err = hub.SaveNoValidate(systemDown) + assert.NoError(t, err) + + // Wait a moment for any processing + time.Sleep(10 * time.Millisecond) + + // Verify alertUp is still not triggered after setting system to up + alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id}) + assert.NoError(t, err) + assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up") + + // Manually set both alerts triggered to true + alertUp.Set("triggered", true) + err = hub.SaveNoValidate(alertUp) + assert.NoError(t, err) + alertDown.Set("triggered", true) + err = hub.SaveNoValidate(alertDown) + assert.NoError(t, err) + + // Verify we have exactly one alert with triggered true + triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true}) + assert.NoError(t, err) + assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true") + + // Verify the specific alertUp is triggered + alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id}) + assert.NoError(t, err) + assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered") + + // Verify we have two unresolved alert history records + alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""}) + assert.NoError(t, err) + assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records") + + err = alerts.ResolveStatusAlerts(hub) + assert.NoError(t, err) + + // Verify alertUp is not triggered after resolving + alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id}) + assert.NoError(t, err) + assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving") + // Verify alertDown is still triggered + alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id}) + assert.NoError(t, err) + assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving") + + // Verify we have one unresolved alert history record + alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""}) + assert.NoError(t, err) + assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record") + +} diff --git a/internal/alerts/alerts_test_helpers.go b/internal/alerts/alerts_test_helpers.go index 519efa36..6161f570 100644 --- a/internal/alerts/alerts_test_helpers.go +++ b/internal/alerts/alerts_test_helpers.go @@ -1,3 +1,6 @@ +//go:build testing +// +build testing + package alerts import ( @@ -53,3 +56,7 @@ func (am *AlertManager) ForceExpirePendingAlerts() { return true }) } + +func ResolveStatusAlerts(app core.App) error { + return resolveStatusAlerts(app) +} diff --git a/internal/records/records_test.go b/internal/records/records_test.go index b4056a30..a2cd2de5 100644 --- a/internal/records/records_test.go +++ b/internal/records/records_test.go @@ -175,7 +175,7 @@ func TestDeleteOldSystemStats(t *testing.T) { } // Run deletion - err = records.TestDeleteOldSystemStats(hub) + err = records.DeleteOldSystemStats(hub) require.NoError(t, err) // Verify results @@ -268,7 +268,7 @@ func TestDeleteOldAlertsHistory(t *testing.T) { assert.Equal(t, int64(tc.alertCount), countBefore, "Initial count should match") // Run deletion - err = records.TestDeleteOldAlertsHistory(hub, tc.countToKeep, tc.countBeforeDeletion) + err = records.DeleteOldAlertsHistory(hub, tc.countToKeep, tc.countBeforeDeletion) require.NoError(t, err) // Count after deletion @@ -332,7 +332,7 @@ func TestDeleteOldAlertsHistoryEdgeCases(t *testing.T) { } // Should not error and should not delete anything - err = records.TestDeleteOldAlertsHistory(hub, 10, 20) + err = records.DeleteOldAlertsHistory(hub, 10, 20) require.NoError(t, err) count, err := hub.CountRecords("alerts_history") @@ -346,7 +346,7 @@ func TestDeleteOldAlertsHistoryEdgeCases(t *testing.T) { require.NoError(t, err) // Should not error with empty table - err = records.TestDeleteOldAlertsHistory(hub, 10, 20) + err = records.DeleteOldAlertsHistory(hub, 10, 20) require.NoError(t, err) }) } @@ -376,7 +376,7 @@ func TestTwoDecimals(t *testing.T) { } for _, tc := range testCases { - result := records.TestTwoDecimals(tc.input) + result := records.TwoDecimals(tc.input) assert.InDelta(t, tc.expected, result, 0.02, "twoDecimals(%f) should equal %f", tc.input, tc.expected) } } diff --git a/internal/records/records_test_helpers.go b/internal/records/records_test_helpers.go index f4909b60..dfc3bffe 100644 --- a/internal/records/records_test_helpers.go +++ b/internal/records/records_test_helpers.go @@ -7,17 +7,17 @@ import ( "github.com/pocketbase/pocketbase/core" ) -// TestDeleteOldSystemStats exposes deleteOldSystemStats for testing -func TestDeleteOldSystemStats(app core.App) error { +// DeleteOldSystemStats exposes deleteOldSystemStats for testing +func DeleteOldSystemStats(app core.App) error { return deleteOldSystemStats(app) } -// TestDeleteOldAlertsHistory exposes deleteOldAlertsHistory for testing -func TestDeleteOldAlertsHistory(app core.App, countToKeep, countBeforeDeletion int) error { +// DeleteOldAlertsHistory exposes deleteOldAlertsHistory for testing +func DeleteOldAlertsHistory(app core.App, countToKeep, countBeforeDeletion int) error { return deleteOldAlertsHistory(app, countToKeep, countBeforeDeletion) } -// TestTwoDecimals exposes twoDecimals for testing -func TestTwoDecimals(value float64) float64 { +// TwoDecimals exposes twoDecimals for testing +func TwoDecimals(value float64) float64 { return twoDecimals(value) } diff --git a/internal/tests/hub.go b/internal/tests/hub.go index a4b8c17e..e1f4d0a6 100644 --- a/internal/tests/hub.go +++ b/internal/tests/hub.go @@ -125,3 +125,28 @@ func CreateSystems(app core.App, count int, userId string, status string) ([]*co } return systems, nil } + +// GetHubWithUser creates a test hub with a test user and user settings +func GetHubWithUser(t *testing.T) (*TestHub, *core.Record) { + hub, err := NewTestHub(t.TempDir()) + assert.NoError(t, err) + hub.StartHub() + + // Manually initialize the system manager to bind event hooks + err = hub.GetSystemManager().Initialize() + assert.NoError(t, err) + + // Create a test user + user, err := CreateUser(hub, "test@example.com", "password") + assert.NoError(t, err) + + // Create user settings for the test user (required for alert notifications) + userSettingsData := map[string]any{ + "user": user.Id, + "settings": `{"emails":[test@example.com],"webhooks":[]}`, + } + _, err = CreateRecord(hub, "user_settings", userSettingsData) + assert.NoError(t, err) + + return hub, user +}