refactor: simplify/improve status alert handling (#1519)

also adds new functionality to restore any pending down alerts
that were lost when the hub restarted before the alert was created
This commit is contained in:
henrygd
2026-03-12 15:53:40 -04:00
parent 0d3dfcb207
commit 8e2316f845
7 changed files with 779 additions and 270 deletions

View File

@@ -9,63 +9,25 @@ import (
"github.com/pocketbase/pocketbase/core"
)
// alertTask is a message sent to the alert worker queue, asking it to
// schedule or cancel a pending status alert.
type alertTask struct {
	action      string // "schedule" or "cancel"
	systemName  string // display name of the system the alert concerns
	alertRecord *core.Record // alert record to act on
	delay       time.Duration // wait before firing; only used by "schedule" tasks
}
// alertInfo tracks one pending "down" status alert in memory until it
// either fires or is cancelled.
type alertInfo struct {
	systemName  string // display name of the system that went down
	alertRecord *core.Record // underlying alert record
	expireTime  time.Time // when the downtime delay elapses and the alert should fire
	timer       *time.Timer // fires the alert after the delay; nil until scheduled
}
// startWorker is a long-running goroutine that processes alert tasks
// every x seconds. It must be running to process status alerts.
func (am *AlertManager) startWorker() {
processPendingAlerts := time.Tick(15 * time.Second)
// check for status alerts that are not resolved when system comes up
// (can be removed if we figure out core bug in #1052)
checkStatusAlerts := time.Tick(561 * time.Second)
for {
select {
case <-am.stopChan:
return
case task := <-am.alertQueue:
switch task.action {
case "schedule":
am.pendingAlerts.Store(task.alertRecord.Id, &alertInfo{
systemName: task.systemName,
alertRecord: task.alertRecord,
expireTime: time.Now().Add(task.delay),
})
case "cancel":
am.pendingAlerts.Delete(task.alertRecord.Id)
// Stop cancels all pending status alert timers.
func (am *AlertManager) Stop() {
am.stopOnce.Do(func() {
am.pendingAlerts.Range(func(key, value any) bool {
info := value.(*alertInfo)
if info.timer != nil {
info.timer.Stop()
}
case <-checkStatusAlerts:
resolveStatusAlerts(am.hub)
case <-processPendingAlerts:
// Check for expired alerts every tick
now := time.Now()
for key, value := range am.pendingAlerts.Range {
info := value.(*alertInfo)
if now.After(info.expireTime) {
// Downtime delay has passed, process alert
am.sendStatusAlert("down", info.systemName, info.alertRecord)
am.pendingAlerts.Delete(key)
}
}
}
}
}
// StopWorker shuts down the AlertManager.worker goroutine
func (am *AlertManager) StopWorker() {
close(am.stopChan)
am.pendingAlerts.Delete(key)
return true
})
})
}
// HandleStatusAlerts manages the logic when system status changes.
@@ -103,38 +65,43 @@ func (am *AlertManager) getSystemStatusAlerts(systemID string) ([]*core.Record,
return alertRecords, nil
}
// Schedules delayed "down" alerts for each alert record.
// handleSystemDown manages the logic when a system status changes to "down". It schedules pending alerts for each alert record.
func (am *AlertManager) handleSystemDown(systemName string, alertRecords []*core.Record) {
	for _, alertRecord := range alertRecords {
		// Continue if alert is already scheduled
		if _, exists := am.pendingAlerts.Load(alertRecord.Id); exists {
			continue
		}
		// Schedule by adding to queue
		// "min" is the user-configured number of downtime minutes before
		// alerting; floored at 1 so a zero/unset value still delays one minute.
		min := max(1, alertRecord.GetInt("min"))
		// NOTE(review): both the queue-based scheduling below and the direct
		// schedulePendingStatusAlert call appear in this span — it looks like
		// the old and new code paths are overlaid (diff rendering artifact).
		// Confirm only one scheduling mechanism should remain.
		am.alertQueue <- alertTask{
			action:      "schedule",
			systemName:  systemName,
			alertRecord: alertRecord,
			delay:       time.Duration(min) * time.Minute,
		}
		am.schedulePendingStatusAlert(systemName, alertRecord, time.Duration(min)*time.Minute)
	}
}
// schedulePendingStatusAlert sets up a timer to send a "down" alert after the
// specified delay if the system is still down.
// It returns true if the alert was scheduled, or false if an alert was already
// pending for the given alert record.
func (am *AlertManager) schedulePendingStatusAlert(systemName string, alertRecord *core.Record, delay time.Duration) bool {
	alert := &alertInfo{
		systemName:  systemName,
		alertRecord: alertRecord,
		expireTime:  time.Now().Add(delay),
	}
	// LoadOrStore is the atomic check-and-insert; if another caller already
	// stored an entry for this record, leave it untouched.
	if _, loaded := am.pendingAlerts.LoadOrStore(alertRecord.Id, alert); loaded {
		return false
	}
	// We just stored `alert`, so set the timer on it directly — no need to
	// re-assert the value returned by LoadOrStore, and the remaining delay
	// is exactly the delay we were given.
	alert.timer = time.AfterFunc(delay, func() {
		am.processPendingAlert(alertRecord.Id)
	})
	return true
}
// handleSystemUp manages the logic when a system status changes to "up".
// It cancels any pending alerts and sends "up" alerts.
func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.Record) {
for _, alertRecord := range alertRecords {
alertRecordID := alertRecord.Id
// If alert exists for record, delete and continue (down alert not sent)
if _, exists := am.pendingAlerts.Load(alertRecordID); exists {
am.alertQueue <- alertTask{
action: "cancel",
alertRecord: alertRecord,
}
if am.cancelPendingAlert(alertRecord.Id) {
continue
}
// No alert scheduled for this record, send "up" alert only if "down" was triggered
if !alertRecord.GetBool("triggered") {
continue
}
@@ -144,6 +111,36 @@ func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.R
}
}
// cancelPendingAlert stops the timer and removes the pending alert for the
// given alert ID. It reports whether a pending alert was found and cancelled.
func (am *AlertManager) cancelPendingAlert(alertID string) bool {
	value, found := am.pendingAlerts.LoadAndDelete(alertID)
	if !found {
		// Nothing was pending for this record.
		return false
	}
	// Stop the timer if one was attached; it may still be nil if the
	// scheduling goroutine hadn't assigned it yet.
	if info := value.(*alertInfo); info.timer != nil {
		info.timer.Stop()
	}
	return true
}
// processPendingAlert sends a "down" alert if the pending alert has expired
// and the system is still down.
func (am *AlertManager) processPendingAlert(alertID string) {
	value, found := am.pendingAlerts.LoadAndDelete(alertID)
	if !found {
		// Already cancelled (system came back up) or already processed.
		return
	}
	info := value.(*alertInfo)
	// Skip if a "down" alert has already been triggered for this record.
	if info.alertRecord.GetBool("triggered") {
		return
	}
	err := am.sendStatusAlert("down", info.systemName, info.alertRecord)
	if err != nil {
		am.hub.Logger().Error("Failed to send alert", "err", err)
	}
}
// sendStatusAlert sends a status alert ("up" or "down") to the users associated with the alert records.
func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, alertRecord *core.Record) error {
switch alertStatus {
@@ -177,8 +174,8 @@ func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, a
})
}
// resolveStatusAlerts resolves any status alerts that weren't resolved
// when system came up (https://github.com/henrygd/beszel/issues/1052)
// resolveStatusAlerts resolves any triggered status alerts that weren't resolved
// when system came up (https://github.com/henrygd/beszel/issues/1052).
func resolveStatusAlerts(app core.App) error {
db := app.DB()
// Find all active status alerts where the system is actually up
@@ -208,3 +205,36 @@ func resolveStatusAlerts(app core.App) error {
}
return nil
}
// restorePendingStatusAlerts re-queues untriggered status alerts for systems
// that are still down after a hub restart, rebuilding the in-memory timer
// state that was lost when the process exited.
func (am *AlertManager) restorePendingStatusAlerts() error {
	type pendingStatusAlert struct {
		AlertID    string `db:"alert_id"`
		SystemName string `db:"system_name"`
	}
	var rows []pendingStatusAlert
	err := am.hub.DB().NewQuery(`
		SELECT a.id AS alert_id, s.name AS system_name
		FROM alerts a
		JOIN systems s ON a.system = s.id
		WHERE a.name = 'Status'
		AND a.triggered = false
		AND s.status = 'down'
	`).All(&rows)
	if err != nil {
		return err
	}
	for _, row := range rows {
		record, err := am.hub.FindRecordById("alerts", row.AlertID)
		if err != nil {
			return err
		}
		// Delay is the alert's configured "min" minutes, floored at one minute.
		minutes := max(1, record.GetInt("min"))
		am.schedulePendingStatusAlert(row.SystemName, record, time.Duration(minutes)*time.Minute)
	}
	return nil
}