refactor: simplify/improve status alert handling (#1519)

also adds new functionality to restore any pending down alerts
that were lost when the hub restarted before the alert was created
This commit is contained in:
henrygd
2026-03-12 15:53:40 -04:00
parent 0d3dfcb207
commit 8e2316f845
7 changed files with 779 additions and 270 deletions

View File

@@ -9,63 +9,25 @@ import (
"github.com/pocketbase/pocketbase/core"
)
// alertTask is a message sent to the alert worker queue, asking it to
// schedule or cancel a pending status alert.
type alertTask struct {
	action      string // "schedule" or "cancel"
	systemName  string // display name of the system the alert concerns
	alertRecord *core.Record // alert record to act on
	delay       time.Duration // wait before firing; only used by "schedule" tasks
}
// alertInfo tracks one pending "down" status alert in memory until it
// either fires or is cancelled.
type alertInfo struct {
	systemName  string // display name of the system that went down
	alertRecord *core.Record // underlying alert record
	expireTime  time.Time // when the downtime delay elapses and the alert should fire
	timer       *time.Timer // fires the alert after the delay; nil until scheduled
}
// startWorker is a long-running goroutine that processes alert tasks
// every x seconds. It must be running to process status alerts.
func (am *AlertManager) startWorker() {
processPendingAlerts := time.Tick(15 * time.Second)
// check for status alerts that are not resolved when system comes up
// (can be removed if we figure out core bug in #1052)
checkStatusAlerts := time.Tick(561 * time.Second)
for {
select {
case <-am.stopChan:
return
case task := <-am.alertQueue:
switch task.action {
case "schedule":
am.pendingAlerts.Store(task.alertRecord.Id, &alertInfo{
systemName: task.systemName,
alertRecord: task.alertRecord,
expireTime: time.Now().Add(task.delay),
})
case "cancel":
am.pendingAlerts.Delete(task.alertRecord.Id)
// Stop cancels all pending status alert timers.
func (am *AlertManager) Stop() {
am.stopOnce.Do(func() {
am.pendingAlerts.Range(func(key, value any) bool {
info := value.(*alertInfo)
if info.timer != nil {
info.timer.Stop()
}
case <-checkStatusAlerts:
resolveStatusAlerts(am.hub)
case <-processPendingAlerts:
// Check for expired alerts every tick
now := time.Now()
for key, value := range am.pendingAlerts.Range {
info := value.(*alertInfo)
if now.After(info.expireTime) {
// Downtime delay has passed, process alert
am.sendStatusAlert("down", info.systemName, info.alertRecord)
am.pendingAlerts.Delete(key)
}
}
}
}
}
// StopWorker shuts down the AlertManager.worker goroutine
func (am *AlertManager) StopWorker() {
close(am.stopChan)
am.pendingAlerts.Delete(key)
return true
})
})
}
// HandleStatusAlerts manages the logic when system status changes.
@@ -103,38 +65,43 @@ func (am *AlertManager) getSystemStatusAlerts(systemID string) ([]*core.Record,
return alertRecords, nil
}
// Schedules delayed "down" alerts for each alert record.
// handleSystemDown manages the logic when a system status changes to "down". It schedules pending alerts for each alert record.
func (am *AlertManager) handleSystemDown(systemName string, alertRecords []*core.Record) {
	for _, alertRecord := range alertRecords {
		// Continue if alert is already scheduled
		if _, exists := am.pendingAlerts.Load(alertRecord.Id); exists {
			continue
		}
		// Schedule by adding to queue
		// "min" is the user-configured number of downtime minutes before
		// alerting; floored at 1 so a zero/unset value still delays one minute.
		min := max(1, alertRecord.GetInt("min"))
		// NOTE(review): both the queue-based scheduling below and the direct
		// schedulePendingStatusAlert call appear in this span — it looks like
		// the old and new code paths are overlaid (diff rendering artifact).
		// Confirm only one scheduling mechanism should remain.
		am.alertQueue <- alertTask{
			action:      "schedule",
			systemName:  systemName,
			alertRecord: alertRecord,
			delay:       time.Duration(min) * time.Minute,
		}
		am.schedulePendingStatusAlert(systemName, alertRecord, time.Duration(min)*time.Minute)
	}
}
// schedulePendingStatusAlert sets up a timer to send a "down" alert after the
// specified delay if the system is still down.
// It returns true if the alert was scheduled, or false if an alert was already
// pending for the given alert record.
func (am *AlertManager) schedulePendingStatusAlert(systemName string, alertRecord *core.Record, delay time.Duration) bool {
	alert := &alertInfo{
		systemName:  systemName,
		alertRecord: alertRecord,
		expireTime:  time.Now().Add(delay),
	}
	// LoadOrStore is the atomic check-and-insert; if another caller already
	// stored an entry for this record, leave it untouched.
	if _, loaded := am.pendingAlerts.LoadOrStore(alertRecord.Id, alert); loaded {
		return false
	}
	// We just stored `alert`, so set the timer on it directly — no need to
	// re-assert the value returned by LoadOrStore, and the remaining delay
	// is exactly the delay we were given.
	alert.timer = time.AfterFunc(delay, func() {
		am.processPendingAlert(alertRecord.Id)
	})
	return true
}
// handleSystemUp manages the logic when a system status changes to "up".
// It cancels any pending alerts and sends "up" alerts.
func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.Record) {
for _, alertRecord := range alertRecords {
alertRecordID := alertRecord.Id
// If alert exists for record, delete and continue (down alert not sent)
if _, exists := am.pendingAlerts.Load(alertRecordID); exists {
am.alertQueue <- alertTask{
action: "cancel",
alertRecord: alertRecord,
}
if am.cancelPendingAlert(alertRecord.Id) {
continue
}
// No alert scheduled for this record, send "up" alert only if "down" was triggered
if !alertRecord.GetBool("triggered") {
continue
}
@@ -144,6 +111,36 @@ func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.R
}
}
// cancelPendingAlert stops the timer and removes the pending alert for the
// given alert ID. It reports whether a pending alert was found and cancelled.
func (am *AlertManager) cancelPendingAlert(alertID string) bool {
	value, found := am.pendingAlerts.LoadAndDelete(alertID)
	if !found {
		// Nothing was pending for this record.
		return false
	}
	// Stop the timer if one was attached; it may still be nil if the
	// scheduling goroutine hadn't assigned it yet.
	if info := value.(*alertInfo); info.timer != nil {
		info.timer.Stop()
	}
	return true
}
// processPendingAlert sends a "down" alert if the pending alert has expired
// and the system is still down.
func (am *AlertManager) processPendingAlert(alertID string) {
	value, found := am.pendingAlerts.LoadAndDelete(alertID)
	if !found {
		// Already cancelled (system came back up) or already processed.
		return
	}
	info := value.(*alertInfo)
	// Skip if a "down" alert has already been triggered for this record.
	if info.alertRecord.GetBool("triggered") {
		return
	}
	err := am.sendStatusAlert("down", info.systemName, info.alertRecord)
	if err != nil {
		am.hub.Logger().Error("Failed to send alert", "err", err)
	}
}
// sendStatusAlert sends a status alert ("up" or "down") to the users associated with the alert records.
func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, alertRecord *core.Record) error {
switch alertStatus {
@@ -177,8 +174,8 @@ func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, a
})
}
// resolveStatusAlerts resolves any status alerts that weren't resolved
// when system came up (https://github.com/henrygd/beszel/issues/1052)
// resolveStatusAlerts resolves any triggered status alerts that weren't resolved
// when system came up (https://github.com/henrygd/beszel/issues/1052).
func resolveStatusAlerts(app core.App) error {
db := app.DB()
// Find all active status alerts where the system is actually up
@@ -208,3 +205,36 @@ func resolveStatusAlerts(app core.App) error {
}
return nil
}
// restorePendingStatusAlerts re-queues untriggered status alerts for systems
// that are still down after a hub restart, rebuilding the in-memory timer
// state that was lost when the process exited.
func (am *AlertManager) restorePendingStatusAlerts() error {
	type pendingStatusAlert struct {
		AlertID    string `db:"alert_id"`
		SystemName string `db:"system_name"`
	}
	var rows []pendingStatusAlert
	err := am.hub.DB().NewQuery(`
		SELECT a.id AS alert_id, s.name AS system_name
		FROM alerts a
		JOIN systems s ON a.system = s.id
		WHERE a.name = 'Status'
		AND a.triggered = false
		AND s.status = 'down'
	`).All(&rows)
	if err != nil {
		return err
	}
	for _, row := range rows {
		record, err := am.hub.FindRecordById("alerts", row.AlertID)
		if err != nil {
			return err
		}
		// Delay is the alert's configured "min" minutes, floored at one minute.
		minutes := max(1, record.GetInt("min"))
		am.schedulePendingStatusAlert(row.SystemName, record, time.Duration(minutes)*time.Minute)
	}
	return nil
}