From ec7ad632a98406456975a42b1be35e5e757b1e57 Mon Sep 17 00:00:00 2001 From: Victor Eduardo Date: Mon, 9 Mar 2026 19:32:35 -0300 Subject: [PATCH] fix: Use historical records to average disk usage for extra disk alerts (#1801) - Introduced a new test file `alerts_disk_test.go` to validate the behavior of disk alerts using historical data for extra filesystems. - Enhanced the `HandleSystemAlerts` function to correctly calculate disk usage for extra filesystems based on historical records. - Updated the `SystemAlertStats` struct to include `ExtraFs` for tracking additional filesystem statistics. --- internal/alerts/alerts.go | 24 +++-- internal/alerts/alerts_disk_test.go | 155 ++++++++++++++++++++++++++++ internal/alerts/alerts_system.go | 14 +-- 3 files changed, 178 insertions(+), 15 deletions(-) create mode 100644 internal/alerts/alerts_disk_test.go diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index 3e33bd70..da3ad784 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -40,16 +40,22 @@ type UserNotificationSettings struct { Webhooks []string `json:"webhooks"` } +type SystemAlertFsStats struct { + DiskTotal float64 `json:"d"` + DiskUsed float64 `json:"du"` +} + type SystemAlertStats struct { - Cpu float64 `json:"cpu"` - Mem float64 `json:"mp"` - Disk float64 `json:"dp"` - NetSent float64 `json:"ns"` - NetRecv float64 `json:"nr"` - GPU map[string]SystemAlertGPUData `json:"g"` - Temperatures map[string]float32 `json:"t"` - LoadAvg [3]float64 `json:"la"` - Battery [2]uint8 `json:"bat"` + Cpu float64 `json:"cpu"` + Mem float64 `json:"mp"` + Disk float64 `json:"dp"` + NetSent float64 `json:"ns"` + NetRecv float64 `json:"nr"` + GPU map[string]SystemAlertGPUData `json:"g"` + Temperatures map[string]float32 `json:"t"` + LoadAvg [3]float64 `json:"la"` + Battery [2]uint8 `json:"bat"` + ExtraFs map[string]SystemAlertFsStats `json:"efs"` } type SystemAlertGPUData struct { diff --git a/internal/alerts/alerts_disk_test.go b/internal/alerts/alerts_disk_test.go new file mode 100644 index 00000000..72f094d0 --- /dev/null +++ b/internal/alerts/alerts_disk_test.go @@ -0,0 +1,155 @@ +//go:build testing + +package alerts_test + +import ( + "encoding/json" + "testing" + "time" + + "github.com/henrygd/beszel/internal/entities/system" + beszelTests "github.com/henrygd/beszel/internal/tests" + + "github.com/pocketbase/dbx" + "github.com/pocketbase/pocketbase/tools/types" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestDiskAlertExtraFsMultiMinute tests that multi-minute disk alerts correctly use +// historical per-minute values for extra (non-root) filesystems, not the current live snapshot. +func TestDiskAlertExtraFsMultiMinute(t *testing.T) { + hub, user := beszelTests.GetHubWithUser(t) + defer hub.Cleanup() + + systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up") + require.NoError(t, err) + systemRecord := systems[0] + + // Disk alert: threshold 80%, min=2 (requires historical averaging) + diskAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{ + "name": "Disk", + "system": systemRecord.Id, + "user": user.Id, + "value": 80, // threshold: 80% + "min": 2, // 2 minutes - requires historical averaging + }) + require.NoError(t, err) + assert.False(t, diskAlert.GetBool("triggered"), "Alert should not be triggered initially") + + am := hub.GetAlertManager() + now := time.Now().UTC() + + extraFsHigh := map[string]*system.FsStats{ + "/mnt/data": {DiskTotal: 1000, DiskUsed: 920}, // 92% - above threshold + } + + // Insert 4 historical records spread over 3 minutes (same pattern as battery tests). + // The oldest record must predate (now - 2min) so the alert time window is valid. + recordTimes := []time.Duration{ + -180 * time.Second, // 3 min ago - anchors oldest record before alert.time + -90 * time.Second, + -60 * time.Second, + -30 * time.Second, + } + + for _, offset := range recordTimes { + stats := system.Stats{ + DiskPct: 30, // root disk at 30% - below threshold + ExtraFs: extraFsHigh, + } + statsJSON, _ := json.Marshal(stats) + + recordTime := now.Add(offset) + record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{ + "system": systemRecord.Id, + "type": "1m", + "stats": string(statsJSON), + }) + require.NoError(t, err) + record.SetRaw("created", recordTime.Format(types.DefaultDateLayout)) + err = hub.SaveNoValidate(record) + require.NoError(t, err) + } + + combinedDataHigh := &system.CombinedData{ + Stats: system.Stats{ + DiskPct: 30, + ExtraFs: extraFsHigh, + }, + Info: system.Info{ + DiskPct: 30, + }, + } + + systemRecord.Set("updated", now) + err = hub.SaveNoValidate(systemRecord) + require.NoError(t, err) + + err = am.HandleSystemAlerts(systemRecord, combinedDataHigh) + require.NoError(t, err) + + time.Sleep(20 * time.Millisecond) + + diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id}) + require.NoError(t, err) + assert.True(t, diskAlert.GetBool("triggered"), + "Alert SHOULD be triggered when extra disk average (92%%) exceeds threshold (80%%)") + + // --- Resolution: extra disk drops to 50%, alert should resolve --- + + extraFsLow := map[string]*system.FsStats{ + "/mnt/data": {DiskTotal: 1000, DiskUsed: 500}, // 50% - below threshold + } + + newNow := now.Add(2 * time.Minute) + recordTimesLow := []time.Duration{ + -180 * time.Second, + -90 * time.Second, + -60 * time.Second, + -30 * time.Second, + } + + for _, offset := range recordTimesLow { + stats := system.Stats{ + DiskPct: 30, + ExtraFs: extraFsLow, + } + statsJSON, _ := json.Marshal(stats) + + recordTime := newNow.Add(offset) + record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{ + "system": systemRecord.Id, + "type": "1m", + "stats": string(statsJSON), + }) + require.NoError(t, err) + record.SetRaw("created", recordTime.Format(types.DefaultDateLayout)) + err = hub.SaveNoValidate(record) + require.NoError(t, err) + } + + combinedDataLow := &system.CombinedData{ + Stats: system.Stats{ + DiskPct: 30, + ExtraFs: extraFsLow, + }, + Info: system.Info{ + DiskPct: 30, + }, + } + + systemRecord.Set("updated", newNow) + err = hub.SaveNoValidate(systemRecord) + require.NoError(t, err) + + err = am.HandleSystemAlerts(systemRecord, combinedDataLow) + require.NoError(t, err) + + time.Sleep(20 * time.Millisecond) + + diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id}) + require.NoError(t, err) + assert.False(t, diskAlert.GetBool("triggered"), + "Alert should be resolved when extra disk average (50%%) drops below threshold (80%%)") +} diff --git a/internal/alerts/alerts_system.go b/internal/alerts/alerts_system.go index 817222dc..df48a944 100644 --- a/internal/alerts/alerts_system.go +++ b/internal/alerts/alerts_system.go @@ -195,19 +195,21 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst alert.val += stats.NetSent + stats.NetRecv case "Disk": if alert.mapSums == nil { - alert.mapSums = make(map[string]float32, len(data.Stats.ExtraFs)+1) + alert.mapSums = make(map[string]float32, len(stats.ExtraFs)+1) } // add root disk if _, ok := alert.mapSums["root"]; !ok { alert.mapSums["root"] = 0.0 } alert.mapSums["root"] += float32(stats.Disk) - // add extra disks - for key, fs := range data.Stats.ExtraFs { - if _, ok := alert.mapSums[key]; !ok { - alert.mapSums[key] = 0.0 + // add extra disks from historical record + for key, fs := range stats.ExtraFs { + if fs.DiskTotal > 0 { + if _, ok := alert.mapSums[key]; !ok { + alert.mapSums[key] = 0.0 + } + alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100) } - alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100) } case "Temperature": if alert.mapSums == nil {