fix: Use historical records to average disk usage for extra disk alerts (#1801)

- Introduced a new test file `alerts_disk_test.go` to validate the behavior of disk alerts using historical data for extra filesystems.
- Enhanced the `HandleSystemAlerts` function to correctly calculate disk usage for extra filesystems based on historical records.
- Updated the `SystemAlertStats` struct to include `ExtraFs` for tracking additional filesystem statistics.
This commit is contained in:
Victor Eduardo
2026-03-09 19:32:35 -03:00
committed by GitHub
parent 963fce5a33
commit ec7ad632a9
3 changed files with 178 additions and 15 deletions

View File

@@ -40,6 +40,11 @@ type UserNotificationSettings struct {
Webhooks []string `json:"webhooks"` Webhooks []string `json:"webhooks"`
} }
type SystemAlertFsStats struct {
DiskTotal float64 `json:"d"`
DiskUsed float64 `json:"du"`
}
type SystemAlertStats struct { type SystemAlertStats struct {
Cpu float64 `json:"cpu"` Cpu float64 `json:"cpu"`
Mem float64 `json:"mp"` Mem float64 `json:"mp"`
@@ -50,6 +55,7 @@ type SystemAlertStats struct {
Temperatures map[string]float32 `json:"t"` Temperatures map[string]float32 `json:"t"`
LoadAvg [3]float64 `json:"la"` LoadAvg [3]float64 `json:"la"`
Battery [2]uint8 `json:"bat"` Battery [2]uint8 `json:"bat"`
ExtraFs map[string]SystemAlertFsStats `json:"efs"`
} }
type SystemAlertGPUData struct { type SystemAlertGPUData struct {

View File

@@ -0,0 +1,155 @@
//go:build testing
package alerts_test
import (
"encoding/json"
"testing"
"time"
"github.com/henrygd/beszel/internal/entities/system"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/tools/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestDiskAlertExtraFsMultiMinute tests that multi-minute disk alerts correctly use
// historical per-minute values for extra (non-root) filesystems, not the current live snapshot.
func TestDiskAlertExtraFsMultiMinute(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
systemRecord := systems[0]
// Disk alert: threshold 80%, min=2 (requires historical averaging)
diskAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Disk",
"system": systemRecord.Id,
"user": user.Id,
"value": 80, // threshold: 80%
"min": 2, // 2 minutes - requires historical averaging
})
require.NoError(t, err)
assert.False(t, diskAlert.GetBool("triggered"), "Alert should not be triggered initially")
am := hub.GetAlertManager()
now := time.Now().UTC()
extraFsHigh := map[string]*system.FsStats{
"/mnt/data": {DiskTotal: 1000, DiskUsed: 920}, // 92% - above threshold
}
// Insert 4 historical records spread over 3 minutes (same pattern as battery tests).
// The oldest record must predate (now - 2min) so the alert time window is valid.
recordTimes := []time.Duration{
-180 * time.Second, // 3 min ago - anchors oldest record before alert.time
-90 * time.Second,
-60 * time.Second,
-30 * time.Second,
}
for _, offset := range recordTimes {
stats := system.Stats{
DiskPct: 30, // root disk at 30% - below threshold
ExtraFs: extraFsHigh,
}
statsJSON, _ := json.Marshal(stats)
recordTime := now.Add(offset)
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
"system": systemRecord.Id,
"type": "1m",
"stats": string(statsJSON),
})
require.NoError(t, err)
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
err = hub.SaveNoValidate(record)
require.NoError(t, err)
}
combinedDataHigh := &system.CombinedData{
Stats: system.Stats{
DiskPct: 30,
ExtraFs: extraFsHigh,
},
Info: system.Info{
DiskPct: 30,
},
}
systemRecord.Set("updated", now)
err = hub.SaveNoValidate(systemRecord)
require.NoError(t, err)
err = am.HandleSystemAlerts(systemRecord, combinedDataHigh)
require.NoError(t, err)
time.Sleep(20 * time.Millisecond)
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
require.NoError(t, err)
assert.True(t, diskAlert.GetBool("triggered"),
"Alert SHOULD be triggered when extra disk average (92%%) exceeds threshold (80%%)")
// --- Resolution: extra disk drops to 50%, alert should resolve ---
extraFsLow := map[string]*system.FsStats{
"/mnt/data": {DiskTotal: 1000, DiskUsed: 500}, // 50% - below threshold
}
newNow := now.Add(2 * time.Minute)
recordTimesLow := []time.Duration{
-180 * time.Second,
-90 * time.Second,
-60 * time.Second,
-30 * time.Second,
}
for _, offset := range recordTimesLow {
stats := system.Stats{
DiskPct: 30,
ExtraFs: extraFsLow,
}
statsJSON, _ := json.Marshal(stats)
recordTime := newNow.Add(offset)
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
"system": systemRecord.Id,
"type": "1m",
"stats": string(statsJSON),
})
require.NoError(t, err)
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
err = hub.SaveNoValidate(record)
require.NoError(t, err)
}
combinedDataLow := &system.CombinedData{
Stats: system.Stats{
DiskPct: 30,
ExtraFs: extraFsLow,
},
Info: system.Info{
DiskPct: 30,
},
}
systemRecord.Set("updated", newNow)
err = hub.SaveNoValidate(systemRecord)
require.NoError(t, err)
err = am.HandleSystemAlerts(systemRecord, combinedDataLow)
require.NoError(t, err)
time.Sleep(20 * time.Millisecond)
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
require.NoError(t, err)
assert.False(t, diskAlert.GetBool("triggered"),
"Alert should be resolved when extra disk average (50%%) drops below threshold (80%%)")
}

View File

@@ -195,20 +195,22 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
alert.val += stats.NetSent + stats.NetRecv alert.val += stats.NetSent + stats.NetRecv
case "Disk": case "Disk":
if alert.mapSums == nil { if alert.mapSums == nil {
alert.mapSums = make(map[string]float32, len(data.Stats.ExtraFs)+1) alert.mapSums = make(map[string]float32, len(stats.ExtraFs)+1)
} }
// add root disk // add root disk
if _, ok := alert.mapSums["root"]; !ok { if _, ok := alert.mapSums["root"]; !ok {
alert.mapSums["root"] = 0.0 alert.mapSums["root"] = 0.0
} }
alert.mapSums["root"] += float32(stats.Disk) alert.mapSums["root"] += float32(stats.Disk)
// add extra disks // add extra disks from historical record
for key, fs := range data.Stats.ExtraFs { for key, fs := range stats.ExtraFs {
if fs.DiskTotal > 0 {
if _, ok := alert.mapSums[key]; !ok { if _, ok := alert.mapSums[key]; !ok {
alert.mapSums[key] = 0.0 alert.mapSums[key] = 0.0
} }
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100) alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
} }
}
case "Temperature": case "Temperature":
if alert.mapSums == nil { if alert.mapSums == nil {
alert.mapSums = make(map[string]float32, len(stats.Temperatures)) alert.mapSums = make(map[string]float32, len(stats.Temperatures))