Compare commits

...

3 Commits

Author SHA1 Message Date
henrygd
bd94a9d142 agent: improve disk discovery / IO mapping and add tests (#1811) 2026-03-13 16:03:27 -04:00
henrygd
8e2316f845 refactor: simplify/improve status alert handling (#1519)
also adds new functionality to restore any pending down alerts
that were lost by hub restart before creation
2026-03-12 15:53:40 -04:00
Sven van Ginkel
0d3dfcb207 fix(hub): check if status alert is triggered before sending up alert (#1806) 2026-03-12 13:38:42 -04:00
9 changed files with 1507 additions and 406 deletions

View File

@@ -14,6 +14,25 @@ import (
"github.com/shirou/gopsutil/v4/disk"
)
// fsRegistrationContext holds the shared lookup state needed to resolve a
// filesystem into the tracked fsStats key and metadata.
type fsRegistrationContext struct {
filesystem string // value of optional FILESYSTEM env var
isWindows bool
efPath string // path to extra filesystems (default "/extra-filesystems")
diskIoCounters map[string]disk.IOCountersStat
}
// diskDiscovery groups the transient state for a single initializeDiskInfo run so
// helper methods can share the same partitions, mount paths, and lookup functions
type diskDiscovery struct {
agent *Agent
rootMountPoint string
partitions []disk.PartitionStat
usageFn func(string) (*disk.UsageStat, error)
ctx fsRegistrationContext
}
// parseFilesystemEntry parses a filesystem entry in the format "device__customname"
// Returns the device/filesystem part and the custom name part
func parseFilesystemEntry(entry string) (device, customName string) {
@@ -27,19 +46,230 @@ func parseFilesystemEntry(entry string) (device, customName string) {
return device, customName
}
// extraFilesystemPartitionInfo derives the I/O device and optional display name
// for a mounted /extra-filesystems partition. Prefer the partition device reported
// by the system and only use the folder name for custom naming metadata.
func extraFilesystemPartitionInfo(p disk.PartitionStat) (device, customName string) {
device = strings.TrimSpace(p.Device)
folderDevice, customName := parseFilesystemEntry(filepath.Base(p.Mountpoint))
if device == "" {
device = folderDevice
}
return device, customName
}
func isDockerSpecialMountpoint(mountpoint string) bool {
switch mountpoint {
case "/etc/hosts", "/etc/resolv.conf", "/etc/hostname":
return true
default:
}
return false
}
// registerFilesystemStats resolves the tracked key and stats payload for a
// filesystem before it is inserted into fsStats.
func registerFilesystemStats(existing map[string]*system.FsStats, device, mountpoint string, root bool, customName string, ctx fsRegistrationContext) (string, *system.FsStats, bool) {
key := device
if !ctx.isWindows {
key = filepath.Base(device)
}
if root {
// Try to map root device to a diskIoCounters entry. First checks for an
// exact key match, then uses findIoDevice for normalized / prefix-based
// matching (e.g. nda0p2 -> nda0), and finally falls back to FILESYSTEM.
if _, ioMatch := ctx.diskIoCounters[key]; !ioMatch {
if matchedKey, match := findIoDevice(key, ctx.diskIoCounters); match {
key = matchedKey
} else if ctx.filesystem != "" {
if matchedKey, match := findIoDevice(ctx.filesystem, ctx.diskIoCounters); match {
key = matchedKey
}
}
if _, ioMatch = ctx.diskIoCounters[key]; !ioMatch {
slog.Warn("Root I/O unmapped; set FILESYSTEM", "device", device, "mountpoint", mountpoint)
}
}
} else {
// Check if non-root has diskstats and prefer the folder device for
// /extra-filesystems mounts when the discovered partition device is a
// mapper path (e.g. luks UUID) that obscures the underlying block device.
if _, ioMatch := ctx.diskIoCounters[key]; !ioMatch {
if strings.HasPrefix(mountpoint, ctx.efPath) {
folderDevice, _ := parseFilesystemEntry(filepath.Base(mountpoint))
if folderDevice != "" {
if matchedKey, match := findIoDevice(folderDevice, ctx.diskIoCounters); match {
key = matchedKey
}
}
}
if _, ioMatch = ctx.diskIoCounters[key]; !ioMatch {
if matchedKey, match := findIoDevice(key, ctx.diskIoCounters); match {
key = matchedKey
}
}
}
}
if _, exists := existing[key]; exists {
return "", nil, false
}
fsStats := &system.FsStats{Root: root, Mountpoint: mountpoint}
if customName != "" {
fsStats.Name = customName
}
return key, fsStats, true
}
// addFsStat inserts a discovered filesystem if it resolves to a new tracking
// key. The key selection itself lives in buildFsStatRegistration so that logic
// can stay directly unit-tested.
func (d *diskDiscovery) addFsStat(device, mountpoint string, root bool, customName string) {
key, fsStats, ok := registerFilesystemStats(d.agent.fsStats, device, mountpoint, root, customName, d.ctx)
if !ok {
return
}
d.agent.fsStats[key] = fsStats
name := key
if customName != "" {
name = customName
}
slog.Info("Detected disk", "name", name, "device", device, "mount", mountpoint, "io", key, "root", root)
}
// addConfiguredRootFs resolves FILESYSTEM against partitions first, then falls
// back to direct diskstats matching for setups like ZFS where partitions do not
// expose the physical device name.
func (d *diskDiscovery) addConfiguredRootFs() bool {
if d.ctx.filesystem == "" {
return false
}
for _, p := range d.partitions {
if filesystemMatchesPartitionSetting(d.ctx.filesystem, p) {
d.addFsStat(p.Device, p.Mountpoint, true, "")
return true
}
}
// FILESYSTEM may name a physical disk absent from partitions (e.g. ZFS lists
// dataset paths like zroot/ROOT/default, not block devices).
if ioKey, match := findIoDevice(d.ctx.filesystem, d.ctx.diskIoCounters); match {
d.agent.fsStats[ioKey] = &system.FsStats{Root: true, Mountpoint: d.rootMountPoint}
return true
}
slog.Warn("Partition details not found", "filesystem", d.ctx.filesystem)
return false
}
func isRootFallbackPartition(p disk.PartitionStat, rootMountPoint string) bool {
return p.Mountpoint == rootMountPoint ||
(isDockerSpecialMountpoint(p.Mountpoint) && strings.HasPrefix(p.Device, "/dev"))
}
// addPartitionRootFs handles the non-configured root fallback path when a
// partition looks like the active root mount but still needs translating to an
// I/O device key.
func (d *diskDiscovery) addPartitionRootFs(device, mountpoint string) bool {
fs, match := findIoDevice(filepath.Base(device), d.ctx.diskIoCounters)
if !match {
return false
}
// The resolved I/O device is already known here, so use it directly to avoid
// a second fallback search inside buildFsStatRegistration.
d.addFsStat(fs, mountpoint, true, "")
return true
}
// addLastResortRootFs is only used when neither FILESYSTEM nor partition-based
// heuristics can identify root, so it picks the busiest I/O device as a final
// fallback and preserves the root mountpoint for usage collection.
func (d *diskDiscovery) addLastResortRootFs() {
rootKey := mostActiveIoDevice(d.ctx.diskIoCounters)
if rootKey != "" {
slog.Warn("Using most active device for root I/O; set FILESYSTEM to override", "device", rootKey)
} else {
rootKey = filepath.Base(d.rootMountPoint)
if _, exists := d.agent.fsStats[rootKey]; exists {
rootKey = "root"
}
slog.Warn("Root I/O device not detected; set FILESYSTEM to override")
}
d.agent.fsStats[rootKey] = &system.FsStats{Root: true, Mountpoint: d.rootMountPoint}
}
// findPartitionByFilesystemSetting matches an EXTRA_FILESYSTEMS entry against a
// discovered partition either by mountpoint or by device suffix.
func findPartitionByFilesystemSetting(filesystem string, partitions []disk.PartitionStat) (disk.PartitionStat, bool) {
for _, p := range partitions {
if strings.HasSuffix(p.Device, filesystem) || p.Mountpoint == filesystem {
return p, true
}
}
return disk.PartitionStat{}, false
}
// addConfiguredExtraFsEntry resolves one EXTRA_FILESYSTEMS entry, preferring a
// discovered partition and falling back to any path that disk.Usage accepts.
func (d *diskDiscovery) addConfiguredExtraFsEntry(filesystem, customName string) {
if p, found := findPartitionByFilesystemSetting(filesystem, d.partitions); found {
d.addFsStat(p.Device, p.Mountpoint, false, customName)
return
}
if _, err := d.usageFn(filesystem); err == nil {
d.addFsStat(filepath.Base(filesystem), filesystem, false, customName)
return
} else {
slog.Error("Invalid filesystem", "name", filesystem, "err", err)
}
}
// addConfiguredExtraFilesystems parses and registers the comma-separated
// EXTRA_FILESYSTEMS env var entries.
func (d *diskDiscovery) addConfiguredExtraFilesystems(extraFilesystems string) {
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
filesystem, customName := parseFilesystemEntry(fsEntry)
d.addConfiguredExtraFsEntry(filesystem, customName)
}
}
// addPartitionExtraFs registers partitions mounted under /extra-filesystems so
// their display names can come from the folder name while their I/O keys still
// prefer the underlying partition device.
func (d *diskDiscovery) addPartitionExtraFs(p disk.PartitionStat) {
if !strings.HasPrefix(p.Mountpoint, d.ctx.efPath) {
return
}
device, customName := extraFilesystemPartitionInfo(p)
d.addFsStat(device, p.Mountpoint, false, customName)
}
// addExtraFilesystemFolders handles bare directories under /extra-filesystems
// that may not appear in partition discovery, while skipping mountpoints that
// were already registered from higher-fidelity sources.
func (d *diskDiscovery) addExtraFilesystemFolders(folderNames []string) {
existingMountpoints := make(map[string]bool, len(d.agent.fsStats))
for _, stats := range d.agent.fsStats {
existingMountpoints[stats.Mountpoint] = true
}
for _, folderName := range folderNames {
mountpoint := filepath.Join(d.ctx.efPath, folderName)
slog.Debug("/extra-filesystems", "mountpoint", mountpoint)
if existingMountpoints[mountpoint] {
continue
}
device, customName := parseFilesystemEntry(folderName)
d.addFsStat(device, mountpoint, false, customName)
}
}
// Sets up the filesystems to monitor for disk usage and I/O.
func (a *Agent) initializeDiskInfo() {
filesystem, _ := utils.GetEnv("FILESYSTEM")
efPath := "/extra-filesystems"
hasRoot := false
isWindows := runtime.GOOS == "windows"
@@ -56,167 +286,57 @@ func (a *Agent) initializeDiskInfo() {
}
}
// ioContext := context.WithValue(a.sensorsContext,
// common.EnvKey, common.EnvMap{common.HostProcEnvKey: "/tmp/testproc"},
// )
// diskIoCounters, err := disk.IOCountersWithContext(ioContext)
diskIoCounters, err := disk.IOCounters()
if err != nil {
slog.Error("Error getting diskstats", "err", err)
}
slog.Debug("Disk I/O", "diskstats", diskIoCounters)
// Helper function to add a filesystem to fsStats if it doesn't exist
addFsStat := func(device, mountpoint string, root bool, customName ...string) {
var key string
if isWindows {
key = device
} else {
key = filepath.Base(device)
}
var ioMatch bool
if _, exists := a.fsStats[key]; !exists {
if root {
slog.Info("Detected root device", "name", key)
// Try to map root device to a diskIoCounters entry. First
// checks for an exact key match, then uses findIoDevice for
// normalized / prefix-based matching (e.g. nda0p2 → nda0),
// and finally falls back to the FILESYSTEM env var.
if _, ioMatch = diskIoCounters[key]; !ioMatch {
if matchedKey, match := findIoDevice(key, diskIoCounters); match {
key = matchedKey
ioMatch = true
} else if filesystem != "" {
if matchedKey, match := findIoDevice(filesystem, diskIoCounters); match {
key = matchedKey
ioMatch = true
}
}
if !ioMatch {
slog.Warn("Root I/O unmapped; set FILESYSTEM", "device", device, "mountpoint", mountpoint)
}
}
} else {
// Check if non-root has diskstats and fall back to folder name if not
// Scenario: device is encrypted and named luks-2bcb02be-999d-4417-8d18-5c61e660fb6e - not in /proc/diskstats.
// However, the device can be specified by mounting folder from luks device at /extra-filesystems/sda1
if _, ioMatch = diskIoCounters[key]; !ioMatch {
efBase := filepath.Base(mountpoint)
if _, ioMatch = diskIoCounters[efBase]; ioMatch {
key = efBase
}
}
}
fsStats := &system.FsStats{Root: root, Mountpoint: mountpoint}
if len(customName) > 0 && customName[0] != "" {
fsStats.Name = customName[0]
}
a.fsStats[key] = fsStats
}
ctx := fsRegistrationContext{
filesystem: filesystem,
isWindows: isWindows,
diskIoCounters: diskIoCounters,
efPath: "/extra-filesystems",
}
// Get the appropriate root mount point for this system
rootMountPoint := a.getRootMountPoint()
// Use FILESYSTEM env var to find root filesystem
if filesystem != "" {
for _, p := range partitions {
if filesystemMatchesPartitionSetting(filesystem, p) {
addFsStat(p.Device, p.Mountpoint, true)
hasRoot = true
break
}
}
if !hasRoot {
// FILESYSTEM may name a physical disk absent from partitions (e.g.
// ZFS lists dataset paths like zroot/ROOT/default, not block devices).
// Try matching directly against diskIoCounters.
if ioKey, match := findIoDevice(filesystem, diskIoCounters); match {
a.fsStats[ioKey] = &system.FsStats{Root: true, Mountpoint: rootMountPoint}
hasRoot = true
} else {
slog.Warn("Partition details not found", "filesystem", filesystem)
}
}
discovery := diskDiscovery{
agent: a,
rootMountPoint: a.getRootMountPoint(),
partitions: partitions,
usageFn: disk.Usage,
ctx: ctx,
}
hasRoot = discovery.addConfiguredRootFs()
// Add EXTRA_FILESYSTEMS env var values to fsStats
if extraFilesystems, exists := utils.GetEnv("EXTRA_FILESYSTEMS"); exists {
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
// Parse custom name from format: device__customname
fs, customName := parseFilesystemEntry(fsEntry)
found := false
for _, p := range partitions {
if strings.HasSuffix(p.Device, fs) || p.Mountpoint == fs {
addFsStat(p.Device, p.Mountpoint, false, customName)
found = true
break
}
}
// if not in partitions, test if we can get disk usage
if !found {
if _, err := disk.Usage(fs); err == nil {
addFsStat(filepath.Base(fs), fs, false, customName)
} else {
slog.Error("Invalid filesystem", "name", fs, "err", err)
}
}
}
discovery.addConfiguredExtraFilesystems(extraFilesystems)
}
// Process partitions for various mount points
for _, p := range partitions {
// fmt.Println(p.Device, p.Mountpoint)
// Binary root fallback or docker root fallback
if !hasRoot && (p.Mountpoint == rootMountPoint || (isDockerSpecialMountpoint(p.Mountpoint) && strings.HasPrefix(p.Device, "/dev"))) {
fs, match := findIoDevice(filepath.Base(p.Device), diskIoCounters)
if match {
addFsStat(fs, p.Mountpoint, true)
hasRoot = true
}
}
// Check if device is in /extra-filesystems
if strings.HasPrefix(p.Mountpoint, efPath) {
device, customName := parseFilesystemEntry(p.Mountpoint)
addFsStat(device, p.Mountpoint, false, customName)
if !hasRoot && isRootFallbackPartition(p, discovery.rootMountPoint) {
hasRoot = discovery.addPartitionRootFs(p.Device, p.Mountpoint)
}
discovery.addPartitionExtraFs(p)
}
// Check all folders in /extra-filesystems and add them if not already present
if folders, err := os.ReadDir(efPath); err == nil {
existingMountpoints := make(map[string]bool)
for _, stats := range a.fsStats {
existingMountpoints[stats.Mountpoint] = true
}
if folders, err := os.ReadDir(discovery.ctx.efPath); err == nil {
folderNames := make([]string, 0, len(folders))
for _, folder := range folders {
if folder.IsDir() {
mountpoint := filepath.Join(efPath, folder.Name())
slog.Debug("/extra-filesystems", "mountpoint", mountpoint)
if !existingMountpoints[mountpoint] {
device, customName := parseFilesystemEntry(folder.Name())
addFsStat(device, mountpoint, false, customName)
}
folderNames = append(folderNames, folder.Name())
}
}
discovery.addExtraFilesystemFolders(folderNames)
}
// If no root filesystem set, try the most active I/O device as a last
// resort (e.g. ZFS where dataset names are unrelated to disk names).
if !hasRoot {
rootKey := mostActiveIoDevice(diskIoCounters)
if rootKey != "" {
slog.Warn("Using most active device for root I/O; set FILESYSTEM to override", "device", rootKey)
} else {
rootKey = filepath.Base(rootMountPoint)
if _, exists := a.fsStats[rootKey]; exists {
rootKey = "root"
}
slog.Warn("Root I/O device not detected; set FILESYSTEM to override")
}
a.fsStats[rootKey] = &system.FsStats{Root: true, Mountpoint: rootMountPoint}
discovery.addLastResortRootFs()
}
a.pruneDuplicateRootExtraFilesystems()
@@ -381,6 +501,8 @@ func normalizeDeviceName(value string) string {
// Sets start values for disk I/O stats.
func (a *Agent) initializeDiskIoStats(diskIoCounters map[string]disk.IOCountersStat) {
a.fsNames = a.fsNames[:0]
now := time.Now()
for device, stats := range a.fsStats {
// skip if not in diskIoCounters
d, exists := diskIoCounters[device]
@@ -389,7 +511,7 @@ func (a *Agent) initializeDiskIoStats(diskIoCounters map[string]disk.IOCountersS
continue
}
// populate initial values
stats.Time = time.Now()
stats.Time = now
stats.TotalRead = d.ReadBytes
stats.TotalWrite = d.WriteBytes
// add to list of valid io device names

View File

@@ -93,6 +93,443 @@ func TestParseFilesystemEntry(t *testing.T) {
}
}
func TestExtraFilesystemPartitionInfo(t *testing.T) {
t.Run("uses partition device for label-only mountpoint", func(t *testing.T) {
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
Device: "/dev/sdc",
Mountpoint: "/extra-filesystems/Share",
})
assert.Equal(t, "/dev/sdc", device)
assert.Equal(t, "", customName)
})
t.Run("uses custom name from mountpoint suffix", func(t *testing.T) {
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
Device: "/dev/sdc",
Mountpoint: "/extra-filesystems/sdc__Share",
})
assert.Equal(t, "/dev/sdc", device)
assert.Equal(t, "Share", customName)
})
t.Run("falls back to folder device when partition device is unavailable", func(t *testing.T) {
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
Mountpoint: "/extra-filesystems/sdc__Share",
})
assert.Equal(t, "sdc", device)
assert.Equal(t, "Share", customName)
})
t.Run("supports custom name without folder device prefix", func(t *testing.T) {
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
Device: "/dev/sdc",
Mountpoint: "/extra-filesystems/__Share",
})
assert.Equal(t, "/dev/sdc", device)
assert.Equal(t, "Share", customName)
})
}
func TestBuildFsStatRegistration(t *testing.T) {
t.Run("uses basename for non-windows exact io match", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"/dev/sda1",
"/mnt/data",
false,
"archive",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"sda1": {Name: "sda1"},
},
},
)
assert.True(t, ok)
assert.Equal(t, "sda1", key)
assert.Equal(t, "/mnt/data", stats.Mountpoint)
assert.Equal(t, "archive", stats.Name)
assert.False(t, stats.Root)
})
t.Run("maps root partition to io device by prefix", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"/dev/ada0p2",
"/",
true,
"",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"ada0": {Name: "ada0", ReadBytes: 1000, WriteBytes: 1000},
},
},
)
assert.True(t, ok)
assert.Equal(t, "ada0", key)
assert.True(t, stats.Root)
assert.Equal(t, "/", stats.Mountpoint)
})
t.Run("uses filesystem setting as root fallback", func(t *testing.T) {
key, _, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"overlay",
"/",
true,
"",
fsRegistrationContext{
filesystem: "nvme0n1p2",
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"nvme0n1": {Name: "nvme0n1", ReadBytes: 1000, WriteBytes: 1000},
},
},
)
assert.True(t, ok)
assert.Equal(t, "nvme0n1", key)
})
t.Run("prefers parsed extra-filesystems device over mapper device", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"/dev/mapper/luks-2bcb02be-999d-4417-8d18-5c61e660fb6e",
"/extra-filesystems/nvme0n1p2__Archive",
false,
"Archive",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"dm-1": {Name: "dm-1", Label: "luks-2bcb02be-999d-4417-8d18-5c61e660fb6e"},
"nvme0n1p2": {Name: "nvme0n1p2"},
},
},
)
assert.True(t, ok)
assert.Equal(t, "nvme0n1p2", key)
assert.Equal(t, "Archive", stats.Name)
})
t.Run("falls back to mapper io device when folder device cannot be resolved", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"/dev/mapper/luks-2bcb02be-999d-4417-8d18-5c61e660fb6e",
"/extra-filesystems/Archive",
false,
"Archive",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"dm-1": {Name: "dm-1", Label: "luks-2bcb02be-999d-4417-8d18-5c61e660fb6e"},
},
},
)
assert.True(t, ok)
assert.Equal(t, "dm-1", key)
assert.Equal(t, "Archive", stats.Name)
})
t.Run("uses full device name on windows", func(t *testing.T) {
key, _, ok := registerFilesystemStats(
map[string]*system.FsStats{},
`C:`,
`C:\\`,
false,
"",
fsRegistrationContext{
isWindows: true,
diskIoCounters: map[string]disk.IOCountersStat{
`C:`: {Name: `C:`},
},
},
)
assert.True(t, ok)
assert.Equal(t, `C:`, key)
})
t.Run("skips existing key", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{"sda1": {Mountpoint: "/existing"}},
"/dev/sda1",
"/mnt/data",
false,
"",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"sda1": {Name: "sda1"},
},
},
)
assert.False(t, ok)
assert.Empty(t, key)
assert.Nil(t, stats)
})
}
func TestAddConfiguredRootFs(t *testing.T) {
t.Run("adds root from matching partition", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
rootMountPoint: "/",
partitions: []disk.PartitionStat{{Device: "/dev/ada0p2", Mountpoint: "/"}},
ctx: fsRegistrationContext{
filesystem: "/dev/ada0p2",
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"ada0": {Name: "ada0", ReadBytes: 1000, WriteBytes: 1000},
},
},
}
ok := discovery.addConfiguredRootFs()
assert.True(t, ok)
stats, exists := agent.fsStats["ada0"]
assert.True(t, exists)
assert.True(t, stats.Root)
assert.Equal(t, "/", stats.Mountpoint)
})
t.Run("adds root from io device when partition is missing", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
rootMountPoint: "/sysroot",
ctx: fsRegistrationContext{
filesystem: "zroot",
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"nda0": {Name: "nda0", Label: "zroot", ReadBytes: 1000, WriteBytes: 1000},
},
},
}
ok := discovery.addConfiguredRootFs()
assert.True(t, ok)
stats, exists := agent.fsStats["nda0"]
assert.True(t, exists)
assert.True(t, stats.Root)
assert.Equal(t, "/sysroot", stats.Mountpoint)
})
t.Run("returns false when filesystem cannot be resolved", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
rootMountPoint: "/",
ctx: fsRegistrationContext{
filesystem: "missing-disk",
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{},
},
}
ok := discovery.addConfiguredRootFs()
assert.False(t, ok)
assert.Empty(t, agent.fsStats)
})
}
func TestAddPartitionRootFs(t *testing.T) {
t.Run("adds root from fallback partition candidate", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
ctx: fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"nvme0n1": {Name: "nvme0n1", ReadBytes: 1000, WriteBytes: 1000},
},
},
}
ok := discovery.addPartitionRootFs("/dev/nvme0n1p2", "/")
assert.True(t, ok)
stats, exists := agent.fsStats["nvme0n1"]
assert.True(t, exists)
assert.True(t, stats.Root)
assert.Equal(t, "/", stats.Mountpoint)
})
t.Run("returns false when no io device matches", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{agent: agent, ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{}}}
ok := discovery.addPartitionRootFs("/dev/mapper/root", "/")
assert.False(t, ok)
assert.Empty(t, agent.fsStats)
})
}
func TestAddLastResortRootFs(t *testing.T) {
t.Run("uses most active io device when available", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{agent: agent, rootMountPoint: "/", ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{
"sda": {Name: "sda", ReadBytes: 5000, WriteBytes: 5000},
"sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000},
}}}
discovery.addLastResortRootFs()
stats, exists := agent.fsStats["sda"]
assert.True(t, exists)
assert.True(t, stats.Root)
})
t.Run("falls back to root key when mountpoint basename collides", func(t *testing.T) {
agent := &Agent{fsStats: map[string]*system.FsStats{
"sysroot": {Mountpoint: "/extra-filesystems/sysroot"},
}}
discovery := diskDiscovery{agent: agent, rootMountPoint: "/sysroot", ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{}}}
discovery.addLastResortRootFs()
stats, exists := agent.fsStats["root"]
assert.True(t, exists)
assert.True(t, stats.Root)
assert.Equal(t, "/sysroot", stats.Mountpoint)
})
}
func TestAddConfiguredExtraFsEntry(t *testing.T) {
t.Run("uses matching partition when present", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
partitions: []disk.PartitionStat{{Device: "/dev/sdb1", Mountpoint: "/mnt/backup"}},
usageFn: func(string) (*disk.UsageStat, error) {
t.Fatal("usage fallback should not be called when partition matches")
return nil, nil
},
ctx: fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"sdb1": {Name: "sdb1"},
},
},
}
discovery.addConfiguredExtraFsEntry("sdb1", "backup")
stats, exists := agent.fsStats["sdb1"]
assert.True(t, exists)
assert.Equal(t, "/mnt/backup", stats.Mountpoint)
assert.Equal(t, "backup", stats.Name)
})
t.Run("falls back to usage-validated path", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
usageFn: func(path string) (*disk.UsageStat, error) {
assert.Equal(t, "/srv/archive", path)
return &disk.UsageStat{}, nil
},
ctx: fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"archive": {Name: "archive"},
},
},
}
discovery.addConfiguredExtraFsEntry("/srv/archive", "archive")
stats, exists := agent.fsStats["archive"]
assert.True(t, exists)
assert.Equal(t, "/srv/archive", stats.Mountpoint)
assert.Equal(t, "archive", stats.Name)
})
t.Run("ignores invalid filesystem entry", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
usageFn: func(string) (*disk.UsageStat, error) {
return nil, os.ErrNotExist
},
}
discovery.addConfiguredExtraFsEntry("/missing/archive", "")
assert.Empty(t, agent.fsStats)
})
}
func TestAddConfiguredExtraFilesystems(t *testing.T) {
t.Run("parses and registers multiple configured filesystems", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
partitions: []disk.PartitionStat{{Device: "/dev/sda1", Mountpoint: "/mnt/fast"}},
usageFn: func(path string) (*disk.UsageStat, error) {
if path == "/srv/archive" {
return &disk.UsageStat{}, nil
}
return nil, os.ErrNotExist
},
ctx: fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"sda1": {Name: "sda1"},
"archive": {Name: "archive"},
},
},
}
discovery.addConfiguredExtraFilesystems("sda1__fast,/srv/archive__cold")
assert.Contains(t, agent.fsStats, "sda1")
assert.Equal(t, "fast", agent.fsStats["sda1"].Name)
assert.Contains(t, agent.fsStats, "archive")
assert.Equal(t, "cold", agent.fsStats["archive"].Name)
})
}
func TestAddExtraFilesystemFolders(t *testing.T) {
t.Run("adds missing folders and skips existing mountpoints", func(t *testing.T) {
agent := &Agent{fsStats: map[string]*system.FsStats{
"existing": {Mountpoint: "/extra-filesystems/existing"},
}}
discovery := diskDiscovery{
agent: agent,
ctx: fsRegistrationContext{
isWindows: false,
efPath: "/extra-filesystems",
diskIoCounters: map[string]disk.IOCountersStat{
"newdisk": {Name: "newdisk"},
},
},
}
discovery.addExtraFilesystemFolders([]string{"existing", "newdisk__Archive"})
assert.Len(t, agent.fsStats, 2)
stats, exists := agent.fsStats["newdisk"]
assert.True(t, exists)
assert.Equal(t, "/extra-filesystems/newdisk__Archive", stats.Mountpoint)
assert.Equal(t, "Archive", stats.Name)
})
}
func TestFindIoDevice(t *testing.T) {
t.Run("matches by device name", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
@@ -310,7 +747,7 @@ func TestInitializeDiskInfoWithCustomNames(t *testing.T) {
// Test the parsing logic by calling the relevant part
// We'll create a simplified version to test just the parsing
extraFilesystems := tc.envValue
for _, fsEntry := range strings.Split(extraFilesystems, ",") {
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
// Parse the entry
fsEntry = strings.TrimSpace(fsEntry)
var fs, customName string
@@ -506,3 +943,33 @@ func TestHasSameDiskUsage(t *testing.T) {
assert.False(t, hasSameDiskUsage(&disk.UsageStat{Total: 0, Used: 0}, &disk.UsageStat{Total: 1, Used: 1}))
})
}
func TestInitializeDiskIoStatsResetsTrackedDevices(t *testing.T) {
agent := &Agent{
fsStats: map[string]*system.FsStats{
"sda": {},
"sdb": {},
},
fsNames: []string{"stale", "sda"},
}
agent.initializeDiskIoStats(map[string]disk.IOCountersStat{
"sda": {Name: "sda", ReadBytes: 10, WriteBytes: 20},
"sdb": {Name: "sdb", ReadBytes: 30, WriteBytes: 40},
})
assert.ElementsMatch(t, []string{"sda", "sdb"}, agent.fsNames)
assert.Len(t, agent.fsNames, 2)
assert.Equal(t, uint64(10), agent.fsStats["sda"].TotalRead)
assert.Equal(t, uint64(20), agent.fsStats["sda"].TotalWrite)
assert.False(t, agent.fsStats["sda"].Time.IsZero())
assert.False(t, agent.fsStats["sdb"].Time.IsZero())
agent.initializeDiskIoStats(map[string]disk.IOCountersStat{
"sdb": {Name: "sdb", ReadBytes: 50, WriteBytes: 60},
})
assert.Equal(t, []string{"sdb"}, agent.fsNames)
assert.Equal(t, uint64(50), agent.fsStats["sdb"].TotalRead)
assert.Equal(t, uint64(60), agent.fsStats["sdb"].TotalWrite)
}

View File

@@ -21,8 +21,7 @@ type hubLike interface {
type AlertManager struct {
hub hubLike
alertQueue chan alertTask
stopChan chan struct{}
stopOnce sync.Once
pendingAlerts sync.Map
}
@@ -98,12 +97,9 @@ var supportsTitle = map[string]struct{}{
// NewAlertManager creates a new AlertManager instance.
func NewAlertManager(app hubLike) *AlertManager {
am := &AlertManager{
hub: app,
alertQueue: make(chan alertTask, 5),
stopChan: make(chan struct{}),
hub: app,
}
am.bindEvents()
go am.startWorker()
return am
}
@@ -112,6 +108,16 @@ func (am *AlertManager) bindEvents() {
am.hub.OnRecordAfterUpdateSuccess("alerts").BindFunc(updateHistoryOnAlertUpdate)
am.hub.OnRecordAfterDeleteSuccess("alerts").BindFunc(resolveHistoryOnAlertDelete)
am.hub.OnRecordAfterUpdateSuccess("smart_devices").BindFunc(am.handleSmartDeviceAlert)
am.hub.OnServe().BindFunc(func(e *core.ServeEvent) error {
if err := resolveStatusAlerts(e.App); err != nil {
e.App.Logger().Error("Failed to resolve stale status alerts", "err", err)
}
if err := am.restorePendingStatusAlerts(); err != nil {
e.App.Logger().Error("Failed to restore pending status alerts", "err", err)
}
return e.Next()
})
}
// IsNotificationSilenced checks if a notification should be silenced based on configured quiet hours

View File

@@ -49,7 +49,7 @@ func TestAlertSilencedOneTime(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Test that alert is silenced
silenced := am.IsNotificationSilenced(user.Id, system.Id)
@@ -106,7 +106,7 @@ func TestAlertSilencedDaily(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Get current hour and create a window that includes current time
now := time.Now().UTC()
@@ -170,7 +170,7 @@ func TestAlertSilencedDailyMidnightCrossing(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Create a window that crosses midnight: 22:00 - 02:00
startTime := time.Date(2000, 1, 1, 22, 0, 0, 0, time.UTC)
@@ -211,7 +211,7 @@ func TestAlertSilencedGlobal(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Create a global quiet hours window (no system specified)
now := time.Now().UTC()
@@ -250,7 +250,7 @@ func TestAlertSilencedSystemSpecific(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Create a system-specific quiet hours window for system1 only
now := time.Now().UTC()
@@ -296,7 +296,7 @@ func TestAlertSilencedMultiUser(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Create a quiet hours window for user1 only
now := time.Now().UTC()
@@ -417,7 +417,7 @@ func TestAlertSilencedNoWindows(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Without any quiet hours windows, alert should NOT be silenced
silenced := am.IsNotificationSilenced(user.Id, system.Id)

View File

@@ -9,63 +9,25 @@ import (
"github.com/pocketbase/pocketbase/core"
)
type alertTask struct {
action string // "schedule" or "cancel"
systemName string
alertRecord *core.Record
delay time.Duration
}
type alertInfo struct {
systemName string
alertRecord *core.Record
expireTime time.Time
timer *time.Timer
}
// startWorker is a long-running goroutine that processes alert tasks
// every x seconds. It must be running to process status alerts.
func (am *AlertManager) startWorker() {
processPendingAlerts := time.Tick(15 * time.Second)
// check for status alerts that are not resolved when system comes up
// (can be removed if we figure out core bug in #1052)
checkStatusAlerts := time.Tick(561 * time.Second)
for {
select {
case <-am.stopChan:
return
case task := <-am.alertQueue:
switch task.action {
case "schedule":
am.pendingAlerts.Store(task.alertRecord.Id, &alertInfo{
systemName: task.systemName,
alertRecord: task.alertRecord,
expireTime: time.Now().Add(task.delay),
})
case "cancel":
am.pendingAlerts.Delete(task.alertRecord.Id)
// Stop cancels all pending status alert timers.
func (am *AlertManager) Stop() {
am.stopOnce.Do(func() {
am.pendingAlerts.Range(func(key, value any) bool {
info := value.(*alertInfo)
if info.timer != nil {
info.timer.Stop()
}
case <-checkStatusAlerts:
resolveStatusAlerts(am.hub)
case <-processPendingAlerts:
// Check for expired alerts every tick
now := time.Now()
for key, value := range am.pendingAlerts.Range {
info := value.(*alertInfo)
if now.After(info.expireTime) {
// Downtime delay has passed, process alert
am.sendStatusAlert("down", info.systemName, info.alertRecord)
am.pendingAlerts.Delete(key)
}
}
}
}
}
// StopWorker shuts down the AlertManager.worker goroutine
func (am *AlertManager) StopWorker() {
close(am.stopChan)
am.pendingAlerts.Delete(key)
return true
})
})
}
// HandleStatusAlerts manages the logic when system status changes.
@@ -103,44 +65,82 @@ func (am *AlertManager) getSystemStatusAlerts(systemID string) ([]*core.Record,
return alertRecords, nil
}
// Schedules delayed "down" alerts for each alert record.
// handleSystemDown manages the logic when a system status changes to "down". It schedules pending alerts for each alert record.
func (am *AlertManager) handleSystemDown(systemName string, alertRecords []*core.Record) {
for _, alertRecord := range alertRecords {
// Continue if alert is already scheduled
if _, exists := am.pendingAlerts.Load(alertRecord.Id); exists {
continue
}
// Schedule by adding to queue
min := max(1, alertRecord.GetInt("min"))
am.alertQueue <- alertTask{
action: "schedule",
systemName: systemName,
alertRecord: alertRecord,
delay: time.Duration(min) * time.Minute,
}
am.schedulePendingStatusAlert(systemName, alertRecord, time.Duration(min)*time.Minute)
}
}
// schedulePendingStatusAlert sets up a timer to send a "down" alert after the specified delay if the system is still down.
// It returns true if the alert was scheduled, or false if an alert was already pending for the given alert record.
func (am *AlertManager) schedulePendingStatusAlert(systemName string, alertRecord *core.Record, delay time.Duration) bool {
alert := &alertInfo{
systemName: systemName,
alertRecord: alertRecord,
expireTime: time.Now().Add(delay),
}
storedAlert, loaded := am.pendingAlerts.LoadOrStore(alertRecord.Id, alert)
if loaded {
return false
}
stored := storedAlert.(*alertInfo)
stored.timer = time.AfterFunc(time.Until(stored.expireTime), func() {
am.processPendingAlert(alertRecord.Id)
})
return true
}
// handleSystemUp manages the logic when a system status changes to "up".
// It cancels any pending alerts and sends "up" alerts.
func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.Record) {
for _, alertRecord := range alertRecords {
alertRecordID := alertRecord.Id
// If alert exists for record, delete and continue (down alert not sent)
if _, exists := am.pendingAlerts.Load(alertRecordID); exists {
am.alertQueue <- alertTask{
action: "cancel",
alertRecord: alertRecord,
}
if am.cancelPendingAlert(alertRecord.Id) {
continue
}
if !alertRecord.GetBool("triggered") {
continue
}
// No alert scheduled for this record, send "up" alert
if err := am.sendStatusAlert("up", systemName, alertRecord); err != nil {
am.hub.Logger().Error("Failed to send alert", "err", err)
}
}
}
// cancelPendingAlert stops the timer and removes the pending alert for the given alert ID. Returns true if a pending alert was found and cancelled.
func (am *AlertManager) cancelPendingAlert(alertID string) bool {
value, loaded := am.pendingAlerts.LoadAndDelete(alertID)
if !loaded {
return false
}
info := value.(*alertInfo)
if info.timer != nil {
info.timer.Stop()
}
return true
}
// processPendingAlert sends a "down" alert if the pending alert has expired and the system is still down.
func (am *AlertManager) processPendingAlert(alertID string) {
value, loaded := am.pendingAlerts.LoadAndDelete(alertID)
if !loaded {
return
}
info := value.(*alertInfo)
if info.alertRecord.GetBool("triggered") {
return
}
if err := am.sendStatusAlert("down", info.systemName, info.alertRecord); err != nil {
am.hub.Logger().Error("Failed to send alert", "err", err)
}
}
// sendStatusAlert sends a status alert ("up" or "down") to the users associated with the alert records.
func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, alertRecord *core.Record) error {
switch alertStatus {
@@ -174,8 +174,8 @@ func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, a
})
}
// resolveStatusAlerts resolves any status alerts that weren't resolved
// when system came up (https://github.com/henrygd/beszel/issues/1052)
// resolveStatusAlerts resolves any triggered status alerts that weren't resolved
// when system came up (https://github.com/henrygd/beszel/issues/1052).
func resolveStatusAlerts(app core.App) error {
db := app.DB()
// Find all active status alerts where the system is actually up
@@ -205,3 +205,36 @@ func resolveStatusAlerts(app core.App) error {
}
return nil
}
// restorePendingStatusAlerts re-queues untriggered status alerts for systems that
// are still down after a hub restart. This rebuilds the lost in-memory timer state.
func (am *AlertManager) restorePendingStatusAlerts() error {
type pendingStatusAlert struct {
AlertID string `db:"alert_id"`
SystemName string `db:"system_name"`
}
var pending []pendingStatusAlert
err := am.hub.DB().NewQuery(`
SELECT a.id AS alert_id, s.name AS system_name
FROM alerts a
JOIN systems s ON a.system = s.id
WHERE a.name = 'Status'
AND a.triggered = false
AND s.status = 'down'
`).All(&pending)
if err != nil {
return err
}
for _, item := range pending {
alertRecord, err := am.hub.FindRecordById("alerts", item.AlertID)
if err != nil {
return err
}
min := max(1, alertRecord.GetInt("min"))
am.schedulePendingStatusAlert(item.SystemName, alertRecord, time.Duration(min)*time.Minute)
}
return nil
}

View File

@@ -0,0 +1,628 @@
//go:build testing
package alerts_test
import (
"testing"
"testing/synctest"
"time"
"github.com/henrygd/beszel/internal/alerts"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/core"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestStatusAlerts(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused")
assert.NoError(t, err)
var alerts []*core.Record
for i, system := range systems {
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": i + 1,
})
assert.NoError(t, err)
alerts = append(alerts, alert)
}
time.Sleep(10 * time.Millisecond)
for _, alert := range alerts {
assert.False(t, alert.GetBool("triggered"), "Alert should not be triggered immediately")
}
if hub.TestMailer.TotalSend() != 0 {
assert.Zero(t, hub.TestMailer.TotalSend(), "Expected 0 messages, got %d", hub.TestMailer.TotalSend())
}
for _, system := range systems {
assert.EqualValues(t, "paused", system.GetString("status"), "System should be paused")
}
for _, system := range systems {
system.Set("status", "up")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
time.Sleep(time.Second)
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
for _, system := range systems {
system.Set("status", "down")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
// after 30 seconds, should have 4 alerts in the pendingAlerts map, no triggered alerts
time.Sleep(time.Second * 30)
assert.EqualValues(t, 4, hub.GetPendingAlertsCount(), "should have 4 alerts in the pendingAlerts map")
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 0, triggeredCount, "should have 0 alert triggered")
assert.EqualValues(t, 0, hub.TestMailer.TotalSend(), "should have 0 messages sent")
// after 1:30 seconds, should have 1 triggered alert and 3 pending alerts
time.Sleep(time.Second * 60)
assert.EqualValues(t, 3, hub.GetPendingAlertsCount(), "should have 3 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 1, triggeredCount, "should have 1 alert triggered")
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 messages sent")
// after 2:30 seconds, should have 2 triggered alerts and 2 pending alerts
time.Sleep(time.Second * 60)
assert.EqualValues(t, 2, hub.GetPendingAlertsCount(), "should have 2 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 2, triggeredCount, "should have 2 alert triggered")
assert.EqualValues(t, 2, hub.TestMailer.TotalSend(), "should have 2 messages sent")
// now we will bring the remaning systems back up
for _, system := range systems {
system.Set("status", "up")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
time.Sleep(time.Second)
// should have 0 alerts in the pendingAlerts map and 0 alerts triggered
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.Zero(t, triggeredCount, "should have 0 alert triggered")
// 4 messages sent, 2 down alerts and 2 up alerts for first 2 systems
assert.EqualValues(t, 4, hub.TestMailer.TotalSend(), "should have 4 messages sent")
})
}
func TestStatusAlertRecoveryBeforeDeadline(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Ensure user settings have an email
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
hub.Save(userSettings)
// Initial email count
initialEmailCount := hub.TestMailer.TotalSend()
systemCollection, _ := hub.FindCollectionByNameOrId("systems")
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
hub.Save(system)
alertCollection, _ := hub.FindCollectionByNameOrId("alerts")
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
hub.Save(alert)
am := hub.AlertManager
// 1. System goes down
am.HandleStatusAlerts("down", system)
assert.Equal(t, 1, am.GetPendingAlertsCount(), "Alert should be scheduled")
// 2. System goes up BEFORE delay expires
// Triggering HandleStatusAlerts("up") SHOULD NOT send an alert.
am.HandleStatusAlerts("up", system)
assert.Equal(t, 0, am.GetPendingAlertsCount(), "Alert should be canceled if system recovers before delay expires")
// Verify that NO email was sent.
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "Recovery notification should not be sent if system never went down")
}
func TestStatusAlertNormalRecovery(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Ensure user settings have an email
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
hub.Save(userSettings)
systemCollection, _ := hub.FindCollectionByNameOrId("systems")
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
hub.Save(system)
alertCollection, _ := hub.FindCollectionByNameOrId("alerts")
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", true) // System was confirmed DOWN
hub.Save(alert)
am := hub.AlertManager
initialEmailCount := hub.TestMailer.TotalSend()
// System goes up
am.HandleStatusAlerts("up", system)
// Verify that an email WAS sent (normal recovery).
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "Recovery notification should be sent if system was triggered as down")
}
func TestHandleStatusAlertsDoesNotSendRecoveryWhileDownIsOnlyPending(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
require.NoError(t, hub.Save(alert))
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 1, am.GetPendingAlertsCount(), "down transition should register a pending alert immediately")
require.NoError(t, am.HandleStatusAlerts("up", system))
assert.Zero(t, am.GetPendingAlertsCount(), "recovery should cancel the pending down alert")
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "recovery notification should not be sent before a down alert triggers")
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.False(t, alertRecord.GetBool("triggered"), "alert should remain untriggered when downtime never matured")
}
func TestStatusAlertTimerCancellationPreventsBoundaryDelivery(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
require.NoError(t, hub.Save(alert))
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 1, am.GetPendingAlertsCount(), "down transition should register a pending alert immediately")
require.True(t, am.ResetPendingAlertTimer(alert.Id, 25*time.Millisecond), "test should shorten the pending alert timer")
time.Sleep(10 * time.Millisecond)
require.NoError(t, am.HandleStatusAlerts("up", system))
assert.Zero(t, am.GetPendingAlertsCount(), "recovery should remove the pending alert before the timer callback runs")
time.Sleep(40 * time.Millisecond)
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "timer callback should not deliver after recovery cancels the pending alert")
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.False(t, alertRecord.GetBool("triggered"), "alert should remain untriggered when cancellation wins the timer race")
time.Sleep(time.Minute)
synctest.Wait()
})
}
func TestStatusAlertDownFiresAfterDelayExpires(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
require.NoError(t, hub.Save(alert))
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 1, am.GetPendingAlertsCount(), "alert should be pending after system goes down")
// Expire the pending alert and process it
am.ForceExpirePendingAlerts()
processed, err := am.ProcessPendingAlerts()
require.NoError(t, err)
assert.Len(t, processed, 1, "one alert should have been processed")
assert.Equal(t, 0, am.GetPendingAlertsCount(), "pending alert should be consumed after processing")
// Verify down email was sent
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "down notification should be sent after delay expires")
// Verify triggered flag is set in the DB
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.True(t, alertRecord.GetBool("triggered"), "alert should be marked triggered after downtime matures")
}
func TestStatusAlertDuplicateDownCallIsIdempotent(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 5)
require.NoError(t, hub.Save(alert))
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
require.NoError(t, am.HandleStatusAlerts("down", system))
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 1, am.GetPendingAlertsCount(), "repeated down calls should not schedule duplicate pending alerts")
}
func TestStatusAlertNoAlertRecord(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
// No Status alert record created for this system
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 0, am.GetPendingAlertsCount(), "no pending alert when no alert record exists")
require.NoError(t, am.HandleStatusAlerts("up", system))
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "no email when no alert record exists")
}
func TestRestorePendingStatusAlertsRequeuesDownSystemsAfterRestart(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "down")
require.NoError(t, err)
system := systems[0]
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
require.NoError(t, hub.Save(alert))
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.RestorePendingStatusAlerts())
assert.Equal(t, 1, am.GetPendingAlertsCount(), "startup restore should requeue a pending down alert for a system still marked down")
am.ForceExpirePendingAlerts()
processed, err := am.ProcessPendingAlerts()
require.NoError(t, err)
assert.Len(t, processed, 1, "restored pending alert should be processable after the delay expires")
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "restored pending alert should send the down notification")
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.True(t, alertRecord.GetBool("triggered"), "restored pending alert should mark the alert as triggered once delivered")
}
func TestRestorePendingStatusAlertsSkipsNonDownOrAlreadyTriggeredAlerts(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systemsDown, err := beszelTests.CreateSystems(hub, 2, user.Id, "down")
require.NoError(t, err)
systemDownPending := systemsDown[0]
systemDownTriggered := systemsDown[1]
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "up-system",
"users": []string{user.Id},
"host": "127.0.0.2",
"status": "up",
})
require.NoError(t, err)
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemDownPending.Id,
"user": user.Id,
"min": 1,
"triggered": false,
})
require.NoError(t, err)
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemUp.Id,
"user": user.Id,
"min": 1,
"triggered": false,
})
require.NoError(t, err)
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemDownTriggered.Id,
"user": user.Id,
"min": 1,
"triggered": true,
})
require.NoError(t, err)
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.RestorePendingStatusAlerts())
assert.Equal(t, 1, am.GetPendingAlertsCount(), "only untriggered alerts for currently down systems should be restored")
}
func TestRestorePendingStatusAlertsIsIdempotent(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "down")
require.NoError(t, err)
system := systems[0]
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": 1,
"triggered": false,
})
require.NoError(t, err)
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.RestorePendingStatusAlerts())
require.NoError(t, am.RestorePendingStatusAlerts())
assert.Equal(t, 1, am.GetPendingAlertsCount(), "restoring twice should not create duplicate pending alerts")
am.ForceExpirePendingAlerts()
processed, err := am.ProcessPendingAlerts()
require.NoError(t, err)
assert.Len(t, processed, 1, "restored alert should still be processable exactly once")
assert.Zero(t, am.GetPendingAlertsCount(), "processing the restored alert should empty the pending map")
}
func TestResolveStatusAlertsFixesStaleTriggered(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// CreateSystems uses SaveNoValidate after initial save to bypass the
// onRecordCreate hook that forces status = "pending".
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
system := systems[0]
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", true) // Stale: system is up but alert still says triggered
require.NoError(t, hub.Save(alert))
// resolveStatusAlerts should clear the stale triggered flag
require.NoError(t, alerts.ResolveStatusAlerts(hub))
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.False(t, alertRecord.GetBool("triggered"), "stale triggered flag should be cleared when system is up")
}
func TestResolveStatusAlerts(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Create a systemUp
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
"status": "up",
})
assert.NoError(t, err)
systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system-2",
"users": []string{user.Id},
"host": "127.0.0.2",
"status": "up",
})
assert.NoError(t, err)
// Create a status alertUp for the system
alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemUp.Id,
"user": user.Id,
"min": 1,
})
assert.NoError(t, err)
alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemDown.Id,
"user": user.Id,
"min": 1,
})
assert.NoError(t, err)
// Verify alert is not triggered initially
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially")
// Set the system to 'up' (this should not trigger the alert)
systemUp.Set("status", "up")
err = hub.SaveNoValidate(systemUp)
assert.NoError(t, err)
systemDown.Set("status", "down")
err = hub.SaveNoValidate(systemDown)
assert.NoError(t, err)
// Wait a moment for any processing
time.Sleep(10 * time.Millisecond)
// Verify alertUp is still not triggered after setting system to up
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up")
// Manually set both alerts triggered to true
alertUp.Set("triggered", true)
err = hub.SaveNoValidate(alertUp)
assert.NoError(t, err)
alertDown.Set("triggered", true)
err = hub.SaveNoValidate(alertDown)
assert.NoError(t, err)
// Verify we have exactly one alert with triggered true
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true")
// Verify the specific alertUp is triggered
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered")
// Verify we have two unresolved alert history records
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records")
err = alerts.ResolveStatusAlerts(hub)
assert.NoError(t, err)
// Verify alertUp is not triggered after resolving
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving")
// Verify alertDown is still triggered
alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id})
assert.NoError(t, err)
assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving")
// Verify we have one unresolved alert history record
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
}

View File

@@ -12,7 +12,6 @@ import (
"testing/synctest"
"time"
"github.com/henrygd/beszel/internal/alerts"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/pocketbase/dbx"
@@ -369,87 +368,6 @@ func TestUserAlertsApi(t *testing.T) {
}
}
func TestStatusAlerts(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused")
assert.NoError(t, err)
var alerts []*core.Record
for i, system := range systems {
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": i + 1,
})
assert.NoError(t, err)
alerts = append(alerts, alert)
}
time.Sleep(10 * time.Millisecond)
for _, alert := range alerts {
assert.False(t, alert.GetBool("triggered"), "Alert should not be triggered immediately")
}
if hub.TestMailer.TotalSend() != 0 {
assert.Zero(t, hub.TestMailer.TotalSend(), "Expected 0 messages, got %d", hub.TestMailer.TotalSend())
}
for _, system := range systems {
assert.EqualValues(t, "paused", system.GetString("status"), "System should be paused")
}
for _, system := range systems {
system.Set("status", "up")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
time.Sleep(time.Second)
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
for _, system := range systems {
system.Set("status", "down")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
// after 30 seconds, should have 4 alerts in the pendingAlerts map, no triggered alerts
time.Sleep(time.Second * 30)
assert.EqualValues(t, 4, hub.GetPendingAlertsCount(), "should have 4 alerts in the pendingAlerts map")
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 0, triggeredCount, "should have 0 alert triggered")
assert.EqualValues(t, 0, hub.TestMailer.TotalSend(), "should have 0 messages sent")
// after 1:30 seconds, should have 1 triggered alert and 3 pending alerts
time.Sleep(time.Second * 60)
assert.EqualValues(t, 3, hub.GetPendingAlertsCount(), "should have 3 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 1, triggeredCount, "should have 1 alert triggered")
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 messages sent")
// after 2:30 seconds, should have 2 triggered alerts and 2 pending alerts
time.Sleep(time.Second * 60)
assert.EqualValues(t, 2, hub.GetPendingAlertsCount(), "should have 2 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 2, triggeredCount, "should have 2 alert triggered")
assert.EqualValues(t, 2, hub.TestMailer.TotalSend(), "should have 2 messages sent")
// now we will bring the remaning systems back up
for _, system := range systems {
system.Set("status", "up")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
time.Sleep(time.Second)
// should have 0 alerts in the pendingAlerts map and 0 alerts triggered
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.Zero(t, triggeredCount, "should have 0 alert triggered")
// 4 messages sent, 2 down alerts and 2 up alerts for first 2 systems
assert.EqualValues(t, 4, hub.TestMailer.TotalSend(), "should have 4 messages sent")
})
}
func TestAlertsHistory(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
@@ -578,102 +496,3 @@ func TestAlertsHistory(t *testing.T) {
assert.EqualValues(t, 2, totalHistoryCount, "Should have 2 total alert history records")
})
}
func TestResolveStatusAlerts(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Create a systemUp
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
"status": "up",
})
assert.NoError(t, err)
systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system-2",
"users": []string{user.Id},
"host": "127.0.0.2",
"status": "up",
})
assert.NoError(t, err)
// Create a status alertUp for the system
alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemUp.Id,
"user": user.Id,
"min": 1,
})
assert.NoError(t, err)
alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemDown.Id,
"user": user.Id,
"min": 1,
})
assert.NoError(t, err)
// Verify alert is not triggered initially
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially")
// Set the system to 'up' (this should not trigger the alert)
systemUp.Set("status", "up")
err = hub.SaveNoValidate(systemUp)
assert.NoError(t, err)
systemDown.Set("status", "down")
err = hub.SaveNoValidate(systemDown)
assert.NoError(t, err)
// Wait a moment for any processing
time.Sleep(10 * time.Millisecond)
// Verify alertUp is still not triggered after setting system to up
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up")
// Manually set both alerts triggered to true
alertUp.Set("triggered", true)
err = hub.SaveNoValidate(alertUp)
assert.NoError(t, err)
alertDown.Set("triggered", true)
err = hub.SaveNoValidate(alertDown)
assert.NoError(t, err)
// Verify we have exactly one alert with triggered true
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true")
// Verify the specific alertUp is triggered
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered")
// Verify we have two unresolved alert history records
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records")
err = alerts.ResolveStatusAlerts(hub)
assert.NoError(t, err)
// Verify alertUp is not triggered after resolving
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving")
// Verify alertDown is still triggered
alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id})
assert.NoError(t, err)
assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving")
// Verify we have one unresolved alert history record
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
}

View File

@@ -9,6 +9,12 @@ import (
"github.com/pocketbase/pocketbase/core"
)
func NewTestAlertManagerWithoutWorker(app hubLike) *AlertManager {
return &AlertManager{
hub: app,
}
}
func (am *AlertManager) GetAlertManager() *AlertManager {
return am
}
@@ -34,12 +40,11 @@ func (am *AlertManager) ProcessPendingAlerts() ([]*core.Record, error) {
am.pendingAlerts.Range(func(key, value any) bool {
info := value.(*alertInfo)
if now.After(info.expireTime) {
// Downtime delay has passed, process alert
if err := am.sendStatusAlert("down", info.systemName, info.alertRecord); err != nil {
lastErr = err
if info.timer != nil {
info.timer.Stop()
}
am.processPendingAlert(key.(string))
processedAlerts = append(processedAlerts, info.alertRecord)
am.pendingAlerts.Delete(key)
}
return true
})
@@ -56,6 +61,27 @@ func (am *AlertManager) ForceExpirePendingAlerts() {
})
}
func (am *AlertManager) ResetPendingAlertTimer(alertID string, delay time.Duration) bool {
value, loaded := am.pendingAlerts.Load(alertID)
if !loaded {
return false
}
info := value.(*alertInfo)
if info.timer != nil {
info.timer.Stop()
}
info.expireTime = time.Now().Add(delay)
info.timer = time.AfterFunc(delay, func() {
am.processPendingAlert(alertID)
})
return true
}
func ResolveStatusAlerts(app core.App) error {
return resolveStatusAlerts(app)
}
func (am *AlertManager) RestorePendingStatusAlerts() error {
return am.restorePendingStatusAlerts()
}

View File

@@ -98,7 +98,7 @@ func ClearCollection(t testing.TB, app core.App, collectionName string) error {
}
func (h *TestHub) Cleanup() {
h.GetAlertManager().StopWorker()
h.GetAlertManager().Stop()
h.GetSystemManager().RemoveAllSystems()
h.TestApp.Cleanup()
}