mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-21 21:26:16 +01:00
Compare commits
3 Commits
temp-down-
...
bd94a9d142
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bd94a9d142 | ||
|
|
8e2316f845 | ||
|
|
0d3dfcb207 |
392
agent/disk.go
392
agent/disk.go
@@ -14,6 +14,25 @@ import (
|
||||
"github.com/shirou/gopsutil/v4/disk"
|
||||
)
|
||||
|
||||
// fsRegistrationContext holds the shared lookup state needed to resolve a
|
||||
// filesystem into the tracked fsStats key and metadata.
|
||||
type fsRegistrationContext struct {
|
||||
filesystem string // value of optional FILESYSTEM env var
|
||||
isWindows bool
|
||||
efPath string // path to extra filesystems (default "/extra-filesystems")
|
||||
diskIoCounters map[string]disk.IOCountersStat
|
||||
}
|
||||
|
||||
// diskDiscovery groups the transient state for a single initializeDiskInfo run so
|
||||
// helper methods can share the same partitions, mount paths, and lookup functions
|
||||
type diskDiscovery struct {
|
||||
agent *Agent
|
||||
rootMountPoint string
|
||||
partitions []disk.PartitionStat
|
||||
usageFn func(string) (*disk.UsageStat, error)
|
||||
ctx fsRegistrationContext
|
||||
}
|
||||
|
||||
// parseFilesystemEntry parses a filesystem entry in the format "device__customname"
|
||||
// Returns the device/filesystem part and the custom name part
|
||||
func parseFilesystemEntry(entry string) (device, customName string) {
|
||||
@@ -27,19 +46,230 @@ func parseFilesystemEntry(entry string) (device, customName string) {
|
||||
return device, customName
|
||||
}
|
||||
|
||||
// extraFilesystemPartitionInfo derives the I/O device and optional display name
|
||||
// for a mounted /extra-filesystems partition. Prefer the partition device reported
|
||||
// by the system and only use the folder name for custom naming metadata.
|
||||
func extraFilesystemPartitionInfo(p disk.PartitionStat) (device, customName string) {
|
||||
device = strings.TrimSpace(p.Device)
|
||||
folderDevice, customName := parseFilesystemEntry(filepath.Base(p.Mountpoint))
|
||||
if device == "" {
|
||||
device = folderDevice
|
||||
}
|
||||
return device, customName
|
||||
}
|
||||
|
||||
func isDockerSpecialMountpoint(mountpoint string) bool {
|
||||
switch mountpoint {
|
||||
case "/etc/hosts", "/etc/resolv.conf", "/etc/hostname":
|
||||
return true
|
||||
default:
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// registerFilesystemStats resolves the tracked key and stats payload for a
|
||||
// filesystem before it is inserted into fsStats.
|
||||
func registerFilesystemStats(existing map[string]*system.FsStats, device, mountpoint string, root bool, customName string, ctx fsRegistrationContext) (string, *system.FsStats, bool) {
|
||||
key := device
|
||||
if !ctx.isWindows {
|
||||
key = filepath.Base(device)
|
||||
}
|
||||
|
||||
if root {
|
||||
// Try to map root device to a diskIoCounters entry. First checks for an
|
||||
// exact key match, then uses findIoDevice for normalized / prefix-based
|
||||
// matching (e.g. nda0p2 -> nda0), and finally falls back to FILESYSTEM.
|
||||
if _, ioMatch := ctx.diskIoCounters[key]; !ioMatch {
|
||||
if matchedKey, match := findIoDevice(key, ctx.diskIoCounters); match {
|
||||
key = matchedKey
|
||||
} else if ctx.filesystem != "" {
|
||||
if matchedKey, match := findIoDevice(ctx.filesystem, ctx.diskIoCounters); match {
|
||||
key = matchedKey
|
||||
}
|
||||
}
|
||||
if _, ioMatch = ctx.diskIoCounters[key]; !ioMatch {
|
||||
slog.Warn("Root I/O unmapped; set FILESYSTEM", "device", device, "mountpoint", mountpoint)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Check if non-root has diskstats and prefer the folder device for
|
||||
// /extra-filesystems mounts when the discovered partition device is a
|
||||
// mapper path (e.g. luks UUID) that obscures the underlying block device.
|
||||
if _, ioMatch := ctx.diskIoCounters[key]; !ioMatch {
|
||||
if strings.HasPrefix(mountpoint, ctx.efPath) {
|
||||
folderDevice, _ := parseFilesystemEntry(filepath.Base(mountpoint))
|
||||
if folderDevice != "" {
|
||||
if matchedKey, match := findIoDevice(folderDevice, ctx.diskIoCounters); match {
|
||||
key = matchedKey
|
||||
}
|
||||
}
|
||||
}
|
||||
if _, ioMatch = ctx.diskIoCounters[key]; !ioMatch {
|
||||
if matchedKey, match := findIoDevice(key, ctx.diskIoCounters); match {
|
||||
key = matchedKey
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if _, exists := existing[key]; exists {
|
||||
return "", nil, false
|
||||
}
|
||||
|
||||
fsStats := &system.FsStats{Root: root, Mountpoint: mountpoint}
|
||||
if customName != "" {
|
||||
fsStats.Name = customName
|
||||
}
|
||||
return key, fsStats, true
|
||||
}
|
||||
|
||||
// addFsStat inserts a discovered filesystem if it resolves to a new tracking
|
||||
// key. The key selection itself lives in buildFsStatRegistration so that logic
|
||||
// can stay directly unit-tested.
|
||||
func (d *diskDiscovery) addFsStat(device, mountpoint string, root bool, customName string) {
|
||||
key, fsStats, ok := registerFilesystemStats(d.agent.fsStats, device, mountpoint, root, customName, d.ctx)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
d.agent.fsStats[key] = fsStats
|
||||
name := key
|
||||
if customName != "" {
|
||||
name = customName
|
||||
}
|
||||
slog.Info("Detected disk", "name", name, "device", device, "mount", mountpoint, "io", key, "root", root)
|
||||
}
|
||||
|
||||
// addConfiguredRootFs resolves FILESYSTEM against partitions first, then falls
|
||||
// back to direct diskstats matching for setups like ZFS where partitions do not
|
||||
// expose the physical device name.
|
||||
func (d *diskDiscovery) addConfiguredRootFs() bool {
|
||||
if d.ctx.filesystem == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, p := range d.partitions {
|
||||
if filesystemMatchesPartitionSetting(d.ctx.filesystem, p) {
|
||||
d.addFsStat(p.Device, p.Mountpoint, true, "")
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// FILESYSTEM may name a physical disk absent from partitions (e.g. ZFS lists
|
||||
// dataset paths like zroot/ROOT/default, not block devices).
|
||||
if ioKey, match := findIoDevice(d.ctx.filesystem, d.ctx.diskIoCounters); match {
|
||||
d.agent.fsStats[ioKey] = &system.FsStats{Root: true, Mountpoint: d.rootMountPoint}
|
||||
return true
|
||||
}
|
||||
|
||||
slog.Warn("Partition details not found", "filesystem", d.ctx.filesystem)
|
||||
return false
|
||||
}
|
||||
|
||||
func isRootFallbackPartition(p disk.PartitionStat, rootMountPoint string) bool {
|
||||
return p.Mountpoint == rootMountPoint ||
|
||||
(isDockerSpecialMountpoint(p.Mountpoint) && strings.HasPrefix(p.Device, "/dev"))
|
||||
}
|
||||
|
||||
// addPartitionRootFs handles the non-configured root fallback path when a
|
||||
// partition looks like the active root mount but still needs translating to an
|
||||
// I/O device key.
|
||||
func (d *diskDiscovery) addPartitionRootFs(device, mountpoint string) bool {
|
||||
fs, match := findIoDevice(filepath.Base(device), d.ctx.diskIoCounters)
|
||||
if !match {
|
||||
return false
|
||||
}
|
||||
// The resolved I/O device is already known here, so use it directly to avoid
|
||||
// a second fallback search inside buildFsStatRegistration.
|
||||
d.addFsStat(fs, mountpoint, true, "")
|
||||
return true
|
||||
}
|
||||
|
||||
// addLastResortRootFs is only used when neither FILESYSTEM nor partition-based
|
||||
// heuristics can identify root, so it picks the busiest I/O device as a final
|
||||
// fallback and preserves the root mountpoint for usage collection.
|
||||
func (d *diskDiscovery) addLastResortRootFs() {
|
||||
rootKey := mostActiveIoDevice(d.ctx.diskIoCounters)
|
||||
if rootKey != "" {
|
||||
slog.Warn("Using most active device for root I/O; set FILESYSTEM to override", "device", rootKey)
|
||||
} else {
|
||||
rootKey = filepath.Base(d.rootMountPoint)
|
||||
if _, exists := d.agent.fsStats[rootKey]; exists {
|
||||
rootKey = "root"
|
||||
}
|
||||
slog.Warn("Root I/O device not detected; set FILESYSTEM to override")
|
||||
}
|
||||
d.agent.fsStats[rootKey] = &system.FsStats{Root: true, Mountpoint: d.rootMountPoint}
|
||||
}
|
||||
|
||||
// findPartitionByFilesystemSetting matches an EXTRA_FILESYSTEMS entry against a
|
||||
// discovered partition either by mountpoint or by device suffix.
|
||||
func findPartitionByFilesystemSetting(filesystem string, partitions []disk.PartitionStat) (disk.PartitionStat, bool) {
|
||||
for _, p := range partitions {
|
||||
if strings.HasSuffix(p.Device, filesystem) || p.Mountpoint == filesystem {
|
||||
return p, true
|
||||
}
|
||||
}
|
||||
return disk.PartitionStat{}, false
|
||||
}
|
||||
|
||||
// addConfiguredExtraFsEntry resolves one EXTRA_FILESYSTEMS entry, preferring a
|
||||
// discovered partition and falling back to any path that disk.Usage accepts.
|
||||
func (d *diskDiscovery) addConfiguredExtraFsEntry(filesystem, customName string) {
|
||||
if p, found := findPartitionByFilesystemSetting(filesystem, d.partitions); found {
|
||||
d.addFsStat(p.Device, p.Mountpoint, false, customName)
|
||||
return
|
||||
}
|
||||
|
||||
if _, err := d.usageFn(filesystem); err == nil {
|
||||
d.addFsStat(filepath.Base(filesystem), filesystem, false, customName)
|
||||
return
|
||||
} else {
|
||||
slog.Error("Invalid filesystem", "name", filesystem, "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// addConfiguredExtraFilesystems parses and registers the comma-separated
|
||||
// EXTRA_FILESYSTEMS env var entries.
|
||||
func (d *diskDiscovery) addConfiguredExtraFilesystems(extraFilesystems string) {
|
||||
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
|
||||
filesystem, customName := parseFilesystemEntry(fsEntry)
|
||||
d.addConfiguredExtraFsEntry(filesystem, customName)
|
||||
}
|
||||
}
|
||||
|
||||
// addPartitionExtraFs registers partitions mounted under /extra-filesystems so
|
||||
// their display names can come from the folder name while their I/O keys still
|
||||
// prefer the underlying partition device.
|
||||
func (d *diskDiscovery) addPartitionExtraFs(p disk.PartitionStat) {
|
||||
if !strings.HasPrefix(p.Mountpoint, d.ctx.efPath) {
|
||||
return
|
||||
}
|
||||
device, customName := extraFilesystemPartitionInfo(p)
|
||||
d.addFsStat(device, p.Mountpoint, false, customName)
|
||||
}
|
||||
|
||||
// addExtraFilesystemFolders handles bare directories under /extra-filesystems
|
||||
// that may not appear in partition discovery, while skipping mountpoints that
|
||||
// were already registered from higher-fidelity sources.
|
||||
func (d *diskDiscovery) addExtraFilesystemFolders(folderNames []string) {
|
||||
existingMountpoints := make(map[string]bool, len(d.agent.fsStats))
|
||||
for _, stats := range d.agent.fsStats {
|
||||
existingMountpoints[stats.Mountpoint] = true
|
||||
}
|
||||
|
||||
for _, folderName := range folderNames {
|
||||
mountpoint := filepath.Join(d.ctx.efPath, folderName)
|
||||
slog.Debug("/extra-filesystems", "mountpoint", mountpoint)
|
||||
if existingMountpoints[mountpoint] {
|
||||
continue
|
||||
}
|
||||
device, customName := parseFilesystemEntry(folderName)
|
||||
d.addFsStat(device, mountpoint, false, customName)
|
||||
}
|
||||
}
|
||||
|
||||
// Sets up the filesystems to monitor for disk usage and I/O.
|
||||
func (a *Agent) initializeDiskInfo() {
|
||||
filesystem, _ := utils.GetEnv("FILESYSTEM")
|
||||
efPath := "/extra-filesystems"
|
||||
hasRoot := false
|
||||
isWindows := runtime.GOOS == "windows"
|
||||
|
||||
@@ -56,167 +286,57 @@ func (a *Agent) initializeDiskInfo() {
|
||||
}
|
||||
}
|
||||
|
||||
// ioContext := context.WithValue(a.sensorsContext,
|
||||
// common.EnvKey, common.EnvMap{common.HostProcEnvKey: "/tmp/testproc"},
|
||||
// )
|
||||
// diskIoCounters, err := disk.IOCountersWithContext(ioContext)
|
||||
|
||||
diskIoCounters, err := disk.IOCounters()
|
||||
if err != nil {
|
||||
slog.Error("Error getting diskstats", "err", err)
|
||||
}
|
||||
slog.Debug("Disk I/O", "diskstats", diskIoCounters)
|
||||
|
||||
// Helper function to add a filesystem to fsStats if it doesn't exist
|
||||
addFsStat := func(device, mountpoint string, root bool, customName ...string) {
|
||||
var key string
|
||||
if isWindows {
|
||||
key = device
|
||||
} else {
|
||||
key = filepath.Base(device)
|
||||
}
|
||||
var ioMatch bool
|
||||
if _, exists := a.fsStats[key]; !exists {
|
||||
if root {
|
||||
slog.Info("Detected root device", "name", key)
|
||||
// Try to map root device to a diskIoCounters entry. First
|
||||
// checks for an exact key match, then uses findIoDevice for
|
||||
// normalized / prefix-based matching (e.g. nda0p2 → nda0),
|
||||
// and finally falls back to the FILESYSTEM env var.
|
||||
if _, ioMatch = diskIoCounters[key]; !ioMatch {
|
||||
if matchedKey, match := findIoDevice(key, diskIoCounters); match {
|
||||
key = matchedKey
|
||||
ioMatch = true
|
||||
} else if filesystem != "" {
|
||||
if matchedKey, match := findIoDevice(filesystem, diskIoCounters); match {
|
||||
key = matchedKey
|
||||
ioMatch = true
|
||||
}
|
||||
}
|
||||
if !ioMatch {
|
||||
slog.Warn("Root I/O unmapped; set FILESYSTEM", "device", device, "mountpoint", mountpoint)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Check if non-root has diskstats and fall back to folder name if not
|
||||
// Scenario: device is encrypted and named luks-2bcb02be-999d-4417-8d18-5c61e660fb6e - not in /proc/diskstats.
|
||||
// However, the device can be specified by mounting folder from luks device at /extra-filesystems/sda1
|
||||
if _, ioMatch = diskIoCounters[key]; !ioMatch {
|
||||
efBase := filepath.Base(mountpoint)
|
||||
if _, ioMatch = diskIoCounters[efBase]; ioMatch {
|
||||
key = efBase
|
||||
}
|
||||
}
|
||||
}
|
||||
fsStats := &system.FsStats{Root: root, Mountpoint: mountpoint}
|
||||
if len(customName) > 0 && customName[0] != "" {
|
||||
fsStats.Name = customName[0]
|
||||
}
|
||||
a.fsStats[key] = fsStats
|
||||
}
|
||||
ctx := fsRegistrationContext{
|
||||
filesystem: filesystem,
|
||||
isWindows: isWindows,
|
||||
diskIoCounters: diskIoCounters,
|
||||
efPath: "/extra-filesystems",
|
||||
}
|
||||
|
||||
// Get the appropriate root mount point for this system
|
||||
rootMountPoint := a.getRootMountPoint()
|
||||
|
||||
// Use FILESYSTEM env var to find root filesystem
|
||||
if filesystem != "" {
|
||||
for _, p := range partitions {
|
||||
if filesystemMatchesPartitionSetting(filesystem, p) {
|
||||
addFsStat(p.Device, p.Mountpoint, true)
|
||||
hasRoot = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasRoot {
|
||||
// FILESYSTEM may name a physical disk absent from partitions (e.g.
|
||||
// ZFS lists dataset paths like zroot/ROOT/default, not block devices).
|
||||
// Try matching directly against diskIoCounters.
|
||||
if ioKey, match := findIoDevice(filesystem, diskIoCounters); match {
|
||||
a.fsStats[ioKey] = &system.FsStats{Root: true, Mountpoint: rootMountPoint}
|
||||
hasRoot = true
|
||||
} else {
|
||||
slog.Warn("Partition details not found", "filesystem", filesystem)
|
||||
}
|
||||
}
|
||||
discovery := diskDiscovery{
|
||||
agent: a,
|
||||
rootMountPoint: a.getRootMountPoint(),
|
||||
partitions: partitions,
|
||||
usageFn: disk.Usage,
|
||||
ctx: ctx,
|
||||
}
|
||||
|
||||
hasRoot = discovery.addConfiguredRootFs()
|
||||
|
||||
// Add EXTRA_FILESYSTEMS env var values to fsStats
|
||||
if extraFilesystems, exists := utils.GetEnv("EXTRA_FILESYSTEMS"); exists {
|
||||
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
|
||||
// Parse custom name from format: device__customname
|
||||
fs, customName := parseFilesystemEntry(fsEntry)
|
||||
|
||||
found := false
|
||||
for _, p := range partitions {
|
||||
if strings.HasSuffix(p.Device, fs) || p.Mountpoint == fs {
|
||||
addFsStat(p.Device, p.Mountpoint, false, customName)
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
// if not in partitions, test if we can get disk usage
|
||||
if !found {
|
||||
if _, err := disk.Usage(fs); err == nil {
|
||||
addFsStat(filepath.Base(fs), fs, false, customName)
|
||||
} else {
|
||||
slog.Error("Invalid filesystem", "name", fs, "err", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
discovery.addConfiguredExtraFilesystems(extraFilesystems)
|
||||
}
|
||||
|
||||
// Process partitions for various mount points
|
||||
for _, p := range partitions {
|
||||
// fmt.Println(p.Device, p.Mountpoint)
|
||||
// Binary root fallback or docker root fallback
|
||||
if !hasRoot && (p.Mountpoint == rootMountPoint || (isDockerSpecialMountpoint(p.Mountpoint) && strings.HasPrefix(p.Device, "/dev"))) {
|
||||
fs, match := findIoDevice(filepath.Base(p.Device), diskIoCounters)
|
||||
if match {
|
||||
addFsStat(fs, p.Mountpoint, true)
|
||||
hasRoot = true
|
||||
}
|
||||
}
|
||||
|
||||
// Check if device is in /extra-filesystems
|
||||
if strings.HasPrefix(p.Mountpoint, efPath) {
|
||||
device, customName := parseFilesystemEntry(p.Mountpoint)
|
||||
addFsStat(device, p.Mountpoint, false, customName)
|
||||
if !hasRoot && isRootFallbackPartition(p, discovery.rootMountPoint) {
|
||||
hasRoot = discovery.addPartitionRootFs(p.Device, p.Mountpoint)
|
||||
}
|
||||
discovery.addPartitionExtraFs(p)
|
||||
}
|
||||
|
||||
// Check all folders in /extra-filesystems and add them if not already present
|
||||
if folders, err := os.ReadDir(efPath); err == nil {
|
||||
existingMountpoints := make(map[string]bool)
|
||||
for _, stats := range a.fsStats {
|
||||
existingMountpoints[stats.Mountpoint] = true
|
||||
}
|
||||
if folders, err := os.ReadDir(discovery.ctx.efPath); err == nil {
|
||||
folderNames := make([]string, 0, len(folders))
|
||||
for _, folder := range folders {
|
||||
if folder.IsDir() {
|
||||
mountpoint := filepath.Join(efPath, folder.Name())
|
||||
slog.Debug("/extra-filesystems", "mountpoint", mountpoint)
|
||||
if !existingMountpoints[mountpoint] {
|
||||
device, customName := parseFilesystemEntry(folder.Name())
|
||||
addFsStat(device, mountpoint, false, customName)
|
||||
}
|
||||
folderNames = append(folderNames, folder.Name())
|
||||
}
|
||||
}
|
||||
discovery.addExtraFilesystemFolders(folderNames)
|
||||
}
|
||||
|
||||
// If no root filesystem set, try the most active I/O device as a last
|
||||
// resort (e.g. ZFS where dataset names are unrelated to disk names).
|
||||
if !hasRoot {
|
||||
rootKey := mostActiveIoDevice(diskIoCounters)
|
||||
if rootKey != "" {
|
||||
slog.Warn("Using most active device for root I/O; set FILESYSTEM to override", "device", rootKey)
|
||||
} else {
|
||||
rootKey = filepath.Base(rootMountPoint)
|
||||
if _, exists := a.fsStats[rootKey]; exists {
|
||||
rootKey = "root"
|
||||
}
|
||||
slog.Warn("Root I/O device not detected; set FILESYSTEM to override")
|
||||
}
|
||||
a.fsStats[rootKey] = &system.FsStats{Root: true, Mountpoint: rootMountPoint}
|
||||
discovery.addLastResortRootFs()
|
||||
}
|
||||
|
||||
a.pruneDuplicateRootExtraFilesystems()
|
||||
@@ -381,6 +501,8 @@ func normalizeDeviceName(value string) string {
|
||||
|
||||
// Sets start values for disk I/O stats.
|
||||
func (a *Agent) initializeDiskIoStats(diskIoCounters map[string]disk.IOCountersStat) {
|
||||
a.fsNames = a.fsNames[:0]
|
||||
now := time.Now()
|
||||
for device, stats := range a.fsStats {
|
||||
// skip if not in diskIoCounters
|
||||
d, exists := diskIoCounters[device]
|
||||
@@ -389,7 +511,7 @@ func (a *Agent) initializeDiskIoStats(diskIoCounters map[string]disk.IOCountersS
|
||||
continue
|
||||
}
|
||||
// populate initial values
|
||||
stats.Time = time.Now()
|
||||
stats.Time = now
|
||||
stats.TotalRead = d.ReadBytes
|
||||
stats.TotalWrite = d.WriteBytes
|
||||
// add to list of valid io device names
|
||||
|
||||
@@ -93,6 +93,443 @@ func TestParseFilesystemEntry(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtraFilesystemPartitionInfo(t *testing.T) {
|
||||
t.Run("uses partition device for label-only mountpoint", func(t *testing.T) {
|
||||
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
|
||||
Device: "/dev/sdc",
|
||||
Mountpoint: "/extra-filesystems/Share",
|
||||
})
|
||||
|
||||
assert.Equal(t, "/dev/sdc", device)
|
||||
assert.Equal(t, "", customName)
|
||||
})
|
||||
|
||||
t.Run("uses custom name from mountpoint suffix", func(t *testing.T) {
|
||||
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
|
||||
Device: "/dev/sdc",
|
||||
Mountpoint: "/extra-filesystems/sdc__Share",
|
||||
})
|
||||
|
||||
assert.Equal(t, "/dev/sdc", device)
|
||||
assert.Equal(t, "Share", customName)
|
||||
})
|
||||
|
||||
t.Run("falls back to folder device when partition device is unavailable", func(t *testing.T) {
|
||||
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
|
||||
Mountpoint: "/extra-filesystems/sdc__Share",
|
||||
})
|
||||
|
||||
assert.Equal(t, "sdc", device)
|
||||
assert.Equal(t, "Share", customName)
|
||||
})
|
||||
|
||||
t.Run("supports custom name without folder device prefix", func(t *testing.T) {
|
||||
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
|
||||
Device: "/dev/sdc",
|
||||
Mountpoint: "/extra-filesystems/__Share",
|
||||
})
|
||||
|
||||
assert.Equal(t, "/dev/sdc", device)
|
||||
assert.Equal(t, "Share", customName)
|
||||
})
|
||||
}
|
||||
|
||||
func TestBuildFsStatRegistration(t *testing.T) {
|
||||
t.Run("uses basename for non-windows exact io match", func(t *testing.T) {
|
||||
key, stats, ok := registerFilesystemStats(
|
||||
map[string]*system.FsStats{},
|
||||
"/dev/sda1",
|
||||
"/mnt/data",
|
||||
false,
|
||||
"archive",
|
||||
fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"sda1": {Name: "sda1"},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "sda1", key)
|
||||
assert.Equal(t, "/mnt/data", stats.Mountpoint)
|
||||
assert.Equal(t, "archive", stats.Name)
|
||||
assert.False(t, stats.Root)
|
||||
})
|
||||
|
||||
t.Run("maps root partition to io device by prefix", func(t *testing.T) {
|
||||
key, stats, ok := registerFilesystemStats(
|
||||
map[string]*system.FsStats{},
|
||||
"/dev/ada0p2",
|
||||
"/",
|
||||
true,
|
||||
"",
|
||||
fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"ada0": {Name: "ada0", ReadBytes: 1000, WriteBytes: 1000},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "ada0", key)
|
||||
assert.True(t, stats.Root)
|
||||
assert.Equal(t, "/", stats.Mountpoint)
|
||||
})
|
||||
|
||||
t.Run("uses filesystem setting as root fallback", func(t *testing.T) {
|
||||
key, _, ok := registerFilesystemStats(
|
||||
map[string]*system.FsStats{},
|
||||
"overlay",
|
||||
"/",
|
||||
true,
|
||||
"",
|
||||
fsRegistrationContext{
|
||||
filesystem: "nvme0n1p2",
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"nvme0n1": {Name: "nvme0n1", ReadBytes: 1000, WriteBytes: 1000},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "nvme0n1", key)
|
||||
})
|
||||
|
||||
t.Run("prefers parsed extra-filesystems device over mapper device", func(t *testing.T) {
|
||||
key, stats, ok := registerFilesystemStats(
|
||||
map[string]*system.FsStats{},
|
||||
"/dev/mapper/luks-2bcb02be-999d-4417-8d18-5c61e660fb6e",
|
||||
"/extra-filesystems/nvme0n1p2__Archive",
|
||||
false,
|
||||
"Archive",
|
||||
fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"dm-1": {Name: "dm-1", Label: "luks-2bcb02be-999d-4417-8d18-5c61e660fb6e"},
|
||||
"nvme0n1p2": {Name: "nvme0n1p2"},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "nvme0n1p2", key)
|
||||
assert.Equal(t, "Archive", stats.Name)
|
||||
})
|
||||
|
||||
t.Run("falls back to mapper io device when folder device cannot be resolved", func(t *testing.T) {
|
||||
key, stats, ok := registerFilesystemStats(
|
||||
map[string]*system.FsStats{},
|
||||
"/dev/mapper/luks-2bcb02be-999d-4417-8d18-5c61e660fb6e",
|
||||
"/extra-filesystems/Archive",
|
||||
false,
|
||||
"Archive",
|
||||
fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"dm-1": {Name: "dm-1", Label: "luks-2bcb02be-999d-4417-8d18-5c61e660fb6e"},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "dm-1", key)
|
||||
assert.Equal(t, "Archive", stats.Name)
|
||||
})
|
||||
|
||||
t.Run("uses full device name on windows", func(t *testing.T) {
|
||||
key, _, ok := registerFilesystemStats(
|
||||
map[string]*system.FsStats{},
|
||||
`C:`,
|
||||
`C:\\`,
|
||||
false,
|
||||
"",
|
||||
fsRegistrationContext{
|
||||
isWindows: true,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
`C:`: {Name: `C:`},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, `C:`, key)
|
||||
})
|
||||
|
||||
t.Run("skips existing key", func(t *testing.T) {
|
||||
key, stats, ok := registerFilesystemStats(
|
||||
map[string]*system.FsStats{"sda1": {Mountpoint: "/existing"}},
|
||||
"/dev/sda1",
|
||||
"/mnt/data",
|
||||
false,
|
||||
"",
|
||||
fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"sda1": {Name: "sda1"},
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
assert.False(t, ok)
|
||||
assert.Empty(t, key)
|
||||
assert.Nil(t, stats)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAddConfiguredRootFs(t *testing.T) {
|
||||
t.Run("adds root from matching partition", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
rootMountPoint: "/",
|
||||
partitions: []disk.PartitionStat{{Device: "/dev/ada0p2", Mountpoint: "/"}},
|
||||
ctx: fsRegistrationContext{
|
||||
filesystem: "/dev/ada0p2",
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"ada0": {Name: "ada0", ReadBytes: 1000, WriteBytes: 1000},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ok := discovery.addConfiguredRootFs()
|
||||
|
||||
assert.True(t, ok)
|
||||
stats, exists := agent.fsStats["ada0"]
|
||||
assert.True(t, exists)
|
||||
assert.True(t, stats.Root)
|
||||
assert.Equal(t, "/", stats.Mountpoint)
|
||||
})
|
||||
|
||||
t.Run("adds root from io device when partition is missing", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
rootMountPoint: "/sysroot",
|
||||
ctx: fsRegistrationContext{
|
||||
filesystem: "zroot",
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"nda0": {Name: "nda0", Label: "zroot", ReadBytes: 1000, WriteBytes: 1000},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ok := discovery.addConfiguredRootFs()
|
||||
|
||||
assert.True(t, ok)
|
||||
stats, exists := agent.fsStats["nda0"]
|
||||
assert.True(t, exists)
|
||||
assert.True(t, stats.Root)
|
||||
assert.Equal(t, "/sysroot", stats.Mountpoint)
|
||||
})
|
||||
|
||||
t.Run("returns false when filesystem cannot be resolved", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
rootMountPoint: "/",
|
||||
ctx: fsRegistrationContext{
|
||||
filesystem: "missing-disk",
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{},
|
||||
},
|
||||
}
|
||||
|
||||
ok := discovery.addConfiguredRootFs()
|
||||
|
||||
assert.False(t, ok)
|
||||
assert.Empty(t, agent.fsStats)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAddPartitionRootFs(t *testing.T) {
|
||||
t.Run("adds root from fallback partition candidate", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
ctx: fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"nvme0n1": {Name: "nvme0n1", ReadBytes: 1000, WriteBytes: 1000},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ok := discovery.addPartitionRootFs("/dev/nvme0n1p2", "/")
|
||||
|
||||
assert.True(t, ok)
|
||||
stats, exists := agent.fsStats["nvme0n1"]
|
||||
assert.True(t, exists)
|
||||
assert.True(t, stats.Root)
|
||||
assert.Equal(t, "/", stats.Mountpoint)
|
||||
})
|
||||
|
||||
t.Run("returns false when no io device matches", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{agent: agent, ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{}}}
|
||||
|
||||
ok := discovery.addPartitionRootFs("/dev/mapper/root", "/")
|
||||
|
||||
assert.False(t, ok)
|
||||
assert.Empty(t, agent.fsStats)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAddLastResortRootFs(t *testing.T) {
|
||||
t.Run("uses most active io device when available", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{agent: agent, rootMountPoint: "/", ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"sda": {Name: "sda", ReadBytes: 5000, WriteBytes: 5000},
|
||||
"sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000},
|
||||
}}}
|
||||
|
||||
discovery.addLastResortRootFs()
|
||||
|
||||
stats, exists := agent.fsStats["sda"]
|
||||
assert.True(t, exists)
|
||||
assert.True(t, stats.Root)
|
||||
})
|
||||
|
||||
t.Run("falls back to root key when mountpoint basename collides", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: map[string]*system.FsStats{
|
||||
"sysroot": {Mountpoint: "/extra-filesystems/sysroot"},
|
||||
}}
|
||||
discovery := diskDiscovery{agent: agent, rootMountPoint: "/sysroot", ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{}}}
|
||||
|
||||
discovery.addLastResortRootFs()
|
||||
|
||||
stats, exists := agent.fsStats["root"]
|
||||
assert.True(t, exists)
|
||||
assert.True(t, stats.Root)
|
||||
assert.Equal(t, "/sysroot", stats.Mountpoint)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAddConfiguredExtraFsEntry(t *testing.T) {
|
||||
t.Run("uses matching partition when present", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
partitions: []disk.PartitionStat{{Device: "/dev/sdb1", Mountpoint: "/mnt/backup"}},
|
||||
usageFn: func(string) (*disk.UsageStat, error) {
|
||||
t.Fatal("usage fallback should not be called when partition matches")
|
||||
return nil, nil
|
||||
},
|
||||
ctx: fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"sdb1": {Name: "sdb1"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
discovery.addConfiguredExtraFsEntry("sdb1", "backup")
|
||||
|
||||
stats, exists := agent.fsStats["sdb1"]
|
||||
assert.True(t, exists)
|
||||
assert.Equal(t, "/mnt/backup", stats.Mountpoint)
|
||||
assert.Equal(t, "backup", stats.Name)
|
||||
})
|
||||
|
||||
t.Run("falls back to usage-validated path", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
usageFn: func(path string) (*disk.UsageStat, error) {
|
||||
assert.Equal(t, "/srv/archive", path)
|
||||
return &disk.UsageStat{}, nil
|
||||
},
|
||||
ctx: fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"archive": {Name: "archive"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
discovery.addConfiguredExtraFsEntry("/srv/archive", "archive")
|
||||
|
||||
stats, exists := agent.fsStats["archive"]
|
||||
assert.True(t, exists)
|
||||
assert.Equal(t, "/srv/archive", stats.Mountpoint)
|
||||
assert.Equal(t, "archive", stats.Name)
|
||||
})
|
||||
|
||||
t.Run("ignores invalid filesystem entry", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
usageFn: func(string) (*disk.UsageStat, error) {
|
||||
return nil, os.ErrNotExist
|
||||
},
|
||||
}
|
||||
|
||||
discovery.addConfiguredExtraFsEntry("/missing/archive", "")
|
||||
|
||||
assert.Empty(t, agent.fsStats)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAddConfiguredExtraFilesystems(t *testing.T) {
|
||||
t.Run("parses and registers multiple configured filesystems", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
partitions: []disk.PartitionStat{{Device: "/dev/sda1", Mountpoint: "/mnt/fast"}},
|
||||
usageFn: func(path string) (*disk.UsageStat, error) {
|
||||
if path == "/srv/archive" {
|
||||
return &disk.UsageStat{}, nil
|
||||
}
|
||||
return nil, os.ErrNotExist
|
||||
},
|
||||
ctx: fsRegistrationContext{
|
||||
isWindows: false,
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"sda1": {Name: "sda1"},
|
||||
"archive": {Name: "archive"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
discovery.addConfiguredExtraFilesystems("sda1__fast,/srv/archive__cold")
|
||||
|
||||
assert.Contains(t, agent.fsStats, "sda1")
|
||||
assert.Equal(t, "fast", agent.fsStats["sda1"].Name)
|
||||
assert.Contains(t, agent.fsStats, "archive")
|
||||
assert.Equal(t, "cold", agent.fsStats["archive"].Name)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAddExtraFilesystemFolders(t *testing.T) {
|
||||
t.Run("adds missing folders and skips existing mountpoints", func(t *testing.T) {
|
||||
agent := &Agent{fsStats: map[string]*system.FsStats{
|
||||
"existing": {Mountpoint: "/extra-filesystems/existing"},
|
||||
}}
|
||||
discovery := diskDiscovery{
|
||||
agent: agent,
|
||||
ctx: fsRegistrationContext{
|
||||
isWindows: false,
|
||||
efPath: "/extra-filesystems",
|
||||
diskIoCounters: map[string]disk.IOCountersStat{
|
||||
"newdisk": {Name: "newdisk"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
discovery.addExtraFilesystemFolders([]string{"existing", "newdisk__Archive"})
|
||||
|
||||
assert.Len(t, agent.fsStats, 2)
|
||||
stats, exists := agent.fsStats["newdisk"]
|
||||
assert.True(t, exists)
|
||||
assert.Equal(t, "/extra-filesystems/newdisk__Archive", stats.Mountpoint)
|
||||
assert.Equal(t, "Archive", stats.Name)
|
||||
})
|
||||
}
|
||||
|
||||
func TestFindIoDevice(t *testing.T) {
|
||||
t.Run("matches by device name", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
@@ -310,7 +747,7 @@ func TestInitializeDiskInfoWithCustomNames(t *testing.T) {
|
||||
// Test the parsing logic by calling the relevant part
|
||||
// We'll create a simplified version to test just the parsing
|
||||
extraFilesystems := tc.envValue
|
||||
for _, fsEntry := range strings.Split(extraFilesystems, ",") {
|
||||
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
|
||||
// Parse the entry
|
||||
fsEntry = strings.TrimSpace(fsEntry)
|
||||
var fs, customName string
|
||||
@@ -506,3 +943,33 @@ func TestHasSameDiskUsage(t *testing.T) {
|
||||
assert.False(t, hasSameDiskUsage(&disk.UsageStat{Total: 0, Used: 0}, &disk.UsageStat{Total: 1, Used: 1}))
|
||||
})
|
||||
}
|
||||
|
||||
func TestInitializeDiskIoStatsResetsTrackedDevices(t *testing.T) {
|
||||
agent := &Agent{
|
||||
fsStats: map[string]*system.FsStats{
|
||||
"sda": {},
|
||||
"sdb": {},
|
||||
},
|
||||
fsNames: []string{"stale", "sda"},
|
||||
}
|
||||
|
||||
agent.initializeDiskIoStats(map[string]disk.IOCountersStat{
|
||||
"sda": {Name: "sda", ReadBytes: 10, WriteBytes: 20},
|
||||
"sdb": {Name: "sdb", ReadBytes: 30, WriteBytes: 40},
|
||||
})
|
||||
|
||||
assert.ElementsMatch(t, []string{"sda", "sdb"}, agent.fsNames)
|
||||
assert.Len(t, agent.fsNames, 2)
|
||||
assert.Equal(t, uint64(10), agent.fsStats["sda"].TotalRead)
|
||||
assert.Equal(t, uint64(20), agent.fsStats["sda"].TotalWrite)
|
||||
assert.False(t, agent.fsStats["sda"].Time.IsZero())
|
||||
assert.False(t, agent.fsStats["sdb"].Time.IsZero())
|
||||
|
||||
agent.initializeDiskIoStats(map[string]disk.IOCountersStat{
|
||||
"sdb": {Name: "sdb", ReadBytes: 50, WriteBytes: 60},
|
||||
})
|
||||
|
||||
assert.Equal(t, []string{"sdb"}, agent.fsNames)
|
||||
assert.Equal(t, uint64(50), agent.fsStats["sdb"].TotalRead)
|
||||
assert.Equal(t, uint64(60), agent.fsStats["sdb"].TotalWrite)
|
||||
}
|
||||
|
||||
@@ -21,8 +21,7 @@ type hubLike interface {
|
||||
|
||||
type AlertManager struct {
|
||||
hub hubLike
|
||||
alertQueue chan alertTask
|
||||
stopChan chan struct{}
|
||||
stopOnce sync.Once
|
||||
pendingAlerts sync.Map
|
||||
}
|
||||
|
||||
@@ -98,12 +97,9 @@ var supportsTitle = map[string]struct{}{
|
||||
// NewAlertManager creates a new AlertManager instance.
|
||||
func NewAlertManager(app hubLike) *AlertManager {
|
||||
am := &AlertManager{
|
||||
hub: app,
|
||||
alertQueue: make(chan alertTask, 5),
|
||||
stopChan: make(chan struct{}),
|
||||
hub: app,
|
||||
}
|
||||
am.bindEvents()
|
||||
go am.startWorker()
|
||||
return am
|
||||
}
|
||||
|
||||
@@ -112,6 +108,16 @@ func (am *AlertManager) bindEvents() {
|
||||
am.hub.OnRecordAfterUpdateSuccess("alerts").BindFunc(updateHistoryOnAlertUpdate)
|
||||
am.hub.OnRecordAfterDeleteSuccess("alerts").BindFunc(resolveHistoryOnAlertDelete)
|
||||
am.hub.OnRecordAfterUpdateSuccess("smart_devices").BindFunc(am.handleSmartDeviceAlert)
|
||||
|
||||
am.hub.OnServe().BindFunc(func(e *core.ServeEvent) error {
|
||||
if err := resolveStatusAlerts(e.App); err != nil {
|
||||
e.App.Logger().Error("Failed to resolve stale status alerts", "err", err)
|
||||
}
|
||||
if err := am.restorePendingStatusAlerts(); err != nil {
|
||||
e.App.Logger().Error("Failed to restore pending status alerts", "err", err)
|
||||
}
|
||||
return e.Next()
|
||||
})
|
||||
}
|
||||
|
||||
// IsNotificationSilenced checks if a notification should be silenced based on configured quiet hours
|
||||
|
||||
@@ -49,7 +49,7 @@ func TestAlertSilencedOneTime(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Test that alert is silenced
|
||||
silenced := am.IsNotificationSilenced(user.Id, system.Id)
|
||||
@@ -106,7 +106,7 @@ func TestAlertSilencedDaily(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Get current hour and create a window that includes current time
|
||||
now := time.Now().UTC()
|
||||
@@ -170,7 +170,7 @@ func TestAlertSilencedDailyMidnightCrossing(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Create a window that crosses midnight: 22:00 - 02:00
|
||||
startTime := time.Date(2000, 1, 1, 22, 0, 0, 0, time.UTC)
|
||||
@@ -211,7 +211,7 @@ func TestAlertSilencedGlobal(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Create a global quiet hours window (no system specified)
|
||||
now := time.Now().UTC()
|
||||
@@ -250,7 +250,7 @@ func TestAlertSilencedSystemSpecific(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Create a system-specific quiet hours window for system1 only
|
||||
now := time.Now().UTC()
|
||||
@@ -296,7 +296,7 @@ func TestAlertSilencedMultiUser(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Create a quiet hours window for user1 only
|
||||
now := time.Now().UTC()
|
||||
@@ -417,7 +417,7 @@ func TestAlertSilencedNoWindows(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Without any quiet hours windows, alert should NOT be silenced
|
||||
silenced := am.IsNotificationSilenced(user.Id, system.Id)
|
||||
|
||||
@@ -9,63 +9,25 @@ import (
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
)
|
||||
|
||||
type alertTask struct {
|
||||
action string // "schedule" or "cancel"
|
||||
systemName string
|
||||
alertRecord *core.Record
|
||||
delay time.Duration
|
||||
}
|
||||
|
||||
type alertInfo struct {
|
||||
systemName string
|
||||
alertRecord *core.Record
|
||||
expireTime time.Time
|
||||
timer *time.Timer
|
||||
}
|
||||
|
||||
// startWorker is a long-running goroutine that processes alert tasks
|
||||
// every x seconds. It must be running to process status alerts.
|
||||
func (am *AlertManager) startWorker() {
|
||||
processPendingAlerts := time.Tick(15 * time.Second)
|
||||
|
||||
// check for status alerts that are not resolved when system comes up
|
||||
// (can be removed if we figure out core bug in #1052)
|
||||
checkStatusAlerts := time.Tick(561 * time.Second)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-am.stopChan:
|
||||
return
|
||||
case task := <-am.alertQueue:
|
||||
switch task.action {
|
||||
case "schedule":
|
||||
am.pendingAlerts.Store(task.alertRecord.Id, &alertInfo{
|
||||
systemName: task.systemName,
|
||||
alertRecord: task.alertRecord,
|
||||
expireTime: time.Now().Add(task.delay),
|
||||
})
|
||||
case "cancel":
|
||||
am.pendingAlerts.Delete(task.alertRecord.Id)
|
||||
// Stop cancels all pending status alert timers.
|
||||
func (am *AlertManager) Stop() {
|
||||
am.stopOnce.Do(func() {
|
||||
am.pendingAlerts.Range(func(key, value any) bool {
|
||||
info := value.(*alertInfo)
|
||||
if info.timer != nil {
|
||||
info.timer.Stop()
|
||||
}
|
||||
case <-checkStatusAlerts:
|
||||
resolveStatusAlerts(am.hub)
|
||||
case <-processPendingAlerts:
|
||||
// Check for expired alerts every tick
|
||||
now := time.Now()
|
||||
for key, value := range am.pendingAlerts.Range {
|
||||
info := value.(*alertInfo)
|
||||
if now.After(info.expireTime) {
|
||||
// Downtime delay has passed, process alert
|
||||
am.sendStatusAlert("down", info.systemName, info.alertRecord)
|
||||
am.pendingAlerts.Delete(key)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// StopWorker shuts down the AlertManager.worker goroutine
|
||||
func (am *AlertManager) StopWorker() {
|
||||
close(am.stopChan)
|
||||
am.pendingAlerts.Delete(key)
|
||||
return true
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// HandleStatusAlerts manages the logic when system status changes.
|
||||
@@ -103,44 +65,82 @@ func (am *AlertManager) getSystemStatusAlerts(systemID string) ([]*core.Record,
|
||||
return alertRecords, nil
|
||||
}
|
||||
|
||||
// Schedules delayed "down" alerts for each alert record.
|
||||
// handleSystemDown manages the logic when a system status changes to "down". It schedules pending alerts for each alert record.
|
||||
func (am *AlertManager) handleSystemDown(systemName string, alertRecords []*core.Record) {
|
||||
for _, alertRecord := range alertRecords {
|
||||
// Continue if alert is already scheduled
|
||||
if _, exists := am.pendingAlerts.Load(alertRecord.Id); exists {
|
||||
continue
|
||||
}
|
||||
// Schedule by adding to queue
|
||||
min := max(1, alertRecord.GetInt("min"))
|
||||
am.alertQueue <- alertTask{
|
||||
action: "schedule",
|
||||
systemName: systemName,
|
||||
alertRecord: alertRecord,
|
||||
delay: time.Duration(min) * time.Minute,
|
||||
}
|
||||
am.schedulePendingStatusAlert(systemName, alertRecord, time.Duration(min)*time.Minute)
|
||||
}
|
||||
}
|
||||
|
||||
// schedulePendingStatusAlert sets up a timer to send a "down" alert after the specified delay if the system is still down.
|
||||
// It returns true if the alert was scheduled, or false if an alert was already pending for the given alert record.
|
||||
func (am *AlertManager) schedulePendingStatusAlert(systemName string, alertRecord *core.Record, delay time.Duration) bool {
|
||||
alert := &alertInfo{
|
||||
systemName: systemName,
|
||||
alertRecord: alertRecord,
|
||||
expireTime: time.Now().Add(delay),
|
||||
}
|
||||
|
||||
storedAlert, loaded := am.pendingAlerts.LoadOrStore(alertRecord.Id, alert)
|
||||
if loaded {
|
||||
return false
|
||||
}
|
||||
|
||||
stored := storedAlert.(*alertInfo)
|
||||
stored.timer = time.AfterFunc(time.Until(stored.expireTime), func() {
|
||||
am.processPendingAlert(alertRecord.Id)
|
||||
})
|
||||
return true
|
||||
}
|
||||
|
||||
// handleSystemUp manages the logic when a system status changes to "up".
|
||||
// It cancels any pending alerts and sends "up" alerts.
|
||||
func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.Record) {
|
||||
for _, alertRecord := range alertRecords {
|
||||
alertRecordID := alertRecord.Id
|
||||
// If alert exists for record, delete and continue (down alert not sent)
|
||||
if _, exists := am.pendingAlerts.Load(alertRecordID); exists {
|
||||
am.alertQueue <- alertTask{
|
||||
action: "cancel",
|
||||
alertRecord: alertRecord,
|
||||
}
|
||||
if am.cancelPendingAlert(alertRecord.Id) {
|
||||
continue
|
||||
}
|
||||
if !alertRecord.GetBool("triggered") {
|
||||
continue
|
||||
}
|
||||
// No alert scheduled for this record, send "up" alert
|
||||
if err := am.sendStatusAlert("up", systemName, alertRecord); err != nil {
|
||||
am.hub.Logger().Error("Failed to send alert", "err", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cancelPendingAlert stops the timer and removes the pending alert for the given alert ID. Returns true if a pending alert was found and cancelled.
|
||||
func (am *AlertManager) cancelPendingAlert(alertID string) bool {
|
||||
value, loaded := am.pendingAlerts.LoadAndDelete(alertID)
|
||||
if !loaded {
|
||||
return false
|
||||
}
|
||||
|
||||
info := value.(*alertInfo)
|
||||
if info.timer != nil {
|
||||
info.timer.Stop()
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// processPendingAlert sends a "down" alert if the pending alert has expired and the system is still down.
|
||||
func (am *AlertManager) processPendingAlert(alertID string) {
|
||||
value, loaded := am.pendingAlerts.LoadAndDelete(alertID)
|
||||
if !loaded {
|
||||
return
|
||||
}
|
||||
|
||||
info := value.(*alertInfo)
|
||||
if info.alertRecord.GetBool("triggered") {
|
||||
return
|
||||
}
|
||||
if err := am.sendStatusAlert("down", info.systemName, info.alertRecord); err != nil {
|
||||
am.hub.Logger().Error("Failed to send alert", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// sendStatusAlert sends a status alert ("up" or "down") to the users associated with the alert records.
|
||||
func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, alertRecord *core.Record) error {
|
||||
switch alertStatus {
|
||||
@@ -174,8 +174,8 @@ func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, a
|
||||
})
|
||||
}
|
||||
|
||||
// resolveStatusAlerts resolves any status alerts that weren't resolved
|
||||
// when system came up (https://github.com/henrygd/beszel/issues/1052)
|
||||
// resolveStatusAlerts resolves any triggered status alerts that weren't resolved
|
||||
// when system came up (https://github.com/henrygd/beszel/issues/1052).
|
||||
func resolveStatusAlerts(app core.App) error {
|
||||
db := app.DB()
|
||||
// Find all active status alerts where the system is actually up
|
||||
@@ -205,3 +205,36 @@ func resolveStatusAlerts(app core.App) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// restorePendingStatusAlerts re-queues untriggered status alerts for systems that
|
||||
// are still down after a hub restart. This rebuilds the lost in-memory timer state.
|
||||
func (am *AlertManager) restorePendingStatusAlerts() error {
|
||||
type pendingStatusAlert struct {
|
||||
AlertID string `db:"alert_id"`
|
||||
SystemName string `db:"system_name"`
|
||||
}
|
||||
|
||||
var pending []pendingStatusAlert
|
||||
err := am.hub.DB().NewQuery(`
|
||||
SELECT a.id AS alert_id, s.name AS system_name
|
||||
FROM alerts a
|
||||
JOIN systems s ON a.system = s.id
|
||||
WHERE a.name = 'Status'
|
||||
AND a.triggered = false
|
||||
AND s.status = 'down'
|
||||
`).All(&pending)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, item := range pending {
|
||||
alertRecord, err := am.hub.FindRecordById("alerts", item.AlertID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
min := max(1, alertRecord.GetInt("min"))
|
||||
am.schedulePendingStatusAlert(item.SystemName, alertRecord, time.Duration(min)*time.Minute)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
628
internal/alerts/alerts_status_test.go
Normal file
628
internal/alerts/alerts_status_test.go
Normal file
@@ -0,0 +1,628 @@
|
||||
//go:build testing
|
||||
|
||||
package alerts_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"testing/synctest"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/alerts"
|
||||
beszelTests "github.com/henrygd/beszel/internal/tests"
|
||||
"github.com/pocketbase/dbx"
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestStatusAlerts(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused")
|
||||
assert.NoError(t, err)
|
||||
|
||||
var alerts []*core.Record
|
||||
for i, system := range systems {
|
||||
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": system.Id,
|
||||
"user": user.Id,
|
||||
"min": i + 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
alerts = append(alerts, alert)
|
||||
}
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
for _, alert := range alerts {
|
||||
assert.False(t, alert.GetBool("triggered"), "Alert should not be triggered immediately")
|
||||
}
|
||||
if hub.TestMailer.TotalSend() != 0 {
|
||||
assert.Zero(t, hub.TestMailer.TotalSend(), "Expected 0 messages, got %d", hub.TestMailer.TotalSend())
|
||||
}
|
||||
for _, system := range systems {
|
||||
assert.EqualValues(t, "paused", system.GetString("status"), "System should be paused")
|
||||
}
|
||||
for _, system := range systems {
|
||||
system.Set("status", "up")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
|
||||
for _, system := range systems {
|
||||
system.Set("status", "down")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
// after 30 seconds, should have 4 alerts in the pendingAlerts map, no triggered alerts
|
||||
time.Sleep(time.Second * 30)
|
||||
assert.EqualValues(t, 4, hub.GetPendingAlertsCount(), "should have 4 alerts in the pendingAlerts map")
|
||||
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 0, triggeredCount, "should have 0 alert triggered")
|
||||
assert.EqualValues(t, 0, hub.TestMailer.TotalSend(), "should have 0 messages sent")
|
||||
// after 1:30 seconds, should have 1 triggered alert and 3 pending alerts
|
||||
time.Sleep(time.Second * 60)
|
||||
assert.EqualValues(t, 3, hub.GetPendingAlertsCount(), "should have 3 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 1, triggeredCount, "should have 1 alert triggered")
|
||||
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 messages sent")
|
||||
// after 2:30 seconds, should have 2 triggered alerts and 2 pending alerts
|
||||
time.Sleep(time.Second * 60)
|
||||
assert.EqualValues(t, 2, hub.GetPendingAlertsCount(), "should have 2 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, triggeredCount, "should have 2 alert triggered")
|
||||
assert.EqualValues(t, 2, hub.TestMailer.TotalSend(), "should have 2 messages sent")
|
||||
// now we will bring the remaning systems back up
|
||||
for _, system := range systems {
|
||||
system.Set("status", "up")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
// should have 0 alerts in the pendingAlerts map and 0 alerts triggered
|
||||
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.Zero(t, triggeredCount, "should have 0 alert triggered")
|
||||
// 4 messages sent, 2 down alerts and 2 up alerts for first 2 systems
|
||||
assert.EqualValues(t, 4, hub.TestMailer.TotalSend(), "should have 4 messages sent")
|
||||
})
|
||||
}
|
||||
func TestStatusAlertRecoveryBeforeDeadline(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// Ensure user settings have an email
|
||||
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
hub.Save(userSettings)
|
||||
|
||||
// Initial email count
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
|
||||
systemCollection, _ := hub.FindCollectionByNameOrId("systems")
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
hub.Save(system)
|
||||
|
||||
alertCollection, _ := hub.FindCollectionByNameOrId("alerts")
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
hub.Save(alert)
|
||||
|
||||
am := hub.AlertManager
|
||||
|
||||
// 1. System goes down
|
||||
am.HandleStatusAlerts("down", system)
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "Alert should be scheduled")
|
||||
|
||||
// 2. System goes up BEFORE delay expires
|
||||
// Triggering HandleStatusAlerts("up") SHOULD NOT send an alert.
|
||||
am.HandleStatusAlerts("up", system)
|
||||
|
||||
assert.Equal(t, 0, am.GetPendingAlertsCount(), "Alert should be canceled if system recovers before delay expires")
|
||||
|
||||
// Verify that NO email was sent.
|
||||
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "Recovery notification should not be sent if system never went down")
|
||||
|
||||
}
|
||||
|
||||
func TestStatusAlertNormalRecovery(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// Ensure user settings have an email
|
||||
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
hub.Save(userSettings)
|
||||
|
||||
systemCollection, _ := hub.FindCollectionByNameOrId("systems")
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
hub.Save(system)
|
||||
|
||||
alertCollection, _ := hub.FindCollectionByNameOrId("alerts")
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", true) // System was confirmed DOWN
|
||||
hub.Save(alert)
|
||||
|
||||
am := hub.AlertManager
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
|
||||
// System goes up
|
||||
am.HandleStatusAlerts("up", system)
|
||||
|
||||
// Verify that an email WAS sent (normal recovery).
|
||||
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "Recovery notification should be sent if system was triggered as down")
|
||||
|
||||
}
|
||||
|
||||
func TestHandleStatusAlertsDoesNotSendRecoveryWhileDownIsOnlyPending(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "down transition should register a pending alert immediately")
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("up", system))
|
||||
assert.Zero(t, am.GetPendingAlertsCount(), "recovery should cancel the pending down alert")
|
||||
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "recovery notification should not be sent before a down alert triggers")
|
||||
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.False(t, alertRecord.GetBool("triggered"), "alert should remain untriggered when downtime never matured")
|
||||
}
|
||||
|
||||
func TestStatusAlertTimerCancellationPreventsBoundaryDelivery(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "down transition should register a pending alert immediately")
|
||||
require.True(t, am.ResetPendingAlertTimer(alert.Id, 25*time.Millisecond), "test should shorten the pending alert timer")
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
require.NoError(t, am.HandleStatusAlerts("up", system))
|
||||
assert.Zero(t, am.GetPendingAlertsCount(), "recovery should remove the pending alert before the timer callback runs")
|
||||
|
||||
time.Sleep(40 * time.Millisecond)
|
||||
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "timer callback should not deliver after recovery cancels the pending alert")
|
||||
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.False(t, alertRecord.GetBool("triggered"), "alert should remain untriggered when cancellation wins the timer race")
|
||||
|
||||
time.Sleep(time.Minute)
|
||||
synctest.Wait()
|
||||
})
|
||||
}
|
||||
|
||||
func TestStatusAlertDownFiresAfterDelayExpires(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "alert should be pending after system goes down")
|
||||
|
||||
// Expire the pending alert and process it
|
||||
am.ForceExpirePendingAlerts()
|
||||
processed, err := am.ProcessPendingAlerts()
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, processed, 1, "one alert should have been processed")
|
||||
assert.Equal(t, 0, am.GetPendingAlertsCount(), "pending alert should be consumed after processing")
|
||||
|
||||
// Verify down email was sent
|
||||
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "down notification should be sent after delay expires")
|
||||
|
||||
// Verify triggered flag is set in the DB
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.True(t, alertRecord.GetBool("triggered"), "alert should be marked triggered after downtime matures")
|
||||
}
|
||||
|
||||
func TestStatusAlertDuplicateDownCallIsIdempotent(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 5)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "repeated down calls should not schedule duplicate pending alerts")
|
||||
}
|
||||
|
||||
func TestStatusAlertNoAlertRecord(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
// No Status alert record created for this system
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
assert.Equal(t, 0, am.GetPendingAlertsCount(), "no pending alert when no alert record exists")
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("up", system))
|
||||
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "no email when no alert record exists")
|
||||
}
|
||||
|
||||
func TestRestorePendingStatusAlertsRequeuesDownSystemsAfterRestart(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "down")
|
||||
require.NoError(t, err)
|
||||
system := systems[0]
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.RestorePendingStatusAlerts())
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "startup restore should requeue a pending down alert for a system still marked down")
|
||||
|
||||
am.ForceExpirePendingAlerts()
|
||||
processed, err := am.ProcessPendingAlerts()
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, processed, 1, "restored pending alert should be processable after the delay expires")
|
||||
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "restored pending alert should send the down notification")
|
||||
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.True(t, alertRecord.GetBool("triggered"), "restored pending alert should mark the alert as triggered once delivered")
|
||||
}
|
||||
|
||||
func TestRestorePendingStatusAlertsSkipsNonDownOrAlreadyTriggeredAlerts(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systemsDown, err := beszelTests.CreateSystems(hub, 2, user.Id, "down")
|
||||
require.NoError(t, err)
|
||||
systemDownPending := systemsDown[0]
|
||||
systemDownTriggered := systemsDown[1]
|
||||
|
||||
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "up-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.2",
|
||||
"status": "up",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemDownPending.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
"triggered": false,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemUp.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
"triggered": false,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemDownTriggered.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
"triggered": true,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
require.NoError(t, am.RestorePendingStatusAlerts())
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "only untriggered alerts for currently down systems should be restored")
|
||||
}
|
||||
|
||||
func TestRestorePendingStatusAlertsIsIdempotent(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "down")
|
||||
require.NoError(t, err)
|
||||
system := systems[0]
|
||||
|
||||
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": system.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
"triggered": false,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
require.NoError(t, am.RestorePendingStatusAlerts())
|
||||
require.NoError(t, am.RestorePendingStatusAlerts())
|
||||
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "restoring twice should not create duplicate pending alerts")
|
||||
am.ForceExpirePendingAlerts()
|
||||
processed, err := am.ProcessPendingAlerts()
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, processed, 1, "restored alert should still be processable exactly once")
|
||||
assert.Zero(t, am.GetPendingAlertsCount(), "processing the restored alert should empty the pending map")
|
||||
}
|
||||
|
||||
func TestResolveStatusAlertsFixesStaleTriggered(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// CreateSystems uses SaveNoValidate after initial save to bypass the
|
||||
// onRecordCreate hook that forces status = "pending".
|
||||
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
|
||||
require.NoError(t, err)
|
||||
system := systems[0]
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", true) // Stale: system is up but alert still says triggered
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
// resolveStatusAlerts should clear the stale triggered flag
|
||||
require.NoError(t, alerts.ResolveStatusAlerts(hub))
|
||||
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.False(t, alertRecord.GetBool("triggered"), "stale triggered flag should be cleared when system is up")
|
||||
}
|
||||
func TestResolveStatusAlerts(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// Create a systemUp
|
||||
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.1",
|
||||
"status": "up",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system-2",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.2",
|
||||
"status": "up",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Create a status alertUp for the system
|
||||
alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemUp.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemDown.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify alert is not triggered initially
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially")
|
||||
|
||||
// Set the system to 'up' (this should not trigger the alert)
|
||||
systemUp.Set("status", "up")
|
||||
err = hub.SaveNoValidate(systemUp)
|
||||
assert.NoError(t, err)
|
||||
|
||||
systemDown.Set("status", "down")
|
||||
err = hub.SaveNoValidate(systemDown)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Wait a moment for any processing
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
// Verify alertUp is still not triggered after setting system to up
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up")
|
||||
|
||||
// Manually set both alerts triggered to true
|
||||
alertUp.Set("triggered", true)
|
||||
err = hub.SaveNoValidate(alertUp)
|
||||
assert.NoError(t, err)
|
||||
alertDown.Set("triggered", true)
|
||||
err = hub.SaveNoValidate(alertDown)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify we have exactly one alert with triggered true
|
||||
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true")
|
||||
|
||||
// Verify the specific alertUp is triggered
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered")
|
||||
|
||||
// Verify we have two unresolved alert history records
|
||||
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records")
|
||||
|
||||
err = alerts.ResolveStatusAlerts(hub)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify alertUp is not triggered after resolving
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving")
|
||||
// Verify alertDown is still triggered
|
||||
alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving")
|
||||
|
||||
// Verify we have one unresolved alert history record
|
||||
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
|
||||
|
||||
}
|
||||
@@ -12,7 +12,6 @@ import (
|
||||
"testing/synctest"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/alerts"
|
||||
beszelTests "github.com/henrygd/beszel/internal/tests"
|
||||
|
||||
"github.com/pocketbase/dbx"
|
||||
@@ -369,87 +368,6 @@ func TestUserAlertsApi(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestStatusAlerts(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused")
|
||||
assert.NoError(t, err)
|
||||
|
||||
var alerts []*core.Record
|
||||
for i, system := range systems {
|
||||
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": system.Id,
|
||||
"user": user.Id,
|
||||
"min": i + 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
alerts = append(alerts, alert)
|
||||
}
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
for _, alert := range alerts {
|
||||
assert.False(t, alert.GetBool("triggered"), "Alert should not be triggered immediately")
|
||||
}
|
||||
if hub.TestMailer.TotalSend() != 0 {
|
||||
assert.Zero(t, hub.TestMailer.TotalSend(), "Expected 0 messages, got %d", hub.TestMailer.TotalSend())
|
||||
}
|
||||
for _, system := range systems {
|
||||
assert.EqualValues(t, "paused", system.GetString("status"), "System should be paused")
|
||||
}
|
||||
for _, system := range systems {
|
||||
system.Set("status", "up")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
|
||||
for _, system := range systems {
|
||||
system.Set("status", "down")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
// after 30 seconds, should have 4 alerts in the pendingAlerts map, no triggered alerts
|
||||
time.Sleep(time.Second * 30)
|
||||
assert.EqualValues(t, 4, hub.GetPendingAlertsCount(), "should have 4 alerts in the pendingAlerts map")
|
||||
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 0, triggeredCount, "should have 0 alert triggered")
|
||||
assert.EqualValues(t, 0, hub.TestMailer.TotalSend(), "should have 0 messages sent")
|
||||
// after 1:30 seconds, should have 1 triggered alert and 3 pending alerts
|
||||
time.Sleep(time.Second * 60)
|
||||
assert.EqualValues(t, 3, hub.GetPendingAlertsCount(), "should have 3 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 1, triggeredCount, "should have 1 alert triggered")
|
||||
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 messages sent")
|
||||
// after 2:30 seconds, should have 2 triggered alerts and 2 pending alerts
|
||||
time.Sleep(time.Second * 60)
|
||||
assert.EqualValues(t, 2, hub.GetPendingAlertsCount(), "should have 2 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, triggeredCount, "should have 2 alert triggered")
|
||||
assert.EqualValues(t, 2, hub.TestMailer.TotalSend(), "should have 2 messages sent")
|
||||
// now we will bring the remaning systems back up
|
||||
for _, system := range systems {
|
||||
system.Set("status", "up")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
// should have 0 alerts in the pendingAlerts map and 0 alerts triggered
|
||||
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.Zero(t, triggeredCount, "should have 0 alert triggered")
|
||||
// 4 messages sent, 2 down alerts and 2 up alerts for first 2 systems
|
||||
assert.EqualValues(t, 4, hub.TestMailer.TotalSend(), "should have 4 messages sent")
|
||||
})
|
||||
}
|
||||
|
||||
func TestAlertsHistory(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
@@ -578,102 +496,3 @@ func TestAlertsHistory(t *testing.T) {
|
||||
assert.EqualValues(t, 2, totalHistoryCount, "Should have 2 total alert history records")
|
||||
})
|
||||
}
|
||||
func TestResolveStatusAlerts(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// Create a systemUp
|
||||
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.1",
|
||||
"status": "up",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system-2",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.2",
|
||||
"status": "up",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Create a status alertUp for the system
|
||||
alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemUp.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemDown.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify alert is not triggered initially
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially")
|
||||
|
||||
// Set the system to 'up' (this should not trigger the alert)
|
||||
systemUp.Set("status", "up")
|
||||
err = hub.SaveNoValidate(systemUp)
|
||||
assert.NoError(t, err)
|
||||
|
||||
systemDown.Set("status", "down")
|
||||
err = hub.SaveNoValidate(systemDown)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Wait a moment for any processing
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
// Verify alertUp is still not triggered after setting system to up
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up")
|
||||
|
||||
// Manually set both alerts triggered to true
|
||||
alertUp.Set("triggered", true)
|
||||
err = hub.SaveNoValidate(alertUp)
|
||||
assert.NoError(t, err)
|
||||
alertDown.Set("triggered", true)
|
||||
err = hub.SaveNoValidate(alertDown)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify we have exactly one alert with triggered true
|
||||
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true")
|
||||
|
||||
// Verify the specific alertUp is triggered
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered")
|
||||
|
||||
// Verify we have two unresolved alert history records
|
||||
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records")
|
||||
|
||||
err = alerts.ResolveStatusAlerts(hub)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify alertUp is not triggered after resolving
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving")
|
||||
// Verify alertDown is still triggered
|
||||
alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving")
|
||||
|
||||
// Verify we have one unresolved alert history record
|
||||
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
|
||||
|
||||
}
|
||||
|
||||
@@ -9,6 +9,12 @@ import (
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
)
|
||||
|
||||
func NewTestAlertManagerWithoutWorker(app hubLike) *AlertManager {
|
||||
return &AlertManager{
|
||||
hub: app,
|
||||
}
|
||||
}
|
||||
|
||||
func (am *AlertManager) GetAlertManager() *AlertManager {
|
||||
return am
|
||||
}
|
||||
@@ -34,12 +40,11 @@ func (am *AlertManager) ProcessPendingAlerts() ([]*core.Record, error) {
|
||||
am.pendingAlerts.Range(func(key, value any) bool {
|
||||
info := value.(*alertInfo)
|
||||
if now.After(info.expireTime) {
|
||||
// Downtime delay has passed, process alert
|
||||
if err := am.sendStatusAlert("down", info.systemName, info.alertRecord); err != nil {
|
||||
lastErr = err
|
||||
if info.timer != nil {
|
||||
info.timer.Stop()
|
||||
}
|
||||
am.processPendingAlert(key.(string))
|
||||
processedAlerts = append(processedAlerts, info.alertRecord)
|
||||
am.pendingAlerts.Delete(key)
|
||||
}
|
||||
return true
|
||||
})
|
||||
@@ -56,6 +61,27 @@ func (am *AlertManager) ForceExpirePendingAlerts() {
|
||||
})
|
||||
}
|
||||
|
||||
func (am *AlertManager) ResetPendingAlertTimer(alertID string, delay time.Duration) bool {
|
||||
value, loaded := am.pendingAlerts.Load(alertID)
|
||||
if !loaded {
|
||||
return false
|
||||
}
|
||||
|
||||
info := value.(*alertInfo)
|
||||
if info.timer != nil {
|
||||
info.timer.Stop()
|
||||
}
|
||||
info.expireTime = time.Now().Add(delay)
|
||||
info.timer = time.AfterFunc(delay, func() {
|
||||
am.processPendingAlert(alertID)
|
||||
})
|
||||
return true
|
||||
}
|
||||
|
||||
func ResolveStatusAlerts(app core.App) error {
|
||||
return resolveStatusAlerts(app)
|
||||
}
|
||||
|
||||
func (am *AlertManager) RestorePendingStatusAlerts() error {
|
||||
return am.restorePendingStatusAlerts()
|
||||
}
|
||||
|
||||
@@ -98,7 +98,7 @@ func ClearCollection(t testing.TB, app core.App, collectionName string) error {
|
||||
}
|
||||
|
||||
func (h *TestHub) Cleanup() {
|
||||
h.GetAlertManager().StopWorker()
|
||||
h.GetAlertManager().Stop()
|
||||
h.GetSystemManager().RemoveAllSystems()
|
||||
h.TestApp.Cleanup()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user