Compare commits

...

5 Commits

Author SHA1 Message Date
Victor Eduardo
ec7ad632a9 fix: Use historical records to average disk usage for extra disk alerts (#1801)
- Introduced a new test file `alerts_disk_test.go` to validate the behavior of disk alerts using historical data for extra filesystems.
- Enhanced the `HandleSystemAlerts` function to correctly calculate disk usage for extra filesystems based on historical records.
- Updated the `SystemAlertStats` struct to include `ExtraFs` for tracking additional filesystem statistics.
2026-03-09 18:32:35 -04:00
VACInc
963fce5a33 agent: mark mdraid rebuild as warning, not failed (#1797) 2026-03-09 17:54:53 -04:00
Sven van Ginkel
d38c0da06d fix: bypass NIC auto-filter when interface is explicitly whitelisted via NICS (#1805)
Co-authored-by: henrygd <hank@henrygd.me>
2026-03-09 17:47:59 -04:00
henrygd
cae6ac4626 update go version to 1.26.1 2026-03-09 16:10:38 -04:00
henrygd
6b1ff264f2 gpu(amd): add workaround for misreported sysfs filesize (#1799) 2026-03-09 14:53:52 -04:00
10 changed files with 268 additions and 36 deletions

View File

@@ -33,8 +33,8 @@ func (gm *GPUManager) hasAmdSysfs() bool {
return false
}
for _, vendorPath := range cards {
vendor, err := os.ReadFile(vendorPath)
if err == nil && strings.TrimSpace(string(vendor)) == "0x1002" {
vendor, err := utils.ReadStringFileLimited(vendorPath, 64)
if err == nil && vendor == "0x1002" {
return true
}
}
@@ -88,12 +88,11 @@ func (gm *GPUManager) collectAmdStats() error {
// isAmdGpu checks whether a DRM card path belongs to AMD vendor ID 0x1002.
func isAmdGpu(cardPath string) bool {
vendorPath := filepath.Join(cardPath, "device/vendor")
vendor, err := os.ReadFile(vendorPath)
vendor, err := utils.ReadStringFileLimited(filepath.Join(cardPath, "device/vendor"), 64)
if err != nil {
return false
}
return strings.TrimSpace(string(vendor)) == "0x1002"
return vendor == "0x1002"
}
// updateAmdGpuData reads GPU metrics from sysfs and updates the GPU data map.
@@ -155,11 +154,11 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
// readSysfsFloat reads and parses a numeric value from a sysfs file.
func readSysfsFloat(path string) (float64, error) {
val, err := os.ReadFile(path)
val, err := utils.ReadStringFileLimited(path, 64)
if err != nil {
return 0, err
}
return strconv.ParseFloat(strings.TrimSpace(string(val)), 64)
return strconv.ParseFloat(val, 64)
}
// normalizeHexID normalizes hex IDs by trimming spaces, lowercasing, and dropping 0x.
@@ -274,16 +273,16 @@ func cacheMissingAmdgpuName(deviceID, revisionID string) {
// Falls back to showing the raw device ID if not found in the lookup table.
func getAmdGpuName(devicePath string) string {
// Try product_name first (works for some enterprise GPUs)
if prod, err := os.ReadFile(filepath.Join(devicePath, "product_name")); err == nil {
return strings.TrimSpace(string(prod))
if prod, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "product_name"), 128); err == nil {
return prod
}
// Read PCI device ID and look it up
if deviceID, err := os.ReadFile(filepath.Join(devicePath, "device")); err == nil {
id := normalizeHexID(string(deviceID))
if deviceID, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "device"), 64); err == nil {
id := normalizeHexID(deviceID)
revision := ""
if revBytes, revErr := os.ReadFile(filepath.Join(devicePath, "revision")); revErr == nil {
revision = normalizeHexID(string(revBytes))
if rev, revErr := utils.ReadStringFileLimited(filepath.Join(devicePath, "revision"), 64); revErr == nil {
revision = normalizeHexID(rev)
}
if name, found, done := getCachedAmdgpuName(id, revision); found {

View File

@@ -170,11 +170,18 @@ func mdraidSmartStatus(health mdraidHealth) string {
case "inactive", "faulty", "broken", "stopped":
return "FAILED"
}
// During rebuild/recovery, arrays are often temporarily degraded; report as
// warning instead of hard failure while synchronization is in progress.
syncAction := strings.ToLower(strings.TrimSpace(health.syncAction))
switch syncAction {
case "resync", "recover", "reshape":
return "WARNING"
}
if health.degraded > 0 {
return "FAILED"
}
switch strings.ToLower(strings.TrimSpace(health.syncAction)) {
case "resync", "recover", "reshape", "check", "repair":
switch syncAction {
case "check", "repair":
return "WARNING"
}
switch state {

View File

@@ -85,6 +85,9 @@ func TestMdraidSmartStatus(t *testing.T) {
if got := mdraidSmartStatus(mdraidHealth{arrayState: "inactive"}); got != "FAILED" {
t.Fatalf("mdraidSmartStatus(inactive) = %q, want FAILED", got)
}
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1, syncAction: "recover"}); got != "WARNING" {
t.Fatalf("mdraidSmartStatus(degraded+recover) = %q, want WARNING", got)
}
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1}); got != "FAILED" {
t.Fatalf("mdraidSmartStatus(degraded) = %q, want FAILED", got)
}

View File

@@ -104,10 +104,7 @@ func (a *Agent) initializeNetIoStats() {
// get current network I/O stats and record valid interfaces
if netIO, err := psutilNet.IOCounters(true); err == nil {
for _, v := range netIO {
if nicsEnvExists && !isValidNic(v.Name, nicCfg) {
continue
}
if a.skipNetworkInterface(v) {
if skipNetworkInterface(v, nicCfg) {
continue
}
slog.Info("Detected network interface", "name", v.Name, "sent", v.BytesSent, "recv", v.BytesRecv)
@@ -243,7 +240,19 @@ func (a *Agent) applyNetworkTotals(
a.netIoStats[cacheTimeMs] = nis
}
func (a *Agent) skipNetworkInterface(v psutilNet.IOCountersStat) bool {
// skipNetworkInterface returns true if the network interface should be ignored.
func skipNetworkInterface(v psutilNet.IOCountersStat, nicCfg *NicConfig) bool {
if nicCfg != nil {
if !isValidNic(v.Name, nicCfg) {
return true
}
// In whitelist mode, we honor explicit inclusion without auto-filtering.
if !nicCfg.isBlacklist {
return false
}
// In blacklist mode, still apply the auto-filter below.
}
switch {
case strings.HasPrefix(v.Name, "lo"),
strings.HasPrefix(v.Name, "docker"),

View File

@@ -261,6 +261,39 @@ func TestNewNicConfig(t *testing.T) {
})
}
}
func TestSkipNetworkInterface(t *testing.T) {
tests := []struct {
name string
nic psutilNet.IOCountersStat
nicCfg *NicConfig
expectSkip bool
}{
{"loopback lo", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, nil, true},
{"loopback lo0", psutilNet.IOCountersStat{Name: "lo0", BytesSent: 100, BytesRecv: 100}, nil, true},
{"docker prefix", psutilNet.IOCountersStat{Name: "docker0", BytesSent: 100, BytesRecv: 100}, nil, true},
{"br- prefix", psutilNet.IOCountersStat{Name: "br-lan", BytesSent: 100, BytesRecv: 100}, nil, true},
{"veth prefix", psutilNet.IOCountersStat{Name: "veth0abc", BytesSent: 100, BytesRecv: 100}, nil, true},
{"bond prefix", psutilNet.IOCountersStat{Name: "bond0", BytesSent: 100, BytesRecv: 100}, nil, true},
{"cali prefix", psutilNet.IOCountersStat{Name: "cali1234", BytesSent: 100, BytesRecv: 100}, nil, true},
{"zero BytesRecv", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 0}, nil, true},
{"zero BytesSent", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 0, BytesRecv: 100}, nil, true},
{"both zero", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 0, BytesRecv: 0}, nil, true},
{"normal eth0", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 200}, nil, false},
{"normal wlan0", psutilNet.IOCountersStat{Name: "wlan0", BytesSent: 1, BytesRecv: 1}, nil, false},
{"whitelist overrides skip (docker)", psutilNet.IOCountersStat{Name: "docker0", BytesSent: 100, BytesRecv: 100}, newNicConfig("docker0"), false},
{"whitelist overrides skip (lo)", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, newNicConfig("lo"), false},
{"whitelist exclusion", psutilNet.IOCountersStat{Name: "eth1", BytesSent: 100, BytesRecv: 100}, newNicConfig("eth0"), true},
{"blacklist skip lo", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), true},
{"blacklist explicit eth0", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), true},
{"blacklist allow eth1", psutilNet.IOCountersStat{Name: "eth1", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.expectSkip, skipNetworkInterface(tt.nic, tt.nicCfg))
})
}
}
func TestEnsureNetworkInterfacesMap(t *testing.T) {
var a Agent
var stats system.Stats

View File

@@ -1,6 +1,7 @@
package utils
import (
"io"
"math"
"os"
"strconv"
@@ -50,6 +51,23 @@ func ReadStringFileOK(path string) (string, bool) {
return strings.TrimSpace(string(b)), true
}
// ReadStringFileLimited reads a file into a string with a maximum size (in bytes) to avoid
// allocating large buffers and potential panics with pseudo-files when the size is misreported.
func ReadStringFileLimited(path string, maxSize int) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
buf := make([]byte, maxSize)
n, err := f.Read(buf)
if err != nil && err != io.EOF {
return "", err
}
return strings.TrimSpace(string(buf[:n])), nil
}
// FileExists reports whether the given path exists.
func FileExists(path string) bool {
_, err := os.Stat(path)

2
go.mod
View File

@@ -1,6 +1,6 @@
module github.com/henrygd/beszel
go 1.26.0
go 1.26.1
require (
github.com/blang/semver v3.5.1+incompatible

View File

@@ -40,16 +40,22 @@ type UserNotificationSettings struct {
Webhooks []string `json:"webhooks"`
}
type SystemAlertFsStats struct {
DiskTotal float64 `json:"d"`
DiskUsed float64 `json:"du"`
}
type SystemAlertStats struct {
Cpu float64 `json:"cpu"`
Mem float64 `json:"mp"`
Disk float64 `json:"dp"`
NetSent float64 `json:"ns"`
NetRecv float64 `json:"nr"`
GPU map[string]SystemAlertGPUData `json:"g"`
Temperatures map[string]float32 `json:"t"`
LoadAvg [3]float64 `json:"la"`
Battery [2]uint8 `json:"bat"`
Cpu float64 `json:"cpu"`
Mem float64 `json:"mp"`
Disk float64 `json:"dp"`
NetSent float64 `json:"ns"`
NetRecv float64 `json:"nr"`
GPU map[string]SystemAlertGPUData `json:"g"`
Temperatures map[string]float32 `json:"t"`
LoadAvg [3]float64 `json:"la"`
Battery [2]uint8 `json:"bat"`
ExtraFs map[string]SystemAlertFsStats `json:"efs"`
}
type SystemAlertGPUData struct {

View File

@@ -0,0 +1,155 @@
//go:build testing
package alerts_test
import (
"encoding/json"
"testing"
"time"
"github.com/henrygd/beszel/internal/entities/system"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/tools/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestDiskAlertExtraFsMultiMinute tests that multi-minute disk alerts correctly use
// historical per-minute values for extra (non-root) filesystems, not the current live snapshot.
func TestDiskAlertExtraFsMultiMinute(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
systemRecord := systems[0]
// Disk alert: threshold 80%, min=2 (requires historical averaging)
diskAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Disk",
"system": systemRecord.Id,
"user": user.Id,
"value": 80, // threshold: 80%
"min": 2, // 2 minutes - requires historical averaging
})
require.NoError(t, err)
assert.False(t, diskAlert.GetBool("triggered"), "Alert should not be triggered initially")
am := hub.GetAlertManager()
now := time.Now().UTC()
extraFsHigh := map[string]*system.FsStats{
"/mnt/data": {DiskTotal: 1000, DiskUsed: 920}, // 92% - above threshold
}
// Insert 4 historical records spread over 3 minutes (same pattern as battery tests).
// The oldest record must predate (now - 2min) so the alert time window is valid.
recordTimes := []time.Duration{
-180 * time.Second, // 3 min ago - anchors oldest record before alert.time
-90 * time.Second,
-60 * time.Second,
-30 * time.Second,
}
for _, offset := range recordTimes {
stats := system.Stats{
DiskPct: 30, // root disk at 30% - below threshold
ExtraFs: extraFsHigh,
}
statsJSON, _ := json.Marshal(stats)
recordTime := now.Add(offset)
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
"system": systemRecord.Id,
"type": "1m",
"stats": string(statsJSON),
})
require.NoError(t, err)
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
err = hub.SaveNoValidate(record)
require.NoError(t, err)
}
combinedDataHigh := &system.CombinedData{
Stats: system.Stats{
DiskPct: 30,
ExtraFs: extraFsHigh,
},
Info: system.Info{
DiskPct: 30,
},
}
systemRecord.Set("updated", now)
err = hub.SaveNoValidate(systemRecord)
require.NoError(t, err)
err = am.HandleSystemAlerts(systemRecord, combinedDataHigh)
require.NoError(t, err)
time.Sleep(20 * time.Millisecond)
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
require.NoError(t, err)
assert.True(t, diskAlert.GetBool("triggered"),
"Alert SHOULD be triggered when extra disk average (92%%) exceeds threshold (80%%)")
// --- Resolution: extra disk drops to 50%, alert should resolve ---
extraFsLow := map[string]*system.FsStats{
"/mnt/data": {DiskTotal: 1000, DiskUsed: 500}, // 50% - below threshold
}
newNow := now.Add(2 * time.Minute)
recordTimesLow := []time.Duration{
-180 * time.Second,
-90 * time.Second,
-60 * time.Second,
-30 * time.Second,
}
for _, offset := range recordTimesLow {
stats := system.Stats{
DiskPct: 30,
ExtraFs: extraFsLow,
}
statsJSON, _ := json.Marshal(stats)
recordTime := newNow.Add(offset)
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
"system": systemRecord.Id,
"type": "1m",
"stats": string(statsJSON),
})
require.NoError(t, err)
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
err = hub.SaveNoValidate(record)
require.NoError(t, err)
}
combinedDataLow := &system.CombinedData{
Stats: system.Stats{
DiskPct: 30,
ExtraFs: extraFsLow,
},
Info: system.Info{
DiskPct: 30,
},
}
systemRecord.Set("updated", newNow)
err = hub.SaveNoValidate(systemRecord)
require.NoError(t, err)
err = am.HandleSystemAlerts(systemRecord, combinedDataLow)
require.NoError(t, err)
time.Sleep(20 * time.Millisecond)
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
require.NoError(t, err)
assert.False(t, diskAlert.GetBool("triggered"),
"Alert should be resolved when extra disk average (50%%) drops below threshold (80%%)")
}

View File

@@ -195,19 +195,21 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
alert.val += stats.NetSent + stats.NetRecv
case "Disk":
if alert.mapSums == nil {
alert.mapSums = make(map[string]float32, len(data.Stats.ExtraFs)+1)
alert.mapSums = make(map[string]float32, len(stats.ExtraFs)+1)
}
// add root disk
if _, ok := alert.mapSums["root"]; !ok {
alert.mapSums["root"] = 0.0
}
alert.mapSums["root"] += float32(stats.Disk)
// add extra disks
for key, fs := range data.Stats.ExtraFs {
if _, ok := alert.mapSums[key]; !ok {
alert.mapSums[key] = 0.0
// add extra disks from historical record
for key, fs := range stats.ExtraFs {
if fs.DiskTotal > 0 {
if _, ok := alert.mapSums[key]; !ok {
alert.mapSums[key] = 0.0
}
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
}
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
}
case "Temperature":
if alert.mapSums == nil {