mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-21 21:26:16 +01:00
Compare commits
5 Commits
35d0e792ad
...
ec7ad632a9
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ec7ad632a9 | ||
|
|
963fce5a33 | ||
|
|
d38c0da06d | ||
|
|
cae6ac4626 | ||
|
|
6b1ff264f2 |
@@ -33,8 +33,8 @@ func (gm *GPUManager) hasAmdSysfs() bool {
|
||||
return false
|
||||
}
|
||||
for _, vendorPath := range cards {
|
||||
vendor, err := os.ReadFile(vendorPath)
|
||||
if err == nil && strings.TrimSpace(string(vendor)) == "0x1002" {
|
||||
vendor, err := utils.ReadStringFileLimited(vendorPath, 64)
|
||||
if err == nil && vendor == "0x1002" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
@@ -88,12 +88,11 @@ func (gm *GPUManager) collectAmdStats() error {
|
||||
|
||||
// isAmdGpu checks whether a DRM card path belongs to AMD vendor ID 0x1002.
|
||||
func isAmdGpu(cardPath string) bool {
|
||||
vendorPath := filepath.Join(cardPath, "device/vendor")
|
||||
vendor, err := os.ReadFile(vendorPath)
|
||||
vendor, err := utils.ReadStringFileLimited(filepath.Join(cardPath, "device/vendor"), 64)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.TrimSpace(string(vendor)) == "0x1002"
|
||||
return vendor == "0x1002"
|
||||
}
|
||||
|
||||
// updateAmdGpuData reads GPU metrics from sysfs and updates the GPU data map.
|
||||
@@ -155,11 +154,11 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
|
||||
|
||||
// readSysfsFloat reads and parses a numeric value from a sysfs file.
|
||||
func readSysfsFloat(path string) (float64, error) {
|
||||
val, err := os.ReadFile(path)
|
||||
val, err := utils.ReadStringFileLimited(path, 64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return strconv.ParseFloat(strings.TrimSpace(string(val)), 64)
|
||||
return strconv.ParseFloat(val, 64)
|
||||
}
|
||||
|
||||
// normalizeHexID normalizes hex IDs by trimming spaces, lowercasing, and dropping 0x.
|
||||
@@ -274,16 +273,16 @@ func cacheMissingAmdgpuName(deviceID, revisionID string) {
|
||||
// Falls back to showing the raw device ID if not found in the lookup table.
|
||||
func getAmdGpuName(devicePath string) string {
|
||||
// Try product_name first (works for some enterprise GPUs)
|
||||
if prod, err := os.ReadFile(filepath.Join(devicePath, "product_name")); err == nil {
|
||||
return strings.TrimSpace(string(prod))
|
||||
if prod, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "product_name"), 128); err == nil {
|
||||
return prod
|
||||
}
|
||||
|
||||
// Read PCI device ID and look it up
|
||||
if deviceID, err := os.ReadFile(filepath.Join(devicePath, "device")); err == nil {
|
||||
id := normalizeHexID(string(deviceID))
|
||||
if deviceID, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "device"), 64); err == nil {
|
||||
id := normalizeHexID(deviceID)
|
||||
revision := ""
|
||||
if revBytes, revErr := os.ReadFile(filepath.Join(devicePath, "revision")); revErr == nil {
|
||||
revision = normalizeHexID(string(revBytes))
|
||||
if rev, revErr := utils.ReadStringFileLimited(filepath.Join(devicePath, "revision"), 64); revErr == nil {
|
||||
revision = normalizeHexID(rev)
|
||||
}
|
||||
|
||||
if name, found, done := getCachedAmdgpuName(id, revision); found {
|
||||
|
||||
@@ -170,11 +170,18 @@ func mdraidSmartStatus(health mdraidHealth) string {
|
||||
case "inactive", "faulty", "broken", "stopped":
|
||||
return "FAILED"
|
||||
}
|
||||
// During rebuild/recovery, arrays are often temporarily degraded; report as
|
||||
// warning instead of hard failure while synchronization is in progress.
|
||||
syncAction := strings.ToLower(strings.TrimSpace(health.syncAction))
|
||||
switch syncAction {
|
||||
case "resync", "recover", "reshape":
|
||||
return "WARNING"
|
||||
}
|
||||
if health.degraded > 0 {
|
||||
return "FAILED"
|
||||
}
|
||||
switch strings.ToLower(strings.TrimSpace(health.syncAction)) {
|
||||
case "resync", "recover", "reshape", "check", "repair":
|
||||
switch syncAction {
|
||||
case "check", "repair":
|
||||
return "WARNING"
|
||||
}
|
||||
switch state {
|
||||
|
||||
@@ -85,6 +85,9 @@ func TestMdraidSmartStatus(t *testing.T) {
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "inactive"}); got != "FAILED" {
|
||||
t.Fatalf("mdraidSmartStatus(inactive) = %q, want FAILED", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1, syncAction: "recover"}); got != "WARNING" {
|
||||
t.Fatalf("mdraidSmartStatus(degraded+recover) = %q, want WARNING", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1}); got != "FAILED" {
|
||||
t.Fatalf("mdraidSmartStatus(degraded) = %q, want FAILED", got)
|
||||
}
|
||||
|
||||
@@ -104,10 +104,7 @@ func (a *Agent) initializeNetIoStats() {
|
||||
// get current network I/O stats and record valid interfaces
|
||||
if netIO, err := psutilNet.IOCounters(true); err == nil {
|
||||
for _, v := range netIO {
|
||||
if nicsEnvExists && !isValidNic(v.Name, nicCfg) {
|
||||
continue
|
||||
}
|
||||
if a.skipNetworkInterface(v) {
|
||||
if skipNetworkInterface(v, nicCfg) {
|
||||
continue
|
||||
}
|
||||
slog.Info("Detected network interface", "name", v.Name, "sent", v.BytesSent, "recv", v.BytesRecv)
|
||||
@@ -243,7 +240,19 @@ func (a *Agent) applyNetworkTotals(
|
||||
a.netIoStats[cacheTimeMs] = nis
|
||||
}
|
||||
|
||||
func (a *Agent) skipNetworkInterface(v psutilNet.IOCountersStat) bool {
|
||||
// skipNetworkInterface returns true if the network interface should be ignored.
|
||||
func skipNetworkInterface(v psutilNet.IOCountersStat, nicCfg *NicConfig) bool {
|
||||
if nicCfg != nil {
|
||||
if !isValidNic(v.Name, nicCfg) {
|
||||
return true
|
||||
}
|
||||
// In whitelist mode, we honor explicit inclusion without auto-filtering.
|
||||
if !nicCfg.isBlacklist {
|
||||
return false
|
||||
}
|
||||
// In blacklist mode, still apply the auto-filter below.
|
||||
}
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(v.Name, "lo"),
|
||||
strings.HasPrefix(v.Name, "docker"),
|
||||
|
||||
@@ -261,6 +261,39 @@ func TestNewNicConfig(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
func TestSkipNetworkInterface(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
nic psutilNet.IOCountersStat
|
||||
nicCfg *NicConfig
|
||||
expectSkip bool
|
||||
}{
|
||||
{"loopback lo", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"loopback lo0", psutilNet.IOCountersStat{Name: "lo0", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"docker prefix", psutilNet.IOCountersStat{Name: "docker0", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"br- prefix", psutilNet.IOCountersStat{Name: "br-lan", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"veth prefix", psutilNet.IOCountersStat{Name: "veth0abc", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"bond prefix", psutilNet.IOCountersStat{Name: "bond0", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"cali prefix", psutilNet.IOCountersStat{Name: "cali1234", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"zero BytesRecv", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 0}, nil, true},
|
||||
{"zero BytesSent", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 0, BytesRecv: 100}, nil, true},
|
||||
{"both zero", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 0, BytesRecv: 0}, nil, true},
|
||||
{"normal eth0", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 200}, nil, false},
|
||||
{"normal wlan0", psutilNet.IOCountersStat{Name: "wlan0", BytesSent: 1, BytesRecv: 1}, nil, false},
|
||||
{"whitelist overrides skip (docker)", psutilNet.IOCountersStat{Name: "docker0", BytesSent: 100, BytesRecv: 100}, newNicConfig("docker0"), false},
|
||||
{"whitelist overrides skip (lo)", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, newNicConfig("lo"), false},
|
||||
{"whitelist exclusion", psutilNet.IOCountersStat{Name: "eth1", BytesSent: 100, BytesRecv: 100}, newNicConfig("eth0"), true},
|
||||
{"blacklist skip lo", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), true},
|
||||
{"blacklist explicit eth0", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), true},
|
||||
{"blacklist allow eth1", psutilNet.IOCountersStat{Name: "eth1", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), false},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.expectSkip, skipNetworkInterface(tt.nic, tt.nicCfg))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureNetworkInterfacesMap(t *testing.T) {
|
||||
var a Agent
|
||||
var stats system.Stats
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"io"
|
||||
"math"
|
||||
"os"
|
||||
"strconv"
|
||||
@@ -50,6 +51,23 @@ func ReadStringFileOK(path string) (string, bool) {
|
||||
return strings.TrimSpace(string(b)), true
|
||||
}
|
||||
|
||||
// ReadStringFileLimited reads a file into a string with a maximum size (in bytes) to avoid
|
||||
// allocating large buffers and potential panics with pseudo-files when the size is misreported.
|
||||
func ReadStringFileLimited(path string, maxSize int) (string, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
buf := make([]byte, maxSize)
|
||||
n, err := f.Read(buf)
|
||||
if err != nil && err != io.EOF {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(string(buf[:n])), nil
|
||||
}
|
||||
|
||||
// FileExists reports whether the given path exists.
|
||||
func FileExists(path string) bool {
|
||||
_, err := os.Stat(path)
|
||||
|
||||
2
go.mod
2
go.mod
@@ -1,6 +1,6 @@
|
||||
module github.com/henrygd/beszel
|
||||
|
||||
go 1.26.0
|
||||
go 1.26.1
|
||||
|
||||
require (
|
||||
github.com/blang/semver v3.5.1+incompatible
|
||||
|
||||
@@ -40,16 +40,22 @@ type UserNotificationSettings struct {
|
||||
Webhooks []string `json:"webhooks"`
|
||||
}
|
||||
|
||||
type SystemAlertFsStats struct {
|
||||
DiskTotal float64 `json:"d"`
|
||||
DiskUsed float64 `json:"du"`
|
||||
}
|
||||
|
||||
type SystemAlertStats struct {
|
||||
Cpu float64 `json:"cpu"`
|
||||
Mem float64 `json:"mp"`
|
||||
Disk float64 `json:"dp"`
|
||||
NetSent float64 `json:"ns"`
|
||||
NetRecv float64 `json:"nr"`
|
||||
GPU map[string]SystemAlertGPUData `json:"g"`
|
||||
Temperatures map[string]float32 `json:"t"`
|
||||
LoadAvg [3]float64 `json:"la"`
|
||||
Battery [2]uint8 `json:"bat"`
|
||||
Cpu float64 `json:"cpu"`
|
||||
Mem float64 `json:"mp"`
|
||||
Disk float64 `json:"dp"`
|
||||
NetSent float64 `json:"ns"`
|
||||
NetRecv float64 `json:"nr"`
|
||||
GPU map[string]SystemAlertGPUData `json:"g"`
|
||||
Temperatures map[string]float32 `json:"t"`
|
||||
LoadAvg [3]float64 `json:"la"`
|
||||
Battery [2]uint8 `json:"bat"`
|
||||
ExtraFs map[string]SystemAlertFsStats `json:"efs"`
|
||||
}
|
||||
|
||||
type SystemAlertGPUData struct {
|
||||
|
||||
155
internal/alerts/alerts_disk_test.go
Normal file
155
internal/alerts/alerts_disk_test.go
Normal file
@@ -0,0 +1,155 @@
|
||||
//go:build testing
|
||||
|
||||
package alerts_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
beszelTests "github.com/henrygd/beszel/internal/tests"
|
||||
|
||||
"github.com/pocketbase/dbx"
|
||||
"github.com/pocketbase/pocketbase/tools/types"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestDiskAlertExtraFsMultiMinute tests that multi-minute disk alerts correctly use
|
||||
// historical per-minute values for extra (non-root) filesystems, not the current live snapshot.
|
||||
func TestDiskAlertExtraFsMultiMinute(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
|
||||
require.NoError(t, err)
|
||||
systemRecord := systems[0]
|
||||
|
||||
// Disk alert: threshold 80%, min=2 (requires historical averaging)
|
||||
diskAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Disk",
|
||||
"system": systemRecord.Id,
|
||||
"user": user.Id,
|
||||
"value": 80, // threshold: 80%
|
||||
"min": 2, // 2 minutes - requires historical averaging
|
||||
})
|
||||
require.NoError(t, err)
|
||||
assert.False(t, diskAlert.GetBool("triggered"), "Alert should not be triggered initially")
|
||||
|
||||
am := hub.GetAlertManager()
|
||||
now := time.Now().UTC()
|
||||
|
||||
extraFsHigh := map[string]*system.FsStats{
|
||||
"/mnt/data": {DiskTotal: 1000, DiskUsed: 920}, // 92% - above threshold
|
||||
}
|
||||
|
||||
// Insert 4 historical records spread over 3 minutes (same pattern as battery tests).
|
||||
// The oldest record must predate (now - 2min) so the alert time window is valid.
|
||||
recordTimes := []time.Duration{
|
||||
-180 * time.Second, // 3 min ago - anchors oldest record before alert.time
|
||||
-90 * time.Second,
|
||||
-60 * time.Second,
|
||||
-30 * time.Second,
|
||||
}
|
||||
|
||||
for _, offset := range recordTimes {
|
||||
stats := system.Stats{
|
||||
DiskPct: 30, // root disk at 30% - below threshold
|
||||
ExtraFs: extraFsHigh,
|
||||
}
|
||||
statsJSON, _ := json.Marshal(stats)
|
||||
|
||||
recordTime := now.Add(offset)
|
||||
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
|
||||
"system": systemRecord.Id,
|
||||
"type": "1m",
|
||||
"stats": string(statsJSON),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
|
||||
err = hub.SaveNoValidate(record)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
combinedDataHigh := &system.CombinedData{
|
||||
Stats: system.Stats{
|
||||
DiskPct: 30,
|
||||
ExtraFs: extraFsHigh,
|
||||
},
|
||||
Info: system.Info{
|
||||
DiskPct: 30,
|
||||
},
|
||||
}
|
||||
|
||||
systemRecord.Set("updated", now)
|
||||
err = hub.SaveNoValidate(systemRecord)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = am.HandleSystemAlerts(systemRecord, combinedDataHigh)
|
||||
require.NoError(t, err)
|
||||
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
|
||||
require.NoError(t, err)
|
||||
assert.True(t, diskAlert.GetBool("triggered"),
|
||||
"Alert SHOULD be triggered when extra disk average (92%%) exceeds threshold (80%%)")
|
||||
|
||||
// --- Resolution: extra disk drops to 50%, alert should resolve ---
|
||||
|
||||
extraFsLow := map[string]*system.FsStats{
|
||||
"/mnt/data": {DiskTotal: 1000, DiskUsed: 500}, // 50% - below threshold
|
||||
}
|
||||
|
||||
newNow := now.Add(2 * time.Minute)
|
||||
recordTimesLow := []time.Duration{
|
||||
-180 * time.Second,
|
||||
-90 * time.Second,
|
||||
-60 * time.Second,
|
||||
-30 * time.Second,
|
||||
}
|
||||
|
||||
for _, offset := range recordTimesLow {
|
||||
stats := system.Stats{
|
||||
DiskPct: 30,
|
||||
ExtraFs: extraFsLow,
|
||||
}
|
||||
statsJSON, _ := json.Marshal(stats)
|
||||
|
||||
recordTime := newNow.Add(offset)
|
||||
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
|
||||
"system": systemRecord.Id,
|
||||
"type": "1m",
|
||||
"stats": string(statsJSON),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
|
||||
err = hub.SaveNoValidate(record)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
combinedDataLow := &system.CombinedData{
|
||||
Stats: system.Stats{
|
||||
DiskPct: 30,
|
||||
ExtraFs: extraFsLow,
|
||||
},
|
||||
Info: system.Info{
|
||||
DiskPct: 30,
|
||||
},
|
||||
}
|
||||
|
||||
systemRecord.Set("updated", newNow)
|
||||
err = hub.SaveNoValidate(systemRecord)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = am.HandleSystemAlerts(systemRecord, combinedDataLow)
|
||||
require.NoError(t, err)
|
||||
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
|
||||
require.NoError(t, err)
|
||||
assert.False(t, diskAlert.GetBool("triggered"),
|
||||
"Alert should be resolved when extra disk average (50%%) drops below threshold (80%%)")
|
||||
}
|
||||
@@ -195,19 +195,21 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
|
||||
alert.val += stats.NetSent + stats.NetRecv
|
||||
case "Disk":
|
||||
if alert.mapSums == nil {
|
||||
alert.mapSums = make(map[string]float32, len(data.Stats.ExtraFs)+1)
|
||||
alert.mapSums = make(map[string]float32, len(stats.ExtraFs)+1)
|
||||
}
|
||||
// add root disk
|
||||
if _, ok := alert.mapSums["root"]; !ok {
|
||||
alert.mapSums["root"] = 0.0
|
||||
}
|
||||
alert.mapSums["root"] += float32(stats.Disk)
|
||||
// add extra disks
|
||||
for key, fs := range data.Stats.ExtraFs {
|
||||
if _, ok := alert.mapSums[key]; !ok {
|
||||
alert.mapSums[key] = 0.0
|
||||
// add extra disks from historical record
|
||||
for key, fs := range stats.ExtraFs {
|
||||
if fs.DiskTotal > 0 {
|
||||
if _, ok := alert.mapSums[key]; !ok {
|
||||
alert.mapSums[key] = 0.0
|
||||
}
|
||||
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
|
||||
}
|
||||
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
|
||||
}
|
||||
case "Temperature":
|
||||
if alert.mapSums == nil {
|
||||
|
||||
Reference in New Issue
Block a user