Compare commits

..

78 Commits

Author SHA1 Message Date
henrygd
e4e0affbc1 test(hub): add additional tests for all system alerts 2026-03-17 18:48:54 -04:00
henrygd
c3a0e645ee refactor: variable renaming in alerts package 2026-03-17 18:44:46 -04:00
henrygd
c6c3950fb0 refactor: add alertsCache to maintain active alert data in memory 2026-03-17 18:32:57 -04:00
henrygd
48ddc96a0d systemd: allow timer monitoring with SERVICE_PATTERNS (#1820) 2026-03-17 15:11:44 -04:00
henrygd
704cb86de8 refactor: change ExpiryMap.store to be a pointer 2026-03-16 17:44:45 -04:00
henrygd
2854ce882f fix(ui): centralize default layout width and update default setting 2026-03-16 15:23:32 -04:00
henrygd
ed50367f70 fix(agent): add fallback for podman container health (#1475) 2026-03-15 17:59:59 -04:00
henrygd
4ebe869591 ui: virtualize smart table 2026-03-15 15:20:07 -04:00
henrygd
c9bbbe91f2 ui: improve table col widths and hide text showing above header 2026-03-15 14:59:25 -04:00
henrygd
5bfe4f6970 agent: include ip in container port if not 0.0.0.0 or :: 2026-03-15 14:58:21 -04:00
henrygd
380d2b1091 add ports column to containers table (#1481) 2026-03-14 19:29:39 -04:00
henrygd
a7f99e7a8c agent: support new Docker API Health field (#1475) 2026-03-14 15:26:44 -04:00
henrygd
bd94a9d142 agent: improve disk discovery / IO mapping and add tests (#1811) 2026-03-13 16:03:27 -04:00
henrygd
8e2316f845 refactor: simplify/improve status alert handling (#1519)
also adds new functionality to restore any pending down alerts
that were lost by hub restart before creation
2026-03-12 15:53:40 -04:00
Sven van Ginkel
0d3dfcb207 fix(hub): check if status alert is triggered before sending up alert (#1806) 2026-03-12 13:38:42 -04:00
henrygd
b386ce5190 hub: add ExpiryMap.UpdateExpiration and sync SMART fetch intervals (#1800)
- Update smartFetchMap expiration when agent smart interval changes
- Prevent background SMART fetching before initial system details are
loaded
- Add buffer to SMART fetch timing check
- Get rid of unnecessary pointers in expirymap
2026-03-11 16:25:52 -04:00
henrygd
e527534016 ensure deprecated system fields are migrated to newer structures
also removes refs to legacy load avg fields (l1, l5, l15) that were
around for a very short period
2026-03-10 18:46:57 -04:00
Victor Eduardo
ec7ad632a9 fix: Use historical records to average disk usage for extra disk alerts (#1801)
- Introduced a new test file `alerts_disk_test.go` to validate the behavior of disk alerts using historical data for extra filesystems.
- Enhanced the `HandleSystemAlerts` function to correctly calculate disk usage for extra filesystems based on historical records.
- Updated the `SystemAlertStats` struct to include `ExtraFs` for tracking additional filesystem statistics.
2026-03-09 18:32:35 -04:00
VACInc
963fce5a33 agent: mark mdraid rebuild as warning, not failed (#1797) 2026-03-09 17:54:53 -04:00
Sven van Ginkel
d38c0da06d fix: bypass NIC auto-filter when interface is explicitly whitelisted via NICS (#1805)
Co-authored-by: henrygd <hank@henrygd.me>
2026-03-09 17:47:59 -04:00
henrygd
cae6ac4626 update go version to 1.26.1 2026-03-09 16:10:38 -04:00
henrygd
6b1ff264f2 gpu(amd): add workaround for misreported sysfs filesize (#1799) 2026-03-09 14:53:52 -04:00
henrygd
35d0e792ad refactor(expirymap): optimize performance and add StopCleaner method 2026-03-08 19:09:41 -04:00
henrygd
654cd06b19 respect SMART_INTERVAL across agent reconnects (#1800)
Move tracking of the last SMART data fetch from individual System
instances to the SystemManager using a TTL-based ExpiryMap.

This ensures that the SMART_INTERVAL is respected even if an
agent connection is dropped and re-established, preventing
redundant data collection on every reconnect.
2026-03-08 19:03:50 -04:00
henrygd
5e1b028130 refactor(smart): improve perf by skipping ata_device_statistics parsing if unnecessary 2026-03-08 15:19:50 -04:00
henrygd
638e7dc12a fix(smart): handle negative ATA device statistics values (#1791) 2026-03-08 13:34:16 -04:00
henrygd
73c262455d refactor(agent): move GetEnv to utils package 2026-03-07 14:12:17 -05:00
henrygd
0c4d2edd45 refactor(agent): add utils package; rm utils.go and fs_utils.go 2026-03-07 13:50:49 -05:00
henrygd
8f23fff1c9 refactor: mdraid comments and organization
also hide serial / firmware in smart details if empty, remove a few
unnecessary ops, and add a few more passed state values
2026-02-27 14:23:10 -05:00
VACInc
02c1a0c13d Add Linux mdraid health monitoring (#1750) 2026-02-27 13:42:47 -05:00
henrygd
69fdcb36ab support ZFS ARC on freebsd 2026-02-26 18:38:54 -05:00
henrygd
b91eb6de40 improve root I/O device detection and fallback (#1772)
- Match FILESYSTEM directly against I/O devices if partition lookup
fails
- Fall back to the most active I/O device if no root device is detected
- Add WARN logs in final fallback case to most active device
2026-02-26 18:11:33 -05:00
henrygd
ec69f6c6e0 improve disk I/O device matching for partition-to-disk mismatches (#1772)
findIoDevice now normalizes device names and falls back to prefix-based
matching when partition names differ from IOCounter names (e.g. nda0p2 →
nda0 on FreeBSD). The most-active prefix-related device is selected,
avoiding the broad "most active of all" heuristic that caused Docker
misattribution in #1737.
2026-02-26 16:59:12 -05:00
henrygd
a86cb91e07 improve install scripts with retries, validation, and better error messages
Add curl retries/timeouts, archive integrity checks, binary existence
checks, and temp dir cleanup on all failure paths. Unify --mirror flag
handling in hub script to match agent. Use cat instead of tee for
systemd service file, quiet systemctl output.
2026-02-26 12:29:05 -05:00
henrygd
004841717a add checks for non-empty CPU times during initialization (#401) 2026-02-25 19:04:29 -05:00
henrygd
096296ba7b fix: ensure rc.d directory exists for minimal FreeBSD installs in install-agent.sh 2026-02-25 16:22:37 -05:00
ilya
b012df5669 Fix volume path in Docker run command (#1764) 2026-02-24 15:47:16 -05:00
henrygd
12545b4b6d fix: dedupe root-mirrored extra filesystems during disk discovery (#1428) 2026-02-24 15:41:29 -05:00
henrygd
9e2296452b fix: compute bandwidth alerts from byte-per-second source (#1770)
Use Info.BandwidthBytes converted to MB/s with float division so
bandwidth alert checks are based on current data without integer
truncation near thresholds.
2026-02-24 13:07:27 -05:00
henrygd
ac79860d4a dev: update biome schema and disable assist/source/organizeImports 2026-02-20 15:50:44 -05:00
henrygd
e13a99fdac ui: add fallback to display language code if no emoji / flag 2026-02-20 15:46:24 -05:00
henrygd
4cfb2a86ad 0.18.4 release 2026-02-20 15:00:15 -05:00
henrygd
191f25f6e0 ui: refactor heartbeat settings page 2026-02-20 14:48:59 -05:00
henrygd
aa8b3711d7 update translations 2026-02-19 19:22:54 -05:00
henrygd
1fb0b25988 testing: improve flaky hub cleanup in agent_connect_test.go 2026-02-19 18:35:31 -05:00
henrygd
04600d83cc refactor: small go 1.26 updates and go fix changes 2026-02-19 18:04:33 -05:00
henrygd
5d8906c9b2 amd gpu: small refactor + trim "series" from device name 2026-02-19 17:39:13 -05:00
henrygd
daac287b9d ui: fix race issue with meter threshold colors
also increase the default container width
2026-02-19 17:37:57 -05:00
henrygd
d526ea61a9 ui: freeze header of smart device details table 2026-02-19 17:35:12 -05:00
henrygd
79616e1662 update translations 2026-02-19 16:21:59 -05:00
Sven van Ginkel
01e8bdf040 feat: allow precise value entry for alerts via text input (#1718) 2026-02-19 13:15:12 -05:00
henrygd
1e3a44e05d agent: improve multiplexed logs detection for podman (#1755) 2026-02-18 17:45:37 -05:00
henrygd
311095cfdd harden against docker api path traversal
Validate container IDs (12-64 hex) in hub container endpoints and agent
Docker requests, and build Docker URLs with escaped path segments. Add
regression tests for traversal/malformed container inputs and safe
endpoint construction.
2026-02-18 17:33:00 -05:00
henrygd
4869c834bb fix(ui): update bandwidth fallback to 0 when data is empty (avoid NaN) 2026-02-18 16:28:18 -05:00
henrygd
e1c1e97f0a chore: update go version / go deps / changelog 2026-02-18 16:17:05 -05:00
henrygd
f6b2824ccc rename gpu_apple_unsupported.go to gpu_darwin_unsupported.go 2026-02-18 15:15:58 -05:00
henrygd
f17ffc21b8 gate apple gpu collectors + revert readme change 2026-02-18 14:57:41 -05:00
Robert Accettura
f792f9b102 Mac GPU Stats (#1747) 2026-02-18 14:51:30 -05:00
henrygd
1def7d8d3a agent: add dockerManager.retrySleep method to mock time.Sleep in tests 2026-02-18 13:45:03 -05:00
Elio Di Nino
ef92b254bf fix(agent): Retry Docker check on non-200 HTTP response (#1754)
The previous behavior only caught some errors including inaccessible
hosts, but not others like failed authentication or service
unavailability. This largely applies when using a socket proxy and
having the retry mitigates some erroneous behavior.
2026-02-18 13:42:58 -05:00
henrygd
10d853c004 heartbeat: tweaks and tests (#1729) 2026-02-17 16:12:29 -05:00
Amir Moradi
cdfd116da0 Add outbound heartbeat monitoring (#1729)
* feat: add outbound heartbeat monitoring to external endpoints

Allow Beszel hub to periodically ping an external monitoring service
(e.g. BetterStack, Uptime Kuma, Healthchecks.io) with system status
summaries, enabling monitoring without exposing Beszel to the internet.

Configuration via environment variables:
- BESZEL_HUB_HEARTBEAT_URL: endpoint to ping (required to enable)
- BESZEL_HUB_HEARTBEAT_INTERVAL: seconds between pings (default: 60)
- BESZEL_HUB_HEARTBEAT_METHOD: HTTP method - POST/GET/HEAD (default: POST)
2026-02-17 15:48:20 -05:00
henrygd
283fa9d5c2 include GTT memory in AMD GPU metrics (#1569) 2026-02-13 20:06:37 -05:00
henrygd
7d6c0caafc add amdgpu.ids to docker images (#1569) 2026-02-13 19:55:02 -05:00
henrygd
04d54a3efc update sysfs amd collector to pull pretty name from amdgpu.ids (#1569) 2026-02-13 19:41:40 -05:00
henrygd
14ecb1b069 add nvtop integration and introduce GPU_COLLECTOR env var 2026-02-13 19:41:40 -05:00
henrygd
1f1a448aef ui: small refactoring / auto formatting 2026-02-12 18:40:16 -05:00
VACInc
e816ea143a SMART: add eMMC health via sysfs (#1736)
* SMART: add eMMC health via sysfs

Read eMMC wear/EOL indicators from /sys/class/block/mmcblk*/device and expose in SMART device list. Includes mocked sysfs tests and UI tweaks for unknown temps.

* small optimizations for emmc scan and parsing

* smart: keep smartctl optional only for Linux hosts with eMMC

* update smart alerts to handle warning state

* refactor: rename binPath to smartctlPath and replace hasSmartctl with smartctlPath checks

---------

Co-authored-by: henrygd <hank@henrygd.me>
2026-02-12 15:27:42 -05:00
Sven van Ginkel
2230097dc7 chore: update inactivity-actions (#1742) 2026-02-12 12:29:22 -05:00
henrygd
25c77c5664 make: auto-apply glibc tag for agent on linux/amd64 glibc 2026-02-11 13:49:29 -05:00
henrygd
dba3519b2c fix(agent): avoid mismatched root disk I/O mapping in docker (#1737)
- Stop using max-read fallback when mapping root filesystem to
diskstats.
- Keep root usage reporting even when root I/O cannot be mapped.
- Expand docker fallback mount detection to include /etc/resolv.conf and
/etc/hostname (in addition to /etc/hosts).
- Add clearer warnings when root block device detection is uncertain.
2026-02-10 18:12:04 -05:00
henrygd
48c35aa54d update to go 1.25.7 (fixes GO-2026-4337) 2026-02-06 14:35:53 -05:00
Sven van Ginkel
6b7845b03e feat: add fingerprint command to agent (#1726)
Co-authored-by: henrygd <hank@henrygd.me>
2026-02-06 14:32:57 -05:00
Sven van Ginkel
221be1da58 Add version flag insteaf of subcommand (#1639) 2026-02-05 20:36:57 -05:00
Sven van Ginkel
8347afd68e feat: add uptime to table (#1719) 2026-02-04 20:18:28 -05:00
henrygd
2a3885a52e add check to make sure fingerprint file isn't empty (#1714) 2026-02-04 20:05:07 -05:00
henrygd
5452e50080 add DISABLE_SSH env var (#1061) 2026-02-04 18:48:55 -05:00
henrygd
028f7bafb2 add InstallMethod parameter to Windows install script
Allows users to explicitly choose Scoop or WinGet for installation
instead of relying on auto-detection. Useful when both package
managers are installed but the user prefers one over the other.
2026-02-02 14:30:08 -05:00
166 changed files with 12879 additions and 1655 deletions

View File

@@ -6,6 +6,7 @@ on:
workflow_dispatch:
permissions:
actions: write
issues: write
pull-requests: write
@@ -48,6 +49,9 @@ jobs:
# Action can not skip PRs, set it to 100 years to cover it.
days-before-pr-stale: 36524
# Max issues to process before early exit. Next run resumes from cache. GH API limit: 5000.
operations-per-run: 1500
# Labels
stale-issue-label: 'stale'
remove-stale-when-updated: true
@@ -56,4 +60,5 @@ jobs:
# Exemptions
exempt-assignees: true
exempt-milestones: true
exempt-milestones: true

1
.gitignore vendored
View File

@@ -10,6 +10,7 @@ dist
*.exe
internal/cmd/hub/hub
internal/cmd/agent/agent
agent.test
node_modules
build
*timestamp*

View File

@@ -3,6 +3,40 @@ OS ?= $(shell go env GOOS)
ARCH ?= $(shell go env GOARCH)
# Skip building the web UI if true
SKIP_WEB ?= false
# Controls NVML/glibc agent build tag behavior:
# - auto (default): enable on linux/amd64 glibc hosts
# - true: always enable
# - false: always disable
NVML ?= auto
# Detect glibc host for local linux/amd64 builds.
HOST_GLIBC := $(shell \
if [ "$(OS)" = "linux" ] && [ "$(ARCH)" = "amd64" ]; then \
for p in /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2; do \
[ -e "$$p" ] && { echo true; exit 0; }; \
done; \
if command -v ldd >/dev/null 2>&1; then \
if ldd --version 2>&1 | tr '[:upper:]' '[:lower:]' | awk '/gnu libc|glibc/{found=1} END{exit !found}'; then \
echo true; \
else \
echo false; \
fi; \
else \
echo false; \
fi; \
else \
echo false; \
fi)
# Enable glibc build tag for NVML on supported Linux builds.
AGENT_GO_TAGS :=
ifeq ($(NVML),true)
AGENT_GO_TAGS := -tags glibc
else ifeq ($(NVML),auto)
ifeq ($(HOST_GLIBC),true)
AGENT_GO_TAGS := -tags glibc
endif
endif
# Set executable extension based on target OS
EXE_EXT := $(if $(filter windows,$(OS)),.exe,)
@@ -17,7 +51,6 @@ clean:
lint:
golangci-lint run
test: export GOEXPERIMENT=synctest
test:
go test -tags=testing ./...
@@ -54,7 +87,7 @@ fetch-smartctl-conditional:
# Update build-agent to include conditional .NET build
build-agent: tidy build-dotnet-conditional fetch-smartctl-conditional
GOOS=$(OS) GOARCH=$(ARCH) go build -o ./build/beszel-agent_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/agent
GOOS=$(OS) GOARCH=$(ARCH) go build $(AGENT_GO_TAGS) -o ./build/beszel-agent_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/agent
build-hub: tidy $(if $(filter false,$(SKIP_WEB)),build-web-ui)
GOOS=$(OS) GOARCH=$(ARCH) go build -o ./build/beszel_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/hub
@@ -90,9 +123,9 @@ dev-hub:
dev-agent:
@if command -v entr >/dev/null 2>&1; then \
find ./internal/cmd/agent/*.go ./agent/*.go | entr -r go run github.com/henrygd/beszel/internal/cmd/agent; \
find ./internal/cmd/agent/*.go ./agent/*.go | entr -r go run $(AGENT_GO_TAGS) github.com/henrygd/beszel/internal/cmd/agent; \
else \
go run github.com/henrygd/beszel/internal/cmd/agent; \
go run $(AGENT_GO_TAGS) github.com/henrygd/beszel/internal/cmd/agent; \
fi
build-dotnet:

View File

@@ -5,11 +5,7 @@
package agent
import (
"crypto/sha256"
"encoding/hex"
"log/slog"
"os"
"path/filepath"
"strings"
"sync"
"time"
@@ -17,9 +13,9 @@ import (
"github.com/gliderlabs/ssh"
"github.com/henrygd/beszel"
"github.com/henrygd/beszel/agent/deltatracker"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/common"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/shirou/gopsutil/v4/host"
gossh "golang.org/x/crypto/ssh"
)
@@ -65,18 +61,18 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
agent.netIoStats = make(map[uint16]system.NetIoStats)
agent.netInterfaceDeltaTrackers = make(map[uint16]*deltatracker.DeltaTracker[string, uint64])
agent.dataDir, err = getDataDir(dataDir...)
agent.dataDir, err = GetDataDir(dataDir...)
if err != nil {
slog.Warn("Data directory not found")
} else {
slog.Info("Data directory", "path", agent.dataDir)
}
agent.memCalc, _ = GetEnv("MEM_CALC")
agent.memCalc, _ = utils.GetEnv("MEM_CALC")
agent.sensorConfig = agent.newSensorConfig()
// Parse disk usage cache duration (e.g., "15m", "1h") to avoid waking sleeping disks
if diskUsageCache, exists := GetEnv("DISK_USAGE_CACHE"); exists {
if diskUsageCache, exists := utils.GetEnv("DISK_USAGE_CACHE"); exists {
if duration, err := time.ParseDuration(diskUsageCache); err == nil {
agent.diskUsageCacheDuration = duration
slog.Info("DISK_USAGE_CACHE", "duration", duration)
@@ -86,7 +82,7 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
}
// Set up slog with a log level determined by the LOG_LEVEL env var
if logLevelStr, exists := GetEnv("LOG_LEVEL"); exists {
if logLevelStr, exists := utils.GetEnv("LOG_LEVEL"); exists {
switch strings.ToLower(logLevelStr) {
case "debug":
agent.debug = true
@@ -107,7 +103,7 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
agent.refreshSystemDetails()
// SMART_INTERVAL env var to update smart data at this interval
if smartIntervalEnv, exists := GetEnv("SMART_INTERVAL"); exists {
if smartIntervalEnv, exists := utils.GetEnv("SMART_INTERVAL"); exists {
if duration, err := time.ParseDuration(smartIntervalEnv); err == nil && duration > 0 {
agent.systemDetails.SmartInterval = duration
slog.Info("SMART_INTERVAL", "duration", duration)
@@ -152,15 +148,6 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
return agent, nil
}
// GetEnv retrieves an environment variable with a "BESZEL_AGENT_" prefix, or falls back to the unprefixed key.
func GetEnv(key string) (value string, exists bool) {
if value, exists = os.LookupEnv("BESZEL_AGENT_" + key); exists {
return value, exists
}
// Fallback to the old unprefixed key
return os.LookupEnv(key)
}
func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedData {
a.Lock()
defer a.Unlock()
@@ -217,7 +204,7 @@ func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedD
data.Stats.ExtraFs[key] = stats
// Add percentages to Info struct for dashboard
if stats.DiskTotal > 0 {
pct := twoDecimals((stats.DiskUsed / stats.DiskTotal) * 100)
pct := utils.TwoDecimals((stats.DiskUsed / stats.DiskTotal) * 100)
data.Info.ExtraFsPct[key] = pct
}
}
@@ -228,38 +215,12 @@ func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedD
return data
}
// StartAgent initializes and starts the agent with optional WebSocket connection
// Start initializes and starts the agent with optional WebSocket connection
func (a *Agent) Start(serverOptions ServerOptions) error {
a.keys = serverOptions.Keys
return a.connectionManager.Start(serverOptions)
}
func (a *Agent) getFingerprint() string {
// first look for a fingerprint in the data directory
if a.dataDir != "" {
if fp, err := os.ReadFile(filepath.Join(a.dataDir, "fingerprint")); err == nil {
return string(fp)
}
}
// if no fingerprint is found, generate one
fingerprint, err := host.HostID()
// we ignore a commonly known "product_uuid" known not to be unique
if err != nil || fingerprint == "" || fingerprint == "03000200-0400-0500-0006-000700080009" {
fingerprint = a.systemDetails.Hostname + a.systemDetails.CpuModel
}
// hash fingerprint
sum := sha256.Sum256([]byte(fingerprint))
fingerprint = hex.EncodeToString(sum[:24])
// save fingerprint to data directory
if a.dataDir != "" {
err = os.WriteFile(filepath.Join(a.dataDir, "fingerprint"), []byte(fingerprint), 0644)
if err != nil {
slog.Warn("Failed to save fingerprint", "err", err)
}
}
return fingerprint
return GetFingerprint(a.dataDir, a.systemDetails.Hostname, a.systemDetails.CpuModel)
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent

View File

@@ -14,6 +14,7 @@ import (
"time"
"github.com/henrygd/beszel"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/common"
"github.com/fxamacker/cbor/v2"
@@ -43,7 +44,7 @@ type WebSocketClient struct {
// newWebSocketClient creates a new WebSocket client for the given agent.
// It reads configuration from environment variables and validates the hub URL.
func newWebSocketClient(agent *Agent) (client *WebSocketClient, err error) {
hubURLStr, exists := GetEnv("HUB_URL")
hubURLStr, exists := utils.GetEnv("HUB_URL")
if !exists {
return nil, errors.New("HUB_URL environment variable not set")
}
@@ -72,12 +73,12 @@ func newWebSocketClient(agent *Agent) (client *WebSocketClient, err error) {
// If neither is set, it returns an error.
func getToken() (string, error) {
// get token from env var
token, _ := GetEnv("TOKEN")
token, _ := utils.GetEnv("TOKEN")
if token != "" {
return token, nil
}
// get token from file
tokenFile, _ := GetEnv("TOKEN_FILE")
tokenFile, _ := utils.GetEnv("TOKEN_FILE")
if tokenFile == "" {
return "", errors.New("must set TOKEN or TOKEN_FILE")
}
@@ -197,7 +198,7 @@ func (client *WebSocketClient) handleAuthChallenge(msg *common.HubRequest[cbor.R
}
if authRequest.NeedSysInfo {
response.Name, _ = GetEnv("SYSTEM_NAME")
response.Name, _ = utils.GetEnv("SYSTEM_NAME")
response.Hostname = client.agent.systemDetails.Hostname
serverAddr := client.agent.connectionManager.serverOptions.Addr
_, response.Port, _ = net.SplitHostPort(serverAddr)

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent

View File

@@ -1,9 +1,9 @@
package agent
import (
"context"
"errors"
"log/slog"
"os"
"os/signal"
"syscall"
"time"
@@ -91,8 +91,8 @@ func (c *ConnectionManager) Start(serverOptions ServerOptions) error {
c.eventChan = make(chan ConnectionEvent, 1)
// signal handling for shutdown
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
sigCtx, stopSignals := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stopSignals()
c.startWsTicker()
c.connect()
@@ -109,8 +109,8 @@ func (c *ConnectionManager) Start(serverOptions ServerOptions) error {
_ = c.startWebSocketConnection()
case <-healthTicker:
_ = health.Update()
case <-sigChan:
slog.Info("Shutting down")
case <-sigCtx.Done():
slog.Info("Shutting down", "cause", context.Cause(sigCtx))
_ = c.agent.StopServer()
c.closeWebSocket()
return health.CleanUp()

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent

View File

@@ -14,10 +14,10 @@ var lastPerCoreCpuTimes = make(map[uint16][]cpu.TimesStat)
// init initializes the CPU monitoring by storing the initial CPU times
// for the default 60-second cache interval.
func init() {
if times, err := cpu.Times(false); err == nil {
if times, err := cpu.Times(false); err == nil && len(times) > 0 {
lastCpuTimes[60000] = times[0]
}
if perCoreTimes, err := cpu.Times(true); err == nil {
if perCoreTimes, err := cpu.Times(true); err == nil && len(perCoreTimes) > 0 {
lastPerCoreCpuTimes[60000] = perCoreTimes
}
}
@@ -89,10 +89,7 @@ func getPerCoreCpuUsage(cacheTimeMs uint16) (system.Uint8Slice, error) {
lastTimes := lastPerCoreCpuTimes[cacheTimeMs]
// Limit to the number of cores available in both samples
length := len(perCoreTimes)
if len(lastTimes) < length {
length = len(lastTimes)
}
length := min(len(lastTimes), len(perCoreTimes))
usage := make([]uint8, length)
for i := 0; i < length; i++ {

View File

@@ -6,17 +6,19 @@ import (
"os"
"path/filepath"
"runtime"
"github.com/henrygd/beszel/agent/utils"
)
// getDataDir returns the path to the data directory for the agent and an error
// GetDataDir returns the path to the data directory for the agent and an error
// if the directory is not valid. Attempts to find the optimal data directory if
// no data directories are provided.
func getDataDir(dataDirs ...string) (string, error) {
func GetDataDir(dataDirs ...string) (string, error) {
if len(dataDirs) > 0 {
return testDataDirs(dataDirs)
}
dataDir, _ := GetEnv("DATA_DIR")
dataDir, _ := utils.GetEnv("DATA_DIR")
if dataDir != "" {
dataDirs = append(dataDirs, dataDir)
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent
@@ -17,7 +16,7 @@ func TestGetDataDir(t *testing.T) {
// Test with explicit dataDir parameter
t.Run("explicit data dir", func(t *testing.T) {
tempDir := t.TempDir()
result, err := getDataDir(tempDir)
result, err := GetDataDir(tempDir)
require.NoError(t, err)
assert.Equal(t, tempDir, result)
})
@@ -26,7 +25,7 @@ func TestGetDataDir(t *testing.T) {
t.Run("explicit data dir - create new", func(t *testing.T) {
tempDir := t.TempDir()
newDir := filepath.Join(tempDir, "new-data-dir")
result, err := getDataDir(newDir)
result, err := GetDataDir(newDir)
require.NoError(t, err)
assert.Equal(t, newDir, result)
@@ -52,7 +51,7 @@ func TestGetDataDir(t *testing.T) {
os.Setenv("BESZEL_AGENT_DATA_DIR", tempDir)
result, err := getDataDir()
result, err := GetDataDir()
require.NoError(t, err)
assert.Equal(t, tempDir, result)
})
@@ -60,7 +59,7 @@ func TestGetDataDir(t *testing.T) {
// Test with invalid explicit dataDir
t.Run("invalid explicit data dir", func(t *testing.T) {
invalidPath := "/invalid/path/that/cannot/be/created"
_, err := getDataDir(invalidPath)
_, err := GetDataDir(invalidPath)
assert.Error(t, err)
})
@@ -79,7 +78,7 @@ func TestGetDataDir(t *testing.T) {
// This will try platform-specific defaults, which may or may not work
// We're mainly testing that it doesn't panic and returns some result
result, err := getDataDir()
result, err := GetDataDir()
// We don't assert success/failure here since it depends on system permissions
// Just verify we get a string result if no error
if err == nil {

View File

@@ -8,11 +8,31 @@ import (
"strings"
"time"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/shirou/gopsutil/v4/disk"
)
// fsRegistrationContext holds the shared lookup state needed to resolve a
// filesystem into the tracked fsStats key and metadata.
type fsRegistrationContext struct {
filesystem string // value of optional FILESYSTEM env var
isWindows bool
efPath string // path to extra filesystems (default "/extra-filesystems")
diskIoCounters map[string]disk.IOCountersStat
}
// diskDiscovery groups the transient state for a single initializeDiskInfo run so
// helper methods can share the same partitions, mount paths, and lookup functions
type diskDiscovery struct {
agent *Agent
rootMountPoint string
partitions []disk.PartitionStat
usageFn func(string) (*disk.UsageStat, error)
ctx fsRegistrationContext
}
// parseFilesystemEntry parses a filesystem entry in the format "device__customname"
// Returns the device/filesystem part and the custom name part
func parseFilesystemEntry(entry string) (device, customName string) {
@@ -26,10 +46,230 @@ func parseFilesystemEntry(entry string) (device, customName string) {
return device, customName
}
// extraFilesystemPartitionInfo derives the I/O device and optional display name
// for a mounted /extra-filesystems partition. Prefer the partition device reported
// by the system and only use the folder name for custom naming metadata.
func extraFilesystemPartitionInfo(p disk.PartitionStat) (device, customName string) {
device = strings.TrimSpace(p.Device)
folderDevice, customName := parseFilesystemEntry(filepath.Base(p.Mountpoint))
if device == "" {
device = folderDevice
}
return device, customName
}
func isDockerSpecialMountpoint(mountpoint string) bool {
switch mountpoint {
case "/etc/hosts", "/etc/resolv.conf", "/etc/hostname":
return true
}
return false
}
// registerFilesystemStats resolves the tracked key and stats payload for a
// filesystem before it is inserted into fsStats.
func registerFilesystemStats(existing map[string]*system.FsStats, device, mountpoint string, root bool, customName string, ctx fsRegistrationContext) (string, *system.FsStats, bool) {
key := device
if !ctx.isWindows {
key = filepath.Base(device)
}
if root {
// Try to map root device to a diskIoCounters entry. First checks for an
// exact key match, then uses findIoDevice for normalized / prefix-based
// matching (e.g. nda0p2 -> nda0), and finally falls back to FILESYSTEM.
if _, ioMatch := ctx.diskIoCounters[key]; !ioMatch {
if matchedKey, match := findIoDevice(key, ctx.diskIoCounters); match {
key = matchedKey
} else if ctx.filesystem != "" {
if matchedKey, match := findIoDevice(ctx.filesystem, ctx.diskIoCounters); match {
key = matchedKey
}
}
if _, ioMatch = ctx.diskIoCounters[key]; !ioMatch {
slog.Warn("Root I/O unmapped; set FILESYSTEM", "device", device, "mountpoint", mountpoint)
}
}
} else {
// Check if non-root has diskstats and prefer the folder device for
// /extra-filesystems mounts when the discovered partition device is a
// mapper path (e.g. luks UUID) that obscures the underlying block device.
if _, ioMatch := ctx.diskIoCounters[key]; !ioMatch {
if strings.HasPrefix(mountpoint, ctx.efPath) {
folderDevice, _ := parseFilesystemEntry(filepath.Base(mountpoint))
if folderDevice != "" {
if matchedKey, match := findIoDevice(folderDevice, ctx.diskIoCounters); match {
key = matchedKey
}
}
}
if _, ioMatch = ctx.diskIoCounters[key]; !ioMatch {
if matchedKey, match := findIoDevice(key, ctx.diskIoCounters); match {
key = matchedKey
}
}
}
}
if _, exists := existing[key]; exists {
return "", nil, false
}
fsStats := &system.FsStats{Root: root, Mountpoint: mountpoint}
if customName != "" {
fsStats.Name = customName
}
return key, fsStats, true
}
// addFsStat inserts a discovered filesystem if it resolves to a new tracking
// key. The key selection itself lives in buildFsStatRegistration so that logic
// can stay directly unit-tested.
func (d *diskDiscovery) addFsStat(device, mountpoint string, root bool, customName string) {
key, fsStats, ok := registerFilesystemStats(d.agent.fsStats, device, mountpoint, root, customName, d.ctx)
if !ok {
return
}
d.agent.fsStats[key] = fsStats
name := key
if customName != "" {
name = customName
}
slog.Info("Detected disk", "name", name, "device", device, "mount", mountpoint, "io", key, "root", root)
}
// addConfiguredRootFs resolves FILESYSTEM against partitions first, then falls
// back to direct diskstats matching for setups like ZFS where partitions do not
// expose the physical device name.
func (d *diskDiscovery) addConfiguredRootFs() bool {
if d.ctx.filesystem == "" {
return false
}
for _, p := range d.partitions {
if filesystemMatchesPartitionSetting(d.ctx.filesystem, p) {
d.addFsStat(p.Device, p.Mountpoint, true, "")
return true
}
}
// FILESYSTEM may name a physical disk absent from partitions (e.g. ZFS lists
// dataset paths like zroot/ROOT/default, not block devices).
if ioKey, match := findIoDevice(d.ctx.filesystem, d.ctx.diskIoCounters); match {
d.agent.fsStats[ioKey] = &system.FsStats{Root: true, Mountpoint: d.rootMountPoint}
return true
}
slog.Warn("Partition details not found", "filesystem", d.ctx.filesystem)
return false
}
func isRootFallbackPartition(p disk.PartitionStat, rootMountPoint string) bool {
return p.Mountpoint == rootMountPoint ||
(isDockerSpecialMountpoint(p.Mountpoint) && strings.HasPrefix(p.Device, "/dev"))
}
// addPartitionRootFs handles the non-configured root fallback path when a
// partition looks like the active root mount but still needs translating to an
// I/O device key.
func (d *diskDiscovery) addPartitionRootFs(device, mountpoint string) bool {
fs, match := findIoDevice(filepath.Base(device), d.ctx.diskIoCounters)
if !match {
return false
}
// The resolved I/O device is already known here, so use it directly to avoid
// a second fallback search inside buildFsStatRegistration.
d.addFsStat(fs, mountpoint, true, "")
return true
}
// addLastResortRootFs is only used when neither FILESYSTEM nor partition-based
// heuristics can identify root, so it picks the busiest I/O device as a final
// fallback and preserves the root mountpoint for usage collection.
func (d *diskDiscovery) addLastResortRootFs() {
rootKey := mostActiveIoDevice(d.ctx.diskIoCounters)
if rootKey != "" {
slog.Warn("Using most active device for root I/O; set FILESYSTEM to override", "device", rootKey)
} else {
rootKey = filepath.Base(d.rootMountPoint)
if _, exists := d.agent.fsStats[rootKey]; exists {
rootKey = "root"
}
slog.Warn("Root I/O device not detected; set FILESYSTEM to override")
}
d.agent.fsStats[rootKey] = &system.FsStats{Root: true, Mountpoint: d.rootMountPoint}
}
// findPartitionByFilesystemSetting matches an EXTRA_FILESYSTEMS entry against a
// discovered partition either by mountpoint or by device suffix.
func findPartitionByFilesystemSetting(filesystem string, partitions []disk.PartitionStat) (disk.PartitionStat, bool) {
for _, p := range partitions {
if strings.HasSuffix(p.Device, filesystem) || p.Mountpoint == filesystem {
return p, true
}
}
return disk.PartitionStat{}, false
}
// addConfiguredExtraFsEntry resolves one EXTRA_FILESYSTEMS entry, preferring a
// discovered partition and falling back to any path that disk.Usage accepts.
func (d *diskDiscovery) addConfiguredExtraFsEntry(filesystem, customName string) {
if p, found := findPartitionByFilesystemSetting(filesystem, d.partitions); found {
d.addFsStat(p.Device, p.Mountpoint, false, customName)
return
}
if _, err := d.usageFn(filesystem); err == nil {
d.addFsStat(filepath.Base(filesystem), filesystem, false, customName)
return
} else {
slog.Error("Invalid filesystem", "name", filesystem, "err", err)
}
}
// addConfiguredExtraFilesystems parses and registers the comma-separated
// EXTRA_FILESYSTEMS env var entries.
func (d *diskDiscovery) addConfiguredExtraFilesystems(extraFilesystems string) {
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
filesystem, customName := parseFilesystemEntry(fsEntry)
d.addConfiguredExtraFsEntry(filesystem, customName)
}
}
// addPartitionExtraFs registers partitions mounted under /extra-filesystems so
// their display names can come from the folder name while their I/O keys still
// prefer the underlying partition device.
func (d *diskDiscovery) addPartitionExtraFs(p disk.PartitionStat) {
if !strings.HasPrefix(p.Mountpoint, d.ctx.efPath) {
return
}
device, customName := extraFilesystemPartitionInfo(p)
d.addFsStat(device, p.Mountpoint, false, customName)
}
// addExtraFilesystemFolders handles bare directories under /extra-filesystems
// that may not appear in partition discovery, while skipping mountpoints that
// were already registered from higher-fidelity sources.
func (d *diskDiscovery) addExtraFilesystemFolders(folderNames []string) {
existingMountpoints := make(map[string]bool, len(d.agent.fsStats))
for _, stats := range d.agent.fsStats {
existingMountpoints[stats.Mountpoint] = true
}
for _, folderName := range folderNames {
mountpoint := filepath.Join(d.ctx.efPath, folderName)
slog.Debug("/extra-filesystems", "mountpoint", mountpoint)
if existingMountpoints[mountpoint] {
continue
}
device, customName := parseFilesystemEntry(folderName)
d.addFsStat(device, mountpoint, false, customName)
}
}
// Sets up the filesystems to monitor for disk usage and I/O.
func (a *Agent) initializeDiskInfo() {
filesystem, _ := GetEnv("FILESYSTEM")
efPath := "/extra-filesystems"
filesystem, _ := utils.GetEnv("FILESYSTEM")
hasRoot := false
isWindows := runtime.GOOS == "windows"
@@ -46,167 +286,223 @@ func (a *Agent) initializeDiskInfo() {
}
}
// ioContext := context.WithValue(a.sensorsContext,
// common.EnvKey, common.EnvMap{common.HostProcEnvKey: "/tmp/testproc"},
// )
// diskIoCounters, err := disk.IOCountersWithContext(ioContext)
diskIoCounters, err := disk.IOCounters()
if err != nil {
slog.Error("Error getting diskstats", "err", err)
}
slog.Debug("Disk I/O", "diskstats", diskIoCounters)
// Helper function to add a filesystem to fsStats if it doesn't exist
addFsStat := func(device, mountpoint string, root bool, customName ...string) {
var key string
if isWindows {
key = device
} else {
key = filepath.Base(device)
}
var ioMatch bool
if _, exists := a.fsStats[key]; !exists {
if root {
slog.Info("Detected root device", "name", key)
// Check if root device is in /proc/diskstats, use fallback if not
if _, ioMatch = diskIoCounters[key]; !ioMatch {
key, ioMatch = findIoDevice(filesystem, diskIoCounters, a.fsStats)
if !ioMatch {
slog.Info("Using I/O fallback", "device", device, "mountpoint", mountpoint, "fallback", key)
}
}
} else {
// Check if non-root has diskstats and fall back to folder name if not
// Scenario: device is encrypted and named luks-2bcb02be-999d-4417-8d18-5c61e660fb6e - not in /proc/diskstats.
// However, the device can be specified by mounting folder from luks device at /extra-filesystems/sda1
if _, ioMatch = diskIoCounters[key]; !ioMatch {
efBase := filepath.Base(mountpoint)
if _, ioMatch = diskIoCounters[efBase]; ioMatch {
key = efBase
}
}
}
fsStats := &system.FsStats{Root: root, Mountpoint: mountpoint}
if len(customName) > 0 && customName[0] != "" {
fsStats.Name = customName[0]
}
a.fsStats[key] = fsStats
}
ctx := fsRegistrationContext{
filesystem: filesystem,
isWindows: isWindows,
diskIoCounters: diskIoCounters,
efPath: "/extra-filesystems",
}
// Get the appropriate root mount point for this system
rootMountPoint := a.getRootMountPoint()
// Use FILESYSTEM env var to find root filesystem
if filesystem != "" {
for _, p := range partitions {
if strings.HasSuffix(p.Device, filesystem) || p.Mountpoint == filesystem {
addFsStat(p.Device, p.Mountpoint, true)
hasRoot = true
break
}
}
if !hasRoot {
slog.Warn("Partition details not found", "filesystem", filesystem)
}
discovery := diskDiscovery{
agent: a,
rootMountPoint: a.getRootMountPoint(),
partitions: partitions,
usageFn: disk.Usage,
ctx: ctx,
}
// Add EXTRA_FILESYSTEMS env var values to fsStats
if extraFilesystems, exists := GetEnv("EXTRA_FILESYSTEMS"); exists {
for _, fsEntry := range strings.Split(extraFilesystems, ",") {
// Parse custom name from format: device__customname
fs, customName := parseFilesystemEntry(fsEntry)
hasRoot = discovery.addConfiguredRootFs()
found := false
for _, p := range partitions {
if strings.HasSuffix(p.Device, fs) || p.Mountpoint == fs {
addFsStat(p.Device, p.Mountpoint, false, customName)
found = true
break
}
}
// if not in partitions, test if we can get disk usage
if !found {
if _, err := disk.Usage(fs); err == nil {
addFsStat(filepath.Base(fs), fs, false, customName)
} else {
slog.Error("Invalid filesystem", "name", fs, "err", err)
}
}
}
// Add EXTRA_FILESYSTEMS env var values to fsStats
if extraFilesystems, exists := utils.GetEnv("EXTRA_FILESYSTEMS"); exists {
discovery.addConfiguredExtraFilesystems(extraFilesystems)
}
// Process partitions for various mount points
for _, p := range partitions {
// fmt.Println(p.Device, p.Mountpoint)
// Binary root fallback or docker root fallback
if !hasRoot && (p.Mountpoint == rootMountPoint || (p.Mountpoint == "/etc/hosts" && strings.HasPrefix(p.Device, "/dev"))) {
fs, match := findIoDevice(filepath.Base(p.Device), diskIoCounters, a.fsStats)
if match {
addFsStat(fs, p.Mountpoint, true)
hasRoot = true
}
}
// Check if device is in /extra-filesystems
if strings.HasPrefix(p.Mountpoint, efPath) {
device, customName := parseFilesystemEntry(p.Mountpoint)
addFsStat(device, p.Mountpoint, false, customName)
if !hasRoot && isRootFallbackPartition(p, discovery.rootMountPoint) {
hasRoot = discovery.addPartitionRootFs(p.Device, p.Mountpoint)
}
discovery.addPartitionExtraFs(p)
}
// Check all folders in /extra-filesystems and add them if not already present
if folders, err := os.ReadDir(efPath); err == nil {
existingMountpoints := make(map[string]bool)
for _, stats := range a.fsStats {
existingMountpoints[stats.Mountpoint] = true
}
if folders, err := os.ReadDir(discovery.ctx.efPath); err == nil {
folderNames := make([]string, 0, len(folders))
for _, folder := range folders {
if folder.IsDir() {
mountpoint := filepath.Join(efPath, folder.Name())
slog.Debug("/extra-filesystems", "mountpoint", mountpoint)
if !existingMountpoints[mountpoint] {
device, customName := parseFilesystemEntry(folder.Name())
addFsStat(device, mountpoint, false, customName)
}
folderNames = append(folderNames, folder.Name())
}
}
discovery.addExtraFilesystemFolders(folderNames)
}
// If no root filesystem set, use fallback
// If no root filesystem set, try the most active I/O device as a last
// resort (e.g. ZFS where dataset names are unrelated to disk names).
if !hasRoot {
rootDevice, _ := findIoDevice(filepath.Base(filesystem), diskIoCounters, a.fsStats)
slog.Info("Root disk", "mountpoint", rootMountPoint, "io", rootDevice)
a.fsStats[rootDevice] = &system.FsStats{Root: true, Mountpoint: rootMountPoint}
discovery.addLastResortRootFs()
}
a.pruneDuplicateRootExtraFilesystems()
a.initializeDiskIoStats(diskIoCounters)
}
// Returns matching device from /proc/diskstats,
// or the device with the most reads if no match is found.
// bool is true if a match was found.
func findIoDevice(filesystem string, diskIoCounters map[string]disk.IOCountersStat, fsStats map[string]*system.FsStats) (string, bool) {
var maxReadBytes uint64
maxReadDevice := "/"
for _, d := range diskIoCounters {
if d.Name == filesystem || (d.Label != "" && d.Label == filesystem) {
return d.Name, true
}
if d.ReadBytes > maxReadBytes {
// don't use if device already exists in fsStats
if _, exists := fsStats[d.Name]; !exists {
maxReadBytes = d.ReadBytes
maxReadDevice = d.Name
}
// Removes extra filesystems that mirror root usage (https://github.com/henrygd/beszel/issues/1428).
func (a *Agent) pruneDuplicateRootExtraFilesystems() {
var rootMountpoint string
for _, stats := range a.fsStats {
if stats != nil && stats.Root {
rootMountpoint = stats.Mountpoint
break
}
}
return maxReadDevice, false
if rootMountpoint == "" {
return
}
rootUsage, err := disk.Usage(rootMountpoint)
if err != nil {
return
}
for name, stats := range a.fsStats {
if stats == nil || stats.Root {
continue
}
extraUsage, err := disk.Usage(stats.Mountpoint)
if err != nil {
continue
}
if hasSameDiskUsage(rootUsage, extraUsage) {
slog.Info("Ignoring duplicate FS", "name", name, "mount", stats.Mountpoint)
delete(a.fsStats, name)
}
}
}
// hasSameDiskUsage compares root/extra usage with a small byte tolerance.
func hasSameDiskUsage(a, b *disk.UsageStat) bool {
if a == nil || b == nil || a.Total == 0 || b.Total == 0 {
return false
}
// Allow minor drift between sequential disk usage calls.
const toleranceBytes uint64 = 16 * 1024 * 1024
return withinUsageTolerance(a.Total, b.Total, toleranceBytes) &&
withinUsageTolerance(a.Used, b.Used, toleranceBytes)
}
// withinUsageTolerance reports whether two byte values differ by at most tolerance.
func withinUsageTolerance(a, b, tolerance uint64) bool {
if a >= b {
return a-b <= tolerance
}
return b-a <= tolerance
}
type ioMatchCandidate struct {
name string
bytes uint64
ops uint64
}
// findIoDevice prefers exact device/label matches, then falls back to a
// prefix-related candidate with the highest recent activity.
func findIoDevice(filesystem string, diskIoCounters map[string]disk.IOCountersStat) (string, bool) {
filesystem = normalizeDeviceName(filesystem)
if filesystem == "" {
return "", false
}
candidates := []ioMatchCandidate{}
for _, d := range diskIoCounters {
if normalizeDeviceName(d.Name) == filesystem || (d.Label != "" && normalizeDeviceName(d.Label) == filesystem) {
return d.Name, true
}
if prefixRelated(normalizeDeviceName(d.Name), filesystem) ||
(d.Label != "" && prefixRelated(normalizeDeviceName(d.Label), filesystem)) {
candidates = append(candidates, ioMatchCandidate{
name: d.Name,
bytes: d.ReadBytes + d.WriteBytes,
ops: d.ReadCount + d.WriteCount,
})
}
}
if len(candidates) == 0 {
return "", false
}
best := candidates[0]
for _, c := range candidates[1:] {
if c.bytes > best.bytes ||
(c.bytes == best.bytes && c.ops > best.ops) ||
(c.bytes == best.bytes && c.ops == best.ops && c.name < best.name) {
best = c
}
}
slog.Info("Using disk I/O fallback", "requested", filesystem, "selected", best.name)
return best.name, true
}
// mostActiveIoDevice returns the device with the highest I/O activity,
// or "" if diskIoCounters is empty.
func mostActiveIoDevice(diskIoCounters map[string]disk.IOCountersStat) string {
var best ioMatchCandidate
for _, d := range diskIoCounters {
c := ioMatchCandidate{
name: d.Name,
bytes: d.ReadBytes + d.WriteBytes,
ops: d.ReadCount + d.WriteCount,
}
if best.name == "" || c.bytes > best.bytes ||
(c.bytes == best.bytes && c.ops > best.ops) ||
(c.bytes == best.bytes && c.ops == best.ops && c.name < best.name) {
best = c
}
}
return best.name
}
// prefixRelated reports whether either identifier is a prefix of the other.
func prefixRelated(a, b string) bool {
if a == "" || b == "" || a == b {
return false
}
return strings.HasPrefix(a, b) || strings.HasPrefix(b, a)
}
// filesystemMatchesPartitionSetting checks whether a FILESYSTEM env var value
// matches a partition by mountpoint, exact device name, or prefix relationship
// (e.g. FILESYSTEM=ada0 matches partition /dev/ada0p2).
func filesystemMatchesPartitionSetting(filesystem string, p disk.PartitionStat) bool {
filesystem = strings.TrimSpace(filesystem)
if filesystem == "" {
return false
}
if p.Mountpoint == filesystem {
return true
}
fsName := normalizeDeviceName(filesystem)
partName := normalizeDeviceName(p.Device)
if fsName == "" || partName == "" {
return false
}
if fsName == partName {
return true
}
return prefixRelated(partName, fsName)
}
// normalizeDeviceName canonicalizes device strings for comparisons.
func normalizeDeviceName(value string) string {
name := filepath.Base(strings.TrimSpace(value))
if name == "." {
return ""
}
return name
}
// Sets start values for disk I/O stats.
func (a *Agent) initializeDiskIoStats(diskIoCounters map[string]disk.IOCountersStat) {
a.fsNames = a.fsNames[:0]
now := time.Now()
for device, stats := range a.fsStats {
// skip if not in diskIoCounters
d, exists := diskIoCounters[device]
@@ -215,7 +511,7 @@ func (a *Agent) initializeDiskIoStats(diskIoCounters map[string]disk.IOCountersS
continue
}
// populate initial values
stats.Time = time.Now()
stats.Time = now
stats.TotalRead = d.ReadBytes
stats.TotalWrite = d.WriteBytes
// add to list of valid io device names
@@ -239,12 +535,12 @@ func (a *Agent) updateDiskUsage(systemStats *system.Stats) {
continue
}
if d, err := disk.Usage(stats.Mountpoint); err == nil {
stats.DiskTotal = bytesToGigabytes(d.Total)
stats.DiskUsed = bytesToGigabytes(d.Used)
stats.DiskTotal = utils.BytesToGigabytes(d.Total)
stats.DiskUsed = utils.BytesToGigabytes(d.Used)
if stats.Root {
systemStats.DiskTotal = bytesToGigabytes(d.Total)
systemStats.DiskUsed = bytesToGigabytes(d.Used)
systemStats.DiskPct = twoDecimals(d.UsedPercent)
systemStats.DiskTotal = utils.BytesToGigabytes(d.Total)
systemStats.DiskUsed = utils.BytesToGigabytes(d.Used)
systemStats.DiskPct = utils.TwoDecimals(d.UsedPercent)
}
} else {
// reset stats if error (likely unmounted)
@@ -297,8 +593,8 @@ func (a *Agent) updateDiskIo(cacheTimeMs uint16, systemStats *system.Stats) {
diskIORead := (d.ReadBytes - prev.readBytes) * 1000 / msElapsed
diskIOWrite := (d.WriteBytes - prev.writeBytes) * 1000 / msElapsed
readMbPerSecond := bytesToMegabytes(float64(diskIORead))
writeMbPerSecond := bytesToMegabytes(float64(diskIOWrite))
readMbPerSecond := utils.BytesToMegabytes(float64(diskIORead))
writeMbPerSecond := utils.BytesToMegabytes(float64(diskIOWrite))
// validate values
if readMbPerSecond > 50_000 || writeMbPerSecond > 50_000 {

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent
@@ -94,6 +93,599 @@ func TestParseFilesystemEntry(t *testing.T) {
}
}
func TestExtraFilesystemPartitionInfo(t *testing.T) {
t.Run("uses partition device for label-only mountpoint", func(t *testing.T) {
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
Device: "/dev/sdc",
Mountpoint: "/extra-filesystems/Share",
})
assert.Equal(t, "/dev/sdc", device)
assert.Equal(t, "", customName)
})
t.Run("uses custom name from mountpoint suffix", func(t *testing.T) {
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
Device: "/dev/sdc",
Mountpoint: "/extra-filesystems/sdc__Share",
})
assert.Equal(t, "/dev/sdc", device)
assert.Equal(t, "Share", customName)
})
t.Run("falls back to folder device when partition device is unavailable", func(t *testing.T) {
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
Mountpoint: "/extra-filesystems/sdc__Share",
})
assert.Equal(t, "sdc", device)
assert.Equal(t, "Share", customName)
})
t.Run("supports custom name without folder device prefix", func(t *testing.T) {
device, customName := extraFilesystemPartitionInfo(disk.PartitionStat{
Device: "/dev/sdc",
Mountpoint: "/extra-filesystems/__Share",
})
assert.Equal(t, "/dev/sdc", device)
assert.Equal(t, "Share", customName)
})
}
func TestBuildFsStatRegistration(t *testing.T) {
t.Run("uses basename for non-windows exact io match", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"/dev/sda1",
"/mnt/data",
false,
"archive",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"sda1": {Name: "sda1"},
},
},
)
assert.True(t, ok)
assert.Equal(t, "sda1", key)
assert.Equal(t, "/mnt/data", stats.Mountpoint)
assert.Equal(t, "archive", stats.Name)
assert.False(t, stats.Root)
})
t.Run("maps root partition to io device by prefix", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"/dev/ada0p2",
"/",
true,
"",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"ada0": {Name: "ada0", ReadBytes: 1000, WriteBytes: 1000},
},
},
)
assert.True(t, ok)
assert.Equal(t, "ada0", key)
assert.True(t, stats.Root)
assert.Equal(t, "/", stats.Mountpoint)
})
t.Run("uses filesystem setting as root fallback", func(t *testing.T) {
key, _, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"overlay",
"/",
true,
"",
fsRegistrationContext{
filesystem: "nvme0n1p2",
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"nvme0n1": {Name: "nvme0n1", ReadBytes: 1000, WriteBytes: 1000},
},
},
)
assert.True(t, ok)
assert.Equal(t, "nvme0n1", key)
})
t.Run("prefers parsed extra-filesystems device over mapper device", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"/dev/mapper/luks-2bcb02be-999d-4417-8d18-5c61e660fb6e",
"/extra-filesystems/nvme0n1p2__Archive",
false,
"Archive",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"dm-1": {Name: "dm-1", Label: "luks-2bcb02be-999d-4417-8d18-5c61e660fb6e"},
"nvme0n1p2": {Name: "nvme0n1p2"},
},
},
)
assert.True(t, ok)
assert.Equal(t, "nvme0n1p2", key)
assert.Equal(t, "Archive", stats.Name)
})
t.Run("falls back to mapper io device when folder device cannot be resolved", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{},
"/dev/mapper/luks-2bcb02be-999d-4417-8d18-5c61e660fb6e",
"/extra-filesystems/Archive",
false,
"Archive",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"dm-1": {Name: "dm-1", Label: "luks-2bcb02be-999d-4417-8d18-5c61e660fb6e"},
},
},
)
assert.True(t, ok)
assert.Equal(t, "dm-1", key)
assert.Equal(t, "Archive", stats.Name)
})
t.Run("uses full device name on windows", func(t *testing.T) {
key, _, ok := registerFilesystemStats(
map[string]*system.FsStats{},
`C:`,
`C:\\`,
false,
"",
fsRegistrationContext{
isWindows: true,
diskIoCounters: map[string]disk.IOCountersStat{
`C:`: {Name: `C:`},
},
},
)
assert.True(t, ok)
assert.Equal(t, `C:`, key)
})
t.Run("skips existing key", func(t *testing.T) {
key, stats, ok := registerFilesystemStats(
map[string]*system.FsStats{"sda1": {Mountpoint: "/existing"}},
"/dev/sda1",
"/mnt/data",
false,
"",
fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"sda1": {Name: "sda1"},
},
},
)
assert.False(t, ok)
assert.Empty(t, key)
assert.Nil(t, stats)
})
}
func TestAddConfiguredRootFs(t *testing.T) {
t.Run("adds root from matching partition", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
rootMountPoint: "/",
partitions: []disk.PartitionStat{{Device: "/dev/ada0p2", Mountpoint: "/"}},
ctx: fsRegistrationContext{
filesystem: "/dev/ada0p2",
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"ada0": {Name: "ada0", ReadBytes: 1000, WriteBytes: 1000},
},
},
}
ok := discovery.addConfiguredRootFs()
assert.True(t, ok)
stats, exists := agent.fsStats["ada0"]
assert.True(t, exists)
assert.True(t, stats.Root)
assert.Equal(t, "/", stats.Mountpoint)
})
t.Run("adds root from io device when partition is missing", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
rootMountPoint: "/sysroot",
ctx: fsRegistrationContext{
filesystem: "zroot",
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"nda0": {Name: "nda0", Label: "zroot", ReadBytes: 1000, WriteBytes: 1000},
},
},
}
ok := discovery.addConfiguredRootFs()
assert.True(t, ok)
stats, exists := agent.fsStats["nda0"]
assert.True(t, exists)
assert.True(t, stats.Root)
assert.Equal(t, "/sysroot", stats.Mountpoint)
})
t.Run("returns false when filesystem cannot be resolved", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
rootMountPoint: "/",
ctx: fsRegistrationContext{
filesystem: "missing-disk",
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{},
},
}
ok := discovery.addConfiguredRootFs()
assert.False(t, ok)
assert.Empty(t, agent.fsStats)
})
}
func TestAddPartitionRootFs(t *testing.T) {
t.Run("adds root from fallback partition candidate", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
ctx: fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"nvme0n1": {Name: "nvme0n1", ReadBytes: 1000, WriteBytes: 1000},
},
},
}
ok := discovery.addPartitionRootFs("/dev/nvme0n1p2", "/")
assert.True(t, ok)
stats, exists := agent.fsStats["nvme0n1"]
assert.True(t, exists)
assert.True(t, stats.Root)
assert.Equal(t, "/", stats.Mountpoint)
})
t.Run("returns false when no io device matches", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{agent: agent, ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{}}}
ok := discovery.addPartitionRootFs("/dev/mapper/root", "/")
assert.False(t, ok)
assert.Empty(t, agent.fsStats)
})
}
func TestAddLastResortRootFs(t *testing.T) {
t.Run("uses most active io device when available", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{agent: agent, rootMountPoint: "/", ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{
"sda": {Name: "sda", ReadBytes: 5000, WriteBytes: 5000},
"sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000},
}}}
discovery.addLastResortRootFs()
stats, exists := agent.fsStats["sda"]
assert.True(t, exists)
assert.True(t, stats.Root)
})
t.Run("falls back to root key when mountpoint basename collides", func(t *testing.T) {
agent := &Agent{fsStats: map[string]*system.FsStats{
"sysroot": {Mountpoint: "/extra-filesystems/sysroot"},
}}
discovery := diskDiscovery{agent: agent, rootMountPoint: "/sysroot", ctx: fsRegistrationContext{diskIoCounters: map[string]disk.IOCountersStat{}}}
discovery.addLastResortRootFs()
stats, exists := agent.fsStats["root"]
assert.True(t, exists)
assert.True(t, stats.Root)
assert.Equal(t, "/sysroot", stats.Mountpoint)
})
}
func TestAddConfiguredExtraFsEntry(t *testing.T) {
t.Run("uses matching partition when present", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
partitions: []disk.PartitionStat{{Device: "/dev/sdb1", Mountpoint: "/mnt/backup"}},
usageFn: func(string) (*disk.UsageStat, error) {
t.Fatal("usage fallback should not be called when partition matches")
return nil, nil
},
ctx: fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"sdb1": {Name: "sdb1"},
},
},
}
discovery.addConfiguredExtraFsEntry("sdb1", "backup")
stats, exists := agent.fsStats["sdb1"]
assert.True(t, exists)
assert.Equal(t, "/mnt/backup", stats.Mountpoint)
assert.Equal(t, "backup", stats.Name)
})
t.Run("falls back to usage-validated path", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
usageFn: func(path string) (*disk.UsageStat, error) {
assert.Equal(t, "/srv/archive", path)
return &disk.UsageStat{}, nil
},
ctx: fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"archive": {Name: "archive"},
},
},
}
discovery.addConfiguredExtraFsEntry("/srv/archive", "archive")
stats, exists := agent.fsStats["archive"]
assert.True(t, exists)
assert.Equal(t, "/srv/archive", stats.Mountpoint)
assert.Equal(t, "archive", stats.Name)
})
t.Run("ignores invalid filesystem entry", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
usageFn: func(string) (*disk.UsageStat, error) {
return nil, os.ErrNotExist
},
}
discovery.addConfiguredExtraFsEntry("/missing/archive", "")
assert.Empty(t, agent.fsStats)
})
}
func TestAddConfiguredExtraFilesystems(t *testing.T) {
t.Run("parses and registers multiple configured filesystems", func(t *testing.T) {
agent := &Agent{fsStats: make(map[string]*system.FsStats)}
discovery := diskDiscovery{
agent: agent,
partitions: []disk.PartitionStat{{Device: "/dev/sda1", Mountpoint: "/mnt/fast"}},
usageFn: func(path string) (*disk.UsageStat, error) {
if path == "/srv/archive" {
return &disk.UsageStat{}, nil
}
return nil, os.ErrNotExist
},
ctx: fsRegistrationContext{
isWindows: false,
diskIoCounters: map[string]disk.IOCountersStat{
"sda1": {Name: "sda1"},
"archive": {Name: "archive"},
},
},
}
discovery.addConfiguredExtraFilesystems("sda1__fast,/srv/archive__cold")
assert.Contains(t, agent.fsStats, "sda1")
assert.Equal(t, "fast", agent.fsStats["sda1"].Name)
assert.Contains(t, agent.fsStats, "archive")
assert.Equal(t, "cold", agent.fsStats["archive"].Name)
})
}
func TestAddExtraFilesystemFolders(t *testing.T) {
t.Run("adds missing folders and skips existing mountpoints", func(t *testing.T) {
agent := &Agent{fsStats: map[string]*system.FsStats{
"existing": {Mountpoint: "/extra-filesystems/existing"},
}}
discovery := diskDiscovery{
agent: agent,
ctx: fsRegistrationContext{
isWindows: false,
efPath: "/extra-filesystems",
diskIoCounters: map[string]disk.IOCountersStat{
"newdisk": {Name: "newdisk"},
},
},
}
discovery.addExtraFilesystemFolders([]string{"existing", "newdisk__Archive"})
assert.Len(t, agent.fsStats, 2)
stats, exists := agent.fsStats["newdisk"]
assert.True(t, exists)
assert.Equal(t, "/extra-filesystems/newdisk__Archive", stats.Mountpoint)
assert.Equal(t, "Archive", stats.Name)
})
}
func TestFindIoDevice(t *testing.T) {
t.Run("matches by device name", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sda": {Name: "sda"},
"sdb": {Name: "sdb"},
}
device, ok := findIoDevice("sdb", ioCounters)
assert.True(t, ok)
assert.Equal(t, "sdb", device)
})
t.Run("matches by device label", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sda": {Name: "sda", Label: "rootfs"},
"sdb": {Name: "sdb"},
}
device, ok := findIoDevice("rootfs", ioCounters)
assert.True(t, ok)
assert.Equal(t, "sda", device)
})
t.Run("returns no match when not found", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sda": {Name: "sda"},
"sdb": {Name: "sdb"},
}
device, ok := findIoDevice("nvme0n1p1", ioCounters)
assert.False(t, ok)
assert.Equal(t, "", device)
})
t.Run("uses uncertain unique prefix fallback", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"nvme0n1": {Name: "nvme0n1"},
"sda": {Name: "sda"},
}
device, ok := findIoDevice("nvme0n1p2", ioCounters)
assert.True(t, ok)
assert.Equal(t, "nvme0n1", device)
})
t.Run("uses dominant activity when prefix matches are ambiguous", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sda": {Name: "sda", ReadBytes: 5000, WriteBytes: 5000, ReadCount: 100, WriteCount: 100},
"sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 50, WriteCount: 50},
}
device, ok := findIoDevice("sd", ioCounters)
assert.True(t, ok)
assert.Equal(t, "sda", device)
})
t.Run("uses highest activity when ambiguous without dominance", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sda": {Name: "sda", ReadBytes: 3000, WriteBytes: 3000, ReadCount: 50, WriteCount: 50},
"sdb": {Name: "sdb", ReadBytes: 2500, WriteBytes: 2500, ReadCount: 40, WriteCount: 40},
}
device, ok := findIoDevice("sd", ioCounters)
assert.True(t, ok)
assert.Equal(t, "sda", device)
})
t.Run("matches /dev/-prefixed partition to parent disk", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"nda0": {Name: "nda0", ReadBytes: 1000, WriteBytes: 1000},
}
device, ok := findIoDevice("/dev/nda0p2", ioCounters)
assert.True(t, ok)
assert.Equal(t, "nda0", device)
})
t.Run("uses deterministic name tie-breaker", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sdb": {Name: "sdb", ReadBytes: 2000, WriteBytes: 2000, ReadCount: 10, WriteCount: 10},
"sda": {Name: "sda", ReadBytes: 2000, WriteBytes: 2000, ReadCount: 10, WriteCount: 10},
}
device, ok := findIoDevice("sd", ioCounters)
assert.True(t, ok)
assert.Equal(t, "sda", device)
})
}
func TestFilesystemMatchesPartitionSetting(t *testing.T) {
p := disk.PartitionStat{Device: "/dev/ada0p2", Mountpoint: "/"}
t.Run("matches mountpoint setting", func(t *testing.T) {
assert.True(t, filesystemMatchesPartitionSetting("/", p))
})
t.Run("matches exact partition setting", func(t *testing.T) {
assert.True(t, filesystemMatchesPartitionSetting("ada0p2", p))
assert.True(t, filesystemMatchesPartitionSetting("/dev/ada0p2", p))
})
t.Run("matches prefix-style parent setting", func(t *testing.T) {
assert.True(t, filesystemMatchesPartitionSetting("ada0", p))
assert.True(t, filesystemMatchesPartitionSetting("/dev/ada0", p))
})
t.Run("does not match unrelated device", func(t *testing.T) {
assert.False(t, filesystemMatchesPartitionSetting("sda", p))
assert.False(t, filesystemMatchesPartitionSetting("nvme0n1", p))
assert.False(t, filesystemMatchesPartitionSetting("", p))
})
}
func TestMostActiveIoDevice(t *testing.T) {
t.Run("returns most active device", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"nda0": {Name: "nda0", ReadBytes: 5000, WriteBytes: 5000, ReadCount: 100, WriteCount: 100},
"nda1": {Name: "nda1", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 50, WriteCount: 50},
}
assert.Equal(t, "nda0", mostActiveIoDevice(ioCounters))
})
t.Run("uses deterministic tie-breaker", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 10, WriteCount: 10},
"sda": {Name: "sda", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 10, WriteCount: 10},
}
assert.Equal(t, "sda", mostActiveIoDevice(ioCounters))
})
t.Run("returns empty for empty map", func(t *testing.T) {
assert.Equal(t, "", mostActiveIoDevice(map[string]disk.IOCountersStat{}))
})
}
func TestIsDockerSpecialMountpoint(t *testing.T) {
testCases := []struct {
name string
mountpoint string
expected bool
}{
{name: "hosts", mountpoint: "/etc/hosts", expected: true},
{name: "resolv", mountpoint: "/etc/resolv.conf", expected: true},
{name: "hostname", mountpoint: "/etc/hostname", expected: true},
{name: "root", mountpoint: "/", expected: false},
{name: "passwd", mountpoint: "/etc/passwd", expected: false},
{name: "extra-filesystem", mountpoint: "/extra-filesystems/sda1", expected: false},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
assert.Equal(t, tc.expected, isDockerSpecialMountpoint(tc.mountpoint))
})
}
}
func TestInitializeDiskInfoWithCustomNames(t *testing.T) {
// Set up environment variables
oldEnv := os.Getenv("EXTRA_FILESYSTEMS")
@@ -155,7 +747,7 @@ func TestInitializeDiskInfoWithCustomNames(t *testing.T) {
// Test the parsing logic by calling the relevant part
// We'll create a simplified version to test just the parsing
extraFilesystems := tc.envValue
for _, fsEntry := range strings.Split(extraFilesystems, ",") {
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
// Parse the entry
fsEntry = strings.TrimSpace(fsEntry)
var fs, customName string
@@ -317,3 +909,67 @@ func TestDiskUsageCaching(t *testing.T) {
"lastDiskUsageUpdate should be refreshed when cache expires")
})
}
func TestHasSameDiskUsage(t *testing.T) {
const toleranceBytes uint64 = 16 * 1024 * 1024
t.Run("returns true when totals and usage are equal", func(t *testing.T) {
a := &disk.UsageStat{Total: 100 * 1024 * 1024 * 1024, Used: 42 * 1024 * 1024 * 1024}
b := &disk.UsageStat{Total: 100 * 1024 * 1024 * 1024, Used: 42 * 1024 * 1024 * 1024}
assert.True(t, hasSameDiskUsage(a, b))
})
t.Run("returns true within tolerance", func(t *testing.T) {
a := &disk.UsageStat{Total: 100 * 1024 * 1024 * 1024, Used: 42 * 1024 * 1024 * 1024}
b := &disk.UsageStat{
Total: a.Total + toleranceBytes - 1,
Used: a.Used - toleranceBytes + 1,
}
assert.True(t, hasSameDiskUsage(a, b))
})
t.Run("returns false when total exceeds tolerance", func(t *testing.T) {
a := &disk.UsageStat{Total: 100 * 1024 * 1024 * 1024, Used: 42 * 1024 * 1024 * 1024}
b := &disk.UsageStat{
Total: a.Total + toleranceBytes + 1,
Used: a.Used,
}
assert.False(t, hasSameDiskUsage(a, b))
})
t.Run("returns false for nil or zero total", func(t *testing.T) {
assert.False(t, hasSameDiskUsage(nil, &disk.UsageStat{Total: 1, Used: 1}))
assert.False(t, hasSameDiskUsage(&disk.UsageStat{Total: 1, Used: 1}, nil))
assert.False(t, hasSameDiskUsage(&disk.UsageStat{Total: 0, Used: 0}, &disk.UsageStat{Total: 1, Used: 1}))
})
}
func TestInitializeDiskIoStatsResetsTrackedDevices(t *testing.T) {
agent := &Agent{
fsStats: map[string]*system.FsStats{
"sda": {},
"sdb": {},
},
fsNames: []string{"stale", "sda"},
}
agent.initializeDiskIoStats(map[string]disk.IOCountersStat{
"sda": {Name: "sda", ReadBytes: 10, WriteBytes: 20},
"sdb": {Name: "sdb", ReadBytes: 30, WriteBytes: 40},
})
assert.ElementsMatch(t, []string{"sda", "sdb"}, agent.fsNames)
assert.Len(t, agent.fsNames, 2)
assert.Equal(t, uint64(10), agent.fsStats["sda"].TotalRead)
assert.Equal(t, uint64(20), agent.fsStats["sda"].TotalWrite)
assert.False(t, agent.fsStats["sda"].Time.IsZero())
assert.False(t, agent.fsStats["sdb"].Time.IsZero())
agent.initializeDiskIoStats(map[string]disk.IOCountersStat{
"sdb": {Name: "sdb", ReadBytes: 50, WriteBytes: 60},
})
assert.Equal(t, []string{"sdb"}, agent.fsNames)
assert.Equal(t, uint64(50), agent.fsStats["sdb"].TotalRead)
assert.Equal(t, uint64(60), agent.fsStats["sdb"].TotalWrite)
}

View File

@@ -1,6 +1,7 @@
package agent
import (
"bufio"
"bytes"
"context"
"encoding/binary"
@@ -15,11 +16,14 @@ import (
"os"
"path"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/henrygd/beszel/agent/deltatracker"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/container"
"github.com/blang/semver"
@@ -28,6 +32,7 @@ import (
// ansiEscapePattern matches ANSI escape sequences (colors, cursor movement, etc.)
// This includes CSI sequences like \x1b[...m and simple escapes like \x1b[K
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b\][^\x07]*\x07|\x1b[@-Z\\-_]`)
var dockerContainerIDPattern = regexp.MustCompile(`^[a-fA-F0-9]{12,64}$`)
const (
// Docker API timeout in milliseconds
@@ -72,6 +77,7 @@ type dockerManager struct {
// cacheTimeMs -> DeltaTracker for network bytes sent/received
networkSentTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
networkRecvTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
retrySleep func(time.Duration)
}
// userAgentRoundTripper is a custom http.RoundTripper that adds a User-Agent header to all requests
@@ -333,15 +339,48 @@ func validateCpuPercentage(cpuPct float64, containerName string) error {
// updateContainerStatsValues updates the final stats values
func updateContainerStatsValues(stats *container.Stats, cpuPct float64, usedMemory uint64, sent_delta, recv_delta uint64, readTime time.Time) {
stats.Cpu = twoDecimals(cpuPct)
stats.Mem = bytesToMegabytes(float64(usedMemory))
stats.Cpu = utils.TwoDecimals(cpuPct)
stats.Mem = utils.BytesToMegabytes(float64(usedMemory))
stats.Bandwidth = [2]uint64{sent_delta, recv_delta}
// TODO(0.19+): stop populating NetworkSent/NetworkRecv (deprecated in 0.18.3)
stats.NetworkSent = bytesToMegabytes(float64(sent_delta))
stats.NetworkRecv = bytesToMegabytes(float64(recv_delta))
stats.NetworkSent = utils.BytesToMegabytes(float64(sent_delta))
stats.NetworkRecv = utils.BytesToMegabytes(float64(recv_delta))
stats.PrevReadTime = readTime
}
// convertContainerPortsToString formats the ports of a container into a sorted, deduplicated string.
// ctr.Ports is nilled out after processing so the slice is not accidentally reused.
func convertContainerPortsToString(ctr *container.ApiInfo) string {
if len(ctr.Ports) == 0 {
return ""
}
sort.Slice(ctr.Ports, func(i, j int) bool {
return ctr.Ports[i].PublicPort < ctr.Ports[j].PublicPort
})
var builder strings.Builder
seenPorts := make(map[uint16]struct{})
for _, p := range ctr.Ports {
_, ok := seenPorts[p.PublicPort]
if p.PublicPort == 0 || ok {
continue
}
seenPorts[p.PublicPort] = struct{}{}
if builder.Len() > 0 {
builder.WriteString(", ")
}
switch p.IP {
case "0.0.0.0", "::":
default:
builder.WriteString(p.IP)
builder.WriteByte(':')
}
builder.WriteString(strconv.Itoa(int(p.PublicPort)))
}
// clear ports slice so it doesn't get reused and blend into next response
ctr.Ports = nil
return builder.String()
}
func parseDockerStatus(status string) (string, container.DockerHealth) {
trimmed := strings.TrimSpace(status)
if trimmed == "" {
@@ -361,22 +400,60 @@ func parseDockerStatus(status string) (string, container.DockerHealth) {
statusText = trimmed
}
healthText := strings.ToLower(strings.TrimSpace(strings.TrimSuffix(trimmed[openIdx+1:], ")")))
healthText := strings.TrimSpace(strings.TrimSuffix(trimmed[openIdx+1:], ")"))
// Some Docker statuses include a "health:" prefix inside the parentheses.
// Strip it so it maps correctly to the known health states.
if colonIdx := strings.IndexRune(healthText, ':'); colonIdx != -1 {
prefix := strings.TrimSpace(healthText[:colonIdx])
prefix := strings.ToLower(strings.TrimSpace(healthText[:colonIdx]))
if prefix == "health" || prefix == "health status" {
healthText = strings.TrimSpace(healthText[colonIdx+1:])
}
}
if health, ok := container.DockerHealthStrings[healthText]; ok {
if health, ok := parseDockerHealthStatus(healthText); ok {
return statusText, health
}
return trimmed, container.DockerHealthNone
}
// parseDockerHealthStatus maps Docker health status strings to container.DockerHealth values
func parseDockerHealthStatus(status string) (container.DockerHealth, bool) {
health, ok := container.DockerHealthStrings[strings.ToLower(strings.TrimSpace(status))]
return health, ok
}
// getPodmanContainerHealth fetches container health status from the container inspect endpoint.
// Used for Podman which doesn't provide health status in the /containers/json endpoint as of March 2026.
// https://github.com/containers/podman/issues/27786
func (dm *dockerManager) getPodmanContainerHealth(containerID string) (container.DockerHealth, error) {
resp, err := dm.client.Get(fmt.Sprintf("http://localhost/containers/%s/json", url.PathEscape(containerID)))
if err != nil {
return container.DockerHealthNone, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return container.DockerHealthNone, fmt.Errorf("container inspect request failed: %s", resp.Status)
}
var inspectInfo struct {
State struct {
Health struct {
Status string
}
}
}
if err := json.NewDecoder(resp.Body).Decode(&inspectInfo); err != nil {
return container.DockerHealthNone, err
}
if health, ok := parseDockerHealthStatus(inspectInfo.State.Health.Status); ok {
return health, nil
}
return container.DockerHealthNone, nil
}
// Updates stats for individual container with cache-time-aware delta tracking
func (dm *dockerManager) updateContainerStats(ctr *container.ApiInfo, cacheTimeMs uint16) error {
name := ctr.Names[0][1:]
@@ -386,6 +463,21 @@ func (dm *dockerManager) updateContainerStats(ctr *container.ApiInfo, cacheTimeM
return err
}
statusText, health := parseDockerStatus(ctr.Status)
// Docker exposes Health.Status on /containers/json in API 1.52+.
// Podman currently requires falling back to the inspect endpoint as of March 2026.
// https://github.com/containers/podman/issues/27786
if ctr.Health.Status != "" {
if h, ok := parseDockerHealthStatus(ctr.Health.Status); ok {
health = h
}
} else if dm.usingPodman {
if podmanHealth, err := dm.getPodmanContainerHealth(ctr.IdShort); err == nil {
health = podmanHealth
}
}
dm.containerStatsMutex.Lock()
defer dm.containerStatsMutex.Unlock()
@@ -397,11 +489,13 @@ func (dm *dockerManager) updateContainerStats(ctr *container.ApiInfo, cacheTimeM
}
stats.Id = ctr.IdShort
statusText, health := parseDockerStatus(ctr.Status)
stats.Status = statusText
stats.Health = health
if len(ctr.Ports) > 0 {
stats.Ports = convertContainerPortsToString(ctr)
}
// reset current stats
stats.Cpu = 0
stats.Mem = 0
@@ -484,7 +578,7 @@ func (dm *dockerManager) deleteContainerStatsSync(id string) {
// Creates a new http client for Docker or Podman API
func newDockerManager() *dockerManager {
dockerHost, exists := GetEnv("DOCKER_HOST")
dockerHost, exists := utils.GetEnv("DOCKER_HOST")
if exists {
// return nil if set to empty string
if dockerHost == "" {
@@ -520,7 +614,7 @@ func newDockerManager() *dockerManager {
// configurable timeout
timeout := time.Millisecond * time.Duration(dockerTimeoutMs)
if t, set := GetEnv("DOCKER_TIMEOUT"); set {
if t, set := utils.GetEnv("DOCKER_TIMEOUT"); set {
timeout, err = time.ParseDuration(t)
if err != nil {
slog.Error(err.Error())
@@ -537,7 +631,7 @@ func newDockerManager() *dockerManager {
// Read container exclusion patterns from environment variable
var excludeContainers []string
if excludeStr, set := GetEnv("EXCLUDE_CONTAINERS"); set && excludeStr != "" {
if excludeStr, set := utils.GetEnv("EXCLUDE_CONTAINERS"); set && excludeStr != "" {
parts := strings.SplitSeq(excludeStr, ",")
for part := range parts {
trimmed := strings.TrimSpace(part)
@@ -565,6 +659,7 @@ func newDockerManager() *dockerManager {
lastCpuReadTime: make(map[uint16]map[string]time.Time),
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
retrySleep: time.Sleep,
}
// If using podman, return client
@@ -574,7 +669,7 @@ func newDockerManager() *dockerManager {
return manager
}
// this can take up to 5 seconds with retry, so run in goroutine
// run version check in goroutine to avoid blocking (server may not be ready and requires retries)
go manager.checkDockerVersion()
// give version check a chance to complete before returning
@@ -594,18 +689,18 @@ func (dm *dockerManager) checkDockerVersion() {
const versionMaxTries = 2
for i := 1; i <= versionMaxTries; i++ {
resp, err = dm.client.Get("http://localhost/version")
if err == nil {
if err == nil && resp.StatusCode == http.StatusOK {
break
}
if resp != nil {
resp.Body.Close()
}
if i < versionMaxTries {
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "error", err)
time.Sleep(5 * time.Second)
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "err", err, "response", resp)
dm.retrySleep(5 * time.Second)
}
}
if err != nil {
if err != nil || resp.StatusCode != http.StatusOK {
return
}
if err := dm.decode(resp, &versionInfo); err != nil {
@@ -647,9 +742,34 @@ func getDockerHost() string {
return scheme + socks[0]
}
func validateContainerID(containerID string) error {
if !dockerContainerIDPattern.MatchString(containerID) {
return fmt.Errorf("invalid container id")
}
return nil
}
func buildDockerContainerEndpoint(containerID, action string, query url.Values) (string, error) {
if err := validateContainerID(containerID); err != nil {
return "", err
}
u := &url.URL{
Scheme: "http",
Host: "localhost",
Path: fmt.Sprintf("/containers/%s/%s", url.PathEscape(containerID), action),
}
if len(query) > 0 {
u.RawQuery = query.Encode()
}
return u.String(), nil
}
// getContainerInfo fetches the inspection data for a container
func (dm *dockerManager) getContainerInfo(ctx context.Context, containerID string) ([]byte, error) {
endpoint := fmt.Sprintf("http://localhost/containers/%s/json", containerID)
endpoint, err := buildDockerContainerEndpoint(containerID, "json", nil)
if err != nil {
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return nil, err
@@ -680,7 +800,15 @@ func (dm *dockerManager) getContainerInfo(ctx context.Context, containerID strin
// getLogs fetches the logs for a container
func (dm *dockerManager) getLogs(ctx context.Context, containerID string) (string, error) {
endpoint := fmt.Sprintf("http://localhost/containers/%s/logs?stdout=1&stderr=1&tail=%d", containerID, dockerLogsTail)
query := url.Values{
"stdout": []string{"1"},
"stderr": []string{"1"},
"tail": []string{fmt.Sprintf("%d", dockerLogsTail)},
}
endpoint, err := buildDockerContainerEndpoint(containerID, "logs", query)
if err != nil {
return "", err
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return "", err
@@ -698,8 +826,17 @@ func (dm *dockerManager) getLogs(ctx context.Context, containerID string) (strin
}
var builder strings.Builder
multiplexed := resp.Header.Get("Content-Type") == "application/vnd.docker.multiplexed-stream"
if err := decodeDockerLogStream(resp.Body, &builder, multiplexed); err != nil {
contentType := resp.Header.Get("Content-Type")
multiplexed := strings.HasSuffix(contentType, "multiplexed-stream")
logReader := io.Reader(resp.Body)
if !multiplexed {
// Podman may return multiplexed logs without Content-Type. Sniff the first frame header
// with a small buffered reader only when the header check fails.
bufferedReader := bufio.NewReaderSize(resp.Body, 8)
multiplexed = detectDockerMultiplexedStream(bufferedReader)
logReader = bufferedReader
}
if err := decodeDockerLogStream(logReader, &builder, multiplexed); err != nil {
return "", err
}
@@ -711,6 +848,23 @@ func (dm *dockerManager) getLogs(ctx context.Context, containerID string) (strin
return logs, nil
}
func detectDockerMultiplexedStream(reader *bufio.Reader) bool {
const headerSize = 8
header, err := reader.Peek(headerSize)
if err != nil {
return false
}
if header[0] != 0x01 && header[0] != 0x02 {
return false
}
// Docker's stream framing header reserves bytes 1-3 as zero.
if header[1] != 0 || header[2] != 0 || header[3] != 0 {
return false
}
frameLen := binary.BigEndian.Uint32(header[4:])
return frameLen <= maxLogFrameSize
}
func decodeDockerLogStream(reader io.Reader, builder *strings.Builder, multiplexed bool) error {
if !multiplexed {
_, err := io.Copy(builder, io.LimitReader(reader, maxTotalLogSize))

View File

@@ -1,17 +1,24 @@
//go:build testing
// +build testing
package agent
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/http"
"net/http/httptest"
"os"
"strings"
"testing"
"time"
"github.com/henrygd/beszel/agent/deltatracker"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/container"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -19,6 +26,43 @@ import (
var defaultCacheTimeMs = uint16(60_000)
type recordingRoundTripper struct {
statusCode int
body string
contentType string
called bool
lastPath string
lastQuery map[string]string
}
type roundTripFunc func(*http.Request) (*http.Response, error)
func (fn roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {
return fn(req)
}
func (rt *recordingRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
rt.called = true
rt.lastPath = req.URL.EscapedPath()
rt.lastQuery = map[string]string{}
for key, values := range req.URL.Query() {
if len(values) > 0 {
rt.lastQuery[key] = values[0]
}
}
resp := &http.Response{
StatusCode: rt.statusCode,
Status: "200 OK",
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader(rt.body)),
Request: req,
}
if rt.contentType != "" {
resp.Header.Set("Content-Type", rt.contentType)
}
return resp, nil
}
// cycleCpuDeltas cycles the CPU tracking data for a specific cache time interval
func (dm *dockerManager) cycleCpuDeltas(cacheTimeMs uint16) {
// Clear the CPU tracking maps for this cache time interval
@@ -110,6 +154,94 @@ func TestCalculateMemoryUsage(t *testing.T) {
}
}
func TestBuildDockerContainerEndpoint(t *testing.T) {
t.Run("valid container ID builds escaped endpoint", func(t *testing.T) {
endpoint, err := buildDockerContainerEndpoint("0123456789ab", "json", nil)
require.NoError(t, err)
assert.Equal(t, "http://localhost/containers/0123456789ab/json", endpoint)
})
t.Run("invalid container ID is rejected", func(t *testing.T) {
_, err := buildDockerContainerEndpoint("../../version", "json", nil)
require.Error(t, err)
assert.Contains(t, err.Error(), "invalid container id")
})
}
func TestContainerDetailsRequestsValidateContainerID(t *testing.T) {
rt := &recordingRoundTripper{
statusCode: 200,
body: `{"Config":{"Env":["SECRET=1"]}}`,
}
dm := &dockerManager{
client: &http.Client{Transport: rt},
}
_, err := dm.getContainerInfo(context.Background(), "../version")
require.Error(t, err)
assert.Contains(t, err.Error(), "invalid container id")
assert.False(t, rt.called, "request should be rejected before dispatching to Docker API")
}
func TestContainerDetailsRequestsUseExpectedDockerPaths(t *testing.T) {
t.Run("container info uses container json endpoint", func(t *testing.T) {
rt := &recordingRoundTripper{
statusCode: 200,
body: `{"Config":{"Env":["SECRET=1"]},"Name":"demo"}`,
}
dm := &dockerManager{
client: &http.Client{Transport: rt},
}
body, err := dm.getContainerInfo(context.Background(), "0123456789ab")
require.NoError(t, err)
assert.True(t, rt.called)
assert.Equal(t, "/containers/0123456789ab/json", rt.lastPath)
assert.NotContains(t, string(body), "SECRET=1", "sensitive env vars should be removed")
})
t.Run("container logs uses expected endpoint and query params", func(t *testing.T) {
rt := &recordingRoundTripper{
statusCode: 200,
body: "line1\nline2\n",
}
dm := &dockerManager{
client: &http.Client{Transport: rt},
}
logs, err := dm.getLogs(context.Background(), "abcdef123456")
require.NoError(t, err)
assert.True(t, rt.called)
assert.Equal(t, "/containers/abcdef123456/logs", rt.lastPath)
assert.Equal(t, "1", rt.lastQuery["stdout"])
assert.Equal(t, "1", rt.lastQuery["stderr"])
assert.Equal(t, "200", rt.lastQuery["tail"])
assert.Equal(t, "line1\nline2\n", logs)
})
}
func TestGetPodmanContainerHealth(t *testing.T) {
called := false
dm := &dockerManager{
client: &http.Client{Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) {
called = true
assert.Equal(t, "/containers/0123456789ab/json", req.URL.EscapedPath())
return &http.Response{
StatusCode: http.StatusOK,
Status: "200 OK",
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader(`{"State":{"Health":{"Status":"healthy"}}}`)),
Request: req,
}, nil
})},
}
health, err := dm.getPodmanContainerHealth("0123456789ab")
require.NoError(t, err)
assert.True(t, called)
assert.Equal(t, container.DockerHealthHealthy, health)
}
func TestValidateCpuPercentage(t *testing.T) {
tests := []struct {
name string
@@ -195,48 +327,6 @@ func TestUpdateContainerStatsValues(t *testing.T) {
assert.Equal(t, testTime, stats.PrevReadTime)
}
func TestTwoDecimals(t *testing.T) {
tests := []struct {
name string
input float64
expected float64
}{
{"round down", 1.234, 1.23},
{"round half up", 1.235, 1.24}, // math.Round rounds half up
{"no rounding needed", 1.23, 1.23},
{"negative number", -1.235, -1.24}, // math.Round rounds half up (more negative)
{"zero", 0.0, 0.0},
{"large number", 123.456, 123.46}, // rounds 5 up
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := twoDecimals(tt.input)
assert.Equal(t, tt.expected, result)
})
}
}
func TestBytesToMegabytes(t *testing.T) {
tests := []struct {
name string
input float64
expected float64
}{
{"1 MB", 1048576, 1.0},
{"512 KB", 524288, 0.5},
{"zero", 0, 0},
{"large value", 1073741824, 1024}, // 1 GB = 1024 MB
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := bytesToMegabytes(tt.input)
assert.Equal(t, tt.expected, result)
})
}
}
func TestInitializeCpuTracking(t *testing.T) {
dm := &dockerManager{
lastCpuContainer: make(map[uint16]map[string]uint64),
@@ -379,6 +469,117 @@ func TestDockerManagerCreation(t *testing.T) {
assert.NotNil(t, dm.networkRecvTrackers)
}
func TestCheckDockerVersion(t *testing.T) {
tests := []struct {
name string
responses []struct {
statusCode int
body string
}
expectedGood bool
expectedRequests int
}{
{
name: "200 with good version on first try",
responses: []struct {
statusCode int
body string
}{
{http.StatusOK, `{"Version":"25.0.1"}`},
},
expectedGood: true,
expectedRequests: 1,
},
{
name: "200 with old version on first try",
responses: []struct {
statusCode int
body string
}{
{http.StatusOK, `{"Version":"24.0.7"}`},
},
expectedGood: false,
expectedRequests: 1,
},
{
name: "non-200 then 200 with good version",
responses: []struct {
statusCode int
body string
}{
{http.StatusServiceUnavailable, `"not ready"`},
{http.StatusOK, `{"Version":"25.1.0"}`},
},
expectedGood: true,
expectedRequests: 2,
},
{
name: "non-200 on all retries",
responses: []struct {
statusCode int
body string
}{
{http.StatusInternalServerError, `"error"`},
{http.StatusUnauthorized, `"error"`},
},
expectedGood: false,
expectedRequests: 2,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
requestCount := 0
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
idx := requestCount
requestCount++
if idx >= len(tt.responses) {
idx = len(tt.responses) - 1
}
w.WriteHeader(tt.responses[idx].statusCode)
fmt.Fprint(w, tt.responses[idx].body)
}))
defer server.Close()
dm := &dockerManager{
client: &http.Client{
Transport: &http.Transport{
DialContext: func(_ context.Context, network, _ string) (net.Conn, error) {
return net.Dial(network, server.Listener.Addr().String())
},
},
},
retrySleep: func(time.Duration) {},
}
dm.checkDockerVersion()
assert.Equal(t, tt.expectedGood, dm.goodDockerVersion)
assert.Equal(t, tt.expectedRequests, requestCount)
})
}
t.Run("request error on all retries", func(t *testing.T) {
requestCount := 0
dm := &dockerManager{
client: &http.Client{
Transport: &http.Transport{
DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
requestCount++
return nil, errors.New("connection refused")
},
},
},
retrySleep: func(time.Duration) {},
}
dm.checkDockerVersion()
assert.False(t, dm.goodDockerVersion)
assert.Equal(t, 2, requestCount)
})
}
func TestCycleCpuDeltas(t *testing.T) {
dm := &dockerManager{
lastCpuContainer: map[uint16]map[string]uint64{
@@ -691,14 +892,50 @@ func TestContainerStatsEndToEndWithRealData(t *testing.T) {
updateContainerStatsValues(testStats, cpuPct, usedMemory, 1000000, 500000, testTime)
assert.Equal(t, cpuPct, testStats.Cpu)
assert.Equal(t, bytesToMegabytes(float64(usedMemory)), testStats.Mem)
assert.Equal(t, utils.BytesToMegabytes(float64(usedMemory)), testStats.Mem)
assert.Equal(t, [2]uint64{1000000, 500000}, testStats.Bandwidth)
// Deprecated fields still populated for backward compatibility with older hubs
assert.Equal(t, bytesToMegabytes(1000000), testStats.NetworkSent)
assert.Equal(t, bytesToMegabytes(500000), testStats.NetworkRecv)
assert.Equal(t, utils.BytesToMegabytes(1000000), testStats.NetworkSent)
assert.Equal(t, utils.BytesToMegabytes(500000), testStats.NetworkRecv)
assert.Equal(t, testTime, testStats.PrevReadTime)
}
func TestGetLogsDetectsMultiplexedWithoutContentType(t *testing.T) {
// Docker multiplexed frame: [stream][0,0,0][len(4 bytes BE)][payload]
frame := []byte{
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05,
'H', 'e', 'l', 'l', 'o',
}
rt := &recordingRoundTripper{
statusCode: 200,
body: string(frame),
// Intentionally omit content type to simulate Podman behavior.
}
dm := &dockerManager{
client: &http.Client{Transport: rt},
}
logs, err := dm.getLogs(context.Background(), "abcdef123456")
require.NoError(t, err)
assert.Equal(t, "Hello", logs)
}
func TestGetLogsDoesNotMisclassifyRawStreamAsMultiplexed(t *testing.T) {
// Starts with 0x01, but doesn't match Docker frame signature (reserved bytes aren't all zero).
raw := []byte{0x01, 0x02, 0x03, 0x04, 'r', 'a', 'w'}
rt := &recordingRoundTripper{
statusCode: 200,
body: string(raw),
}
dm := &dockerManager{
client: &http.Client{Transport: rt},
}
logs, err := dm.getLogs(context.Background(), "abcdef123456")
require.NoError(t, err)
assert.Equal(t, raw, []byte(logs))
}
func TestEdgeCasesWithRealData(t *testing.T) {
// Test with minimal container stats
minimalStats := &container.ApiStats{
@@ -920,6 +1157,18 @@ func TestParseDockerStatus(t *testing.T) {
expectedStatus: "",
expectedHealth: container.DockerHealthNone,
},
{
name: "status health with health: prefix",
input: "Up 5 minutes (health: starting)",
expectedStatus: "Up 5 minutes",
expectedHealth: container.DockerHealthStarting,
},
{
name: "status health with health status: prefix",
input: "Up 10 minutes (health status: unhealthy)",
expectedStatus: "Up 10 minutes",
expectedHealth: container.DockerHealthUnhealthy,
},
}
for _, tt := range tests {
@@ -931,6 +1180,84 @@ func TestParseDockerStatus(t *testing.T) {
}
}
func TestParseDockerHealthStatus(t *testing.T) {
tests := []struct {
input string
expectedHealth container.DockerHealth
expectedOk bool
}{
{"healthy", container.DockerHealthHealthy, true},
{"unhealthy", container.DockerHealthUnhealthy, true},
{"starting", container.DockerHealthStarting, true},
{"none", container.DockerHealthNone, true},
{" Healthy ", container.DockerHealthHealthy, true},
{"unknown", container.DockerHealthNone, false},
{"", container.DockerHealthNone, false},
}
for _, tt := range tests {
t.Run(tt.input, func(t *testing.T) {
health, ok := parseDockerHealthStatus(tt.input)
assert.Equal(t, tt.expectedHealth, health)
assert.Equal(t, tt.expectedOk, ok)
})
}
}
func TestUpdateContainerStatsUsesPodmanInspectHealthFallback(t *testing.T) {
var requestedPaths []string
dm := &dockerManager{
client: &http.Client{Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) {
requestedPaths = append(requestedPaths, req.URL.EscapedPath())
switch req.URL.EscapedPath() {
case "/containers/0123456789ab/stats":
return &http.Response{
StatusCode: http.StatusOK,
Status: "200 OK",
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader(`{
"read":"2026-03-15T21:26:59Z",
"cpu_stats":{"cpu_usage":{"total_usage":1000},"system_cpu_usage":2000},
"memory_stats":{"usage":1048576,"stats":{"inactive_file":262144}},
"networks":{"eth0":{"rx_bytes":0,"tx_bytes":0}}
}`)),
Request: req,
}, nil
case "/containers/0123456789ab/json":
return &http.Response{
StatusCode: http.StatusOK,
Status: "200 OK",
Header: make(http.Header),
Body: io.NopCloser(strings.NewReader(`{"State":{"Health":{"Status":"healthy"}}}`)),
Request: req,
}, nil
default:
return nil, fmt.Errorf("unexpected path: %s", req.URL.EscapedPath())
}
})},
containerStatsMap: make(map[string]*container.Stats),
apiStats: &container.ApiStats{},
usingPodman: true,
lastCpuContainer: make(map[uint16]map[string]uint64),
lastCpuSystem: make(map[uint16]map[string]uint64),
lastCpuReadTime: make(map[uint16]map[string]time.Time),
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
}
ctr := &container.ApiInfo{
IdShort: "0123456789ab",
Names: []string{"/beszel"},
Status: "Up 2 minutes",
Image: "beszel:latest",
}
err := dm.updateContainerStats(ctr, defaultCacheTimeMs)
require.NoError(t, err)
assert.Equal(t, []string{"/containers/0123456789ab/stats", "/containers/0123456789ab/json"}, requestedPaths)
assert.Equal(t, container.DockerHealthHealthy, dm.containerStatsMap[ctr.IdShort].Health)
assert.Equal(t, "Up 2 minutes", dm.containerStatsMap[ctr.IdShort].Status)
}
func TestConstantsAndUtilityFunctions(t *testing.T) {
// Test constants are properly defined
assert.Equal(t, uint16(60000), defaultCacheTimeMs)
@@ -940,13 +1267,13 @@ func TestConstantsAndUtilityFunctions(t *testing.T) {
assert.Equal(t, 5*1024*1024, maxTotalLogSize) // 5MB
// Test utility functions
assert.Equal(t, 1.5, twoDecimals(1.499))
assert.Equal(t, 1.5, twoDecimals(1.5))
assert.Equal(t, 1.5, twoDecimals(1.501))
assert.Equal(t, 1.5, utils.TwoDecimals(1.499))
assert.Equal(t, 1.5, utils.TwoDecimals(1.5))
assert.Equal(t, 1.5, utils.TwoDecimals(1.501))
assert.Equal(t, 1.0, bytesToMegabytes(1048576)) // 1 MB
assert.Equal(t, 0.5, bytesToMegabytes(524288)) // 512 KB
assert.Equal(t, 0.0, bytesToMegabytes(0))
assert.Equal(t, 1.0, utils.BytesToMegabytes(1048576)) // 1 MB
assert.Equal(t, 0.5, utils.BytesToMegabytes(524288)) // 512 KB
assert.Equal(t, 0.0, utils.BytesToMegabytes(0))
}
func TestDecodeDockerLogStream(t *testing.T) {
@@ -1246,3 +1573,99 @@ func TestAnsiEscapePattern(t *testing.T) {
})
}
}
func TestConvertContainerPortsToString(t *testing.T) {
type port = struct {
PublicPort uint16
IP string
}
tests := []struct {
name string
ports []port
expected string
}{
{
name: "empty ports",
ports: nil,
expected: "",
},
{
name: "single port",
ports: []port{
{PublicPort: 80, IP: "0.0.0.0"},
},
expected: "80",
},
{
name: "single port with non-default IP",
ports: []port{
{PublicPort: 80, IP: "1.2.3.4"},
},
expected: "1.2.3.4:80",
},
{
name: "ipv6 default ip",
ports: []port{
{PublicPort: 80, IP: "::"},
},
expected: "80",
},
{
name: "zero PublicPort is skipped",
ports: []port{
{PublicPort: 0, IP: "0.0.0.0"},
{PublicPort: 80, IP: "0.0.0.0"},
},
expected: "80",
},
{
name: "ports sorted ascending by PublicPort",
ports: []port{
{PublicPort: 443, IP: "0.0.0.0"},
{PublicPort: 80, IP: "0.0.0.0"},
{PublicPort: 8080, IP: "0.0.0.0"},
},
expected: "80, 443, 8080",
},
{
name: "duplicates are deduplicated",
ports: []port{
{PublicPort: 80, IP: "0.0.0.0"},
{PublicPort: 80, IP: "0.0.0.0"},
{PublicPort: 443, IP: "0.0.0.0"},
},
expected: "80, 443",
},
{
name: "multiple ports with different IPs",
ports: []port{
{PublicPort: 80, IP: "0.0.0.0"},
{PublicPort: 443, IP: "1.2.3.4"},
},
expected: "80, 1.2.3.4:443",
},
{
name: "ports slice is nilled after call",
ports: []port{
{PublicPort: 8080, IP: "0.0.0.0"},
},
expected: "8080",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ctr := &container.ApiInfo{}
for _, p := range tt.ports {
ctr.Ports = append(ctr.Ports, struct {
PublicPort uint16
IP string
}{PublicPort: p.PublicPort, IP: p.IP})
}
result := convertContainerPortsToString(ctr)
assert.Equal(t, tt.expected, result)
// Ports slice must be cleared to prevent bleed-over into the next response
assert.Nil(t, ctr.Ports, "ctr.Ports should be nil after formatContainerPorts")
})
}
}

95
agent/emmc_common.go Normal file
View File

@@ -0,0 +1,95 @@
package agent
import (
"fmt"
"strconv"
"strings"
)
func isEmmcBlockName(name string) bool {
if !strings.HasPrefix(name, "mmcblk") {
return false
}
suffix := strings.TrimPrefix(name, "mmcblk")
if suffix == "" {
return false
}
for _, c := range suffix {
if c < '0' || c > '9' {
return false
}
}
return true
}
func parseHexOrDecByte(s string) (uint8, bool) {
s = strings.TrimSpace(s)
if s == "" {
return 0, false
}
base := 10
if strings.HasPrefix(s, "0x") || strings.HasPrefix(s, "0X") {
base = 16
s = s[2:]
}
parsed, err := strconv.ParseUint(s, base, 8)
if err != nil {
return 0, false
}
return uint8(parsed), true
}
func parseHexBytePair(s string) (uint8, uint8, bool) {
fields := strings.Fields(s)
if len(fields) < 2 {
return 0, 0, false
}
a, okA := parseHexOrDecByte(fields[0])
b, okB := parseHexOrDecByte(fields[1])
if !okA && !okB {
return 0, 0, false
}
return a, b, true
}
func emmcSmartStatus(preEOL uint8) string {
switch preEOL {
case 0x01:
return "PASSED"
case 0x02:
return "WARNING"
case 0x03:
return "FAILED"
default:
return "UNKNOWN"
}
}
func emmcPreEOLString(preEOL uint8) string {
switch preEOL {
case 0x01:
return "0x01 (normal)"
case 0x02:
return "0x02 (warning)"
case 0x03:
return "0x03 (urgent)"
default:
return fmt.Sprintf("0x%02x", preEOL)
}
}
func emmcLifeTimeString(v uint8) string {
// JEDEC eMMC: 0x01..0x0A => 0-100% used in 10% steps, 0x0B => exceeded.
switch {
case v == 0:
return "0x00 (not reported)"
case v >= 0x01 && v <= 0x0A:
low := int(v-1) * 10
high := int(v) * 10
return fmt.Sprintf("0x%02x (%d-%d%% used)", v, low, high)
case v == 0x0B:
return "0x0b (>100% used)"
default:
return fmt.Sprintf("0x%02x", v)
}
}

78
agent/emmc_common_test.go Normal file
View File

@@ -0,0 +1,78 @@
package agent
import "testing"
func TestParseHexOrDecByte(t *testing.T) {
tests := []struct {
in string
want uint8
ok bool
}{
{"0x01", 1, true},
{"0X0b", 11, true},
{"01", 1, true},
{" 3 ", 3, true},
{"", 0, false},
{"0x", 0, false},
{"nope", 0, false},
}
for _, tt := range tests {
got, ok := parseHexOrDecByte(tt.in)
if ok != tt.ok || got != tt.want {
t.Fatalf("parseHexOrDecByte(%q) = (%d,%v), want (%d,%v)", tt.in, got, ok, tt.want, tt.ok)
}
}
}
func TestParseHexBytePair(t *testing.T) {
a, b, ok := parseHexBytePair("0x01 0x02\n")
if !ok || a != 1 || b != 2 {
t.Fatalf("parseHexBytePair hex = (%d,%d,%v), want (1,2,true)", a, b, ok)
}
a, b, ok = parseHexBytePair("01 02")
if !ok || a != 1 || b != 2 {
t.Fatalf("parseHexBytePair dec = (%d,%d,%v), want (1,2,true)", a, b, ok)
}
_, _, ok = parseHexBytePair("0x01")
if ok {
t.Fatalf("parseHexBytePair short input ok=true, want false")
}
}
func TestEmmcSmartStatus(t *testing.T) {
if got := emmcSmartStatus(0x01); got != "PASSED" {
t.Fatalf("emmcSmartStatus(0x01) = %q, want PASSED", got)
}
if got := emmcSmartStatus(0x02); got != "WARNING" {
t.Fatalf("emmcSmartStatus(0x02) = %q, want WARNING", got)
}
if got := emmcSmartStatus(0x03); got != "FAILED" {
t.Fatalf("emmcSmartStatus(0x03) = %q, want FAILED", got)
}
if got := emmcSmartStatus(0x00); got != "UNKNOWN" {
t.Fatalf("emmcSmartStatus(0x00) = %q, want UNKNOWN", got)
}
}
func TestIsEmmcBlockName(t *testing.T) {
cases := []struct {
name string
ok bool
}{
{"mmcblk0", true},
{"mmcblk1", true},
{"mmcblk10", true},
{"mmcblk0p1", false},
{"sda", false},
{"mmcblk", false},
{"mmcblkA", false},
}
for _, c := range cases {
if got := isEmmcBlockName(c.name); got != c.ok {
t.Fatalf("isEmmcBlockName(%q) = %v, want %v", c.name, got, c.ok)
}
}
}

215
agent/emmc_linux.go Normal file
View File

@@ -0,0 +1,215 @@
//go:build linux
package agent
import (
"os"
"path/filepath"
"strconv"
"strings"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/smart"
)
// emmcSysfsRoot is a test hook; production value is "/sys".
var emmcSysfsRoot = "/sys"
type emmcHealth struct {
model string
serial string
revision string
capacity uint64
preEOL uint8
lifeA uint8
lifeB uint8
}
func scanEmmcDevices() []*DeviceInfo {
blockDir := filepath.Join(emmcSysfsRoot, "class", "block")
entries, err := os.ReadDir(blockDir)
if err != nil {
return nil
}
devices := make([]*DeviceInfo, 0, 2)
for _, ent := range entries {
name := ent.Name()
if !isEmmcBlockName(name) {
continue
}
deviceDir := filepath.Join(blockDir, name, "device")
if !hasEmmcHealthFiles(deviceDir) {
continue
}
devPath := filepath.Join("/dev", name)
devices = append(devices, &DeviceInfo{
Name: devPath,
Type: "emmc",
InfoName: devPath + " [eMMC]",
Protocol: "MMC",
})
}
return devices
}
func (sm *SmartManager) collectEmmcHealth(deviceInfo *DeviceInfo) (bool, error) {
if deviceInfo == nil || deviceInfo.Name == "" {
return false, nil
}
base := filepath.Base(deviceInfo.Name)
if !isEmmcBlockName(base) && !strings.EqualFold(deviceInfo.Type, "emmc") && !strings.EqualFold(deviceInfo.Type, "mmc") {
return false, nil
}
health, ok := readEmmcHealth(base)
if !ok {
return false, nil
}
// Normalize the device type to keep pruning logic stable across refreshes.
deviceInfo.Type = "emmc"
key := health.serial
if key == "" {
key = filepath.Join("/dev", base)
}
status := emmcSmartStatus(health.preEOL)
attrs := []*smart.SmartAttribute{
{
Name: "PreEOLInfo",
RawValue: uint64(health.preEOL),
RawString: emmcPreEOLString(health.preEOL),
},
{
Name: "DeviceLifeTimeEstA",
RawValue: uint64(health.lifeA),
RawString: emmcLifeTimeString(health.lifeA),
},
{
Name: "DeviceLifeTimeEstB",
RawValue: uint64(health.lifeB),
RawString: emmcLifeTimeString(health.lifeB),
},
}
sm.Lock()
defer sm.Unlock()
if _, exists := sm.SmartDataMap[key]; !exists {
sm.SmartDataMap[key] = &smart.SmartData{}
}
data := sm.SmartDataMap[key]
data.ModelName = health.model
data.SerialNumber = health.serial
data.FirmwareVersion = health.revision
data.Capacity = health.capacity
data.Temperature = 0
data.SmartStatus = status
data.DiskName = filepath.Join("/dev", base)
data.DiskType = "emmc"
data.Attributes = attrs
return true, nil
}
func readEmmcHealth(blockName string) (emmcHealth, bool) {
var out emmcHealth
if !isEmmcBlockName(blockName) {
return out, false
}
deviceDir := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "device")
preEOL, okPre := readHexByteFile(filepath.Join(deviceDir, "pre_eol_info"))
// Some kernels expose EXT_CSD lifetime via "life_time" (two bytes), others as
// separate files. Support both.
lifeA, lifeB, okLife := readLifeTime(deviceDir)
if !okPre && !okLife {
return out, false
}
out.preEOL = preEOL
out.lifeA = lifeA
out.lifeB = lifeB
out.model = utils.ReadStringFile(filepath.Join(deviceDir, "name"))
out.serial = utils.ReadStringFile(filepath.Join(deviceDir, "serial"))
out.revision = utils.ReadStringFile(filepath.Join(deviceDir, "prv"))
if capBytes, ok := readBlockCapacityBytes(blockName); ok {
out.capacity = capBytes
}
return out, true
}
func readLifeTime(deviceDir string) (uint8, uint8, bool) {
if content, ok := utils.ReadStringFileOK(filepath.Join(deviceDir, "life_time")); ok {
a, b, ok := parseHexBytePair(content)
return a, b, ok
}
a, okA := readHexByteFile(filepath.Join(deviceDir, "device_life_time_est_typ_a"))
b, okB := readHexByteFile(filepath.Join(deviceDir, "device_life_time_est_typ_b"))
if okA || okB {
return a, b, true
}
return 0, 0, false
}
func readBlockCapacityBytes(blockName string) (uint64, bool) {
sizePath := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "size")
lbsPath := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "queue", "logical_block_size")
sizeStr, ok := utils.ReadStringFileOK(sizePath)
if !ok {
return 0, false
}
sectors, err := strconv.ParseUint(sizeStr, 10, 64)
if err != nil || sectors == 0 {
return 0, false
}
lbsStr, ok := utils.ReadStringFileOK(lbsPath)
logicalBlockSize := uint64(512)
if ok {
if parsed, err := strconv.ParseUint(lbsStr, 10, 64); err == nil && parsed > 0 {
logicalBlockSize = parsed
}
}
return sectors * logicalBlockSize, true
}
func readHexByteFile(path string) (uint8, bool) {
content, ok := utils.ReadStringFileOK(path)
if !ok {
return 0, false
}
b, ok := parseHexOrDecByte(content)
return b, ok
}
func hasEmmcHealthFiles(deviceDir string) bool {
entries, err := os.ReadDir(deviceDir)
if err != nil {
return false
}
for _, ent := range entries {
switch ent.Name() {
case "pre_eol_info", "life_time", "device_life_time_est_typ_a", "device_life_time_est_typ_b":
return true
}
}
return false
}

80
agent/emmc_linux_test.go Normal file
View File

@@ -0,0 +1,80 @@
//go:build linux
package agent
import (
"os"
"path/filepath"
"testing"
"github.com/henrygd/beszel/internal/entities/smart"
)
func TestEmmcMockSysfsScanAndCollect(t *testing.T) {
tmp := t.TempDir()
prev := emmcSysfsRoot
emmcSysfsRoot = tmp
t.Cleanup(func() { emmcSysfsRoot = prev })
// Fake: /sys/class/block/mmcblk0
mmcDeviceDir := filepath.Join(tmp, "class", "block", "mmcblk0", "device")
mmcQueueDir := filepath.Join(tmp, "class", "block", "mmcblk0", "queue")
if err := os.MkdirAll(mmcDeviceDir, 0o755); err != nil {
t.Fatal(err)
}
if err := os.MkdirAll(mmcQueueDir, 0o755); err != nil {
t.Fatal(err)
}
write := func(path, content string) {
t.Helper()
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
t.Fatal(err)
}
}
write(filepath.Join(mmcDeviceDir, "pre_eol_info"), "0x02\n")
write(filepath.Join(mmcDeviceDir, "life_time"), "0x04 0x05\n")
write(filepath.Join(mmcDeviceDir, "name"), "H26M52103FMR\n")
write(filepath.Join(mmcDeviceDir, "serial"), "01234567\n")
write(filepath.Join(mmcDeviceDir, "prv"), "0x08\n")
write(filepath.Join(mmcQueueDir, "logical_block_size"), "512\n")
write(filepath.Join(tmp, "class", "block", "mmcblk0", "size"), "1024\n") // sectors
devs := scanEmmcDevices()
if len(devs) != 1 {
t.Fatalf("scanEmmcDevices() = %d devices, want 1", len(devs))
}
if devs[0].Name != "/dev/mmcblk0" || devs[0].Type != "emmc" {
t.Fatalf("scanEmmcDevices()[0] = %+v, want Name=/dev/mmcblk0 Type=emmc", devs[0])
}
sm := &SmartManager{SmartDataMap: map[string]*smart.SmartData{}}
ok, err := sm.collectEmmcHealth(devs[0])
if err != nil || !ok {
t.Fatalf("collectEmmcHealth() = (ok=%v, err=%v), want (true,nil)", ok, err)
}
if len(sm.SmartDataMap) != 1 {
t.Fatalf("SmartDataMap len=%d, want 1", len(sm.SmartDataMap))
}
var got *smart.SmartData
for _, v := range sm.SmartDataMap {
got = v
break
}
if got == nil {
t.Fatalf("SmartDataMap value nil")
}
if got.DiskType != "emmc" || got.DiskName != "/dev/mmcblk0" {
t.Fatalf("disk fields = (type=%q name=%q), want (emmc,/dev/mmcblk0)", got.DiskType, got.DiskName)
}
if got.SmartStatus != "WARNING" {
t.Fatalf("SmartStatus=%q, want WARNING", got.SmartStatus)
}
if got.SerialNumber != "01234567" || got.ModelName == "" || got.Capacity == 0 {
t.Fatalf("identity fields = (model=%q serial=%q cap=%d), want non-empty model, serial 01234567, cap>0", got.ModelName, got.SerialNumber, got.Capacity)
}
if len(got.Attributes) < 3 {
t.Fatalf("attributes len=%d, want >= 3", len(got.Attributes))
}
}

14
agent/emmc_stub.go Normal file
View File

@@ -0,0 +1,14 @@
//go:build !linux
package agent
// Non-Linux builds: eMMC health via sysfs is not available.
func scanEmmcDevices() []*DeviceInfo {
return nil
}
func (sm *SmartManager) collectEmmcHealth(deviceInfo *DeviceInfo) (bool, error) {
return false, nil
}

87
agent/fingerprint.go Normal file
View File

@@ -0,0 +1,87 @@
package agent
import (
"crypto/sha256"
"encoding/hex"
"errors"
"os"
"path/filepath"
"strings"
"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/host"
)
const fingerprintFileName = "fingerprint"
// knownBadUUID is a commonly known "product_uuid" that is not unique across systems.
const knownBadUUID = "03000200-0400-0500-0006-000700080009"
// GetFingerprint returns the agent fingerprint. It first tries to read a saved
// fingerprint from the data directory. If not found (or dataDir is empty), it
// generates one from system properties. The hostname and cpuModel parameters are
// used as fallback material if host.HostID() fails. If either is empty, they
// are fetched from the system automatically.
//
// If a new fingerprint is generated and a dataDir is provided, it is saved.
func GetFingerprint(dataDir, hostname, cpuModel string) string {
if dataDir != "" {
if fp, err := readFingerprint(dataDir); err == nil {
return fp
}
}
fp := generateFingerprint(hostname, cpuModel)
if dataDir != "" {
_ = SaveFingerprint(dataDir, fp)
}
return fp
}
// generateFingerprint creates a fingerprint from system properties.
// It tries host.HostID() first, falling back to hostname + cpuModel.
// If hostname or cpuModel are empty, they are fetched from the system.
func generateFingerprint(hostname, cpuModel string) string {
fingerprint, err := host.HostID()
if err != nil || fingerprint == "" || fingerprint == knownBadUUID {
if hostname == "" {
hostname, _ = os.Hostname()
}
if cpuModel == "" {
if info, err := cpu.Info(); err == nil && len(info) > 0 {
cpuModel = info[0].ModelName
}
}
fingerprint = hostname + cpuModel
}
sum := sha256.Sum256([]byte(fingerprint))
return hex.EncodeToString(sum[:24])
}
// readFingerprint reads the saved fingerprint from the data directory.
func readFingerprint(dataDir string) (string, error) {
fp, err := os.ReadFile(filepath.Join(dataDir, fingerprintFileName))
if err != nil {
return "", err
}
s := strings.TrimSpace(string(fp))
if s == "" {
return "", errors.New("fingerprint file is empty")
}
return s, nil
}
// SaveFingerprint writes the fingerprint to the data directory.
func SaveFingerprint(dataDir, fingerprint string) error {
return os.WriteFile(filepath.Join(dataDir, fingerprintFileName), []byte(fingerprint), 0o644)
}
// DeleteFingerprint removes the saved fingerprint file from the data directory.
// Returns nil if the file does not exist (idempotent).
func DeleteFingerprint(dataDir string) error {
err := os.Remove(filepath.Join(dataDir, fingerprintFileName))
if errors.Is(err, os.ErrNotExist) {
return nil
}
return err
}

102
agent/fingerprint_test.go Normal file
View File

@@ -0,0 +1,102 @@
//go:build testing
package agent
import (
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestGetFingerprint(t *testing.T) {
t.Run("reads existing fingerprint from file", func(t *testing.T) {
dir := t.TempDir()
expected := "abc123def456"
err := os.WriteFile(filepath.Join(dir, fingerprintFileName), []byte(expected), 0644)
require.NoError(t, err)
fp := GetFingerprint(dir, "", "")
assert.Equal(t, expected, fp)
})
t.Run("trims whitespace from file", func(t *testing.T) {
dir := t.TempDir()
err := os.WriteFile(filepath.Join(dir, fingerprintFileName), []byte(" abc123 \n"), 0644)
require.NoError(t, err)
fp := GetFingerprint(dir, "", "")
assert.Equal(t, "abc123", fp)
})
t.Run("generates fingerprint when file does not exist", func(t *testing.T) {
dir := t.TempDir()
fp := GetFingerprint(dir, "", "")
assert.NotEmpty(t, fp)
})
t.Run("generates fingerprint when dataDir is empty", func(t *testing.T) {
fp := GetFingerprint("", "", "")
assert.NotEmpty(t, fp)
})
t.Run("generates consistent fingerprint for same inputs", func(t *testing.T) {
fp1 := GetFingerprint("", "myhost", "mycpu")
fp2 := GetFingerprint("", "myhost", "mycpu")
assert.Equal(t, fp1, fp2)
})
t.Run("prefers saved fingerprint over generated", func(t *testing.T) {
dir := t.TempDir()
require.NoError(t, SaveFingerprint(dir, "saved-fp"))
fp := GetFingerprint(dir, "anyhost", "anycpu")
assert.Equal(t, "saved-fp", fp)
})
}
func TestSaveFingerprint(t *testing.T) {
t.Run("saves fingerprint to file", func(t *testing.T) {
dir := t.TempDir()
err := SaveFingerprint(dir, "abc123")
require.NoError(t, err)
content, err := os.ReadFile(filepath.Join(dir, fingerprintFileName))
require.NoError(t, err)
assert.Equal(t, "abc123", string(content))
})
t.Run("overwrites existing fingerprint", func(t *testing.T) {
dir := t.TempDir()
require.NoError(t, SaveFingerprint(dir, "old"))
require.NoError(t, SaveFingerprint(dir, "new"))
content, err := os.ReadFile(filepath.Join(dir, fingerprintFileName))
require.NoError(t, err)
assert.Equal(t, "new", string(content))
})
}
func TestDeleteFingerprint(t *testing.T) {
t.Run("deletes existing fingerprint", func(t *testing.T) {
dir := t.TempDir()
fp := filepath.Join(dir, fingerprintFileName)
err := os.WriteFile(fp, []byte("abc123"), 0644)
require.NoError(t, err)
err = DeleteFingerprint(dir)
require.NoError(t, err)
// Verify file is gone
_, err = os.Stat(fp)
assert.True(t, os.IsNotExist(err))
})
t.Run("no error when file does not exist", func(t *testing.T) {
dir := t.TempDir()
err := DeleteFingerprint(dir)
assert.NoError(t, err)
})
}

View File

@@ -9,25 +9,26 @@ import (
"maps"
"os/exec"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"time"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
)
const (
// Commands
nvidiaSmiCmd string = "nvidia-smi"
rocmSmiCmd string = "rocm-smi"
amdgpuCmd string = "amdgpu" // internal cmd for sysfs collection
tegraStatsCmd string = "tegrastats"
nvidiaSmiCmd string = "nvidia-smi"
rocmSmiCmd string = "rocm-smi"
tegraStatsCmd string = "tegrastats"
nvtopCmd string = "nvtop"
powermetricsCmd string = "powermetrics"
macmonCmd string = "macmon"
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
// Polling intervals
nvidiaSmiInterval string = "4" // in seconds
tegraStatsInterval string = "3700" // in milliseconds
rocmSmiInterval time.Duration = 4300 * time.Millisecond
// Command retry and timeout constants
retryWaitTime time.Duration = 5 * time.Second
maxFailureRetries int = 5
@@ -40,13 +41,7 @@ const (
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
type GPUManager struct {
sync.Mutex
nvidiaSmi bool
rocmSmi bool
amdgpu bool
tegrastats bool
intelGpuStats bool
nvml bool
GpuDataMap map[string]*system.GPUData
GpuDataMap map[string]*system.GPUData
// lastAvgData stores the last calculated averages for each GPU
// Used when a collection happens before new data arrives (Count == 0)
lastAvgData map[string]system.GPUData
@@ -87,6 +82,58 @@ type gpuCollector struct {
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
// collectorSource identifies a selectable GPU collector in GPU_COLLECTOR.
type collectorSource string
const (
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
collectorSourceNVML collectorSource = "nvml"
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
collectorSourceMacmon collectorSource = collectorSource(macmonCmd)
collectorSourcePowermetrics collectorSource = collectorSource(powermetricsCmd)
collectorGroupNvidia string = "nvidia"
collectorGroupIntel string = "intel"
collectorGroupAmd string = "amd"
collectorGroupApple string = "apple"
)
func isValidCollectorSource(source collectorSource) bool {
switch source {
case collectorSourceNVTop,
collectorSourceNVML,
collectorSourceNvidiaSMI,
collectorSourceIntelGpuTop,
collectorSourceAmdSysfs,
collectorSourceRocmSMI,
collectorSourceMacmon,
collectorSourcePowermetrics:
return true
}
return false
}
// gpuCapabilities describes detected GPU tooling and sysfs support on the host.
type gpuCapabilities struct {
hasNvidiaSmi bool
hasRocmSmi bool
hasAmdSysfs bool
hasTegrastats bool
hasIntelGpuTop bool
hasNvtop bool
hasMacmon bool
hasPowermetrics bool
}
type collectorDefinition struct {
group string
available bool
start func(onFailure func()) bool
deprecationWarning string
}
// starts and manages the ongoing collection of GPU data for the specified GPU management utility
func (c *gpuCollector) start() {
for {
@@ -245,8 +292,8 @@ func (gm *GPUManager) parseAmdData(output []byte) bool {
}
gpu := gm.GpuDataMap[id]
gpu.Temperature, _ = strconv.ParseFloat(v.Temperature, 64)
gpu.MemoryUsed = bytesToMegabytes(memoryUsage)
gpu.MemoryTotal = bytesToMegabytes(totalMemory)
gpu.MemoryUsed = utils.BytesToMegabytes(memoryUsage)
gpu.MemoryTotal = utils.BytesToMegabytes(totalMemory)
gpu.Usage += usage
gpu.Power += power
gpu.Count++
@@ -320,16 +367,16 @@ func (gm *GPUManager) calculateGPUAverage(id string, gpu *system.GPUData, cacheK
gpuAvg := *gpu
deltaUsage, deltaPower, deltaPowerPkg := gm.calculateDeltas(gpu, lastSnapshot)
gpuAvg.Power = twoDecimals(deltaPower / float64(deltaCount))
gpuAvg.Power = utils.TwoDecimals(deltaPower / float64(deltaCount))
if gpu.Engines != nil {
// make fresh map for averaged engine metrics to avoid mutating
// the accumulator map stored in gm.GpuDataMap
gpuAvg.Engines = make(map[string]float64, len(gpu.Engines))
gpuAvg.Usage = gm.calculateIntelGPUUsage(&gpuAvg, gpu, lastSnapshot, deltaCount)
gpuAvg.PowerPkg = twoDecimals(deltaPowerPkg / float64(deltaCount))
gpuAvg.PowerPkg = utils.TwoDecimals(deltaPowerPkg / float64(deltaCount))
} else {
gpuAvg.Usage = twoDecimals(deltaUsage / float64(deltaCount))
gpuAvg.Usage = utils.TwoDecimals(deltaUsage / float64(deltaCount))
}
gm.lastAvgData[id] = gpuAvg
@@ -364,17 +411,17 @@ func (gm *GPUManager) calculateIntelGPUUsage(gpuAvg, gpu *system.GPUData, lastSn
} else {
deltaEngine = engine
}
gpuAvg.Engines[name] = twoDecimals(deltaEngine / float64(deltaCount))
gpuAvg.Engines[name] = utils.TwoDecimals(deltaEngine / float64(deltaCount))
maxEngineUsage = max(maxEngineUsage, deltaEngine/float64(deltaCount))
}
return twoDecimals(maxEngineUsage)
return utils.TwoDecimals(maxEngineUsage)
}
// updateInstantaneousValues updates values that should reflect current state, not averages
func (gm *GPUManager) updateInstantaneousValues(gpuAvg *system.GPUData, gpu *system.GPUData) {
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
gpuAvg.Temperature = utils.TwoDecimals(gpu.Temperature)
gpuAvg.MemoryUsed = utils.TwoDecimals(gpu.MemoryUsed)
gpuAvg.MemoryTotal = utils.TwoDecimals(gpu.MemoryTotal)
}
// storeSnapshot saves the current GPU state for this cache key
@@ -392,133 +439,324 @@ func (gm *GPUManager) storeSnapshot(id string, gpu *system.GPUData, cacheKey uin
gm.lastSnapshots[cacheKey][id] = snapshot
}
// detectGPUs checks for the presence of GPU management tools (nvidia-smi, rocm-smi, tegrastats)
// in the system path. It sets the corresponding flags in the GPUManager struct if any of these
// tools are found. If none of the tools are found, it returns an error indicating that no GPU
// management tools are available.
func (gm *GPUManager) detectGPUs() error {
// discoverGpuCapabilities checks for available GPU tooling and sysfs support.
// It only reports capability presence and does not apply policy decisions.
func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
caps := gpuCapabilities{
hasAmdSysfs: gm.hasAmdSysfs(),
}
if _, err := exec.LookPath(nvidiaSmiCmd); err == nil {
gm.nvidiaSmi = true
caps.hasNvidiaSmi = true
}
if _, err := exec.LookPath(rocmSmiCmd); err == nil {
if val, _ := GetEnv("AMD_SYSFS"); val == "true" {
gm.amdgpu = true
} else {
gm.rocmSmi = true
}
} else if gm.hasAmdSysfs() {
gm.amdgpu = true
caps.hasRocmSmi = true
}
if _, err := exec.LookPath(tegraStatsCmd); err == nil {
gm.tegrastats = true
gm.nvidiaSmi = false
caps.hasTegrastats = true
}
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
gm.intelGpuStats = true
caps.hasIntelGpuTop = true
}
if gm.nvidiaSmi || gm.rocmSmi || gm.amdgpu || gm.tegrastats || gm.intelGpuStats || gm.nvml {
return nil
if _, err := exec.LookPath(nvtopCmd); err == nil {
caps.hasNvtop = true
}
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or intel_gpu_top")
if runtime.GOOS == "darwin" {
if _, err := exec.LookPath(macmonCmd); err == nil {
caps.hasMacmon = true
}
if _, err := exec.LookPath(powermetricsCmd); err == nil {
caps.hasPowermetrics = true
}
}
return caps
}
// startCollector starts the appropriate GPU data collector based on the command
func (gm *GPUManager) startCollector(command string) {
collector := gpuCollector{
name: command,
bufSize: 10 * 1024,
}
switch command {
case intelGpuStatsCmd:
go func() {
failures := 0
for {
if err := gm.collectIntelStats(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
time.Sleep(retryWaitTime)
continue
func hasAnyGpuCollector(caps gpuCapabilities) bool {
return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop || caps.hasMacmon || caps.hasPowermetrics
}
func (gm *GPUManager) startIntelCollector() {
go func() {
failures := 0
for {
if err := gm.collectIntelStats(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
time.Sleep(retryWaitTime)
continue
}
}()
case nvidiaSmiCmd:
collector.cmdArgs = []string{
"-l", nvidiaSmiInterval,
}
}()
}
func (gm *GPUManager) startNvidiaSmiCollector(intervalSeconds string) {
collector := gpuCollector{
name: nvidiaSmiCmd,
bufSize: 10 * 1024,
cmdArgs: []string{
"-l", intervalSeconds,
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
"--format=csv,noheader,nounits",
}
collector.parse = gm.parseNvidiaData
go collector.start()
case tegraStatsCmd:
collector.cmdArgs = []string{"--interval", tegraStatsInterval}
collector.parse = gm.getJetsonParser()
go collector.start()
case amdgpuCmd:
go func() {
if err := gm.collectAmdStats(); err != nil {
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
}
}()
case rocmSmiCmd:
collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
collector.parse = gm.parseAmdData
go func() {
failures := 0
for {
if err := collector.collect(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
}
time.Sleep(rocmSmiInterval)
}
}()
},
parse: gm.parseNvidiaData,
}
go collector.start()
}
func (gm *GPUManager) startTegraStatsCollector(intervalMilliseconds string) {
collector := gpuCollector{
name: tegraStatsCmd,
bufSize: 10 * 1024,
cmdArgs: []string{"--interval", intervalMilliseconds},
parse: gm.getJetsonParser(),
}
go collector.start()
}
func (gm *GPUManager) startRocmSmiCollector(pollInterval time.Duration) {
collector := gpuCollector{
name: rocmSmiCmd,
bufSize: 10 * 1024,
cmdArgs: []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"},
parse: gm.parseAmdData,
}
go func() {
failures := 0
for {
if err := collector.collect(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
}
time.Sleep(pollInterval)
}
}()
}
func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSource]collectorDefinition {
return map[collectorSource]collectorDefinition{
collectorSourceNVML: {
group: collectorGroupNvidia,
available: caps.hasNvidiaSmi,
start: func(_ func()) bool {
return gm.startNvmlCollector()
},
},
collectorSourceNvidiaSMI: {
group: collectorGroupNvidia,
available: caps.hasNvidiaSmi,
start: func(_ func()) bool {
gm.startNvidiaSmiCollector("4") // seconds
return true
},
},
collectorSourceIntelGpuTop: {
group: collectorGroupIntel,
available: caps.hasIntelGpuTop,
start: func(_ func()) bool {
gm.startIntelCollector()
return true
},
},
collectorSourceAmdSysfs: {
group: collectorGroupAmd,
available: caps.hasAmdSysfs,
start: func(_ func()) bool {
return gm.startAmdSysfsCollector()
},
},
collectorSourceRocmSMI: {
group: collectorGroupAmd,
available: caps.hasRocmSmi,
deprecationWarning: "rocm-smi is deprecated and may be removed in a future release",
start: func(_ func()) bool {
gm.startRocmSmiCollector(4300 * time.Millisecond)
return true
},
},
collectorSourceNVTop: {
available: caps.hasNvtop,
start: func(onFailure func()) bool {
gm.startNvtopCollector("30", onFailure) // tens of milliseconds
return true
},
},
collectorSourceMacmon: {
group: collectorGroupApple,
available: caps.hasMacmon,
start: func(_ func()) bool {
gm.startMacmonCollector()
return true
},
},
collectorSourcePowermetrics: {
group: collectorGroupApple,
available: caps.hasPowermetrics,
start: func(_ func()) bool {
gm.startPowermetricsCollector()
return true
},
},
}
}
// parseCollectorPriority parses GPU_COLLECTOR and returns valid ordered entries.
func parseCollectorPriority(value string) []collectorSource {
parts := strings.Split(value, ",")
priorities := make([]collectorSource, 0, len(parts))
for _, raw := range parts {
name := collectorSource(strings.TrimSpace(strings.ToLower(raw)))
if !isValidCollectorSource(name) {
if name != "" {
slog.Warn("Ignoring unknown GPU collector", "collector", name)
}
continue
}
priorities = append(priorities, name)
}
return priorities
}
// startNvmlCollector initializes NVML and starts its polling loop.
func (gm *GPUManager) startNvmlCollector() bool {
collector := &nvmlCollector{gm: gm}
if err := collector.init(); err != nil {
slog.Warn("Failed to initialize NVML", "err", err)
return false
}
go collector.start()
return true
}
// startAmdSysfsCollector starts AMD GPU collection via sysfs.
func (gm *GPUManager) startAmdSysfsCollector() bool {
go func() {
if err := gm.collectAmdStats(); err != nil {
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
}
}()
return true
}
// startCollectorsByPriority starts collectors in order with one source per vendor group.
func (gm *GPUManager) startCollectorsByPriority(priorities []collectorSource, caps gpuCapabilities) int {
definitions := gm.collectorDefinitions(caps)
selectedGroups := make(map[string]bool, 3)
started := 0
for i, source := range priorities {
definition, ok := definitions[source]
if !ok || !definition.available {
continue
}
// nvtop is not a vendor-specific collector, so should only be used if no other collectors are selected or it is first in GPU_COLLECTOR.
if source == collectorSourceNVTop {
if len(selectedGroups) > 0 {
slog.Warn("Skipping nvtop because other collectors are selected")
continue
}
// if nvtop fails, fall back to remaining collectors.
remaining := append([]collectorSource(nil), priorities[i+1:]...)
if definition.start(func() {
gm.startCollectorsByPriority(remaining, caps)
}) {
started++
return started
}
}
group := definition.group
if group == "" || selectedGroups[group] {
continue
}
if definition.deprecationWarning != "" {
slog.Warn(definition.deprecationWarning)
}
if definition.start(nil) {
selectedGroups[group] = true
started++
}
}
return started
}
// resolveLegacyCollectorPriority builds the default collector order when GPU_COLLECTOR is unset.
func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []collectorSource {
priorities := make([]collectorSource, 0, 4)
if caps.hasNvidiaSmi && !caps.hasTegrastats {
if nvml, _ := utils.GetEnv("NVML"); nvml == "true" {
priorities = append(priorities, collectorSourceNVML, collectorSourceNvidiaSMI)
} else {
priorities = append(priorities, collectorSourceNvidiaSMI)
}
}
if caps.hasRocmSmi {
if val, _ := utils.GetEnv("AMD_SYSFS"); val == "true" {
priorities = append(priorities, collectorSourceAmdSysfs)
} else {
priorities = append(priorities, collectorSourceRocmSMI)
}
} else if caps.hasAmdSysfs {
priorities = append(priorities, collectorSourceAmdSysfs)
}
if caps.hasIntelGpuTop {
priorities = append(priorities, collectorSourceIntelGpuTop)
}
// Apple collectors are currently opt-in only for testing.
// Enable them with GPU_COLLECTOR=macmon or GPU_COLLECTOR=powermetrics.
// TODO: uncomment below when Apple collectors are confirmed to be working.
//
// Prefer macmon on macOS (no sudo). Fall back to powermetrics if present.
// if caps.hasMacmon {
// priorities = append(priorities, collectorSourceMacmon)
// } else if caps.hasPowermetrics {
// priorities = append(priorities, collectorSourcePowermetrics)
// }
// Keep nvtop as a last resort only when no vendor collector exists.
if len(priorities) == 0 && caps.hasNvtop {
priorities = append(priorities, collectorSourceNVTop)
}
return priorities
}
// NewGPUManager creates and initializes a new GPUManager
func NewGPUManager() (*GPUManager, error) {
if skipGPU, _ := GetEnv("SKIP_GPU"); skipGPU == "true" {
if skipGPU, _ := utils.GetEnv("SKIP_GPU"); skipGPU == "true" {
return nil, nil
}
var gm GPUManager
if err := gm.detectGPUs(); err != nil {
return nil, err
caps := gm.discoverGpuCapabilities()
if !hasAnyGpuCollector(caps) {
return nil, fmt.Errorf(noGPUFoundMsg)
}
gm.GpuDataMap = make(map[string]*system.GPUData)
if gm.nvidiaSmi {
if nvml, _ := GetEnv("NVML"); nvml == "true" {
gm.nvml = true
gm.nvidiaSmi = false
collector := &nvmlCollector{gm: &gm}
if err := collector.init(); err == nil {
go collector.start()
} else {
slog.Warn("Failed to initialize NVML, falling back to nvidia-smi", "err", err)
gm.nvidiaSmi = true
gm.startCollector(nvidiaSmiCmd)
}
} else {
gm.startCollector(nvidiaSmiCmd)
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
if caps.hasTegrastats {
gm.startTegraStatsCollector("3700")
return &gm, nil
}
// if GPU_COLLECTOR is set, start user-defined collectors.
if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
priorities := parseCollectorPriority(collectorConfig)
if gm.startCollectorsByPriority(priorities, caps) == 0 {
return nil, fmt.Errorf("no configured GPU collectors are available")
}
return &gm, nil
}
if gm.rocmSmi {
gm.startCollector(rocmSmiCmd)
}
if gm.amdgpu {
gm.startCollector(amdgpuCmd)
}
if gm.tegrastats {
gm.startCollector(tegraStatsCmd)
}
if gm.intelGpuStats {
gm.startCollector(intelGpuStatsCmd)
// auto-detect and start collectors when GPU_COLLECTOR is unset.
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
return nil, fmt.Errorf(noGPUFoundMsg)
}
return &gm, nil

View File

@@ -3,6 +3,7 @@
package agent
import (
"bufio"
"fmt"
"log/slog"
"os"
@@ -12,9 +13,19 @@ import (
"sync"
"time"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
)
var amdgpuNameCache = struct {
sync.RWMutex
hits map[string]string
misses map[string]struct{}
}{
hits: make(map[string]string),
misses: make(map[string]struct{}),
}
// hasAmdSysfs returns true if any AMD GPU sysfs nodes are found
func (gm *GPUManager) hasAmdSysfs() bool {
cards, err := filepath.Glob("/sys/class/drm/card*/device/vendor")
@@ -22,8 +33,8 @@ func (gm *GPUManager) hasAmdSysfs() bool {
return false
}
for _, vendorPath := range cards {
vendor, err := os.ReadFile(vendorPath)
if err == nil && strings.TrimSpace(string(vendor)) == "0x1002" {
vendor, err := utils.ReadStringFileLimited(vendorPath, 64)
if err == nil && vendor == "0x1002" {
return true
}
}
@@ -32,6 +43,7 @@ func (gm *GPUManager) hasAmdSysfs() bool {
// collectAmdStats collects AMD GPU metrics directly from sysfs to avoid the overhead of rocm-smi
func (gm *GPUManager) collectAmdStats() error {
sysfsPollInterval := 3000 * time.Millisecond
cards, err := filepath.Glob("/sys/class/drm/card*")
if err != nil {
return err
@@ -70,17 +82,17 @@ func (gm *GPUManager) collectAmdStats() error {
continue
}
failures = 0
time.Sleep(rocmSmiInterval)
time.Sleep(sysfsPollInterval)
}
}
// isAmdGpu checks whether a DRM card path belongs to AMD vendor ID 0x1002.
func isAmdGpu(cardPath string) bool {
vendorPath := filepath.Join(cardPath, "device/vendor")
vendor, err := os.ReadFile(vendorPath)
vendor, err := utils.ReadStringFileLimited(filepath.Join(cardPath, "device/vendor"), 64)
if err != nil {
return false
}
return strings.TrimSpace(string(vendor)) == "0x1002"
return vendor == "0x1002"
}
// updateAmdGpuData reads GPU metrics from sysfs and updates the GPU data map.
@@ -93,6 +105,13 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
usage, usageErr := readSysfsFloat(filepath.Join(devicePath, "gpu_busy_percent"))
memUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
memTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
// if gtt is present, add it to the memory used and total (https://github.com/henrygd/beszel/issues/1569#issuecomment-3837640484)
if gttUsed, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_used")); err == nil && gttUsed > 0 {
if gttTotal, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_total")); err == nil {
memUsed += gttUsed
memTotal += gttTotal
}
}
var temp, power float64
hwmons, _ := filepath.Glob(filepath.Join(devicePath, "hwmon/hwmon*"))
@@ -125,20 +144,128 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
if usageErr == nil {
gpu.Usage += usage
}
gpu.MemoryUsed = bytesToMegabytes(memUsed)
gpu.MemoryTotal = bytesToMegabytes(memTotal)
gpu.MemoryUsed = utils.BytesToMegabytes(memUsed)
gpu.MemoryTotal = utils.BytesToMegabytes(memTotal)
gpu.Temperature = temp
gpu.Power += power
gpu.Count++
return true
}
// readSysfsFloat reads and parses a numeric value from a sysfs file.
func readSysfsFloat(path string) (float64, error) {
val, err := os.ReadFile(path)
val, err := utils.ReadStringFileLimited(path, 64)
if err != nil {
return 0, err
}
return strconv.ParseFloat(strings.TrimSpace(string(val)), 64)
return strconv.ParseFloat(val, 64)
}
// normalizeHexID normalizes hex IDs by trimming spaces, lowercasing, and dropping 0x.
func normalizeHexID(id string) string {
return strings.TrimPrefix(strings.ToLower(strings.TrimSpace(id)), "0x")
}
// cacheKeyForAmdgpu builds the cache key for a device and optional revision.
func cacheKeyForAmdgpu(deviceID, revisionID string) string {
if revisionID != "" {
return deviceID + ":" + revisionID
}
return deviceID
}
// lookupAmdgpuNameInFile resolves an AMDGPU name from amdgpu.ids by device/revision.
func lookupAmdgpuNameInFile(deviceID, revisionID, filePath string) (name string, exact bool, found bool) {
file, err := os.Open(filePath)
if err != nil {
return "", false, false
}
defer file.Close()
var byDevice string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" || strings.HasPrefix(line, "#") {
continue
}
parts := strings.SplitN(line, ",", 3)
if len(parts) != 3 {
continue
}
dev := normalizeHexID(parts[0])
rev := normalizeHexID(parts[1])
productName := strings.TrimSpace(parts[2])
if dev == "" || productName == "" || dev != deviceID {
continue
}
if byDevice == "" {
byDevice = productName
}
if revisionID != "" && rev == revisionID {
return productName, true, true
}
}
if byDevice != "" {
return byDevice, false, true
}
return "", false, false
}
// getCachedAmdgpuName returns cached hit/miss status for the given device/revision.
func getCachedAmdgpuName(deviceID, revisionID string) (name string, found bool, done bool) {
// Build the list of cache keys to check. We always look up the exact device+revision key.
// When revisionID is set, we also look up deviceID alone, since the cache may store a
// device-only fallback when we couldn't resolve the exact revision.
keys := []string{cacheKeyForAmdgpu(deviceID, revisionID)}
if revisionID != "" {
keys = append(keys, deviceID)
}
knownMisses := 0
amdgpuNameCache.RLock()
defer amdgpuNameCache.RUnlock()
for _, key := range keys {
if name, ok := amdgpuNameCache.hits[key]; ok {
return name, true, true
}
if _, ok := amdgpuNameCache.misses[key]; ok {
knownMisses++
}
}
// done=true means "don't bother doing slow lookup": we either found a name (above) or
// every key we checked was already a known miss, so we've tried before and failed.
return "", false, knownMisses == len(keys)
}
// normalizeAmdgpuName trims standard suffixes from AMDGPU product names.
func normalizeAmdgpuName(name string) string {
for _, suffix := range []string{" Graphics", " Series"} {
name = strings.TrimSuffix(name, suffix)
}
return name
}
// cacheAmdgpuName stores a resolved AMDGPU name in the lookup cache.
func cacheAmdgpuName(deviceID, revisionID, name string, exact bool) {
name = normalizeAmdgpuName(name)
amdgpuNameCache.Lock()
defer amdgpuNameCache.Unlock()
if exact && revisionID != "" {
amdgpuNameCache.hits[cacheKeyForAmdgpu(deviceID, revisionID)] = name
}
amdgpuNameCache.hits[deviceID] = name
}
// cacheMissingAmdgpuName records unresolved device/revision lookups.
func cacheMissingAmdgpuName(deviceID, revisionID string) {
amdgpuNameCache.Lock()
defer amdgpuNameCache.Unlock()
amdgpuNameCache.misses[deviceID] = struct{}{}
if revisionID != "" {
amdgpuNameCache.misses[cacheKeyForAmdgpu(deviceID, revisionID)] = struct{}{}
}
}
// getAmdGpuName attempts to get a descriptive GPU name.
@@ -146,39 +273,30 @@ func readSysfsFloat(path string) (float64, error) {
// Falls back to showing the raw device ID if not found in the lookup table.
func getAmdGpuName(devicePath string) string {
// Try product_name first (works for some enterprise GPUs)
if prod, err := os.ReadFile(filepath.Join(devicePath, "product_name")); err == nil {
return strings.TrimSpace(string(prod))
if prod, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "product_name"), 128); err == nil {
return prod
}
// Read PCI device ID and look it up
if deviceID, err := os.ReadFile(filepath.Join(devicePath, "device")); err == nil {
id := strings.TrimPrefix(strings.ToLower(strings.TrimSpace(string(deviceID))), "0x")
if name, ok := getRadeonNames()[id]; ok {
return fmt.Sprintf("Radeon %s", name)
if deviceID, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "device"), 64); err == nil {
id := normalizeHexID(deviceID)
revision := ""
if rev, revErr := utils.ReadStringFileLimited(filepath.Join(devicePath, "revision"), 64); revErr == nil {
revision = normalizeHexID(rev)
}
if name, found, done := getCachedAmdgpuName(id, revision); found {
return name
} else if !done {
if name, exact, ok := lookupAmdgpuNameInFile(id, revision, "/usr/share/libdrm/amdgpu.ids"); ok {
cacheAmdgpuName(id, revision, name, exact)
return normalizeAmdgpuName(name)
}
cacheMissingAmdgpuName(id, revision)
}
return fmt.Sprintf("AMD GPU (%s)", id)
}
return "AMD GPU"
}
// getRadeonNames returns the AMD GPU name lookup table
// Device IDs from https://pci-ids.ucw.cz/read/PC/1002
var getRadeonNames = sync.OnceValue(func() map[string]string {
return map[string]string{
"7550": "RX 9070",
"7590": "RX 9060 XT",
"7551": "AI PRO R9700",
"744c": "RX 7900",
"1681": "680M",
"7448": "PRO W7900",
"745e": "PRO W7800",
"7470": "PRO W7700",
"73e3": "PRO W6600",
"7422": "PRO W6400",
"7341": "PRO W5500",
}
})

265
agent/gpu_amd_linux_test.go Normal file
View File

@@ -0,0 +1,265 @@
//go:build linux
package agent
import (
"os"
"path/filepath"
"testing"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestNormalizeHexID(t *testing.T) {
tests := []struct {
in string
want string
}{
{"0x1002", "1002"},
{"C2", "c2"},
{" 15BF ", "15bf"},
{"0x15bf", "15bf"},
{"", ""},
}
for _, tt := range tests {
subName := tt.in
if subName == "" {
subName = "empty_string"
}
t.Run(subName, func(t *testing.T) {
got := normalizeHexID(tt.in)
assert.Equal(t, tt.want, got)
})
}
}
func TestCacheKeyForAmdgpu(t *testing.T) {
tests := []struct {
deviceID string
revisionID string
want string
}{
{"1114", "c2", "1114:c2"},
{"15bf", "", "15bf"},
{"1506", "c1", "1506:c1"},
}
for _, tt := range tests {
got := cacheKeyForAmdgpu(tt.deviceID, tt.revisionID)
assert.Equal(t, tt.want, got)
}
}
func TestReadSysfsFloat(t *testing.T) {
dir := t.TempDir()
validPath := filepath.Join(dir, "val")
require.NoError(t, os.WriteFile(validPath, []byte(" 42.5 \n"), 0o644))
got, err := readSysfsFloat(validPath)
require.NoError(t, err)
assert.Equal(t, 42.5, got)
// Integer and scientific
sciPath := filepath.Join(dir, "sci")
require.NoError(t, os.WriteFile(sciPath, []byte("1e2"), 0o644))
got, err = readSysfsFloat(sciPath)
require.NoError(t, err)
assert.Equal(t, 100.0, got)
// Missing file
_, err = readSysfsFloat(filepath.Join(dir, "missing"))
require.Error(t, err)
// Invalid content
badPath := filepath.Join(dir, "bad")
require.NoError(t, os.WriteFile(badPath, []byte("not a number"), 0o644))
_, err = readSysfsFloat(badPath)
require.Error(t, err)
}
func TestIsAmdGpu(t *testing.T) {
dir := t.TempDir()
deviceDir := filepath.Join(dir, "device")
require.NoError(t, os.MkdirAll(deviceDir, 0o755))
// AMD vendor 0x1002 -> true
require.NoError(t, os.WriteFile(filepath.Join(deviceDir, "vendor"), []byte("0x1002\n"), 0o644))
assert.True(t, isAmdGpu(dir), "vendor 0x1002 should be AMD")
// Non-AMD vendor -> false
require.NoError(t, os.WriteFile(filepath.Join(deviceDir, "vendor"), []byte("0x10de\n"), 0o644))
assert.False(t, isAmdGpu(dir), "vendor 0x10de should not be AMD")
// Missing vendor file -> false
require.NoError(t, os.Remove(filepath.Join(deviceDir, "vendor")))
assert.False(t, isAmdGpu(dir), "missing vendor file should be false")
}
func TestAmdgpuNameCacheRoundTrip(t *testing.T) {
// Cache a name and retrieve it (unique key to avoid affecting other tests)
deviceID, revisionID := "cachedev99", "00"
cacheAmdgpuName(deviceID, revisionID, "AMD Test GPU 99 Graphics", true)
name, found, done := getCachedAmdgpuName(deviceID, revisionID)
assert.True(t, found)
assert.True(t, done)
assert.Equal(t, "AMD Test GPU 99", name)
// Device-only key also stored
name2, found2, _ := getCachedAmdgpuName(deviceID, "")
assert.True(t, found2)
assert.Equal(t, "AMD Test GPU 99", name2)
// Cache a miss
cacheMissingAmdgpuName("missedev99", "ab")
_, found3, done3 := getCachedAmdgpuName("missedev99", "ab")
assert.False(t, found3)
assert.True(t, done3, "done should be true so caller skips file lookup")
}
func TestUpdateAmdGpuDataWithFakeSysfs(t *testing.T) {
tests := []struct {
name string
writeGTT bool
wantMemoryUsed float64
wantMemoryTotal float64
}{
{
name: "sums vram and gtt when gtt is present",
writeGTT: true,
wantMemoryUsed: utils.BytesToMegabytes(1073741824 + 536870912),
wantMemoryTotal: utils.BytesToMegabytes(2147483648 + 4294967296),
},
{
name: "falls back to vram when gtt is missing",
writeGTT: false,
wantMemoryUsed: utils.BytesToMegabytes(1073741824),
wantMemoryTotal: utils.BytesToMegabytes(2147483648),
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
dir := t.TempDir()
cardPath := filepath.Join(dir, "card0")
devicePath := filepath.Join(cardPath, "device")
hwmonPath := filepath.Join(devicePath, "hwmon", "hwmon0")
require.NoError(t, os.MkdirAll(hwmonPath, 0o755))
write := func(name, content string) {
require.NoError(t, os.WriteFile(filepath.Join(devicePath, name), []byte(content), 0o644))
}
write("vendor", "0x1002")
write("device", "0x1506")
write("revision", "0xc1")
write("gpu_busy_percent", "25")
write("mem_info_vram_used", "1073741824")
write("mem_info_vram_total", "2147483648")
if tt.writeGTT {
write("mem_info_gtt_used", "536870912")
write("mem_info_gtt_total", "4294967296")
}
require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "temp1_input"), []byte("45000"), 0o644))
require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "power1_input"), []byte("20000000"), 0o644))
// Pre-cache name so getAmdGpuName returns a known value (it uses system amdgpu.ids path)
cacheAmdgpuName("1506", "c1", "AMD Radeon 610M Graphics", true)
gm := &GPUManager{GpuDataMap: make(map[string]*system.GPUData)}
ok := gm.updateAmdGpuData(cardPath)
require.True(t, ok)
gpu, ok := gm.GpuDataMap["card0"]
require.True(t, ok)
assert.Equal(t, "AMD Radeon 610M", gpu.Name)
assert.Equal(t, 25.0, gpu.Usage)
assert.Equal(t, tt.wantMemoryUsed, gpu.MemoryUsed)
assert.Equal(t, tt.wantMemoryTotal, gpu.MemoryTotal)
assert.Equal(t, 45.0, gpu.Temperature)
assert.Equal(t, 20.0, gpu.Power)
assert.Equal(t, 1.0, gpu.Count)
})
}
}
func TestLookupAmdgpuNameInFile(t *testing.T) {
idsPath := filepath.Join("test-data", "amdgpu.ids")
tests := []struct {
name string
deviceID string
revisionID string
wantName string
wantExact bool
wantFound bool
}{
{
name: "exact device and revision match",
deviceID: "1114",
revisionID: "c2",
wantName: "AMD Radeon 860M Graphics",
wantExact: true,
wantFound: true,
},
{
name: "exact match 15BF revision 01 returns 760M",
deviceID: "15bf",
revisionID: "01",
wantName: "AMD Radeon 760M Graphics",
wantExact: true,
wantFound: true,
},
{
name: "exact match 15BF revision 00 returns 780M",
deviceID: "15bf",
revisionID: "00",
wantName: "AMD Radeon 780M Graphics",
wantExact: true,
wantFound: true,
},
{
name: "device-only match returns first entry for device",
deviceID: "1506",
revisionID: "",
wantName: "AMD Radeon 610M",
wantExact: false,
wantFound: true,
},
{
name: "unknown device not found",
deviceID: "dead",
revisionID: "00",
wantName: "",
wantExact: false,
wantFound: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gotName, gotExact, gotFound := lookupAmdgpuNameInFile(tt.deviceID, tt.revisionID, idsPath)
assert.Equal(t, tt.wantName, gotName, "name")
assert.Equal(t, tt.wantExact, gotExact, "exact")
assert.Equal(t, tt.wantFound, gotFound, "found")
})
}
}
func TestGetAmdGpuNameFromIdsFile(t *testing.T) {
// Test that getAmdGpuName resolves a name when we can't inject the ids path.
// We only verify behavior when product_name is missing and device/revision
// would be read from sysfs; the actual lookup uses /usr/share/libdrm/amdgpu.ids.
// So this test focuses on normalizeAmdgpuName and that lookupAmdgpuNameInFile
// returns the expected name for our test-data file.
idsPath := filepath.Join("test-data", "amdgpu.ids")
name, exact, found := lookupAmdgpuNameInFile("1435", "ae", idsPath)
require.True(t, found)
require.True(t, exact)
assert.Equal(t, "AMD Custom GPU 0932", name)
assert.Equal(t, "AMD Custom GPU 0932", normalizeAmdgpuName(name))
// " Graphics" suffix is trimmed by normalizeAmdgpuName
name2 := "AMD Radeon 860M Graphics"
assert.Equal(t, "AMD Radeon 860M", normalizeAmdgpuName(name2))
}

252
agent/gpu_darwin.go Normal file
View File

@@ -0,0 +1,252 @@
//go:build darwin
package agent
import (
"bufio"
"bytes"
"encoding/json"
"io"
"log/slog"
"os/exec"
"strconv"
"strings"
"time"
"github.com/henrygd/beszel/internal/entities/system"
)
const (
// powermetricsSampleIntervalMs is the sampling interval passed to powermetrics (-i).
powermetricsSampleIntervalMs = 500
// powermetricsPollInterval is how often we run powermetrics to collect a new sample.
powermetricsPollInterval = 2 * time.Second
// macmonIntervalMs is the sampling interval passed to macmon pipe (-i), in milliseconds.
macmonIntervalMs = 2500
)
const appleGPUID = "0"
// startPowermetricsCollector runs powermetrics --samplers gpu_power in a loop and updates
// GPU usage and power. Requires root (sudo) on macOS. A single logical GPU is reported as id "0".
func (gm *GPUManager) startPowermetricsCollector() {
// Ensure single GPU entry for Apple GPU
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
}
go func() {
failures := 0
for {
if err := gm.collectPowermetrics(); err != nil {
failures++
if failures > maxFailureRetries {
slog.Warn("powermetrics GPU collector failed repeatedly, stopping", "err", err)
break
}
slog.Warn("Error collecting macOS GPU data via powermetrics (may require sudo)", "err", err)
time.Sleep(retryWaitTime)
continue
}
failures = 0
time.Sleep(powermetricsPollInterval)
}
}()
}
// collectPowermetrics runs powermetrics once and parses GPU usage and power from its output.
func (gm *GPUManager) collectPowermetrics() error {
interval := strconv.Itoa(powermetricsSampleIntervalMs)
cmd := exec.Command(powermetricsCmd, "--samplers", "gpu_power", "-i", interval, "-n", "1")
cmd.Stderr = nil
out, err := cmd.Output()
if err != nil {
return err
}
if !gm.parsePowermetricsData(out) {
return errNoValidData
}
return nil
}
// parsePowermetricsData parses powermetrics gpu_power output and updates GpuDataMap["0"].
// Example output:
//
// **** GPU usage ****
// GPU HW active frequency: 444 MHz
// GPU HW active residency: 0.97% (444 MHz: .97% ...
// GPU idle residency: 99.03%
// GPU Power: 4 mW
func (gm *GPUManager) parsePowermetricsData(output []byte) bool {
var idleResidency, powerMW float64
var gotIdle, gotPower bool
scanner := bufio.NewScanner(bytes.NewReader(output))
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "GPU idle residency:") {
// "GPU idle residency: 99.03%"
fields := strings.Fields(strings.TrimPrefix(line, "GPU idle residency:"))
if len(fields) >= 1 {
pct := strings.TrimSuffix(fields[0], "%")
if v, err := strconv.ParseFloat(pct, 64); err == nil {
idleResidency = v
gotIdle = true
}
}
} else if strings.HasPrefix(line, "GPU Power:") {
// "GPU Power: 4 mW"
fields := strings.Fields(strings.TrimPrefix(line, "GPU Power:"))
if len(fields) >= 1 {
if v, err := strconv.ParseFloat(fields[0], 64); err == nil {
powerMW = v
gotPower = true
}
}
}
}
if err := scanner.Err(); err != nil {
return false
}
if !gotIdle && !gotPower {
return false
}
gm.Lock()
defer gm.Unlock()
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
}
gpu := gm.GpuDataMap[appleGPUID]
if gotIdle {
// Usage = 100 - idle residency (e.g. 100 - 99.03 = 0.97%)
gpu.Usage += 100 - idleResidency
}
if gotPower {
// mW -> W
gpu.Power += powerMW / milliwattsInAWatt
}
gpu.Count++
return true
}
// startMacmonCollector runs `macmon pipe` in a loop and parses one JSON object per line.
// This collector does not require sudo. A single logical GPU is reported as id "0".
func (gm *GPUManager) startMacmonCollector() {
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
}
go func() {
failures := 0
for {
if err := gm.collectMacmonPipe(); err != nil {
failures++
if failures > maxFailureRetries {
slog.Warn("macmon GPU collector failed repeatedly, stopping", "err", err)
break
}
slog.Warn("Error collecting macOS GPU data via macmon", "err", err)
time.Sleep(retryWaitTime)
continue
}
failures = 0
// `macmon pipe` is long-running; if it returns, wait a bit before restarting.
time.Sleep(retryWaitTime)
}
}()
}
type macmonTemp struct {
GPUTempAvg float64 `json:"gpu_temp_avg"`
}
type macmonSample struct {
GPUPower float64 `json:"gpu_power"` // watts (macmon reports fractional values)
GPURAMPower float64 `json:"gpu_ram_power"` // watts
GPUUsage []float64 `json:"gpu_usage"` // [freq_mhz, usage] where usage is typically 0..1
Temp macmonTemp `json:"temp"`
}
func (gm *GPUManager) collectMacmonPipe() (err error) {
cmd := exec.Command(macmonCmd, "pipe", "-i", strconv.Itoa(macmonIntervalMs))
// Avoid blocking if macmon writes to stderr.
cmd.Stderr = io.Discard
stdout, err := cmd.StdoutPipe()
if err != nil {
return err
}
if err := cmd.Start(); err != nil {
return err
}
// Ensure we always reap the child to avoid zombies on any return path and
// propagate a non-zero exit code if no other error was set.
defer func() {
_ = stdout.Close()
if cmd.ProcessState == nil || !cmd.ProcessState.Exited() {
_ = cmd.Process.Kill()
}
if waitErr := cmd.Wait(); err == nil && waitErr != nil {
err = waitErr
}
}()
scanner := bufio.NewScanner(stdout)
var hadSample bool
for scanner.Scan() {
line := bytes.TrimSpace(scanner.Bytes())
if len(line) == 0 {
continue
}
if gm.parseMacmonLine(line) {
hadSample = true
}
}
if scanErr := scanner.Err(); scanErr != nil {
return scanErr
}
if !hadSample {
return errNoValidData
}
return nil
}
// parseMacmonLine parses a single macmon JSON line and updates Apple GPU metrics.
func (gm *GPUManager) parseMacmonLine(line []byte) bool {
var sample macmonSample
if err := json.Unmarshal(line, &sample); err != nil {
return false
}
usage := 0.0
if len(sample.GPUUsage) >= 2 {
usage = sample.GPUUsage[1]
// Heuristic: macmon typically reports 0..1; convert to percentage.
if usage <= 1.0 {
usage *= 100
}
}
// Consider the line valid if it contains at least one GPU metric.
if usage == 0 && sample.GPUPower == 0 && sample.Temp.GPUTempAvg == 0 {
return false
}
gm.Lock()
defer gm.Unlock()
gpu, ok := gm.GpuDataMap[appleGPUID]
if !ok {
gpu = &system.GPUData{Name: "Apple GPU"}
gm.GpuDataMap[appleGPUID] = gpu
}
gpu.Temperature = sample.Temp.GPUTempAvg
gpu.Usage += usage
// macmon reports power in watts; include VRAM power if present.
gpu.Power += sample.GPUPower + sample.GPURAMPower
gpu.Count++
return true
}

81
agent/gpu_darwin_test.go Normal file
View File

@@ -0,0 +1,81 @@
//go:build darwin
package agent
import (
"testing"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestParsePowermetricsData(t *testing.T) {
input := `
Machine model: Mac14,10
OS version: 25D125
*** Sampled system activity (Sat Feb 14 00:42:06 2026 -0500) (503.05ms elapsed) ***
**** GPU usage ****
GPU HW active frequency: 444 MHz
GPU HW active residency: 0.97% (444 MHz: .97% 612 MHz: 0% 808 MHz: 0% 968 MHz: 0% 1110 MHz: 0% 1236 MHz: 0% 1338 MHz: 0% 1398 MHz: 0%)
GPU SW requested state: (P1 : 100% P2 : 0% P3 : 0% P4 : 0% P5 : 0% P6 : 0% P7 : 0% P8 : 0%)
GPU idle residency: 99.03%
GPU Power: 4 mW
`
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
valid := gm.parsePowermetricsData([]byte(input))
require.True(t, valid)
g0, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "Apple GPU", g0.Name)
// Usage = 100 - 99.03 = 0.97
assert.InDelta(t, 0.97, g0.Usage, 0.01)
// 4 mW -> 0.004 W
assert.InDelta(t, 0.004, g0.Power, 0.0001)
assert.Equal(t, 1.0, g0.Count)
}
func TestParsePowermetricsDataPartial(t *testing.T) {
// Only power line (e.g. older macOS or different sampler output)
input := `
**** GPU usage ****
GPU Power: 120 mW
`
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
valid := gm.parsePowermetricsData([]byte(input))
require.True(t, valid)
g0, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "Apple GPU", g0.Name)
assert.InDelta(t, 0.12, g0.Power, 0.001)
assert.Equal(t, 1.0, g0.Count)
}
func TestParseMacmonLine(t *testing.T) {
input := `{"all_power":0.6468324661254883,"ane_power":0.0,"cpu_power":0.6359732151031494,"ecpu_usage":[2061,0.1726151406764984],"gpu_power":0.010859241709113121,"gpu_ram_power":0.000965250947047025,"gpu_usage":[503,0.013633215799927711],"memory":{"ram_total":17179869184,"ram_usage":12322914304,"swap_total":0,"swap_usage":0},"pcpu_usage":[1248,0.11792058497667313],"ram_power":0.14885640144348145,"sys_power":10.4955415725708,"temp":{"cpu_temp_avg":23.041261672973633,"gpu_temp_avg":29.44516944885254},"timestamp":"2026-02-17T19:34:27.942556+00:00"}`
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
valid := gm.parseMacmonLine([]byte(input))
require.True(t, valid)
g0, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "Apple GPU", g0.Name)
// macmon reports usage fraction 0..1; expect percent conversion.
assert.InDelta(t, 1.3633, g0.Usage, 0.05)
// power includes gpu_power + gpu_ram_power
assert.InDelta(t, 0.011824, g0.Power, 0.0005)
assert.InDelta(t, 29.445, g0.Temperature, 0.01)
assert.Equal(t, 1.0, g0.Count)
}

View File

@@ -0,0 +1,9 @@
//go:build !darwin
package agent
// startPowermetricsCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
func (gm *GPUManager) startPowermetricsCollector() {}
// startMacmonCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
func (gm *GPUManager) startMacmonCollector() {}

View File

@@ -7,6 +7,7 @@ import (
"strconv"
"strings"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
)
@@ -52,7 +53,7 @@ func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool {
func (gm *GPUManager) collectIntelStats() (err error) {
// Build command arguments, optionally selecting a device via -d
args := []string{"-s", intelGpuStatsInterval, "-l"}
if dev, ok := GetEnv("INTEL_GPU_DEVICE"); ok && dev != "" {
if dev, ok := utils.GetEnv("INTEL_GPU_DEVICE"); ok && dev != "" {
args = append(args, "-d", dev)
}
cmd := exec.Command(intelGpuStatsCmd, args...)

View File

@@ -13,21 +13,3 @@ func (c *nvmlCollector) init() error {
}
func (c *nvmlCollector) start() {}
func (c *nvmlCollector) collect() {}
func openLibrary(name string) (uintptr, error) {
return 0, fmt.Errorf("nvml not supported on this platform")
}
func getNVMLPath() string {
return ""
}
func hasSymbol(lib uintptr, symbol string) bool {
return false
}
func (c *nvmlCollector) isGPUActive(bdf string) bool {
return true
}

160
agent/gpu_nvtop.go Normal file
View File

@@ -0,0 +1,160 @@
package agent
import (
"encoding/json"
"io"
"log/slog"
"os/exec"
"strconv"
"strings"
"time"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
)
type nvtopSnapshot struct {
DeviceName string `json:"device_name"`
Temp *string `json:"temp"`
PowerDraw *string `json:"power_draw"`
GpuUtil *string `json:"gpu_util"`
MemTotal *string `json:"mem_total"`
MemUsed *string `json:"mem_used"`
}
// parseNvtopNumber parses nvtop numeric strings with units (C/W/%).
func parseNvtopNumber(raw string) float64 {
cleaned := strings.TrimSpace(raw)
cleaned = strings.TrimSuffix(cleaned, "C")
cleaned = strings.TrimSuffix(cleaned, "W")
cleaned = strings.TrimSuffix(cleaned, "%")
val, _ := strconv.ParseFloat(cleaned, 64)
return val
}
// parseNvtopData parses a single nvtop JSON snapshot payload.
func (gm *GPUManager) parseNvtopData(output []byte) bool {
var snapshots []nvtopSnapshot
if err := json.Unmarshal(output, &snapshots); err != nil || len(snapshots) == 0 {
return false
}
return gm.updateNvtopSnapshots(snapshots)
}
// updateNvtopSnapshots applies one decoded nvtop snapshot batch to GPU accumulators.
func (gm *GPUManager) updateNvtopSnapshots(snapshots []nvtopSnapshot) bool {
gm.Lock()
defer gm.Unlock()
valid := false
usedIDs := make(map[string]struct{}, len(snapshots))
for i, sample := range snapshots {
if sample.DeviceName == "" {
continue
}
indexID := "n" + strconv.Itoa(i)
id := indexID
// nvtop ordering can change, so prefer reusing an existing slot with matching device name.
if existingByIndex, ok := gm.GpuDataMap[indexID]; ok && existingByIndex.Name != "" && existingByIndex.Name != sample.DeviceName {
for existingID, gpu := range gm.GpuDataMap {
if !strings.HasPrefix(existingID, "n") {
continue
}
if _, taken := usedIDs[existingID]; taken {
continue
}
if gpu.Name == sample.DeviceName {
id = existingID
break
}
}
}
if _, ok := gm.GpuDataMap[id]; !ok {
gm.GpuDataMap[id] = &system.GPUData{Name: sample.DeviceName}
}
gpu := gm.GpuDataMap[id]
gpu.Name = sample.DeviceName
if sample.Temp != nil {
gpu.Temperature = parseNvtopNumber(*sample.Temp)
}
if sample.MemUsed != nil {
gpu.MemoryUsed = utils.BytesToMegabytes(parseNvtopNumber(*sample.MemUsed))
}
if sample.MemTotal != nil {
gpu.MemoryTotal = utils.BytesToMegabytes(parseNvtopNumber(*sample.MemTotal))
}
if sample.GpuUtil != nil {
gpu.Usage += parseNvtopNumber(*sample.GpuUtil)
}
if sample.PowerDraw != nil {
gpu.Power += parseNvtopNumber(*sample.PowerDraw)
}
gpu.Count++
usedIDs[id] = struct{}{}
valid = true
}
return valid
}
// collectNvtopStats runs nvtop loop mode and continuously decodes JSON snapshots.
func (gm *GPUManager) collectNvtopStats(interval string) error {
cmd := exec.Command(nvtopCmd, "-lP", "-d", interval)
stdout, err := cmd.StdoutPipe()
if err != nil {
return err
}
if err := cmd.Start(); err != nil {
return err
}
defer func() {
_ = stdout.Close()
if cmd.ProcessState == nil || !cmd.ProcessState.Exited() {
_ = cmd.Process.Kill()
}
_ = cmd.Wait()
}()
decoder := json.NewDecoder(stdout)
foundValid := false
for {
var snapshots []nvtopSnapshot
if err := decoder.Decode(&snapshots); err != nil {
if err == io.EOF {
if foundValid {
return nil
}
return errNoValidData
}
return err
}
if gm.updateNvtopSnapshots(snapshots) {
foundValid = true
}
}
}
// startNvtopCollector starts nvtop collection with retry or fallback callback handling.
func (gm *GPUManager) startNvtopCollector(interval string, onFailure func()) {
go func() {
failures := 0
for {
if err := gm.collectNvtopStats(interval); err != nil {
if onFailure != nil {
slog.Warn("Error collecting GPU data via nvtop", "err", err)
onFailure()
return
}
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting GPU data via nvtop", "err", err)
time.Sleep(retryWaitTime)
continue
}
}
}()
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent
@@ -11,6 +10,7 @@ import (
"testing"
"time"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/stretchr/testify/assert"
@@ -250,6 +250,100 @@ func TestParseAmdData(t *testing.T) {
}
}
func TestParseNvtopData(t *testing.T) {
input, err := os.ReadFile("test-data/nvtop.json")
require.NoError(t, err)
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
valid := gm.parseNvtopData(input)
require.True(t, valid)
g0, ok := gm.GpuDataMap["n0"]
require.True(t, ok)
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", g0.Name)
assert.Equal(t, 48.0, g0.Temperature)
assert.Equal(t, 5.0, g0.Usage)
assert.Equal(t, 13.0, g0.Power)
assert.Equal(t, utils.BytesToMegabytes(349372416), g0.MemoryUsed)
assert.Equal(t, utils.BytesToMegabytes(4294967296), g0.MemoryTotal)
assert.Equal(t, 1.0, g0.Count)
g1, ok := gm.GpuDataMap["n1"]
require.True(t, ok)
assert.Equal(t, "AMD Radeon 680M", g1.Name)
assert.Equal(t, 48.0, g1.Temperature)
assert.Equal(t, 12.0, g1.Usage)
assert.Equal(t, 9.0, g1.Power)
assert.Equal(t, utils.BytesToMegabytes(1213784064), g1.MemoryUsed)
assert.Equal(t, utils.BytesToMegabytes(16929173504), g1.MemoryTotal)
assert.Equal(t, 1.0, g1.Count)
}
func TestUpdateNvtopSnapshotsKeepsDeviceAssociationWhenOrderChanges(t *testing.T) {
strPtr := func(s string) *string { return &s }
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
firstBatch := []nvtopSnapshot{
{
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
GpuUtil: strPtr("20%"),
PowerDraw: strPtr("10W"),
},
{
DeviceName: "AMD Radeon 680M",
GpuUtil: strPtr("30%"),
PowerDraw: strPtr("20W"),
},
}
secondBatchSwapped := []nvtopSnapshot{
{
DeviceName: "AMD Radeon 680M",
GpuUtil: strPtr("40%"),
PowerDraw: strPtr("25W"),
},
{
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
GpuUtil: strPtr("50%"),
PowerDraw: strPtr("15W"),
},
}
require.True(t, gm.updateNvtopSnapshots(firstBatch))
require.True(t, gm.updateNvtopSnapshots(secondBatchSwapped))
nvidia := gm.GpuDataMap["n0"]
require.NotNil(t, nvidia)
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", nvidia.Name)
assert.Equal(t, 70.0, nvidia.Usage)
assert.Equal(t, 25.0, nvidia.Power)
assert.Equal(t, 2.0, nvidia.Count)
amd := gm.GpuDataMap["n1"]
require.NotNil(t, amd)
assert.Equal(t, "AMD Radeon 680M", amd.Name)
assert.Equal(t, 70.0, amd.Usage)
assert.Equal(t, 45.0, amd.Power)
assert.Equal(t, 2.0, amd.Count)
}
func TestParseCollectorPriority(t *testing.T) {
got := parseCollectorPriority(" nvml, nvidia-smi, intel_gpu_top, amd_sysfs, nvtop, rocm-smi, bad ")
want := []collectorSource{
collectorSourceNVML,
collectorSourceNvidiaSMI,
collectorSourceIntelGpuTop,
collectorSourceAmdSysfs,
collectorSourceNVTop,
collectorSourceRocmSMI,
}
assert.Equal(t, want, got)
}
func TestParseJetsonData(t *testing.T) {
tests := []struct {
name string
@@ -987,36 +1081,35 @@ func TestCalculateGPUAverage(t *testing.T) {
})
}
func TestDetectGPUs(t *testing.T) {
func TestGPUCapabilitiesAndLegacyPriority(t *testing.T) {
// Save original PATH
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
// Set up temp dir with the commands
tempDir := t.TempDir()
os.Setenv("PATH", tempDir)
hasAmdSysfs := (&GPUManager{}).hasAmdSysfs()
tests := []struct {
name string
setupCommands func() error
setupCommands func(string) error
wantNvidiaSmi bool
wantRocmSmi bool
wantTegrastats bool
wantNvtop bool
wantErr bool
}{
{
name: "nvidia-smi not available",
setupCommands: func() error {
setupCommands: func(_ string) error {
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: false,
wantNvtop: false,
wantErr: true,
},
{
name: "nvidia-smi available",
setupCommands: func() error {
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "nvidia-smi")
script := `#!/bin/sh
echo "test"`
@@ -1028,29 +1121,14 @@ echo "test"`
wantNvidiaSmi: true,
wantTegrastats: false,
wantRocmSmi: false,
wantNvtop: false,
wantErr: false,
},
{
name: "rocm-smi available",
setupCommands: func() error {
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "rocm-smi")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: true,
wantRocmSmi: true,
wantTegrastats: false,
wantErr: false,
},
{
name: "tegrastats available",
setupCommands: func() error {
path := filepath.Join(tempDir, "tegrastats")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
@@ -1059,12 +1137,47 @@ echo "test"`
},
wantNvidiaSmi: false,
wantRocmSmi: true,
wantTegrastats: false,
wantNvtop: false,
wantErr: false,
},
{
name: "tegrastats available",
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "tegrastats")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: true,
wantNvtop: false,
wantErr: false,
},
{
name: "nvtop available",
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "nvtop")
script := `#!/bin/sh
echo "[]"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: false,
wantNvtop: true,
wantErr: false,
},
{
name: "no gpu tools available",
setupCommands: func() error {
setupCommands: func(_ string) error {
os.Setenv("PATH", "")
return nil
},
@@ -1074,29 +1187,53 @@ echo "test"`
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if err := tt.setupCommands(); err != nil {
tempDir := t.TempDir()
os.Setenv("PATH", tempDir)
if err := tt.setupCommands(tempDir); err != nil {
t.Fatal(err)
}
gm := &GPUManager{}
err := gm.detectGPUs()
caps := gm.discoverGpuCapabilities()
var err error
if !hasAnyGpuCollector(caps) {
err = fmt.Errorf(noGPUFoundMsg)
}
priorities := gm.resolveLegacyCollectorPriority(caps)
hasPriority := func(source collectorSource) bool {
for _, s := range priorities {
if s == source {
return true
}
}
return false
}
gotNvidiaSmi := hasPriority(collectorSourceNvidiaSMI)
gotRocmSmi := hasPriority(collectorSourceRocmSMI)
gotTegrastats := caps.hasTegrastats
gotNvtop := caps.hasNvtop
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats)
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gotNvidiaSmi, gotRocmSmi, gotTegrastats)
if tt.wantErr {
wantErr := tt.wantErr
if hasAmdSysfs && (tt.name == "nvidia-smi not available" || tt.name == "no gpu tools available") {
wantErr = false
}
if wantErr {
assert.Error(t, err)
return
}
assert.NoError(t, err)
assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi)
assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi)
assert.Equal(t, tt.wantTegrastats, gm.tegrastats)
assert.Equal(t, tt.wantNvidiaSmi, gotNvidiaSmi)
assert.Equal(t, tt.wantRocmSmi, gotRocmSmi)
assert.Equal(t, tt.wantTegrastats, gotTegrastats)
assert.Equal(t, tt.wantNvtop, gotNvtop)
})
}
}
func TestStartCollector(t *testing.T) {
func TestCollectorStartHelpers(t *testing.T) {
// Save original PATH
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
@@ -1181,6 +1318,27 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
},
},
},
{
name: "nvtop collector",
command: "nvtop",
setup: func(t *testing.T) error {
path := filepath.Join(dir, "nvtop")
script := `#!/bin/sh
echo '[{"device_name":"NVIDIA Test GPU","temp":"52C","power_draw":"31W","gpu_util":"37%","mem_total":"4294967296","mem_used":"536870912","processes":[]}]'`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
validate: func(t *testing.T, gm *GPUManager) {
gpu, exists := gm.GpuDataMap["n0"]
assert.True(t, exists)
if exists {
assert.Equal(t, "NVIDIA Test GPU", gpu.Name)
assert.Equal(t, 52.0, gpu.Temperature)
}
},
},
}
for _, tt := range tests {
@@ -1193,13 +1351,157 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
GpuDataMap: make(map[string]*system.GPUData),
}
}
tt.gm.startCollector(tt.command)
switch tt.command {
case nvidiaSmiCmd:
tt.gm.startNvidiaSmiCollector("4")
case rocmSmiCmd:
tt.gm.startRocmSmiCollector(4300 * time.Millisecond)
case tegraStatsCmd:
tt.gm.startTegraStatsCollector("3700")
case nvtopCmd:
tt.gm.startNvtopCollector("30", nil)
default:
t.Fatalf("unknown test command %q", tt.command)
}
time.Sleep(50 * time.Millisecond) // Give collector time to run
tt.validate(t, tt.gm)
})
}
}
func TestNewGPUManagerPriorityNvtopFallback(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvtop,nvidia-smi")
nvtopPath := filepath.Join(dir, "nvtop")
nvtopScript := `#!/bin/sh
echo 'not-json'`
require.NoError(t, os.WriteFile(nvtopPath, []byte(nvtopScript), 0755))
nvidiaPath := filepath.Join(dir, "nvidia-smi")
nvidiaScript := `#!/bin/sh
echo "0, NVIDIA Priority GPU, 45, 512, 2048, 12, 25"`
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
gm, err := NewGPUManager()
require.NoError(t, err)
require.NotNil(t, gm)
time.Sleep(150 * time.Millisecond)
gpu, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "Priority GPU", gpu.Name)
assert.Equal(t, 45.0, gpu.Temperature)
}
func TestNewGPUManagerPriorityMixedCollectors(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "intel_gpu_top,rocm-smi")
intelPath := filepath.Join(dir, "intel_gpu_top")
intelScript := `#!/bin/sh
echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS VCS"
echo " req act /s % gpu pkg rd wr % se wa % se wa"
echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0"
echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0"
`
require.NoError(t, os.WriteFile(intelPath, []byte(intelScript), 0755))
rocmPath := filepath.Join(dir, "rocm-smi")
rocmScript := `#!/bin/sh
echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "GUID": "34756"}}'
`
require.NoError(t, os.WriteFile(rocmPath, []byte(rocmScript), 0755))
gm, err := NewGPUManager()
require.NoError(t, err)
require.NotNil(t, gm)
time.Sleep(150 * time.Millisecond)
_, intelOk := gm.GpuDataMap["i0"]
_, amdOk := gm.GpuDataMap["34756"]
assert.True(t, intelOk)
assert.True(t, amdOk)
}
func TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml,nvidia-smi")
nvidiaPath := filepath.Join(dir, "nvidia-smi")
nvidiaScript := `#!/bin/sh
echo "0, NVIDIA Fallback GPU, 41, 256, 1024, 8, 14"`
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
gm, err := NewGPUManager()
require.NoError(t, err)
require.NotNil(t, gm)
time.Sleep(150 * time.Millisecond)
gpu, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "Fallback GPU", gpu.Name)
}
func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Run("configured valid collector unavailable", func(t *testing.T) {
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
gm, err := NewGPUManager()
require.Nil(t, gm)
require.Error(t, err)
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
})
t.Run("configured collector list has only unknown entries", func(t *testing.T) {
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "bad,unknown")
gm, err := NewGPUManager()
require.Nil(t, gm)
require.Error(t, err)
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
})
}
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
tegraPath := filepath.Join(dir, "tegrastats")
tegraScript := `#!/bin/sh
echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"`
require.NoError(t, os.WriteFile(tegraPath, []byte(tegraScript), 0755))
gm, err := NewGPUManager()
require.NoError(t, err)
require.NotNil(t, gm)
time.Sleep(100 * time.Millisecond)
gpu, ok := gm.GpuDataMap["0"]
require.True(t, ok)
assert.Equal(t, "GPU", gpu.Name)
}
// TestAccumulationTableDriven tests the accumulation behavior for all three GPU types
func TestAccumulation(t *testing.T) {
type expectedGPUValues struct {

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package health
@@ -37,7 +36,6 @@ func TestHealth(t *testing.T) {
})
// This test uses synctest to simulate time passing.
// NOTE: This test requires GOEXPERIMENT=synctest to run.
t.Run("check with simulated time", func(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
// Update the file to set the initial timestamp.

233
agent/mdraid_linux.go Normal file
View File

@@ -0,0 +1,233 @@
//go:build linux
package agent
import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/smart"
)
// mdraidSysfsRoot is a test hook; production value is "/sys".
var mdraidSysfsRoot = "/sys"
type mdraidHealth struct {
level string
arrayState string
degraded uint64
raidDisks uint64
syncAction string
syncCompleted string
syncSpeed string
mismatchCnt uint64
capacity uint64
}
// scanMdraidDevices discovers Linux md arrays exposed in sysfs.
func scanMdraidDevices() []*DeviceInfo {
blockDir := filepath.Join(mdraidSysfsRoot, "block")
entries, err := os.ReadDir(blockDir)
if err != nil {
return nil
}
devices := make([]*DeviceInfo, 0, 2)
for _, ent := range entries {
name := ent.Name()
if !isMdraidBlockName(name) {
continue
}
mdDir := filepath.Join(blockDir, name, "md")
if !utils.FileExists(filepath.Join(mdDir, "array_state")) {
continue
}
devPath := filepath.Join("/dev", name)
devices = append(devices, &DeviceInfo{
Name: devPath,
Type: "mdraid",
InfoName: devPath + " [mdraid]",
Protocol: "MD",
})
}
return devices
}
// collectMdraidHealth reads mdraid health and stores it in SmartDataMap.
func (sm *SmartManager) collectMdraidHealth(deviceInfo *DeviceInfo) (bool, error) {
if deviceInfo == nil || deviceInfo.Name == "" {
return false, nil
}
base := filepath.Base(deviceInfo.Name)
if !isMdraidBlockName(base) && !strings.EqualFold(deviceInfo.Type, "mdraid") {
return false, nil
}
health, ok := readMdraidHealth(base)
if !ok {
return false, nil
}
deviceInfo.Type = "mdraid"
key := fmt.Sprintf("mdraid:%s", base)
status := mdraidSmartStatus(health)
attrs := make([]*smart.SmartAttribute, 0, 10)
if health.arrayState != "" {
attrs = append(attrs, &smart.SmartAttribute{Name: "ArrayState", RawString: health.arrayState})
}
if health.level != "" {
attrs = append(attrs, &smart.SmartAttribute{Name: "RaidLevel", RawString: health.level})
}
if health.raidDisks > 0 {
attrs = append(attrs, &smart.SmartAttribute{Name: "RaidDisks", RawValue: health.raidDisks})
}
if health.degraded > 0 {
attrs = append(attrs, &smart.SmartAttribute{Name: "Degraded", RawValue: health.degraded})
}
if health.syncAction != "" {
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncAction", RawString: health.syncAction})
}
if health.syncCompleted != "" {
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncCompleted", RawString: health.syncCompleted})
}
if health.syncSpeed != "" {
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncSpeed", RawString: health.syncSpeed})
}
if health.mismatchCnt > 0 {
attrs = append(attrs, &smart.SmartAttribute{Name: "MismatchCount", RawValue: health.mismatchCnt})
}
sm.Lock()
defer sm.Unlock()
if _, exists := sm.SmartDataMap[key]; !exists {
sm.SmartDataMap[key] = &smart.SmartData{}
}
data := sm.SmartDataMap[key]
data.ModelName = "Linux MD RAID"
if health.level != "" {
data.ModelName = "Linux MD RAID (" + health.level + ")"
}
data.Capacity = health.capacity
data.SmartStatus = status
data.DiskName = filepath.Join("/dev", base)
data.DiskType = "mdraid"
data.Attributes = attrs
return true, nil
}
// readMdraidHealth reads md array health fields from sysfs.
func readMdraidHealth(blockName string) (mdraidHealth, bool) {
var out mdraidHealth
if !isMdraidBlockName(blockName) {
return out, false
}
mdDir := filepath.Join(mdraidSysfsRoot, "block", blockName, "md")
arrayState, okState := utils.ReadStringFileOK(filepath.Join(mdDir, "array_state"))
if !okState {
return out, false
}
out.arrayState = arrayState
out.level = utils.ReadStringFile(filepath.Join(mdDir, "level"))
out.syncAction = utils.ReadStringFile(filepath.Join(mdDir, "sync_action"))
out.syncCompleted = utils.ReadStringFile(filepath.Join(mdDir, "sync_completed"))
out.syncSpeed = utils.ReadStringFile(filepath.Join(mdDir, "sync_speed"))
if val, ok := utils.ReadUintFile(filepath.Join(mdDir, "raid_disks")); ok {
out.raidDisks = val
}
if val, ok := utils.ReadUintFile(filepath.Join(mdDir, "degraded")); ok {
out.degraded = val
}
if val, ok := utils.ReadUintFile(filepath.Join(mdDir, "mismatch_cnt")); ok {
out.mismatchCnt = val
}
if capBytes, ok := readMdraidBlockCapacityBytes(blockName, mdraidSysfsRoot); ok {
out.capacity = capBytes
}
return out, true
}
// mdraidSmartStatus maps md state/sync signals to a SMART-like status.
func mdraidSmartStatus(health mdraidHealth) string {
state := strings.ToLower(strings.TrimSpace(health.arrayState))
switch state {
case "inactive", "faulty", "broken", "stopped":
return "FAILED"
}
// During rebuild/recovery, arrays are often temporarily degraded; report as
// warning instead of hard failure while synchronization is in progress.
syncAction := strings.ToLower(strings.TrimSpace(health.syncAction))
switch syncAction {
case "resync", "recover", "reshape":
return "WARNING"
}
if health.degraded > 0 {
return "FAILED"
}
switch syncAction {
case "check", "repair":
return "WARNING"
}
switch state {
case "clean", "active", "active-idle", "write-pending", "read-auto", "readonly":
return "PASSED"
}
return "UNKNOWN"
}
// isMdraidBlockName matches /dev/mdN-style block device names.
func isMdraidBlockName(name string) bool {
if !strings.HasPrefix(name, "md") {
return false
}
suffix := strings.TrimPrefix(name, "md")
if suffix == "" {
return false
}
for _, c := range suffix {
if c < '0' || c > '9' {
return false
}
}
return true
}
// readMdraidBlockCapacityBytes converts block size metadata into bytes.
func readMdraidBlockCapacityBytes(blockName, root string) (uint64, bool) {
sizePath := filepath.Join(root, "block", blockName, "size")
lbsPath := filepath.Join(root, "block", blockName, "queue", "logical_block_size")
sizeStr, ok := utils.ReadStringFileOK(sizePath)
if !ok {
return 0, false
}
sectors, err := strconv.ParseUint(sizeStr, 10, 64)
if err != nil || sectors == 0 {
return 0, false
}
logicalBlockSize := uint64(512)
if lbsStr, ok := utils.ReadStringFileOK(lbsPath); ok {
if parsed, err := strconv.ParseUint(lbsStr, 10, 64); err == nil && parsed > 0 {
logicalBlockSize = parsed
}
}
return sectors * logicalBlockSize, true
}

103
agent/mdraid_linux_test.go Normal file
View File

@@ -0,0 +1,103 @@
//go:build linux
package agent
import (
"os"
"path/filepath"
"testing"
"github.com/henrygd/beszel/internal/entities/smart"
)
func TestMdraidMockSysfsScanAndCollect(t *testing.T) {
tmp := t.TempDir()
prev := mdraidSysfsRoot
mdraidSysfsRoot = tmp
t.Cleanup(func() { mdraidSysfsRoot = prev })
mdDir := filepath.Join(tmp, "block", "md0", "md")
queueDir := filepath.Join(tmp, "block", "md0", "queue")
if err := os.MkdirAll(mdDir, 0o755); err != nil {
t.Fatal(err)
}
if err := os.MkdirAll(queueDir, 0o755); err != nil {
t.Fatal(err)
}
write := func(path, content string) {
t.Helper()
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
t.Fatal(err)
}
}
write(filepath.Join(mdDir, "array_state"), "active\n")
write(filepath.Join(mdDir, "level"), "raid1\n")
write(filepath.Join(mdDir, "raid_disks"), "2\n")
write(filepath.Join(mdDir, "degraded"), "0\n")
write(filepath.Join(mdDir, "sync_action"), "resync\n")
write(filepath.Join(mdDir, "sync_completed"), "10%\n")
write(filepath.Join(mdDir, "sync_speed"), "100M\n")
write(filepath.Join(mdDir, "mismatch_cnt"), "0\n")
write(filepath.Join(queueDir, "logical_block_size"), "512\n")
write(filepath.Join(tmp, "block", "md0", "size"), "2048\n")
devs := scanMdraidDevices()
if len(devs) != 1 {
t.Fatalf("scanMdraidDevices() = %d devices, want 1", len(devs))
}
if devs[0].Name != "/dev/md0" || devs[0].Type != "mdraid" {
t.Fatalf("scanMdraidDevices()[0] = %+v, want Name=/dev/md0 Type=mdraid", devs[0])
}
sm := &SmartManager{SmartDataMap: map[string]*smart.SmartData{}}
ok, err := sm.collectMdraidHealth(devs[0])
if err != nil || !ok {
t.Fatalf("collectMdraidHealth() = (ok=%v, err=%v), want (true,nil)", ok, err)
}
if len(sm.SmartDataMap) != 1 {
t.Fatalf("SmartDataMap len=%d, want 1", len(sm.SmartDataMap))
}
var got *smart.SmartData
for _, v := range sm.SmartDataMap {
got = v
break
}
if got == nil {
t.Fatalf("SmartDataMap value nil")
}
if got.DiskType != "mdraid" || got.DiskName != "/dev/md0" {
t.Fatalf("disk fields = (type=%q name=%q), want (mdraid,/dev/md0)", got.DiskType, got.DiskName)
}
if got.SmartStatus != "WARNING" {
t.Fatalf("SmartStatus=%q, want WARNING", got.SmartStatus)
}
if got.ModelName == "" || got.Capacity == 0 {
t.Fatalf("identity fields = (model=%q cap=%d), want non-empty model and cap>0", got.ModelName, got.Capacity)
}
if len(got.Attributes) < 5 {
t.Fatalf("attributes len=%d, want >= 5", len(got.Attributes))
}
}
func TestMdraidSmartStatus(t *testing.T) {
if got := mdraidSmartStatus(mdraidHealth{arrayState: "inactive"}); got != "FAILED" {
t.Fatalf("mdraidSmartStatus(inactive) = %q, want FAILED", got)
}
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1, syncAction: "recover"}); got != "WARNING" {
t.Fatalf("mdraidSmartStatus(degraded+recover) = %q, want WARNING", got)
}
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1}); got != "FAILED" {
t.Fatalf("mdraidSmartStatus(degraded) = %q, want FAILED", got)
}
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", syncAction: "recover"}); got != "WARNING" {
t.Fatalf("mdraidSmartStatus(recover) = %q, want WARNING", got)
}
if got := mdraidSmartStatus(mdraidHealth{arrayState: "clean"}); got != "PASSED" {
t.Fatalf("mdraidSmartStatus(clean) = %q, want PASSED", got)
}
if got := mdraidSmartStatus(mdraidHealth{arrayState: "unknown"}); got != "UNKNOWN" {
t.Fatalf("mdraidSmartStatus(unknown) = %q, want UNKNOWN", got)
}
}

11
agent/mdraid_stub.go Normal file
View File

@@ -0,0 +1,11 @@
//go:build !linux
package agent
func scanMdraidDevices() []*DeviceInfo {
return nil
}
func (sm *SmartManager) collectMdraidHealth(deviceInfo *DeviceInfo) (bool, error) {
return false, nil
}

View File

@@ -8,6 +8,7 @@ import (
"time"
"github.com/henrygd/beszel/agent/deltatracker"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
psutilNet "github.com/shirou/gopsutil/v4/net"
)
@@ -94,7 +95,7 @@ func (a *Agent) initializeNetIoStats() {
a.netInterfaces = make(map[string]struct{}, 0)
// parse NICS env var for whitelist / blacklist
nicsEnvVal, nicsEnvExists := GetEnv("NICS")
nicsEnvVal, nicsEnvExists := utils.GetEnv("NICS")
var nicCfg *NicConfig
if nicsEnvExists {
nicCfg = newNicConfig(nicsEnvVal)
@@ -103,10 +104,7 @@ func (a *Agent) initializeNetIoStats() {
// get current network I/O stats and record valid interfaces
if netIO, err := psutilNet.IOCounters(true); err == nil {
for _, v := range netIO {
if nicsEnvExists && !isValidNic(v.Name, nicCfg) {
continue
}
if a.skipNetworkInterface(v) {
if skipNetworkInterface(v, nicCfg) {
continue
}
slog.Info("Detected network interface", "name", v.Name, "sent", v.BytesSent, "recv", v.BytesRecv)
@@ -215,10 +213,8 @@ func (a *Agent) applyNetworkTotals(
totalBytesSent, totalBytesRecv uint64,
bytesSentPerSecond, bytesRecvPerSecond uint64,
) {
networkSentPs := bytesToMegabytes(float64(bytesSentPerSecond))
networkRecvPs := bytesToMegabytes(float64(bytesRecvPerSecond))
if networkSentPs > 10_000 || networkRecvPs > 10_000 {
slog.Warn("Invalid net stats. Resetting.", "sent", networkSentPs, "recv", networkRecvPs)
if bytesSentPerSecond > 10_000_000_000 || bytesRecvPerSecond > 10_000_000_000 {
slog.Warn("Invalid net stats. Resetting.", "sent", bytesSentPerSecond, "recv", bytesRecvPerSecond)
for _, v := range netIO {
if _, exists := a.netInterfaces[v.Name]; !exists {
continue
@@ -228,21 +224,29 @@ func (a *Agent) applyNetworkTotals(
a.initializeNetIoStats()
delete(a.netIoStats, cacheTimeMs)
delete(a.netInterfaceDeltaTrackers, cacheTimeMs)
systemStats.NetworkSent = 0
systemStats.NetworkRecv = 0
systemStats.Bandwidth[0], systemStats.Bandwidth[1] = 0, 0
return
}
systemStats.NetworkSent = networkSentPs
systemStats.NetworkRecv = networkRecvPs
systemStats.Bandwidth[0], systemStats.Bandwidth[1] = bytesSentPerSecond, bytesRecvPerSecond
nis.BytesSent = totalBytesSent
nis.BytesRecv = totalBytesRecv
a.netIoStats[cacheTimeMs] = nis
}
func (a *Agent) skipNetworkInterface(v psutilNet.IOCountersStat) bool {
// skipNetworkInterface returns true if the network interface should be ignored.
func skipNetworkInterface(v psutilNet.IOCountersStat, nicCfg *NicConfig) bool {
if nicCfg != nil {
if !isValidNic(v.Name, nicCfg) {
return true
}
// In whitelist mode, we honor explicit inclusion without auto-filtering.
if !nicCfg.isBlacklist {
return false
}
// In blacklist mode, still apply the auto-filter below.
}
switch {
case strings.HasPrefix(v.Name, "lo"),
strings.HasPrefix(v.Name, "docker"),

View File

@@ -261,6 +261,39 @@ func TestNewNicConfig(t *testing.T) {
})
}
}
func TestSkipNetworkInterface(t *testing.T) {
tests := []struct {
name string
nic psutilNet.IOCountersStat
nicCfg *NicConfig
expectSkip bool
}{
{"loopback lo", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, nil, true},
{"loopback lo0", psutilNet.IOCountersStat{Name: "lo0", BytesSent: 100, BytesRecv: 100}, nil, true},
{"docker prefix", psutilNet.IOCountersStat{Name: "docker0", BytesSent: 100, BytesRecv: 100}, nil, true},
{"br- prefix", psutilNet.IOCountersStat{Name: "br-lan", BytesSent: 100, BytesRecv: 100}, nil, true},
{"veth prefix", psutilNet.IOCountersStat{Name: "veth0abc", BytesSent: 100, BytesRecv: 100}, nil, true},
{"bond prefix", psutilNet.IOCountersStat{Name: "bond0", BytesSent: 100, BytesRecv: 100}, nil, true},
{"cali prefix", psutilNet.IOCountersStat{Name: "cali1234", BytesSent: 100, BytesRecv: 100}, nil, true},
{"zero BytesRecv", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 0}, nil, true},
{"zero BytesSent", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 0, BytesRecv: 100}, nil, true},
{"both zero", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 0, BytesRecv: 0}, nil, true},
{"normal eth0", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 200}, nil, false},
{"normal wlan0", psutilNet.IOCountersStat{Name: "wlan0", BytesSent: 1, BytesRecv: 1}, nil, false},
{"whitelist overrides skip (docker)", psutilNet.IOCountersStat{Name: "docker0", BytesSent: 100, BytesRecv: 100}, newNicConfig("docker0"), false},
{"whitelist overrides skip (lo)", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, newNicConfig("lo"), false},
{"whitelist exclusion", psutilNet.IOCountersStat{Name: "eth1", BytesSent: 100, BytesRecv: 100}, newNicConfig("eth0"), true},
{"blacklist skip lo", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), true},
{"blacklist explicit eth0", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), true},
{"blacklist allow eth1", psutilNet.IOCountersStat{Name: "eth1", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.expectSkip, skipNetworkInterface(tt.nic, tt.nicCfg))
})
}
}
func TestEnsureNetworkInterfacesMap(t *testing.T) {
var a Agent
var stats system.Stats
@@ -383,8 +416,6 @@ func TestApplyNetworkTotals(t *testing.T) {
totalBytesSent uint64
totalBytesRecv uint64
expectReset bool
expectedNetworkSent float64
expectedNetworkRecv float64
expectedBandwidthSent uint64
expectedBandwidthRecv uint64
}{
@@ -395,8 +426,6 @@ func TestApplyNetworkTotals(t *testing.T) {
totalBytesSent: 10000000,
totalBytesRecv: 20000000,
expectReset: false,
expectedNetworkSent: 0.95, // ~1 MB/s rounded to 2 decimals
expectedNetworkRecv: 1.91, // ~2 MB/s rounded to 2 decimals
expectedBandwidthSent: 1000000,
expectedBandwidthRecv: 2000000,
},
@@ -424,18 +453,6 @@ func TestApplyNetworkTotals(t *testing.T) {
totalBytesRecv: 20000000,
expectReset: true,
},
{
name: "Valid network stats - at threshold boundary",
bytesSentPerSecond: 10485750000, // ~9999.99 MB/s (rounds to 9999.99)
bytesRecvPerSecond: 10485750000, // ~9999.99 MB/s (rounds to 9999.99)
totalBytesSent: 10000000,
totalBytesRecv: 20000000,
expectReset: false,
expectedNetworkSent: 9999.99,
expectedNetworkRecv: 9999.99,
expectedBandwidthSent: 10485750000,
expectedBandwidthRecv: 10485750000,
},
{
name: "Zero values",
bytesSentPerSecond: 0,
@@ -443,8 +460,6 @@ func TestApplyNetworkTotals(t *testing.T) {
totalBytesSent: 0,
totalBytesRecv: 0,
expectReset: false,
expectedNetworkSent: 0.0,
expectedNetworkRecv: 0.0,
expectedBandwidthSent: 0,
expectedBandwidthRecv: 0,
},
@@ -481,14 +496,10 @@ func TestApplyNetworkTotals(t *testing.T) {
// Should have reset network tracking state - maps cleared and stats zeroed
assert.NotContains(t, a.netIoStats, cacheTimeMs, "cache entry should be cleared after reset")
assert.NotContains(t, a.netInterfaceDeltaTrackers, cacheTimeMs, "tracker should be cleared on reset")
assert.Zero(t, systemStats.NetworkSent)
assert.Zero(t, systemStats.NetworkRecv)
assert.Zero(t, systemStats.Bandwidth[0])
assert.Zero(t, systemStats.Bandwidth[1])
} else {
// Should have applied stats
assert.Equal(t, tt.expectedNetworkSent, systemStats.NetworkSent)
assert.Equal(t, tt.expectedNetworkRecv, systemStats.NetworkRecv)
assert.Equal(t, tt.expectedBandwidthSent, systemStats.Bandwidth[0])
assert.Equal(t, tt.expectedBandwidthRecv, systemStats.Bandwidth[1])

View File

@@ -10,6 +10,7 @@ import (
"strings"
"unicode/utf8"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/shirou/gopsutil/v4/common"
@@ -26,9 +27,9 @@ type SensorConfig struct {
}
func (a *Agent) newSensorConfig() *SensorConfig {
primarySensor, _ := GetEnv("PRIMARY_SENSOR")
sysSensors, _ := GetEnv("SYS_SENSORS")
sensorsEnvVal, sensorsSet := GetEnv("SENSORS")
primarySensor, _ := utils.GetEnv("PRIMARY_SENSOR")
sysSensors, _ := utils.GetEnv("SYS_SENSORS")
sensorsEnvVal, sensorsSet := utils.GetEnv("SENSORS")
skipCollection := sensorsSet && sensorsEnvVal == ""
return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal, skipCollection)
@@ -135,7 +136,7 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
case sensorName:
a.systemInfo.DashboardTemp = sensor.Temperature
}
systemStats.Temperatures[sensorName] = twoDecimals(sensor.Temperature)
systemStats.Temperatures[sensorName] = utils.TwoDecimals(sensor.Temperature)
}
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent

View File

@@ -12,6 +12,7 @@ import (
"time"
"github.com/henrygd/beszel"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/common"
"github.com/henrygd/beszel/internal/entities/system"
@@ -36,6 +37,9 @@ var hubVersions map[string]semver.Version
// and begins listening for connections. Returns an error if the server
// is already running or if there's an issue starting the server.
func (a *Agent) StartServer(opts ServerOptions) error {
if disableSSH, _ := utils.GetEnv("DISABLE_SSH"); disableSSH == "true" {
return errors.New("SSH disabled")
}
if a.server != nil {
return errors.New("server already started")
}
@@ -235,11 +239,11 @@ func ParseKeys(input string) ([]gossh.PublicKey, error) {
// and finally defaults to ":45876".
func GetAddress(addr string) string {
if addr == "" {
addr, _ = GetEnv("LISTEN")
addr, _ = utils.GetEnv("LISTEN")
}
if addr == "" {
// Legacy PORT environment variable support
addr, _ = GetEnv("PORT")
addr, _ = utils.GetEnv("PORT")
}
if addr == "" {
return ":45876"
@@ -255,7 +259,7 @@ func GetAddress(addr string) string {
// It checks the NETWORK environment variable first, then infers from
// the address format: addresses starting with "/" are "unix", others are "tcp".
func GetNetwork(addr string) string {
if network, ok := GetEnv("NETWORK"); ok && network != "" {
if network, ok := utils.GetEnv("NETWORK"); ok && network != "" {
return network
}
if strings.HasPrefix(addr, "/") {

View File

@@ -1,3 +1,5 @@
//go:build testing
package agent
import (
@@ -180,6 +182,23 @@ func TestStartServer(t *testing.T) {
}
}
func TestStartServerDisableSSH(t *testing.T) {
os.Setenv("BESZEL_AGENT_DISABLE_SSH", "true")
defer os.Unsetenv("BESZEL_AGENT_DISABLE_SSH")
agent, err := NewAgent("")
require.NoError(t, err)
opts := ServerOptions{
Network: "tcp",
Addr: ":45990",
}
err = agent.StartServer(opts)
assert.Error(t, err)
assert.Contains(t, err.Error(), "SSH disabled")
}
/////////////////////////////////////////////////////////////////
//////////////////// ParseKeys Tests ////////////////////////////
/////////////////////////////////////////////////////////////////

View File

@@ -8,6 +8,7 @@ import (
"encoding/json"
"errors"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
@@ -17,9 +18,8 @@ import (
"sync"
"time"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/smart"
"log/slog"
)
// SmartManager manages data collection for SMART devices
@@ -29,7 +29,7 @@ type SmartManager struct {
SmartDevices []*DeviceInfo
refreshMutex sync.Mutex
lastScanTime time.Time
binPath string
smartctlPath string
excludedDevices map[string]struct{}
}
@@ -157,7 +157,7 @@ func (sm *SmartManager) ScanDevices(force bool) error {
currentDevices := sm.devicesSnapshot()
var configuredDevices []*DeviceInfo
if configuredRaw, ok := GetEnv("SMART_DEVICES"); ok {
if configuredRaw, ok := utils.GetEnv("SMART_DEVICES"); ok {
slog.Info("SMART_DEVICES", "value", configuredRaw)
config := strings.TrimSpace(configuredRaw)
if config == "" {
@@ -171,27 +171,42 @@ func (sm *SmartManager) ScanDevices(force bool) error {
configuredDevices = parsedDevices
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, sm.binPath, "--scan", "-j")
output, err := cmd.Output()
var (
scanErr error
scannedDevices []*DeviceInfo
hasValidScan bool
)
if err != nil {
scanErr = err
} else {
scannedDevices, hasValidScan = sm.parseScan(output)
if !hasValidScan {
scanErr = errNoValidSmartData
if sm.smartctlPath != "" {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, sm.smartctlPath, "--scan", "-j")
output, err := cmd.Output()
if err != nil {
scanErr = err
} else {
scannedDevices, hasValidScan = sm.parseScan(output)
if !hasValidScan {
scanErr = errNoValidSmartData
}
}
}
// Add eMMC devices (Linux only) by reading sysfs health fields. This does not
// require smartctl and does not scan the whole device.
if emmcDevices := scanEmmcDevices(); len(emmcDevices) > 0 {
scannedDevices = append(scannedDevices, emmcDevices...)
hasValidScan = true
}
// Add Linux mdraid arrays by reading sysfs health fields. This does not
// require smartctl and does not scan the whole device.
if raidDevices := scanMdraidDevices(); len(raidDevices) > 0 {
scannedDevices = append(scannedDevices, raidDevices...)
hasValidScan = true
}
finalDevices := mergeDeviceLists(currentDevices, scannedDevices, configuredDevices)
finalDevices = sm.filterExcludedDevices(finalDevices)
sm.updateSmartDevices(finalDevices)
@@ -208,7 +223,7 @@ func (sm *SmartManager) ScanDevices(force bool) error {
}
func (sm *SmartManager) parseConfiguredDevices(config string) ([]*DeviceInfo, error) {
splitChar := os.Getenv("SMART_DEVICES_SEPARATOR")
splitChar, _ := utils.GetEnv("SMART_DEVICES_SEPARATOR")
if splitChar == "" {
splitChar = ","
}
@@ -246,7 +261,7 @@ func (sm *SmartManager) parseConfiguredDevices(config string) ([]*DeviceInfo, er
}
func (sm *SmartManager) refreshExcludedDevices() {
rawValue, _ := GetEnv("EXCLUDE_SMART")
rawValue, _ := utils.GetEnv("EXCLUDE_SMART")
sm.excludedDevices = make(map[string]struct{})
for entry := range strings.SplitSeq(rawValue, ",") {
@@ -443,6 +458,24 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
return errNoValidSmartData
}
// mdraid health is not exposed via SMART; Linux exposes array state in sysfs.
if deviceInfo != nil {
if ok, err := sm.collectMdraidHealth(deviceInfo); ok {
return err
}
}
// eMMC health is not exposed via SMART on Linux, but the kernel provides
// wear / EOL indicators via sysfs. Prefer that path when available.
if deviceInfo != nil {
if ok, err := sm.collectEmmcHealth(deviceInfo); ok {
return err
}
}
if sm.smartctlPath == "" {
return errNoValidSmartData
}
// slog.Info("collecting SMART data", "device", deviceInfo.Name, "type", deviceInfo.Type, "has_existing_data", sm.hasDataForDevice(deviceInfo.Name))
// Check if we have any existing data for this device
@@ -453,11 +486,11 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
// Try with -n standby first if we have existing data
args := sm.smartctlArgs(deviceInfo, hasExistingData)
cmd := exec.CommandContext(ctx, sm.binPath, args...)
cmd := exec.CommandContext(ctx, sm.smartctlPath, args...)
output, err := cmd.CombinedOutput()
// Check if device is in standby (exit status 2)
if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 2 {
if exitErr, ok := errors.AsType[*exec.ExitError](err); ok && exitErr.ExitCode() == 2 {
if hasExistingData {
// Device is in standby and we have cached data, keep using cache
return nil
@@ -466,7 +499,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
ctx2, cancel2 := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel2()
args = sm.smartctlArgs(deviceInfo, false)
cmd = exec.CommandContext(ctx2, sm.binPath, args...)
cmd = exec.CommandContext(ctx2, sm.smartctlPath, args...)
output, err = cmd.CombinedOutput()
}
@@ -483,7 +516,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
ctx3, cancel3 := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel3()
args = sm.smartctlArgs(deviceInfo, false)
cmd = exec.CommandContext(ctx3, sm.binPath, args...)
cmd = exec.CommandContext(ctx3, sm.smartctlPath, args...)
output, err = cmd.CombinedOutput()
hasValidData = sm.parseSmartOutput(deviceInfo, output)
@@ -838,15 +871,18 @@ func (sm *SmartManager) parseSmartForSata(output []byte) (bool, int) {
smartData.FirmwareVersion = data.FirmwareVersion
smartData.Capacity = data.UserCapacity.Bytes
smartData.Temperature = data.Temperature.Current
if smartData.Temperature == 0 {
if temp, ok := temperatureFromAtaDeviceStatistics(data.AtaDeviceStatistics); ok {
smartData.Temperature = temp
}
}
smartData.SmartStatus = getSmartStatus(smartData.Temperature, data.SmartStatus.Passed)
smartData.DiskName = data.Device.Name
smartData.DiskType = data.Device.Type
// get values from ata_device_statistics if necessary
var ataDeviceStats smart.AtaDeviceStatistics
if smartData.Temperature == 0 {
if temp := findAtaDeviceStatisticsValue(&data, &ataDeviceStats, 5, "Current Temperature", 0, 255); temp != nil {
smartData.Temperature = uint8(*temp)
}
}
// update SmartAttributes
smartData.Attributes = make([]*smart.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
for _, attr := range data.AtaSmartAttributes.Table {
@@ -881,23 +917,20 @@ func getSmartStatus(temperature uint8, passed bool) string {
}
}
func temperatureFromAtaDeviceStatistics(stats smart.AtaDeviceStatistics) (uint8, bool) {
entry := findAtaDeviceStatisticsEntry(stats, 5, "Current Temperature")
if entry == nil || entry.Value == nil {
return 0, false
}
if *entry.Value > 255 {
return 0, false
}
return uint8(*entry.Value), true
}
// findAtaDeviceStatisticsEntry centralizes ATA devstat lookups so additional
// metrics can be pulled from the same structure in the future.
func findAtaDeviceStatisticsEntry(stats smart.AtaDeviceStatistics, pageNumber uint8, entryName string) *smart.AtaDeviceStatisticsEntry {
for pageIdx := range stats.Pages {
page := &stats.Pages[pageIdx]
if page.Number != pageNumber {
func findAtaDeviceStatisticsValue(data *smart.SmartInfoForSata, ataDeviceStats *smart.AtaDeviceStatistics, entryNumber uint8, entryName string, minValue, maxValue int64) *int64 {
if len(ataDeviceStats.Pages) == 0 {
if len(data.AtaDeviceStatistics) == 0 {
return nil
}
if err := json.Unmarshal(data.AtaDeviceStatistics, ataDeviceStats); err != nil {
return nil
}
}
for pageIdx := range ataDeviceStats.Pages {
page := &ataDeviceStats.Pages[pageIdx]
if page.Number != entryNumber {
continue
}
for entryIdx := range page.Table {
@@ -905,7 +938,10 @@ func findAtaDeviceStatisticsEntry(stats smart.AtaDeviceStatistics, pageNumber ui
if !strings.EqualFold(entry.Name, entryName) {
continue
}
return entry
if entry.Value == nil || *entry.Value < minValue || *entry.Value > maxValue {
return nil
}
return entry.Value
}
}
return nil
@@ -1124,11 +1160,17 @@ func NewSmartManager() (*SmartManager, error) {
}
sm.refreshExcludedDevices()
path, err := sm.detectSmartctl()
slog.Debug("smartctl", "path", path, "err", err)
if err != nil {
slog.Debug(err.Error())
// Keep the previous fail-fast behavior unless this Linux host exposes
// eMMC or mdraid health via sysfs, in which case smartctl is optional.
if runtime.GOOS == "linux" {
if len(scanEmmcDevices()) > 0 || len(scanMdraidDevices()) > 0 {
return sm, nil
}
}
return nil, err
}
slog.Debug("smartctl", "path", path)
sm.binPath = path
sm.smartctlPath = path
return sm, nil
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package agent
@@ -122,6 +121,78 @@ func TestParseSmartForSataDeviceStatisticsTemperature(t *testing.T) {
assert.Equal(t, uint8(22), deviceData.Temperature)
}
func TestParseSmartForSataAtaDeviceStatistics(t *testing.T) {
// tests that ata_device_statistics values are parsed correctly
jsonPayload := []byte(`{
"smartctl": {"exit_status": 0},
"device": {"name": "/dev/sdb", "type": "sat"},
"model_name": "SanDisk SSD U110 16GB",
"serial_number": "lksjfh23lhj",
"firmware_version": "U21B001",
"user_capacity": {"bytes": 16013942784},
"smart_status": {"passed": true},
"ata_smart_attributes": {"table": []},
"ata_device_statistics": {
"pages": [
{
"number": 5,
"name": "Temperature Statistics",
"table": [
{"name": "Current Temperature", "value": 43, "flags": {"valid": true}},
{"name": "Specified Minimum Operating Temperature", "value": -20, "flags": {"valid": true}}
]
}
]
}
}`)
sm := &SmartManager{SmartDataMap: make(map[string]*smart.SmartData)}
hasData, exitStatus := sm.parseSmartForSata(jsonPayload)
require.True(t, hasData)
assert.Equal(t, 0, exitStatus)
deviceData, ok := sm.SmartDataMap["lksjfh23lhj"]
require.True(t, ok, "expected smart data entry for serial lksjfh23lhj")
assert.Equal(t, uint8(43), deviceData.Temperature)
}
func TestParseSmartForSataNegativeDeviceStatistics(t *testing.T) {
// Tests that negative values in ata_device_statistics (e.g. min operating temp)
// do not cause the entire SAT parser to fail.
jsonPayload := []byte(`{
"smartctl": {"exit_status": 0},
"device": {"name": "/dev/sdb", "type": "sat"},
"model_name": "SanDisk SSD U110 16GB",
"serial_number": "NEGATIVE123",
"firmware_version": "U21B001",
"user_capacity": {"bytes": 16013942784},
"smart_status": {"passed": true},
"temperature": {"current": 38},
"ata_smart_attributes": {"table": []},
"ata_device_statistics": {
"pages": [
{
"number": 5,
"name": "Temperature Statistics",
"table": [
{"name": "Current Temperature", "value": 38, "flags": {"valid": true}},
{"name": "Specified Minimum Operating Temperature", "value": -20, "flags": {"valid": true}}
]
}
]
}
}`)
sm := &SmartManager{SmartDataMap: make(map[string]*smart.SmartData)}
hasData, exitStatus := sm.parseSmartForSata(jsonPayload)
require.True(t, hasData)
assert.Equal(t, 0, exitStatus)
deviceData, ok := sm.SmartDataMap["NEGATIVE123"]
require.True(t, ok, "expected smart data entry for serial NEGATIVE123")
assert.Equal(t, uint8(38), deviceData.Temperature)
}
func TestParseSmartForSataParentheticalRawValue(t *testing.T) {
jsonPayload := []byte(`{
"smartctl": {"exit_status": 0},
@@ -728,6 +799,182 @@ func TestIsVirtualDeviceScsi(t *testing.T) {
}
}
func TestFindAtaDeviceStatisticsValue(t *testing.T) {
val42 := int64(42)
val100 := int64(100)
valMinus20 := int64(-20)
tests := []struct {
name string
data smart.SmartInfoForSata
ataDeviceStats smart.AtaDeviceStatistics
entryNumber uint8
entryName string
minValue int64
maxValue int64
expectedValue *int64
}{
{
name: "value in ataDeviceStats",
ataDeviceStats: smart.AtaDeviceStatistics{
Pages: []smart.AtaDeviceStatisticsPage{
{
Number: 5,
Table: []smart.AtaDeviceStatisticsEntry{
{Name: "Current Temperature", Value: &val42},
},
},
},
},
entryNumber: 5,
entryName: "Current Temperature",
minValue: 0,
maxValue: 100,
expectedValue: &val42,
},
{
name: "value unmarshaled from data",
data: smart.SmartInfoForSata{
AtaDeviceStatistics: []byte(`{"pages":[{"number":5,"table":[{"name":"Current Temperature","value":100}]}]}`),
},
entryNumber: 5,
entryName: "Current Temperature",
minValue: 0,
maxValue: 255,
expectedValue: &val100,
},
{
name: "value out of range (too high)",
ataDeviceStats: smart.AtaDeviceStatistics{
Pages: []smart.AtaDeviceStatisticsPage{
{
Number: 5,
Table: []smart.AtaDeviceStatisticsEntry{
{Name: "Current Temperature", Value: &val100},
},
},
},
},
entryNumber: 5,
entryName: "Current Temperature",
minValue: 0,
maxValue: 50,
expectedValue: nil,
},
{
name: "value out of range (too low)",
ataDeviceStats: smart.AtaDeviceStatistics{
Pages: []smart.AtaDeviceStatisticsPage{
{
Number: 5,
Table: []smart.AtaDeviceStatisticsEntry{
{Name: "Min Temp", Value: &valMinus20},
},
},
},
},
entryNumber: 5,
entryName: "Min Temp",
minValue: 0,
maxValue: 100,
expectedValue: nil,
},
{
name: "no statistics available",
data: smart.SmartInfoForSata{},
entryNumber: 5,
entryName: "Current Temperature",
minValue: 0,
maxValue: 255,
expectedValue: nil,
},
{
name: "wrong page number",
ataDeviceStats: smart.AtaDeviceStatistics{
Pages: []smart.AtaDeviceStatisticsPage{
{
Number: 1,
Table: []smart.AtaDeviceStatisticsEntry{
{Name: "Current Temperature", Value: &val42},
},
},
},
},
entryNumber: 5,
entryName: "Current Temperature",
minValue: 0,
maxValue: 100,
expectedValue: nil,
},
{
name: "wrong entry name",
ataDeviceStats: smart.AtaDeviceStatistics{
Pages: []smart.AtaDeviceStatisticsPage{
{
Number: 5,
Table: []smart.AtaDeviceStatisticsEntry{
{Name: "Other Stat", Value: &val42},
},
},
},
},
entryNumber: 5,
entryName: "Current Temperature",
minValue: 0,
maxValue: 100,
expectedValue: nil,
},
{
name: "case insensitive name match",
ataDeviceStats: smart.AtaDeviceStatistics{
Pages: []smart.AtaDeviceStatisticsPage{
{
Number: 5,
Table: []smart.AtaDeviceStatisticsEntry{
{Name: "CURRENT TEMPERATURE", Value: &val42},
},
},
},
},
entryNumber: 5,
entryName: "Current Temperature",
minValue: 0,
maxValue: 100,
expectedValue: &val42,
},
{
name: "entry value is nil",
ataDeviceStats: smart.AtaDeviceStatistics{
Pages: []smart.AtaDeviceStatisticsPage{
{
Number: 5,
Table: []smart.AtaDeviceStatisticsEntry{
{Name: "Current Temperature", Value: nil},
},
},
},
},
entryNumber: 5,
entryName: "Current Temperature",
minValue: 0,
maxValue: 100,
expectedValue: nil,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := findAtaDeviceStatisticsValue(&tt.data, &tt.ataDeviceStats, tt.entryNumber, tt.entryName, tt.minValue, tt.maxValue)
if tt.expectedValue == nil {
assert.Nil(t, result)
} else {
require.NotNil(t, result)
assert.Equal(t, *tt.expectedValue, *result)
}
})
}
}
func TestRefreshExcludedDevices(t *testing.T) {
tests := []struct {
name string

View File

@@ -7,12 +7,13 @@ import (
"log/slog"
"os"
"runtime"
"strconv"
"strings"
"time"
"github.com/henrygd/beszel"
"github.com/henrygd/beszel/agent/battery"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/agent/zfs"
"github.com/henrygd/beszel/internal/entities/container"
"github.com/henrygd/beszel/internal/entities/system"
@@ -107,7 +108,7 @@ func (a *Agent) refreshSystemDetails() {
}
// zfs
if _, err := getARCSize(); err != nil {
if _, err := zfs.ARCSize(); err != nil {
slog.Debug("Not monitoring ZFS ARC", "err", err)
} else {
a.zfs = true
@@ -127,13 +128,13 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
// cpu metrics
cpuMetrics, err := getCpuMetrics(cacheTimeMs)
if err == nil {
systemStats.Cpu = twoDecimals(cpuMetrics.Total)
systemStats.Cpu = utils.TwoDecimals(cpuMetrics.Total)
systemStats.CpuBreakdown = []float64{
twoDecimals(cpuMetrics.User),
twoDecimals(cpuMetrics.System),
twoDecimals(cpuMetrics.Iowait),
twoDecimals(cpuMetrics.Steal),
twoDecimals(cpuMetrics.Idle),
utils.TwoDecimals(cpuMetrics.User),
utils.TwoDecimals(cpuMetrics.System),
utils.TwoDecimals(cpuMetrics.Iowait),
utils.TwoDecimals(cpuMetrics.Steal),
utils.TwoDecimals(cpuMetrics.Idle),
}
} else {
slog.Error("Error getting cpu metrics", "err", err)
@@ -157,8 +158,8 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
// memory
if v, err := mem.VirtualMemory(); err == nil {
// swap
systemStats.Swap = bytesToGigabytes(v.SwapTotal)
systemStats.SwapUsed = bytesToGigabytes(v.SwapTotal - v.SwapFree - v.SwapCached)
systemStats.Swap = utils.BytesToGigabytes(v.SwapTotal)
systemStats.SwapUsed = utils.BytesToGigabytes(v.SwapTotal - v.SwapFree - v.SwapCached)
// cache + buffers value for default mem calculation
// note: gopsutil automatically adds SReclaimable to v.Cached
cacheBuff := v.Cached + v.Buffers - v.Shared
@@ -178,16 +179,16 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
// }
// subtract ZFS ARC size from used memory and add as its own category
if a.zfs {
if arcSize, _ := getARCSize(); arcSize > 0 && arcSize < v.Used {
if arcSize, _ := zfs.ARCSize(); arcSize > 0 && arcSize < v.Used {
v.Used = v.Used - arcSize
v.UsedPercent = float64(v.Used) / float64(v.Total) * 100.0
systemStats.MemZfsArc = bytesToGigabytes(arcSize)
systemStats.MemZfsArc = utils.BytesToGigabytes(arcSize)
}
}
systemStats.Mem = bytesToGigabytes(v.Total)
systemStats.MemBuffCache = bytesToGigabytes(cacheBuff)
systemStats.MemUsed = bytesToGigabytes(v.Used)
systemStats.MemPct = twoDecimals(v.UsedPercent)
systemStats.Mem = utils.BytesToGigabytes(v.Total)
systemStats.MemBuffCache = utils.BytesToGigabytes(cacheBuff)
systemStats.MemUsed = utils.BytesToGigabytes(v.Used)
systemStats.MemPct = utils.TwoDecimals(v.UsedPercent)
}
// disk usage
@@ -250,32 +251,6 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
return systemStats
}
// Returns the size of the ZFS ARC memory cache in bytes
func getARCSize() (uint64, error) {
file, err := os.Open("/proc/spl/kstat/zfs/arcstats")
if err != nil {
return 0, err
}
defer file.Close()
// Scan the lines
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "size") {
// Example line: size 4 15032385536
fields := strings.Fields(line)
if len(fields) < 3 {
return 0, err
}
// Return the size as uint64
return strconv.ParseUint(fields[2], 10, 64)
}
}
return 0, fmt.Errorf("failed to parse size field")
}
// getOsPrettyName attempts to get the pretty OS name from /etc/os-release on Linux systems
func getOsPrettyName() (string, error) {
file, err := os.Open("/etc/os-release")

View File

@@ -15,6 +15,7 @@ import (
"time"
"github.com/coreos/go-systemd/v22/dbus"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/systemd"
)
@@ -49,7 +50,7 @@ func isSystemdAvailable() bool {
// newSystemdManager creates a new systemdManager.
func newSystemdManager() (*systemdManager, error) {
if skipSystemd, _ := GetEnv("SKIP_SYSTEMD"); skipSystemd == "true" {
if skipSystemd, _ := utils.GetEnv("SKIP_SYSTEMD"); skipSystemd == "true" {
return nil, nil
}
@@ -294,13 +295,13 @@ func unescapeServiceName(name string) string {
// otherwise defaults to "*service".
func getServicePatterns() []string {
patterns := []string{}
if envPatterns, _ := GetEnv("SERVICE_PATTERNS"); envPatterns != "" {
if envPatterns, _ := utils.GetEnv("SERVICE_PATTERNS"); envPatterns != "" {
for pattern := range strings.SplitSeq(envPatterns, ",") {
pattern = strings.TrimSpace(pattern)
if pattern == "" {
continue
}
if !strings.HasSuffix(pattern, ".service") {
if !strings.HasSuffix(pattern, "timer") && !strings.HasSuffix(pattern, ".service") {
pattern += ".service"
}
patterns = append(patterns, pattern)

View File

@@ -156,6 +156,13 @@ func TestGetServicePatterns(t *testing.T) {
expected: []string{"*nginx*.service", "*apache*.service"},
cleanupEnvVars: true,
},
{
name: "opt into timer monitoring",
prefixedEnv: "nginx.service,docker,apache.timer",
unprefixedEnv: "",
expected: []string{"nginx.service", "docker.service", "apache.timer"},
cleanupEnvVars: true,
},
}
for _, tt := range tests {

700
agent/test-data/amdgpu.ids Normal file
View File

@@ -0,0 +1,700 @@
# List of AMDGPU IDs
#
# Syntax:
# device_id, revision_id, product_name <-- single tab after comma
1.0.0
1114, C2, AMD Radeon 860M Graphics
1114, C3, AMD Radeon 840M Graphics
1114, D2, AMD Radeon 860M Graphics
1114, D3, AMD Radeon 840M Graphics
1309, 00, AMD Radeon R7 Graphics
130A, 00, AMD Radeon R6 Graphics
130B, 00, AMD Radeon R4 Graphics
130C, 00, AMD Radeon R7 Graphics
130D, 00, AMD Radeon R6 Graphics
130E, 00, AMD Radeon R5 Graphics
130F, 00, AMD Radeon R7 Graphics
130F, D4, AMD Radeon R7 Graphics
130F, D5, AMD Radeon R7 Graphics
130F, D6, AMD Radeon R7 Graphics
130F, D7, AMD Radeon R7 Graphics
1313, 00, AMD Radeon R7 Graphics
1313, D4, AMD Radeon R7 Graphics
1313, D5, AMD Radeon R7 Graphics
1313, D6, AMD Radeon R7 Graphics
1315, 00, AMD Radeon R5 Graphics
1315, D4, AMD Radeon R5 Graphics
1315, D5, AMD Radeon R5 Graphics
1315, D6, AMD Radeon R5 Graphics
1315, D7, AMD Radeon R5 Graphics
1316, 00, AMD Radeon R5 Graphics
1318, 00, AMD Radeon R5 Graphics
131B, 00, AMD Radeon R4 Graphics
131C, 00, AMD Radeon R7 Graphics
131D, 00, AMD Radeon R6 Graphics
1435, AE, AMD Custom GPU 0932
1506, C1, AMD Radeon 610M
1506, C2, AMD Radeon 610M
1506, C3, AMD Radeon 610M
1506, C4, AMD Radeon 610M
150E, C1, AMD Radeon 890M Graphics
150E, C4, AMD Radeon 890M Graphics
150E, C5, AMD Radeon 890M Graphics
150E, C6, AMD Radeon 890M Graphics
150E, D1, AMD Radeon 890M Graphics
150E, D2, AMD Radeon 890M Graphics
150E, D3, AMD Radeon 890M Graphics
1586, C1, Radeon 8060S Graphics
1586, C2, Radeon 8050S Graphics
1586, C4, Radeon 8050S Graphics
1586, D1, Radeon 8060S Graphics
1586, D2, Radeon 8050S Graphics
1586, D4, Radeon 8050S Graphics
1586, D5, Radeon 8040S Graphics
15BF, 00, AMD Radeon 780M Graphics
15BF, 01, AMD Radeon 760M Graphics
15BF, 02, AMD Radeon 780M Graphics
15BF, 03, AMD Radeon 760M Graphics
15BF, C1, AMD Radeon 780M Graphics
15BF, C2, AMD Radeon 780M Graphics
15BF, C3, AMD Radeon 760M Graphics
15BF, C4, AMD Radeon 780M Graphics
15BF, C5, AMD Radeon 740M Graphics
15BF, C6, AMD Radeon 780M Graphics
15BF, C7, AMD Radeon 780M Graphics
15BF, C8, AMD Radeon 760M Graphics
15BF, C9, AMD Radeon 780M Graphics
15BF, CA, AMD Radeon 740M Graphics
15BF, CB, AMD Radeon 760M Graphics
15BF, CC, AMD Radeon 740M Graphics
15BF, CD, AMD Radeon 760M Graphics
15BF, CF, AMD Radeon 780M Graphics
15BF, D0, AMD Radeon 780M Graphics
15BF, D1, AMD Radeon 780M Graphics
15BF, D2, AMD Radeon 780M Graphics
15BF, D3, AMD Radeon 780M Graphics
15BF, D4, AMD Radeon 780M Graphics
15BF, D5, AMD Radeon 760M Graphics
15BF, D6, AMD Radeon 760M Graphics
15BF, D7, AMD Radeon 780M Graphics
15BF, D8, AMD Radeon 740M Graphics
15BF, D9, AMD Radeon 780M Graphics
15BF, DA, AMD Radeon 780M Graphics
15BF, DB, AMD Radeon 760M Graphics
15BF, DC, AMD Radeon 760M Graphics
15BF, DD, AMD Radeon 780M Graphics
15BF, DE, AMD Radeon 740M Graphics
15BF, DF, AMD Radeon 760M Graphics
15BF, F0, AMD Radeon 760M Graphics
15C8, C1, AMD Radeon 740M Graphics
15C8, C2, AMD Radeon 740M Graphics
15C8, C3, AMD Radeon 740M Graphics
15C8, C4, AMD Radeon 740M Graphics
15C8, D1, AMD Radeon 740M Graphics
15C8, D2, AMD Radeon 740M Graphics
15C8, D3, AMD Radeon 740M Graphics
15C8, D4, AMD Radeon 740M Graphics
15D8, 00, AMD Radeon RX Vega 8 Graphics WS
15D8, 91, AMD Radeon Vega 3 Graphics
15D8, 91, AMD Ryzen Embedded R1606G with Radeon Vega Gfx
15D8, 92, AMD Radeon Vega 3 Graphics
15D8, 92, AMD Ryzen Embedded R1505G with Radeon Vega Gfx
15D8, 93, AMD Radeon Vega 1 Graphics
15D8, A1, AMD Radeon Vega 10 Graphics
15D8, A2, AMD Radeon Vega 8 Graphics
15D8, A3, AMD Radeon Vega 6 Graphics
15D8, A4, AMD Radeon Vega 3 Graphics
15D8, B1, AMD Radeon Vega 10 Graphics
15D8, B2, AMD Radeon Vega 8 Graphics
15D8, B3, AMD Radeon Vega 6 Graphics
15D8, B4, AMD Radeon Vega 3 Graphics
15D8, C1, AMD Radeon Vega 10 Graphics
15D8, C2, AMD Radeon Vega 8 Graphics
15D8, C3, AMD Radeon Vega 6 Graphics
15D8, C4, AMD Radeon Vega 3 Graphics
15D8, C5, AMD Radeon Vega 3 Graphics
15D8, C8, AMD Radeon Vega 11 Graphics
15D8, C9, AMD Radeon Vega 8 Graphics
15D8, CA, AMD Radeon Vega 11 Graphics
15D8, CB, AMD Radeon Vega 8 Graphics
15D8, CC, AMD Radeon Vega 3 Graphics
15D8, CE, AMD Radeon Vega 3 Graphics
15D8, CF, AMD Ryzen Embedded R1305G with Radeon Vega Gfx
15D8, D1, AMD Radeon Vega 10 Graphics
15D8, D2, AMD Radeon Vega 8 Graphics
15D8, D3, AMD Radeon Vega 6 Graphics
15D8, D4, AMD Radeon Vega 3 Graphics
15D8, D8, AMD Radeon Vega 11 Graphics
15D8, D9, AMD Radeon Vega 8 Graphics
15D8, DA, AMD Radeon Vega 11 Graphics
15D8, DB, AMD Radeon Vega 3 Graphics
15D8, DB, AMD Radeon Vega 8 Graphics
15D8, DC, AMD Radeon Vega 3 Graphics
15D8, DD, AMD Radeon Vega 3 Graphics
15D8, DE, AMD Radeon Vega 3 Graphics
15D8, DF, AMD Radeon Vega 3 Graphics
15D8, E3, AMD Radeon Vega 3 Graphics
15D8, E4, AMD Ryzen Embedded R1102G with Radeon Vega Gfx
15DD, 81, AMD Ryzen Embedded V1807B with Radeon Vega Gfx
15DD, 82, AMD Ryzen Embedded V1756B with Radeon Vega Gfx
15DD, 83, AMD Ryzen Embedded V1605B with Radeon Vega Gfx
15DD, 84, AMD Radeon Vega 6 Graphics
15DD, 85, AMD Ryzen Embedded V1202B with Radeon Vega Gfx
15DD, 86, AMD Radeon Vega 11 Graphics
15DD, 88, AMD Radeon Vega 8 Graphics
15DD, C1, AMD Radeon Vega 11 Graphics
15DD, C2, AMD Radeon Vega 8 Graphics
15DD, C3, AMD Radeon Vega 3 / 10 Graphics
15DD, C4, AMD Radeon Vega 8 Graphics
15DD, C5, AMD Radeon Vega 3 Graphics
15DD, C6, AMD Radeon Vega 11 Graphics
15DD, C8, AMD Radeon Vega 8 Graphics
15DD, C9, AMD Radeon Vega 11 Graphics
15DD, CA, AMD Radeon Vega 8 Graphics
15DD, CB, AMD Radeon Vega 3 Graphics
15DD, CC, AMD Radeon Vega 6 Graphics
15DD, CE, AMD Radeon Vega 3 Graphics
15DD, CF, AMD Radeon Vega 3 Graphics
15DD, D0, AMD Radeon Vega 10 Graphics
15DD, D1, AMD Radeon Vega 8 Graphics
15DD, D3, AMD Radeon Vega 11 Graphics
15DD, D5, AMD Radeon Vega 8 Graphics
15DD, D6, AMD Radeon Vega 11 Graphics
15DD, D7, AMD Radeon Vega 8 Graphics
15DD, D8, AMD Radeon Vega 3 Graphics
15DD, D9, AMD Radeon Vega 6 Graphics
15DD, E1, AMD Radeon Vega 3 Graphics
15DD, E2, AMD Radeon Vega 3 Graphics
163F, AE, AMD Custom GPU 0405
163F, E1, AMD Custom GPU 0405
164E, D8, AMD Radeon 610M
164E, D9, AMD Radeon 610M
164E, DA, AMD Radeon 610M
164E, DB, AMD Radeon 610M
164E, DC, AMD Radeon 610M
1681, 06, AMD Radeon 680M
1681, 07, AMD Radeon 660M
1681, 0A, AMD Radeon 680M
1681, 0B, AMD Radeon 660M
1681, C7, AMD Radeon 680M
1681, C8, AMD Radeon 680M
1681, C9, AMD Radeon 660M
1900, 01, AMD Radeon 780M Graphics
1900, 02, AMD Radeon 760M Graphics
1900, 03, AMD Radeon 780M Graphics
1900, 04, AMD Radeon 760M Graphics
1900, 05, AMD Radeon 780M Graphics
1900, 06, AMD Radeon 780M Graphics
1900, 07, AMD Radeon 760M Graphics
1900, B0, AMD Radeon 780M Graphics
1900, B1, AMD Radeon 780M Graphics
1900, B2, AMD Radeon 780M Graphics
1900, B3, AMD Radeon 780M Graphics
1900, B4, AMD Radeon 780M Graphics
1900, B5, AMD Radeon 780M Graphics
1900, B6, AMD Radeon 780M Graphics
1900, B7, AMD Radeon 760M Graphics
1900, B8, AMD Radeon 760M Graphics
1900, B9, AMD Radeon 780M Graphics
1900, BA, AMD Radeon 780M Graphics
1900, BB, AMD Radeon 780M Graphics
1900, C0, AMD Radeon 780M Graphics
1900, C1, AMD Radeon 760M Graphics
1900, C2, AMD Radeon 780M Graphics
1900, C3, AMD Radeon 760M Graphics
1900, C4, AMD Radeon 780M Graphics
1900, C5, AMD Radeon 780M Graphics
1900, C6, AMD Radeon 760M Graphics
1900, C7, AMD Radeon 780M Graphics
1900, C8, AMD Radeon 760M Graphics
1900, C9, AMD Radeon 780M Graphics
1900, CA, AMD Radeon 760M Graphics
1900, CB, AMD Radeon 780M Graphics
1900, CC, AMD Radeon 780M Graphics
1900, CD, AMD Radeon 760M Graphics
1900, CE, AMD Radeon 780M Graphics
1900, CF, AMD Radeon 760M Graphics
1900, D0, AMD Radeon 780M Graphics
1900, D1, AMD Radeon 760M Graphics
1900, D2, AMD Radeon 780M Graphics
1900, D3, AMD Radeon 760M Graphics
1900, D4, AMD Radeon 780M Graphics
1900, D5, AMD Radeon 780M Graphics
1900, D6, AMD Radeon 760M Graphics
1900, D7, AMD Radeon 780M Graphics
1900, D8, AMD Radeon 760M Graphics
1900, D9, AMD Radeon 780M Graphics
1900, DA, AMD Radeon 760M Graphics
1900, DB, AMD Radeon 780M Graphics
1900, DC, AMD Radeon 780M Graphics
1900, DD, AMD Radeon 760M Graphics
1900, DE, AMD Radeon 780M Graphics
1900, DF, AMD Radeon 760M Graphics
1900, F0, AMD Radeon 780M Graphics
1900, F1, AMD Radeon 780M Graphics
1900, F2, AMD Radeon 780M Graphics
1901, C1, AMD Radeon 740M Graphics
1901, C2, AMD Radeon 740M Graphics
1901, C3, AMD Radeon 740M Graphics
1901, C6, AMD Radeon 740M Graphics
1901, C7, AMD Radeon 740M Graphics
1901, C8, AMD Radeon 740M Graphics
1901, C9, AMD Radeon 740M Graphics
1901, CA, AMD Radeon 740M Graphics
1901, D1, AMD Radeon 740M Graphics
1901, D2, AMD Radeon 740M Graphics
1901, D3, AMD Radeon 740M Graphics
1901, D4, AMD Radeon 740M Graphics
1901, D5, AMD Radeon 740M Graphics
1901, D6, AMD Radeon 740M Graphics
1901, D7, AMD Radeon 740M Graphics
1901, D8, AMD Radeon 740M Graphics
6600, 00, AMD Radeon HD 8600 / 8700M
6600, 81, AMD Radeon R7 M370
6601, 00, AMD Radeon HD 8500M / 8700M
6604, 00, AMD Radeon R7 M265 Series
6604, 81, AMD Radeon R7 M350
6605, 00, AMD Radeon R7 M260 Series
6605, 81, AMD Radeon R7 M340
6606, 00, AMD Radeon HD 8790M
6607, 00, AMD Radeon R5 M240
6608, 00, AMD FirePro W2100
6610, 00, AMD Radeon R7 200 Series
6610, 81, AMD Radeon R7 350
6610, 83, AMD Radeon R5 340
6610, 87, AMD Radeon R7 200 Series
6611, 00, AMD Radeon R7 200 Series
6611, 87, AMD Radeon R7 200 Series
6613, 00, AMD Radeon R7 200 Series
6617, 00, AMD Radeon R7 240 Series
6617, 87, AMD Radeon R7 200 Series
6617, C7, AMD Radeon R7 240 Series
6640, 00, AMD Radeon HD 8950
6640, 80, AMD Radeon R9 M380
6646, 00, AMD Radeon R9 M280X
6646, 80, AMD Radeon R9 M385
6646, 80, AMD Radeon R9 M470X
6647, 00, AMD Radeon R9 M200X Series
6647, 80, AMD Radeon R9 M380
6649, 00, AMD FirePro W5100
6658, 00, AMD Radeon R7 200 Series
665C, 00, AMD Radeon HD 7700 Series
665D, 00, AMD Radeon R7 200 Series
665F, 81, AMD Radeon R7 360 Series
6660, 00, AMD Radeon HD 8600M Series
6660, 81, AMD Radeon R5 M335
6660, 83, AMD Radeon R5 M330
6663, 00, AMD Radeon HD 8500M Series
6663, 83, AMD Radeon R5 M320
6664, 00, AMD Radeon R5 M200 Series
6665, 00, AMD Radeon R5 M230 Series
6665, 83, AMD Radeon R5 M320
6665, C3, AMD Radeon R5 M435
6666, 00, AMD Radeon R5 M200 Series
6667, 00, AMD Radeon R5 M200 Series
666F, 00, AMD Radeon HD 8500M
66A1, 02, AMD Instinct MI60 / MI50
66A1, 06, AMD Radeon Pro VII
66AF, C1, AMD Radeon VII
6780, 00, AMD FirePro W9000
6784, 00, ATI FirePro V (FireGL V) Graphics Adapter
6788, 00, ATI FirePro V (FireGL V) Graphics Adapter
678A, 00, AMD FirePro W8000
6798, 00, AMD Radeon R9 200 / HD 7900 Series
6799, 00, AMD Radeon HD 7900 Series
679A, 00, AMD Radeon HD 7900 Series
679B, 00, AMD Radeon HD 7900 Series
679E, 00, AMD Radeon HD 7800 Series
67A0, 00, AMD Radeon FirePro W9100
67A1, 00, AMD Radeon FirePro W8100
67B0, 00, AMD Radeon R9 200 Series
67B0, 80, AMD Radeon R9 390 Series
67B1, 00, AMD Radeon R9 200 Series
67B1, 80, AMD Radeon R9 390 Series
67B9, 00, AMD Radeon R9 200 Series
67C0, 00, AMD Radeon Pro WX 7100 Graphics
67C0, 80, AMD Radeon E9550
67C2, 01, AMD Radeon Pro V7350x2
67C2, 02, AMD Radeon Pro V7300X
67C4, 00, AMD Radeon Pro WX 7100 Graphics
67C4, 80, AMD Radeon E9560 / E9565 Graphics
67C7, 00, AMD Radeon Pro WX 5100 Graphics
67C7, 80, AMD Radeon E9390 Graphics
67D0, 01, AMD Radeon Pro V7350x2
67D0, 02, AMD Radeon Pro V7300X
67DF, C0, AMD Radeon Pro 580X
67DF, C1, AMD Radeon RX 580 Series
67DF, C2, AMD Radeon RX 570 Series
67DF, C3, AMD Radeon RX 580 Series
67DF, C4, AMD Radeon RX 480 Graphics
67DF, C5, AMD Radeon RX 470 Graphics
67DF, C6, AMD Radeon RX 570 Series
67DF, C7, AMD Radeon RX 480 Graphics
67DF, CF, AMD Radeon RX 470 Graphics
67DF, D7, AMD Radeon RX 470 Graphics
67DF, E0, AMD Radeon RX 470 Series
67DF, E1, AMD Radeon RX 590 Series
67DF, E3, AMD Radeon RX Series
67DF, E7, AMD Radeon RX 580 Series
67DF, EB, AMD Radeon Pro 580X
67DF, EF, AMD Radeon RX 570 Series
67DF, F7, AMD Radeon RX P30PH
67DF, FF, AMD Radeon RX 470 Series
67E0, 00, AMD Radeon Pro WX Series
67E3, 00, AMD Radeon Pro WX 4100
67E8, 00, AMD Radeon Pro WX Series
67E8, 01, AMD Radeon Pro WX Series
67E8, 80, AMD Radeon E9260 Graphics
67EB, 00, AMD Radeon Pro V5300X
67EF, C0, AMD Radeon RX Graphics
67EF, C1, AMD Radeon RX 460 Graphics
67EF, C2, AMD Radeon Pro Series
67EF, C3, AMD Radeon RX Series
67EF, C5, AMD Radeon RX 460 Graphics
67EF, C7, AMD Radeon RX Graphics
67EF, CF, AMD Radeon RX 460 Graphics
67EF, E0, AMD Radeon RX 560 Series
67EF, E1, AMD Radeon RX Series
67EF, E2, AMD Radeon RX 560X
67EF, E3, AMD Radeon RX Series
67EF, E5, AMD Radeon RX 560 Series
67EF, E7, AMD Radeon RX 560 Series
67EF, EF, AMD Radeon 550 Series
67EF, FF, AMD Radeon RX 460 Graphics
67FF, C0, AMD Radeon Pro 465
67FF, C1, AMD Radeon RX 560 Series
67FF, CF, AMD Radeon RX 560 Series
67FF, EF, AMD Radeon RX 560 Series
67FF, FF, AMD Radeon RX 550 Series
6800, 00, AMD Radeon HD 7970M
6801, 00, AMD Radeon HD 8970M
6806, 00, AMD Radeon R9 M290X
6808, 00, AMD FirePro W7000
6808, 00, ATI FirePro V (FireGL V) Graphics Adapter
6809, 00, ATI FirePro W5000
6810, 00, AMD Radeon R9 200 Series
6810, 81, AMD Radeon R9 370 Series
6811, 00, AMD Radeon R9 200 Series
6811, 81, AMD Radeon R7 370 Series
6818, 00, AMD Radeon HD 7800 Series
6819, 00, AMD Radeon HD 7800 Series
6820, 00, AMD Radeon R9 M275X
6820, 81, AMD Radeon R9 M375
6820, 83, AMD Radeon R9 M375X
6821, 00, AMD Radeon R9 M200X Series
6821, 83, AMD Radeon R9 M370X
6821, 87, AMD Radeon R7 M380
6822, 00, AMD Radeon E8860
6823, 00, AMD Radeon R9 M200X Series
6825, 00, AMD Radeon HD 7800M Series
6826, 00, AMD Radeon HD 7700M Series
6827, 00, AMD Radeon HD 7800M Series
6828, 00, AMD FirePro W600
682B, 00, AMD Radeon HD 8800M Series
682B, 87, AMD Radeon R9 M360
682C, 00, AMD FirePro W4100
682D, 00, AMD Radeon HD 7700M Series
682F, 00, AMD Radeon HD 7700M Series
6830, 00, AMD Radeon 7800M Series
6831, 00, AMD Radeon 7700M Series
6835, 00, AMD Radeon R7 Series / HD 9000 Series
6837, 00, AMD Radeon HD 7700 Series
683D, 00, AMD Radeon HD 7700 Series
683F, 00, AMD Radeon HD 7700 Series
684C, 00, ATI FirePro V (FireGL V) Graphics Adapter
6860, 00, AMD Radeon Instinct MI25
6860, 01, AMD Radeon Instinct MI25
6860, 02, AMD Radeon Instinct MI25
6860, 03, AMD Radeon Pro V340
6860, 04, AMD Radeon Instinct MI25x2
6860, 07, AMD Radeon Pro V320
6861, 00, AMD Radeon Pro WX 9100
6862, 00, AMD Radeon Pro SSG
6863, 00, AMD Radeon Vega Frontier Edition
6864, 03, AMD Radeon Pro V340
6864, 04, AMD Radeon Instinct MI25x2
6864, 05, AMD Radeon Pro V340
6868, 00, AMD Radeon Pro WX 8200
686C, 00, AMD Radeon Instinct MI25 MxGPU
686C, 01, AMD Radeon Instinct MI25 MxGPU
686C, 02, AMD Radeon Instinct MI25 MxGPU
686C, 03, AMD Radeon Pro V340 MxGPU
686C, 04, AMD Radeon Instinct MI25x2 MxGPU
686C, 05, AMD Radeon Pro V340L MxGPU
686C, 06, AMD Radeon Instinct MI25 MxGPU
687F, 01, AMD Radeon RX Vega
687F, C0, AMD Radeon RX Vega
687F, C1, AMD Radeon RX Vega
687F, C3, AMD Radeon RX Vega
687F, C7, AMD Radeon RX Vega
6900, 00, AMD Radeon R7 M260
6900, 81, AMD Radeon R7 M360
6900, 83, AMD Radeon R7 M340
6900, C1, AMD Radeon R5 M465 Series
6900, C3, AMD Radeon R5 M445 Series
6900, D1, AMD Radeon 530 Series
6900, D3, AMD Radeon 530 Series
6901, 00, AMD Radeon R5 M255
6902, 00, AMD Radeon Series
6907, 00, AMD Radeon R5 M255
6907, 87, AMD Radeon R5 M315
6920, 00, AMD Radeon R9 M395X
6920, 01, AMD Radeon R9 M390X
6921, 00, AMD Radeon R9 M390X
6929, 00, AMD FirePro S7150
6929, 01, AMD FirePro S7100X
692B, 00, AMD FirePro W7100
6938, 00, AMD Radeon R9 200 Series
6938, F0, AMD Radeon R9 200 Series
6938, F1, AMD Radeon R9 380 Series
6939, 00, AMD Radeon R9 200 Series
6939, F0, AMD Radeon R9 200 Series
6939, F1, AMD Radeon R9 380 Series
694C, C0, AMD Radeon RX Vega M GH Graphics
694E, C0, AMD Radeon RX Vega M GL Graphics
6980, 00, AMD Radeon Pro WX 3100
6981, 00, AMD Radeon Pro WX 3200 Series
6981, 01, AMD Radeon Pro WX 3200 Series
6981, 10, AMD Radeon Pro WX 3200 Series
6985, 00, AMD Radeon Pro WX 3100
6986, 00, AMD Radeon Pro WX 2100
6987, 80, AMD Embedded Radeon E9171
6987, C0, AMD Radeon 550X Series
6987, C1, AMD Radeon RX 640
6987, C3, AMD Radeon 540X Series
6987, C7, AMD Radeon 540
6995, 00, AMD Radeon Pro WX 2100
6997, 00, AMD Radeon Pro WX 2100
699F, 81, AMD Embedded Radeon E9170 Series
699F, C0, AMD Radeon 500 Series
699F, C1, AMD Radeon 540 Series
699F, C3, AMD Radeon 500 Series
699F, C7, AMD Radeon RX 550 / 550 Series
699F, C9, AMD Radeon 540
6FDF, E7, AMD Radeon RX 590 GME
6FDF, EF, AMD Radeon RX 580 2048SP
7300, C1, AMD FirePro S9300 x2
7300, C8, AMD Radeon R9 Fury Series
7300, C9, AMD Radeon Pro Duo
7300, CA, AMD Radeon R9 Fury Series
7300, CB, AMD Radeon R9 Fury Series
7312, 00, AMD Radeon Pro W5700
731E, C6, AMD Radeon RX 5700XTB
731E, C7, AMD Radeon RX 5700B
731F, C0, AMD Radeon RX 5700 XT 50th Anniversary
731F, C1, AMD Radeon RX 5700 XT
731F, C2, AMD Radeon RX 5600M
731F, C3, AMD Radeon RX 5700M
731F, C4, AMD Radeon RX 5700
731F, C5, AMD Radeon RX 5700 XT
731F, CA, AMD Radeon RX 5600 XT
731F, CB, AMD Radeon RX 5600 OEM
7340, C1, AMD Radeon RX 5500M
7340, C3, AMD Radeon RX 5300M
7340, C5, AMD Radeon RX 5500 XT
7340, C7, AMD Radeon RX 5500
7340, C9, AMD Radeon RX 5500XTB
7340, CF, AMD Radeon RX 5300
7341, 00, AMD Radeon Pro W5500
7347, 00, AMD Radeon Pro W5500M
7360, 41, AMD Radeon Pro 5600M
7360, C3, AMD Radeon Pro V520
7362, C1, AMD Radeon Pro V540
7362, C3, AMD Radeon Pro V520
738C, 01, AMD Instinct MI100
73A1, 00, AMD Radeon Pro V620
73A3, 00, AMD Radeon Pro W6800
73A5, C0, AMD Radeon RX 6950 XT
73AE, 00, AMD Radeon Pro V620 MxGPU
73AF, C0, AMD Radeon RX 6900 XT
73BF, C0, AMD Radeon RX 6900 XT
73BF, C1, AMD Radeon RX 6800 XT
73BF, C3, AMD Radeon RX 6800
73DF, C0, AMD Radeon RX 6750 XT
73DF, C1, AMD Radeon RX 6700 XT
73DF, C2, AMD Radeon RX 6800M
73DF, C3, AMD Radeon RX 6800M
73DF, C5, AMD Radeon RX 6700 XT
73DF, CF, AMD Radeon RX 6700M
73DF, D5, AMD Radeon RX 6750 GRE 12GB
73DF, D7, AMD TDC-235
73DF, DF, AMD Radeon RX 6700
73DF, E5, AMD Radeon RX 6750 GRE 12GB
73DF, FF, AMD Radeon RX 6700
73E0, 00, AMD Radeon RX 6600M
73E1, 00, AMD Radeon Pro W6600M
73E3, 00, AMD Radeon Pro W6600
73EF, C0, AMD Radeon RX 6800S
73EF, C1, AMD Radeon RX 6650 XT
73EF, C2, AMD Radeon RX 6700S
73EF, C3, AMD Radeon RX 6650M
73EF, C4, AMD Radeon RX 6650M XT
73FF, C1, AMD Radeon RX 6600 XT
73FF, C3, AMD Radeon RX 6600M
73FF, C7, AMD Radeon RX 6600
73FF, CB, AMD Radeon RX 6600S
73FF, CF, AMD Radeon RX 6600 LE
73FF, DF, AMD Radeon RX 6750 GRE 10GB
7408, 00, AMD Instinct MI250X
740C, 01, AMD Instinct MI250X / MI250
740F, 02, AMD Instinct MI210
7421, 00, AMD Radeon Pro W6500M
7422, 00, AMD Radeon Pro W6400
7423, 00, AMD Radeon Pro W6300M
7423, 01, AMD Radeon Pro W6300
7424, 00, AMD Radeon RX 6300
743F, C1, AMD Radeon RX 6500 XT
743F, C3, AMD Radeon RX 6500
743F, C3, AMD Radeon RX 6500M
743F, C7, AMD Radeon RX 6400
743F, C8, AMD Radeon RX 6500M
743F, CC, AMD Radeon 6550S
743F, CE, AMD Radeon RX 6450M
743F, CF, AMD Radeon RX 6300M
743F, D3, AMD Radeon RX 6550M
743F, D7, AMD Radeon RX 6400
7448, 00, AMD Radeon Pro W7900
7449, 00, AMD Radeon Pro W7800 48GB
744A, 00, AMD Radeon Pro W7900 Dual Slot
744B, 00, AMD Radeon Pro W7900D
744C, C8, AMD Radeon RX 7900 XTX
744C, CC, AMD Radeon RX 7900 XT
744C, CE, AMD Radeon RX 7900 GRE
744C, CF, AMD Radeon RX 7900M
745E, CC, AMD Radeon Pro W7800
7460, 00, AMD Radeon Pro V710
7461, 00, AMD Radeon Pro V710 MxGPU
7470, 00, AMD Radeon Pro W7700
747E, C8, AMD Radeon RX 7800 XT
747E, D8, AMD Radeon RX 7800M
747E, DB, AMD Radeon RX 7700
747E, FF, AMD Radeon RX 7700 XT
7480, 00, AMD Radeon Pro W7600
7480, C0, AMD Radeon RX 7600 XT
7480, C1, AMD Radeon RX 7700S
7480, C2, AMD Radeon RX 7650 GRE
7480, C3, AMD Radeon RX 7600S
7480, C7, AMD Radeon RX 7600M XT
7480, CF, AMD Radeon RX 7600
7481, C7, AMD Steam Machine
7483, CF, AMD Radeon RX 7600M
7489, 00, AMD Radeon Pro W7500
7499, 00, AMD Radeon Pro W7400
7499, C0, AMD Radeon RX 7400
7499, C1, AMD Radeon RX 7300
74A0, 00, AMD Instinct MI300A
74A1, 00, AMD Instinct MI300X
74A2, 00, AMD Instinct MI308X
74A5, 00, AMD Instinct MI325X
74A8, 00, AMD Instinct MI308X HF
74A9, 00, AMD Instinct MI300X HF
74B5, 00, AMD Instinct MI300X VF
74B6, 00, AMD Instinct MI308X
74BD, 00, AMD Instinct MI300X HF
7550, C0, AMD Radeon RX 9070 XT
7550, C2, AMD Radeon RX 9070 GRE
7550, C3, AMD Radeon RX 9070
7551, C0, AMD Radeon AI PRO R9700
7590, C0, AMD Radeon RX 9060 XT
7590, C7, AMD Radeon RX 9060
75A0, C0, AMD Instinct MI350X
75A3, C0, AMD Instinct MI355X
75B0, C0, AMD Instinct MI350X VF
75B3, C0, AMD Instinct MI355X VF
9830, 00, AMD Radeon HD 8400 / R3 Series
9831, 00, AMD Radeon HD 8400E
9832, 00, AMD Radeon HD 8330
9833, 00, AMD Radeon HD 8330E
9834, 00, AMD Radeon HD 8210
9835, 00, AMD Radeon HD 8210E
9836, 00, AMD Radeon HD 8200 / R3 Series
9837, 00, AMD Radeon HD 8280E
9838, 00, AMD Radeon HD 8200 / R3 series
9839, 00, AMD Radeon HD 8180
983D, 00, AMD Radeon HD 8250
9850, 00, AMD Radeon R3 Graphics
9850, 03, AMD Radeon R3 Graphics
9850, 40, AMD Radeon R2 Graphics
9850, 45, AMD Radeon R3 Graphics
9851, 00, AMD Radeon R4 Graphics
9851, 01, AMD Radeon R5E Graphics
9851, 05, AMD Radeon R5 Graphics
9851, 06, AMD Radeon R5E Graphics
9851, 40, AMD Radeon R4 Graphics
9851, 45, AMD Radeon R5 Graphics
9852, 00, AMD Radeon R2 Graphics
9852, 40, AMD Radeon E1 Graphics
9853, 00, AMD Radeon R2 Graphics
9853, 01, AMD Radeon R4E Graphics
9853, 03, AMD Radeon R2 Graphics
9853, 05, AMD Radeon R1E Graphics
9853, 06, AMD Radeon R1E Graphics
9853, 07, AMD Radeon R1E Graphics
9853, 08, AMD Radeon R1E Graphics
9853, 40, AMD Radeon R2 Graphics
9854, 00, AMD Radeon R3 Graphics
9854, 01, AMD Radeon R3E Graphics
9854, 02, AMD Radeon R3 Graphics
9854, 05, AMD Radeon R2 Graphics
9854, 06, AMD Radeon R4 Graphics
9854, 07, AMD Radeon R3 Graphics
9855, 02, AMD Radeon R6 Graphics
9855, 05, AMD Radeon R4 Graphics
9856, 00, AMD Radeon R2 Graphics
9856, 01, AMD Radeon R2E Graphics
9856, 02, AMD Radeon R2 Graphics
9856, 05, AMD Radeon R1E Graphics
9856, 06, AMD Radeon R2 Graphics
9856, 07, AMD Radeon R1E Graphics
9856, 08, AMD Radeon R1E Graphics
9856, 13, AMD Radeon R1E Graphics
9874, 81, AMD Radeon R6 Graphics
9874, 84, AMD Radeon R7 Graphics
9874, 85, AMD Radeon R6 Graphics
9874, 87, AMD Radeon R5 Graphics
9874, 88, AMD Radeon R7E Graphics
9874, 89, AMD Radeon R6E Graphics
9874, C4, AMD Radeon R7 Graphics
9874, C5, AMD Radeon R6 Graphics
9874, C6, AMD Radeon R6 Graphics
9874, C7, AMD Radeon R5 Graphics
9874, C8, AMD Radeon R7 Graphics
9874, C9, AMD Radeon R7 Graphics
9874, CA, AMD Radeon R5 Graphics
9874, CB, AMD Radeon R5 Graphics
9874, CC, AMD Radeon R7 Graphics
9874, CD, AMD Radeon R7 Graphics
9874, CE, AMD Radeon R5 Graphics
9874, E1, AMD Radeon R7 Graphics
9874, E2, AMD Radeon R7 Graphics
9874, E3, AMD Radeon R7 Graphics
9874, E4, AMD Radeon R7 Graphics
9874, E5, AMD Radeon R5 Graphics
9874, E6, AMD Radeon R5 Graphics
98E4, 80, AMD Radeon R5E Graphics
98E4, 81, AMD Radeon R4E Graphics
98E4, 83, AMD Radeon R2E Graphics
98E4, 84, AMD Radeon R2E Graphics
98E4, 86, AMD Radeon R1E Graphics
98E4, C0, AMD Radeon R4 Graphics
98E4, C1, AMD Radeon R5 Graphics
98E4, C2, AMD Radeon R4 Graphics
98E4, C4, AMD Radeon R5 Graphics
98E4, C6, AMD Radeon R5 Graphics
98E4, C8, AMD Radeon R4 Graphics
98E4, C9, AMD Radeon R4 Graphics
98E4, CA, AMD Radeon R5 Graphics
98E4, D0, AMD Radeon R2 Graphics
98E4, D1, AMD Radeon R2 Graphics
98E4, D2, AMD Radeon R2 Graphics
98E4, D4, AMD Radeon R2 Graphics
98E4, D9, AMD Radeon R5 Graphics
98E4, DA, AMD Radeon R5 Graphics
98E4, DB, AMD Radeon R3 Graphics
98E4, E1, AMD Radeon R3 Graphics
98E4, E2, AMD Radeon R3 Graphics
98E4, E9, AMD Radeon R4 Graphics
98E4, EA, AMD Radeon R4 Graphics
98E4, EB, AMD Radeon R3 Graphics
98E4, EB, AMD Radeon R4 Graphics

View File

@@ -0,0 +1,34 @@
[
{
"device_name": "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
"gpu_clock": "1485MHz",
"mem_clock": "6001MHz",
"temp": "48C",
"fan_speed": null,
"power_draw": "13W",
"gpu_util": "5%",
"encode": "0%",
"decode": "0%",
"mem_util": "8%",
"mem_total": "4294967296",
"mem_used": "349372416",
"mem_free": "3945594880",
"processes" : []
},
{
"device_name": "AMD Radeon 680M",
"gpu_clock": "2200MHz",
"mem_clock": "2400MHz",
"temp": "48C",
"fan_speed": "CPU Fan",
"power_draw": "9W",
"gpu_util": "12%",
"encode": null,
"decode": "0%",
"mem_util": "7%",
"mem_total": "16929173504",
"mem_used": "1213784064",
"mem_free": "15715389440",
"processes" : []
}
]

View File

@@ -63,9 +63,9 @@ func detectRestarter() restarter {
if path, err := exec.LookPath("rc-service"); err == nil {
return &openRCRestarter{cmd: path}
}
if path, err := exec.LookPath("procd"); err == nil {
return &openWRTRestarter{cmd: path}
}
if path, err := exec.LookPath("procd"); err == nil {
return &openWRTRestarter{cmd: path}
}
if path, err := exec.LookPath("service"); err == nil {
if runtime.GOOS == "freebsd" {
return &freeBSDRestarter{cmd: path}
@@ -79,7 +79,7 @@ func detectRestarter() restarter {
func Update(useMirror bool) error {
exePath, _ := os.Executable()
dataDir, err := getDataDir()
dataDir, err := GetDataDir()
if err != nil {
dataDir = os.TempDir()
}
@@ -125,4 +125,3 @@ func Update(useMirror bool) error {
return nil
}

View File

@@ -1,15 +0,0 @@
package agent
import "math"
func bytesToMegabytes(b float64) float64 {
return twoDecimals(b / 1048576)
}
func bytesToGigabytes(b uint64) float64 {
return twoDecimals(float64(b) / 1073741824)
}
func twoDecimals(value float64) float64 {
return math.Round(value*100) / 100
}

88
agent/utils/utils.go Normal file
View File

@@ -0,0 +1,88 @@
package utils
import (
"io"
"math"
"os"
"strconv"
"strings"
)
// GetEnv retrieves an environment variable with a "BESZEL_AGENT_" prefix, or falls back to the unprefixed key.
func GetEnv(key string) (value string, exists bool) {
if value, exists = os.LookupEnv("BESZEL_AGENT_" + key); exists {
return value, exists
}
return os.LookupEnv(key)
}
// BytesToMegabytes converts bytes to megabytes and rounds to two decimal places.
func BytesToMegabytes(b float64) float64 {
return TwoDecimals(b / 1048576)
}
// BytesToGigabytes converts bytes to gigabytes and rounds to two decimal places.
func BytesToGigabytes(b uint64) float64 {
return TwoDecimals(float64(b) / 1073741824)
}
// TwoDecimals rounds a float64 value to two decimal places.
func TwoDecimals(value float64) float64 {
return math.Round(value*100) / 100
}
// func RoundFloat(val float64, precision uint) float64 {
// ratio := math.Pow(10, float64(precision))
// return math.Round(val*ratio) / ratio
// }
// ReadStringFile returns trimmed file contents or empty string on error.
func ReadStringFile(path string) string {
content, _ := ReadStringFileOK(path)
return content
}
// ReadStringFileOK returns trimmed file contents and read success.
func ReadStringFileOK(path string) (string, bool) {
b, err := os.ReadFile(path)
if err != nil {
return "", false
}
return strings.TrimSpace(string(b)), true
}
// ReadStringFileLimited reads a file into a string with a maximum size (in bytes) to avoid
// allocating large buffers and potential panics with pseudo-files when the size is misreported.
func ReadStringFileLimited(path string, maxSize int) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
buf := make([]byte, maxSize)
n, err := f.Read(buf)
if err != nil && err != io.EOF {
return "", err
}
return strings.TrimSpace(string(buf[:n])), nil
}
// FileExists reports whether the given path exists.
func FileExists(path string) bool {
_, err := os.Stat(path)
return err == nil
}
// ReadUintFile parses a decimal uint64 value from a file.
func ReadUintFile(path string) (uint64, bool) {
raw, ok := ReadStringFileOK(path)
if !ok {
return 0, false
}
parsed, err := strconv.ParseUint(raw, 10, 64)
if err != nil {
return 0, false
}
return parsed, true
}

165
agent/utils/utils_test.go Normal file
View File

@@ -0,0 +1,165 @@
package utils
import (
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
)
func TestTwoDecimals(t *testing.T) {
tests := []struct {
name string
input float64
expected float64
}{
{"round down", 1.234, 1.23},
{"round half up", 1.235, 1.24}, // math.Round rounds half up
{"no rounding needed", 1.23, 1.23},
{"negative number", -1.235, -1.24}, // math.Round rounds half up (more negative)
{"zero", 0.0, 0.0},
{"large number", 123.456, 123.46}, // rounds 5 up
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := TwoDecimals(tt.input)
assert.Equal(t, tt.expected, result)
})
}
}
func TestBytesToMegabytes(t *testing.T) {
tests := []struct {
name string
input float64
expected float64
}{
{"1 MB", 1048576, 1.0},
{"512 KB", 524288, 0.5},
{"zero", 0, 0},
{"large value", 1073741824, 1024}, // 1 GB = 1024 MB
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := BytesToMegabytes(tt.input)
assert.Equal(t, tt.expected, result)
})
}
}
func TestBytesToGigabytes(t *testing.T) {
tests := []struct {
name string
input uint64
expected float64
}{
{"1 GB", 1073741824, 1.0},
{"512 MB", 536870912, 0.5},
{"0 GB", 0, 0},
{"2 GB", 2147483648, 2.0},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := BytesToGigabytes(tt.input)
assert.Equal(t, tt.expected, result)
})
}
}
func TestFileFunctions(t *testing.T) {
tmpDir := t.TempDir()
testFilePath := filepath.Join(tmpDir, "test.txt")
testContent := "hello world"
// Test FileExists (false)
assert.False(t, FileExists(testFilePath))
// Test ReadStringFileOK (false)
content, ok := ReadStringFileOK(testFilePath)
assert.False(t, ok)
assert.Empty(t, content)
// Test ReadStringFile (empty)
assert.Empty(t, ReadStringFile(testFilePath))
// Write file
err := os.WriteFile(testFilePath, []byte(testContent+"\n "), 0644)
assert.NoError(t, err)
// Test FileExists (true)
assert.True(t, FileExists(testFilePath))
// Test ReadStringFileOK (true)
content, ok = ReadStringFileOK(testFilePath)
assert.True(t, ok)
assert.Equal(t, testContent, content)
// Test ReadStringFile (content)
assert.Equal(t, testContent, ReadStringFile(testFilePath))
}
func TestReadUintFile(t *testing.T) {
tmpDir := t.TempDir()
t.Run("valid uint", func(t *testing.T) {
path := filepath.Join(tmpDir, "uint.txt")
os.WriteFile(path, []byte(" 12345\n"), 0644)
val, ok := ReadUintFile(path)
assert.True(t, ok)
assert.Equal(t, uint64(12345), val)
})
t.Run("invalid uint", func(t *testing.T) {
path := filepath.Join(tmpDir, "invalid.txt")
os.WriteFile(path, []byte("abc"), 0644)
val, ok := ReadUintFile(path)
assert.False(t, ok)
assert.Equal(t, uint64(0), val)
})
t.Run("missing file", func(t *testing.T) {
path := filepath.Join(tmpDir, "missing.txt")
val, ok := ReadUintFile(path)
assert.False(t, ok)
assert.Equal(t, uint64(0), val)
})
}
func TestGetEnv(t *testing.T) {
key := "TEST_VAR"
prefixedKey := "BESZEL_AGENT_" + key
t.Run("prefixed variable exists", func(t *testing.T) {
os.Setenv(prefixedKey, "prefixed_val")
os.Setenv(key, "unprefixed_val")
defer os.Unsetenv(prefixedKey)
defer os.Unsetenv(key)
val, exists := GetEnv(key)
assert.True(t, exists)
assert.Equal(t, "prefixed_val", val)
})
t.Run("only unprefixed variable exists", func(t *testing.T) {
os.Unsetenv(prefixedKey)
os.Setenv(key, "unprefixed_val")
defer os.Unsetenv(key)
val, exists := GetEnv(key)
assert.True(t, exists)
assert.Equal(t, "unprefixed_val", val)
})
t.Run("neither variable exists", func(t *testing.T) {
os.Unsetenv(prefixedKey)
os.Unsetenv(key)
val, exists := GetEnv(key)
assert.False(t, exists)
assert.Empty(t, val)
})
}

11
agent/zfs/zfs_freebsd.go Normal file
View File

@@ -0,0 +1,11 @@
//go:build freebsd
package zfs
import (
"golang.org/x/sys/unix"
)
func ARCSize() (uint64, error) {
return unix.SysctlUint64("kstat.zfs.misc.arcstats.size")
}

34
agent/zfs/zfs_linux.go Normal file
View File

@@ -0,0 +1,34 @@
//go:build linux
// Package zfs provides functions to read ZFS statistics.
package zfs
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
)
func ARCSize() (uint64, error) {
file, err := os.Open("/proc/spl/kstat/zfs/arcstats")
if err != nil {
return 0, err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "size") {
fields := strings.Fields(line)
if len(fields) < 3 {
return 0, fmt.Errorf("unexpected arcstats size format: %s", line)
}
return strconv.ParseUint(fields[2], 10, 64)
}
}
return 0, fmt.Errorf("size field not found in arcstats")
}

View File

@@ -0,0 +1,9 @@
//go:build !linux && !freebsd
package zfs
import "errors"
func ARCSize() (uint64, error) {
return 0, errors.ErrUnsupported
}

View File

@@ -6,7 +6,7 @@ import "github.com/blang/semver"
const (
// Version is the current version of the application.
Version = "0.18.3"
Version = "0.18.4"
// AppName is the name of the application.
AppName = "beszel"
)

30
go.mod
View File

@@ -1,6 +1,6 @@
module github.com/henrygd/beszel
go 1.25.5
go 1.26.1
require (
github.com/blang/semver v3.5.1+incompatible
@@ -11,17 +11,17 @@ require (
github.com/gliderlabs/ssh v0.3.8
github.com/google/uuid v1.6.0
github.com/lxzan/gws v1.8.9
github.com/nicholas-fedor/shoutrrr v0.13.1
github.com/pocketbase/dbx v1.11.0
github.com/pocketbase/pocketbase v0.36.2
github.com/nicholas-fedor/shoutrrr v0.13.2
github.com/pocketbase/dbx v1.12.0
github.com/pocketbase/pocketbase v0.36.4
github.com/shirou/gopsutil/v4 v4.26.1
github.com/spf13/cast v1.10.0
github.com/spf13/cobra v1.10.2
github.com/spf13/pflag v1.0.10
github.com/stretchr/testify v1.11.1
golang.org/x/crypto v0.47.0
golang.org/x/exp v0.0.0-20260112195511-716be5621a96
golang.org/x/sys v0.40.0
golang.org/x/crypto v0.48.0
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa
golang.org/x/sys v0.41.0
gopkg.in/yaml.v3 v3.0.1
)
@@ -42,8 +42,8 @@ require (
github.com/godbus/dbus/v5 v5.2.2 // indirect
github.com/golang-jwt/jwt/v5 v5.3.1 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/klauspost/compress v1.18.3 // indirect
github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3 // indirect
github.com/klauspost/compress v1.18.4 // indirect
github.com/lufia/plan9stats v0.0.0-20260216142805-b3301c5f2a88 // indirect
github.com/mattn/go-colorable v0.1.14 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
@@ -54,15 +54,15 @@ require (
github.com/tklauser/numcpus v0.11.0 // indirect
github.com/x448/float16 v0.8.4 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
golang.org/x/image v0.35.0 // indirect
golang.org/x/net v0.49.0 // indirect
golang.org/x/oauth2 v0.34.0 // indirect
golang.org/x/image v0.36.0 // indirect
golang.org/x/net v0.50.0 // indirect
golang.org/x/oauth2 v0.35.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/term v0.39.0 // indirect
golang.org/x/text v0.33.0 // indirect
golang.org/x/term v0.40.0 // indirect
golang.org/x/text v0.34.0 // indirect
howett.net/plist v1.0.1 // indirect
modernc.org/libc v1.67.6 // indirect
modernc.org/mathutil v1.7.1 // indirect
modernc.org/memory v1.11.0 // indirect
modernc.org/sqlite v1.44.3 // indirect
modernc.org/sqlite v1.45.0 // indirect
)

76
go.sum
View File

@@ -69,14 +69,14 @@ github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLf
github.com/jarcoal/httpmock v1.4.1 h1:0Ju+VCFuARfFlhVXFc2HxlcQkfB+Xq12/EotHko+x2A=
github.com/jarcoal/httpmock v1.4.1/go.mod h1:ftW1xULwo+j0R0JJkJIIi7UKigZUXCLLanykgjwBXL0=
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3 h1:PwQumkgq4/acIiZhtifTV5OUqqiP82UAl0h87xj/l9k=
github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3/go.mod h1:autxFIvghDt3jPTLoqZ9OZ7s9qTGNAWmYCjVFWPX/zg=
github.com/lufia/plan9stats v0.0.0-20260216142805-b3301c5f2a88 h1:PTw+yKnXcOFCR6+8hHTyWBeQ/P4Nb7dd4/0ohEcWQuM=
github.com/lufia/plan9stats v0.0.0-20260216142805-b3301c5f2a88/go.mod h1:autxFIvghDt3jPTLoqZ9OZ7s9qTGNAWmYCjVFWPX/zg=
github.com/lxzan/gws v1.8.9 h1:VU3SGUeWlQrEwfUSfokcZep8mdg/BrUF+y73YYshdBM=
github.com/lxzan/gws v1.8.9/go.mod h1:d9yHaR1eDTBHagQC6KY7ycUOaz5KWeqQtP3xu7aMK8Y=
github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
@@ -85,19 +85,19 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/nicholas-fedor/shoutrrr v0.13.1 h1:llEoHNbnMM4GfQ9+2Ns3n6ssvNfi3NPWluM0AQiicoY=
github.com/nicholas-fedor/shoutrrr v0.13.1/go.mod h1:kU4cFJpEAtTzl3iV0l+XUXmM90OlC5T01b7roM4/pYM=
github.com/onsi/ginkgo/v2 v2.27.3 h1:ICsZJ8JoYafeXFFlFAG75a7CxMsJHwgKwtO+82SE9L8=
github.com/onsi/ginkgo/v2 v2.27.3/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
github.com/onsi/gomega v1.38.3 h1:eTX+W6dobAYfFeGC2PV6RwXRu/MyT+cQguijutvkpSM=
github.com/onsi/gomega v1.38.3/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4=
github.com/nicholas-fedor/shoutrrr v0.13.2 h1:hfsYBIqSFYGg92pZP5CXk/g7/OJIkLYmiUnRl+AD1IA=
github.com/nicholas-fedor/shoutrrr v0.13.2/go.mod h1:ZqzV3gY/Wj6AvWs1etlO7+yKbh4iptSbeL8avBpMQbA=
github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI=
github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE=
github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28=
github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pocketbase/dbx v1.11.0 h1:LpZezioMfT3K4tLrqA55wWFw1EtH1pM4tzSVa7kgszU=
github.com/pocketbase/dbx v1.11.0/go.mod h1:xXRCIAKTHMgUCyCKZm55pUOdvFziJjQfXaWKhu2vhMs=
github.com/pocketbase/pocketbase v0.36.2 h1:mzrxnvXKc3yxKlvZdbwoYXkH8kfIETteD0hWdgj0VI4=
github.com/pocketbase/pocketbase v0.36.2/go.mod h1:71vSF8whUDzC8mcLFE10+Qatf9JQdeOGIRWawOuLLKM=
github.com/pocketbase/dbx v1.12.0 h1:/oLErM+A0b4xI0PWTGPqSDVjzix48PqI/bng2l0PzoA=
github.com/pocketbase/dbx v1.12.0/go.mod h1:xXRCIAKTHMgUCyCKZm55pUOdvFziJjQfXaWKhu2vhMs=
github.com/pocketbase/pocketbase v0.36.4 h1:zTjRZbp2WfTOJJfb+pFRWa200UaQwxZYt8RzkFMlAZ4=
github.com/pocketbase/pocketbase v0.36.4/go.mod h1:9CiezhRudd9FZGa5xZa53QZBTNxc5vvw/FGG+diAECI=
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
@@ -129,20 +129,20 @@ github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQ
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU=
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa h1:Zt3DZoOFFYkKhDT3v7Lm9FDMEV06GpzjG2jrqW+QTE0=
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA=
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/image v0.35.0 h1:LKjiHdgMtO8z7Fh18nGY6KDcoEtVfsgLDPeLyguqb7I=
golang.org/x/image v0.35.0/go.mod h1:MwPLTVgvxSASsxdLzKrl8BRFuyqMyGhLwmC+TO1Sybk=
golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c=
golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU=
golang.org/x/image v0.36.0 h1:Iknbfm1afbgtwPTmHnS2gTM/6PPZfH+z2EFuOkSbqwc=
golang.org/x/image v0.36.0/go.mod h1:YsWD2TyyGKiIX1kZlu9QfKIsQ4nAAK9bdgdrIsE7xy4=
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60=
golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM=
golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -150,20 +150,20 @@ golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY=
golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww=
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg=
golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
@@ -195,8 +195,8 @@ modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
modernc.org/sqlite v1.44.3 h1:+39JvV/HWMcYslAwRxHb8067w+2zowvFOUrOWIy9PjY=
modernc.org/sqlite v1.44.3/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
modernc.org/sqlite v1.45.0 h1:r51cSGzKpbptxnby+EIIz5fop4VuE4qFoVEjNvWoObs=
modernc.org/sqlite v1.45.0/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=

View File

@@ -21,9 +21,9 @@ type hubLike interface {
type AlertManager struct {
hub hubLike
alertQueue chan alertTask
stopChan chan struct{}
stopOnce sync.Once
pendingAlerts sync.Map
alertsCache *AlertsCache
}
type AlertMessageData struct {
@@ -40,16 +40,22 @@ type UserNotificationSettings struct {
Webhooks []string `json:"webhooks"`
}
type SystemAlertFsStats struct {
DiskTotal float64 `json:"d"`
DiskUsed float64 `json:"du"`
}
// Values pulled from system_stats.stats that are relevant to alerts.
type SystemAlertStats struct {
Cpu float64 `json:"cpu"`
Mem float64 `json:"mp"`
Disk float64 `json:"dp"`
NetSent float64 `json:"ns"`
NetRecv float64 `json:"nr"`
Bandwidth [2]uint64 `json:"b"`
GPU map[string]SystemAlertGPUData `json:"g"`
Temperatures map[string]float32 `json:"t"`
LoadAvg [3]float64 `json:"la"`
Battery [2]uint8 `json:"bat"`
ExtraFs map[string]SystemAlertFsStats `json:"efs"`
}
type SystemAlertGPUData struct {
@@ -58,7 +64,7 @@ type SystemAlertGPUData struct {
type SystemAlertData struct {
systemRecord *core.Record
alertRecord *core.Record
alertData CachedAlertData
name string
unit string
val float64
@@ -92,12 +98,10 @@ var supportsTitle = map[string]struct{}{
// NewAlertManager creates a new AlertManager instance.
func NewAlertManager(app hubLike) *AlertManager {
am := &AlertManager{
hub: app,
alertQueue: make(chan alertTask, 5),
stopChan: make(chan struct{}),
hub: app,
alertsCache: NewAlertsCache(app),
}
am.bindEvents()
go am.startWorker()
return am
}
@@ -106,6 +110,19 @@ func (am *AlertManager) bindEvents() {
am.hub.OnRecordAfterUpdateSuccess("alerts").BindFunc(updateHistoryOnAlertUpdate)
am.hub.OnRecordAfterDeleteSuccess("alerts").BindFunc(resolveHistoryOnAlertDelete)
am.hub.OnRecordAfterUpdateSuccess("smart_devices").BindFunc(am.handleSmartDeviceAlert)
am.hub.OnServe().BindFunc(func(e *core.ServeEvent) error {
// Populate all alerts into cache on startup
_ = am.alertsCache.PopulateFromDB(true)
if err := resolveStatusAlerts(e.App); err != nil {
e.App.Logger().Error("Failed to resolve stale status alerts", "err", err)
}
if err := am.restorePendingStatusAlerts(); err != nil {
e.App.Logger().Error("Failed to restore pending status alerts", "err", err)
}
return e.Next()
})
}
// IsNotificationSilenced checks if a notification should be silenced based on configured quiet hours
@@ -259,13 +276,14 @@ func (am *AlertManager) SendShoutrrrAlert(notificationUrl, title, message, link,
}
// Add link
if scheme == "ntfy" {
switch scheme {
case "ntfy":
queryParams.Add("Actions", fmt.Sprintf("view, %s, %s", linkText, link))
} else if scheme == "lark" {
case "lark":
queryParams.Add("link", link)
} else if scheme == "bark" {
case "bark":
queryParams.Add("url", link)
} else {
default:
message += "\n\n" + link
}
@@ -298,3 +316,13 @@ func (am *AlertManager) SendTestNotification(e *core.RequestEvent) error {
}
return e.JSON(200, map[string]bool{"err": false})
}
// setAlertTriggered updates the "triggered" status of an alert record in the database
func (am *AlertManager) setAlertTriggered(alert CachedAlertData, triggered bool) error {
alertRecord, err := am.hub.FindRecordById("alerts", alert.Id)
if err != nil {
return err
}
alertRecord.Set("triggered", triggered)
return am.hub.Save(alertRecord)
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package alerts_test

View File

@@ -0,0 +1,177 @@
package alerts
import (
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/core"
"github.com/pocketbase/pocketbase/tools/store"
)
// CachedAlertData represents the relevant fields of an alert record for status checking and updates.
type CachedAlertData struct {
Id string
SystemID string
UserID string
Name string
Value float64
Triggered bool
Min uint8
// Created types.DateTime
}
func (a *CachedAlertData) PopulateFromRecord(record *core.Record) {
a.Id = record.Id
a.SystemID = record.GetString("system")
a.UserID = record.GetString("user")
a.Name = record.GetString("name")
a.Value = record.GetFloat("value")
a.Triggered = record.GetBool("triggered")
a.Min = uint8(record.GetInt("min"))
// a.Created = record.GetDateTime("created")
}
// AlertsCache provides an in-memory cache for system alerts.
type AlertsCache struct {
app core.App
store *store.Store[string, *store.Store[string, CachedAlertData]]
populated bool
}
// NewAlertsCache creates a new instance of SystemAlertsCache.
func NewAlertsCache(app core.App) *AlertsCache {
c := AlertsCache{
app: app,
store: store.New(map[string]*store.Store[string, CachedAlertData]{}),
}
return c.bindEvents()
}
// bindEvents sets up event listeners to keep the cache in sync with database changes.
func (c *AlertsCache) bindEvents() *AlertsCache {
c.app.OnRecordAfterUpdateSuccess("alerts").BindFunc(func(e *core.RecordEvent) error {
// c.Delete(e.Record.Original()) // this would be needed if the system field on an existing alert was changed, however we don't currently allow that in the UI so we'll leave it commented out
c.Update(e.Record)
return e.Next()
})
c.app.OnRecordAfterDeleteSuccess("alerts").BindFunc(func(e *core.RecordEvent) error {
c.Delete(e.Record)
return e.Next()
})
c.app.OnRecordAfterCreateSuccess("alerts").BindFunc(func(e *core.RecordEvent) error {
c.Update(e.Record)
return e.Next()
})
return c
}
// PopulateFromDB clears current entries and loads all alerts from the database into the cache.
func (c *AlertsCache) PopulateFromDB(force bool) error {
if !force && c.populated {
return nil
}
records, err := c.app.FindAllRecords("alerts")
if err != nil {
return err
}
c.store.RemoveAll()
for _, record := range records {
c.Update(record)
}
c.populated = true
return nil
}
// Update adds or updates an alert record in the cache.
func (c *AlertsCache) Update(record *core.Record) {
systemID := record.GetString("system")
if systemID == "" {
return
}
systemStore, ok := c.store.GetOk(systemID)
if !ok {
systemStore = store.New(map[string]CachedAlertData{})
c.store.Set(systemID, systemStore)
}
var ca CachedAlertData
ca.PopulateFromRecord(record)
systemStore.Set(record.Id, ca)
}
// Delete removes an alert record from the cache.
func (c *AlertsCache) Delete(record *core.Record) {
systemID := record.GetString("system")
if systemID == "" {
return
}
if systemStore, ok := c.store.GetOk(systemID); ok {
systemStore.Remove(record.Id)
}
}
// GetSystemAlerts returns all alerts for the specified system, lazy-loading if necessary.
func (c *AlertsCache) GetSystemAlerts(systemID string) []CachedAlertData {
systemStore, ok := c.store.GetOk(systemID)
if !ok {
// Populate cache for this system
records, err := c.app.FindAllRecords("alerts", dbx.NewExp("system={:system}", dbx.Params{"system": systemID}))
if err != nil {
return nil
}
systemStore = store.New(map[string]CachedAlertData{})
for _, record := range records {
var ca CachedAlertData
ca.PopulateFromRecord(record)
systemStore.Set(record.Id, ca)
}
c.store.Set(systemID, systemStore)
}
all := systemStore.GetAll()
alerts := make([]CachedAlertData, 0, len(all))
for _, alert := range all {
alerts = append(alerts, alert)
}
return alerts
}
// GetAlert returns a specific alert by its ID from the cache.
func (c *AlertsCache) GetAlert(systemID, alertID string) (CachedAlertData, bool) {
if systemStore, ok := c.store.GetOk(systemID); ok {
return systemStore.GetOk(alertID)
}
return CachedAlertData{}, false
}
// GetAlertsByName returns all alerts of a specific type for the specified system.
func (c *AlertsCache) GetAlertsByName(systemID, alertName string) []CachedAlertData {
allAlerts := c.GetSystemAlerts(systemID)
var alerts []CachedAlertData
for _, record := range allAlerts {
if record.Name == alertName {
alerts = append(alerts, record)
}
}
return alerts
}
// GetAlertsExcludingNames returns all alerts for the specified system excluding the given types.
func (c *AlertsCache) GetAlertsExcludingNames(systemID string, excludedNames ...string) []CachedAlertData {
excludeMap := make(map[string]struct{})
for _, name := range excludedNames {
excludeMap[name] = struct{}{}
}
allAlerts := c.GetSystemAlerts(systemID)
var alerts []CachedAlertData
for _, record := range allAlerts {
if _, excluded := excludeMap[record.Name]; !excluded {
alerts = append(alerts, record)
}
}
return alerts
}
// Refresh returns the latest cached copy for an alert snapshot if it still exists.
func (c *AlertsCache) Refresh(alert CachedAlertData) (CachedAlertData, bool) {
if alert.Id == "" {
return CachedAlertData{}, false
}
return c.GetAlert(alert.SystemID, alert.Id)
}

View File

@@ -0,0 +1,215 @@
//go:build testing
package alerts_test
import (
"testing"
"github.com/henrygd/beszel/internal/alerts"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestSystemAlertsCachePopulateAndFilter(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 2, user.Id, "up")
require.NoError(t, err)
system1 := systems[0]
system2 := systems[1]
statusAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system1.Id,
"user": user.Id,
"min": 1,
})
require.NoError(t, err)
cpuAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "CPU",
"system": system1.Id,
"user": user.Id,
"value": 80,
"min": 1,
})
require.NoError(t, err)
memoryAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Memory",
"system": system2.Id,
"user": user.Id,
"value": 90,
"min": 1,
})
require.NoError(t, err)
cache := alerts.NewAlertsCache(hub)
cache.PopulateFromDB(false)
statusAlerts := cache.GetAlertsByName(system1.Id, "Status")
require.Len(t, statusAlerts, 1)
assert.Equal(t, statusAlert.Id, statusAlerts[0].Id)
nonStatusAlerts := cache.GetAlertsExcludingNames(system1.Id, "Status")
require.Len(t, nonStatusAlerts, 1)
assert.Equal(t, cpuAlert.Id, nonStatusAlerts[0].Id)
system2Alerts := cache.GetSystemAlerts(system2.Id)
require.Len(t, system2Alerts, 1)
assert.Equal(t, memoryAlert.Id, system2Alerts[0].Id)
}
func TestSystemAlertsCacheLazyLoadUpdateAndDelete(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
systemRecord := systems[0]
statusAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemRecord.Id,
"user": user.Id,
"min": 1,
})
require.NoError(t, err)
cache := alerts.NewAlertsCache(hub)
require.Len(t, cache.GetSystemAlerts(systemRecord.Id), 1, "first lookup should lazy-load alerts for the system")
cpuAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "CPU",
"system": systemRecord.Id,
"user": user.Id,
"value": 80,
"min": 1,
})
require.NoError(t, err)
cache.Update(cpuAlert)
nonStatusAlerts := cache.GetAlertsExcludingNames(systemRecord.Id, "Status")
require.Len(t, nonStatusAlerts, 1)
assert.Equal(t, cpuAlert.Id, nonStatusAlerts[0].Id)
cache.Delete(statusAlert)
assert.Empty(t, cache.GetAlertsByName(systemRecord.Id, "Status"), "deleted alerts should be removed from the in-memory cache")
}
func TestSystemAlertsCacheRefreshReturnsLatestCopy(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
system := systems[0]
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": 1,
"triggered": false,
})
require.NoError(t, err)
cache := alerts.NewAlertsCache(hub)
snapshot := cache.GetSystemAlerts(system.Id)[0]
assert.False(t, snapshot.Triggered)
alert.Set("triggered", true)
require.NoError(t, hub.Save(alert))
refreshed, ok := cache.Refresh(snapshot)
require.True(t, ok)
assert.Equal(t, snapshot.Id, refreshed.Id)
assert.True(t, refreshed.Triggered, "refresh should return the updated cached value rather than the stale snapshot")
require.NoError(t, hub.Delete(alert))
_, ok = cache.Refresh(snapshot)
assert.False(t, ok, "refresh should report false when the cached alert no longer exists")
}
func TestAlertManagerCacheLifecycle(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
system := systems[0]
// Create an alert
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "CPU",
"system": system.Id,
"user": user.Id,
"value": 80,
"min": 1,
})
require.NoError(t, err)
am := hub.AlertManager
cache := am.GetSystemAlertsCache()
// Verify it's in cache (it should be since CreateRecord triggers the event)
assert.Len(t, cache.GetSystemAlerts(system.Id), 1)
assert.Equal(t, alert.Id, cache.GetSystemAlerts(system.Id)[0].Id)
assert.EqualValues(t, 80, cache.GetSystemAlerts(system.Id)[0].Value)
// Update the alert through PocketBase to trigger events
alert.Set("value", 85)
require.NoError(t, hub.Save(alert))
// Check if updated value is reflected (or at least that it's still there)
cachedAlerts := cache.GetSystemAlerts(system.Id)
assert.Len(t, cachedAlerts, 1)
assert.EqualValues(t, 85, cachedAlerts[0].Value)
// Delete the alert through PocketBase to trigger events
require.NoError(t, hub.Delete(alert))
// Verify it's removed from cache
assert.Empty(t, cache.GetSystemAlerts(system.Id), "alert should be removed from cache after PocketBase delete")
}
// func TestAlertManagerCacheMovesAlertToNewSystemOnUpdate(t *testing.T) {
// hub, user := beszelTests.GetHubWithUser(t)
// defer hub.Cleanup()
// systems, err := beszelTests.CreateSystems(hub, 2, user.Id, "up")
// require.NoError(t, err)
// system1 := systems[0]
// system2 := systems[1]
// alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
// "name": "CPU",
// "system": system1.Id,
// "user": user.Id,
// "value": 80,
// "min": 1,
// })
// require.NoError(t, err)
// am := hub.AlertManager
// cache := am.GetSystemAlertsCache()
// // Initially in system1 cache
// assert.Len(t, cache.Get(system1.Id), 1)
// assert.Empty(t, cache.Get(system2.Id))
// // Move alert to system2
// alert.Set("system", system2.Id)
// require.NoError(t, hub.Save(alert))
// // DEBUG: print if it is found
// // fmt.Printf("system1 alerts after update: %v\n", cache.Get(system1.Id))
// // Should be removed from system1 and present in system2
// assert.Empty(t, cache.GetType(system1.Id, "CPU"), "updated alerts should be evicted from the previous system cache")
// require.Len(t, cache.Get(system2.Id), 1)
// assert.Equal(t, alert.Id, cache.Get(system2.Id)[0].Id)
// }

View File

@@ -0,0 +1,155 @@
//go:build testing
package alerts_test
import (
"encoding/json"
"testing"
"time"
"github.com/henrygd/beszel/internal/entities/system"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/tools/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestDiskAlertExtraFsMultiMinute tests that multi-minute disk alerts correctly use
// historical per-minute values for extra (non-root) filesystems, not the current live snapshot.
func TestDiskAlertExtraFsMultiMinute(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
systemRecord := systems[0]
// Disk alert: threshold 80%, min=2 (requires historical averaging)
diskAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Disk",
"system": systemRecord.Id,
"user": user.Id,
"value": 80, // threshold: 80%
"min": 2, // 2 minutes - requires historical averaging
})
require.NoError(t, err)
assert.False(t, diskAlert.GetBool("triggered"), "Alert should not be triggered initially")
am := hub.GetAlertManager()
now := time.Now().UTC()
extraFsHigh := map[string]*system.FsStats{
"/mnt/data": {DiskTotal: 1000, DiskUsed: 920}, // 92% - above threshold
}
// Insert 4 historical records spread over 3 minutes (same pattern as battery tests).
// The oldest record must predate (now - 2min) so the alert time window is valid.
recordTimes := []time.Duration{
-180 * time.Second, // 3 min ago - anchors oldest record before alert.time
-90 * time.Second,
-60 * time.Second,
-30 * time.Second,
}
for _, offset := range recordTimes {
stats := system.Stats{
DiskPct: 30, // root disk at 30% - below threshold
ExtraFs: extraFsHigh,
}
statsJSON, _ := json.Marshal(stats)
recordTime := now.Add(offset)
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
"system": systemRecord.Id,
"type": "1m",
"stats": string(statsJSON),
})
require.NoError(t, err)
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
err = hub.SaveNoValidate(record)
require.NoError(t, err)
}
combinedDataHigh := &system.CombinedData{
Stats: system.Stats{
DiskPct: 30,
ExtraFs: extraFsHigh,
},
Info: system.Info{
DiskPct: 30,
},
}
systemRecord.Set("updated", now)
err = hub.SaveNoValidate(systemRecord)
require.NoError(t, err)
err = am.HandleSystemAlerts(systemRecord, combinedDataHigh)
require.NoError(t, err)
time.Sleep(20 * time.Millisecond)
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
require.NoError(t, err)
assert.True(t, diskAlert.GetBool("triggered"),
"Alert SHOULD be triggered when extra disk average (92%%) exceeds threshold (80%%)")
// --- Resolution: extra disk drops to 50%, alert should resolve ---
extraFsLow := map[string]*system.FsStats{
"/mnt/data": {DiskTotal: 1000, DiskUsed: 500}, // 50% - below threshold
}
newNow := now.Add(2 * time.Minute)
recordTimesLow := []time.Duration{
-180 * time.Second,
-90 * time.Second,
-60 * time.Second,
-30 * time.Second,
}
for _, offset := range recordTimesLow {
stats := system.Stats{
DiskPct: 30,
ExtraFs: extraFsLow,
}
statsJSON, _ := json.Marshal(stats)
recordTime := newNow.Add(offset)
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
"system": systemRecord.Id,
"type": "1m",
"stats": string(statsJSON),
})
require.NoError(t, err)
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
err = hub.SaveNoValidate(record)
require.NoError(t, err)
}
combinedDataLow := &system.CombinedData{
Stats: system.Stats{
DiskPct: 30,
ExtraFs: extraFsLow,
},
Info: system.Info{
DiskPct: 30,
},
}
systemRecord.Set("updated", newNow)
err = hub.SaveNoValidate(systemRecord)
require.NoError(t, err)
err = am.HandleSystemAlerts(systemRecord, combinedDataLow)
require.NoError(t, err)
time.Sleep(20 * time.Millisecond)
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
require.NoError(t, err)
assert.False(t, diskAlert.GetBool("triggered"),
"Alert should be resolved when extra disk average (50%%) drops below threshold (80%%)")
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package alerts_test
@@ -50,7 +49,7 @@ func TestAlertSilencedOneTime(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Test that alert is silenced
silenced := am.IsNotificationSilenced(user.Id, system.Id)
@@ -107,7 +106,7 @@ func TestAlertSilencedDaily(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Get current hour and create a window that includes current time
now := time.Now().UTC()
@@ -171,7 +170,7 @@ func TestAlertSilencedDailyMidnightCrossing(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Create a window that crosses midnight: 22:00 - 02:00
startTime := time.Date(2000, 1, 1, 22, 0, 0, 0, time.UTC)
@@ -212,7 +211,7 @@ func TestAlertSilencedGlobal(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Create a global quiet hours window (no system specified)
now := time.Now().UTC()
@@ -251,7 +250,7 @@ func TestAlertSilencedSystemSpecific(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Create a system-specific quiet hours window for system1 only
now := time.Now().UTC()
@@ -297,7 +296,7 @@ func TestAlertSilencedMultiUser(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Create a quiet hours window for user1 only
now := time.Now().UTC()
@@ -418,7 +417,7 @@ func TestAlertSilencedNoWindows(t *testing.T) {
// Get alert manager
am := alerts.NewAlertManager(hub)
defer am.StopWorker()
defer am.Stop()
// Without any quiet hours windows, alert should NOT be silenced
silenced := am.IsNotificationSilenced(user.Id, system.Id)

View File

@@ -2,18 +2,18 @@ package alerts
import (
"fmt"
"strings"
"github.com/pocketbase/pocketbase/core"
)
// handleSmartDeviceAlert sends alerts when a SMART device state changes from PASSED to FAILED.
// handleSmartDeviceAlert sends alerts when a SMART device state worsens into WARNING/FAILED.
// This is automatic and does not require user opt-in.
func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
oldState := e.Record.Original().GetString("state")
newState := e.Record.GetString("state")
// Only alert when transitioning from PASSED to FAILED
if oldState != "PASSED" || newState != "FAILED" {
if !shouldSendSmartDeviceAlert(oldState, newState) {
return e.Next()
}
@@ -32,14 +32,15 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
systemName := systemRecord.GetString("name")
deviceName := e.Record.GetString("name")
model := e.Record.GetString("model")
statusLabel := smartStateLabel(newState)
// Build alert message
title := fmt.Sprintf("SMART failure on %s: %s \U0001F534", systemName, deviceName)
title := fmt.Sprintf("SMART %s on %s: %s %s", statusLabel, systemName, deviceName, smartStateEmoji(newState))
var message string
if model != "" {
message = fmt.Sprintf("Disk %s (%s) SMART status changed to FAILED", deviceName, model)
message = fmt.Sprintf("Disk %s (%s) SMART status changed to %s", deviceName, model, newState)
} else {
message = fmt.Sprintf("Disk %s SMART status changed to FAILED", deviceName)
message = fmt.Sprintf("Disk %s SMART status changed to %s", deviceName, newState)
}
// Get users associated with the system
@@ -65,3 +66,42 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
return e.Next()
}
func shouldSendSmartDeviceAlert(oldState, newState string) bool {
oldSeverity := smartStateSeverity(oldState)
newSeverity := smartStateSeverity(newState)
// Ignore unknown states and recoveries; only alert on worsening transitions
// from known-good/degraded states into WARNING/FAILED.
return oldSeverity >= 1 && newSeverity > oldSeverity
}
func smartStateSeverity(state string) int {
switch state {
case "PASSED":
return 1
case "WARNING":
return 2
case "FAILED":
return 3
default:
return 0
}
}
func smartStateEmoji(state string) string {
switch state {
case "WARNING":
return "\U0001F7E0"
default:
return "\U0001F534"
}
}
func smartStateLabel(state string) string {
switch state {
case "FAILED":
return "failure"
default:
return strings.ToLower(state)
}
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package alerts_test
@@ -58,6 +57,74 @@ func TestSmartDeviceAlert(t *testing.T) {
assert.Contains(t, lastMessage.Text, "FAILED")
}
func TestSmartDeviceAlertPassedToWarning(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
})
assert.NoError(t, err)
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
"system": system.Id,
"name": "/dev/mmcblk0",
"model": "eMMC",
"state": "PASSED",
})
assert.NoError(t, err)
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
assert.NoError(t, err)
smartDevice.Set("state", "WARNING")
err = hub.Save(smartDevice)
assert.NoError(t, err)
time.Sleep(50 * time.Millisecond)
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed to WARNING")
lastMessage := hub.TestMailer.LastMessage()
assert.Contains(t, lastMessage.Subject, "SMART warning on test-system")
assert.Contains(t, lastMessage.Text, "WARNING")
}
func TestSmartDeviceAlertWarningToFailed(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
})
assert.NoError(t, err)
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
"system": system.Id,
"name": "/dev/mmcblk0",
"model": "eMMC",
"state": "WARNING",
})
assert.NoError(t, err)
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
assert.NoError(t, err)
smartDevice.Set("state", "FAILED")
err = hub.Save(smartDevice)
assert.NoError(t, err)
time.Sleep(50 * time.Millisecond)
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed from WARNING to FAILED")
lastMessage := hub.TestMailer.LastMessage()
assert.Contains(t, lastMessage.Subject, "SMART failure on test-system")
assert.Contains(t, lastMessage.Text, "FAILED")
}
func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
@@ -83,7 +150,8 @@ func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
assert.NoError(t, err)
// Update the state from UNKNOWN to FAILED - should NOT trigger alert
// Update the state from UNKNOWN to FAILED - should NOT trigger alert.
// We only alert from known healthy/degraded states.
smartDevice.Set("state", "FAILED")
err = hub.Save(smartDevice)
assert.NoError(t, err)

View File

@@ -5,67 +5,28 @@ import (
"strings"
"time"
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/core"
)
type alertTask struct {
action string // "schedule" or "cancel"
systemName string
alertRecord *core.Record
delay time.Duration
}
type alertInfo struct {
systemName string
alertRecord *core.Record
expireTime time.Time
systemName string
alertData CachedAlertData
expireTime time.Time
timer *time.Timer
}
// startWorker is a long-running goroutine that processes alert tasks
// every x seconds. It must be running to process status alerts.
func (am *AlertManager) startWorker() {
processPendingAlerts := time.Tick(15 * time.Second)
// check for status alerts that are not resolved when system comes up
// (can be removed if we figure out core bug in #1052)
checkStatusAlerts := time.Tick(561 * time.Second)
for {
select {
case <-am.stopChan:
return
case task := <-am.alertQueue:
switch task.action {
case "schedule":
am.pendingAlerts.Store(task.alertRecord.Id, &alertInfo{
systemName: task.systemName,
alertRecord: task.alertRecord,
expireTime: time.Now().Add(task.delay),
})
case "cancel":
am.pendingAlerts.Delete(task.alertRecord.Id)
// Stop cancels all pending status alert timers.
func (am *AlertManager) Stop() {
am.stopOnce.Do(func() {
am.pendingAlerts.Range(func(key, value any) bool {
info := value.(*alertInfo)
if info.timer != nil {
info.timer.Stop()
}
case <-checkStatusAlerts:
resolveStatusAlerts(am.hub)
case <-processPendingAlerts:
// Check for expired alerts every tick
now := time.Now()
for key, value := range am.pendingAlerts.Range {
info := value.(*alertInfo)
if now.After(info.expireTime) {
// Downtime delay has passed, process alert
am.sendStatusAlert("down", info.systemName, info.alertRecord)
am.pendingAlerts.Delete(key)
}
}
}
}
}
// StopWorker shuts down the AlertManager.worker goroutine
func (am *AlertManager) StopWorker() {
close(am.stopChan)
am.pendingAlerts.Delete(key)
return true
})
})
}
// HandleStatusAlerts manages the logic when system status changes.
@@ -74,82 +35,104 @@ func (am *AlertManager) HandleStatusAlerts(newStatus string, systemRecord *core.
return nil
}
alertRecords, err := am.getSystemStatusAlerts(systemRecord.Id)
if err != nil {
return err
}
if len(alertRecords) == 0 {
alerts := am.alertsCache.GetAlertsByName(systemRecord.Id, "Status")
if len(alerts) == 0 {
return nil
}
systemName := systemRecord.GetString("name")
if newStatus == "down" {
am.handleSystemDown(systemName, alertRecords)
am.handleSystemDown(systemName, alerts)
} else {
am.handleSystemUp(systemName, alertRecords)
am.handleSystemUp(systemName, alerts)
}
return nil
}
// getSystemStatusAlerts retrieves all "Status" alert records for a given system ID.
func (am *AlertManager) getSystemStatusAlerts(systemID string) ([]*core.Record, error) {
alertRecords, err := am.hub.FindAllRecords("alerts", dbx.HashExp{
"system": systemID,
"name": "Status",
})
if err != nil {
return nil, err
// handleSystemDown manages the logic when a system status changes to "down". It schedules pending alerts for each alert record.
func (am *AlertManager) handleSystemDown(systemName string, alerts []CachedAlertData) {
for _, alertData := range alerts {
min := max(1, int(alertData.Min))
am.schedulePendingStatusAlert(systemName, alertData, time.Duration(min)*time.Minute)
}
return alertRecords, nil
}
// Schedules delayed "down" alerts for each alert record.
func (am *AlertManager) handleSystemDown(systemName string, alertRecords []*core.Record) {
for _, alertRecord := range alertRecords {
// Continue if alert is already scheduled
if _, exists := am.pendingAlerts.Load(alertRecord.Id); exists {
continue
}
// Schedule by adding to queue
min := max(1, alertRecord.GetInt("min"))
am.alertQueue <- alertTask{
action: "schedule",
systemName: systemName,
alertRecord: alertRecord,
delay: time.Duration(min) * time.Minute,
}
// schedulePendingStatusAlert sets up a timer to send a "down" alert after the specified delay if the system is still down.
// It returns true if the alert was scheduled, or false if an alert was already pending for the given alert record.
func (am *AlertManager) schedulePendingStatusAlert(systemName string, alertData CachedAlertData, delay time.Duration) bool {
alert := &alertInfo{
systemName: systemName,
alertData: alertData,
expireTime: time.Now().Add(delay),
}
storedAlert, loaded := am.pendingAlerts.LoadOrStore(alertData.Id, alert)
if loaded {
return false
}
stored := storedAlert.(*alertInfo)
stored.timer = time.AfterFunc(time.Until(stored.expireTime), func() {
am.processPendingAlert(alertData.Id)
})
return true
}
// handleSystemUp manages the logic when a system status changes to "up".
// It cancels any pending alerts and sends "up" alerts.
func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.Record) {
for _, alertRecord := range alertRecords {
alertRecordID := alertRecord.Id
func (am *AlertManager) handleSystemUp(systemName string, alerts []CachedAlertData) {
for _, alertData := range alerts {
// If alert exists for record, delete and continue (down alert not sent)
if _, exists := am.pendingAlerts.Load(alertRecordID); exists {
am.alertQueue <- alertTask{
action: "cancel",
alertRecord: alertRecord,
}
if am.cancelPendingAlert(alertData.Id) {
continue
}
// No alert scheduled for this record, send "up" alert
if err := am.sendStatusAlert("up", systemName, alertRecord); err != nil {
if !alertData.Triggered {
continue
}
if err := am.sendStatusAlert("up", systemName, alertData); err != nil {
am.hub.Logger().Error("Failed to send alert", "err", err)
}
}
}
// sendStatusAlert sends a status alert ("up" or "down") to the users associated with the alert records.
func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, alertRecord *core.Record) error {
switch alertStatus {
case "up":
alertRecord.Set("triggered", false)
case "down":
alertRecord.Set("triggered", true)
// cancelPendingAlert stops the timer and removes the pending alert for the given alert ID. Returns true if a pending alert was found and cancelled.
func (am *AlertManager) cancelPendingAlert(alertID string) bool {
value, loaded := am.pendingAlerts.LoadAndDelete(alertID)
if !loaded {
return false
}
info := value.(*alertInfo)
if info.timer != nil {
info.timer.Stop()
}
return true
}
// processPendingAlert sends a "down" alert if the pending alert has expired and the system is still down.
func (am *AlertManager) processPendingAlert(alertID string) {
value, loaded := am.pendingAlerts.LoadAndDelete(alertID)
if !loaded {
return
}
info := value.(*alertInfo)
refreshedAlertData, ok := am.alertsCache.Refresh(info.alertData)
if !ok || refreshedAlertData.Triggered {
return
}
if err := am.sendStatusAlert("down", info.systemName, refreshedAlertData); err != nil {
am.hub.Logger().Error("Failed to send alert", "err", err)
}
}
// sendStatusAlert sends a status alert ("up" or "down") to the users associated with the alert records.
func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, alertData CachedAlertData) error {
// Update trigger state for alert record before sending alert
triggered := alertStatus == "down"
if err := am.setAlertTriggered(alertData, triggered); err != nil {
return err
}
am.hub.Save(alertRecord)
var emoji string
if alertStatus == "up" {
@@ -162,10 +145,10 @@ func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, a
message := strings.TrimSuffix(title, emoji)
// Get system ID for the link
systemID := alertRecord.GetString("system")
systemID := alertData.SystemID
return am.SendAlert(AlertMessageData{
UserID: alertRecord.GetString("user"),
UserID: alertData.UserID,
SystemID: systemID,
Title: title,
Message: message,
@@ -174,8 +157,8 @@ func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, a
})
}
// resolveStatusAlerts resolves any status alerts that weren't resolved
// when system came up (https://github.com/henrygd/beszel/issues/1052)
// resolveStatusAlerts resolves any triggered status alerts that weren't resolved
// when system came up (https://github.com/henrygd/beszel/issues/1052).
func resolveStatusAlerts(app core.App) error {
db := app.DB()
// Find all active status alerts where the system is actually up
@@ -205,3 +188,40 @@ func resolveStatusAlerts(app core.App) error {
}
return nil
}
// restorePendingStatusAlerts re-queues untriggered status alerts for systems that
// are still down after a hub restart. This rebuilds the lost in-memory timer state.
func (am *AlertManager) restorePendingStatusAlerts() error {
type pendingStatusAlert struct {
AlertID string `db:"alert_id"`
SystemID string `db:"system_id"`
SystemName string `db:"system_name"`
}
var pending []pendingStatusAlert
err := am.hub.DB().NewQuery(`
SELECT a.id AS alert_id, a.system AS system_id, s.name AS system_name
FROM alerts a
JOIN systems s ON a.system = s.id
WHERE a.name = 'Status'
AND a.triggered = false
AND s.status = 'down'
`).All(&pending)
if err != nil {
return err
}
// Make sure cache is populated before trying to restore pending alerts
_ = am.alertsCache.PopulateFromDB(false)
for _, item := range pending {
alertData, ok := am.alertsCache.GetAlert(item.SystemID, item.AlertID)
if !ok {
continue
}
min := max(1, int(alertData.Min))
am.schedulePendingStatusAlert(item.SystemName, alertData, time.Duration(min)*time.Minute)
}
return nil
}

View File

@@ -0,0 +1,755 @@
//go:build testing
package alerts_test
import (
"testing"
"testing/synctest"
"time"
"github.com/henrygd/beszel/internal/alerts"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/core"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestStatusAlerts(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused")
assert.NoError(t, err)
var alerts []*core.Record
for i, system := range systems {
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": i + 1,
})
assert.NoError(t, err)
alerts = append(alerts, alert)
}
time.Sleep(10 * time.Millisecond)
for _, alert := range alerts {
assert.False(t, alert.GetBool("triggered"), "Alert should not be triggered immediately")
}
if hub.TestMailer.TotalSend() != 0 {
assert.Zero(t, hub.TestMailer.TotalSend(), "Expected 0 messages, got %d", hub.TestMailer.TotalSend())
}
for _, system := range systems {
assert.EqualValues(t, "paused", system.GetString("status"), "System should be paused")
}
for _, system := range systems {
system.Set("status", "up")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
time.Sleep(time.Second)
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
for _, system := range systems {
system.Set("status", "down")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
// after 30 seconds, should have 4 alerts in the pendingAlerts map, no triggered alerts
time.Sleep(time.Second * 30)
assert.EqualValues(t, 4, hub.GetPendingAlertsCount(), "should have 4 alerts in the pendingAlerts map")
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 0, triggeredCount, "should have 0 alert triggered")
assert.EqualValues(t, 0, hub.TestMailer.TotalSend(), "should have 0 messages sent")
// after 1:30 seconds, should have 1 triggered alert and 3 pending alerts
time.Sleep(time.Second * 60)
assert.EqualValues(t, 3, hub.GetPendingAlertsCount(), "should have 3 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 1, triggeredCount, "should have 1 alert triggered")
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 messages sent")
// after 2:30 seconds, should have 2 triggered alerts and 2 pending alerts
time.Sleep(time.Second * 60)
assert.EqualValues(t, 2, hub.GetPendingAlertsCount(), "should have 2 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 2, triggeredCount, "should have 2 alert triggered")
assert.EqualValues(t, 2, hub.TestMailer.TotalSend(), "should have 2 messages sent")
// now we will bring the remaning systems back up
for _, system := range systems {
system.Set("status", "up")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
time.Sleep(time.Second)
// should have 0 alerts in the pendingAlerts map and 0 alerts triggered
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.Zero(t, triggeredCount, "should have 0 alert triggered")
// 4 messages sent, 2 down alerts and 2 up alerts for first 2 systems
assert.EqualValues(t, 4, hub.TestMailer.TotalSend(), "should have 4 messages sent")
})
}
func TestStatusAlertRecoveryBeforeDeadline(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Ensure user settings have an email
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
hub.Save(userSettings)
// Initial email count
initialEmailCount := hub.TestMailer.TotalSend()
systemCollection, _ := hub.FindCollectionByNameOrId("systems")
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
hub.Save(system)
alertCollection, _ := hub.FindCollectionByNameOrId("alerts")
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
hub.Save(alert)
am := hub.AlertManager
// 1. System goes down
am.HandleStatusAlerts("down", system)
assert.Equal(t, 1, am.GetPendingAlertsCount(), "Alert should be scheduled")
// 2. System goes up BEFORE delay expires
// Triggering HandleStatusAlerts("up") SHOULD NOT send an alert.
am.HandleStatusAlerts("up", system)
assert.Equal(t, 0, am.GetPendingAlertsCount(), "Alert should be canceled if system recovers before delay expires")
// Verify that NO email was sent.
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "Recovery notification should not be sent if system never went down")
}
func TestStatusAlertNormalRecovery(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Ensure user settings have an email
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
hub.Save(userSettings)
systemCollection, _ := hub.FindCollectionByNameOrId("systems")
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
hub.Save(system)
alertCollection, _ := hub.FindCollectionByNameOrId("alerts")
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", true) // System was confirmed DOWN
hub.Save(alert)
am := hub.AlertManager
initialEmailCount := hub.TestMailer.TotalSend()
// System goes up
am.HandleStatusAlerts("up", system)
// Verify that an email WAS sent (normal recovery).
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "Recovery notification should be sent if system was triggered as down")
}
func TestHandleStatusAlertsDoesNotSendRecoveryWhileDownIsOnlyPending(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
require.NoError(t, hub.Save(alert))
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 1, am.GetPendingAlertsCount(), "down transition should register a pending alert immediately")
require.NoError(t, am.HandleStatusAlerts("up", system))
assert.Zero(t, am.GetPendingAlertsCount(), "recovery should cancel the pending down alert")
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "recovery notification should not be sent before a down alert triggers")
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.False(t, alertRecord.GetBool("triggered"), "alert should remain untriggered when downtime never matured")
}
func TestStatusAlertTimerCancellationPreventsBoundaryDelivery(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
require.NoError(t, hub.Save(alert))
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 1, am.GetPendingAlertsCount(), "down transition should register a pending alert immediately")
require.True(t, am.ResetPendingAlertTimer(alert.Id, 25*time.Millisecond), "test should shorten the pending alert timer")
time.Sleep(10 * time.Millisecond)
require.NoError(t, am.HandleStatusAlerts("up", system))
assert.Zero(t, am.GetPendingAlertsCount(), "recovery should remove the pending alert before the timer callback runs")
time.Sleep(40 * time.Millisecond)
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "timer callback should not deliver after recovery cancels the pending alert")
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.False(t, alertRecord.GetBool("triggered"), "alert should remain untriggered when cancellation wins the timer race")
time.Sleep(time.Minute)
synctest.Wait()
})
}
func TestStatusAlertDownFiresAfterDelayExpires(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
require.NoError(t, hub.Save(alert))
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 1, am.GetPendingAlertsCount(), "alert should be pending after system goes down")
// Expire the pending alert and process it
am.ForceExpirePendingAlerts()
processed, err := am.ProcessPendingAlerts()
require.NoError(t, err)
assert.Len(t, processed, 1, "one alert should have been processed")
assert.Equal(t, 0, am.GetPendingAlertsCount(), "pending alert should be consumed after processing")
// Verify down email was sent
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "down notification should be sent after delay expires")
// Verify triggered flag is set in the DB
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.True(t, alertRecord.GetBool("triggered"), "alert should be marked triggered after downtime matures")
}
func TestStatusAlertDuplicateDownCallIsIdempotent(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 5)
require.NoError(t, hub.Save(alert))
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
require.NoError(t, am.HandleStatusAlerts("down", system))
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 1, am.GetPendingAlertsCount(), "repeated down calls should not schedule duplicate pending alerts")
}
func TestStatusAlertNoAlertRecord(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systemCollection, err := hub.FindCollectionByNameOrId("systems")
require.NoError(t, err)
system := core.NewRecord(systemCollection)
system.Set("name", "test-system")
system.Set("status", "up")
system.Set("host", "127.0.0.1")
system.Set("users", []string{user.Id})
require.NoError(t, hub.Save(system))
// No Status alert record created for this system
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.HandleStatusAlerts("down", system))
assert.Equal(t, 0, am.GetPendingAlertsCount(), "no pending alert when no alert record exists")
require.NoError(t, am.HandleStatusAlerts("up", system))
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "no email when no alert record exists")
}
func TestRestorePendingStatusAlertsRequeuesDownSystemsAfterRestart(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "down")
require.NoError(t, err)
system := systems[0]
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", false)
alert.Set("min", 1)
require.NoError(t, hub.Save(alert))
initialEmailCount := hub.TestMailer.TotalSend()
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.RestorePendingStatusAlerts())
assert.Equal(t, 1, am.GetPendingAlertsCount(), "startup restore should requeue a pending down alert for a system still marked down")
am.ForceExpirePendingAlerts()
processed, err := am.ProcessPendingAlerts()
require.NoError(t, err)
assert.Len(t, processed, 1, "restored pending alert should be processable after the delay expires")
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "restored pending alert should send the down notification")
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.True(t, alertRecord.GetBool("triggered"), "restored pending alert should mark the alert as triggered once delivered")
}
func TestRestorePendingStatusAlertsSkipsNonDownOrAlreadyTriggeredAlerts(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systemsDown, err := beszelTests.CreateSystems(hub, 2, user.Id, "down")
require.NoError(t, err)
systemDownPending := systemsDown[0]
systemDownTriggered := systemsDown[1]
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "up-system",
"users": []string{user.Id},
"host": "127.0.0.2",
"status": "up",
})
require.NoError(t, err)
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemDownPending.Id,
"user": user.Id,
"min": 1,
"triggered": false,
})
require.NoError(t, err)
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemUp.Id,
"user": user.Id,
"min": 1,
"triggered": false,
})
require.NoError(t, err)
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemDownTriggered.Id,
"user": user.Id,
"min": 1,
"triggered": true,
})
require.NoError(t, err)
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.RestorePendingStatusAlerts())
assert.Equal(t, 1, am.GetPendingAlertsCount(), "only untriggered alerts for currently down systems should be restored")
}
func TestRestorePendingStatusAlertsIsIdempotent(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "down")
require.NoError(t, err)
system := systems[0]
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": 1,
"triggered": false,
})
require.NoError(t, err)
am := alerts.NewTestAlertManagerWithoutWorker(hub)
require.NoError(t, am.RestorePendingStatusAlerts())
require.NoError(t, am.RestorePendingStatusAlerts())
assert.Equal(t, 1, am.GetPendingAlertsCount(), "restoring twice should not create duplicate pending alerts")
am.ForceExpirePendingAlerts()
processed, err := am.ProcessPendingAlerts()
require.NoError(t, err)
assert.Len(t, processed, 1, "restored alert should still be processable exactly once")
assert.Zero(t, am.GetPendingAlertsCount(), "processing the restored alert should empty the pending map")
}
func TestResolveStatusAlertsFixesStaleTriggered(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// CreateSystems uses SaveNoValidate after initial save to bypass the
// onRecordCreate hook that forces status = "pending".
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
system := systems[0]
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
require.NoError(t, err)
alert := core.NewRecord(alertCollection)
alert.Set("user", user.Id)
alert.Set("system", system.Id)
alert.Set("name", "Status")
alert.Set("triggered", true) // Stale: system is up but alert still says triggered
require.NoError(t, hub.Save(alert))
// resolveStatusAlerts should clear the stale triggered flag
require.NoError(t, alerts.ResolveStatusAlerts(hub))
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
require.NoError(t, err)
assert.False(t, alertRecord.GetBool("triggered"), "stale triggered flag should be cleared when system is up")
}
func TestResolveStatusAlerts(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Create a systemUp
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
"status": "up",
})
assert.NoError(t, err)
systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system-2",
"users": []string{user.Id},
"host": "127.0.0.2",
"status": "up",
})
assert.NoError(t, err)
// Create a status alertUp for the system
alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemUp.Id,
"user": user.Id,
"min": 1,
})
assert.NoError(t, err)
alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemDown.Id,
"user": user.Id,
"min": 1,
})
assert.NoError(t, err)
// Verify alert is not triggered initially
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially")
// Set the system to 'up' (this should not trigger the alert)
systemUp.Set("status", "up")
err = hub.SaveNoValidate(systemUp)
assert.NoError(t, err)
systemDown.Set("status", "down")
err = hub.SaveNoValidate(systemDown)
assert.NoError(t, err)
// Wait a moment for any processing
time.Sleep(10 * time.Millisecond)
// Verify alertUp is still not triggered after setting system to up
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up")
// Manually set both alerts triggered to true
alertUp.Set("triggered", true)
err = hub.SaveNoValidate(alertUp)
assert.NoError(t, err)
alertDown.Set("triggered", true)
err = hub.SaveNoValidate(alertDown)
assert.NoError(t, err)
// Verify we have exactly one alert with triggered true
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true")
// Verify the specific alertUp is triggered
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered")
// Verify we have two unresolved alert history records
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records")
err = alerts.ResolveStatusAlerts(hub)
assert.NoError(t, err)
// Verify alertUp is not triggered after resolving
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving")
// Verify alertDown is still triggered
alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id})
assert.NoError(t, err)
assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving")
// Verify we have one unresolved alert history record
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
}
func TestAlertsHistoryStatus(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Create a system
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
assert.NoError(t, err)
system := systems[0]
// Create a status alertRecord for the system
alertRecord, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": 1,
})
assert.NoError(t, err)
// Verify alert is not triggered initially
assert.False(t, alertRecord.GetBool("triggered"), "Alert should not be triggered initially")
// Set the system to 'down' (this should trigger the alert)
system.Set("status", "down")
err = hub.Save(system)
assert.NoError(t, err)
time.Sleep(time.Second * 30)
synctest.Wait()
alertFresh, _ := hub.FindRecordById("alerts", alertRecord.Id)
assert.False(t, alertFresh.GetBool("triggered"), "Alert should not be triggered after 30 seconds")
time.Sleep(time.Minute)
synctest.Wait()
// Verify alert is triggered after setting system to down
alertFresh, err = hub.FindRecordById("alerts", alertRecord.Id)
assert.NoError(t, err)
assert.True(t, alertFresh.GetBool("triggered"), "Alert should be triggered after one minute")
// Verify we have one unresolved alert history record
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
// Set the system back to 'up' (this should resolve the alert)
system.Set("status", "up")
err = hub.Save(system)
assert.NoError(t, err)
time.Sleep(time.Second)
synctest.Wait()
// Verify alert is not triggered after setting system back to up
alertFresh, err = hub.FindRecordById("alerts", alertRecord.Id)
assert.NoError(t, err)
assert.False(t, alertFresh.GetBool("triggered"), "Alert should not be triggered after system recovers")
// Verify the alert history record is resolved
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 0, alertHistoryCount, "Should have no unresolved alert history records")
})
}
func TestStatusAlertClearedBeforeSend(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Create a system
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
assert.NoError(t, err)
system := systems[0]
// Ensure user settings have an email
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
hub.Save(userSettings)
// Initial email count
initialEmailCount := hub.TestMailer.TotalSend()
// Create a status alertRecord for the system
alertRecord, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": 1,
})
assert.NoError(t, err)
// Verify alert is not triggered initially
assert.False(t, alertRecord.GetBool("triggered"), "Alert should not be triggered initially")
// Set the system to 'down' (this should trigger the alert)
system.Set("status", "down")
err = hub.Save(system)
assert.NoError(t, err)
time.Sleep(time.Second * 30)
synctest.Wait()
// Set system back up to clear the pending alert before it triggers
system.Set("status", "up")
err = hub.Save(system)
assert.NoError(t, err)
time.Sleep(time.Minute)
synctest.Wait()
// Verify that we have not sent any emails since the system recovered before the alert triggered
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "No email should be sent if system recovers before alert triggers")
// Verify alert is not triggered after setting system back to up
alertFresh, err := hub.FindRecordById("alerts", alertRecord.Id)
assert.NoError(t, err)
assert.False(t, alertFresh.GetBool("triggered"), "Alert should not be triggered after system recovers")
// Verify that no alert history record was created since the alert never triggered
alertHistoryCount, err := hub.CountRecords("alerts_history")
assert.NoError(t, err)
assert.EqualValues(t, 0, alertHistoryCount, "Should have no unresolved alert history records since alert never triggered")
})
}

View File

@@ -11,15 +11,11 @@ import (
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/core"
"github.com/pocketbase/pocketbase/tools/types"
"github.com/spf13/cast"
)
func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *system.CombinedData) error {
alertRecords, err := am.hub.FindAllRecords("alerts",
dbx.NewExp("system={:system} AND name!='Status'", dbx.Params{"system": systemRecord.Id}),
)
if err != nil || len(alertRecords) == 0 {
// log.Println("no alerts found for system")
alerts := am.alertsCache.GetAlertsExcludingNames(systemRecord.Id, "Status")
if len(alerts) == 0 {
return nil
}
@@ -27,8 +23,8 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
now := systemRecord.GetDateTime("updated").Time().UTC()
oldestTime := now
for _, alertRecord := range alertRecords {
name := alertRecord.GetString("name")
for _, alertData := range alerts {
name := alertData.Name
var val float64
unit := "%"
@@ -38,7 +34,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
case "Memory":
val = data.Info.MemPct
case "Bandwidth":
val = data.Info.Bandwidth
val = float64(data.Info.BandwidthBytes) / (1024 * 1024)
unit = " MB/s"
case "Disk":
maxUsedPct := data.Info.DiskPct
@@ -73,8 +69,8 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
val = float64(data.Stats.Battery[0])
}
triggered := alertRecord.GetBool("triggered")
threshold := alertRecord.GetFloat("value")
triggered := alertData.Triggered
threshold := alertData.Value
// Battery alert has inverted logic: trigger when value is BELOW threshold
lowAlert := isLowAlert(name)
@@ -92,11 +88,11 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
}
}
min := max(1, cast.ToUint8(alertRecord.Get("min")))
min := max(1, alertData.Min)
alert := SystemAlertData{
systemRecord: systemRecord,
alertRecord: alertRecord,
alertData: alertData,
name: name,
unit: unit,
val: val,
@@ -129,7 +125,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
Created types.DateTime `db:"created"`
}{}
err = am.hub.DB().
err := am.hub.DB().
Select("stats", "created").
From("system_stats").
Where(dbx.NewExp(
@@ -192,22 +188,24 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
case "Memory":
alert.val += stats.Mem
case "Bandwidth":
alert.val += stats.NetSent + stats.NetRecv
alert.val += float64(stats.Bandwidth[0]+stats.Bandwidth[1]) / (1024 * 1024)
case "Disk":
if alert.mapSums == nil {
alert.mapSums = make(map[string]float32, len(data.Stats.ExtraFs)+1)
alert.mapSums = make(map[string]float32, len(stats.ExtraFs)+1)
}
// add root disk
if _, ok := alert.mapSums["root"]; !ok {
alert.mapSums["root"] = 0.0
}
alert.mapSums["root"] += float32(stats.Disk)
// add extra disks
for key, fs := range data.Stats.ExtraFs {
if _, ok := alert.mapSums[key]; !ok {
alert.mapSums[key] = 0.0
// add extra disks from historical record
for key, fs := range stats.ExtraFs {
if fs.DiskTotal > 0 {
if _, ok := alert.mapSums[key]; !ok {
alert.mapSums[key] = 0.0
}
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
}
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
}
case "Temperature":
if alert.mapSums == nil {
@@ -342,13 +340,12 @@ func (am *AlertManager) sendSystemAlert(alert SystemAlertData) {
}
body := fmt.Sprintf("%s averaged %.2f%s for the previous %v %s.", alert.descriptor, alert.val, alert.unit, alert.min, minutesLabel)
alert.alertRecord.Set("triggered", alert.triggered)
if err := am.hub.Save(alert.alertRecord); err != nil {
if err := am.setAlertTriggered(alert.alertData, alert.triggered); err != nil {
// app.Logger().Error("failed to save alert record", "err", err)
return
}
am.SendAlert(AlertMessageData{
UserID: alert.alertRecord.GetString("user"),
UserID: alert.alertData.UserID,
SystemID: alert.systemRecord.Id,
Title: subject,
Message: body,

View File

@@ -0,0 +1,218 @@
//go:build testing
package alerts_test
import (
"testing"
"testing/synctest"
"time"
"github.com/henrygd/beszel/internal/entities/system"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type systemAlertValueSetter[T any] func(info *system.Info, stats *system.Stats, value T)
type systemAlertTestFixture struct {
hub *beszelTests.TestHub
alertID string
submit func(*system.CombinedData) error
}
func createCombinedData[T any](value T, setValue systemAlertValueSetter[T]) *system.CombinedData {
var data system.CombinedData
setValue(&data.Info, &data.Stats, value)
return &data
}
func newSystemAlertTestFixture(t *testing.T, alertName string, min int, threshold float64) *systemAlertTestFixture {
t.Helper()
hub, user := beszelTests.GetHubWithUser(t)
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
require.NoError(t, err)
systemRecord := systems[0]
sysManagerSystem, err := hub.GetSystemManager().GetSystemFromStore(systemRecord.Id)
require.NoError(t, err)
require.NotNil(t, sysManagerSystem)
sysManagerSystem.StopUpdater()
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
require.NoError(t, err)
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
require.NoError(t, hub.Save(userSettings))
alertRecord, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": alertName,
"system": systemRecord.Id,
"user": user.Id,
"min": min,
"value": threshold,
})
require.NoError(t, err)
assert.False(t, alertRecord.GetBool("triggered"), "Alert should not be triggered initially")
alertsCache := hub.GetAlertManager().GetSystemAlertsCache()
cachedAlerts := alertsCache.GetAlertsExcludingNames(systemRecord.Id, "Status")
assert.Len(t, cachedAlerts, 1, "Alert should be in cache")
return &systemAlertTestFixture{
hub: hub,
alertID: alertRecord.Id,
submit: func(data *system.CombinedData) error {
_, err := sysManagerSystem.CreateRecords(data)
return err
},
}
}
func (fixture *systemAlertTestFixture) cleanup() {
fixture.hub.Cleanup()
}
func submitValue[T any](fixture *systemAlertTestFixture, t *testing.T, value T, setValue systemAlertValueSetter[T]) {
t.Helper()
require.NoError(t, fixture.submit(createCombinedData(value, setValue)))
}
func (fixture *systemAlertTestFixture) assertTriggered(t *testing.T, triggered bool, message string) {
t.Helper()
alertRecord, err := fixture.hub.FindRecordById("alerts", fixture.alertID)
require.NoError(t, err)
assert.Equal(t, triggered, alertRecord.GetBool("triggered"), message)
}
func waitForSystemAlert(d time.Duration) {
time.Sleep(d)
synctest.Wait()
}
func testOneMinuteSystemAlert[T any](t *testing.T, alertName string, threshold float64, setValue systemAlertValueSetter[T], triggerValue, resolveValue T) {
t.Helper()
synctest.Test(t, func(t *testing.T) {
fixture := newSystemAlertTestFixture(t, alertName, 1, threshold)
defer fixture.cleanup()
submitValue(fixture, t, triggerValue, setValue)
waitForSystemAlert(time.Second)
fixture.assertTriggered(t, true, "Alert should be triggered")
assert.Equal(t, 1, fixture.hub.TestMailer.TotalSend(), "An email should have been sent")
submitValue(fixture, t, resolveValue, setValue)
waitForSystemAlert(time.Second)
fixture.assertTriggered(t, false, "Alert should be untriggered")
assert.Equal(t, 2, fixture.hub.TestMailer.TotalSend(), "A second email should have been sent for untriggering the alert")
waitForSystemAlert(time.Minute)
})
}
func testMultiMinuteSystemAlert[T any](t *testing.T, alertName string, threshold float64, min int, setValue systemAlertValueSetter[T], baselineValue, triggerValue, resolveValue T) {
t.Helper()
synctest.Test(t, func(t *testing.T) {
fixture := newSystemAlertTestFixture(t, alertName, min, threshold)
defer fixture.cleanup()
submitValue(fixture, t, baselineValue, setValue)
waitForSystemAlert(time.Minute + time.Second)
fixture.assertTriggered(t, false, "Alert should not be triggered yet")
submitValue(fixture, t, triggerValue, setValue)
waitForSystemAlert(time.Minute)
fixture.assertTriggered(t, false, "Alert should not be triggered until the history window is full")
submitValue(fixture, t, triggerValue, setValue)
waitForSystemAlert(time.Second)
fixture.assertTriggered(t, true, "Alert should be triggered")
assert.Equal(t, 1, fixture.hub.TestMailer.TotalSend(), "An email should have been sent")
submitValue(fixture, t, resolveValue, setValue)
waitForSystemAlert(time.Second)
fixture.assertTriggered(t, false, "Alert should be untriggered")
assert.Equal(t, 2, fixture.hub.TestMailer.TotalSend(), "A second email should have been sent for untriggering the alert")
})
}
func setCPUAlertValue(info *system.Info, stats *system.Stats, value float64) {
info.Cpu = value
stats.Cpu = value
}
func setMemoryAlertValue(info *system.Info, stats *system.Stats, value float64) {
info.MemPct = value
stats.MemPct = value
}
func setDiskAlertValue(info *system.Info, stats *system.Stats, value float64) {
info.DiskPct = value
stats.DiskPct = value
}
func setBandwidthAlertValue(info *system.Info, stats *system.Stats, value [2]uint64) {
info.BandwidthBytes = value[0] + value[1]
stats.Bandwidth = value
}
func megabytesToBytes(mb uint64) uint64 {
return mb * 1024 * 1024
}
func setGPUAlertValue(info *system.Info, stats *system.Stats, value float64) {
info.GpuPct = value
stats.GPUData = map[string]system.GPUData{
"GPU0": {Usage: value},
}
}
func setTemperatureAlertValue(info *system.Info, stats *system.Stats, value float64) {
info.DashboardTemp = value
stats.Temperatures = map[string]float64{
"Temp0": value,
}
}
func setLoadAvgAlertValue(info *system.Info, stats *system.Stats, value [3]float64) {
info.LoadAvg = value
stats.LoadAvg = value
}
func setBatteryAlertValue(info *system.Info, stats *system.Stats, value [2]uint8) {
info.Battery = value
stats.Battery = value
}
func TestSystemAlertsOneMin(t *testing.T) {
testOneMinuteSystemAlert(t, "CPU", 50, setCPUAlertValue, 51, 49)
testOneMinuteSystemAlert(t, "Memory", 50, setMemoryAlertValue, 51, 49)
testOneMinuteSystemAlert(t, "Disk", 50, setDiskAlertValue, 51, 49)
testOneMinuteSystemAlert(t, "Bandwidth", 50, setBandwidthAlertValue, [2]uint64{megabytesToBytes(26), megabytesToBytes(25)}, [2]uint64{megabytesToBytes(25), megabytesToBytes(24)})
testOneMinuteSystemAlert(t, "GPU", 50, setGPUAlertValue, 51, 49)
testOneMinuteSystemAlert(t, "Temperature", 70, setTemperatureAlertValue, 71, 69)
testOneMinuteSystemAlert(t, "LoadAvg1", 4, setLoadAvgAlertValue, [3]float64{4.1, 0, 0}, [3]float64{3.9, 0, 0})
testOneMinuteSystemAlert(t, "LoadAvg5", 4, setLoadAvgAlertValue, [3]float64{0, 4.1, 0}, [3]float64{0, 3.9, 0})
testOneMinuteSystemAlert(t, "LoadAvg15", 4, setLoadAvgAlertValue, [3]float64{0, 0, 4.1}, [3]float64{0, 0, 3.9})
testOneMinuteSystemAlert(t, "Battery", 20, setBatteryAlertValue, [2]uint8{19, 0}, [2]uint8{21, 0})
}
func TestSystemAlertsTwoMin(t *testing.T) {
testMultiMinuteSystemAlert(t, "CPU", 50, 2, setCPUAlertValue, 10, 51, 48)
testMultiMinuteSystemAlert(t, "Memory", 50, 2, setMemoryAlertValue, 10, 51, 48)
testMultiMinuteSystemAlert(t, "Disk", 50, 2, setDiskAlertValue, 10, 51, 48)
testMultiMinuteSystemAlert(t, "Bandwidth", 50, 2, setBandwidthAlertValue, [2]uint64{megabytesToBytes(10), megabytesToBytes(10)}, [2]uint64{megabytesToBytes(26), megabytesToBytes(25)}, [2]uint64{megabytesToBytes(10), megabytesToBytes(10)})
testMultiMinuteSystemAlert(t, "GPU", 50, 2, setGPUAlertValue, 10, 51, 48)
testMultiMinuteSystemAlert(t, "Temperature", 70, 2, setTemperatureAlertValue, 10, 71, 67)
testMultiMinuteSystemAlert(t, "LoadAvg1", 4, 2, setLoadAvgAlertValue, [3]float64{0, 0, 0}, [3]float64{4.1, 0, 0}, [3]float64{3.5, 0, 0})
testMultiMinuteSystemAlert(t, "LoadAvg5", 4, 2, setLoadAvgAlertValue, [3]float64{0, 2, 0}, [3]float64{0, 4.1, 0}, [3]float64{0, 3.5, 0})
testMultiMinuteSystemAlert(t, "LoadAvg15", 4, 2, setLoadAvgAlertValue, [3]float64{0, 0, 2}, [3]float64{0, 0, 4.1}, [3]float64{0, 0, 3.5})
testMultiMinuteSystemAlert(t, "Battery", 20, 2, setBatteryAlertValue, [2]uint8{21, 0}, [2]uint8{19, 0}, [2]uint8{25, 1})
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package alerts_test
@@ -13,9 +12,9 @@ import (
"testing/synctest"
"time"
"github.com/henrygd/beszel/internal/alerts"
beszelTests "github.com/henrygd/beszel/internal/tests"
"github.com/henrygd/beszel/internal/alerts"
"github.com/pocketbase/dbx"
"github.com/pocketbase/pocketbase/core"
pbTests "github.com/pocketbase/pocketbase/tests"
@@ -370,87 +369,6 @@ func TestUserAlertsApi(t *testing.T) {
}
}
func TestStatusAlerts(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused")
assert.NoError(t, err)
var alerts []*core.Record
for i, system := range systems {
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": system.Id,
"user": user.Id,
"min": i + 1,
})
assert.NoError(t, err)
alerts = append(alerts, alert)
}
time.Sleep(10 * time.Millisecond)
for _, alert := range alerts {
assert.False(t, alert.GetBool("triggered"), "Alert should not be triggered immediately")
}
if hub.TestMailer.TotalSend() != 0 {
assert.Zero(t, hub.TestMailer.TotalSend(), "Expected 0 messages, got %d", hub.TestMailer.TotalSend())
}
for _, system := range systems {
assert.EqualValues(t, "paused", system.GetString("status"), "System should be paused")
}
for _, system := range systems {
system.Set("status", "up")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
time.Sleep(time.Second)
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
for _, system := range systems {
system.Set("status", "down")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
// after 30 seconds, should have 4 alerts in the pendingAlerts map, no triggered alerts
time.Sleep(time.Second * 30)
assert.EqualValues(t, 4, hub.GetPendingAlertsCount(), "should have 4 alerts in the pendingAlerts map")
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 0, triggeredCount, "should have 0 alert triggered")
assert.EqualValues(t, 0, hub.TestMailer.TotalSend(), "should have 0 messages sent")
// after 1:30 seconds, should have 1 triggered alert and 3 pending alerts
time.Sleep(time.Second * 60)
assert.EqualValues(t, 3, hub.GetPendingAlertsCount(), "should have 3 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 1, triggeredCount, "should have 1 alert triggered")
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 messages sent")
// after 2:30 seconds, should have 2 triggered alerts and 2 pending alerts
time.Sleep(time.Second * 60)
assert.EqualValues(t, 2, hub.GetPendingAlertsCount(), "should have 2 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 2, triggeredCount, "should have 2 alert triggered")
assert.EqualValues(t, 2, hub.TestMailer.TotalSend(), "should have 2 messages sent")
// now we will bring the remaning systems back up
for _, system := range systems {
system.Set("status", "up")
err = hub.SaveNoValidate(system)
assert.NoError(t, err)
}
time.Sleep(time.Second)
// should have 0 alerts in the pendingAlerts map and 0 alerts triggered
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.Zero(t, triggeredCount, "should have 0 alert triggered")
// 4 messages sent, 2 down alerts and 2 up alerts for first 2 systems
assert.EqualValues(t, 4, hub.TestMailer.TotalSend(), "should have 4 messages sent")
})
}
func TestAlertsHistory(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
@@ -579,102 +497,46 @@ func TestAlertsHistory(t *testing.T) {
assert.EqualValues(t, 2, totalHistoryCount, "Should have 2 total alert history records")
})
}
func TestResolveStatusAlerts(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
func TestSetAlertTriggered(t *testing.T) {
hub, _ := beszelTests.NewTestHub(t.TempDir())
defer hub.Cleanup()
// Create a systemUp
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
"status": "up",
hub.StartHub()
user, _ := beszelTests.CreateUser(hub, "test@example.com", "password")
system, _ := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
})
assert.NoError(t, err)
systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system-2",
"users": []string{user.Id},
"host": "127.0.0.2",
"status": "up",
alertRecord, _ := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "CPU",
"system": system.Id,
"user": user.Id,
"value": 80,
"triggered": false,
})
am := alerts.NewAlertManager(hub)
var alert alerts.CachedAlertData
alert.PopulateFromRecord(alertRecord)
// Test triggering the alert
err := am.SetAlertTriggered(alert, true)
assert.NoError(t, err)
// Create a status alertUp for the system
alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemUp.Id,
"user": user.Id,
"min": 1,
})
updatedRecord, err := hub.FindRecordById("alerts", alert.Id)
assert.NoError(t, err)
assert.True(t, updatedRecord.GetBool("triggered"))
// Test un-triggering the alert
err = am.SetAlertTriggered(alert, false)
assert.NoError(t, err)
alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
"name": "Status",
"system": systemDown.Id,
"user": user.Id,
"min": 1,
})
updatedRecord, err = hub.FindRecordById("alerts", alert.Id)
assert.NoError(t, err)
// Verify alert is not triggered initially
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially")
// Set the system to 'up' (this should not trigger the alert)
systemUp.Set("status", "up")
err = hub.SaveNoValidate(systemUp)
assert.NoError(t, err)
systemDown.Set("status", "down")
err = hub.SaveNoValidate(systemDown)
assert.NoError(t, err)
// Wait a moment for any processing
time.Sleep(10 * time.Millisecond)
// Verify alertUp is still not triggered after setting system to up
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up")
// Manually set both alerts triggered to true
alertUp.Set("triggered", true)
err = hub.SaveNoValidate(alertUp)
assert.NoError(t, err)
alertDown.Set("triggered", true)
err = hub.SaveNoValidate(alertDown)
assert.NoError(t, err)
// Verify we have exactly one alert with triggered true
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
assert.NoError(t, err)
assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true")
// Verify the specific alertUp is triggered
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered")
// Verify we have two unresolved alert history records
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records")
err = alerts.ResolveStatusAlerts(hub)
assert.NoError(t, err)
// Verify alertUp is not triggered after resolving
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
assert.NoError(t, err)
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving")
// Verify alertDown is still triggered
alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id})
assert.NoError(t, err)
assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving")
// Verify we have one unresolved alert history record
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
assert.NoError(t, err)
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
assert.False(t, updatedRecord.GetBool("triggered"))
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package alerts
@@ -10,6 +9,18 @@ import (
"github.com/pocketbase/pocketbase/core"
)
func NewTestAlertManagerWithoutWorker(app hubLike) *AlertManager {
return &AlertManager{
hub: app,
alertsCache: NewAlertsCache(app),
}
}
// GetSystemAlertsCache returns the internal system alerts cache.
func (am *AlertManager) GetSystemAlertsCache() *AlertsCache {
return am.alertsCache
}
func (am *AlertManager) GetAlertManager() *AlertManager {
return am
}
@@ -28,19 +39,18 @@ func (am *AlertManager) GetPendingAlertsCount() int {
}
// ProcessPendingAlerts manually processes all expired alerts (for testing)
func (am *AlertManager) ProcessPendingAlerts() ([]*core.Record, error) {
func (am *AlertManager) ProcessPendingAlerts() ([]CachedAlertData, error) {
now := time.Now()
var lastErr error
var processedAlerts []*core.Record
var processedAlerts []CachedAlertData
am.pendingAlerts.Range(func(key, value any) bool {
info := value.(*alertInfo)
if now.After(info.expireTime) {
// Downtime delay has passed, process alert
if err := am.sendStatusAlert("down", info.systemName, info.alertRecord); err != nil {
lastErr = err
if info.timer != nil {
info.timer.Stop()
}
processedAlerts = append(processedAlerts, info.alertRecord)
am.pendingAlerts.Delete(key)
am.processPendingAlert(key.(string))
processedAlerts = append(processedAlerts, info.alertData)
}
return true
})
@@ -57,6 +67,31 @@ func (am *AlertManager) ForceExpirePendingAlerts() {
})
}
func (am *AlertManager) ResetPendingAlertTimer(alertID string, delay time.Duration) bool {
value, loaded := am.pendingAlerts.Load(alertID)
if !loaded {
return false
}
info := value.(*alertInfo)
if info.timer != nil {
info.timer.Stop()
}
info.expireTime = time.Now().Add(delay)
info.timer = time.AfterFunc(delay, func() {
am.processPendingAlert(alertID)
})
return true
}
func ResolveStatusAlerts(app core.App) error {
return resolveStatusAlerts(app)
}
func (am *AlertManager) RestorePendingStatusAlerts() error {
return am.restorePendingStatusAlerts()
}
func (am *AlertManager) SetAlertTriggered(alert CachedAlertData, triggered bool) error {
return am.setAlertTriggered(alert, triggered)
}

View File

@@ -9,6 +9,7 @@ import (
"github.com/henrygd/beszel"
"github.com/henrygd/beszel/agent"
"github.com/henrygd/beszel/agent/health"
"github.com/henrygd/beszel/agent/utils"
"github.com/spf13/pflag"
"golang.org/x/crypto/ssh"
)
@@ -31,9 +32,6 @@ func (opts *cmdOptions) parse() bool {
// Subcommands that don't require any pflag parsing
switch subcommand {
case "-v", "version":
fmt.Println(beszel.AppName+"-agent", beszel.Version)
return true
case "health":
err := health.Check()
if err != nil {
@@ -41,6 +39,9 @@ func (opts *cmdOptions) parse() bool {
}
fmt.Print("ok")
return true
case "fingerprint":
handleFingerprint()
return true
}
// pflag.CommandLine.ParseErrorsWhitelist.UnknownFlags = true
@@ -49,6 +50,7 @@ func (opts *cmdOptions) parse() bool {
pflag.StringVarP(&opts.hubURL, "url", "u", "", "URL of the Beszel hub")
pflag.StringVarP(&opts.token, "token", "t", "", "Token to use for authentication")
chinaMirrors := pflag.BoolP("china-mirrors", "c", false, "Use mirror for update (gh.beszel.dev) instead of GitHub")
version := pflag.BoolP("version", "v", false, "Show version information")
help := pflag.BoolP("help", "h", false, "Show this help message")
// Convert old single-dash long flags to double-dash for backward compatibility
@@ -73,9 +75,9 @@ func (opts *cmdOptions) parse() bool {
builder.WriteString(os.Args[0])
builder.WriteString(" [command] [flags]\n")
builder.WriteString("\nCommands:\n")
builder.WriteString(" health Check if the agent is running\n")
// builder.WriteString(" help Display this help message\n")
builder.WriteString(" update Update to the latest version\n")
builder.WriteString(" fingerprint View or reset the agent fingerprint\n")
builder.WriteString(" health Check if the agent is running\n")
builder.WriteString(" update Update to the latest version\n")
builder.WriteString("\nFlags:\n")
fmt.Print(builder.String())
pflag.PrintDefaults()
@@ -86,6 +88,9 @@ func (opts *cmdOptions) parse() bool {
// Must run after pflag.Parse()
switch {
case *version:
fmt.Println(beszel.AppName+"-agent", beszel.Version)
return true
case *help || subcommand == "help":
pflag.Usage()
return true
@@ -112,12 +117,12 @@ func (opts *cmdOptions) loadPublicKeys() ([]ssh.PublicKey, error) {
}
// Try environment variable
if key, ok := agent.GetEnv("KEY"); ok && key != "" {
if key, ok := utils.GetEnv("KEY"); ok && key != "" {
return agent.ParseKeys(key)
}
// Try key file
keyFile, ok := agent.GetEnv("KEY_FILE")
keyFile, ok := utils.GetEnv("KEY_FILE")
if !ok {
return nil, fmt.Errorf("no key provided: must set -key flag, KEY env var, or KEY_FILE env var. Use 'beszel-agent help' for usage")
}
@@ -133,6 +138,38 @@ func (opts *cmdOptions) getAddress() string {
return agent.GetAddress(opts.listen)
}
// handleFingerprint handles the "fingerprint" command with subcommands "view" and "reset".
func handleFingerprint() {
subCmd := ""
if len(os.Args) > 2 {
subCmd = os.Args[2]
}
switch subCmd {
case "", "view":
dataDir, _ := agent.GetDataDir()
fp := agent.GetFingerprint(dataDir, "", "")
fmt.Println(fp)
case "help", "-h", "--help":
fmt.Print(fingerprintUsage())
case "reset":
dataDir, err := agent.GetDataDir()
if err != nil {
log.Fatal(err)
}
if err := agent.DeleteFingerprint(dataDir); err != nil {
log.Fatal(err)
}
fmt.Println("Fingerprint reset. A new one will be generated on next start.")
default:
log.Fatalf("Unknown command: %q\n\n%s", subCmd, fingerprintUsage())
}
}
func fingerprintUsage() string {
return fmt.Sprintf("Usage: %s fingerprint [view|reset]\n\nCommands:\n view Print fingerprint (default)\n reset Reset saved fingerprint\n", os.Args[0])
}
func main() {
var opts cmdOptions
subcommandHandled := opts.parse()

View File

@@ -23,6 +23,9 @@ COPY --from=builder /agent /agent
# this is so we don't need to create the /tmp directory in the scratch container
COPY --from=builder /tmp /tmp
# AMD GPU name lookup (used by agent on Linux when /usr/share/libdrm/amdgpu.ids is read)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
# Ensure data persistence across container recreations
VOLUME ["/var/lib/beszel-agent"]

View File

@@ -20,6 +20,9 @@ RUN rm -rf /tmp/*
FROM alpine:3.23
COPY --from=builder /agent /agent
# AMD GPU name lookup (used by agent on Linux when /usr/share/libdrm/amdgpu.ids is read)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
RUN apk add --no-cache smartmontools
# Ensure data persistence across container recreations

View File

@@ -37,6 +37,9 @@ RUN apt-get update && apt-get install -y \
FROM nvidia/cuda:12.2.2-base-ubuntu22.04
COPY --from=builder /agent /agent
# AMD GPU name lookup (used by agent on hybrid laptops when /usr/share/libdrm/amdgpu.ids is read)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
# Copy smartmontools binaries and config files
COPY --from=smartmontools-builder /usr/sbin/smartctl /usr/sbin/smartctl

View File

@@ -10,10 +10,19 @@ type ApiInfo struct {
Status string
State string
Image string
Health struct {
Status string
// FailingStreak int
}
Ports []struct {
// PrivatePort uint16
PublicPort uint16
IP string
// Type string
}
// ImageID string
// Command string
// Created int64
// Ports []Port
// SizeRw int64 `json:",omitempty"`
// SizeRootFs int64 `json:",omitempty"`
// Labels map[string]string
@@ -140,6 +149,7 @@ type Stats struct {
Status string `json:"-" cbor:"6,keyasint"`
Id string `json:"-" cbor:"7,keyasint"`
Image string `json:"-" cbor:"8,keyasint"`
Ports string `json:"-" cbor:"10,keyasint"`
// PrevCpu [2]uint64 `json:"-"`
CpuSystem uint64 `json:"-"`
CpuContainer uint64 `json:"-"`

View File

@@ -143,8 +143,8 @@ type AtaDeviceStatisticsPage struct {
}
type AtaDeviceStatisticsEntry struct {
Name string `json:"name"`
Value *uint64 `json:"value,omitempty"`
Name string `json:"name"`
Value *int64 `json:"value,omitempty"`
}
type AtaSmartAttribute struct {
@@ -356,8 +356,8 @@ type SmartInfoForSata struct {
SmartStatus SmartStatusInfo `json:"smart_status"`
// AtaSmartData AtaSmartData `json:"ata_smart_data"`
// AtaSctCapabilities AtaSctCapabilities `json:"ata_sct_capabilities"`
AtaSmartAttributes AtaSmartAttributes `json:"ata_smart_attributes"`
AtaDeviceStatistics AtaDeviceStatistics `json:"ata_device_statistics"`
AtaSmartAttributes AtaSmartAttributes `json:"ata_smart_attributes"`
AtaDeviceStatistics json.RawMessage `json:"ata_device_statistics"`
// PowerOnTime PowerOnTimeInfo `json:"power_on_time"`
// PowerCycleCount uint16 `json:"power_cycle_count"`
Temperature TemperatureInfo `json:"temperature"`

View File

@@ -12,8 +12,9 @@ import (
type Stats struct {
Cpu float64 `json:"cpu" cbor:"0,keyasint"`
MaxCpu float64 `json:"cpum,omitempty" cbor:"1,keyasint,omitempty"`
MaxCpu float64 `json:"cpum,omitempty" cbor:"-"`
Mem float64 `json:"m" cbor:"2,keyasint"`
MaxMem float64 `json:"mm,omitempty" cbor:"-"`
MemUsed float64 `json:"mu" cbor:"3,keyasint"`
MemPct float64 `json:"mp" cbor:"4,keyasint"`
MemBuffCache float64 `json:"mb" cbor:"5,keyasint"`
@@ -23,26 +24,25 @@ type Stats struct {
DiskTotal float64 `json:"d" cbor:"9,keyasint"`
DiskUsed float64 `json:"du" cbor:"10,keyasint"`
DiskPct float64 `json:"dp" cbor:"11,keyasint"`
DiskReadPs float64 `json:"dr" cbor:"12,keyasint"`
DiskWritePs float64 `json:"dw" cbor:"13,keyasint"`
MaxDiskReadPs float64 `json:"drm,omitempty" cbor:"14,keyasint,omitempty"`
MaxDiskWritePs float64 `json:"dwm,omitempty" cbor:"15,keyasint,omitempty"`
DiskReadPs float64 `json:"dr,omitzero" cbor:"12,keyasint,omitzero"`
DiskWritePs float64 `json:"dw,omitzero" cbor:"13,keyasint,omitzero"`
MaxDiskReadPs float64 `json:"drm,omitempty" cbor:"-"`
MaxDiskWritePs float64 `json:"dwm,omitempty" cbor:"-"`
NetworkSent float64 `json:"ns,omitzero" cbor:"16,keyasint,omitzero"`
NetworkRecv float64 `json:"nr,omitzero" cbor:"17,keyasint,omitzero"`
MaxNetworkSent float64 `json:"nsm,omitempty" cbor:"18,keyasint,omitempty"`
MaxNetworkRecv float64 `json:"nrm,omitempty" cbor:"19,keyasint,omitempty"`
MaxNetworkSent float64 `json:"nsm,omitempty" cbor:"-"`
MaxNetworkRecv float64 `json:"nrm,omitempty" cbor:"-"`
Temperatures map[string]float64 `json:"t,omitempty" cbor:"20,keyasint,omitempty"`
ExtraFs map[string]*FsStats `json:"efs,omitempty" cbor:"21,keyasint,omitempty"`
GPUData map[string]GPUData `json:"g,omitempty" cbor:"22,keyasint,omitempty"`
LoadAvg1 float64 `json:"l1,omitempty" cbor:"23,keyasint,omitempty"`
LoadAvg5 float64 `json:"l5,omitempty" cbor:"24,keyasint,omitempty"`
LoadAvg15 float64 `json:"l15,omitempty" cbor:"25,keyasint,omitempty"`
Bandwidth [2]uint64 `json:"b,omitzero" cbor:"26,keyasint,omitzero"` // [sent bytes, recv bytes]
MaxBandwidth [2]uint64 `json:"bm,omitzero" cbor:"27,keyasint,omitzero"` // [sent bytes, recv bytes]
// LoadAvg1 float64 `json:"l1,omitempty" cbor:"23,keyasint,omitempty"`
// LoadAvg5 float64 `json:"l5,omitempty" cbor:"24,keyasint,omitempty"`
// LoadAvg15 float64 `json:"l15,omitempty" cbor:"25,keyasint,omitempty"`
Bandwidth [2]uint64 `json:"b,omitzero" cbor:"26,keyasint,omitzero"` // [sent bytes, recv bytes]
MaxBandwidth [2]uint64 `json:"bm,omitzero" cbor:"-"` // [sent bytes, recv bytes]
// TODO: remove other load fields in future release in favor of load avg array
LoadAvg [3]float64 `json:"la,omitempty" cbor:"28,keyasint"`
Battery [2]uint8 `json:"bat,omitzero" cbor:"29,keyasint,omitzero"` // [percent, charge state, current]
MaxMem float64 `json:"mm,omitempty" cbor:"30,keyasint,omitempty"`
Battery [2]uint8 `json:"bat,omitzero" cbor:"29,keyasint,omitzero"` // [percent, charge state, current]
NetworkInterfaces map[string][4]uint64 `json:"ni,omitempty" cbor:"31,keyasint,omitempty"` // [upload bytes, download bytes, total upload, total download]
DiskIO [2]uint64 `json:"dio,omitzero" cbor:"32,keyasint,omitzero"` // [read bytes, write bytes]
MaxDiskIO [2]uint64 `json:"diom,omitzero" cbor:"-"` // [max read bytes, max write bytes]
@@ -90,8 +90,8 @@ type FsStats struct {
TotalWrite uint64 `json:"-"`
DiskReadPs float64 `json:"r" cbor:"2,keyasint"`
DiskWritePs float64 `json:"w" cbor:"3,keyasint"`
MaxDiskReadPS float64 `json:"rm,omitempty" cbor:"4,keyasint,omitempty"`
MaxDiskWritePS float64 `json:"wm,omitempty" cbor:"5,keyasint,omitempty"`
MaxDiskReadPS float64 `json:"rm,omitempty" cbor:"-"`
MaxDiskWritePS float64 `json:"wm,omitempty" cbor:"-"`
// TODO: remove DiskReadPs and DiskWritePs in future release in favor of DiskReadBytes and DiskWriteBytes
DiskReadBytes uint64 `json:"rb" cbor:"6,keyasint,omitempty"`
DiskWriteBytes uint64 `json:"wb" cbor:"7,keyasint,omitempty"`
@@ -129,23 +129,23 @@ type Info struct {
KernelVersion string `json:"k,omitempty" cbor:"1,keyasint,omitempty"` // deprecated - moved to Details struct
Cores int `json:"c,omitzero" cbor:"2,keyasint,omitzero"` // deprecated - moved to Details struct
// Threads is needed in Info struct to calculate load average thresholds
Threads int `json:"t,omitempty" cbor:"3,keyasint,omitempty"`
CpuModel string `json:"m,omitempty" cbor:"4,keyasint,omitempty"` // deprecated - moved to Details struct
Uptime uint64 `json:"u" cbor:"5,keyasint"`
Cpu float64 `json:"cpu" cbor:"6,keyasint"`
MemPct float64 `json:"mp" cbor:"7,keyasint"`
DiskPct float64 `json:"dp" cbor:"8,keyasint"`
Bandwidth float64 `json:"b" cbor:"9,keyasint"`
AgentVersion string `json:"v" cbor:"10,keyasint"`
Podman bool `json:"p,omitempty" cbor:"11,keyasint,omitempty"` // deprecated - moved to Details struct
GpuPct float64 `json:"g,omitempty" cbor:"12,keyasint,omitempty"`
DashboardTemp float64 `json:"dt,omitempty" cbor:"13,keyasint,omitempty"`
Os Os `json:"os,omitempty" cbor:"14,keyasint,omitempty"` // deprecated - moved to Details struct
LoadAvg1 float64 `json:"l1,omitempty" cbor:"15,keyasint,omitempty"` // deprecated - use `la` array instead
LoadAvg5 float64 `json:"l5,omitempty" cbor:"16,keyasint,omitempty"` // deprecated - use `la` array instead
LoadAvg15 float64 `json:"l15,omitempty" cbor:"17,keyasint,omitempty"` // deprecated - use `la` array instead
BandwidthBytes uint64 `json:"bb" cbor:"18,keyasint"`
Threads int `json:"t,omitempty" cbor:"3,keyasint,omitempty"`
CpuModel string `json:"m,omitempty" cbor:"4,keyasint,omitempty"` // deprecated - moved to Details struct
Uptime uint64 `json:"u" cbor:"5,keyasint"`
Cpu float64 `json:"cpu" cbor:"6,keyasint"`
MemPct float64 `json:"mp" cbor:"7,keyasint"`
DiskPct float64 `json:"dp" cbor:"8,keyasint"`
Bandwidth float64 `json:"b,omitzero" cbor:"9,keyasint"` // deprecated in favor of BandwidthBytes
AgentVersion string `json:"v" cbor:"10,keyasint"`
Podman bool `json:"p,omitempty" cbor:"11,keyasint,omitempty"` // deprecated - moved to Details struct
GpuPct float64 `json:"g,omitempty" cbor:"12,keyasint,omitempty"`
DashboardTemp float64 `json:"dt,omitempty" cbor:"13,keyasint,omitempty"`
Os Os `json:"os,omitempty" cbor:"14,keyasint,omitempty"` // deprecated - moved to Details struct
// LoadAvg1 float64 `json:"l1,omitempty" cbor:"15,keyasint,omitempty"` // deprecated - use `la` array instead
// LoadAvg5 float64 `json:"l5,omitempty" cbor:"16,keyasint,omitempty"` // deprecated - use `la` array instead
// LoadAvg15 float64 `json:"l15,omitempty" cbor:"17,keyasint,omitempty"` // deprecated - use `la` array instead
BandwidthBytes uint64 `json:"bb" cbor:"18,keyasint"`
LoadAvg [3]float64 `json:"la,omitempty" cbor:"19,keyasint"`
ConnectionType ConnectionType `json:"ct,omitempty" cbor:"20,keyasint,omitempty,omitzero"`
ExtraFsPct map[string]float64 `json:"efs,omitempty" cbor:"21,keyasint,omitempty"`

View File

@@ -34,7 +34,7 @@ func ColorPrint(color, text string) {
fmt.Println(color + text + colorReset)
}
func ColorPrintf(color, format string, args ...interface{}) {
func ColorPrintf(color, format string, args ...any) {
fmt.Printf(color+format+colorReset+"\n", args...)
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package hub
@@ -10,6 +9,7 @@ import (
"net/http/httptest"
"os"
"path/filepath"
"runtime"
"strings"
"testing"
"time"
@@ -35,6 +35,26 @@ func createTestHub(t testing.TB) (*Hub, *pbtests.TestApp, error) {
return NewHub(testApp), testApp, nil
}
// cleanupTestHub stops background system goroutines before tearing down the app.
func cleanupTestHub(hub *Hub, testApp *pbtests.TestApp) {
if hub != nil {
sm := hub.GetSystemManager()
sm.RemoveAllSystems()
// Give updater goroutines a brief window to observe cancellation before DB teardown.
for range 20 {
if sm.GetSystemCount() == 0 {
break
}
runtime.Gosched()
time.Sleep(5 * time.Millisecond)
}
time.Sleep(20 * time.Millisecond)
}
if testApp != nil {
testApp.Cleanup()
}
}
// Helper function to create a test record
func createTestRecord(app core.App, collection string, data map[string]any) (*core.Record, error) {
col, err := app.FindCachedCollectionByNameOrId(collection)
@@ -64,7 +84,7 @@ func TestValidateAgentHeaders(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
testCases := []struct {
name string
@@ -145,7 +165,7 @@ func TestGetAllFingerprintRecordsByToken(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// create test user
userRecord, err := createTestUser(testApp)
@@ -235,7 +255,7 @@ func TestSetFingerprint(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// Create test user
userRecord, err := createTestUser(testApp)
@@ -315,7 +335,7 @@ func TestCreateSystemFromAgentData(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// Create test user
userRecord, err := createTestUser(testApp)
@@ -425,7 +445,7 @@ func TestUniversalTokenFlow(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer testApp.Cleanup()
defer cleanupTestHub(nil, testApp)
// Create test user
userRecord, err := createTestUser(testApp)
@@ -493,7 +513,7 @@ func TestAgentConnect(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// Create test user
userRecord, err := createTestUser(testApp)
@@ -652,7 +672,7 @@ func TestHandleAgentConnect(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// Create test user
userRecord, err := createTestUser(testApp)
@@ -737,7 +757,7 @@ func TestAgentWebSocketIntegration(t *testing.T) {
// Create hub and test app
hub, testApp, err := createTestHub(t)
require.NoError(t, err)
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// Get the hub's SSH key
hubSigner, err := hub.GetSSHKey("")
@@ -897,7 +917,7 @@ func TestAgentWebSocketIntegration(t *testing.T) {
// Wait for connection result
maxWait := 2 * time.Second
time.Sleep(20 * time.Millisecond)
time.Sleep(40 * time.Millisecond)
checkInterval := 20 * time.Millisecond
timeout := time.After(maxWait)
ticker := time.Tick(checkInterval)
@@ -942,6 +962,8 @@ func TestAgentWebSocketIntegration(t *testing.T) {
}
}
time.Sleep(20 * time.Millisecond)
// Verify fingerprint state by re-reading the specific record
updatedFingerprintRecord, err := testApp.FindRecordById("fingerprints", fingerprintRecord.Id)
require.NoError(t, err)
@@ -976,7 +998,7 @@ func TestMultipleSystemsWithSameUniversalToken(t *testing.T) {
// Create hub and test app
hub, testApp, err := createTestHub(t)
require.NoError(t, err)
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// Get the hub's SSH key
hubSigner, err := hub.GetSSHKey("")
@@ -1144,6 +1166,8 @@ func TestMultipleSystemsWithSameUniversalToken(t *testing.T) {
assert.Equal(t, systemCount, systemsAfterCount, "Total system count should remain the same")
}
time.Sleep(20 * time.Millisecond)
// Verify that a fingerprint record exists for this fingerprint
fingerprints, err := testApp.FindRecordsByFilter("fingerprints", "token = {:token} && fingerprint = {:fingerprint}", "", -1, 0, map[string]any{
"token": universalToken,
@@ -1176,7 +1200,7 @@ func TestPermanentUniversalTokenFromDB(t *testing.T) {
// Create hub and test app
hub, testApp, err := createTestHub(t)
require.NoError(t, err)
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// Get the hub's SSH key
hubSigner, err := hub.GetSSHKey("")
@@ -1273,7 +1297,7 @@ verify:
func TestFindOrCreateSystemForToken(t *testing.T) {
hub, testApp, err := createTestHub(t)
require.NoError(t, err)
defer testApp.Cleanup()
defer cleanupTestHub(hub, testApp)
// Create test user
userRecord, err := createTestUser(testApp)

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package config_test

View File

@@ -1,35 +1,39 @@
// Package expirymap provides a thread-safe map with expiring entries.
// It supports TTL-based expiration with both lazy cleanup on access
// and periodic background cleanup.
package expirymap
import (
"reflect"
"sync"
"time"
"github.com/pocketbase/pocketbase/tools/store"
)
type val[T any] struct {
type val[T comparable] struct {
value T
expires time.Time
}
type ExpiryMap[T any] struct {
store *store.Store[string, *val[T]]
cleanupInterval time.Duration
type ExpiryMap[T comparable] struct {
store *store.Store[string, val[T]]
stopChan chan struct{}
stopOnce sync.Once
}
// New creates a new expiry map with custom cleanup interval
func New[T any](cleanupInterval time.Duration) *ExpiryMap[T] {
func New[T comparable](cleanupInterval time.Duration) *ExpiryMap[T] {
m := &ExpiryMap[T]{
store: store.New(map[string]*val[T]{}),
cleanupInterval: cleanupInterval,
store: store.New(map[string]val[T]{}),
stopChan: make(chan struct{}),
}
m.startCleaner()
go m.startCleaner(cleanupInterval)
return m
}
// Set stores a value with the given TTL
func (m *ExpiryMap[T]) Set(key string, value T, ttl time.Duration) {
m.store.Set(key, &val[T]{
m.store.Set(key, val[T]{
value: value,
expires: time.Now().Add(ttl),
})
@@ -55,7 +59,7 @@ func (m *ExpiryMap[T]) GetOk(key string) (T, bool) {
// GetByValue retrieves a value by value
func (m *ExpiryMap[T]) GetByValue(val T) (key string, value T, ok bool) {
for key, v := range m.store.GetAll() {
if reflect.DeepEqual(v.value, val) {
if v.value == val {
// check if expired
if v.expires.Before(time.Now()) {
m.store.Remove(key)
@@ -75,7 +79,7 @@ func (m *ExpiryMap[T]) Remove(key string) {
// RemovebyValue removes a value by value
func (m *ExpiryMap[T]) RemovebyValue(value T) (T, bool) {
for key, val := range m.store.GetAll() {
if reflect.DeepEqual(val.value, value) {
if val.value == value {
m.store.Remove(key)
return val.value, true
}
@@ -84,13 +88,23 @@ func (m *ExpiryMap[T]) RemovebyValue(value T) (T, bool) {
}
// startCleaner runs the background cleanup process
func (m *ExpiryMap[T]) startCleaner() {
go func() {
tick := time.Tick(m.cleanupInterval)
for range tick {
func (m *ExpiryMap[T]) startCleaner(interval time.Duration) {
tick := time.Tick(interval)
for {
select {
case <-tick:
m.cleanup()
case <-m.stopChan:
return
}
}()
}
}
// StopCleaner stops the background cleanup process
func (m *ExpiryMap[T]) StopCleaner() {
m.stopOnce.Do(func() {
close(m.stopChan)
})
}
// cleanup removes all expired entries
@@ -102,3 +116,12 @@ func (m *ExpiryMap[T]) cleanup() {
}
}
}
// UpdateExpiration updates the expiration time of a key
func (m *ExpiryMap[T]) UpdateExpiration(key string, ttl time.Duration) {
value, ok := m.store.GetOk(key)
if ok {
value.expires = time.Now().Add(ttl)
m.store.Set(key, value)
}
}

View File

@@ -1,10 +1,10 @@
//go:build testing
// +build testing
package expirymap
import (
"testing"
"testing/synctest"
"time"
"github.com/stretchr/testify/assert"
@@ -178,6 +178,33 @@ func TestExpiryMap_GenericTypes(t *testing.T) {
})
}
func TestExpiryMap_UpdateExpiration(t *testing.T) {
em := New[string](time.Hour)
// Set a value with short TTL
em.Set("key1", "value1", time.Millisecond*50)
// Verify it exists
assert.True(t, em.Has("key1"))
// Update expiration to a longer TTL
em.UpdateExpiration("key1", time.Hour)
// Wait for the original TTL to pass
time.Sleep(time.Millisecond * 100)
// Should still exist because expiration was updated
assert.True(t, em.Has("key1"))
value, ok := em.GetOk("key1")
assert.True(t, ok)
assert.Equal(t, "value1", value)
// Try updating non-existent key (should not panic)
assert.NotPanics(t, func() {
em.UpdateExpiration("nonexistent", time.Hour)
})
}
func TestExpiryMap_ZeroValues(t *testing.T) {
em := New[string](time.Hour)
@@ -474,3 +501,52 @@ func TestExpiryMap_ValueOperations_Integration(t *testing.T) {
assert.Equal(t, "unique", value)
assert.Equal(t, "key2", key)
}
func TestExpiryMap_Cleaner(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
em := New[string](time.Second)
defer em.StopCleaner()
em.Set("test", "value", 500*time.Millisecond)
// Wait 600ms, value is expired but cleaner hasn't run yet (interval is 1s)
time.Sleep(600 * time.Millisecond)
synctest.Wait()
// Map should still hold the value in its internal store before lazy access or cleaner
assert.Equal(t, 1, len(em.store.GetAll()), "store should still have 1 item before cleaner runs")
// Wait another 500ms so cleaner (1s interval) runs
time.Sleep(500 * time.Millisecond)
synctest.Wait() // Wait for background goroutine to process the tick
assert.Equal(t, 0, len(em.store.GetAll()), "store should be empty after cleaner runs")
})
}
func TestExpiryMap_StopCleaner(t *testing.T) {
em := New[string](time.Hour)
// Initially, stopChan is open, reading would block
select {
case <-em.stopChan:
t.Fatal("stopChan should be open initially")
default:
// success
}
em.StopCleaner()
// After StopCleaner, stopChan is closed, reading returns immediately
select {
case <-em.stopChan:
// success
default:
t.Fatal("stopChan was not closed by StopCleaner")
}
// Calling StopCleaner again should NOT panic thanks to sync.Once
assert.NotPanics(t, func() {
em.StopCleaner()
})
}

View File

@@ -0,0 +1,303 @@
// Package heartbeat sends periodic outbound pings to an external monitoring
// endpoint (e.g. BetterStack, Uptime Kuma, Healthchecks.io) so operators can
// monitor Beszel without exposing it to the internet.
package heartbeat
import (
"bytes"
"encoding/json"
"fmt"
"net/http"
"net/url"
"strconv"
"strings"
"time"
"github.com/henrygd/beszel"
"github.com/pocketbase/pocketbase/core"
)
// Default values for heartbeat configuration.
const (
defaultInterval = 60 // seconds
httpTimeout = 10 * time.Second
)
// Payload is the JSON body sent with each heartbeat request.
type Payload struct {
// Status is "ok" when all non-paused systems are up, "warn" when alerts
// are triggered but no systems are down, and "error" when any system is down.
Status string `json:"status"`
Timestamp string `json:"timestamp"`
Msg string `json:"msg"`
Systems SystemsSummary `json:"systems"`
Down []SystemInfo `json:"down_systems,omitempty"`
Alerts []AlertInfo `json:"triggered_alerts,omitempty"`
Version string `json:"beszel_version"`
}
// SystemsSummary contains counts of systems by status.
type SystemsSummary struct {
Total int `json:"total"`
Up int `json:"up"`
Down int `json:"down"`
Paused int `json:"paused"`
Pending int `json:"pending"`
}
// SystemInfo identifies a system that is currently down.
type SystemInfo struct {
ID string `json:"id" db:"id"`
Name string `json:"name" db:"name"`
Host string `json:"host" db:"host"`
}
// AlertInfo describes a currently triggered alert.
type AlertInfo struct {
SystemID string `json:"system_id"`
SystemName string `json:"system_name"`
AlertName string `json:"alert_name"`
Threshold float64 `json:"threshold"`
}
// Config holds heartbeat settings read from environment variables.
type Config struct {
URL string // endpoint to ping
Interval int // seconds between pings
Method string // HTTP method (GET or POST, default POST)
}
// Heartbeat manages the periodic outbound health check.
type Heartbeat struct {
app core.App
config Config
client *http.Client
}
// New creates a Heartbeat if configuration is present.
// Returns nil if HEARTBEAT_URL is not set (feature disabled).
func New(app core.App, getEnv func(string) (string, bool)) *Heartbeat {
url, _ := getEnv("HEARTBEAT_URL")
url = strings.TrimSpace(url)
if app == nil || url == "" {
return nil
}
interval := defaultInterval
if v, ok := getEnv("HEARTBEAT_INTERVAL"); ok {
if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
interval = parsed
}
}
method := http.MethodPost
if v, ok := getEnv("HEARTBEAT_METHOD"); ok {
v = strings.ToUpper(strings.TrimSpace(v))
if v == http.MethodGet || v == http.MethodHead {
method = v
}
}
return &Heartbeat{
app: app,
config: Config{
URL: url,
Interval: interval,
Method: method,
},
client: &http.Client{Timeout: httpTimeout},
}
}
// Start begins the heartbeat loop. It blocks and should be called in a goroutine.
// The loop runs until the provided stop channel is closed.
func (hb *Heartbeat) Start(stop <-chan struct{}) {
sanitizedURL := sanitizeHeartbeatURL(hb.config.URL)
hb.app.Logger().Info("Heartbeat enabled",
"url", sanitizedURL,
"interval", fmt.Sprintf("%ds", hb.config.Interval),
"method", hb.config.Method,
)
// Send an initial heartbeat immediately on startup.
hb.send()
ticker := time.NewTicker(time.Duration(hb.config.Interval) * time.Second)
defer ticker.Stop()
for {
select {
case <-stop:
return
case <-ticker.C:
hb.send()
}
}
}
// Send performs a single heartbeat ping. Exposed for the test-heartbeat API endpoint.
func (hb *Heartbeat) Send() error {
return hb.send()
}
// GetConfig returns the current heartbeat configuration.
func (hb *Heartbeat) GetConfig() Config {
return hb.config
}
func (hb *Heartbeat) send() error {
var req *http.Request
var err error
method := normalizeMethod(hb.config.Method)
if method == http.MethodGet || method == http.MethodHead {
req, err = http.NewRequest(method, hb.config.URL, nil)
} else {
payload, payloadErr := hb.buildPayload()
if payloadErr != nil {
hb.app.Logger().Error("Heartbeat: failed to build payload", "err", payloadErr)
return payloadErr
}
body, jsonErr := json.Marshal(payload)
if jsonErr != nil {
hb.app.Logger().Error("Heartbeat: failed to marshal payload", "err", jsonErr)
return jsonErr
}
req, err = http.NewRequest(http.MethodPost, hb.config.URL, bytes.NewReader(body))
if err == nil {
req.Header.Set("Content-Type", "application/json")
}
}
if err != nil {
hb.app.Logger().Error("Heartbeat: failed to create request", "err", err)
return err
}
req.Header.Set("User-Agent", "Beszel-Heartbeat")
resp, err := hb.client.Do(req)
if err != nil {
hb.app.Logger().Error("Heartbeat: request failed", "url", sanitizeHeartbeatURL(hb.config.URL), "err", err)
return err
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
hb.app.Logger().Warn("Heartbeat: non-success response",
"url", sanitizeHeartbeatURL(hb.config.URL),
"status", resp.StatusCode,
)
return fmt.Errorf("heartbeat endpoint returned status %d", resp.StatusCode)
}
return nil
}
func (hb *Heartbeat) buildPayload() (*Payload, error) {
db := hb.app.DB()
// Count systems by status.
var systemCounts []struct {
Status string `db:"status"`
Count int `db:"cnt"`
}
err := db.NewQuery("SELECT status, COUNT(*) as cnt FROM systems GROUP BY status").All(&systemCounts)
if err != nil {
return nil, fmt.Errorf("query system counts: %w", err)
}
summary := SystemsSummary{}
for _, sc := range systemCounts {
switch sc.Status {
case "up":
summary.Up = sc.Count
case "down":
summary.Down = sc.Count
case "paused":
summary.Paused = sc.Count
case "pending":
summary.Pending = sc.Count
}
summary.Total += sc.Count
}
// Get names of down systems.
var downSystems []SystemInfo
if summary.Down > 0 {
err = db.NewQuery("SELECT id, name, host FROM systems WHERE status = 'down'").All(&downSystems)
if err != nil {
return nil, fmt.Errorf("query down systems: %w", err)
}
}
// Get triggered alerts with system names.
var triggeredAlerts []struct {
SystemID string `db:"system"`
SystemName string `db:"system_name"`
AlertName string `db:"name"`
Value float64 `db:"value"`
}
err = db.NewQuery(`
SELECT a.system, s.name as system_name, a.name, a.value
FROM alerts a
JOIN systems s ON a.system = s.id
WHERE a.triggered = true
`).All(&triggeredAlerts)
if err != nil {
// Non-fatal: alerts info is supplementary.
triggeredAlerts = nil
}
alerts := make([]AlertInfo, 0, len(triggeredAlerts))
for _, ta := range triggeredAlerts {
alerts = append(alerts, AlertInfo{
SystemID: ta.SystemID,
SystemName: ta.SystemName,
AlertName: ta.AlertName,
Threshold: ta.Value,
})
}
// Determine overall status.
status := "ok"
msg := "All systems operational"
if summary.Down > 0 {
status = "error"
names := make([]string, len(downSystems))
for i, ds := range downSystems {
names[i] = ds.Name
}
msg = fmt.Sprintf("%d system(s) down: %s", summary.Down, strings.Join(names, ", "))
} else if len(alerts) > 0 {
status = "warn"
msg = fmt.Sprintf("%d alert(s) triggered", len(alerts))
}
return &Payload{
Status: status,
Timestamp: time.Now().UTC().Format(time.RFC3339),
Msg: msg,
Systems: summary,
Down: downSystems,
Alerts: alerts,
Version: beszel.Version,
}, nil
}
func normalizeMethod(method string) string {
upper := strings.ToUpper(strings.TrimSpace(method))
if upper == http.MethodGet || upper == http.MethodHead || upper == http.MethodPost {
return upper
}
return http.MethodPost
}
func sanitizeHeartbeatURL(rawURL string) string {
parsed, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
return "<invalid-url>"
}
return parsed.Scheme + "://" + parsed.Host
}

View File

@@ -0,0 +1,257 @@
//go:build testing
package heartbeat_test
import (
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"testing"
"github.com/henrygd/beszel/internal/hub/heartbeat"
beszeltests "github.com/henrygd/beszel/internal/tests"
"github.com/pocketbase/pocketbase/core"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestNew(t *testing.T) {
t.Run("returns nil when app is missing", func(t *testing.T) {
hb := heartbeat.New(nil, envGetter(map[string]string{
"HEARTBEAT_URL": "https://heartbeat.example.com/ping",
}))
assert.Nil(t, hb)
})
t.Run("returns nil when URL is missing", func(t *testing.T) {
app := newTestHub(t)
hb := heartbeat.New(app.App, func(string) (string, bool) {
return "", false
})
assert.Nil(t, hb)
})
t.Run("parses and normalizes config values", func(t *testing.T) {
app := newTestHub(t)
env := map[string]string{
"HEARTBEAT_URL": " https://heartbeat.example.com/ping ",
"HEARTBEAT_INTERVAL": "90",
"HEARTBEAT_METHOD": "head",
}
getEnv := func(key string) (string, bool) {
v, ok := env[key]
return v, ok
}
hb := heartbeat.New(app.App, getEnv)
require.NotNil(t, hb)
cfg := hb.GetConfig()
assert.Equal(t, "https://heartbeat.example.com/ping", cfg.URL)
assert.Equal(t, 90, cfg.Interval)
assert.Equal(t, http.MethodHead, cfg.Method)
})
}
func TestSendGETDoesNotRequireAppOrDB(t *testing.T) {
app := newTestHub(t)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
assert.Equal(t, http.MethodGet, r.Method)
assert.Equal(t, "Beszel-Heartbeat", r.Header.Get("User-Agent"))
w.WriteHeader(http.StatusOK)
}))
defer server.Close()
hb := heartbeat.New(app.App, envGetter(map[string]string{
"HEARTBEAT_URL": server.URL,
"HEARTBEAT_METHOD": "GET",
}))
require.NotNil(t, hb)
require.NoError(t, hb.Send())
}
func TestSendReturnsErrorOnHTTPFailureStatus(t *testing.T) {
app := newTestHub(t)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
}))
defer server.Close()
hb := heartbeat.New(app.App, envGetter(map[string]string{
"HEARTBEAT_URL": server.URL,
"HEARTBEAT_METHOD": "GET",
}))
require.NotNil(t, hb)
err := hb.Send()
require.Error(t, err)
assert.ErrorContains(t, err, "heartbeat endpoint returned status 500")
}
func TestSendPOSTBuildsExpectedStatuses(t *testing.T) {
tests := []struct {
name string
setup func(t *testing.T, app *beszeltests.TestHub, user *core.Record)
expectStatus string
expectMsgPart string
expectDown int
expectAlerts int
expectTotal int
expectUp int
expectPaused int
expectPending int
expectDownSumm int
}{
{
name: "error when at least one system is down",
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
downSystem := createTestSystem(t, app, user.Id, "db-1", "10.0.0.1", "down")
_ = createTestSystem(t, app, user.Id, "web-1", "10.0.0.2", "up")
createTriggeredAlert(t, app, user.Id, downSystem.Id, "CPU", 95)
},
expectStatus: "error",
expectMsgPart: "1 system(s) down",
expectDown: 1,
expectAlerts: 1,
expectTotal: 2,
expectUp: 1,
expectDownSumm: 1,
},
{
name: "warn when only alerts are triggered",
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
system := createTestSystem(t, app, user.Id, "api-1", "10.1.0.1", "up")
createTriggeredAlert(t, app, user.Id, system.Id, "CPU", 90)
},
expectStatus: "warn",
expectMsgPart: "1 alert(s) triggered",
expectDown: 0,
expectAlerts: 1,
expectTotal: 1,
expectUp: 1,
expectDownSumm: 0,
},
{
name: "ok when no down systems and no alerts",
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
_ = createTestSystem(t, app, user.Id, "node-1", "10.2.0.1", "up")
_ = createTestSystem(t, app, user.Id, "node-2", "10.2.0.2", "paused")
_ = createTestSystem(t, app, user.Id, "node-3", "10.2.0.3", "pending")
},
expectStatus: "ok",
expectMsgPart: "All systems operational",
expectDown: 0,
expectAlerts: 0,
expectTotal: 3,
expectUp: 1,
expectPaused: 1,
expectPending: 1,
expectDownSumm: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
app := newTestHub(t)
user := createTestUser(t, app)
tt.setup(t, app, user)
type requestCapture struct {
method string
userAgent string
contentType string
payload heartbeat.Payload
}
captured := make(chan requestCapture, 1)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
body, err := io.ReadAll(r.Body)
require.NoError(t, err)
var payload heartbeat.Payload
require.NoError(t, json.Unmarshal(body, &payload))
captured <- requestCapture{
method: r.Method,
userAgent: r.Header.Get("User-Agent"),
contentType: r.Header.Get("Content-Type"),
payload: payload,
}
w.WriteHeader(http.StatusNoContent)
}))
defer server.Close()
hb := heartbeat.New(app.App, envGetter(map[string]string{
"HEARTBEAT_URL": server.URL,
"HEARTBEAT_METHOD": "POST",
}))
require.NotNil(t, hb)
require.NoError(t, hb.Send())
req := <-captured
assert.Equal(t, http.MethodPost, req.method)
assert.Equal(t, "Beszel-Heartbeat", req.userAgent)
assert.Equal(t, "application/json", req.contentType)
assert.Equal(t, tt.expectStatus, req.payload.Status)
assert.Contains(t, req.payload.Msg, tt.expectMsgPart)
assert.Equal(t, tt.expectDown, len(req.payload.Down))
assert.Equal(t, tt.expectAlerts, len(req.payload.Alerts))
assert.Equal(t, tt.expectTotal, req.payload.Systems.Total)
assert.Equal(t, tt.expectUp, req.payload.Systems.Up)
assert.Equal(t, tt.expectDownSumm, req.payload.Systems.Down)
assert.Equal(t, tt.expectPaused, req.payload.Systems.Paused)
assert.Equal(t, tt.expectPending, req.payload.Systems.Pending)
})
}
}
func newTestHub(t *testing.T) *beszeltests.TestHub {
t.Helper()
app, err := beszeltests.NewTestHub(t.TempDir())
require.NoError(t, err)
t.Cleanup(app.Cleanup)
return app
}
func createTestUser(t *testing.T, app *beszeltests.TestHub) *core.Record {
t.Helper()
user, err := beszeltests.CreateUser(app.App, "admin@example.com", "password123")
require.NoError(t, err)
return user
}
func createTestSystem(t *testing.T, app *beszeltests.TestHub, userID, name, host, status string) *core.Record {
t.Helper()
system, err := beszeltests.CreateRecord(app.App, "systems", map[string]any{
"name": name,
"host": host,
"port": "45876",
"users": []string{userID},
"status": status,
})
require.NoError(t, err)
return system
}
func createTriggeredAlert(t *testing.T, app *beszeltests.TestHub, userID, systemID, name string, threshold float64) *core.Record {
t.Helper()
alert, err := beszeltests.CreateRecord(app.App, "alerts", map[string]any{
"name": name,
"system": systemID,
"user": userID,
"value": threshold,
"min": 0,
"triggered": true,
})
require.NoError(t, err)
return alert
}
func envGetter(values map[string]string) func(string) (string, bool) {
return func(key string) (string, bool) {
v, ok := values[key]
return v, ok
}
}

View File

@@ -9,12 +9,14 @@ import (
"net/url"
"os"
"path"
"regexp"
"strings"
"time"
"github.com/henrygd/beszel"
"github.com/henrygd/beszel/internal/alerts"
"github.com/henrygd/beszel/internal/hub/config"
"github.com/henrygd/beszel/internal/hub/heartbeat"
"github.com/henrygd/beszel/internal/hub/systems"
"github.com/henrygd/beszel/internal/records"
"github.com/henrygd/beszel/internal/users"
@@ -33,11 +35,15 @@ type Hub struct {
um *users.UserManager
rm *records.RecordManager
sm *systems.SystemManager
hb *heartbeat.Heartbeat
hbStop chan struct{}
pubKey string
signer ssh.Signer
appURL string
}
var containerIDPattern = regexp.MustCompile(`^[a-fA-F0-9]{12,64}$`)
// NewHub creates a new Hub instance with default configuration
func NewHub(app core.App) *Hub {
hub := &Hub{}
@@ -48,6 +54,10 @@ func NewHub(app core.App) *Hub {
hub.rm = records.NewRecordManager(hub)
hub.sm = systems.NewSystemManager(hub)
hub.appURL, _ = GetEnv("APP_URL")
hub.hb = heartbeat.New(app, GetEnv)
if hub.hb != nil {
hub.hbStop = make(chan struct{})
}
return hub
}
@@ -88,6 +98,10 @@ func (h *Hub) StartHub() error {
if err := h.sm.Initialize(); err != nil {
return err
}
// start heartbeat if configured
if h.hb != nil {
go h.hb.Start(h.hbStop)
}
return e.Next()
})
@@ -287,6 +301,9 @@ func (h *Hub) registerApiRoutes(se *core.ServeEvent) error {
})
// send test notification
apiAuth.POST("/test-notification", h.SendTestNotification)
// heartbeat status and test
apiAuth.GET("/heartbeat-status", h.getHeartbeatStatus)
apiAuth.POST("/test-heartbeat", h.testHeartbeat)
// get config.yml content
apiAuth.GET("/config-yaml", config.GetYamlConfig)
// handle agent websocket connection
@@ -403,6 +420,42 @@ func (h *Hub) getUniversalToken(e *core.RequestEvent) error {
return e.JSON(http.StatusOK, response)
}
// getHeartbeatStatus returns current heartbeat configuration and whether it's enabled
func (h *Hub) getHeartbeatStatus(e *core.RequestEvent) error {
if e.Auth.GetString("role") != "admin" {
return e.ForbiddenError("Requires admin role", nil)
}
if h.hb == nil {
return e.JSON(http.StatusOK, map[string]any{
"enabled": false,
"msg": "Set HEARTBEAT_URL to enable outbound heartbeat monitoring",
})
}
cfg := h.hb.GetConfig()
return e.JSON(http.StatusOK, map[string]any{
"enabled": true,
"url": cfg.URL,
"interval": cfg.Interval,
"method": cfg.Method,
})
}
// testHeartbeat triggers a single heartbeat ping and returns the result
func (h *Hub) testHeartbeat(e *core.RequestEvent) error {
if e.Auth.GetString("role") != "admin" {
return e.ForbiddenError("Requires admin role", nil)
}
if h.hb == nil {
return e.JSON(http.StatusOK, map[string]any{
"err": "Heartbeat not configured. Set HEARTBEAT_URL environment variable.",
})
}
if err := h.hb.Send(); err != nil {
return e.JSON(http.StatusOK, map[string]any{"err": err.Error()})
}
return e.JSON(http.StatusOK, map[string]any{"err": false})
}
// containerRequestHandler handles both container logs and info requests
func (h *Hub) containerRequestHandler(e *core.RequestEvent, fetchFunc func(*systems.System, string) (string, error), responseKey string) error {
systemID := e.Request.URL.Query().Get("system")
@@ -411,6 +464,9 @@ func (h *Hub) containerRequestHandler(e *core.RequestEvent, fetchFunc func(*syst
if systemID == "" || containerID == "" {
return e.JSON(http.StatusBadRequest, map[string]string{"error": "system and container parameters are required"})
}
if !containerIDPattern.MatchString(containerID) {
return e.JSON(http.StatusBadRequest, map[string]string{"error": "invalid container parameter"})
}
system, err := h.sm.GetSystem(systemID)
if err != nil {

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package hub_test
@@ -362,6 +361,58 @@ func TestApiRoutesAuthentication(t *testing.T) {
ExpectedContent: []string{"test-system"},
TestAppFactory: testAppFactory,
},
{
Name: "GET /heartbeat-status - no auth should fail",
Method: http.MethodGet,
URL: "/api/beszel/heartbeat-status",
ExpectedStatus: 401,
ExpectedContent: []string{"requires valid"},
TestAppFactory: testAppFactory,
},
{
Name: "GET /heartbeat-status - with user auth should fail",
Method: http.MethodGet,
URL: "/api/beszel/heartbeat-status",
Headers: map[string]string{
"Authorization": userToken,
},
ExpectedStatus: 403,
ExpectedContent: []string{"Requires admin role"},
TestAppFactory: testAppFactory,
},
{
Name: "GET /heartbeat-status - with admin auth should succeed",
Method: http.MethodGet,
URL: "/api/beszel/heartbeat-status",
Headers: map[string]string{
"Authorization": adminUserToken,
},
ExpectedStatus: 200,
ExpectedContent: []string{`"enabled":false`},
TestAppFactory: testAppFactory,
},
{
Name: "POST /test-heartbeat - with user auth should fail",
Method: http.MethodPost,
URL: "/api/beszel/test-heartbeat",
Headers: map[string]string{
"Authorization": userToken,
},
ExpectedStatus: 403,
ExpectedContent: []string{"Requires admin role"},
TestAppFactory: testAppFactory,
},
{
Name: "POST /test-heartbeat - with admin auth should report disabled state",
Method: http.MethodPost,
URL: "/api/beszel/test-heartbeat",
Headers: map[string]string{
"Authorization": adminUserToken,
},
ExpectedStatus: 200,
ExpectedContent: []string{"Heartbeat not configured"},
TestAppFactory: testAppFactory,
},
{
Name: "GET /universal-token - no auth should fail",
Method: http.MethodGet,
@@ -493,7 +544,7 @@ func TestApiRoutesAuthentication(t *testing.T) {
{
Name: "GET /containers/logs - with auth but invalid system should fail",
Method: http.MethodGet,
URL: "/api/beszel/containers/logs?system=invalid-system&container=test-container",
URL: "/api/beszel/containers/logs?system=invalid-system&container=0123456789ab",
Headers: map[string]string{
"Authorization": userToken,
},
@@ -501,6 +552,39 @@ func TestApiRoutesAuthentication(t *testing.T) {
ExpectedContent: []string{"system not found"},
TestAppFactory: testAppFactory,
},
{
Name: "GET /containers/logs - traversal container should fail validation",
Method: http.MethodGet,
URL: "/api/beszel/containers/logs?system=" + system.Id + "&container=..%2F..%2Fversion",
Headers: map[string]string{
"Authorization": userToken,
},
ExpectedStatus: 400,
ExpectedContent: []string{"invalid container parameter"},
TestAppFactory: testAppFactory,
},
{
Name: "GET /containers/info - traversal container should fail validation",
Method: http.MethodGet,
URL: "/api/beszel/containers/info?system=" + system.Id + "&container=../../version?x=",
Headers: map[string]string{
"Authorization": userToken,
},
ExpectedStatus: 400,
ExpectedContent: []string{"invalid container parameter"},
TestAppFactory: testAppFactory,
},
{
Name: "GET /containers/info - non-hex container should fail validation",
Method: http.MethodGet,
URL: "/api/beszel/containers/info?system=" + system.Id + "&container=container_name",
Headers: map[string]string{
"Authorization": userToken,
},
ExpectedStatus: 400,
ExpectedContent: []string{"invalid container parameter"},
TestAppFactory: testAppFactory,
},
// Auth Optional Routes - Should work without authentication
{

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package hub

View File

@@ -48,7 +48,6 @@ type System struct {
detailsFetched atomic.Bool // True if static system details have been fetched and saved
smartFetching atomic.Bool // True if SMART devices are currently being fetched
smartInterval time.Duration // Interval for periodic SMART data updates
lastSmartFetch atomic.Int64 // Unix milliseconds of last SMART data fetch
}
func (sm *SystemManager) NewSystem(systemId string) *System {
@@ -134,19 +133,34 @@ func (sys *System) update() error {
return err
}
// ensure deprecated fields from older agents are migrated to current fields
migrateDeprecatedFields(data, !sys.detailsFetched.Load())
// create system records
_, err = sys.createRecords(data)
// if details were included and fetched successfully, mark details as fetched and update smart interval if set by agent
if err == nil && data.Details != nil {
sys.detailsFetched.Store(true)
// update smart interval if it's set on the agent side
if data.Details.SmartInterval > 0 {
sys.smartInterval = data.Details.SmartInterval
// make sure we reset expiration of lastFetch to remain as long as the new smart interval
// to prevent premature expiration leading to new fetch if interval is different.
sys.manager.smartFetchMap.UpdateExpiration(sys.Id, sys.smartInterval+time.Minute)
}
}
// Fetch and save SMART devices when system first comes online or at intervals
if backgroundSmartFetchEnabled() {
if backgroundSmartFetchEnabled() && sys.detailsFetched.Load() {
if sys.smartInterval <= 0 {
sys.smartInterval = time.Hour
}
lastFetch := sys.lastSmartFetch.Load()
if time.Since(time.UnixMilli(lastFetch)) >= sys.smartInterval && sys.smartFetching.CompareAndSwap(false, true) {
lastFetch, _ := sys.manager.smartFetchMap.GetOk(sys.Id)
if time.Since(time.UnixMilli(lastFetch-1e4)) >= sys.smartInterval && sys.smartFetching.CompareAndSwap(false, true) {
go func() {
defer sys.smartFetching.Store(false)
sys.lastSmartFetch.Store(time.Now().UnixMilli())
sys.manager.smartFetchMap.Set(sys.Id, time.Now().UnixMilli(), sys.smartInterval+time.Minute)
_ = sys.FetchAndSaveSmartDevices()
}()
}
@@ -221,11 +235,6 @@ func (sys *System) createRecords(data *system.CombinedData) (*core.Record, error
if err := createSystemDetailsRecord(txApp, data.Details, sys.Id); err != nil {
return err
}
sys.detailsFetched.Store(true)
// update smart interval if it's set on the agent side
if data.Details.SmartInterval > 0 {
sys.smartInterval = data.Details.SmartInterval
}
}
// update system record (do this last because it triggers alerts and we need above records to be inserted first)
@@ -309,10 +318,11 @@ func createContainerRecords(app core.App, data []*container.Stats, systemId stri
valueStrings := make([]string, 0, len(data))
for i, container := range data {
suffix := fmt.Sprintf("%d", i)
valueStrings = append(valueStrings, fmt.Sprintf("({:id%[1]s}, {:system}, {:name%[1]s}, {:image%[1]s}, {:status%[1]s}, {:health%[1]s}, {:cpu%[1]s}, {:memory%[1]s}, {:net%[1]s}, {:updated})", suffix))
valueStrings = append(valueStrings, fmt.Sprintf("({:id%[1]s}, {:system}, {:name%[1]s}, {:image%[1]s}, {:ports%[1]s}, {:status%[1]s}, {:health%[1]s}, {:cpu%[1]s}, {:memory%[1]s}, {:net%[1]s}, {:updated})", suffix))
params["id"+suffix] = container.Id
params["name"+suffix] = container.Name
params["image"+suffix] = container.Image
params["ports"+suffix] = container.Ports
params["status"+suffix] = container.Status
params["health"+suffix] = container.Health
params["cpu"+suffix] = container.Cpu
@@ -324,7 +334,7 @@ func createContainerRecords(app core.App, data []*container.Stats, systemId stri
params["net"+suffix] = netBytes
}
queryString := fmt.Sprintf(
"INSERT INTO containers (id, system, name, image, status, health, cpu, memory, net, updated) VALUES %s ON CONFLICT(id) DO UPDATE SET system = excluded.system, name = excluded.name, image = excluded.image, status = excluded.status, health = excluded.health, cpu = excluded.cpu, memory = excluded.memory, net = excluded.net, updated = excluded.updated",
"INSERT INTO containers (id, system, name, image, ports, status, health, cpu, memory, net, updated) VALUES %s ON CONFLICT(id) DO UPDATE SET system = excluded.system, name = excluded.name, image = excluded.image, ports = excluded.ports, status = excluded.status, health = excluded.health, cpu = excluded.cpu, memory = excluded.memory, net = excluded.net, updated = excluded.updated",
strings.Join(valueStrings, ","),
)
_, err := app.DB().NewQuery(queryString).Bind(params).Execute()
@@ -703,3 +713,50 @@ func getJitter() <-chan time.Time {
msDelay := (interval * minPercent / 100) + rand.Intn(interval*jitterRange/100)
return time.After(time.Duration(msDelay) * time.Millisecond)
}
// migrateDeprecatedFields moves values from deprecated fields to their new locations if the new
// fields are not already populated. Deprecated fields and refs may be removed at least 30 days
// and one minor version release after the release that includes the migration.
//
// This is run when processing incoming system data from agents, which may be on older versions.
func migrateDeprecatedFields(cd *system.CombinedData, createDetails bool) {
// migration added 0.19.0
if cd.Stats.Bandwidth[0] == 0 && cd.Stats.Bandwidth[1] == 0 {
cd.Stats.Bandwidth[0] = uint64(cd.Stats.NetworkSent * 1024 * 1024)
cd.Stats.Bandwidth[1] = uint64(cd.Stats.NetworkRecv * 1024 * 1024)
cd.Stats.NetworkSent, cd.Stats.NetworkRecv = 0, 0
}
// migration added 0.19.0
if cd.Info.BandwidthBytes == 0 {
cd.Info.BandwidthBytes = uint64(cd.Info.Bandwidth * 1024 * 1024)
cd.Info.Bandwidth = 0
}
// migration added 0.19.0
if cd.Stats.DiskIO[0] == 0 && cd.Stats.DiskIO[1] == 0 {
cd.Stats.DiskIO[0] = uint64(cd.Stats.DiskReadPs * 1024 * 1024)
cd.Stats.DiskIO[1] = uint64(cd.Stats.DiskWritePs * 1024 * 1024)
cd.Stats.DiskReadPs, cd.Stats.DiskWritePs = 0, 0
}
// migration added 0.19.0 - Move deprecated Info fields to Details struct
if cd.Details == nil && cd.Info.Hostname != "" {
if createDetails {
cd.Details = &system.Details{
Hostname: cd.Info.Hostname,
Kernel: cd.Info.KernelVersion,
Cores: cd.Info.Cores,
Threads: cd.Info.Threads,
CpuModel: cd.Info.CpuModel,
Podman: cd.Info.Podman,
Os: cd.Info.Os,
MemoryTotal: uint64(cd.Stats.Mem * 1024 * 1024 * 1024),
}
}
// zero the deprecated fields to prevent saving them in systems.info DB json payload
cd.Info.Hostname = ""
cd.Info.KernelVersion = ""
cd.Info.Cores = 0
cd.Info.CpuModel = ""
cd.Info.Podman = false
cd.Info.Os = 0
}
}

View File

@@ -8,6 +8,7 @@ import (
"github.com/henrygd/beszel/internal/hub/ws"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/henrygd/beszel/internal/hub/expirymap"
"github.com/henrygd/beszel/internal/common"
@@ -40,9 +41,10 @@ var errSystemExists = errors.New("system exists")
// SystemManager manages a collection of monitored systems and their connections.
// It handles system lifecycle, status updates, and maintains both SSH and WebSocket connections.
type SystemManager struct {
hub hubLike // Hub interface for database and alert operations
systems *store.Store[string, *System] // Thread-safe store of active systems
sshConfig *ssh.ClientConfig // SSH client configuration for system connections
hub hubLike // Hub interface for database and alert operations
systems *store.Store[string, *System] // Thread-safe store of active systems
sshConfig *ssh.ClientConfig // SSH client configuration for system connections
smartFetchMap *expirymap.ExpiryMap[int64] // Stores last SMART fetch time per system ID
}
// hubLike defines the interface requirements for the hub dependency.
@@ -58,8 +60,9 @@ type hubLike interface {
// The hub must implement the hubLike interface to provide database and alert functionality.
func NewSystemManager(hub hubLike) *SystemManager {
return &SystemManager{
systems: store.New(map[string]*System{}),
hub: hub,
systems: store.New(map[string]*System{}),
hub: hub,
smartFetchMap: expirymap.New[int64](time.Hour),
}
}

View File

@@ -0,0 +1,159 @@
//go:build testing
package systems
import (
"testing"
"github.com/henrygd/beszel/internal/entities/system"
)
func TestCombinedData_MigrateDeprecatedFields(t *testing.T) {
t.Run("Migrate NetworkSent and NetworkRecv to Bandwidth", func(t *testing.T) {
cd := &system.CombinedData{
Stats: system.Stats{
NetworkSent: 1.5, // 1.5 MB
NetworkRecv: 2.5, // 2.5 MB
},
}
migrateDeprecatedFields(cd, true)
expectedSent := uint64(1.5 * 1024 * 1024)
expectedRecv := uint64(2.5 * 1024 * 1024)
if cd.Stats.Bandwidth[0] != expectedSent {
t.Errorf("expected Bandwidth[0] %d, got %d", expectedSent, cd.Stats.Bandwidth[0])
}
if cd.Stats.Bandwidth[1] != expectedRecv {
t.Errorf("expected Bandwidth[1] %d, got %d", expectedRecv, cd.Stats.Bandwidth[1])
}
if cd.Stats.NetworkSent != 0 || cd.Stats.NetworkRecv != 0 {
t.Errorf("expected NetworkSent and NetworkRecv to be reset, got %f, %f", cd.Stats.NetworkSent, cd.Stats.NetworkRecv)
}
})
t.Run("Migrate Info.Bandwidth to Info.BandwidthBytes", func(t *testing.T) {
cd := &system.CombinedData{
Info: system.Info{
Bandwidth: 10.0, // 10 MB
},
}
migrateDeprecatedFields(cd, true)
expected := uint64(10 * 1024 * 1024)
if cd.Info.BandwidthBytes != expected {
t.Errorf("expected BandwidthBytes %d, got %d", expected, cd.Info.BandwidthBytes)
}
if cd.Info.Bandwidth != 0 {
t.Errorf("expected Info.Bandwidth to be reset, got %f", cd.Info.Bandwidth)
}
})
t.Run("Migrate DiskReadPs and DiskWritePs to DiskIO", func(t *testing.T) {
cd := &system.CombinedData{
Stats: system.Stats{
DiskReadPs: 3.0, // 3 MB
DiskWritePs: 4.0, // 4 MB
},
}
migrateDeprecatedFields(cd, true)
expectedRead := uint64(3 * 1024 * 1024)
expectedWrite := uint64(4 * 1024 * 1024)
if cd.Stats.DiskIO[0] != expectedRead {
t.Errorf("expected DiskIO[0] %d, got %d", expectedRead, cd.Stats.DiskIO[0])
}
if cd.Stats.DiskIO[1] != expectedWrite {
t.Errorf("expected DiskIO[1] %d, got %d", expectedWrite, cd.Stats.DiskIO[1])
}
if cd.Stats.DiskReadPs != 0 || cd.Stats.DiskWritePs != 0 {
t.Errorf("expected DiskReadPs and DiskWritePs to be reset, got %f, %f", cd.Stats.DiskReadPs, cd.Stats.DiskWritePs)
}
})
t.Run("Migrate Info fields to Details struct", func(t *testing.T) {
cd := &system.CombinedData{
Stats: system.Stats{
Mem: 16.0, // 16 GB
},
Info: system.Info{
Hostname: "test-host",
KernelVersion: "6.8.0",
Cores: 8,
Threads: 16,
CpuModel: "Intel i7",
Podman: true,
Os: system.Linux,
},
}
migrateDeprecatedFields(cd, true)
if cd.Details == nil {
t.Fatal("expected Details struct to be created")
}
if cd.Details.Hostname != "test-host" {
t.Errorf("expected Hostname 'test-host', got '%s'", cd.Details.Hostname)
}
if cd.Details.Kernel != "6.8.0" {
t.Errorf("expected Kernel '6.8.0', got '%s'", cd.Details.Kernel)
}
if cd.Details.Cores != 8 {
t.Errorf("expected Cores 8, got %d", cd.Details.Cores)
}
if cd.Details.Threads != 16 {
t.Errorf("expected Threads 16, got %d", cd.Details.Threads)
}
if cd.Details.CpuModel != "Intel i7" {
t.Errorf("expected CpuModel 'Intel i7', got '%s'", cd.Details.CpuModel)
}
if cd.Details.Podman != true {
t.Errorf("expected Podman true, got %v", cd.Details.Podman)
}
if cd.Details.Os != system.Linux {
t.Errorf("expected Os Linux, got %d", cd.Details.Os)
}
expectedMem := uint64(16 * 1024 * 1024 * 1024)
if cd.Details.MemoryTotal != expectedMem {
t.Errorf("expected MemoryTotal %d, got %d", expectedMem, cd.Details.MemoryTotal)
}
if cd.Info.Hostname != "" || cd.Info.KernelVersion != "" || cd.Info.Cores != 0 || cd.Info.CpuModel != "" || cd.Info.Podman != false || cd.Info.Os != 0 {
t.Errorf("expected Info fields to be reset, got %+v", cd.Info)
}
})
t.Run("Do not migrate if Details already exists", func(t *testing.T) {
cd := &system.CombinedData{
Details: &system.Details{Hostname: "existing-host"},
Info: system.Info{
Hostname: "deprecated-host",
},
}
migrateDeprecatedFields(cd, true)
if cd.Details.Hostname != "existing-host" {
t.Errorf("expected Hostname 'existing-host', got '%s'", cd.Details.Hostname)
}
if cd.Info.Hostname != "deprecated-host" {
t.Errorf("expected Info.Hostname to remain 'deprecated-host', got '%s'", cd.Info.Hostname)
}
})
t.Run("Do not create details if migrateDetails is false", func(t *testing.T) {
cd := &system.CombinedData{
Info: system.Info{
Hostname: "deprecated-host",
},
}
migrateDeprecatedFields(cd, false)
if cd.Details != nil {
t.Fatal("expected Details struct to not be created")
}
if cd.Info.Hostname != "" {
t.Errorf("expected Info.Hostname to be reset, got '%s'", cd.Info.Hostname)
}
})
}

View File

@@ -1,5 +1,4 @@
//go:build !testing
// +build !testing
package systems

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package systems_test

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package systems
@@ -8,6 +7,7 @@ import (
"fmt"
entities "github.com/henrygd/beszel/internal/entities/system"
"github.com/pocketbase/pocketbase/core"
)
// The hub integration tests create/replace systems and cleanup the test apps quickly.
@@ -114,4 +114,14 @@ func (sm *SystemManager) RemoveAllSystems() {
for _, system := range sm.systems.GetAll() {
sm.RemoveSystem(system.Id)
}
sm.smartFetchMap.StopCleaner()
}
func (s *System) StopUpdater() {
s.cancel()
}
func (s *System) CreateRecords(data *entities.CombinedData) (*core.Record, error) {
s.data = data
return s.createRecords(data)
}

View File

@@ -1,5 +1,4 @@
//go:build testing
// +build testing
package ws

Some files were not shown because too many files have changed in this diff Show More