mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-22 05:36:15 +01:00
Compare commits
65 Commits
v0.18.3
...
8e2316f845
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8e2316f845 | ||
|
|
0d3dfcb207 | ||
|
|
b386ce5190 | ||
|
|
e527534016 | ||
|
|
ec7ad632a9 | ||
|
|
963fce5a33 | ||
|
|
d38c0da06d | ||
|
|
cae6ac4626 | ||
|
|
6b1ff264f2 | ||
|
|
35d0e792ad | ||
|
|
654cd06b19 | ||
|
|
5e1b028130 | ||
|
|
638e7dc12a | ||
|
|
73c262455d | ||
|
|
0c4d2edd45 | ||
|
|
8f23fff1c9 | ||
|
|
02c1a0c13d | ||
|
|
69fdcb36ab | ||
|
|
b91eb6de40 | ||
|
|
ec69f6c6e0 | ||
|
|
a86cb91e07 | ||
|
|
004841717a | ||
|
|
096296ba7b | ||
|
|
b012df5669 | ||
|
|
12545b4b6d | ||
|
|
9e2296452b | ||
|
|
ac79860d4a | ||
|
|
e13a99fdac | ||
|
|
4cfb2a86ad | ||
|
|
191f25f6e0 | ||
|
|
aa8b3711d7 | ||
|
|
1fb0b25988 | ||
|
|
04600d83cc | ||
|
|
5d8906c9b2 | ||
|
|
daac287b9d | ||
|
|
d526ea61a9 | ||
|
|
79616e1662 | ||
|
|
01e8bdf040 | ||
|
|
1e3a44e05d | ||
|
|
311095cfdd | ||
|
|
4869c834bb | ||
|
|
e1c1e97f0a | ||
|
|
f6b2824ccc | ||
|
|
f17ffc21b8 | ||
|
|
f792f9b102 | ||
|
|
1def7d8d3a | ||
|
|
ef92b254bf | ||
|
|
10d853c004 | ||
|
|
cdfd116da0 | ||
|
|
283fa9d5c2 | ||
|
|
7d6c0caafc | ||
|
|
04d54a3efc | ||
|
|
14ecb1b069 | ||
|
|
1f1a448aef | ||
|
|
e816ea143a | ||
|
|
2230097dc7 | ||
|
|
25c77c5664 | ||
|
|
dba3519b2c | ||
|
|
48c35aa54d | ||
|
|
6b7845b03e | ||
|
|
221be1da58 | ||
|
|
8347afd68e | ||
|
|
2a3885a52e | ||
|
|
5452e50080 | ||
|
|
028f7bafb2 |
7
.github/workflows/inactivity-actions.yml
vendored
7
.github/workflows/inactivity-actions.yml
vendored
@@ -6,6 +6,7 @@ on:
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
actions: write
|
||||
issues: write
|
||||
pull-requests: write
|
||||
|
||||
@@ -48,6 +49,9 @@ jobs:
|
||||
# Action can not skip PRs, set it to 100 years to cover it.
|
||||
days-before-pr-stale: 36524
|
||||
|
||||
# Max issues to process before early exit. Next run resumes from cache. GH API limit: 5000.
|
||||
operations-per-run: 1500
|
||||
|
||||
# Labels
|
||||
stale-issue-label: 'stale'
|
||||
remove-stale-when-updated: true
|
||||
@@ -56,4 +60,5 @@ jobs:
|
||||
|
||||
# Exemptions
|
||||
exempt-assignees: true
|
||||
exempt-milestones: true
|
||||
|
||||
exempt-milestones: true
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -10,6 +10,7 @@ dist
|
||||
*.exe
|
||||
internal/cmd/hub/hub
|
||||
internal/cmd/agent/agent
|
||||
agent.test
|
||||
node_modules
|
||||
build
|
||||
*timestamp*
|
||||
|
||||
41
Makefile
41
Makefile
@@ -3,6 +3,40 @@ OS ?= $(shell go env GOOS)
|
||||
ARCH ?= $(shell go env GOARCH)
|
||||
# Skip building the web UI if true
|
||||
SKIP_WEB ?= false
|
||||
# Controls NVML/glibc agent build tag behavior:
|
||||
# - auto (default): enable on linux/amd64 glibc hosts
|
||||
# - true: always enable
|
||||
# - false: always disable
|
||||
NVML ?= auto
|
||||
|
||||
# Detect glibc host for local linux/amd64 builds.
|
||||
HOST_GLIBC := $(shell \
|
||||
if [ "$(OS)" = "linux" ] && [ "$(ARCH)" = "amd64" ]; then \
|
||||
for p in /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2; do \
|
||||
[ -e "$$p" ] && { echo true; exit 0; }; \
|
||||
done; \
|
||||
if command -v ldd >/dev/null 2>&1; then \
|
||||
if ldd --version 2>&1 | tr '[:upper:]' '[:lower:]' | awk '/gnu libc|glibc/{found=1} END{exit !found}'; then \
|
||||
echo true; \
|
||||
else \
|
||||
echo false; \
|
||||
fi; \
|
||||
else \
|
||||
echo false; \
|
||||
fi; \
|
||||
else \
|
||||
echo false; \
|
||||
fi)
|
||||
|
||||
# Enable glibc build tag for NVML on supported Linux builds.
|
||||
AGENT_GO_TAGS :=
|
||||
ifeq ($(NVML),true)
|
||||
AGENT_GO_TAGS := -tags glibc
|
||||
else ifeq ($(NVML),auto)
|
||||
ifeq ($(HOST_GLIBC),true)
|
||||
AGENT_GO_TAGS := -tags glibc
|
||||
endif
|
||||
endif
|
||||
|
||||
# Set executable extension based on target OS
|
||||
EXE_EXT := $(if $(filter windows,$(OS)),.exe,)
|
||||
@@ -17,7 +51,6 @@ clean:
|
||||
lint:
|
||||
golangci-lint run
|
||||
|
||||
test: export GOEXPERIMENT=synctest
|
||||
test:
|
||||
go test -tags=testing ./...
|
||||
|
||||
@@ -54,7 +87,7 @@ fetch-smartctl-conditional:
|
||||
|
||||
# Update build-agent to include conditional .NET build
|
||||
build-agent: tidy build-dotnet-conditional fetch-smartctl-conditional
|
||||
GOOS=$(OS) GOARCH=$(ARCH) go build -o ./build/beszel-agent_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/agent
|
||||
GOOS=$(OS) GOARCH=$(ARCH) go build $(AGENT_GO_TAGS) -o ./build/beszel-agent_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/agent
|
||||
|
||||
build-hub: tidy $(if $(filter false,$(SKIP_WEB)),build-web-ui)
|
||||
GOOS=$(OS) GOARCH=$(ARCH) go build -o ./build/beszel_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/hub
|
||||
@@ -90,9 +123,9 @@ dev-hub:
|
||||
|
||||
dev-agent:
|
||||
@if command -v entr >/dev/null 2>&1; then \
|
||||
find ./internal/cmd/agent/*.go ./agent/*.go | entr -r go run github.com/henrygd/beszel/internal/cmd/agent; \
|
||||
find ./internal/cmd/agent/*.go ./agent/*.go | entr -r go run $(AGENT_GO_TAGS) github.com/henrygd/beszel/internal/cmd/agent; \
|
||||
else \
|
||||
go run github.com/henrygd/beszel/internal/cmd/agent; \
|
||||
go run $(AGENT_GO_TAGS) github.com/henrygd/beszel/internal/cmd/agent; \
|
||||
fi
|
||||
|
||||
build-dotnet:
|
||||
|
||||
@@ -5,11 +5,7 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -17,9 +13,9 @@ import (
|
||||
"github.com/gliderlabs/ssh"
|
||||
"github.com/henrygd/beszel"
|
||||
"github.com/henrygd/beszel/agent/deltatracker"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
"github.com/shirou/gopsutil/v4/host"
|
||||
gossh "golang.org/x/crypto/ssh"
|
||||
)
|
||||
|
||||
@@ -65,18 +61,18 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
|
||||
agent.netIoStats = make(map[uint16]system.NetIoStats)
|
||||
agent.netInterfaceDeltaTrackers = make(map[uint16]*deltatracker.DeltaTracker[string, uint64])
|
||||
|
||||
agent.dataDir, err = getDataDir(dataDir...)
|
||||
agent.dataDir, err = GetDataDir(dataDir...)
|
||||
if err != nil {
|
||||
slog.Warn("Data directory not found")
|
||||
} else {
|
||||
slog.Info("Data directory", "path", agent.dataDir)
|
||||
}
|
||||
|
||||
agent.memCalc, _ = GetEnv("MEM_CALC")
|
||||
agent.memCalc, _ = utils.GetEnv("MEM_CALC")
|
||||
agent.sensorConfig = agent.newSensorConfig()
|
||||
|
||||
// Parse disk usage cache duration (e.g., "15m", "1h") to avoid waking sleeping disks
|
||||
if diskUsageCache, exists := GetEnv("DISK_USAGE_CACHE"); exists {
|
||||
if diskUsageCache, exists := utils.GetEnv("DISK_USAGE_CACHE"); exists {
|
||||
if duration, err := time.ParseDuration(diskUsageCache); err == nil {
|
||||
agent.diskUsageCacheDuration = duration
|
||||
slog.Info("DISK_USAGE_CACHE", "duration", duration)
|
||||
@@ -86,7 +82,7 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
|
||||
}
|
||||
|
||||
// Set up slog with a log level determined by the LOG_LEVEL env var
|
||||
if logLevelStr, exists := GetEnv("LOG_LEVEL"); exists {
|
||||
if logLevelStr, exists := utils.GetEnv("LOG_LEVEL"); exists {
|
||||
switch strings.ToLower(logLevelStr) {
|
||||
case "debug":
|
||||
agent.debug = true
|
||||
@@ -107,7 +103,7 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
|
||||
agent.refreshSystemDetails()
|
||||
|
||||
// SMART_INTERVAL env var to update smart data at this interval
|
||||
if smartIntervalEnv, exists := GetEnv("SMART_INTERVAL"); exists {
|
||||
if smartIntervalEnv, exists := utils.GetEnv("SMART_INTERVAL"); exists {
|
||||
if duration, err := time.ParseDuration(smartIntervalEnv); err == nil && duration > 0 {
|
||||
agent.systemDetails.SmartInterval = duration
|
||||
slog.Info("SMART_INTERVAL", "duration", duration)
|
||||
@@ -152,15 +148,6 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
|
||||
return agent, nil
|
||||
}
|
||||
|
||||
// GetEnv retrieves an environment variable with a "BESZEL_AGENT_" prefix, or falls back to the unprefixed key.
|
||||
func GetEnv(key string) (value string, exists bool) {
|
||||
if value, exists = os.LookupEnv("BESZEL_AGENT_" + key); exists {
|
||||
return value, exists
|
||||
}
|
||||
// Fallback to the old unprefixed key
|
||||
return os.LookupEnv(key)
|
||||
}
|
||||
|
||||
func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedData {
|
||||
a.Lock()
|
||||
defer a.Unlock()
|
||||
@@ -217,7 +204,7 @@ func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedD
|
||||
data.Stats.ExtraFs[key] = stats
|
||||
// Add percentages to Info struct for dashboard
|
||||
if stats.DiskTotal > 0 {
|
||||
pct := twoDecimals((stats.DiskUsed / stats.DiskTotal) * 100)
|
||||
pct := utils.TwoDecimals((stats.DiskUsed / stats.DiskTotal) * 100)
|
||||
data.Info.ExtraFsPct[key] = pct
|
||||
}
|
||||
}
|
||||
@@ -228,38 +215,12 @@ func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedD
|
||||
return data
|
||||
}
|
||||
|
||||
// StartAgent initializes and starts the agent with optional WebSocket connection
|
||||
// Start initializes and starts the agent with optional WebSocket connection
|
||||
func (a *Agent) Start(serverOptions ServerOptions) error {
|
||||
a.keys = serverOptions.Keys
|
||||
return a.connectionManager.Start(serverOptions)
|
||||
}
|
||||
|
||||
func (a *Agent) getFingerprint() string {
|
||||
// first look for a fingerprint in the data directory
|
||||
if a.dataDir != "" {
|
||||
if fp, err := os.ReadFile(filepath.Join(a.dataDir, "fingerprint")); err == nil {
|
||||
return string(fp)
|
||||
}
|
||||
}
|
||||
|
||||
// if no fingerprint is found, generate one
|
||||
fingerprint, err := host.HostID()
|
||||
// we ignore a commonly known "product_uuid" known not to be unique
|
||||
if err != nil || fingerprint == "" || fingerprint == "03000200-0400-0500-0006-000700080009" {
|
||||
fingerprint = a.systemDetails.Hostname + a.systemDetails.CpuModel
|
||||
}
|
||||
|
||||
// hash fingerprint
|
||||
sum := sha256.Sum256([]byte(fingerprint))
|
||||
fingerprint = hex.EncodeToString(sum[:24])
|
||||
|
||||
// save fingerprint to data directory
|
||||
if a.dataDir != "" {
|
||||
err = os.WriteFile(filepath.Join(a.dataDir, "fingerprint"), []byte(fingerprint), 0644)
|
||||
if err != nil {
|
||||
slog.Warn("Failed to save fingerprint", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
return fingerprint
|
||||
return GetFingerprint(a.dataDir, a.systemDetails.Hostname, a.systemDetails.CpuModel)
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
|
||||
"github.com/fxamacker/cbor/v2"
|
||||
@@ -43,7 +44,7 @@ type WebSocketClient struct {
|
||||
// newWebSocketClient creates a new WebSocket client for the given agent.
|
||||
// It reads configuration from environment variables and validates the hub URL.
|
||||
func newWebSocketClient(agent *Agent) (client *WebSocketClient, err error) {
|
||||
hubURLStr, exists := GetEnv("HUB_URL")
|
||||
hubURLStr, exists := utils.GetEnv("HUB_URL")
|
||||
if !exists {
|
||||
return nil, errors.New("HUB_URL environment variable not set")
|
||||
}
|
||||
@@ -72,12 +73,12 @@ func newWebSocketClient(agent *Agent) (client *WebSocketClient, err error) {
|
||||
// If neither is set, it returns an error.
|
||||
func getToken() (string, error) {
|
||||
// get token from env var
|
||||
token, _ := GetEnv("TOKEN")
|
||||
token, _ := utils.GetEnv("TOKEN")
|
||||
if token != "" {
|
||||
return token, nil
|
||||
}
|
||||
// get token from file
|
||||
tokenFile, _ := GetEnv("TOKEN_FILE")
|
||||
tokenFile, _ := utils.GetEnv("TOKEN_FILE")
|
||||
if tokenFile == "" {
|
||||
return "", errors.New("must set TOKEN or TOKEN_FILE")
|
||||
}
|
||||
@@ -197,7 +198,7 @@ func (client *WebSocketClient) handleAuthChallenge(msg *common.HubRequest[cbor.R
|
||||
}
|
||||
|
||||
if authRequest.NeedSysInfo {
|
||||
response.Name, _ = GetEnv("SYSTEM_NAME")
|
||||
response.Name, _ = utils.GetEnv("SYSTEM_NAME")
|
||||
response.Hostname = client.agent.systemDetails.Hostname
|
||||
serverAddr := client.agent.connectionManager.serverOptions.Addr
|
||||
_, response.Port, _ = net.SplitHostPort(serverAddr)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
"time"
|
||||
@@ -91,8 +91,8 @@ func (c *ConnectionManager) Start(serverOptions ServerOptions) error {
|
||||
c.eventChan = make(chan ConnectionEvent, 1)
|
||||
|
||||
// signal handling for shutdown
|
||||
sigChan := make(chan os.Signal, 1)
|
||||
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
||||
sigCtx, stopSignals := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer stopSignals()
|
||||
|
||||
c.startWsTicker()
|
||||
c.connect()
|
||||
@@ -109,8 +109,8 @@ func (c *ConnectionManager) Start(serverOptions ServerOptions) error {
|
||||
_ = c.startWebSocketConnection()
|
||||
case <-healthTicker:
|
||||
_ = health.Update()
|
||||
case <-sigChan:
|
||||
slog.Info("Shutting down")
|
||||
case <-sigCtx.Done():
|
||||
slog.Info("Shutting down", "cause", context.Cause(sigCtx))
|
||||
_ = c.agent.StopServer()
|
||||
c.closeWebSocket()
|
||||
return health.CleanUp()
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
|
||||
@@ -14,10 +14,10 @@ var lastPerCoreCpuTimes = make(map[uint16][]cpu.TimesStat)
|
||||
// init initializes the CPU monitoring by storing the initial CPU times
|
||||
// for the default 60-second cache interval.
|
||||
func init() {
|
||||
if times, err := cpu.Times(false); err == nil {
|
||||
if times, err := cpu.Times(false); err == nil && len(times) > 0 {
|
||||
lastCpuTimes[60000] = times[0]
|
||||
}
|
||||
if perCoreTimes, err := cpu.Times(true); err == nil {
|
||||
if perCoreTimes, err := cpu.Times(true); err == nil && len(perCoreTimes) > 0 {
|
||||
lastPerCoreCpuTimes[60000] = perCoreTimes
|
||||
}
|
||||
}
|
||||
@@ -89,10 +89,7 @@ func getPerCoreCpuUsage(cacheTimeMs uint16) (system.Uint8Slice, error) {
|
||||
lastTimes := lastPerCoreCpuTimes[cacheTimeMs]
|
||||
|
||||
// Limit to the number of cores available in both samples
|
||||
length := len(perCoreTimes)
|
||||
if len(lastTimes) < length {
|
||||
length = len(lastTimes)
|
||||
}
|
||||
length := min(len(lastTimes), len(perCoreTimes))
|
||||
|
||||
usage := make([]uint8, length)
|
||||
for i := 0; i < length; i++ {
|
||||
|
||||
@@ -6,17 +6,19 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
)
|
||||
|
||||
// getDataDir returns the path to the data directory for the agent and an error
|
||||
// GetDataDir returns the path to the data directory for the agent and an error
|
||||
// if the directory is not valid. Attempts to find the optimal data directory if
|
||||
// no data directories are provided.
|
||||
func getDataDir(dataDirs ...string) (string, error) {
|
||||
func GetDataDir(dataDirs ...string) (string, error) {
|
||||
if len(dataDirs) > 0 {
|
||||
return testDataDirs(dataDirs)
|
||||
}
|
||||
|
||||
dataDir, _ := GetEnv("DATA_DIR")
|
||||
dataDir, _ := utils.GetEnv("DATA_DIR")
|
||||
if dataDir != "" {
|
||||
dataDirs = append(dataDirs, dataDir)
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
@@ -17,7 +16,7 @@ func TestGetDataDir(t *testing.T) {
|
||||
// Test with explicit dataDir parameter
|
||||
t.Run("explicit data dir", func(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
result, err := getDataDir(tempDir)
|
||||
result, err := GetDataDir(tempDir)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, tempDir, result)
|
||||
})
|
||||
@@ -26,7 +25,7 @@ func TestGetDataDir(t *testing.T) {
|
||||
t.Run("explicit data dir - create new", func(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
newDir := filepath.Join(tempDir, "new-data-dir")
|
||||
result, err := getDataDir(newDir)
|
||||
result, err := GetDataDir(newDir)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, newDir, result)
|
||||
|
||||
@@ -52,7 +51,7 @@ func TestGetDataDir(t *testing.T) {
|
||||
|
||||
os.Setenv("BESZEL_AGENT_DATA_DIR", tempDir)
|
||||
|
||||
result, err := getDataDir()
|
||||
result, err := GetDataDir()
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, tempDir, result)
|
||||
})
|
||||
@@ -60,7 +59,7 @@ func TestGetDataDir(t *testing.T) {
|
||||
// Test with invalid explicit dataDir
|
||||
t.Run("invalid explicit data dir", func(t *testing.T) {
|
||||
invalidPath := "/invalid/path/that/cannot/be/created"
|
||||
_, err := getDataDir(invalidPath)
|
||||
_, err := GetDataDir(invalidPath)
|
||||
assert.Error(t, err)
|
||||
})
|
||||
|
||||
@@ -79,7 +78,7 @@ func TestGetDataDir(t *testing.T) {
|
||||
|
||||
// This will try platform-specific defaults, which may or may not work
|
||||
// We're mainly testing that it doesn't panic and returns some result
|
||||
result, err := getDataDir()
|
||||
result, err := GetDataDir()
|
||||
// We don't assert success/failure here since it depends on system permissions
|
||||
// Just verify we get a string result if no error
|
||||
if err == nil {
|
||||
|
||||
250
agent/disk.go
250
agent/disk.go
@@ -8,6 +8,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
|
||||
"github.com/shirou/gopsutil/v4/disk"
|
||||
@@ -26,9 +27,18 @@ func parseFilesystemEntry(entry string) (device, customName string) {
|
||||
return device, customName
|
||||
}
|
||||
|
||||
func isDockerSpecialMountpoint(mountpoint string) bool {
|
||||
switch mountpoint {
|
||||
case "/etc/hosts", "/etc/resolv.conf", "/etc/hostname":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Sets up the filesystems to monitor for disk usage and I/O.
|
||||
func (a *Agent) initializeDiskInfo() {
|
||||
filesystem, _ := GetEnv("FILESYSTEM")
|
||||
filesystem, _ := utils.GetEnv("FILESYSTEM")
|
||||
efPath := "/extra-filesystems"
|
||||
hasRoot := false
|
||||
isWindows := runtime.GOOS == "windows"
|
||||
@@ -69,11 +79,22 @@ func (a *Agent) initializeDiskInfo() {
|
||||
if _, exists := a.fsStats[key]; !exists {
|
||||
if root {
|
||||
slog.Info("Detected root device", "name", key)
|
||||
// Check if root device is in /proc/diskstats, use fallback if not
|
||||
// Try to map root device to a diskIoCounters entry. First
|
||||
// checks for an exact key match, then uses findIoDevice for
|
||||
// normalized / prefix-based matching (e.g. nda0p2 → nda0),
|
||||
// and finally falls back to the FILESYSTEM env var.
|
||||
if _, ioMatch = diskIoCounters[key]; !ioMatch {
|
||||
key, ioMatch = findIoDevice(filesystem, diskIoCounters, a.fsStats)
|
||||
if matchedKey, match := findIoDevice(key, diskIoCounters); match {
|
||||
key = matchedKey
|
||||
ioMatch = true
|
||||
} else if filesystem != "" {
|
||||
if matchedKey, match := findIoDevice(filesystem, diskIoCounters); match {
|
||||
key = matchedKey
|
||||
ioMatch = true
|
||||
}
|
||||
}
|
||||
if !ioMatch {
|
||||
slog.Info("Using I/O fallback", "device", device, "mountpoint", mountpoint, "fallback", key)
|
||||
slog.Warn("Root I/O unmapped; set FILESYSTEM", "device", device, "mountpoint", mountpoint)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -101,20 +122,28 @@ func (a *Agent) initializeDiskInfo() {
|
||||
// Use FILESYSTEM env var to find root filesystem
|
||||
if filesystem != "" {
|
||||
for _, p := range partitions {
|
||||
if strings.HasSuffix(p.Device, filesystem) || p.Mountpoint == filesystem {
|
||||
if filesystemMatchesPartitionSetting(filesystem, p) {
|
||||
addFsStat(p.Device, p.Mountpoint, true)
|
||||
hasRoot = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasRoot {
|
||||
slog.Warn("Partition details not found", "filesystem", filesystem)
|
||||
// FILESYSTEM may name a physical disk absent from partitions (e.g.
|
||||
// ZFS lists dataset paths like zroot/ROOT/default, not block devices).
|
||||
// Try matching directly against diskIoCounters.
|
||||
if ioKey, match := findIoDevice(filesystem, diskIoCounters); match {
|
||||
a.fsStats[ioKey] = &system.FsStats{Root: true, Mountpoint: rootMountPoint}
|
||||
hasRoot = true
|
||||
} else {
|
||||
slog.Warn("Partition details not found", "filesystem", filesystem)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add EXTRA_FILESYSTEMS env var values to fsStats
|
||||
if extraFilesystems, exists := GetEnv("EXTRA_FILESYSTEMS"); exists {
|
||||
for _, fsEntry := range strings.Split(extraFilesystems, ",") {
|
||||
if extraFilesystems, exists := utils.GetEnv("EXTRA_FILESYSTEMS"); exists {
|
||||
for fsEntry := range strings.SplitSeq(extraFilesystems, ",") {
|
||||
// Parse custom name from format: device__customname
|
||||
fs, customName := parseFilesystemEntry(fsEntry)
|
||||
|
||||
@@ -141,8 +170,8 @@ func (a *Agent) initializeDiskInfo() {
|
||||
for _, p := range partitions {
|
||||
// fmt.Println(p.Device, p.Mountpoint)
|
||||
// Binary root fallback or docker root fallback
|
||||
if !hasRoot && (p.Mountpoint == rootMountPoint || (p.Mountpoint == "/etc/hosts" && strings.HasPrefix(p.Device, "/dev"))) {
|
||||
fs, match := findIoDevice(filepath.Base(p.Device), diskIoCounters, a.fsStats)
|
||||
if !hasRoot && (p.Mountpoint == rootMountPoint || (isDockerSpecialMountpoint(p.Mountpoint) && strings.HasPrefix(p.Device, "/dev"))) {
|
||||
fs, match := findIoDevice(filepath.Base(p.Device), diskIoCounters)
|
||||
if match {
|
||||
addFsStat(fs, p.Mountpoint, true)
|
||||
hasRoot = true
|
||||
@@ -174,35 +203,180 @@ func (a *Agent) initializeDiskInfo() {
|
||||
}
|
||||
}
|
||||
|
||||
// If no root filesystem set, use fallback
|
||||
// If no root filesystem set, try the most active I/O device as a last
|
||||
// resort (e.g. ZFS where dataset names are unrelated to disk names).
|
||||
if !hasRoot {
|
||||
rootDevice, _ := findIoDevice(filepath.Base(filesystem), diskIoCounters, a.fsStats)
|
||||
slog.Info("Root disk", "mountpoint", rootMountPoint, "io", rootDevice)
|
||||
a.fsStats[rootDevice] = &system.FsStats{Root: true, Mountpoint: rootMountPoint}
|
||||
rootKey := mostActiveIoDevice(diskIoCounters)
|
||||
if rootKey != "" {
|
||||
slog.Warn("Using most active device for root I/O; set FILESYSTEM to override", "device", rootKey)
|
||||
} else {
|
||||
rootKey = filepath.Base(rootMountPoint)
|
||||
if _, exists := a.fsStats[rootKey]; exists {
|
||||
rootKey = "root"
|
||||
}
|
||||
slog.Warn("Root I/O device not detected; set FILESYSTEM to override")
|
||||
}
|
||||
a.fsStats[rootKey] = &system.FsStats{Root: true, Mountpoint: rootMountPoint}
|
||||
}
|
||||
|
||||
a.pruneDuplicateRootExtraFilesystems()
|
||||
a.initializeDiskIoStats(diskIoCounters)
|
||||
}
|
||||
|
||||
// Returns matching device from /proc/diskstats,
|
||||
// or the device with the most reads if no match is found.
|
||||
// bool is true if a match was found.
|
||||
func findIoDevice(filesystem string, diskIoCounters map[string]disk.IOCountersStat, fsStats map[string]*system.FsStats) (string, bool) {
|
||||
var maxReadBytes uint64
|
||||
maxReadDevice := "/"
|
||||
for _, d := range diskIoCounters {
|
||||
if d.Name == filesystem || (d.Label != "" && d.Label == filesystem) {
|
||||
return d.Name, true
|
||||
}
|
||||
if d.ReadBytes > maxReadBytes {
|
||||
// don't use if device already exists in fsStats
|
||||
if _, exists := fsStats[d.Name]; !exists {
|
||||
maxReadBytes = d.ReadBytes
|
||||
maxReadDevice = d.Name
|
||||
}
|
||||
// Removes extra filesystems that mirror root usage (https://github.com/henrygd/beszel/issues/1428).
|
||||
func (a *Agent) pruneDuplicateRootExtraFilesystems() {
|
||||
var rootMountpoint string
|
||||
for _, stats := range a.fsStats {
|
||||
if stats != nil && stats.Root {
|
||||
rootMountpoint = stats.Mountpoint
|
||||
break
|
||||
}
|
||||
}
|
||||
return maxReadDevice, false
|
||||
if rootMountpoint == "" {
|
||||
return
|
||||
}
|
||||
rootUsage, err := disk.Usage(rootMountpoint)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for name, stats := range a.fsStats {
|
||||
if stats == nil || stats.Root {
|
||||
continue
|
||||
}
|
||||
extraUsage, err := disk.Usage(stats.Mountpoint)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if hasSameDiskUsage(rootUsage, extraUsage) {
|
||||
slog.Info("Ignoring duplicate FS", "name", name, "mount", stats.Mountpoint)
|
||||
delete(a.fsStats, name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// hasSameDiskUsage compares root/extra usage with a small byte tolerance.
|
||||
func hasSameDiskUsage(a, b *disk.UsageStat) bool {
|
||||
if a == nil || b == nil || a.Total == 0 || b.Total == 0 {
|
||||
return false
|
||||
}
|
||||
// Allow minor drift between sequential disk usage calls.
|
||||
const toleranceBytes uint64 = 16 * 1024 * 1024
|
||||
return withinUsageTolerance(a.Total, b.Total, toleranceBytes) &&
|
||||
withinUsageTolerance(a.Used, b.Used, toleranceBytes)
|
||||
}
|
||||
|
||||
// withinUsageTolerance reports whether two byte values differ by at most tolerance.
|
||||
func withinUsageTolerance(a, b, tolerance uint64) bool {
|
||||
if a >= b {
|
||||
return a-b <= tolerance
|
||||
}
|
||||
return b-a <= tolerance
|
||||
}
|
||||
|
||||
type ioMatchCandidate struct {
|
||||
name string
|
||||
bytes uint64
|
||||
ops uint64
|
||||
}
|
||||
|
||||
// findIoDevice prefers exact device/label matches, then falls back to a
|
||||
// prefix-related candidate with the highest recent activity.
|
||||
func findIoDevice(filesystem string, diskIoCounters map[string]disk.IOCountersStat) (string, bool) {
|
||||
filesystem = normalizeDeviceName(filesystem)
|
||||
if filesystem == "" {
|
||||
return "", false
|
||||
}
|
||||
|
||||
candidates := []ioMatchCandidate{}
|
||||
|
||||
for _, d := range diskIoCounters {
|
||||
if normalizeDeviceName(d.Name) == filesystem || (d.Label != "" && normalizeDeviceName(d.Label) == filesystem) {
|
||||
return d.Name, true
|
||||
}
|
||||
if prefixRelated(normalizeDeviceName(d.Name), filesystem) ||
|
||||
(d.Label != "" && prefixRelated(normalizeDeviceName(d.Label), filesystem)) {
|
||||
candidates = append(candidates, ioMatchCandidate{
|
||||
name: d.Name,
|
||||
bytes: d.ReadBytes + d.WriteBytes,
|
||||
ops: d.ReadCount + d.WriteCount,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if len(candidates) == 0 {
|
||||
return "", false
|
||||
}
|
||||
|
||||
best := candidates[0]
|
||||
for _, c := range candidates[1:] {
|
||||
if c.bytes > best.bytes ||
|
||||
(c.bytes == best.bytes && c.ops > best.ops) ||
|
||||
(c.bytes == best.bytes && c.ops == best.ops && c.name < best.name) {
|
||||
best = c
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info("Using disk I/O fallback", "requested", filesystem, "selected", best.name)
|
||||
return best.name, true
|
||||
}
|
||||
|
||||
// mostActiveIoDevice returns the device with the highest I/O activity,
|
||||
// or "" if diskIoCounters is empty.
|
||||
func mostActiveIoDevice(diskIoCounters map[string]disk.IOCountersStat) string {
|
||||
var best ioMatchCandidate
|
||||
for _, d := range diskIoCounters {
|
||||
c := ioMatchCandidate{
|
||||
name: d.Name,
|
||||
bytes: d.ReadBytes + d.WriteBytes,
|
||||
ops: d.ReadCount + d.WriteCount,
|
||||
}
|
||||
if best.name == "" || c.bytes > best.bytes ||
|
||||
(c.bytes == best.bytes && c.ops > best.ops) ||
|
||||
(c.bytes == best.bytes && c.ops == best.ops && c.name < best.name) {
|
||||
best = c
|
||||
}
|
||||
}
|
||||
return best.name
|
||||
}
|
||||
|
||||
// prefixRelated reports whether either identifier is a prefix of the other.
|
||||
func prefixRelated(a, b string) bool {
|
||||
if a == "" || b == "" || a == b {
|
||||
return false
|
||||
}
|
||||
return strings.HasPrefix(a, b) || strings.HasPrefix(b, a)
|
||||
}
|
||||
|
||||
// filesystemMatchesPartitionSetting checks whether a FILESYSTEM env var value
|
||||
// matches a partition by mountpoint, exact device name, or prefix relationship
|
||||
// (e.g. FILESYSTEM=ada0 matches partition /dev/ada0p2).
|
||||
func filesystemMatchesPartitionSetting(filesystem string, p disk.PartitionStat) bool {
|
||||
filesystem = strings.TrimSpace(filesystem)
|
||||
if filesystem == "" {
|
||||
return false
|
||||
}
|
||||
if p.Mountpoint == filesystem {
|
||||
return true
|
||||
}
|
||||
|
||||
fsName := normalizeDeviceName(filesystem)
|
||||
partName := normalizeDeviceName(p.Device)
|
||||
if fsName == "" || partName == "" {
|
||||
return false
|
||||
}
|
||||
if fsName == partName {
|
||||
return true
|
||||
}
|
||||
return prefixRelated(partName, fsName)
|
||||
}
|
||||
|
||||
// normalizeDeviceName canonicalizes device strings for comparisons.
|
||||
func normalizeDeviceName(value string) string {
|
||||
name := filepath.Base(strings.TrimSpace(value))
|
||||
if name == "." {
|
||||
return ""
|
||||
}
|
||||
return name
|
||||
}
|
||||
|
||||
// Sets start values for disk I/O stats.
|
||||
@@ -239,12 +413,12 @@ func (a *Agent) updateDiskUsage(systemStats *system.Stats) {
|
||||
continue
|
||||
}
|
||||
if d, err := disk.Usage(stats.Mountpoint); err == nil {
|
||||
stats.DiskTotal = bytesToGigabytes(d.Total)
|
||||
stats.DiskUsed = bytesToGigabytes(d.Used)
|
||||
stats.DiskTotal = utils.BytesToGigabytes(d.Total)
|
||||
stats.DiskUsed = utils.BytesToGigabytes(d.Used)
|
||||
if stats.Root {
|
||||
systemStats.DiskTotal = bytesToGigabytes(d.Total)
|
||||
systemStats.DiskUsed = bytesToGigabytes(d.Used)
|
||||
systemStats.DiskPct = twoDecimals(d.UsedPercent)
|
||||
systemStats.DiskTotal = utils.BytesToGigabytes(d.Total)
|
||||
systemStats.DiskUsed = utils.BytesToGigabytes(d.Used)
|
||||
systemStats.DiskPct = utils.TwoDecimals(d.UsedPercent)
|
||||
}
|
||||
} else {
|
||||
// reset stats if error (likely unmounted)
|
||||
@@ -297,8 +471,8 @@ func (a *Agent) updateDiskIo(cacheTimeMs uint16, systemStats *system.Stats) {
|
||||
|
||||
diskIORead := (d.ReadBytes - prev.readBytes) * 1000 / msElapsed
|
||||
diskIOWrite := (d.WriteBytes - prev.writeBytes) * 1000 / msElapsed
|
||||
readMbPerSecond := bytesToMegabytes(float64(diskIORead))
|
||||
writeMbPerSecond := bytesToMegabytes(float64(diskIOWrite))
|
||||
readMbPerSecond := utils.BytesToMegabytes(float64(diskIORead))
|
||||
writeMbPerSecond := utils.BytesToMegabytes(float64(diskIOWrite))
|
||||
|
||||
// validate values
|
||||
if readMbPerSecond > 50_000 || writeMbPerSecond > 50_000 {
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
@@ -94,6 +93,162 @@ func TestParseFilesystemEntry(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindIoDevice(t *testing.T) {
|
||||
t.Run("matches by device name", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"sda": {Name: "sda"},
|
||||
"sdb": {Name: "sdb"},
|
||||
}
|
||||
|
||||
device, ok := findIoDevice("sdb", ioCounters)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "sdb", device)
|
||||
})
|
||||
|
||||
t.Run("matches by device label", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"sda": {Name: "sda", Label: "rootfs"},
|
||||
"sdb": {Name: "sdb"},
|
||||
}
|
||||
|
||||
device, ok := findIoDevice("rootfs", ioCounters)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "sda", device)
|
||||
})
|
||||
|
||||
t.Run("returns no match when not found", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"sda": {Name: "sda"},
|
||||
"sdb": {Name: "sdb"},
|
||||
}
|
||||
|
||||
device, ok := findIoDevice("nvme0n1p1", ioCounters)
|
||||
assert.False(t, ok)
|
||||
assert.Equal(t, "", device)
|
||||
})
|
||||
|
||||
t.Run("uses uncertain unique prefix fallback", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"nvme0n1": {Name: "nvme0n1"},
|
||||
"sda": {Name: "sda"},
|
||||
}
|
||||
|
||||
device, ok := findIoDevice("nvme0n1p2", ioCounters)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "nvme0n1", device)
|
||||
})
|
||||
|
||||
t.Run("uses dominant activity when prefix matches are ambiguous", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"sda": {Name: "sda", ReadBytes: 5000, WriteBytes: 5000, ReadCount: 100, WriteCount: 100},
|
||||
"sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 50, WriteCount: 50},
|
||||
}
|
||||
|
||||
device, ok := findIoDevice("sd", ioCounters)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "sda", device)
|
||||
})
|
||||
|
||||
t.Run("uses highest activity when ambiguous without dominance", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"sda": {Name: "sda", ReadBytes: 3000, WriteBytes: 3000, ReadCount: 50, WriteCount: 50},
|
||||
"sdb": {Name: "sdb", ReadBytes: 2500, WriteBytes: 2500, ReadCount: 40, WriteCount: 40},
|
||||
}
|
||||
|
||||
device, ok := findIoDevice("sd", ioCounters)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "sda", device)
|
||||
})
|
||||
|
||||
t.Run("matches /dev/-prefixed partition to parent disk", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"nda0": {Name: "nda0", ReadBytes: 1000, WriteBytes: 1000},
|
||||
}
|
||||
|
||||
device, ok := findIoDevice("/dev/nda0p2", ioCounters)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "nda0", device)
|
||||
})
|
||||
|
||||
t.Run("uses deterministic name tie-breaker", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"sdb": {Name: "sdb", ReadBytes: 2000, WriteBytes: 2000, ReadCount: 10, WriteCount: 10},
|
||||
"sda": {Name: "sda", ReadBytes: 2000, WriteBytes: 2000, ReadCount: 10, WriteCount: 10},
|
||||
}
|
||||
|
||||
device, ok := findIoDevice("sd", ioCounters)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "sda", device)
|
||||
})
|
||||
}
|
||||
|
||||
func TestFilesystemMatchesPartitionSetting(t *testing.T) {
|
||||
p := disk.PartitionStat{Device: "/dev/ada0p2", Mountpoint: "/"}
|
||||
|
||||
t.Run("matches mountpoint setting", func(t *testing.T) {
|
||||
assert.True(t, filesystemMatchesPartitionSetting("/", p))
|
||||
})
|
||||
|
||||
t.Run("matches exact partition setting", func(t *testing.T) {
|
||||
assert.True(t, filesystemMatchesPartitionSetting("ada0p2", p))
|
||||
assert.True(t, filesystemMatchesPartitionSetting("/dev/ada0p2", p))
|
||||
})
|
||||
|
||||
t.Run("matches prefix-style parent setting", func(t *testing.T) {
|
||||
assert.True(t, filesystemMatchesPartitionSetting("ada0", p))
|
||||
assert.True(t, filesystemMatchesPartitionSetting("/dev/ada0", p))
|
||||
})
|
||||
|
||||
t.Run("does not match unrelated device", func(t *testing.T) {
|
||||
assert.False(t, filesystemMatchesPartitionSetting("sda", p))
|
||||
assert.False(t, filesystemMatchesPartitionSetting("nvme0n1", p))
|
||||
assert.False(t, filesystemMatchesPartitionSetting("", p))
|
||||
})
|
||||
}
|
||||
|
||||
func TestMostActiveIoDevice(t *testing.T) {
|
||||
t.Run("returns most active device", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"nda0": {Name: "nda0", ReadBytes: 5000, WriteBytes: 5000, ReadCount: 100, WriteCount: 100},
|
||||
"nda1": {Name: "nda1", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 50, WriteCount: 50},
|
||||
}
|
||||
assert.Equal(t, "nda0", mostActiveIoDevice(ioCounters))
|
||||
})
|
||||
|
||||
t.Run("uses deterministic tie-breaker", func(t *testing.T) {
|
||||
ioCounters := map[string]disk.IOCountersStat{
|
||||
"sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 10, WriteCount: 10},
|
||||
"sda": {Name: "sda", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 10, WriteCount: 10},
|
||||
}
|
||||
assert.Equal(t, "sda", mostActiveIoDevice(ioCounters))
|
||||
})
|
||||
|
||||
t.Run("returns empty for empty map", func(t *testing.T) {
|
||||
assert.Equal(t, "", mostActiveIoDevice(map[string]disk.IOCountersStat{}))
|
||||
})
|
||||
}
|
||||
|
||||
func TestIsDockerSpecialMountpoint(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
mountpoint string
|
||||
expected bool
|
||||
}{
|
||||
{name: "hosts", mountpoint: "/etc/hosts", expected: true},
|
||||
{name: "resolv", mountpoint: "/etc/resolv.conf", expected: true},
|
||||
{name: "hostname", mountpoint: "/etc/hostname", expected: true},
|
||||
{name: "root", mountpoint: "/", expected: false},
|
||||
{name: "passwd", mountpoint: "/etc/passwd", expected: false},
|
||||
{name: "extra-filesystem", mountpoint: "/extra-filesystems/sda1", expected: false},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
assert.Equal(t, tc.expected, isDockerSpecialMountpoint(tc.mountpoint))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitializeDiskInfoWithCustomNames(t *testing.T) {
|
||||
// Set up environment variables
|
||||
oldEnv := os.Getenv("EXTRA_FILESYSTEMS")
|
||||
@@ -317,3 +472,37 @@ func TestDiskUsageCaching(t *testing.T) {
|
||||
"lastDiskUsageUpdate should be refreshed when cache expires")
|
||||
})
|
||||
}
|
||||
|
||||
func TestHasSameDiskUsage(t *testing.T) {
|
||||
const toleranceBytes uint64 = 16 * 1024 * 1024
|
||||
|
||||
t.Run("returns true when totals and usage are equal", func(t *testing.T) {
|
||||
a := &disk.UsageStat{Total: 100 * 1024 * 1024 * 1024, Used: 42 * 1024 * 1024 * 1024}
|
||||
b := &disk.UsageStat{Total: 100 * 1024 * 1024 * 1024, Used: 42 * 1024 * 1024 * 1024}
|
||||
assert.True(t, hasSameDiskUsage(a, b))
|
||||
})
|
||||
|
||||
t.Run("returns true within tolerance", func(t *testing.T) {
|
||||
a := &disk.UsageStat{Total: 100 * 1024 * 1024 * 1024, Used: 42 * 1024 * 1024 * 1024}
|
||||
b := &disk.UsageStat{
|
||||
Total: a.Total + toleranceBytes - 1,
|
||||
Used: a.Used - toleranceBytes + 1,
|
||||
}
|
||||
assert.True(t, hasSameDiskUsage(a, b))
|
||||
})
|
||||
|
||||
t.Run("returns false when total exceeds tolerance", func(t *testing.T) {
|
||||
a := &disk.UsageStat{Total: 100 * 1024 * 1024 * 1024, Used: 42 * 1024 * 1024 * 1024}
|
||||
b := &disk.UsageStat{
|
||||
Total: a.Total + toleranceBytes + 1,
|
||||
Used: a.Used,
|
||||
}
|
||||
assert.False(t, hasSameDiskUsage(a, b))
|
||||
})
|
||||
|
||||
t.Run("returns false for nil or zero total", func(t *testing.T) {
|
||||
assert.False(t, hasSameDiskUsage(nil, &disk.UsageStat{Total: 1, Used: 1}))
|
||||
assert.False(t, hasSameDiskUsage(&disk.UsageStat{Total: 1, Used: 1}, nil))
|
||||
assert.False(t, hasSameDiskUsage(&disk.UsageStat{Total: 0, Used: 0}, &disk.UsageStat{Total: 1, Used: 1}))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/binary"
|
||||
@@ -20,6 +21,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/deltatracker"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/container"
|
||||
|
||||
"github.com/blang/semver"
|
||||
@@ -28,6 +30,7 @@ import (
|
||||
// ansiEscapePattern matches ANSI escape sequences (colors, cursor movement, etc.)
|
||||
// This includes CSI sequences like \x1b[...m and simple escapes like \x1b[K
|
||||
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b\][^\x07]*\x07|\x1b[@-Z\\-_]`)
|
||||
var dockerContainerIDPattern = regexp.MustCompile(`^[a-fA-F0-9]{12,64}$`)
|
||||
|
||||
const (
|
||||
// Docker API timeout in milliseconds
|
||||
@@ -72,6 +75,7 @@ type dockerManager struct {
|
||||
// cacheTimeMs -> DeltaTracker for network bytes sent/received
|
||||
networkSentTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
||||
networkRecvTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
||||
retrySleep func(time.Duration)
|
||||
}
|
||||
|
||||
// userAgentRoundTripper is a custom http.RoundTripper that adds a User-Agent header to all requests
|
||||
@@ -333,12 +337,12 @@ func validateCpuPercentage(cpuPct float64, containerName string) error {
|
||||
|
||||
// updateContainerStatsValues updates the final stats values
|
||||
func updateContainerStatsValues(stats *container.Stats, cpuPct float64, usedMemory uint64, sent_delta, recv_delta uint64, readTime time.Time) {
|
||||
stats.Cpu = twoDecimals(cpuPct)
|
||||
stats.Mem = bytesToMegabytes(float64(usedMemory))
|
||||
stats.Cpu = utils.TwoDecimals(cpuPct)
|
||||
stats.Mem = utils.BytesToMegabytes(float64(usedMemory))
|
||||
stats.Bandwidth = [2]uint64{sent_delta, recv_delta}
|
||||
// TODO(0.19+): stop populating NetworkSent/NetworkRecv (deprecated in 0.18.3)
|
||||
stats.NetworkSent = bytesToMegabytes(float64(sent_delta))
|
||||
stats.NetworkRecv = bytesToMegabytes(float64(recv_delta))
|
||||
stats.NetworkSent = utils.BytesToMegabytes(float64(sent_delta))
|
||||
stats.NetworkRecv = utils.BytesToMegabytes(float64(recv_delta))
|
||||
stats.PrevReadTime = readTime
|
||||
}
|
||||
|
||||
@@ -484,7 +488,7 @@ func (dm *dockerManager) deleteContainerStatsSync(id string) {
|
||||
|
||||
// Creates a new http client for Docker or Podman API
|
||||
func newDockerManager() *dockerManager {
|
||||
dockerHost, exists := GetEnv("DOCKER_HOST")
|
||||
dockerHost, exists := utils.GetEnv("DOCKER_HOST")
|
||||
if exists {
|
||||
// return nil if set to empty string
|
||||
if dockerHost == "" {
|
||||
@@ -520,7 +524,7 @@ func newDockerManager() *dockerManager {
|
||||
|
||||
// configurable timeout
|
||||
timeout := time.Millisecond * time.Duration(dockerTimeoutMs)
|
||||
if t, set := GetEnv("DOCKER_TIMEOUT"); set {
|
||||
if t, set := utils.GetEnv("DOCKER_TIMEOUT"); set {
|
||||
timeout, err = time.ParseDuration(t)
|
||||
if err != nil {
|
||||
slog.Error(err.Error())
|
||||
@@ -537,7 +541,7 @@ func newDockerManager() *dockerManager {
|
||||
|
||||
// Read container exclusion patterns from environment variable
|
||||
var excludeContainers []string
|
||||
if excludeStr, set := GetEnv("EXCLUDE_CONTAINERS"); set && excludeStr != "" {
|
||||
if excludeStr, set := utils.GetEnv("EXCLUDE_CONTAINERS"); set && excludeStr != "" {
|
||||
parts := strings.SplitSeq(excludeStr, ",")
|
||||
for part := range parts {
|
||||
trimmed := strings.TrimSpace(part)
|
||||
@@ -565,6 +569,7 @@ func newDockerManager() *dockerManager {
|
||||
lastCpuReadTime: make(map[uint16]map[string]time.Time),
|
||||
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
||||
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
||||
retrySleep: time.Sleep,
|
||||
}
|
||||
|
||||
// If using podman, return client
|
||||
@@ -574,7 +579,7 @@ func newDockerManager() *dockerManager {
|
||||
return manager
|
||||
}
|
||||
|
||||
// this can take up to 5 seconds with retry, so run in goroutine
|
||||
// run version check in goroutine to avoid blocking (server may not be ready and requires retries)
|
||||
go manager.checkDockerVersion()
|
||||
|
||||
// give version check a chance to complete before returning
|
||||
@@ -594,18 +599,18 @@ func (dm *dockerManager) checkDockerVersion() {
|
||||
const versionMaxTries = 2
|
||||
for i := 1; i <= versionMaxTries; i++ {
|
||||
resp, err = dm.client.Get("http://localhost/version")
|
||||
if err == nil {
|
||||
if err == nil && resp.StatusCode == http.StatusOK {
|
||||
break
|
||||
}
|
||||
if resp != nil {
|
||||
resp.Body.Close()
|
||||
}
|
||||
if i < versionMaxTries {
|
||||
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "error", err)
|
||||
time.Sleep(5 * time.Second)
|
||||
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "err", err, "response", resp)
|
||||
dm.retrySleep(5 * time.Second)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
if err != nil || resp.StatusCode != http.StatusOK {
|
||||
return
|
||||
}
|
||||
if err := dm.decode(resp, &versionInfo); err != nil {
|
||||
@@ -647,9 +652,34 @@ func getDockerHost() string {
|
||||
return scheme + socks[0]
|
||||
}
|
||||
|
||||
func validateContainerID(containerID string) error {
|
||||
if !dockerContainerIDPattern.MatchString(containerID) {
|
||||
return fmt.Errorf("invalid container id")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildDockerContainerEndpoint(containerID, action string, query url.Values) (string, error) {
|
||||
if err := validateContainerID(containerID); err != nil {
|
||||
return "", err
|
||||
}
|
||||
u := &url.URL{
|
||||
Scheme: "http",
|
||||
Host: "localhost",
|
||||
Path: fmt.Sprintf("/containers/%s/%s", url.PathEscape(containerID), action),
|
||||
}
|
||||
if len(query) > 0 {
|
||||
u.RawQuery = query.Encode()
|
||||
}
|
||||
return u.String(), nil
|
||||
}
|
||||
|
||||
// getContainerInfo fetches the inspection data for a container
|
||||
func (dm *dockerManager) getContainerInfo(ctx context.Context, containerID string) ([]byte, error) {
|
||||
endpoint := fmt.Sprintf("http://localhost/containers/%s/json", containerID)
|
||||
endpoint, err := buildDockerContainerEndpoint(containerID, "json", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -680,7 +710,15 @@ func (dm *dockerManager) getContainerInfo(ctx context.Context, containerID strin
|
||||
|
||||
// getLogs fetches the logs for a container
|
||||
func (dm *dockerManager) getLogs(ctx context.Context, containerID string) (string, error) {
|
||||
endpoint := fmt.Sprintf("http://localhost/containers/%s/logs?stdout=1&stderr=1&tail=%d", containerID, dockerLogsTail)
|
||||
query := url.Values{
|
||||
"stdout": []string{"1"},
|
||||
"stderr": []string{"1"},
|
||||
"tail": []string{fmt.Sprintf("%d", dockerLogsTail)},
|
||||
}
|
||||
endpoint, err := buildDockerContainerEndpoint(containerID, "logs", query)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -698,8 +736,17 @@ func (dm *dockerManager) getLogs(ctx context.Context, containerID string) (strin
|
||||
}
|
||||
|
||||
var builder strings.Builder
|
||||
multiplexed := resp.Header.Get("Content-Type") == "application/vnd.docker.multiplexed-stream"
|
||||
if err := decodeDockerLogStream(resp.Body, &builder, multiplexed); err != nil {
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
multiplexed := strings.HasSuffix(contentType, "multiplexed-stream")
|
||||
logReader := io.Reader(resp.Body)
|
||||
if !multiplexed {
|
||||
// Podman may return multiplexed logs without Content-Type. Sniff the first frame header
|
||||
// with a small buffered reader only when the header check fails.
|
||||
bufferedReader := bufio.NewReaderSize(resp.Body, 8)
|
||||
multiplexed = detectDockerMultiplexedStream(bufferedReader)
|
||||
logReader = bufferedReader
|
||||
}
|
||||
if err := decodeDockerLogStream(logReader, &builder, multiplexed); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
@@ -711,6 +758,23 @@ func (dm *dockerManager) getLogs(ctx context.Context, containerID string) (strin
|
||||
return logs, nil
|
||||
}
|
||||
|
||||
func detectDockerMultiplexedStream(reader *bufio.Reader) bool {
|
||||
const headerSize = 8
|
||||
header, err := reader.Peek(headerSize)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if header[0] != 0x01 && header[0] != 0x02 {
|
||||
return false
|
||||
}
|
||||
// Docker's stream framing header reserves bytes 1-3 as zero.
|
||||
if header[1] != 0 || header[2] != 0 || header[3] != 0 {
|
||||
return false
|
||||
}
|
||||
frameLen := binary.BigEndian.Uint32(header[4:])
|
||||
return frameLen <= maxLogFrameSize
|
||||
}
|
||||
|
||||
func decodeDockerLogStream(reader io.Reader, builder *strings.Builder, multiplexed bool) error {
|
||||
if !multiplexed {
|
||||
_, err := io.Copy(builder, io.LimitReader(reader, maxTotalLogSize))
|
||||
|
||||
@@ -1,17 +1,24 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/deltatracker"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/container"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
@@ -19,6 +26,37 @@ import (
|
||||
|
||||
var defaultCacheTimeMs = uint16(60_000)
|
||||
|
||||
type recordingRoundTripper struct {
|
||||
statusCode int
|
||||
body string
|
||||
contentType string
|
||||
called bool
|
||||
lastPath string
|
||||
lastQuery map[string]string
|
||||
}
|
||||
|
||||
func (rt *recordingRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
|
||||
rt.called = true
|
||||
rt.lastPath = req.URL.EscapedPath()
|
||||
rt.lastQuery = map[string]string{}
|
||||
for key, values := range req.URL.Query() {
|
||||
if len(values) > 0 {
|
||||
rt.lastQuery[key] = values[0]
|
||||
}
|
||||
}
|
||||
resp := &http.Response{
|
||||
StatusCode: rt.statusCode,
|
||||
Status: "200 OK",
|
||||
Header: make(http.Header),
|
||||
Body: io.NopCloser(strings.NewReader(rt.body)),
|
||||
Request: req,
|
||||
}
|
||||
if rt.contentType != "" {
|
||||
resp.Header.Set("Content-Type", rt.contentType)
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
// cycleCpuDeltas cycles the CPU tracking data for a specific cache time interval
|
||||
func (dm *dockerManager) cycleCpuDeltas(cacheTimeMs uint16) {
|
||||
// Clear the CPU tracking maps for this cache time interval
|
||||
@@ -110,6 +148,72 @@ func TestCalculateMemoryUsage(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildDockerContainerEndpoint(t *testing.T) {
|
||||
t.Run("valid container ID builds escaped endpoint", func(t *testing.T) {
|
||||
endpoint, err := buildDockerContainerEndpoint("0123456789ab", "json", nil)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "http://localhost/containers/0123456789ab/json", endpoint)
|
||||
})
|
||||
|
||||
t.Run("invalid container ID is rejected", func(t *testing.T) {
|
||||
_, err := buildDockerContainerEndpoint("../../version", "json", nil)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "invalid container id")
|
||||
})
|
||||
}
|
||||
|
||||
func TestContainerDetailsRequestsValidateContainerID(t *testing.T) {
|
||||
rt := &recordingRoundTripper{
|
||||
statusCode: 200,
|
||||
body: `{"Config":{"Env":["SECRET=1"]}}`,
|
||||
}
|
||||
dm := &dockerManager{
|
||||
client: &http.Client{Transport: rt},
|
||||
}
|
||||
|
||||
_, err := dm.getContainerInfo(context.Background(), "../version")
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "invalid container id")
|
||||
assert.False(t, rt.called, "request should be rejected before dispatching to Docker API")
|
||||
}
|
||||
|
||||
func TestContainerDetailsRequestsUseExpectedDockerPaths(t *testing.T) {
|
||||
t.Run("container info uses container json endpoint", func(t *testing.T) {
|
||||
rt := &recordingRoundTripper{
|
||||
statusCode: 200,
|
||||
body: `{"Config":{"Env":["SECRET=1"]},"Name":"demo"}`,
|
||||
}
|
||||
dm := &dockerManager{
|
||||
client: &http.Client{Transport: rt},
|
||||
}
|
||||
|
||||
body, err := dm.getContainerInfo(context.Background(), "0123456789ab")
|
||||
require.NoError(t, err)
|
||||
assert.True(t, rt.called)
|
||||
assert.Equal(t, "/containers/0123456789ab/json", rt.lastPath)
|
||||
assert.NotContains(t, string(body), "SECRET=1", "sensitive env vars should be removed")
|
||||
})
|
||||
|
||||
t.Run("container logs uses expected endpoint and query params", func(t *testing.T) {
|
||||
rt := &recordingRoundTripper{
|
||||
statusCode: 200,
|
||||
body: "line1\nline2\n",
|
||||
}
|
||||
dm := &dockerManager{
|
||||
client: &http.Client{Transport: rt},
|
||||
}
|
||||
|
||||
logs, err := dm.getLogs(context.Background(), "abcdef123456")
|
||||
require.NoError(t, err)
|
||||
assert.True(t, rt.called)
|
||||
assert.Equal(t, "/containers/abcdef123456/logs", rt.lastPath)
|
||||
assert.Equal(t, "1", rt.lastQuery["stdout"])
|
||||
assert.Equal(t, "1", rt.lastQuery["stderr"])
|
||||
assert.Equal(t, "200", rt.lastQuery["tail"])
|
||||
assert.Equal(t, "line1\nline2\n", logs)
|
||||
})
|
||||
}
|
||||
|
||||
func TestValidateCpuPercentage(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -195,48 +299,6 @@ func TestUpdateContainerStatsValues(t *testing.T) {
|
||||
assert.Equal(t, testTime, stats.PrevReadTime)
|
||||
}
|
||||
|
||||
func TestTwoDecimals(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input float64
|
||||
expected float64
|
||||
}{
|
||||
{"round down", 1.234, 1.23},
|
||||
{"round half up", 1.235, 1.24}, // math.Round rounds half up
|
||||
{"no rounding needed", 1.23, 1.23},
|
||||
{"negative number", -1.235, -1.24}, // math.Round rounds half up (more negative)
|
||||
{"zero", 0.0, 0.0},
|
||||
{"large number", 123.456, 123.46}, // rounds 5 up
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := twoDecimals(tt.input)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytesToMegabytes(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input float64
|
||||
expected float64
|
||||
}{
|
||||
{"1 MB", 1048576, 1.0},
|
||||
{"512 KB", 524288, 0.5},
|
||||
{"zero", 0, 0},
|
||||
{"large value", 1073741824, 1024}, // 1 GB = 1024 MB
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := bytesToMegabytes(tt.input)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitializeCpuTracking(t *testing.T) {
|
||||
dm := &dockerManager{
|
||||
lastCpuContainer: make(map[uint16]map[string]uint64),
|
||||
@@ -379,6 +441,117 @@ func TestDockerManagerCreation(t *testing.T) {
|
||||
assert.NotNil(t, dm.networkRecvTrackers)
|
||||
}
|
||||
|
||||
func TestCheckDockerVersion(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
responses []struct {
|
||||
statusCode int
|
||||
body string
|
||||
}
|
||||
expectedGood bool
|
||||
expectedRequests int
|
||||
}{
|
||||
{
|
||||
name: "200 with good version on first try",
|
||||
responses: []struct {
|
||||
statusCode int
|
||||
body string
|
||||
}{
|
||||
{http.StatusOK, `{"Version":"25.0.1"}`},
|
||||
},
|
||||
expectedGood: true,
|
||||
expectedRequests: 1,
|
||||
},
|
||||
{
|
||||
name: "200 with old version on first try",
|
||||
responses: []struct {
|
||||
statusCode int
|
||||
body string
|
||||
}{
|
||||
{http.StatusOK, `{"Version":"24.0.7"}`},
|
||||
},
|
||||
expectedGood: false,
|
||||
expectedRequests: 1,
|
||||
},
|
||||
{
|
||||
name: "non-200 then 200 with good version",
|
||||
responses: []struct {
|
||||
statusCode int
|
||||
body string
|
||||
}{
|
||||
{http.StatusServiceUnavailable, `"not ready"`},
|
||||
{http.StatusOK, `{"Version":"25.1.0"}`},
|
||||
},
|
||||
expectedGood: true,
|
||||
expectedRequests: 2,
|
||||
},
|
||||
{
|
||||
name: "non-200 on all retries",
|
||||
responses: []struct {
|
||||
statusCode int
|
||||
body string
|
||||
}{
|
||||
{http.StatusInternalServerError, `"error"`},
|
||||
{http.StatusUnauthorized, `"error"`},
|
||||
},
|
||||
expectedGood: false,
|
||||
expectedRequests: 2,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
requestCount := 0
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
idx := requestCount
|
||||
requestCount++
|
||||
if idx >= len(tt.responses) {
|
||||
idx = len(tt.responses) - 1
|
||||
}
|
||||
w.WriteHeader(tt.responses[idx].statusCode)
|
||||
fmt.Fprint(w, tt.responses[idx].body)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
dm := &dockerManager{
|
||||
client: &http.Client{
|
||||
Transport: &http.Transport{
|
||||
DialContext: func(_ context.Context, network, _ string) (net.Conn, error) {
|
||||
return net.Dial(network, server.Listener.Addr().String())
|
||||
},
|
||||
},
|
||||
},
|
||||
retrySleep: func(time.Duration) {},
|
||||
}
|
||||
|
||||
dm.checkDockerVersion()
|
||||
|
||||
assert.Equal(t, tt.expectedGood, dm.goodDockerVersion)
|
||||
assert.Equal(t, tt.expectedRequests, requestCount)
|
||||
})
|
||||
}
|
||||
|
||||
t.Run("request error on all retries", func(t *testing.T) {
|
||||
requestCount := 0
|
||||
dm := &dockerManager{
|
||||
client: &http.Client{
|
||||
Transport: &http.Transport{
|
||||
DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
|
||||
requestCount++
|
||||
return nil, errors.New("connection refused")
|
||||
},
|
||||
},
|
||||
},
|
||||
retrySleep: func(time.Duration) {},
|
||||
}
|
||||
|
||||
dm.checkDockerVersion()
|
||||
|
||||
assert.False(t, dm.goodDockerVersion)
|
||||
assert.Equal(t, 2, requestCount)
|
||||
})
|
||||
}
|
||||
|
||||
func TestCycleCpuDeltas(t *testing.T) {
|
||||
dm := &dockerManager{
|
||||
lastCpuContainer: map[uint16]map[string]uint64{
|
||||
@@ -691,14 +864,50 @@ func TestContainerStatsEndToEndWithRealData(t *testing.T) {
|
||||
updateContainerStatsValues(testStats, cpuPct, usedMemory, 1000000, 500000, testTime)
|
||||
|
||||
assert.Equal(t, cpuPct, testStats.Cpu)
|
||||
assert.Equal(t, bytesToMegabytes(float64(usedMemory)), testStats.Mem)
|
||||
assert.Equal(t, utils.BytesToMegabytes(float64(usedMemory)), testStats.Mem)
|
||||
assert.Equal(t, [2]uint64{1000000, 500000}, testStats.Bandwidth)
|
||||
// Deprecated fields still populated for backward compatibility with older hubs
|
||||
assert.Equal(t, bytesToMegabytes(1000000), testStats.NetworkSent)
|
||||
assert.Equal(t, bytesToMegabytes(500000), testStats.NetworkRecv)
|
||||
assert.Equal(t, utils.BytesToMegabytes(1000000), testStats.NetworkSent)
|
||||
assert.Equal(t, utils.BytesToMegabytes(500000), testStats.NetworkRecv)
|
||||
assert.Equal(t, testTime, testStats.PrevReadTime)
|
||||
}
|
||||
|
||||
func TestGetLogsDetectsMultiplexedWithoutContentType(t *testing.T) {
|
||||
// Docker multiplexed frame: [stream][0,0,0][len(4 bytes BE)][payload]
|
||||
frame := []byte{
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05,
|
||||
'H', 'e', 'l', 'l', 'o',
|
||||
}
|
||||
rt := &recordingRoundTripper{
|
||||
statusCode: 200,
|
||||
body: string(frame),
|
||||
// Intentionally omit content type to simulate Podman behavior.
|
||||
}
|
||||
dm := &dockerManager{
|
||||
client: &http.Client{Transport: rt},
|
||||
}
|
||||
|
||||
logs, err := dm.getLogs(context.Background(), "abcdef123456")
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "Hello", logs)
|
||||
}
|
||||
|
||||
func TestGetLogsDoesNotMisclassifyRawStreamAsMultiplexed(t *testing.T) {
|
||||
// Starts with 0x01, but doesn't match Docker frame signature (reserved bytes aren't all zero).
|
||||
raw := []byte{0x01, 0x02, 0x03, 0x04, 'r', 'a', 'w'}
|
||||
rt := &recordingRoundTripper{
|
||||
statusCode: 200,
|
||||
body: string(raw),
|
||||
}
|
||||
dm := &dockerManager{
|
||||
client: &http.Client{Transport: rt},
|
||||
}
|
||||
|
||||
logs, err := dm.getLogs(context.Background(), "abcdef123456")
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, raw, []byte(logs))
|
||||
}
|
||||
|
||||
func TestEdgeCasesWithRealData(t *testing.T) {
|
||||
// Test with minimal container stats
|
||||
minimalStats := &container.ApiStats{
|
||||
@@ -940,13 +1149,13 @@ func TestConstantsAndUtilityFunctions(t *testing.T) {
|
||||
assert.Equal(t, 5*1024*1024, maxTotalLogSize) // 5MB
|
||||
|
||||
// Test utility functions
|
||||
assert.Equal(t, 1.5, twoDecimals(1.499))
|
||||
assert.Equal(t, 1.5, twoDecimals(1.5))
|
||||
assert.Equal(t, 1.5, twoDecimals(1.501))
|
||||
assert.Equal(t, 1.5, utils.TwoDecimals(1.499))
|
||||
assert.Equal(t, 1.5, utils.TwoDecimals(1.5))
|
||||
assert.Equal(t, 1.5, utils.TwoDecimals(1.501))
|
||||
|
||||
assert.Equal(t, 1.0, bytesToMegabytes(1048576)) // 1 MB
|
||||
assert.Equal(t, 0.5, bytesToMegabytes(524288)) // 512 KB
|
||||
assert.Equal(t, 0.0, bytesToMegabytes(0))
|
||||
assert.Equal(t, 1.0, utils.BytesToMegabytes(1048576)) // 1 MB
|
||||
assert.Equal(t, 0.5, utils.BytesToMegabytes(524288)) // 512 KB
|
||||
assert.Equal(t, 0.0, utils.BytesToMegabytes(0))
|
||||
}
|
||||
|
||||
func TestDecodeDockerLogStream(t *testing.T) {
|
||||
|
||||
95
agent/emmc_common.go
Normal file
95
agent/emmc_common.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func isEmmcBlockName(name string) bool {
|
||||
if !strings.HasPrefix(name, "mmcblk") {
|
||||
return false
|
||||
}
|
||||
suffix := strings.TrimPrefix(name, "mmcblk")
|
||||
if suffix == "" {
|
||||
return false
|
||||
}
|
||||
for _, c := range suffix {
|
||||
if c < '0' || c > '9' {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func parseHexOrDecByte(s string) (uint8, bool) {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" {
|
||||
return 0, false
|
||||
}
|
||||
base := 10
|
||||
if strings.HasPrefix(s, "0x") || strings.HasPrefix(s, "0X") {
|
||||
base = 16
|
||||
s = s[2:]
|
||||
}
|
||||
parsed, err := strconv.ParseUint(s, base, 8)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return uint8(parsed), true
|
||||
}
|
||||
|
||||
func parseHexBytePair(s string) (uint8, uint8, bool) {
|
||||
fields := strings.Fields(s)
|
||||
if len(fields) < 2 {
|
||||
return 0, 0, false
|
||||
}
|
||||
a, okA := parseHexOrDecByte(fields[0])
|
||||
b, okB := parseHexOrDecByte(fields[1])
|
||||
if !okA && !okB {
|
||||
return 0, 0, false
|
||||
}
|
||||
return a, b, true
|
||||
}
|
||||
|
||||
func emmcSmartStatus(preEOL uint8) string {
|
||||
switch preEOL {
|
||||
case 0x01:
|
||||
return "PASSED"
|
||||
case 0x02:
|
||||
return "WARNING"
|
||||
case 0x03:
|
||||
return "FAILED"
|
||||
default:
|
||||
return "UNKNOWN"
|
||||
}
|
||||
}
|
||||
|
||||
func emmcPreEOLString(preEOL uint8) string {
|
||||
switch preEOL {
|
||||
case 0x01:
|
||||
return "0x01 (normal)"
|
||||
case 0x02:
|
||||
return "0x02 (warning)"
|
||||
case 0x03:
|
||||
return "0x03 (urgent)"
|
||||
default:
|
||||
return fmt.Sprintf("0x%02x", preEOL)
|
||||
}
|
||||
}
|
||||
|
||||
func emmcLifeTimeString(v uint8) string {
|
||||
// JEDEC eMMC: 0x01..0x0A => 0-100% used in 10% steps, 0x0B => exceeded.
|
||||
switch {
|
||||
case v == 0:
|
||||
return "0x00 (not reported)"
|
||||
case v >= 0x01 && v <= 0x0A:
|
||||
low := int(v-1) * 10
|
||||
high := int(v) * 10
|
||||
return fmt.Sprintf("0x%02x (%d-%d%% used)", v, low, high)
|
||||
case v == 0x0B:
|
||||
return "0x0b (>100% used)"
|
||||
default:
|
||||
return fmt.Sprintf("0x%02x", v)
|
||||
}
|
||||
}
|
||||
78
agent/emmc_common_test.go
Normal file
78
agent/emmc_common_test.go
Normal file
@@ -0,0 +1,78 @@
|
||||
package agent
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseHexOrDecByte(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
want uint8
|
||||
ok bool
|
||||
}{
|
||||
{"0x01", 1, true},
|
||||
{"0X0b", 11, true},
|
||||
{"01", 1, true},
|
||||
{" 3 ", 3, true},
|
||||
{"", 0, false},
|
||||
{"0x", 0, false},
|
||||
{"nope", 0, false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got, ok := parseHexOrDecByte(tt.in)
|
||||
if ok != tt.ok || got != tt.want {
|
||||
t.Fatalf("parseHexOrDecByte(%q) = (%d,%v), want (%d,%v)", tt.in, got, ok, tt.want, tt.ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseHexBytePair(t *testing.T) {
|
||||
a, b, ok := parseHexBytePair("0x01 0x02\n")
|
||||
if !ok || a != 1 || b != 2 {
|
||||
t.Fatalf("parseHexBytePair hex = (%d,%d,%v), want (1,2,true)", a, b, ok)
|
||||
}
|
||||
|
||||
a, b, ok = parseHexBytePair("01 02")
|
||||
if !ok || a != 1 || b != 2 {
|
||||
t.Fatalf("parseHexBytePair dec = (%d,%d,%v), want (1,2,true)", a, b, ok)
|
||||
}
|
||||
|
||||
_, _, ok = parseHexBytePair("0x01")
|
||||
if ok {
|
||||
t.Fatalf("parseHexBytePair short input ok=true, want false")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEmmcSmartStatus(t *testing.T) {
|
||||
if got := emmcSmartStatus(0x01); got != "PASSED" {
|
||||
t.Fatalf("emmcSmartStatus(0x01) = %q, want PASSED", got)
|
||||
}
|
||||
if got := emmcSmartStatus(0x02); got != "WARNING" {
|
||||
t.Fatalf("emmcSmartStatus(0x02) = %q, want WARNING", got)
|
||||
}
|
||||
if got := emmcSmartStatus(0x03); got != "FAILED" {
|
||||
t.Fatalf("emmcSmartStatus(0x03) = %q, want FAILED", got)
|
||||
}
|
||||
if got := emmcSmartStatus(0x00); got != "UNKNOWN" {
|
||||
t.Fatalf("emmcSmartStatus(0x00) = %q, want UNKNOWN", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsEmmcBlockName(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
ok bool
|
||||
}{
|
||||
{"mmcblk0", true},
|
||||
{"mmcblk1", true},
|
||||
{"mmcblk10", true},
|
||||
{"mmcblk0p1", false},
|
||||
{"sda", false},
|
||||
{"mmcblk", false},
|
||||
{"mmcblkA", false},
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := isEmmcBlockName(c.name); got != c.ok {
|
||||
t.Fatalf("isEmmcBlockName(%q) = %v, want %v", c.name, got, c.ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
215
agent/emmc_linux.go
Normal file
215
agent/emmc_linux.go
Normal file
@@ -0,0 +1,215 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/smart"
|
||||
)
|
||||
|
||||
// emmcSysfsRoot is a test hook; production value is "/sys".
|
||||
var emmcSysfsRoot = "/sys"
|
||||
|
||||
type emmcHealth struct {
|
||||
model string
|
||||
serial string
|
||||
revision string
|
||||
capacity uint64
|
||||
preEOL uint8
|
||||
lifeA uint8
|
||||
lifeB uint8
|
||||
}
|
||||
|
||||
func scanEmmcDevices() []*DeviceInfo {
|
||||
blockDir := filepath.Join(emmcSysfsRoot, "class", "block")
|
||||
entries, err := os.ReadDir(blockDir)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
devices := make([]*DeviceInfo, 0, 2)
|
||||
for _, ent := range entries {
|
||||
name := ent.Name()
|
||||
if !isEmmcBlockName(name) {
|
||||
continue
|
||||
}
|
||||
|
||||
deviceDir := filepath.Join(blockDir, name, "device")
|
||||
if !hasEmmcHealthFiles(deviceDir) {
|
||||
continue
|
||||
}
|
||||
|
||||
devPath := filepath.Join("/dev", name)
|
||||
devices = append(devices, &DeviceInfo{
|
||||
Name: devPath,
|
||||
Type: "emmc",
|
||||
InfoName: devPath + " [eMMC]",
|
||||
Protocol: "MMC",
|
||||
})
|
||||
}
|
||||
|
||||
return devices
|
||||
}
|
||||
|
||||
func (sm *SmartManager) collectEmmcHealth(deviceInfo *DeviceInfo) (bool, error) {
|
||||
if deviceInfo == nil || deviceInfo.Name == "" {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
base := filepath.Base(deviceInfo.Name)
|
||||
if !isEmmcBlockName(base) && !strings.EqualFold(deviceInfo.Type, "emmc") && !strings.EqualFold(deviceInfo.Type, "mmc") {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
health, ok := readEmmcHealth(base)
|
||||
if !ok {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Normalize the device type to keep pruning logic stable across refreshes.
|
||||
deviceInfo.Type = "emmc"
|
||||
|
||||
key := health.serial
|
||||
if key == "" {
|
||||
key = filepath.Join("/dev", base)
|
||||
}
|
||||
|
||||
status := emmcSmartStatus(health.preEOL)
|
||||
|
||||
attrs := []*smart.SmartAttribute{
|
||||
{
|
||||
Name: "PreEOLInfo",
|
||||
RawValue: uint64(health.preEOL),
|
||||
RawString: emmcPreEOLString(health.preEOL),
|
||||
},
|
||||
{
|
||||
Name: "DeviceLifeTimeEstA",
|
||||
RawValue: uint64(health.lifeA),
|
||||
RawString: emmcLifeTimeString(health.lifeA),
|
||||
},
|
||||
{
|
||||
Name: "DeviceLifeTimeEstB",
|
||||
RawValue: uint64(health.lifeB),
|
||||
RawString: emmcLifeTimeString(health.lifeB),
|
||||
},
|
||||
}
|
||||
|
||||
sm.Lock()
|
||||
defer sm.Unlock()
|
||||
|
||||
if _, exists := sm.SmartDataMap[key]; !exists {
|
||||
sm.SmartDataMap[key] = &smart.SmartData{}
|
||||
}
|
||||
|
||||
data := sm.SmartDataMap[key]
|
||||
data.ModelName = health.model
|
||||
data.SerialNumber = health.serial
|
||||
data.FirmwareVersion = health.revision
|
||||
data.Capacity = health.capacity
|
||||
data.Temperature = 0
|
||||
data.SmartStatus = status
|
||||
data.DiskName = filepath.Join("/dev", base)
|
||||
data.DiskType = "emmc"
|
||||
data.Attributes = attrs
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func readEmmcHealth(blockName string) (emmcHealth, bool) {
|
||||
var out emmcHealth
|
||||
|
||||
if !isEmmcBlockName(blockName) {
|
||||
return out, false
|
||||
}
|
||||
|
||||
deviceDir := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "device")
|
||||
preEOL, okPre := readHexByteFile(filepath.Join(deviceDir, "pre_eol_info"))
|
||||
|
||||
// Some kernels expose EXT_CSD lifetime via "life_time" (two bytes), others as
|
||||
// separate files. Support both.
|
||||
lifeA, lifeB, okLife := readLifeTime(deviceDir)
|
||||
|
||||
if !okPre && !okLife {
|
||||
return out, false
|
||||
}
|
||||
|
||||
out.preEOL = preEOL
|
||||
out.lifeA = lifeA
|
||||
out.lifeB = lifeB
|
||||
|
||||
out.model = utils.ReadStringFile(filepath.Join(deviceDir, "name"))
|
||||
out.serial = utils.ReadStringFile(filepath.Join(deviceDir, "serial"))
|
||||
out.revision = utils.ReadStringFile(filepath.Join(deviceDir, "prv"))
|
||||
|
||||
if capBytes, ok := readBlockCapacityBytes(blockName); ok {
|
||||
out.capacity = capBytes
|
||||
}
|
||||
|
||||
return out, true
|
||||
}
|
||||
|
||||
func readLifeTime(deviceDir string) (uint8, uint8, bool) {
|
||||
if content, ok := utils.ReadStringFileOK(filepath.Join(deviceDir, "life_time")); ok {
|
||||
a, b, ok := parseHexBytePair(content)
|
||||
return a, b, ok
|
||||
}
|
||||
|
||||
a, okA := readHexByteFile(filepath.Join(deviceDir, "device_life_time_est_typ_a"))
|
||||
b, okB := readHexByteFile(filepath.Join(deviceDir, "device_life_time_est_typ_b"))
|
||||
if okA || okB {
|
||||
return a, b, true
|
||||
}
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func readBlockCapacityBytes(blockName string) (uint64, bool) {
|
||||
sizePath := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "size")
|
||||
lbsPath := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "queue", "logical_block_size")
|
||||
|
||||
sizeStr, ok := utils.ReadStringFileOK(sizePath)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
sectors, err := strconv.ParseUint(sizeStr, 10, 64)
|
||||
if err != nil || sectors == 0 {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
lbsStr, ok := utils.ReadStringFileOK(lbsPath)
|
||||
logicalBlockSize := uint64(512)
|
||||
if ok {
|
||||
if parsed, err := strconv.ParseUint(lbsStr, 10, 64); err == nil && parsed > 0 {
|
||||
logicalBlockSize = parsed
|
||||
}
|
||||
}
|
||||
|
||||
return sectors * logicalBlockSize, true
|
||||
}
|
||||
|
||||
func readHexByteFile(path string) (uint8, bool) {
|
||||
content, ok := utils.ReadStringFileOK(path)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
b, ok := parseHexOrDecByte(content)
|
||||
return b, ok
|
||||
}
|
||||
|
||||
func hasEmmcHealthFiles(deviceDir string) bool {
|
||||
entries, err := os.ReadDir(deviceDir)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
for _, ent := range entries {
|
||||
switch ent.Name() {
|
||||
case "pre_eol_info", "life_time", "device_life_time_est_typ_a", "device_life_time_est_typ_b":
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
80
agent/emmc_linux_test.go
Normal file
80
agent/emmc_linux_test.go
Normal file
@@ -0,0 +1,80 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/smart"
|
||||
)
|
||||
|
||||
func TestEmmcMockSysfsScanAndCollect(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
prev := emmcSysfsRoot
|
||||
emmcSysfsRoot = tmp
|
||||
t.Cleanup(func() { emmcSysfsRoot = prev })
|
||||
|
||||
// Fake: /sys/class/block/mmcblk0
|
||||
mmcDeviceDir := filepath.Join(tmp, "class", "block", "mmcblk0", "device")
|
||||
mmcQueueDir := filepath.Join(tmp, "class", "block", "mmcblk0", "queue")
|
||||
if err := os.MkdirAll(mmcDeviceDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(mmcQueueDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
write := func(path, content string) {
|
||||
t.Helper()
|
||||
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
write(filepath.Join(mmcDeviceDir, "pre_eol_info"), "0x02\n")
|
||||
write(filepath.Join(mmcDeviceDir, "life_time"), "0x04 0x05\n")
|
||||
write(filepath.Join(mmcDeviceDir, "name"), "H26M52103FMR\n")
|
||||
write(filepath.Join(mmcDeviceDir, "serial"), "01234567\n")
|
||||
write(filepath.Join(mmcDeviceDir, "prv"), "0x08\n")
|
||||
write(filepath.Join(mmcQueueDir, "logical_block_size"), "512\n")
|
||||
write(filepath.Join(tmp, "class", "block", "mmcblk0", "size"), "1024\n") // sectors
|
||||
|
||||
devs := scanEmmcDevices()
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("scanEmmcDevices() = %d devices, want 1", len(devs))
|
||||
}
|
||||
if devs[0].Name != "/dev/mmcblk0" || devs[0].Type != "emmc" {
|
||||
t.Fatalf("scanEmmcDevices()[0] = %+v, want Name=/dev/mmcblk0 Type=emmc", devs[0])
|
||||
}
|
||||
|
||||
sm := &SmartManager{SmartDataMap: map[string]*smart.SmartData{}}
|
||||
ok, err := sm.collectEmmcHealth(devs[0])
|
||||
if err != nil || !ok {
|
||||
t.Fatalf("collectEmmcHealth() = (ok=%v, err=%v), want (true,nil)", ok, err)
|
||||
}
|
||||
if len(sm.SmartDataMap) != 1 {
|
||||
t.Fatalf("SmartDataMap len=%d, want 1", len(sm.SmartDataMap))
|
||||
}
|
||||
var got *smart.SmartData
|
||||
for _, v := range sm.SmartDataMap {
|
||||
got = v
|
||||
break
|
||||
}
|
||||
if got == nil {
|
||||
t.Fatalf("SmartDataMap value nil")
|
||||
}
|
||||
if got.DiskType != "emmc" || got.DiskName != "/dev/mmcblk0" {
|
||||
t.Fatalf("disk fields = (type=%q name=%q), want (emmc,/dev/mmcblk0)", got.DiskType, got.DiskName)
|
||||
}
|
||||
if got.SmartStatus != "WARNING" {
|
||||
t.Fatalf("SmartStatus=%q, want WARNING", got.SmartStatus)
|
||||
}
|
||||
if got.SerialNumber != "01234567" || got.ModelName == "" || got.Capacity == 0 {
|
||||
t.Fatalf("identity fields = (model=%q serial=%q cap=%d), want non-empty model, serial 01234567, cap>0", got.ModelName, got.SerialNumber, got.Capacity)
|
||||
}
|
||||
if len(got.Attributes) < 3 {
|
||||
t.Fatalf("attributes len=%d, want >= 3", len(got.Attributes))
|
||||
}
|
||||
}
|
||||
14
agent/emmc_stub.go
Normal file
14
agent/emmc_stub.go
Normal file
@@ -0,0 +1,14 @@
|
||||
//go:build !linux
|
||||
|
||||
package agent
|
||||
|
||||
// Non-Linux builds: eMMC health via sysfs is not available.
|
||||
|
||||
func scanEmmcDevices() []*DeviceInfo {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (sm *SmartManager) collectEmmcHealth(deviceInfo *DeviceInfo) (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
87
agent/fingerprint.go
Normal file
87
agent/fingerprint.go
Normal file
@@ -0,0 +1,87 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/shirou/gopsutil/v4/cpu"
|
||||
"github.com/shirou/gopsutil/v4/host"
|
||||
)
|
||||
|
||||
const fingerprintFileName = "fingerprint"
|
||||
|
||||
// knownBadUUID is a commonly known "product_uuid" that is not unique across systems.
|
||||
const knownBadUUID = "03000200-0400-0500-0006-000700080009"
|
||||
|
||||
// GetFingerprint returns the agent fingerprint. It first tries to read a saved
|
||||
// fingerprint from the data directory. If not found (or dataDir is empty), it
|
||||
// generates one from system properties. The hostname and cpuModel parameters are
|
||||
// used as fallback material if host.HostID() fails. If either is empty, they
|
||||
// are fetched from the system automatically.
|
||||
//
|
||||
// If a new fingerprint is generated and a dataDir is provided, it is saved.
|
||||
func GetFingerprint(dataDir, hostname, cpuModel string) string {
|
||||
if dataDir != "" {
|
||||
if fp, err := readFingerprint(dataDir); err == nil {
|
||||
return fp
|
||||
}
|
||||
}
|
||||
fp := generateFingerprint(hostname, cpuModel)
|
||||
if dataDir != "" {
|
||||
_ = SaveFingerprint(dataDir, fp)
|
||||
}
|
||||
return fp
|
||||
}
|
||||
|
||||
// generateFingerprint creates a fingerprint from system properties.
|
||||
// It tries host.HostID() first, falling back to hostname + cpuModel.
|
||||
// If hostname or cpuModel are empty, they are fetched from the system.
|
||||
func generateFingerprint(hostname, cpuModel string) string {
|
||||
fingerprint, err := host.HostID()
|
||||
if err != nil || fingerprint == "" || fingerprint == knownBadUUID {
|
||||
if hostname == "" {
|
||||
hostname, _ = os.Hostname()
|
||||
}
|
||||
if cpuModel == "" {
|
||||
if info, err := cpu.Info(); err == nil && len(info) > 0 {
|
||||
cpuModel = info[0].ModelName
|
||||
}
|
||||
}
|
||||
fingerprint = hostname + cpuModel
|
||||
}
|
||||
|
||||
sum := sha256.Sum256([]byte(fingerprint))
|
||||
return hex.EncodeToString(sum[:24])
|
||||
}
|
||||
|
||||
// readFingerprint reads the saved fingerprint from the data directory.
|
||||
func readFingerprint(dataDir string) (string, error) {
|
||||
fp, err := os.ReadFile(filepath.Join(dataDir, fingerprintFileName))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
s := strings.TrimSpace(string(fp))
|
||||
if s == "" {
|
||||
return "", errors.New("fingerprint file is empty")
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// SaveFingerprint writes the fingerprint to the data directory.
|
||||
func SaveFingerprint(dataDir, fingerprint string) error {
|
||||
return os.WriteFile(filepath.Join(dataDir, fingerprintFileName), []byte(fingerprint), 0o644)
|
||||
}
|
||||
|
||||
// DeleteFingerprint removes the saved fingerprint file from the data directory.
|
||||
// Returns nil if the file does not exist (idempotent).
|
||||
func DeleteFingerprint(dataDir string) error {
|
||||
err := os.Remove(filepath.Join(dataDir, fingerprintFileName))
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
102
agent/fingerprint_test.go
Normal file
102
agent/fingerprint_test.go
Normal file
@@ -0,0 +1,102 @@
|
||||
//go:build testing
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestGetFingerprint(t *testing.T) {
|
||||
t.Run("reads existing fingerprint from file", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
expected := "abc123def456"
|
||||
err := os.WriteFile(filepath.Join(dir, fingerprintFileName), []byte(expected), 0644)
|
||||
require.NoError(t, err)
|
||||
|
||||
fp := GetFingerprint(dir, "", "")
|
||||
assert.Equal(t, expected, fp)
|
||||
})
|
||||
|
||||
t.Run("trims whitespace from file", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
err := os.WriteFile(filepath.Join(dir, fingerprintFileName), []byte(" abc123 \n"), 0644)
|
||||
require.NoError(t, err)
|
||||
|
||||
fp := GetFingerprint(dir, "", "")
|
||||
assert.Equal(t, "abc123", fp)
|
||||
})
|
||||
|
||||
t.Run("generates fingerprint when file does not exist", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
fp := GetFingerprint(dir, "", "")
|
||||
assert.NotEmpty(t, fp)
|
||||
})
|
||||
|
||||
t.Run("generates fingerprint when dataDir is empty", func(t *testing.T) {
|
||||
fp := GetFingerprint("", "", "")
|
||||
assert.NotEmpty(t, fp)
|
||||
})
|
||||
|
||||
t.Run("generates consistent fingerprint for same inputs", func(t *testing.T) {
|
||||
fp1 := GetFingerprint("", "myhost", "mycpu")
|
||||
fp2 := GetFingerprint("", "myhost", "mycpu")
|
||||
assert.Equal(t, fp1, fp2)
|
||||
})
|
||||
|
||||
t.Run("prefers saved fingerprint over generated", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
require.NoError(t, SaveFingerprint(dir, "saved-fp"))
|
||||
|
||||
fp := GetFingerprint(dir, "anyhost", "anycpu")
|
||||
assert.Equal(t, "saved-fp", fp)
|
||||
})
|
||||
}
|
||||
|
||||
func TestSaveFingerprint(t *testing.T) {
|
||||
t.Run("saves fingerprint to file", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
err := SaveFingerprint(dir, "abc123")
|
||||
require.NoError(t, err)
|
||||
|
||||
content, err := os.ReadFile(filepath.Join(dir, fingerprintFileName))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "abc123", string(content))
|
||||
})
|
||||
|
||||
t.Run("overwrites existing fingerprint", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
require.NoError(t, SaveFingerprint(dir, "old"))
|
||||
require.NoError(t, SaveFingerprint(dir, "new"))
|
||||
|
||||
content, err := os.ReadFile(filepath.Join(dir, fingerprintFileName))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "new", string(content))
|
||||
})
|
||||
}
|
||||
|
||||
func TestDeleteFingerprint(t *testing.T) {
|
||||
t.Run("deletes existing fingerprint", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
fp := filepath.Join(dir, fingerprintFileName)
|
||||
err := os.WriteFile(fp, []byte("abc123"), 0644)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = DeleteFingerprint(dir)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify file is gone
|
||||
_, err = os.Stat(fp)
|
||||
assert.True(t, os.IsNotExist(err))
|
||||
})
|
||||
|
||||
t.Run("no error when file does not exist", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
err := DeleteFingerprint(dir)
|
||||
assert.NoError(t, err)
|
||||
})
|
||||
}
|
||||
486
agent/gpu.go
486
agent/gpu.go
@@ -9,25 +9,26 @@ import (
|
||||
"maps"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
)
|
||||
|
||||
const (
|
||||
// Commands
|
||||
nvidiaSmiCmd string = "nvidia-smi"
|
||||
rocmSmiCmd string = "rocm-smi"
|
||||
amdgpuCmd string = "amdgpu" // internal cmd for sysfs collection
|
||||
tegraStatsCmd string = "tegrastats"
|
||||
nvidiaSmiCmd string = "nvidia-smi"
|
||||
rocmSmiCmd string = "rocm-smi"
|
||||
tegraStatsCmd string = "tegrastats"
|
||||
nvtopCmd string = "nvtop"
|
||||
powermetricsCmd string = "powermetrics"
|
||||
macmonCmd string = "macmon"
|
||||
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
|
||||
|
||||
// Polling intervals
|
||||
nvidiaSmiInterval string = "4" // in seconds
|
||||
tegraStatsInterval string = "3700" // in milliseconds
|
||||
rocmSmiInterval time.Duration = 4300 * time.Millisecond
|
||||
// Command retry and timeout constants
|
||||
retryWaitTime time.Duration = 5 * time.Second
|
||||
maxFailureRetries int = 5
|
||||
@@ -40,13 +41,7 @@ const (
|
||||
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
||||
type GPUManager struct {
|
||||
sync.Mutex
|
||||
nvidiaSmi bool
|
||||
rocmSmi bool
|
||||
amdgpu bool
|
||||
tegrastats bool
|
||||
intelGpuStats bool
|
||||
nvml bool
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
// lastAvgData stores the last calculated averages for each GPU
|
||||
// Used when a collection happens before new data arrives (Count == 0)
|
||||
lastAvgData map[string]system.GPUData
|
||||
@@ -87,6 +82,58 @@ type gpuCollector struct {
|
||||
|
||||
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
||||
|
||||
// collectorSource identifies a selectable GPU collector in GPU_COLLECTOR.
|
||||
type collectorSource string
|
||||
|
||||
const (
|
||||
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
|
||||
collectorSourceNVML collectorSource = "nvml"
|
||||
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
|
||||
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
|
||||
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
|
||||
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
|
||||
collectorSourceMacmon collectorSource = collectorSource(macmonCmd)
|
||||
collectorSourcePowermetrics collectorSource = collectorSource(powermetricsCmd)
|
||||
collectorGroupNvidia string = "nvidia"
|
||||
collectorGroupIntel string = "intel"
|
||||
collectorGroupAmd string = "amd"
|
||||
collectorGroupApple string = "apple"
|
||||
)
|
||||
|
||||
func isValidCollectorSource(source collectorSource) bool {
|
||||
switch source {
|
||||
case collectorSourceNVTop,
|
||||
collectorSourceNVML,
|
||||
collectorSourceNvidiaSMI,
|
||||
collectorSourceIntelGpuTop,
|
||||
collectorSourceAmdSysfs,
|
||||
collectorSourceRocmSMI,
|
||||
collectorSourceMacmon,
|
||||
collectorSourcePowermetrics:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// gpuCapabilities describes detected GPU tooling and sysfs support on the host.
|
||||
type gpuCapabilities struct {
|
||||
hasNvidiaSmi bool
|
||||
hasRocmSmi bool
|
||||
hasAmdSysfs bool
|
||||
hasTegrastats bool
|
||||
hasIntelGpuTop bool
|
||||
hasNvtop bool
|
||||
hasMacmon bool
|
||||
hasPowermetrics bool
|
||||
}
|
||||
|
||||
type collectorDefinition struct {
|
||||
group string
|
||||
available bool
|
||||
start func(onFailure func()) bool
|
||||
deprecationWarning string
|
||||
}
|
||||
|
||||
// starts and manages the ongoing collection of GPU data for the specified GPU management utility
|
||||
func (c *gpuCollector) start() {
|
||||
for {
|
||||
@@ -245,8 +292,8 @@ func (gm *GPUManager) parseAmdData(output []byte) bool {
|
||||
}
|
||||
gpu := gm.GpuDataMap[id]
|
||||
gpu.Temperature, _ = strconv.ParseFloat(v.Temperature, 64)
|
||||
gpu.MemoryUsed = bytesToMegabytes(memoryUsage)
|
||||
gpu.MemoryTotal = bytesToMegabytes(totalMemory)
|
||||
gpu.MemoryUsed = utils.BytesToMegabytes(memoryUsage)
|
||||
gpu.MemoryTotal = utils.BytesToMegabytes(totalMemory)
|
||||
gpu.Usage += usage
|
||||
gpu.Power += power
|
||||
gpu.Count++
|
||||
@@ -320,16 +367,16 @@ func (gm *GPUManager) calculateGPUAverage(id string, gpu *system.GPUData, cacheK
|
||||
gpuAvg := *gpu
|
||||
deltaUsage, deltaPower, deltaPowerPkg := gm.calculateDeltas(gpu, lastSnapshot)
|
||||
|
||||
gpuAvg.Power = twoDecimals(deltaPower / float64(deltaCount))
|
||||
gpuAvg.Power = utils.TwoDecimals(deltaPower / float64(deltaCount))
|
||||
|
||||
if gpu.Engines != nil {
|
||||
// make fresh map for averaged engine metrics to avoid mutating
|
||||
// the accumulator map stored in gm.GpuDataMap
|
||||
gpuAvg.Engines = make(map[string]float64, len(gpu.Engines))
|
||||
gpuAvg.Usage = gm.calculateIntelGPUUsage(&gpuAvg, gpu, lastSnapshot, deltaCount)
|
||||
gpuAvg.PowerPkg = twoDecimals(deltaPowerPkg / float64(deltaCount))
|
||||
gpuAvg.PowerPkg = utils.TwoDecimals(deltaPowerPkg / float64(deltaCount))
|
||||
} else {
|
||||
gpuAvg.Usage = twoDecimals(deltaUsage / float64(deltaCount))
|
||||
gpuAvg.Usage = utils.TwoDecimals(deltaUsage / float64(deltaCount))
|
||||
}
|
||||
|
||||
gm.lastAvgData[id] = gpuAvg
|
||||
@@ -364,17 +411,17 @@ func (gm *GPUManager) calculateIntelGPUUsage(gpuAvg, gpu *system.GPUData, lastSn
|
||||
} else {
|
||||
deltaEngine = engine
|
||||
}
|
||||
gpuAvg.Engines[name] = twoDecimals(deltaEngine / float64(deltaCount))
|
||||
gpuAvg.Engines[name] = utils.TwoDecimals(deltaEngine / float64(deltaCount))
|
||||
maxEngineUsage = max(maxEngineUsage, deltaEngine/float64(deltaCount))
|
||||
}
|
||||
return twoDecimals(maxEngineUsage)
|
||||
return utils.TwoDecimals(maxEngineUsage)
|
||||
}
|
||||
|
||||
// updateInstantaneousValues updates values that should reflect current state, not averages
|
||||
func (gm *GPUManager) updateInstantaneousValues(gpuAvg *system.GPUData, gpu *system.GPUData) {
|
||||
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
|
||||
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
|
||||
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
|
||||
gpuAvg.Temperature = utils.TwoDecimals(gpu.Temperature)
|
||||
gpuAvg.MemoryUsed = utils.TwoDecimals(gpu.MemoryUsed)
|
||||
gpuAvg.MemoryTotal = utils.TwoDecimals(gpu.MemoryTotal)
|
||||
}
|
||||
|
||||
// storeSnapshot saves the current GPU state for this cache key
|
||||
@@ -392,133 +439,324 @@ func (gm *GPUManager) storeSnapshot(id string, gpu *system.GPUData, cacheKey uin
|
||||
gm.lastSnapshots[cacheKey][id] = snapshot
|
||||
}
|
||||
|
||||
// detectGPUs checks for the presence of GPU management tools (nvidia-smi, rocm-smi, tegrastats)
|
||||
// in the system path. It sets the corresponding flags in the GPUManager struct if any of these
|
||||
// tools are found. If none of the tools are found, it returns an error indicating that no GPU
|
||||
// management tools are available.
|
||||
func (gm *GPUManager) detectGPUs() error {
|
||||
// discoverGpuCapabilities checks for available GPU tooling and sysfs support.
|
||||
// It only reports capability presence and does not apply policy decisions.
|
||||
func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
|
||||
caps := gpuCapabilities{
|
||||
hasAmdSysfs: gm.hasAmdSysfs(),
|
||||
}
|
||||
if _, err := exec.LookPath(nvidiaSmiCmd); err == nil {
|
||||
gm.nvidiaSmi = true
|
||||
caps.hasNvidiaSmi = true
|
||||
}
|
||||
if _, err := exec.LookPath(rocmSmiCmd); err == nil {
|
||||
if val, _ := GetEnv("AMD_SYSFS"); val == "true" {
|
||||
gm.amdgpu = true
|
||||
} else {
|
||||
gm.rocmSmi = true
|
||||
}
|
||||
} else if gm.hasAmdSysfs() {
|
||||
gm.amdgpu = true
|
||||
caps.hasRocmSmi = true
|
||||
}
|
||||
if _, err := exec.LookPath(tegraStatsCmd); err == nil {
|
||||
gm.tegrastats = true
|
||||
gm.nvidiaSmi = false
|
||||
caps.hasTegrastats = true
|
||||
}
|
||||
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
|
||||
gm.intelGpuStats = true
|
||||
caps.hasIntelGpuTop = true
|
||||
}
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.amdgpu || gm.tegrastats || gm.intelGpuStats || gm.nvml {
|
||||
return nil
|
||||
if _, err := exec.LookPath(nvtopCmd); err == nil {
|
||||
caps.hasNvtop = true
|
||||
}
|
||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or intel_gpu_top")
|
||||
if runtime.GOOS == "darwin" {
|
||||
if _, err := exec.LookPath(macmonCmd); err == nil {
|
||||
caps.hasMacmon = true
|
||||
}
|
||||
if _, err := exec.LookPath(powermetricsCmd); err == nil {
|
||||
caps.hasPowermetrics = true
|
||||
}
|
||||
}
|
||||
return caps
|
||||
}
|
||||
|
||||
// startCollector starts the appropriate GPU data collector based on the command
|
||||
func (gm *GPUManager) startCollector(command string) {
|
||||
collector := gpuCollector{
|
||||
name: command,
|
||||
bufSize: 10 * 1024,
|
||||
}
|
||||
switch command {
|
||||
case intelGpuStatsCmd:
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := gm.collectIntelStats(); err != nil {
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
|
||||
time.Sleep(retryWaitTime)
|
||||
continue
|
||||
func hasAnyGpuCollector(caps gpuCapabilities) bool {
|
||||
return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop || caps.hasMacmon || caps.hasPowermetrics
|
||||
}
|
||||
|
||||
func (gm *GPUManager) startIntelCollector() {
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := gm.collectIntelStats(); err != nil {
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
|
||||
time.Sleep(retryWaitTime)
|
||||
continue
|
||||
}
|
||||
}()
|
||||
case nvidiaSmiCmd:
|
||||
collector.cmdArgs = []string{
|
||||
"-l", nvidiaSmiInterval,
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (gm *GPUManager) startNvidiaSmiCollector(intervalSeconds string) {
|
||||
collector := gpuCollector{
|
||||
name: nvidiaSmiCmd,
|
||||
bufSize: 10 * 1024,
|
||||
cmdArgs: []string{
|
||||
"-l", intervalSeconds,
|
||||
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
collector.parse = gm.parseNvidiaData
|
||||
go collector.start()
|
||||
case tegraStatsCmd:
|
||||
collector.cmdArgs = []string{"--interval", tegraStatsInterval}
|
||||
collector.parse = gm.getJetsonParser()
|
||||
go collector.start()
|
||||
case amdgpuCmd:
|
||||
go func() {
|
||||
if err := gm.collectAmdStats(); err != nil {
|
||||
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
|
||||
}
|
||||
}()
|
||||
case rocmSmiCmd:
|
||||
collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
|
||||
collector.parse = gm.parseAmdData
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := collector.collect(); err != nil {
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
|
||||
}
|
||||
time.Sleep(rocmSmiInterval)
|
||||
}
|
||||
}()
|
||||
},
|
||||
parse: gm.parseNvidiaData,
|
||||
}
|
||||
go collector.start()
|
||||
}
|
||||
|
||||
func (gm *GPUManager) startTegraStatsCollector(intervalMilliseconds string) {
|
||||
collector := gpuCollector{
|
||||
name: tegraStatsCmd,
|
||||
bufSize: 10 * 1024,
|
||||
cmdArgs: []string{"--interval", intervalMilliseconds},
|
||||
parse: gm.getJetsonParser(),
|
||||
}
|
||||
go collector.start()
|
||||
}
|
||||
|
||||
func (gm *GPUManager) startRocmSmiCollector(pollInterval time.Duration) {
|
||||
collector := gpuCollector{
|
||||
name: rocmSmiCmd,
|
||||
bufSize: 10 * 1024,
|
||||
cmdArgs: []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"},
|
||||
parse: gm.parseAmdData,
|
||||
}
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := collector.collect(); err != nil {
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
|
||||
}
|
||||
time.Sleep(pollInterval)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSource]collectorDefinition {
|
||||
return map[collectorSource]collectorDefinition{
|
||||
collectorSourceNVML: {
|
||||
group: collectorGroupNvidia,
|
||||
available: caps.hasNvidiaSmi,
|
||||
start: func(_ func()) bool {
|
||||
return gm.startNvmlCollector()
|
||||
},
|
||||
},
|
||||
collectorSourceNvidiaSMI: {
|
||||
group: collectorGroupNvidia,
|
||||
available: caps.hasNvidiaSmi,
|
||||
start: func(_ func()) bool {
|
||||
gm.startNvidiaSmiCollector("4") // seconds
|
||||
return true
|
||||
},
|
||||
},
|
||||
collectorSourceIntelGpuTop: {
|
||||
group: collectorGroupIntel,
|
||||
available: caps.hasIntelGpuTop,
|
||||
start: func(_ func()) bool {
|
||||
gm.startIntelCollector()
|
||||
return true
|
||||
},
|
||||
},
|
||||
collectorSourceAmdSysfs: {
|
||||
group: collectorGroupAmd,
|
||||
available: caps.hasAmdSysfs,
|
||||
start: func(_ func()) bool {
|
||||
return gm.startAmdSysfsCollector()
|
||||
},
|
||||
},
|
||||
collectorSourceRocmSMI: {
|
||||
group: collectorGroupAmd,
|
||||
available: caps.hasRocmSmi,
|
||||
deprecationWarning: "rocm-smi is deprecated and may be removed in a future release",
|
||||
start: func(_ func()) bool {
|
||||
gm.startRocmSmiCollector(4300 * time.Millisecond)
|
||||
return true
|
||||
},
|
||||
},
|
||||
collectorSourceNVTop: {
|
||||
available: caps.hasNvtop,
|
||||
start: func(onFailure func()) bool {
|
||||
gm.startNvtopCollector("30", onFailure) // tens of milliseconds
|
||||
return true
|
||||
},
|
||||
},
|
||||
collectorSourceMacmon: {
|
||||
group: collectorGroupApple,
|
||||
available: caps.hasMacmon,
|
||||
start: func(_ func()) bool {
|
||||
gm.startMacmonCollector()
|
||||
return true
|
||||
},
|
||||
},
|
||||
collectorSourcePowermetrics: {
|
||||
group: collectorGroupApple,
|
||||
available: caps.hasPowermetrics,
|
||||
start: func(_ func()) bool {
|
||||
gm.startPowermetricsCollector()
|
||||
return true
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// parseCollectorPriority parses GPU_COLLECTOR and returns valid ordered entries.
|
||||
func parseCollectorPriority(value string) []collectorSource {
|
||||
parts := strings.Split(value, ",")
|
||||
priorities := make([]collectorSource, 0, len(parts))
|
||||
for _, raw := range parts {
|
||||
name := collectorSource(strings.TrimSpace(strings.ToLower(raw)))
|
||||
if !isValidCollectorSource(name) {
|
||||
if name != "" {
|
||||
slog.Warn("Ignoring unknown GPU collector", "collector", name)
|
||||
}
|
||||
continue
|
||||
}
|
||||
priorities = append(priorities, name)
|
||||
}
|
||||
return priorities
|
||||
}
|
||||
|
||||
// startNvmlCollector initializes NVML and starts its polling loop.
|
||||
func (gm *GPUManager) startNvmlCollector() bool {
|
||||
collector := &nvmlCollector{gm: gm}
|
||||
if err := collector.init(); err != nil {
|
||||
slog.Warn("Failed to initialize NVML", "err", err)
|
||||
return false
|
||||
}
|
||||
go collector.start()
|
||||
return true
|
||||
}
|
||||
|
||||
// startAmdSysfsCollector starts AMD GPU collection via sysfs.
|
||||
func (gm *GPUManager) startAmdSysfsCollector() bool {
|
||||
go func() {
|
||||
if err := gm.collectAmdStats(); err != nil {
|
||||
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
|
||||
}
|
||||
}()
|
||||
return true
|
||||
}
|
||||
|
||||
// startCollectorsByPriority starts collectors in order with one source per vendor group.
|
||||
func (gm *GPUManager) startCollectorsByPriority(priorities []collectorSource, caps gpuCapabilities) int {
|
||||
definitions := gm.collectorDefinitions(caps)
|
||||
selectedGroups := make(map[string]bool, 3)
|
||||
started := 0
|
||||
for i, source := range priorities {
|
||||
definition, ok := definitions[source]
|
||||
if !ok || !definition.available {
|
||||
continue
|
||||
}
|
||||
// nvtop is not a vendor-specific collector, so should only be used if no other collectors are selected or it is first in GPU_COLLECTOR.
|
||||
if source == collectorSourceNVTop {
|
||||
if len(selectedGroups) > 0 {
|
||||
slog.Warn("Skipping nvtop because other collectors are selected")
|
||||
continue
|
||||
}
|
||||
// if nvtop fails, fall back to remaining collectors.
|
||||
remaining := append([]collectorSource(nil), priorities[i+1:]...)
|
||||
if definition.start(func() {
|
||||
gm.startCollectorsByPriority(remaining, caps)
|
||||
}) {
|
||||
started++
|
||||
return started
|
||||
}
|
||||
}
|
||||
group := definition.group
|
||||
if group == "" || selectedGroups[group] {
|
||||
continue
|
||||
}
|
||||
if definition.deprecationWarning != "" {
|
||||
slog.Warn(definition.deprecationWarning)
|
||||
}
|
||||
if definition.start(nil) {
|
||||
selectedGroups[group] = true
|
||||
started++
|
||||
}
|
||||
}
|
||||
return started
|
||||
}
|
||||
|
||||
// resolveLegacyCollectorPriority builds the default collector order when GPU_COLLECTOR is unset.
|
||||
func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []collectorSource {
|
||||
priorities := make([]collectorSource, 0, 4)
|
||||
|
||||
if caps.hasNvidiaSmi && !caps.hasTegrastats {
|
||||
if nvml, _ := utils.GetEnv("NVML"); nvml == "true" {
|
||||
priorities = append(priorities, collectorSourceNVML, collectorSourceNvidiaSMI)
|
||||
} else {
|
||||
priorities = append(priorities, collectorSourceNvidiaSMI)
|
||||
}
|
||||
}
|
||||
|
||||
if caps.hasRocmSmi {
|
||||
if val, _ := utils.GetEnv("AMD_SYSFS"); val == "true" {
|
||||
priorities = append(priorities, collectorSourceAmdSysfs)
|
||||
} else {
|
||||
priorities = append(priorities, collectorSourceRocmSMI)
|
||||
}
|
||||
} else if caps.hasAmdSysfs {
|
||||
priorities = append(priorities, collectorSourceAmdSysfs)
|
||||
}
|
||||
|
||||
if caps.hasIntelGpuTop {
|
||||
priorities = append(priorities, collectorSourceIntelGpuTop)
|
||||
}
|
||||
|
||||
// Apple collectors are currently opt-in only for testing.
|
||||
// Enable them with GPU_COLLECTOR=macmon or GPU_COLLECTOR=powermetrics.
|
||||
// TODO: uncomment below when Apple collectors are confirmed to be working.
|
||||
//
|
||||
// Prefer macmon on macOS (no sudo). Fall back to powermetrics if present.
|
||||
// if caps.hasMacmon {
|
||||
// priorities = append(priorities, collectorSourceMacmon)
|
||||
// } else if caps.hasPowermetrics {
|
||||
// priorities = append(priorities, collectorSourcePowermetrics)
|
||||
// }
|
||||
|
||||
// Keep nvtop as a last resort only when no vendor collector exists.
|
||||
if len(priorities) == 0 && caps.hasNvtop {
|
||||
priorities = append(priorities, collectorSourceNVTop)
|
||||
}
|
||||
return priorities
|
||||
}
|
||||
|
||||
// NewGPUManager creates and initializes a new GPUManager
|
||||
func NewGPUManager() (*GPUManager, error) {
|
||||
if skipGPU, _ := GetEnv("SKIP_GPU"); skipGPU == "true" {
|
||||
if skipGPU, _ := utils.GetEnv("SKIP_GPU"); skipGPU == "true" {
|
||||
return nil, nil
|
||||
}
|
||||
var gm GPUManager
|
||||
if err := gm.detectGPUs(); err != nil {
|
||||
return nil, err
|
||||
caps := gm.discoverGpuCapabilities()
|
||||
if !hasAnyGpuCollector(caps) {
|
||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
gm.GpuDataMap = make(map[string]*system.GPUData)
|
||||
|
||||
if gm.nvidiaSmi {
|
||||
if nvml, _ := GetEnv("NVML"); nvml == "true" {
|
||||
gm.nvml = true
|
||||
gm.nvidiaSmi = false
|
||||
collector := &nvmlCollector{gm: &gm}
|
||||
if err := collector.init(); err == nil {
|
||||
go collector.start()
|
||||
} else {
|
||||
slog.Warn("Failed to initialize NVML, falling back to nvidia-smi", "err", err)
|
||||
gm.nvidiaSmi = true
|
||||
gm.startCollector(nvidiaSmiCmd)
|
||||
}
|
||||
} else {
|
||||
gm.startCollector(nvidiaSmiCmd)
|
||||
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
|
||||
if caps.hasTegrastats {
|
||||
gm.startTegraStatsCollector("3700")
|
||||
return &gm, nil
|
||||
}
|
||||
|
||||
// if GPU_COLLECTOR is set, start user-defined collectors.
|
||||
if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
|
||||
priorities := parseCollectorPriority(collectorConfig)
|
||||
if gm.startCollectorsByPriority(priorities, caps) == 0 {
|
||||
return nil, fmt.Errorf("no configured GPU collectors are available")
|
||||
}
|
||||
return &gm, nil
|
||||
}
|
||||
if gm.rocmSmi {
|
||||
gm.startCollector(rocmSmiCmd)
|
||||
}
|
||||
if gm.amdgpu {
|
||||
gm.startCollector(amdgpuCmd)
|
||||
}
|
||||
if gm.tegrastats {
|
||||
gm.startCollector(tegraStatsCmd)
|
||||
}
|
||||
if gm.intelGpuStats {
|
||||
gm.startCollector(intelGpuStatsCmd)
|
||||
|
||||
// auto-detect and start collectors when GPU_COLLECTOR is unset.
|
||||
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
|
||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
|
||||
return &gm, nil
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
@@ -12,9 +13,19 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
)
|
||||
|
||||
var amdgpuNameCache = struct {
|
||||
sync.RWMutex
|
||||
hits map[string]string
|
||||
misses map[string]struct{}
|
||||
}{
|
||||
hits: make(map[string]string),
|
||||
misses: make(map[string]struct{}),
|
||||
}
|
||||
|
||||
// hasAmdSysfs returns true if any AMD GPU sysfs nodes are found
|
||||
func (gm *GPUManager) hasAmdSysfs() bool {
|
||||
cards, err := filepath.Glob("/sys/class/drm/card*/device/vendor")
|
||||
@@ -22,8 +33,8 @@ func (gm *GPUManager) hasAmdSysfs() bool {
|
||||
return false
|
||||
}
|
||||
for _, vendorPath := range cards {
|
||||
vendor, err := os.ReadFile(vendorPath)
|
||||
if err == nil && strings.TrimSpace(string(vendor)) == "0x1002" {
|
||||
vendor, err := utils.ReadStringFileLimited(vendorPath, 64)
|
||||
if err == nil && vendor == "0x1002" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
@@ -32,6 +43,7 @@ func (gm *GPUManager) hasAmdSysfs() bool {
|
||||
|
||||
// collectAmdStats collects AMD GPU metrics directly from sysfs to avoid the overhead of rocm-smi
|
||||
func (gm *GPUManager) collectAmdStats() error {
|
||||
sysfsPollInterval := 3000 * time.Millisecond
|
||||
cards, err := filepath.Glob("/sys/class/drm/card*")
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -70,17 +82,17 @@ func (gm *GPUManager) collectAmdStats() error {
|
||||
continue
|
||||
}
|
||||
failures = 0
|
||||
time.Sleep(rocmSmiInterval)
|
||||
time.Sleep(sysfsPollInterval)
|
||||
}
|
||||
}
|
||||
|
||||
// isAmdGpu checks whether a DRM card path belongs to AMD vendor ID 0x1002.
|
||||
func isAmdGpu(cardPath string) bool {
|
||||
vendorPath := filepath.Join(cardPath, "device/vendor")
|
||||
vendor, err := os.ReadFile(vendorPath)
|
||||
vendor, err := utils.ReadStringFileLimited(filepath.Join(cardPath, "device/vendor"), 64)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.TrimSpace(string(vendor)) == "0x1002"
|
||||
return vendor == "0x1002"
|
||||
}
|
||||
|
||||
// updateAmdGpuData reads GPU metrics from sysfs and updates the GPU data map.
|
||||
@@ -93,6 +105,13 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
|
||||
usage, usageErr := readSysfsFloat(filepath.Join(devicePath, "gpu_busy_percent"))
|
||||
memUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
|
||||
memTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
|
||||
// if gtt is present, add it to the memory used and total (https://github.com/henrygd/beszel/issues/1569#issuecomment-3837640484)
|
||||
if gttUsed, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_used")); err == nil && gttUsed > 0 {
|
||||
if gttTotal, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_total")); err == nil {
|
||||
memUsed += gttUsed
|
||||
memTotal += gttTotal
|
||||
}
|
||||
}
|
||||
|
||||
var temp, power float64
|
||||
hwmons, _ := filepath.Glob(filepath.Join(devicePath, "hwmon/hwmon*"))
|
||||
@@ -125,20 +144,128 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
|
||||
if usageErr == nil {
|
||||
gpu.Usage += usage
|
||||
}
|
||||
gpu.MemoryUsed = bytesToMegabytes(memUsed)
|
||||
gpu.MemoryTotal = bytesToMegabytes(memTotal)
|
||||
gpu.MemoryUsed = utils.BytesToMegabytes(memUsed)
|
||||
gpu.MemoryTotal = utils.BytesToMegabytes(memTotal)
|
||||
gpu.Temperature = temp
|
||||
gpu.Power += power
|
||||
gpu.Count++
|
||||
return true
|
||||
}
|
||||
|
||||
// readSysfsFloat reads and parses a numeric value from a sysfs file.
|
||||
func readSysfsFloat(path string) (float64, error) {
|
||||
val, err := os.ReadFile(path)
|
||||
val, err := utils.ReadStringFileLimited(path, 64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return strconv.ParseFloat(strings.TrimSpace(string(val)), 64)
|
||||
return strconv.ParseFloat(val, 64)
|
||||
}
|
||||
|
||||
// normalizeHexID normalizes hex IDs by trimming spaces, lowercasing, and dropping 0x.
|
||||
func normalizeHexID(id string) string {
|
||||
return strings.TrimPrefix(strings.ToLower(strings.TrimSpace(id)), "0x")
|
||||
}
|
||||
|
||||
// cacheKeyForAmdgpu builds the cache key for a device and optional revision.
|
||||
func cacheKeyForAmdgpu(deviceID, revisionID string) string {
|
||||
if revisionID != "" {
|
||||
return deviceID + ":" + revisionID
|
||||
}
|
||||
return deviceID
|
||||
}
|
||||
|
||||
// lookupAmdgpuNameInFile resolves an AMDGPU name from amdgpu.ids by device/revision.
|
||||
func lookupAmdgpuNameInFile(deviceID, revisionID, filePath string) (name string, exact bool, found bool) {
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return "", false, false
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var byDevice string
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, ",", 3)
|
||||
if len(parts) != 3 {
|
||||
continue
|
||||
}
|
||||
|
||||
dev := normalizeHexID(parts[0])
|
||||
rev := normalizeHexID(parts[1])
|
||||
productName := strings.TrimSpace(parts[2])
|
||||
if dev == "" || productName == "" || dev != deviceID {
|
||||
continue
|
||||
}
|
||||
if byDevice == "" {
|
||||
byDevice = productName
|
||||
}
|
||||
if revisionID != "" && rev == revisionID {
|
||||
return productName, true, true
|
||||
}
|
||||
}
|
||||
if byDevice != "" {
|
||||
return byDevice, false, true
|
||||
}
|
||||
return "", false, false
|
||||
}
|
||||
|
||||
// getCachedAmdgpuName returns cached hit/miss status for the given device/revision.
|
||||
func getCachedAmdgpuName(deviceID, revisionID string) (name string, found bool, done bool) {
|
||||
// Build the list of cache keys to check. We always look up the exact device+revision key.
|
||||
// When revisionID is set, we also look up deviceID alone, since the cache may store a
|
||||
// device-only fallback when we couldn't resolve the exact revision.
|
||||
keys := []string{cacheKeyForAmdgpu(deviceID, revisionID)}
|
||||
if revisionID != "" {
|
||||
keys = append(keys, deviceID)
|
||||
}
|
||||
|
||||
knownMisses := 0
|
||||
amdgpuNameCache.RLock()
|
||||
defer amdgpuNameCache.RUnlock()
|
||||
for _, key := range keys {
|
||||
if name, ok := amdgpuNameCache.hits[key]; ok {
|
||||
return name, true, true
|
||||
}
|
||||
if _, ok := amdgpuNameCache.misses[key]; ok {
|
||||
knownMisses++
|
||||
}
|
||||
}
|
||||
// done=true means "don't bother doing slow lookup": we either found a name (above) or
|
||||
// every key we checked was already a known miss, so we've tried before and failed.
|
||||
return "", false, knownMisses == len(keys)
|
||||
}
|
||||
|
||||
// normalizeAmdgpuName trims standard suffixes from AMDGPU product names.
|
||||
func normalizeAmdgpuName(name string) string {
|
||||
for _, suffix := range []string{" Graphics", " Series"} {
|
||||
name = strings.TrimSuffix(name, suffix)
|
||||
}
|
||||
return name
|
||||
}
|
||||
|
||||
// cacheAmdgpuName stores a resolved AMDGPU name in the lookup cache.
|
||||
func cacheAmdgpuName(deviceID, revisionID, name string, exact bool) {
|
||||
name = normalizeAmdgpuName(name)
|
||||
amdgpuNameCache.Lock()
|
||||
defer amdgpuNameCache.Unlock()
|
||||
if exact && revisionID != "" {
|
||||
amdgpuNameCache.hits[cacheKeyForAmdgpu(deviceID, revisionID)] = name
|
||||
}
|
||||
amdgpuNameCache.hits[deviceID] = name
|
||||
}
|
||||
|
||||
// cacheMissingAmdgpuName records unresolved device/revision lookups.
|
||||
func cacheMissingAmdgpuName(deviceID, revisionID string) {
|
||||
amdgpuNameCache.Lock()
|
||||
defer amdgpuNameCache.Unlock()
|
||||
amdgpuNameCache.misses[deviceID] = struct{}{}
|
||||
if revisionID != "" {
|
||||
amdgpuNameCache.misses[cacheKeyForAmdgpu(deviceID, revisionID)] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
// getAmdGpuName attempts to get a descriptive GPU name.
|
||||
@@ -146,39 +273,30 @@ func readSysfsFloat(path string) (float64, error) {
|
||||
// Falls back to showing the raw device ID if not found in the lookup table.
|
||||
func getAmdGpuName(devicePath string) string {
|
||||
// Try product_name first (works for some enterprise GPUs)
|
||||
if prod, err := os.ReadFile(filepath.Join(devicePath, "product_name")); err == nil {
|
||||
return strings.TrimSpace(string(prod))
|
||||
if prod, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "product_name"), 128); err == nil {
|
||||
return prod
|
||||
}
|
||||
|
||||
// Read PCI device ID and look it up
|
||||
if deviceID, err := os.ReadFile(filepath.Join(devicePath, "device")); err == nil {
|
||||
id := strings.TrimPrefix(strings.ToLower(strings.TrimSpace(string(deviceID))), "0x")
|
||||
if name, ok := getRadeonNames()[id]; ok {
|
||||
return fmt.Sprintf("Radeon %s", name)
|
||||
if deviceID, err := utils.ReadStringFileLimited(filepath.Join(devicePath, "device"), 64); err == nil {
|
||||
id := normalizeHexID(deviceID)
|
||||
revision := ""
|
||||
if rev, revErr := utils.ReadStringFileLimited(filepath.Join(devicePath, "revision"), 64); revErr == nil {
|
||||
revision = normalizeHexID(rev)
|
||||
}
|
||||
|
||||
if name, found, done := getCachedAmdgpuName(id, revision); found {
|
||||
return name
|
||||
} else if !done {
|
||||
if name, exact, ok := lookupAmdgpuNameInFile(id, revision, "/usr/share/libdrm/amdgpu.ids"); ok {
|
||||
cacheAmdgpuName(id, revision, name, exact)
|
||||
return normalizeAmdgpuName(name)
|
||||
}
|
||||
cacheMissingAmdgpuName(id, revision)
|
||||
}
|
||||
|
||||
return fmt.Sprintf("AMD GPU (%s)", id)
|
||||
}
|
||||
|
||||
return "AMD GPU"
|
||||
}
|
||||
|
||||
// getRadeonNames returns the AMD GPU name lookup table
|
||||
// Device IDs from https://pci-ids.ucw.cz/read/PC/1002
|
||||
var getRadeonNames = sync.OnceValue(func() map[string]string {
|
||||
return map[string]string{
|
||||
"7550": "RX 9070",
|
||||
"7590": "RX 9060 XT",
|
||||
"7551": "AI PRO R9700",
|
||||
|
||||
"744c": "RX 7900",
|
||||
|
||||
"1681": "680M",
|
||||
|
||||
"7448": "PRO W7900",
|
||||
"745e": "PRO W7800",
|
||||
"7470": "PRO W7700",
|
||||
"73e3": "PRO W6600",
|
||||
"7422": "PRO W6400",
|
||||
"7341": "PRO W5500",
|
||||
}
|
||||
})
|
||||
|
||||
265
agent/gpu_amd_linux_test.go
Normal file
265
agent/gpu_amd_linux_test.go
Normal file
@@ -0,0 +1,265 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestNormalizeHexID(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{"0x1002", "1002"},
|
||||
{"C2", "c2"},
|
||||
{" 15BF ", "15bf"},
|
||||
{"0x15bf", "15bf"},
|
||||
{"", ""},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
subName := tt.in
|
||||
if subName == "" {
|
||||
subName = "empty_string"
|
||||
}
|
||||
t.Run(subName, func(t *testing.T) {
|
||||
got := normalizeHexID(tt.in)
|
||||
assert.Equal(t, tt.want, got)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCacheKeyForAmdgpu(t *testing.T) {
|
||||
tests := []struct {
|
||||
deviceID string
|
||||
revisionID string
|
||||
want string
|
||||
}{
|
||||
{"1114", "c2", "1114:c2"},
|
||||
{"15bf", "", "15bf"},
|
||||
{"1506", "c1", "1506:c1"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := cacheKeyForAmdgpu(tt.deviceID, tt.revisionID)
|
||||
assert.Equal(t, tt.want, got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadSysfsFloat(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
validPath := filepath.Join(dir, "val")
|
||||
require.NoError(t, os.WriteFile(validPath, []byte(" 42.5 \n"), 0o644))
|
||||
got, err := readSysfsFloat(validPath)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 42.5, got)
|
||||
|
||||
// Integer and scientific
|
||||
sciPath := filepath.Join(dir, "sci")
|
||||
require.NoError(t, os.WriteFile(sciPath, []byte("1e2"), 0o644))
|
||||
got, err = readSysfsFloat(sciPath)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 100.0, got)
|
||||
|
||||
// Missing file
|
||||
_, err = readSysfsFloat(filepath.Join(dir, "missing"))
|
||||
require.Error(t, err)
|
||||
|
||||
// Invalid content
|
||||
badPath := filepath.Join(dir, "bad")
|
||||
require.NoError(t, os.WriteFile(badPath, []byte("not a number"), 0o644))
|
||||
_, err = readSysfsFloat(badPath)
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestIsAmdGpu(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
deviceDir := filepath.Join(dir, "device")
|
||||
require.NoError(t, os.MkdirAll(deviceDir, 0o755))
|
||||
|
||||
// AMD vendor 0x1002 -> true
|
||||
require.NoError(t, os.WriteFile(filepath.Join(deviceDir, "vendor"), []byte("0x1002\n"), 0o644))
|
||||
assert.True(t, isAmdGpu(dir), "vendor 0x1002 should be AMD")
|
||||
|
||||
// Non-AMD vendor -> false
|
||||
require.NoError(t, os.WriteFile(filepath.Join(deviceDir, "vendor"), []byte("0x10de\n"), 0o644))
|
||||
assert.False(t, isAmdGpu(dir), "vendor 0x10de should not be AMD")
|
||||
|
||||
// Missing vendor file -> false
|
||||
require.NoError(t, os.Remove(filepath.Join(deviceDir, "vendor")))
|
||||
assert.False(t, isAmdGpu(dir), "missing vendor file should be false")
|
||||
}
|
||||
|
||||
func TestAmdgpuNameCacheRoundTrip(t *testing.T) {
|
||||
// Cache a name and retrieve it (unique key to avoid affecting other tests)
|
||||
deviceID, revisionID := "cachedev99", "00"
|
||||
cacheAmdgpuName(deviceID, revisionID, "AMD Test GPU 99 Graphics", true)
|
||||
|
||||
name, found, done := getCachedAmdgpuName(deviceID, revisionID)
|
||||
assert.True(t, found)
|
||||
assert.True(t, done)
|
||||
assert.Equal(t, "AMD Test GPU 99", name)
|
||||
|
||||
// Device-only key also stored
|
||||
name2, found2, _ := getCachedAmdgpuName(deviceID, "")
|
||||
assert.True(t, found2)
|
||||
assert.Equal(t, "AMD Test GPU 99", name2)
|
||||
|
||||
// Cache a miss
|
||||
cacheMissingAmdgpuName("missedev99", "ab")
|
||||
_, found3, done3 := getCachedAmdgpuName("missedev99", "ab")
|
||||
assert.False(t, found3)
|
||||
assert.True(t, done3, "done should be true so caller skips file lookup")
|
||||
}
|
||||
|
||||
func TestUpdateAmdGpuDataWithFakeSysfs(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
writeGTT bool
|
||||
wantMemoryUsed float64
|
||||
wantMemoryTotal float64
|
||||
}{
|
||||
{
|
||||
name: "sums vram and gtt when gtt is present",
|
||||
writeGTT: true,
|
||||
wantMemoryUsed: utils.BytesToMegabytes(1073741824 + 536870912),
|
||||
wantMemoryTotal: utils.BytesToMegabytes(2147483648 + 4294967296),
|
||||
},
|
||||
{
|
||||
name: "falls back to vram when gtt is missing",
|
||||
writeGTT: false,
|
||||
wantMemoryUsed: utils.BytesToMegabytes(1073741824),
|
||||
wantMemoryTotal: utils.BytesToMegabytes(2147483648),
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cardPath := filepath.Join(dir, "card0")
|
||||
devicePath := filepath.Join(cardPath, "device")
|
||||
hwmonPath := filepath.Join(devicePath, "hwmon", "hwmon0")
|
||||
require.NoError(t, os.MkdirAll(hwmonPath, 0o755))
|
||||
|
||||
write := func(name, content string) {
|
||||
require.NoError(t, os.WriteFile(filepath.Join(devicePath, name), []byte(content), 0o644))
|
||||
}
|
||||
write("vendor", "0x1002")
|
||||
write("device", "0x1506")
|
||||
write("revision", "0xc1")
|
||||
write("gpu_busy_percent", "25")
|
||||
write("mem_info_vram_used", "1073741824")
|
||||
write("mem_info_vram_total", "2147483648")
|
||||
if tt.writeGTT {
|
||||
write("mem_info_gtt_used", "536870912")
|
||||
write("mem_info_gtt_total", "4294967296")
|
||||
}
|
||||
require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "temp1_input"), []byte("45000"), 0o644))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "power1_input"), []byte("20000000"), 0o644))
|
||||
|
||||
// Pre-cache name so getAmdGpuName returns a known value (it uses system amdgpu.ids path)
|
||||
cacheAmdgpuName("1506", "c1", "AMD Radeon 610M Graphics", true)
|
||||
|
||||
gm := &GPUManager{GpuDataMap: make(map[string]*system.GPUData)}
|
||||
ok := gm.updateAmdGpuData(cardPath)
|
||||
require.True(t, ok)
|
||||
|
||||
gpu, ok := gm.GpuDataMap["card0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "AMD Radeon 610M", gpu.Name)
|
||||
assert.Equal(t, 25.0, gpu.Usage)
|
||||
assert.Equal(t, tt.wantMemoryUsed, gpu.MemoryUsed)
|
||||
assert.Equal(t, tt.wantMemoryTotal, gpu.MemoryTotal)
|
||||
assert.Equal(t, 45.0, gpu.Temperature)
|
||||
assert.Equal(t, 20.0, gpu.Power)
|
||||
assert.Equal(t, 1.0, gpu.Count)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLookupAmdgpuNameInFile(t *testing.T) {
|
||||
idsPath := filepath.Join("test-data", "amdgpu.ids")
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
deviceID string
|
||||
revisionID string
|
||||
wantName string
|
||||
wantExact bool
|
||||
wantFound bool
|
||||
}{
|
||||
{
|
||||
name: "exact device and revision match",
|
||||
deviceID: "1114",
|
||||
revisionID: "c2",
|
||||
wantName: "AMD Radeon 860M Graphics",
|
||||
wantExact: true,
|
||||
wantFound: true,
|
||||
},
|
||||
{
|
||||
name: "exact match 15BF revision 01 returns 760M",
|
||||
deviceID: "15bf",
|
||||
revisionID: "01",
|
||||
wantName: "AMD Radeon 760M Graphics",
|
||||
wantExact: true,
|
||||
wantFound: true,
|
||||
},
|
||||
{
|
||||
name: "exact match 15BF revision 00 returns 780M",
|
||||
deviceID: "15bf",
|
||||
revisionID: "00",
|
||||
wantName: "AMD Radeon 780M Graphics",
|
||||
wantExact: true,
|
||||
wantFound: true,
|
||||
},
|
||||
{
|
||||
name: "device-only match returns first entry for device",
|
||||
deviceID: "1506",
|
||||
revisionID: "",
|
||||
wantName: "AMD Radeon 610M",
|
||||
wantExact: false,
|
||||
wantFound: true,
|
||||
},
|
||||
{
|
||||
name: "unknown device not found",
|
||||
deviceID: "dead",
|
||||
revisionID: "00",
|
||||
wantName: "",
|
||||
wantExact: false,
|
||||
wantFound: false,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
gotName, gotExact, gotFound := lookupAmdgpuNameInFile(tt.deviceID, tt.revisionID, idsPath)
|
||||
assert.Equal(t, tt.wantName, gotName, "name")
|
||||
assert.Equal(t, tt.wantExact, gotExact, "exact")
|
||||
assert.Equal(t, tt.wantFound, gotFound, "found")
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetAmdGpuNameFromIdsFile(t *testing.T) {
|
||||
// Test that getAmdGpuName resolves a name when we can't inject the ids path.
|
||||
// We only verify behavior when product_name is missing and device/revision
|
||||
// would be read from sysfs; the actual lookup uses /usr/share/libdrm/amdgpu.ids.
|
||||
// So this test focuses on normalizeAmdgpuName and that lookupAmdgpuNameInFile
|
||||
// returns the expected name for our test-data file.
|
||||
idsPath := filepath.Join("test-data", "amdgpu.ids")
|
||||
name, exact, found := lookupAmdgpuNameInFile("1435", "ae", idsPath)
|
||||
require.True(t, found)
|
||||
require.True(t, exact)
|
||||
assert.Equal(t, "AMD Custom GPU 0932", name)
|
||||
assert.Equal(t, "AMD Custom GPU 0932", normalizeAmdgpuName(name))
|
||||
|
||||
// " Graphics" suffix is trimmed by normalizeAmdgpuName
|
||||
name2 := "AMD Radeon 860M Graphics"
|
||||
assert.Equal(t, "AMD Radeon 860M", normalizeAmdgpuName(name2))
|
||||
}
|
||||
252
agent/gpu_darwin.go
Normal file
252
agent/gpu_darwin.go
Normal file
@@ -0,0 +1,252 @@
|
||||
//go:build darwin
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
)
|
||||
|
||||
const (
|
||||
// powermetricsSampleIntervalMs is the sampling interval passed to powermetrics (-i).
|
||||
powermetricsSampleIntervalMs = 500
|
||||
// powermetricsPollInterval is how often we run powermetrics to collect a new sample.
|
||||
powermetricsPollInterval = 2 * time.Second
|
||||
// macmonIntervalMs is the sampling interval passed to macmon pipe (-i), in milliseconds.
|
||||
macmonIntervalMs = 2500
|
||||
)
|
||||
|
||||
const appleGPUID = "0"
|
||||
|
||||
// startPowermetricsCollector runs powermetrics --samplers gpu_power in a loop and updates
|
||||
// GPU usage and power. Requires root (sudo) on macOS. A single logical GPU is reported as id "0".
|
||||
func (gm *GPUManager) startPowermetricsCollector() {
|
||||
// Ensure single GPU entry for Apple GPU
|
||||
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||
}
|
||||
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := gm.collectPowermetrics(); err != nil {
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
slog.Warn("powermetrics GPU collector failed repeatedly, stopping", "err", err)
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting macOS GPU data via powermetrics (may require sudo)", "err", err)
|
||||
time.Sleep(retryWaitTime)
|
||||
continue
|
||||
}
|
||||
failures = 0
|
||||
time.Sleep(powermetricsPollInterval)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// collectPowermetrics runs powermetrics once and parses GPU usage and power from its output.
|
||||
func (gm *GPUManager) collectPowermetrics() error {
|
||||
interval := strconv.Itoa(powermetricsSampleIntervalMs)
|
||||
cmd := exec.Command(powermetricsCmd, "--samplers", "gpu_power", "-i", interval, "-n", "1")
|
||||
cmd.Stderr = nil
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !gm.parsePowermetricsData(out) {
|
||||
return errNoValidData
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parsePowermetricsData parses powermetrics gpu_power output and updates GpuDataMap["0"].
|
||||
// Example output:
|
||||
//
|
||||
// **** GPU usage ****
|
||||
// GPU HW active frequency: 444 MHz
|
||||
// GPU HW active residency: 0.97% (444 MHz: .97% ...
|
||||
// GPU idle residency: 99.03%
|
||||
// GPU Power: 4 mW
|
||||
func (gm *GPUManager) parsePowermetricsData(output []byte) bool {
|
||||
var idleResidency, powerMW float64
|
||||
var gotIdle, gotPower bool
|
||||
|
||||
scanner := bufio.NewScanner(bytes.NewReader(output))
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if strings.HasPrefix(line, "GPU idle residency:") {
|
||||
// "GPU idle residency: 99.03%"
|
||||
fields := strings.Fields(strings.TrimPrefix(line, "GPU idle residency:"))
|
||||
if len(fields) >= 1 {
|
||||
pct := strings.TrimSuffix(fields[0], "%")
|
||||
if v, err := strconv.ParseFloat(pct, 64); err == nil {
|
||||
idleResidency = v
|
||||
gotIdle = true
|
||||
}
|
||||
}
|
||||
} else if strings.HasPrefix(line, "GPU Power:") {
|
||||
// "GPU Power: 4 mW"
|
||||
fields := strings.Fields(strings.TrimPrefix(line, "GPU Power:"))
|
||||
if len(fields) >= 1 {
|
||||
if v, err := strconv.ParseFloat(fields[0], 64); err == nil {
|
||||
powerMW = v
|
||||
gotPower = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return false
|
||||
}
|
||||
if !gotIdle && !gotPower {
|
||||
return false
|
||||
}
|
||||
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
|
||||
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||
}
|
||||
gpu := gm.GpuDataMap[appleGPUID]
|
||||
|
||||
if gotIdle {
|
||||
// Usage = 100 - idle residency (e.g. 100 - 99.03 = 0.97%)
|
||||
gpu.Usage += 100 - idleResidency
|
||||
}
|
||||
if gotPower {
|
||||
// mW -> W
|
||||
gpu.Power += powerMW / milliwattsInAWatt
|
||||
}
|
||||
gpu.Count++
|
||||
return true
|
||||
}
|
||||
|
||||
// startMacmonCollector runs `macmon pipe` in a loop and parses one JSON object per line.
|
||||
// This collector does not require sudo. A single logical GPU is reported as id "0".
|
||||
func (gm *GPUManager) startMacmonCollector() {
|
||||
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||
}
|
||||
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := gm.collectMacmonPipe(); err != nil {
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
slog.Warn("macmon GPU collector failed repeatedly, stopping", "err", err)
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting macOS GPU data via macmon", "err", err)
|
||||
time.Sleep(retryWaitTime)
|
||||
continue
|
||||
}
|
||||
failures = 0
|
||||
// `macmon pipe` is long-running; if it returns, wait a bit before restarting.
|
||||
time.Sleep(retryWaitTime)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// macmonTemp holds the temperature section of a macmon JSON sample.
type macmonTemp struct {
	GPUTempAvg float64 `json:"gpu_temp_avg"` // average GPU temperature in degrees Celsius
}

// macmonSample is one line of `macmon pipe` JSON output. Only the GPU-related
// fields consumed by parseMacmonLine are decoded; other keys are ignored.
type macmonSample struct {
	GPUPower    float64    `json:"gpu_power"`     // watts (macmon reports fractional values)
	GPURAMPower float64    `json:"gpu_ram_power"` // watts
	GPUUsage    []float64  `json:"gpu_usage"`     // [freq_mhz, usage] where usage is typically 0..1
	Temp        macmonTemp `json:"temp"`
}
|
||||
|
||||
// collectMacmonPipe launches `macmon pipe` and consumes its stdout line by line
// until the process exits or the pipe fails. Each non-empty line is handed to
// parseMacmonLine. Returns errNoValidData when the stream ends without a single
// parseable GPU sample; otherwise returns any scanner or process exit error.
func (gm *GPUManager) collectMacmonPipe() (err error) {
	cmd := exec.Command(macmonCmd, "pipe", "-i", strconv.Itoa(macmonIntervalMs))
	// Avoid blocking if macmon writes to stderr.
	cmd.Stderr = io.Discard
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return err
	}
	if err := cmd.Start(); err != nil {
		return err
	}

	// Ensure we always reap the child to avoid zombies on any return path and
	// propagate a non-zero exit code if no other error was set.
	defer func() {
		_ = stdout.Close()
		// Kill only if the process has not already exited on its own.
		if cmd.ProcessState == nil || !cmd.ProcessState.Exited() {
			_ = cmd.Process.Kill()
		}
		// Named result: surface Wait's error unless an earlier error took precedence.
		if waitErr := cmd.Wait(); err == nil && waitErr != nil {
			err = waitErr
		}
	}()

	scanner := bufio.NewScanner(stdout)
	var hadSample bool
	for scanner.Scan() {
		line := bytes.TrimSpace(scanner.Bytes())
		if len(line) == 0 {
			continue
		}
		if gm.parseMacmonLine(line) {
			hadSample = true
		}
	}
	if scanErr := scanner.Err(); scanErr != nil {
		return scanErr
	}
	if !hadSample {
		return errNoValidData
	}
	return nil
}
|
||||
|
||||
// parseMacmonLine parses a single macmon JSON line and updates Apple GPU metrics.
|
||||
func (gm *GPUManager) parseMacmonLine(line []byte) bool {
|
||||
var sample macmonSample
|
||||
if err := json.Unmarshal(line, &sample); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
usage := 0.0
|
||||
if len(sample.GPUUsage) >= 2 {
|
||||
usage = sample.GPUUsage[1]
|
||||
// Heuristic: macmon typically reports 0..1; convert to percentage.
|
||||
if usage <= 1.0 {
|
||||
usage *= 100
|
||||
}
|
||||
}
|
||||
|
||||
// Consider the line valid if it contains at least one GPU metric.
|
||||
if usage == 0 && sample.GPUPower == 0 && sample.Temp.GPUTempAvg == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
|
||||
gpu, ok := gm.GpuDataMap[appleGPUID]
|
||||
if !ok {
|
||||
gpu = &system.GPUData{Name: "Apple GPU"}
|
||||
gm.GpuDataMap[appleGPUID] = gpu
|
||||
}
|
||||
gpu.Temperature = sample.Temp.GPUTempAvg
|
||||
gpu.Usage += usage
|
||||
// macmon reports power in watts; include VRAM power if present.
|
||||
gpu.Power += sample.GPUPower + sample.GPURAMPower
|
||||
gpu.Count++
|
||||
return true
|
||||
}
|
||||
81
agent/gpu_darwin_test.go
Normal file
81
agent/gpu_darwin_test.go
Normal file
@@ -0,0 +1,81 @@
|
||||
//go:build darwin
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestParsePowermetricsData verifies that a full powermetrics gpu_power
// transcript yields usage (100 - idle residency), power (mW -> W), and a
// sample count for the single Apple GPU entry keyed "0".
func TestParsePowermetricsData(t *testing.T) {
	// Verbatim powermetrics transcript captured on an Apple Silicon machine.
	input := `
Machine model: Mac14,10
OS version: 25D125

*** Sampled system activity (Sat Feb 14 00:42:06 2026 -0500) (503.05ms elapsed) ***

**** GPU usage ****

GPU HW active frequency: 444 MHz
GPU HW active residency: 0.97% (444 MHz: .97% 612 MHz: 0% 808 MHz: 0% 968 MHz: 0% 1110 MHz: 0% 1236 MHz: 0% 1338 MHz: 0% 1398 MHz: 0%)
GPU SW requested state: (P1 : 100% P2 : 0% P3 : 0% P4 : 0% P5 : 0% P6 : 0% P7 : 0% P8 : 0%)
GPU idle residency: 99.03%
GPU Power: 4 mW
`
	gm := &GPUManager{
		GpuDataMap: make(map[string]*system.GPUData),
	}
	valid := gm.parsePowermetricsData([]byte(input))
	require.True(t, valid)

	g0, ok := gm.GpuDataMap["0"]
	require.True(t, ok)
	assert.Equal(t, "Apple GPU", g0.Name)
	// Usage = 100 - 99.03 = 0.97
	assert.InDelta(t, 0.97, g0.Usage, 0.01)
	// 4 mW -> 0.004 W
	assert.InDelta(t, 0.004, g0.Power, 0.0001)
	assert.Equal(t, 1.0, g0.Count)
}
|
||||
|
||||
// TestParsePowermetricsDataPartial verifies that output containing only a
// power line (no idle residency) is still accepted and recorded.
func TestParsePowermetricsDataPartial(t *testing.T) {
	// Only power line (e.g. older macOS or different sampler output)
	input := `
**** GPU usage ****
GPU Power: 120 mW
`
	gm := &GPUManager{
		GpuDataMap: make(map[string]*system.GPUData),
	}
	valid := gm.parsePowermetricsData([]byte(input))
	require.True(t, valid)

	g0, ok := gm.GpuDataMap["0"]
	require.True(t, ok)
	assert.Equal(t, "Apple GPU", g0.Name)
	// 120 mW -> 0.12 W; Usage is untouched when no idle line is present.
	assert.InDelta(t, 0.12, g0.Power, 0.001)
	assert.Equal(t, 1.0, g0.Count)
}
|
||||
|
||||
// TestParseMacmonLine feeds one real `macmon pipe` JSON line and checks that
// usage is converted from a 0..1 fraction to a percentage, power sums
// gpu_power + gpu_ram_power, and temperature is taken from temp.gpu_temp_avg.
func TestParseMacmonLine(t *testing.T) {
	input := `{"all_power":0.6468324661254883,"ane_power":0.0,"cpu_power":0.6359732151031494,"ecpu_usage":[2061,0.1726151406764984],"gpu_power":0.010859241709113121,"gpu_ram_power":0.000965250947047025,"gpu_usage":[503,0.013633215799927711],"memory":{"ram_total":17179869184,"ram_usage":12322914304,"swap_total":0,"swap_usage":0},"pcpu_usage":[1248,0.11792058497667313],"ram_power":0.14885640144348145,"sys_power":10.4955415725708,"temp":{"cpu_temp_avg":23.041261672973633,"gpu_temp_avg":29.44516944885254},"timestamp":"2026-02-17T19:34:27.942556+00:00"}`

	gm := &GPUManager{
		GpuDataMap: make(map[string]*system.GPUData),
	}
	valid := gm.parseMacmonLine([]byte(input))
	require.True(t, valid)

	g0, ok := gm.GpuDataMap["0"]
	require.True(t, ok)
	assert.Equal(t, "Apple GPU", g0.Name)
	// macmon reports usage fraction 0..1; expect percent conversion.
	assert.InDelta(t, 1.3633, g0.Usage, 0.05)
	// power includes gpu_power + gpu_ram_power
	assert.InDelta(t, 0.011824, g0.Power, 0.0005)
	assert.InDelta(t, 29.445, g0.Temperature, 0.01)
	assert.Equal(t, 1.0, g0.Count)
}
|
||||
9
agent/gpu_darwin_unsupported.go
Normal file
9
agent/gpu_darwin_unsupported.go
Normal file
@@ -0,0 +1,9 @@
|
||||
//go:build !darwin
|
||||
|
||||
package agent
|
||||
|
||||
// startPowermetricsCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
// The stub keeps shared GPUManager code compiling without per-platform checks at the call site.
func (gm *GPUManager) startPowermetricsCollector() {}

// startMacmonCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
func (gm *GPUManager) startMacmonCollector() {}
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
)
|
||||
|
||||
@@ -52,7 +53,7 @@ func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool {
|
||||
func (gm *GPUManager) collectIntelStats() (err error) {
|
||||
// Build command arguments, optionally selecting a device via -d
|
||||
args := []string{"-s", intelGpuStatsInterval, "-l"}
|
||||
if dev, ok := GetEnv("INTEL_GPU_DEVICE"); ok && dev != "" {
|
||||
if dev, ok := utils.GetEnv("INTEL_GPU_DEVICE"); ok && dev != "" {
|
||||
args = append(args, "-d", dev)
|
||||
}
|
||||
cmd := exec.Command(intelGpuStatsCmd, args...)
|
||||
|
||||
@@ -13,21 +13,3 @@ func (c *nvmlCollector) init() error {
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) start() {}
|
||||
|
||||
func (c *nvmlCollector) collect() {}
|
||||
|
||||
func openLibrary(name string) (uintptr, error) {
|
||||
return 0, fmt.Errorf("nvml not supported on this platform")
|
||||
}
|
||||
|
||||
func getNVMLPath() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func hasSymbol(lib uintptr, symbol string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) isGPUActive(bdf string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
160
agent/gpu_nvtop.go
Normal file
160
agent/gpu_nvtop.go
Normal file
@@ -0,0 +1,160 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
)
|
||||
|
||||
// nvtopSnapshot is one device entry from nvtop's JSON snapshot output.
// Metric fields are pointers so that a key absent from the JSON decodes to
// nil and can be distinguished from an empty string.
type nvtopSnapshot struct {
	DeviceName string  `json:"device_name"`
	Temp       *string `json:"temp"`       // e.g. "48C"
	PowerDraw  *string `json:"power_draw"` // e.g. "13W"
	GpuUtil    *string `json:"gpu_util"`   // e.g. "5%"
	MemTotal   *string `json:"mem_total"`  // bytes, as a numeric string
	MemUsed    *string `json:"mem_used"`   // bytes, as a numeric string
}
|
||||
|
||||
// parseNvtopNumber parses nvtop numeric strings with units (C/W/%).
// Unparseable input yields 0 (best-effort, errors are ignored).
func parseNvtopNumber(raw string) float64 {
	s := strings.TrimSpace(raw)
	for _, unit := range [...]string{"C", "W", "%"} {
		s = strings.TrimSuffix(s, unit)
	}
	v, _ := strconv.ParseFloat(s, 64)
	return v
}
|
||||
|
||||
// parseNvtopData parses a single nvtop JSON snapshot payload.
|
||||
func (gm *GPUManager) parseNvtopData(output []byte) bool {
|
||||
var snapshots []nvtopSnapshot
|
||||
if err := json.Unmarshal(output, &snapshots); err != nil || len(snapshots) == 0 {
|
||||
return false
|
||||
}
|
||||
return gm.updateNvtopSnapshots(snapshots)
|
||||
}
|
||||
|
||||
// updateNvtopSnapshots applies one decoded nvtop snapshot batch to GPU accumulators.
// Devices are keyed "n<index>"; when nvtop reorders devices between snapshots,
// the entry whose stored name matches the sample's device name is reused so
// accumulated metrics stay attached to the right physical GPU.
// Returns true if at least one snapshot with a device name was applied.
func (gm *GPUManager) updateNvtopSnapshots(snapshots []nvtopSnapshot) bool {
	gm.Lock()
	defer gm.Unlock()

	valid := false
	// Tracks map keys already claimed in this batch so two samples never merge.
	usedIDs := make(map[string]struct{}, len(snapshots))
	for i, sample := range snapshots {
		if sample.DeviceName == "" {
			continue
		}
		indexID := "n" + strconv.Itoa(i)
		id := indexID

		// nvtop ordering can change, so prefer reusing an existing slot with matching device name.
		if existingByIndex, ok := gm.GpuDataMap[indexID]; ok && existingByIndex.Name != "" && existingByIndex.Name != sample.DeviceName {
			for existingID, gpu := range gm.GpuDataMap {
				// Only nvtop-owned keys (prefix "n") are candidates for reuse.
				if !strings.HasPrefix(existingID, "n") {
					continue
				}
				if _, taken := usedIDs[existingID]; taken {
					continue
				}
				if gpu.Name == sample.DeviceName {
					id = existingID
					break
				}
			}
		}

		if _, ok := gm.GpuDataMap[id]; !ok {
			gm.GpuDataMap[id] = &system.GPUData{Name: sample.DeviceName}
		}
		gpu := gm.GpuDataMap[id]
		gpu.Name = sample.DeviceName

		// Temperature and memory are point-in-time values; usage and power are
		// accumulated and averaged elsewhere via Count.
		if sample.Temp != nil {
			gpu.Temperature = parseNvtopNumber(*sample.Temp)
		}
		if sample.MemUsed != nil {
			gpu.MemoryUsed = utils.BytesToMegabytes(parseNvtopNumber(*sample.MemUsed))
		}
		if sample.MemTotal != nil {
			gpu.MemoryTotal = utils.BytesToMegabytes(parseNvtopNumber(*sample.MemTotal))
		}
		if sample.GpuUtil != nil {
			gpu.Usage += parseNvtopNumber(*sample.GpuUtil)
		}
		if sample.PowerDraw != nil {
			gpu.Power += parseNvtopNumber(*sample.PowerDraw)
		}
		gpu.Count++
		usedIDs[id] = struct{}{}
		valid = true
	}
	return valid
}
|
||||
|
||||
// collectNvtopStats runs nvtop loop mode and continuously decodes JSON snapshots.
// The call blocks for the lifetime of the nvtop process; it returns nil only
// when nvtop exits cleanly (EOF) after producing at least one valid snapshot,
// errNoValidData on EOF without data, or the decode error otherwise.
func (gm *GPUManager) collectNvtopStats(interval string) error {
	cmd := exec.Command(nvtopCmd, "-lP", "-d", interval)
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return err
	}
	if err := cmd.Start(); err != nil {
		return err
	}
	// Always reap the child on any return path to avoid zombies.
	defer func() {
		_ = stdout.Close()
		if cmd.ProcessState == nil || !cmd.ProcessState.Exited() {
			_ = cmd.Process.Kill()
		}
		_ = cmd.Wait()
	}()

	// nvtop emits one JSON array per sampling tick; decode them as a stream.
	decoder := json.NewDecoder(stdout)
	foundValid := false
	for {
		var snapshots []nvtopSnapshot
		if err := decoder.Decode(&snapshots); err != nil {
			if err == io.EOF {
				if foundValid {
					return nil
				}
				return errNoValidData
			}
			return err
		}
		if gm.updateNvtopSnapshots(snapshots) {
			foundValid = true
		}
	}
}
|
||||
|
||||
// startNvtopCollector starts nvtop collection with retry or fallback callback handling.
|
||||
func (gm *GPUManager) startNvtopCollector(interval string, onFailure func()) {
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := gm.collectNvtopStats(interval); err != nil {
|
||||
if onFailure != nil {
|
||||
slog.Warn("Error collecting GPU data via nvtop", "err", err)
|
||||
onFailure()
|
||||
return
|
||||
}
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting GPU data via nvtop", "err", err)
|
||||
time.Sleep(retryWaitTime)
|
||||
continue
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
@@ -11,6 +10,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
@@ -250,6 +250,100 @@ func TestParseAmdData(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNvtopData(t *testing.T) {
|
||||
input, err := os.ReadFile("test-data/nvtop.json")
|
||||
require.NoError(t, err)
|
||||
|
||||
gm := &GPUManager{
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
valid := gm.parseNvtopData(input)
|
||||
require.True(t, valid)
|
||||
|
||||
g0, ok := gm.GpuDataMap["n0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", g0.Name)
|
||||
assert.Equal(t, 48.0, g0.Temperature)
|
||||
assert.Equal(t, 5.0, g0.Usage)
|
||||
assert.Equal(t, 13.0, g0.Power)
|
||||
assert.Equal(t, utils.BytesToMegabytes(349372416), g0.MemoryUsed)
|
||||
assert.Equal(t, utils.BytesToMegabytes(4294967296), g0.MemoryTotal)
|
||||
assert.Equal(t, 1.0, g0.Count)
|
||||
|
||||
g1, ok := gm.GpuDataMap["n1"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "AMD Radeon 680M", g1.Name)
|
||||
assert.Equal(t, 48.0, g1.Temperature)
|
||||
assert.Equal(t, 12.0, g1.Usage)
|
||||
assert.Equal(t, 9.0, g1.Power)
|
||||
assert.Equal(t, utils.BytesToMegabytes(1213784064), g1.MemoryUsed)
|
||||
assert.Equal(t, utils.BytesToMegabytes(16929173504), g1.MemoryTotal)
|
||||
assert.Equal(t, 1.0, g1.Count)
|
||||
}
|
||||
|
||||
func TestUpdateNvtopSnapshotsKeepsDeviceAssociationWhenOrderChanges(t *testing.T) {
|
||||
strPtr := func(s string) *string { return &s }
|
||||
|
||||
gm := &GPUManager{
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
|
||||
firstBatch := []nvtopSnapshot{
|
||||
{
|
||||
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||
GpuUtil: strPtr("20%"),
|
||||
PowerDraw: strPtr("10W"),
|
||||
},
|
||||
{
|
||||
DeviceName: "AMD Radeon 680M",
|
||||
GpuUtil: strPtr("30%"),
|
||||
PowerDraw: strPtr("20W"),
|
||||
},
|
||||
}
|
||||
secondBatchSwapped := []nvtopSnapshot{
|
||||
{
|
||||
DeviceName: "AMD Radeon 680M",
|
||||
GpuUtil: strPtr("40%"),
|
||||
PowerDraw: strPtr("25W"),
|
||||
},
|
||||
{
|
||||
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||
GpuUtil: strPtr("50%"),
|
||||
PowerDraw: strPtr("15W"),
|
||||
},
|
||||
}
|
||||
|
||||
require.True(t, gm.updateNvtopSnapshots(firstBatch))
|
||||
require.True(t, gm.updateNvtopSnapshots(secondBatchSwapped))
|
||||
|
||||
nvidia := gm.GpuDataMap["n0"]
|
||||
require.NotNil(t, nvidia)
|
||||
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", nvidia.Name)
|
||||
assert.Equal(t, 70.0, nvidia.Usage)
|
||||
assert.Equal(t, 25.0, nvidia.Power)
|
||||
assert.Equal(t, 2.0, nvidia.Count)
|
||||
|
||||
amd := gm.GpuDataMap["n1"]
|
||||
require.NotNil(t, amd)
|
||||
assert.Equal(t, "AMD Radeon 680M", amd.Name)
|
||||
assert.Equal(t, 70.0, amd.Usage)
|
||||
assert.Equal(t, 45.0, amd.Power)
|
||||
assert.Equal(t, 2.0, amd.Count)
|
||||
}
|
||||
|
||||
func TestParseCollectorPriority(t *testing.T) {
|
||||
got := parseCollectorPriority(" nvml, nvidia-smi, intel_gpu_top, amd_sysfs, nvtop, rocm-smi, bad ")
|
||||
want := []collectorSource{
|
||||
collectorSourceNVML,
|
||||
collectorSourceNvidiaSMI,
|
||||
collectorSourceIntelGpuTop,
|
||||
collectorSourceAmdSysfs,
|
||||
collectorSourceNVTop,
|
||||
collectorSourceRocmSMI,
|
||||
}
|
||||
assert.Equal(t, want, got)
|
||||
}
|
||||
|
||||
func TestParseJetsonData(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -987,36 +1081,35 @@ func TestCalculateGPUAverage(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestDetectGPUs(t *testing.T) {
|
||||
func TestGPUCapabilitiesAndLegacyPriority(t *testing.T) {
|
||||
// Save original PATH
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
// Set up temp dir with the commands
|
||||
tempDir := t.TempDir()
|
||||
os.Setenv("PATH", tempDir)
|
||||
hasAmdSysfs := (&GPUManager{}).hasAmdSysfs()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
setupCommands func() error
|
||||
setupCommands func(string) error
|
||||
wantNvidiaSmi bool
|
||||
wantRocmSmi bool
|
||||
wantTegrastats bool
|
||||
wantNvtop bool
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "nvidia-smi not available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(_ string) error {
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: false,
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "nvidia-smi available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "nvidia-smi")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
@@ -1028,29 +1121,14 @@ echo "test"`
|
||||
wantNvidiaSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantRocmSmi: false,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "rocm-smi available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "rocm-smi")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: true,
|
||||
wantRocmSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "tegrastats available",
|
||||
setupCommands: func() error {
|
||||
path := filepath.Join(tempDir, "tegrastats")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
@@ -1059,12 +1137,47 @@ echo "test"`
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "tegrastats available",
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "tegrastats")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: true,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "nvtop available",
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "nvtop")
|
||||
script := `#!/bin/sh
|
||||
echo "[]"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: true,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "no gpu tools available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(_ string) error {
|
||||
os.Setenv("PATH", "")
|
||||
return nil
|
||||
},
|
||||
@@ -1074,29 +1187,53 @@ echo "test"`
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if err := tt.setupCommands(); err != nil {
|
||||
tempDir := t.TempDir()
|
||||
os.Setenv("PATH", tempDir)
|
||||
if err := tt.setupCommands(tempDir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
gm := &GPUManager{}
|
||||
err := gm.detectGPUs()
|
||||
caps := gm.discoverGpuCapabilities()
|
||||
var err error
|
||||
if !hasAnyGpuCollector(caps) {
|
||||
err = fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
priorities := gm.resolveLegacyCollectorPriority(caps)
|
||||
hasPriority := func(source collectorSource) bool {
|
||||
for _, s := range priorities {
|
||||
if s == source {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
gotNvidiaSmi := hasPriority(collectorSourceNvidiaSMI)
|
||||
gotRocmSmi := hasPriority(collectorSourceRocmSMI)
|
||||
gotTegrastats := caps.hasTegrastats
|
||||
gotNvtop := caps.hasNvtop
|
||||
|
||||
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats)
|
||||
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gotNvidiaSmi, gotRocmSmi, gotTegrastats)
|
||||
|
||||
if tt.wantErr {
|
||||
wantErr := tt.wantErr
|
||||
if hasAmdSysfs && (tt.name == "nvidia-smi not available" || tt.name == "no gpu tools available") {
|
||||
wantErr = false
|
||||
}
|
||||
if wantErr {
|
||||
assert.Error(t, err)
|
||||
return
|
||||
}
|
||||
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi)
|
||||
assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi)
|
||||
assert.Equal(t, tt.wantTegrastats, gm.tegrastats)
|
||||
assert.Equal(t, tt.wantNvidiaSmi, gotNvidiaSmi)
|
||||
assert.Equal(t, tt.wantRocmSmi, gotRocmSmi)
|
||||
assert.Equal(t, tt.wantTegrastats, gotTegrastats)
|
||||
assert.Equal(t, tt.wantNvtop, gotNvtop)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStartCollector(t *testing.T) {
|
||||
func TestCollectorStartHelpers(t *testing.T) {
|
||||
// Save original PATH
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
@@ -1181,6 +1318,27 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "nvtop collector",
|
||||
command: "nvtop",
|
||||
setup: func(t *testing.T) error {
|
||||
path := filepath.Join(dir, "nvtop")
|
||||
script := `#!/bin/sh
|
||||
echo '[{"device_name":"NVIDIA Test GPU","temp":"52C","power_draw":"31W","gpu_util":"37%","mem_total":"4294967296","mem_used":"536870912","processes":[]}]'`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
validate: func(t *testing.T, gm *GPUManager) {
|
||||
gpu, exists := gm.GpuDataMap["n0"]
|
||||
assert.True(t, exists)
|
||||
if exists {
|
||||
assert.Equal(t, "NVIDIA Test GPU", gpu.Name)
|
||||
assert.Equal(t, 52.0, gpu.Temperature)
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -1193,13 +1351,157 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
}
|
||||
tt.gm.startCollector(tt.command)
|
||||
switch tt.command {
|
||||
case nvidiaSmiCmd:
|
||||
tt.gm.startNvidiaSmiCollector("4")
|
||||
case rocmSmiCmd:
|
||||
tt.gm.startRocmSmiCollector(4300 * time.Millisecond)
|
||||
case tegraStatsCmd:
|
||||
tt.gm.startTegraStatsCollector("3700")
|
||||
case nvtopCmd:
|
||||
tt.gm.startNvtopCollector("30", nil)
|
||||
default:
|
||||
t.Fatalf("unknown test command %q", tt.command)
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond) // Give collector time to run
|
||||
tt.validate(t, tt.gm)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityNvtopFallback(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvtop,nvidia-smi")
|
||||
|
||||
nvtopPath := filepath.Join(dir, "nvtop")
|
||||
nvtopScript := `#!/bin/sh
|
||||
echo 'not-json'`
|
||||
require.NoError(t, os.WriteFile(nvtopPath, []byte(nvtopScript), 0755))
|
||||
|
||||
nvidiaPath := filepath.Join(dir, "nvidia-smi")
|
||||
nvidiaScript := `#!/bin/sh
|
||||
echo "0, NVIDIA Priority GPU, 45, 512, 2048, 12, 25"`
|
||||
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "Priority GPU", gpu.Name)
|
||||
assert.Equal(t, 45.0, gpu.Temperature)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityMixedCollectors(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "intel_gpu_top,rocm-smi")
|
||||
|
||||
intelPath := filepath.Join(dir, "intel_gpu_top")
|
||||
intelScript := `#!/bin/sh
|
||||
echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS VCS"
|
||||
echo " req act /s % gpu pkg rd wr % se wa % se wa"
|
||||
echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0"
|
||||
echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0"
|
||||
`
|
||||
require.NoError(t, os.WriteFile(intelPath, []byte(intelScript), 0755))
|
||||
|
||||
rocmPath := filepath.Join(dir, "rocm-smi")
|
||||
rocmScript := `#!/bin/sh
|
||||
echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "GUID": "34756"}}'
|
||||
`
|
||||
require.NoError(t, os.WriteFile(rocmPath, []byte(rocmScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
_, intelOk := gm.GpuDataMap["i0"]
|
||||
_, amdOk := gm.GpuDataMap["34756"]
|
||||
assert.True(t, intelOk)
|
||||
assert.True(t, amdOk)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml,nvidia-smi")
|
||||
|
||||
nvidiaPath := filepath.Join(dir, "nvidia-smi")
|
||||
nvidiaScript := `#!/bin/sh
|
||||
echo "0, NVIDIA Fallback GPU, 41, 256, 1024, 8, 14"`
|
||||
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "Fallback GPU", gpu.Name)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
|
||||
t.Run("configured valid collector unavailable", func(t *testing.T) {
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
|
||||
gm, err := NewGPUManager()
|
||||
require.Nil(t, gm)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||
})
|
||||
|
||||
t.Run("configured collector list has only unknown entries", func(t *testing.T) {
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "bad,unknown")
|
||||
gm, err := NewGPUManager()
|
||||
require.Nil(t, gm)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||
})
|
||||
}
|
||||
|
||||
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
|
||||
|
||||
tegraPath := filepath.Join(dir, "tegrastats")
|
||||
tegraScript := `#!/bin/sh
|
||||
echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"`
|
||||
require.NoError(t, os.WriteFile(tegraPath, []byte(tegraScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "GPU", gpu.Name)
|
||||
}
|
||||
|
||||
// TestAccumulationTableDriven tests the accumulation behavior for all three GPU types
|
||||
func TestAccumulation(t *testing.T) {
|
||||
type expectedGPUValues struct {
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package health
|
||||
|
||||
@@ -37,7 +36,6 @@ func TestHealth(t *testing.T) {
|
||||
})
|
||||
|
||||
// This test uses synctest to simulate time passing.
|
||||
// NOTE: This test requires GOEXPERIMENT=synctest to run.
|
||||
t.Run("check with simulated time", func(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
// Update the file to set the initial timestamp.
|
||||
|
||||
233
agent/mdraid_linux.go
Normal file
233
agent/mdraid_linux.go
Normal file
@@ -0,0 +1,233 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/smart"
|
||||
)
|
||||
|
||||
// mdraidSysfsRoot is a test hook; production value is "/sys".
|
||||
var mdraidSysfsRoot = "/sys"
|
||||
|
||||
type mdraidHealth struct {
|
||||
level string
|
||||
arrayState string
|
||||
degraded uint64
|
||||
raidDisks uint64
|
||||
syncAction string
|
||||
syncCompleted string
|
||||
syncSpeed string
|
||||
mismatchCnt uint64
|
||||
capacity uint64
|
||||
}
|
||||
|
||||
// scanMdraidDevices discovers Linux md arrays exposed in sysfs.
|
||||
func scanMdraidDevices() []*DeviceInfo {
|
||||
blockDir := filepath.Join(mdraidSysfsRoot, "block")
|
||||
entries, err := os.ReadDir(blockDir)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
devices := make([]*DeviceInfo, 0, 2)
|
||||
for _, ent := range entries {
|
||||
name := ent.Name()
|
||||
if !isMdraidBlockName(name) {
|
||||
continue
|
||||
}
|
||||
mdDir := filepath.Join(blockDir, name, "md")
|
||||
if !utils.FileExists(filepath.Join(mdDir, "array_state")) {
|
||||
continue
|
||||
}
|
||||
|
||||
devPath := filepath.Join("/dev", name)
|
||||
devices = append(devices, &DeviceInfo{
|
||||
Name: devPath,
|
||||
Type: "mdraid",
|
||||
InfoName: devPath + " [mdraid]",
|
||||
Protocol: "MD",
|
||||
})
|
||||
}
|
||||
|
||||
return devices
|
||||
}
|
||||
|
||||
// collectMdraidHealth reads mdraid health and stores it in SmartDataMap.
|
||||
func (sm *SmartManager) collectMdraidHealth(deviceInfo *DeviceInfo) (bool, error) {
|
||||
if deviceInfo == nil || deviceInfo.Name == "" {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
base := filepath.Base(deviceInfo.Name)
|
||||
if !isMdraidBlockName(base) && !strings.EqualFold(deviceInfo.Type, "mdraid") {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
health, ok := readMdraidHealth(base)
|
||||
if !ok {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
deviceInfo.Type = "mdraid"
|
||||
key := fmt.Sprintf("mdraid:%s", base)
|
||||
status := mdraidSmartStatus(health)
|
||||
|
||||
attrs := make([]*smart.SmartAttribute, 0, 10)
|
||||
if health.arrayState != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "ArrayState", RawString: health.arrayState})
|
||||
}
|
||||
if health.level != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "RaidLevel", RawString: health.level})
|
||||
}
|
||||
if health.raidDisks > 0 {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "RaidDisks", RawValue: health.raidDisks})
|
||||
}
|
||||
if health.degraded > 0 {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "Degraded", RawValue: health.degraded})
|
||||
}
|
||||
if health.syncAction != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncAction", RawString: health.syncAction})
|
||||
}
|
||||
if health.syncCompleted != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncCompleted", RawString: health.syncCompleted})
|
||||
}
|
||||
if health.syncSpeed != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncSpeed", RawString: health.syncSpeed})
|
||||
}
|
||||
if health.mismatchCnt > 0 {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "MismatchCount", RawValue: health.mismatchCnt})
|
||||
}
|
||||
|
||||
sm.Lock()
|
||||
defer sm.Unlock()
|
||||
|
||||
if _, exists := sm.SmartDataMap[key]; !exists {
|
||||
sm.SmartDataMap[key] = &smart.SmartData{}
|
||||
}
|
||||
|
||||
data := sm.SmartDataMap[key]
|
||||
data.ModelName = "Linux MD RAID"
|
||||
if health.level != "" {
|
||||
data.ModelName = "Linux MD RAID (" + health.level + ")"
|
||||
}
|
||||
data.Capacity = health.capacity
|
||||
data.SmartStatus = status
|
||||
data.DiskName = filepath.Join("/dev", base)
|
||||
data.DiskType = "mdraid"
|
||||
data.Attributes = attrs
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// readMdraidHealth reads md array health fields from sysfs.
|
||||
func readMdraidHealth(blockName string) (mdraidHealth, bool) {
|
||||
var out mdraidHealth
|
||||
|
||||
if !isMdraidBlockName(blockName) {
|
||||
return out, false
|
||||
}
|
||||
|
||||
mdDir := filepath.Join(mdraidSysfsRoot, "block", blockName, "md")
|
||||
arrayState, okState := utils.ReadStringFileOK(filepath.Join(mdDir, "array_state"))
|
||||
if !okState {
|
||||
return out, false
|
||||
}
|
||||
|
||||
out.arrayState = arrayState
|
||||
out.level = utils.ReadStringFile(filepath.Join(mdDir, "level"))
|
||||
out.syncAction = utils.ReadStringFile(filepath.Join(mdDir, "sync_action"))
|
||||
out.syncCompleted = utils.ReadStringFile(filepath.Join(mdDir, "sync_completed"))
|
||||
out.syncSpeed = utils.ReadStringFile(filepath.Join(mdDir, "sync_speed"))
|
||||
|
||||
if val, ok := utils.ReadUintFile(filepath.Join(mdDir, "raid_disks")); ok {
|
||||
out.raidDisks = val
|
||||
}
|
||||
if val, ok := utils.ReadUintFile(filepath.Join(mdDir, "degraded")); ok {
|
||||
out.degraded = val
|
||||
}
|
||||
if val, ok := utils.ReadUintFile(filepath.Join(mdDir, "mismatch_cnt")); ok {
|
||||
out.mismatchCnt = val
|
||||
}
|
||||
|
||||
if capBytes, ok := readMdraidBlockCapacityBytes(blockName, mdraidSysfsRoot); ok {
|
||||
out.capacity = capBytes
|
||||
}
|
||||
|
||||
return out, true
|
||||
}
|
||||
|
||||
// mdraidSmartStatus maps md state/sync signals to a SMART-like status.
|
||||
func mdraidSmartStatus(health mdraidHealth) string {
|
||||
state := strings.ToLower(strings.TrimSpace(health.arrayState))
|
||||
switch state {
|
||||
case "inactive", "faulty", "broken", "stopped":
|
||||
return "FAILED"
|
||||
}
|
||||
// During rebuild/recovery, arrays are often temporarily degraded; report as
|
||||
// warning instead of hard failure while synchronization is in progress.
|
||||
syncAction := strings.ToLower(strings.TrimSpace(health.syncAction))
|
||||
switch syncAction {
|
||||
case "resync", "recover", "reshape":
|
||||
return "WARNING"
|
||||
}
|
||||
if health.degraded > 0 {
|
||||
return "FAILED"
|
||||
}
|
||||
switch syncAction {
|
||||
case "check", "repair":
|
||||
return "WARNING"
|
||||
}
|
||||
switch state {
|
||||
case "clean", "active", "active-idle", "write-pending", "read-auto", "readonly":
|
||||
return "PASSED"
|
||||
}
|
||||
return "UNKNOWN"
|
||||
}
|
||||
|
||||
// isMdraidBlockName matches /dev/mdN-style block device names.
|
||||
func isMdraidBlockName(name string) bool {
|
||||
if !strings.HasPrefix(name, "md") {
|
||||
return false
|
||||
}
|
||||
suffix := strings.TrimPrefix(name, "md")
|
||||
if suffix == "" {
|
||||
return false
|
||||
}
|
||||
for _, c := range suffix {
|
||||
if c < '0' || c > '9' {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// readMdraidBlockCapacityBytes converts block size metadata into bytes.
|
||||
func readMdraidBlockCapacityBytes(blockName, root string) (uint64, bool) {
|
||||
sizePath := filepath.Join(root, "block", blockName, "size")
|
||||
lbsPath := filepath.Join(root, "block", blockName, "queue", "logical_block_size")
|
||||
|
||||
sizeStr, ok := utils.ReadStringFileOK(sizePath)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
sectors, err := strconv.ParseUint(sizeStr, 10, 64)
|
||||
if err != nil || sectors == 0 {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
logicalBlockSize := uint64(512)
|
||||
if lbsStr, ok := utils.ReadStringFileOK(lbsPath); ok {
|
||||
if parsed, err := strconv.ParseUint(lbsStr, 10, 64); err == nil && parsed > 0 {
|
||||
logicalBlockSize = parsed
|
||||
}
|
||||
}
|
||||
|
||||
return sectors * logicalBlockSize, true
|
||||
}
|
||||
103
agent/mdraid_linux_test.go
Normal file
103
agent/mdraid_linux_test.go
Normal file
@@ -0,0 +1,103 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/smart"
|
||||
)
|
||||
|
||||
func TestMdraidMockSysfsScanAndCollect(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
prev := mdraidSysfsRoot
|
||||
mdraidSysfsRoot = tmp
|
||||
t.Cleanup(func() { mdraidSysfsRoot = prev })
|
||||
|
||||
mdDir := filepath.Join(tmp, "block", "md0", "md")
|
||||
queueDir := filepath.Join(tmp, "block", "md0", "queue")
|
||||
if err := os.MkdirAll(mdDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(queueDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
write := func(path, content string) {
|
||||
t.Helper()
|
||||
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
write(filepath.Join(mdDir, "array_state"), "active\n")
|
||||
write(filepath.Join(mdDir, "level"), "raid1\n")
|
||||
write(filepath.Join(mdDir, "raid_disks"), "2\n")
|
||||
write(filepath.Join(mdDir, "degraded"), "0\n")
|
||||
write(filepath.Join(mdDir, "sync_action"), "resync\n")
|
||||
write(filepath.Join(mdDir, "sync_completed"), "10%\n")
|
||||
write(filepath.Join(mdDir, "sync_speed"), "100M\n")
|
||||
write(filepath.Join(mdDir, "mismatch_cnt"), "0\n")
|
||||
write(filepath.Join(queueDir, "logical_block_size"), "512\n")
|
||||
write(filepath.Join(tmp, "block", "md0", "size"), "2048\n")
|
||||
|
||||
devs := scanMdraidDevices()
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("scanMdraidDevices() = %d devices, want 1", len(devs))
|
||||
}
|
||||
if devs[0].Name != "/dev/md0" || devs[0].Type != "mdraid" {
|
||||
t.Fatalf("scanMdraidDevices()[0] = %+v, want Name=/dev/md0 Type=mdraid", devs[0])
|
||||
}
|
||||
|
||||
sm := &SmartManager{SmartDataMap: map[string]*smart.SmartData{}}
|
||||
ok, err := sm.collectMdraidHealth(devs[0])
|
||||
if err != nil || !ok {
|
||||
t.Fatalf("collectMdraidHealth() = (ok=%v, err=%v), want (true,nil)", ok, err)
|
||||
}
|
||||
if len(sm.SmartDataMap) != 1 {
|
||||
t.Fatalf("SmartDataMap len=%d, want 1", len(sm.SmartDataMap))
|
||||
}
|
||||
var got *smart.SmartData
|
||||
for _, v := range sm.SmartDataMap {
|
||||
got = v
|
||||
break
|
||||
}
|
||||
if got == nil {
|
||||
t.Fatalf("SmartDataMap value nil")
|
||||
}
|
||||
if got.DiskType != "mdraid" || got.DiskName != "/dev/md0" {
|
||||
t.Fatalf("disk fields = (type=%q name=%q), want (mdraid,/dev/md0)", got.DiskType, got.DiskName)
|
||||
}
|
||||
if got.SmartStatus != "WARNING" {
|
||||
t.Fatalf("SmartStatus=%q, want WARNING", got.SmartStatus)
|
||||
}
|
||||
if got.ModelName == "" || got.Capacity == 0 {
|
||||
t.Fatalf("identity fields = (model=%q cap=%d), want non-empty model and cap>0", got.ModelName, got.Capacity)
|
||||
}
|
||||
if len(got.Attributes) < 5 {
|
||||
t.Fatalf("attributes len=%d, want >= 5", len(got.Attributes))
|
||||
}
|
||||
}
|
||||
|
||||
func TestMdraidSmartStatus(t *testing.T) {
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "inactive"}); got != "FAILED" {
|
||||
t.Fatalf("mdraidSmartStatus(inactive) = %q, want FAILED", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1, syncAction: "recover"}); got != "WARNING" {
|
||||
t.Fatalf("mdraidSmartStatus(degraded+recover) = %q, want WARNING", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1}); got != "FAILED" {
|
||||
t.Fatalf("mdraidSmartStatus(degraded) = %q, want FAILED", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", syncAction: "recover"}); got != "WARNING" {
|
||||
t.Fatalf("mdraidSmartStatus(recover) = %q, want WARNING", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "clean"}); got != "PASSED" {
|
||||
t.Fatalf("mdraidSmartStatus(clean) = %q, want PASSED", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "unknown"}); got != "UNKNOWN" {
|
||||
t.Fatalf("mdraidSmartStatus(unknown) = %q, want UNKNOWN", got)
|
||||
}
|
||||
}
|
||||
11
agent/mdraid_stub.go
Normal file
11
agent/mdraid_stub.go
Normal file
@@ -0,0 +1,11 @@
|
||||
//go:build !linux
|
||||
|
||||
package agent
|
||||
|
||||
func scanMdraidDevices() []*DeviceInfo {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (sm *SmartManager) collectMdraidHealth(deviceInfo *DeviceInfo) (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/deltatracker"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
psutilNet "github.com/shirou/gopsutil/v4/net"
|
||||
)
|
||||
@@ -94,7 +95,7 @@ func (a *Agent) initializeNetIoStats() {
|
||||
a.netInterfaces = make(map[string]struct{}, 0)
|
||||
|
||||
// parse NICS env var for whitelist / blacklist
|
||||
nicsEnvVal, nicsEnvExists := GetEnv("NICS")
|
||||
nicsEnvVal, nicsEnvExists := utils.GetEnv("NICS")
|
||||
var nicCfg *NicConfig
|
||||
if nicsEnvExists {
|
||||
nicCfg = newNicConfig(nicsEnvVal)
|
||||
@@ -103,10 +104,7 @@ func (a *Agent) initializeNetIoStats() {
|
||||
// get current network I/O stats and record valid interfaces
|
||||
if netIO, err := psutilNet.IOCounters(true); err == nil {
|
||||
for _, v := range netIO {
|
||||
if nicsEnvExists && !isValidNic(v.Name, nicCfg) {
|
||||
continue
|
||||
}
|
||||
if a.skipNetworkInterface(v) {
|
||||
if skipNetworkInterface(v, nicCfg) {
|
||||
continue
|
||||
}
|
||||
slog.Info("Detected network interface", "name", v.Name, "sent", v.BytesSent, "recv", v.BytesRecv)
|
||||
@@ -215,10 +213,8 @@ func (a *Agent) applyNetworkTotals(
|
||||
totalBytesSent, totalBytesRecv uint64,
|
||||
bytesSentPerSecond, bytesRecvPerSecond uint64,
|
||||
) {
|
||||
networkSentPs := bytesToMegabytes(float64(bytesSentPerSecond))
|
||||
networkRecvPs := bytesToMegabytes(float64(bytesRecvPerSecond))
|
||||
if networkSentPs > 10_000 || networkRecvPs > 10_000 {
|
||||
slog.Warn("Invalid net stats. Resetting.", "sent", networkSentPs, "recv", networkRecvPs)
|
||||
if bytesSentPerSecond > 10_000_000_000 || bytesRecvPerSecond > 10_000_000_000 {
|
||||
slog.Warn("Invalid net stats. Resetting.", "sent", bytesSentPerSecond, "recv", bytesRecvPerSecond)
|
||||
for _, v := range netIO {
|
||||
if _, exists := a.netInterfaces[v.Name]; !exists {
|
||||
continue
|
||||
@@ -228,21 +224,29 @@ func (a *Agent) applyNetworkTotals(
|
||||
a.initializeNetIoStats()
|
||||
delete(a.netIoStats, cacheTimeMs)
|
||||
delete(a.netInterfaceDeltaTrackers, cacheTimeMs)
|
||||
systemStats.NetworkSent = 0
|
||||
systemStats.NetworkRecv = 0
|
||||
systemStats.Bandwidth[0], systemStats.Bandwidth[1] = 0, 0
|
||||
return
|
||||
}
|
||||
|
||||
systemStats.NetworkSent = networkSentPs
|
||||
systemStats.NetworkRecv = networkRecvPs
|
||||
systemStats.Bandwidth[0], systemStats.Bandwidth[1] = bytesSentPerSecond, bytesRecvPerSecond
|
||||
nis.BytesSent = totalBytesSent
|
||||
nis.BytesRecv = totalBytesRecv
|
||||
a.netIoStats[cacheTimeMs] = nis
|
||||
}
|
||||
|
||||
func (a *Agent) skipNetworkInterface(v psutilNet.IOCountersStat) bool {
|
||||
// skipNetworkInterface returns true if the network interface should be ignored.
|
||||
func skipNetworkInterface(v psutilNet.IOCountersStat, nicCfg *NicConfig) bool {
|
||||
if nicCfg != nil {
|
||||
if !isValidNic(v.Name, nicCfg) {
|
||||
return true
|
||||
}
|
||||
// In whitelist mode, we honor explicit inclusion without auto-filtering.
|
||||
if !nicCfg.isBlacklist {
|
||||
return false
|
||||
}
|
||||
// In blacklist mode, still apply the auto-filter below.
|
||||
}
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(v.Name, "lo"),
|
||||
strings.HasPrefix(v.Name, "docker"),
|
||||
|
||||
@@ -261,6 +261,39 @@ func TestNewNicConfig(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
func TestSkipNetworkInterface(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
nic psutilNet.IOCountersStat
|
||||
nicCfg *NicConfig
|
||||
expectSkip bool
|
||||
}{
|
||||
{"loopback lo", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"loopback lo0", psutilNet.IOCountersStat{Name: "lo0", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"docker prefix", psutilNet.IOCountersStat{Name: "docker0", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"br- prefix", psutilNet.IOCountersStat{Name: "br-lan", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"veth prefix", psutilNet.IOCountersStat{Name: "veth0abc", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"bond prefix", psutilNet.IOCountersStat{Name: "bond0", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"cali prefix", psutilNet.IOCountersStat{Name: "cali1234", BytesSent: 100, BytesRecv: 100}, nil, true},
|
||||
{"zero BytesRecv", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 0}, nil, true},
|
||||
{"zero BytesSent", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 0, BytesRecv: 100}, nil, true},
|
||||
{"both zero", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 0, BytesRecv: 0}, nil, true},
|
||||
{"normal eth0", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 200}, nil, false},
|
||||
{"normal wlan0", psutilNet.IOCountersStat{Name: "wlan0", BytesSent: 1, BytesRecv: 1}, nil, false},
|
||||
{"whitelist overrides skip (docker)", psutilNet.IOCountersStat{Name: "docker0", BytesSent: 100, BytesRecv: 100}, newNicConfig("docker0"), false},
|
||||
{"whitelist overrides skip (lo)", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, newNicConfig("lo"), false},
|
||||
{"whitelist exclusion", psutilNet.IOCountersStat{Name: "eth1", BytesSent: 100, BytesRecv: 100}, newNicConfig("eth0"), true},
|
||||
{"blacklist skip lo", psutilNet.IOCountersStat{Name: "lo", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), true},
|
||||
{"blacklist explicit eth0", psutilNet.IOCountersStat{Name: "eth0", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), true},
|
||||
{"blacklist allow eth1", psutilNet.IOCountersStat{Name: "eth1", BytesSent: 100, BytesRecv: 100}, newNicConfig("-eth0"), false},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.expectSkip, skipNetworkInterface(tt.nic, tt.nicCfg))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureNetworkInterfacesMap(t *testing.T) {
|
||||
var a Agent
|
||||
var stats system.Stats
|
||||
@@ -383,8 +416,6 @@ func TestApplyNetworkTotals(t *testing.T) {
|
||||
totalBytesSent uint64
|
||||
totalBytesRecv uint64
|
||||
expectReset bool
|
||||
expectedNetworkSent float64
|
||||
expectedNetworkRecv float64
|
||||
expectedBandwidthSent uint64
|
||||
expectedBandwidthRecv uint64
|
||||
}{
|
||||
@@ -395,8 +426,6 @@ func TestApplyNetworkTotals(t *testing.T) {
|
||||
totalBytesSent: 10000000,
|
||||
totalBytesRecv: 20000000,
|
||||
expectReset: false,
|
||||
expectedNetworkSent: 0.95, // ~1 MB/s rounded to 2 decimals
|
||||
expectedNetworkRecv: 1.91, // ~2 MB/s rounded to 2 decimals
|
||||
expectedBandwidthSent: 1000000,
|
||||
expectedBandwidthRecv: 2000000,
|
||||
},
|
||||
@@ -424,18 +453,6 @@ func TestApplyNetworkTotals(t *testing.T) {
|
||||
totalBytesRecv: 20000000,
|
||||
expectReset: true,
|
||||
},
|
||||
{
|
||||
name: "Valid network stats - at threshold boundary",
|
||||
bytesSentPerSecond: 10485750000, // ~9999.99 MB/s (rounds to 9999.99)
|
||||
bytesRecvPerSecond: 10485750000, // ~9999.99 MB/s (rounds to 9999.99)
|
||||
totalBytesSent: 10000000,
|
||||
totalBytesRecv: 20000000,
|
||||
expectReset: false,
|
||||
expectedNetworkSent: 9999.99,
|
||||
expectedNetworkRecv: 9999.99,
|
||||
expectedBandwidthSent: 10485750000,
|
||||
expectedBandwidthRecv: 10485750000,
|
||||
},
|
||||
{
|
||||
name: "Zero values",
|
||||
bytesSentPerSecond: 0,
|
||||
@@ -443,8 +460,6 @@ func TestApplyNetworkTotals(t *testing.T) {
|
||||
totalBytesSent: 0,
|
||||
totalBytesRecv: 0,
|
||||
expectReset: false,
|
||||
expectedNetworkSent: 0.0,
|
||||
expectedNetworkRecv: 0.0,
|
||||
expectedBandwidthSent: 0,
|
||||
expectedBandwidthRecv: 0,
|
||||
},
|
||||
@@ -481,14 +496,10 @@ func TestApplyNetworkTotals(t *testing.T) {
|
||||
// Should have reset network tracking state - maps cleared and stats zeroed
|
||||
assert.NotContains(t, a.netIoStats, cacheTimeMs, "cache entry should be cleared after reset")
|
||||
assert.NotContains(t, a.netInterfaceDeltaTrackers, cacheTimeMs, "tracker should be cleared on reset")
|
||||
assert.Zero(t, systemStats.NetworkSent)
|
||||
assert.Zero(t, systemStats.NetworkRecv)
|
||||
assert.Zero(t, systemStats.Bandwidth[0])
|
||||
assert.Zero(t, systemStats.Bandwidth[1])
|
||||
} else {
|
||||
// Should have applied stats
|
||||
assert.Equal(t, tt.expectedNetworkSent, systemStats.NetworkSent)
|
||||
assert.Equal(t, tt.expectedNetworkRecv, systemStats.NetworkRecv)
|
||||
assert.Equal(t, tt.expectedBandwidthSent, systemStats.Bandwidth[0])
|
||||
assert.Equal(t, tt.expectedBandwidthRecv, systemStats.Bandwidth[1])
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
|
||||
"github.com/shirou/gopsutil/v4/common"
|
||||
@@ -26,9 +27,9 @@ type SensorConfig struct {
|
||||
}
|
||||
|
||||
func (a *Agent) newSensorConfig() *SensorConfig {
|
||||
primarySensor, _ := GetEnv("PRIMARY_SENSOR")
|
||||
sysSensors, _ := GetEnv("SYS_SENSORS")
|
||||
sensorsEnvVal, sensorsSet := GetEnv("SENSORS")
|
||||
primarySensor, _ := utils.GetEnv("PRIMARY_SENSOR")
|
||||
sysSensors, _ := utils.GetEnv("SYS_SENSORS")
|
||||
sensorsEnvVal, sensorsSet := utils.GetEnv("SENSORS")
|
||||
skipCollection := sensorsSet && sensorsEnvVal == ""
|
||||
|
||||
return a.newSensorConfigWithEnv(primarySensor, sysSensors, sensorsEnvVal, skipCollection)
|
||||
@@ -135,7 +136,7 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) {
|
||||
case sensorName:
|
||||
a.systemInfo.DashboardTemp = sensor.Temperature
|
||||
}
|
||||
systemStats.Temperatures[sensorName] = twoDecimals(sensor.Temperature)
|
||||
systemStats.Temperatures[sensorName] = utils.TwoDecimals(sensor.Temperature)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
|
||||
@@ -36,6 +37,9 @@ var hubVersions map[string]semver.Version
|
||||
// and begins listening for connections. Returns an error if the server
|
||||
// is already running or if there's an issue starting the server.
|
||||
func (a *Agent) StartServer(opts ServerOptions) error {
|
||||
if disableSSH, _ := utils.GetEnv("DISABLE_SSH"); disableSSH == "true" {
|
||||
return errors.New("SSH disabled")
|
||||
}
|
||||
if a.server != nil {
|
||||
return errors.New("server already started")
|
||||
}
|
||||
@@ -235,11 +239,11 @@ func ParseKeys(input string) ([]gossh.PublicKey, error) {
|
||||
// and finally defaults to ":45876".
|
||||
func GetAddress(addr string) string {
|
||||
if addr == "" {
|
||||
addr, _ = GetEnv("LISTEN")
|
||||
addr, _ = utils.GetEnv("LISTEN")
|
||||
}
|
||||
if addr == "" {
|
||||
// Legacy PORT environment variable support
|
||||
addr, _ = GetEnv("PORT")
|
||||
addr, _ = utils.GetEnv("PORT")
|
||||
}
|
||||
if addr == "" {
|
||||
return ":45876"
|
||||
@@ -255,7 +259,7 @@ func GetAddress(addr string) string {
|
||||
// It checks the NETWORK environment variable first, then infers from
|
||||
// the address format: addresses starting with "/" are "unix", others are "tcp".
|
||||
func GetNetwork(addr string) string {
|
||||
if network, ok := GetEnv("NETWORK"); ok && network != "" {
|
||||
if network, ok := utils.GetEnv("NETWORK"); ok && network != "" {
|
||||
return network
|
||||
}
|
||||
if strings.HasPrefix(addr, "/") {
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
//go:build testing
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
@@ -180,6 +182,23 @@ func TestStartServer(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestStartServerDisableSSH(t *testing.T) {
|
||||
os.Setenv("BESZEL_AGENT_DISABLE_SSH", "true")
|
||||
defer os.Unsetenv("BESZEL_AGENT_DISABLE_SSH")
|
||||
|
||||
agent, err := NewAgent("")
|
||||
require.NoError(t, err)
|
||||
|
||||
opts := ServerOptions{
|
||||
Network: "tcp",
|
||||
Addr: ":45990",
|
||||
}
|
||||
|
||||
err = agent.StartServer(opts)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "SSH disabled")
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
//////////////////// ParseKeys Tests ////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
134
agent/smart.go
134
agent/smart.go
@@ -8,6 +8,7 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -17,9 +18,8 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/smart"
|
||||
|
||||
"log/slog"
|
||||
)
|
||||
|
||||
// SmartManager manages data collection for SMART devices
|
||||
@@ -29,7 +29,7 @@ type SmartManager struct {
|
||||
SmartDevices []*DeviceInfo
|
||||
refreshMutex sync.Mutex
|
||||
lastScanTime time.Time
|
||||
binPath string
|
||||
smartctlPath string
|
||||
excludedDevices map[string]struct{}
|
||||
}
|
||||
|
||||
@@ -157,7 +157,7 @@ func (sm *SmartManager) ScanDevices(force bool) error {
|
||||
currentDevices := sm.devicesSnapshot()
|
||||
|
||||
var configuredDevices []*DeviceInfo
|
||||
if configuredRaw, ok := GetEnv("SMART_DEVICES"); ok {
|
||||
if configuredRaw, ok := utils.GetEnv("SMART_DEVICES"); ok {
|
||||
slog.Info("SMART_DEVICES", "value", configuredRaw)
|
||||
config := strings.TrimSpace(configuredRaw)
|
||||
if config == "" {
|
||||
@@ -171,27 +171,42 @@ func (sm *SmartManager) ScanDevices(force bool) error {
|
||||
configuredDevices = parsedDevices
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, sm.binPath, "--scan", "-j")
|
||||
output, err := cmd.Output()
|
||||
|
||||
var (
|
||||
scanErr error
|
||||
scannedDevices []*DeviceInfo
|
||||
hasValidScan bool
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
scanErr = err
|
||||
} else {
|
||||
scannedDevices, hasValidScan = sm.parseScan(output)
|
||||
if !hasValidScan {
|
||||
scanErr = errNoValidSmartData
|
||||
if sm.smartctlPath != "" {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, sm.smartctlPath, "--scan", "-j")
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
scanErr = err
|
||||
} else {
|
||||
scannedDevices, hasValidScan = sm.parseScan(output)
|
||||
if !hasValidScan {
|
||||
scanErr = errNoValidSmartData
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add eMMC devices (Linux only) by reading sysfs health fields. This does not
|
||||
// require smartctl and does not scan the whole device.
|
||||
if emmcDevices := scanEmmcDevices(); len(emmcDevices) > 0 {
|
||||
scannedDevices = append(scannedDevices, emmcDevices...)
|
||||
hasValidScan = true
|
||||
}
|
||||
|
||||
// Add Linux mdraid arrays by reading sysfs health fields. This does not
|
||||
// require smartctl and does not scan the whole device.
|
||||
if raidDevices := scanMdraidDevices(); len(raidDevices) > 0 {
|
||||
scannedDevices = append(scannedDevices, raidDevices...)
|
||||
hasValidScan = true
|
||||
}
|
||||
|
||||
finalDevices := mergeDeviceLists(currentDevices, scannedDevices, configuredDevices)
|
||||
finalDevices = sm.filterExcludedDevices(finalDevices)
|
||||
sm.updateSmartDevices(finalDevices)
|
||||
@@ -208,7 +223,7 @@ func (sm *SmartManager) ScanDevices(force bool) error {
|
||||
}
|
||||
|
||||
func (sm *SmartManager) parseConfiguredDevices(config string) ([]*DeviceInfo, error) {
|
||||
splitChar := os.Getenv("SMART_DEVICES_SEPARATOR")
|
||||
splitChar, _ := utils.GetEnv("SMART_DEVICES_SEPARATOR")
|
||||
if splitChar == "" {
|
||||
splitChar = ","
|
||||
}
|
||||
@@ -246,7 +261,7 @@ func (sm *SmartManager) parseConfiguredDevices(config string) ([]*DeviceInfo, er
|
||||
}
|
||||
|
||||
func (sm *SmartManager) refreshExcludedDevices() {
|
||||
rawValue, _ := GetEnv("EXCLUDE_SMART")
|
||||
rawValue, _ := utils.GetEnv("EXCLUDE_SMART")
|
||||
sm.excludedDevices = make(map[string]struct{})
|
||||
|
||||
for entry := range strings.SplitSeq(rawValue, ",") {
|
||||
@@ -443,6 +458,24 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
||||
return errNoValidSmartData
|
||||
}
|
||||
|
||||
// mdraid health is not exposed via SMART; Linux exposes array state in sysfs.
|
||||
if deviceInfo != nil {
|
||||
if ok, err := sm.collectMdraidHealth(deviceInfo); ok {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// eMMC health is not exposed via SMART on Linux, but the kernel provides
|
||||
// wear / EOL indicators via sysfs. Prefer that path when available.
|
||||
if deviceInfo != nil {
|
||||
if ok, err := sm.collectEmmcHealth(deviceInfo); ok {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if sm.smartctlPath == "" {
|
||||
return errNoValidSmartData
|
||||
}
|
||||
|
||||
// slog.Info("collecting SMART data", "device", deviceInfo.Name, "type", deviceInfo.Type, "has_existing_data", sm.hasDataForDevice(deviceInfo.Name))
|
||||
|
||||
// Check if we have any existing data for this device
|
||||
@@ -453,11 +486,11 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
||||
|
||||
// Try with -n standby first if we have existing data
|
||||
args := sm.smartctlArgs(deviceInfo, hasExistingData)
|
||||
cmd := exec.CommandContext(ctx, sm.binPath, args...)
|
||||
cmd := exec.CommandContext(ctx, sm.smartctlPath, args...)
|
||||
output, err := cmd.CombinedOutput()
|
||||
|
||||
// Check if device is in standby (exit status 2)
|
||||
if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 2 {
|
||||
if exitErr, ok := errors.AsType[*exec.ExitError](err); ok && exitErr.ExitCode() == 2 {
|
||||
if hasExistingData {
|
||||
// Device is in standby and we have cached data, keep using cache
|
||||
return nil
|
||||
@@ -466,7 +499,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
||||
ctx2, cancel2 := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel2()
|
||||
args = sm.smartctlArgs(deviceInfo, false)
|
||||
cmd = exec.CommandContext(ctx2, sm.binPath, args...)
|
||||
cmd = exec.CommandContext(ctx2, sm.smartctlPath, args...)
|
||||
output, err = cmd.CombinedOutput()
|
||||
}
|
||||
|
||||
@@ -483,7 +516,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
||||
ctx3, cancel3 := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel3()
|
||||
args = sm.smartctlArgs(deviceInfo, false)
|
||||
cmd = exec.CommandContext(ctx3, sm.binPath, args...)
|
||||
cmd = exec.CommandContext(ctx3, sm.smartctlPath, args...)
|
||||
output, err = cmd.CombinedOutput()
|
||||
hasValidData = sm.parseSmartOutput(deviceInfo, output)
|
||||
|
||||
@@ -838,15 +871,18 @@ func (sm *SmartManager) parseSmartForSata(output []byte) (bool, int) {
|
||||
smartData.FirmwareVersion = data.FirmwareVersion
|
||||
smartData.Capacity = data.UserCapacity.Bytes
|
||||
smartData.Temperature = data.Temperature.Current
|
||||
if smartData.Temperature == 0 {
|
||||
if temp, ok := temperatureFromAtaDeviceStatistics(data.AtaDeviceStatistics); ok {
|
||||
smartData.Temperature = temp
|
||||
}
|
||||
}
|
||||
smartData.SmartStatus = getSmartStatus(smartData.Temperature, data.SmartStatus.Passed)
|
||||
smartData.DiskName = data.Device.Name
|
||||
smartData.DiskType = data.Device.Type
|
||||
|
||||
// get values from ata_device_statistics if necessary
|
||||
var ataDeviceStats smart.AtaDeviceStatistics
|
||||
if smartData.Temperature == 0 {
|
||||
if temp := findAtaDeviceStatisticsValue(&data, &ataDeviceStats, 5, "Current Temperature", 0, 255); temp != nil {
|
||||
smartData.Temperature = uint8(*temp)
|
||||
}
|
||||
}
|
||||
|
||||
// update SmartAttributes
|
||||
smartData.Attributes = make([]*smart.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
|
||||
for _, attr := range data.AtaSmartAttributes.Table {
|
||||
@@ -881,23 +917,20 @@ func getSmartStatus(temperature uint8, passed bool) string {
|
||||
}
|
||||
}
|
||||
|
||||
func temperatureFromAtaDeviceStatistics(stats smart.AtaDeviceStatistics) (uint8, bool) {
|
||||
entry := findAtaDeviceStatisticsEntry(stats, 5, "Current Temperature")
|
||||
if entry == nil || entry.Value == nil {
|
||||
return 0, false
|
||||
}
|
||||
if *entry.Value > 255 {
|
||||
return 0, false
|
||||
}
|
||||
return uint8(*entry.Value), true
|
||||
}
|
||||
|
||||
// findAtaDeviceStatisticsEntry centralizes ATA devstat lookups so additional
|
||||
// metrics can be pulled from the same structure in the future.
|
||||
func findAtaDeviceStatisticsEntry(stats smart.AtaDeviceStatistics, pageNumber uint8, entryName string) *smart.AtaDeviceStatisticsEntry {
|
||||
for pageIdx := range stats.Pages {
|
||||
page := &stats.Pages[pageIdx]
|
||||
if page.Number != pageNumber {
|
||||
func findAtaDeviceStatisticsValue(data *smart.SmartInfoForSata, ataDeviceStats *smart.AtaDeviceStatistics, entryNumber uint8, entryName string, minValue, maxValue int64) *int64 {
|
||||
if len(ataDeviceStats.Pages) == 0 {
|
||||
if len(data.AtaDeviceStatistics) == 0 {
|
||||
return nil
|
||||
}
|
||||
if err := json.Unmarshal(data.AtaDeviceStatistics, ataDeviceStats); err != nil {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
for pageIdx := range ataDeviceStats.Pages {
|
||||
page := &ataDeviceStats.Pages[pageIdx]
|
||||
if page.Number != entryNumber {
|
||||
continue
|
||||
}
|
||||
for entryIdx := range page.Table {
|
||||
@@ -905,7 +938,10 @@ func findAtaDeviceStatisticsEntry(stats smart.AtaDeviceStatistics, pageNumber ui
|
||||
if !strings.EqualFold(entry.Name, entryName) {
|
||||
continue
|
||||
}
|
||||
return entry
|
||||
if entry.Value == nil || *entry.Value < minValue || *entry.Value > maxValue {
|
||||
return nil
|
||||
}
|
||||
return entry.Value
|
||||
}
|
||||
}
|
||||
return nil
|
||||
@@ -1124,11 +1160,17 @@ func NewSmartManager() (*SmartManager, error) {
|
||||
}
|
||||
sm.refreshExcludedDevices()
|
||||
path, err := sm.detectSmartctl()
|
||||
slog.Debug("smartctl", "path", path, "err", err)
|
||||
if err != nil {
|
||||
slog.Debug(err.Error())
|
||||
// Keep the previous fail-fast behavior unless this Linux host exposes
|
||||
// eMMC or mdraid health via sysfs, in which case smartctl is optional.
|
||||
if runtime.GOOS == "linux" {
|
||||
if len(scanEmmcDevices()) > 0 || len(scanMdraidDevices()) > 0 {
|
||||
return sm, nil
|
||||
}
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
slog.Debug("smartctl", "path", path)
|
||||
sm.binPath = path
|
||||
sm.smartctlPath = path
|
||||
return sm, nil
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package agent
|
||||
|
||||
@@ -122,6 +121,78 @@ func TestParseSmartForSataDeviceStatisticsTemperature(t *testing.T) {
|
||||
assert.Equal(t, uint8(22), deviceData.Temperature)
|
||||
}
|
||||
|
||||
func TestParseSmartForSataAtaDeviceStatistics(t *testing.T) {
|
||||
// tests that ata_device_statistics values are parsed correctly
|
||||
jsonPayload := []byte(`{
|
||||
"smartctl": {"exit_status": 0},
|
||||
"device": {"name": "/dev/sdb", "type": "sat"},
|
||||
"model_name": "SanDisk SSD U110 16GB",
|
||||
"serial_number": "lksjfh23lhj",
|
||||
"firmware_version": "U21B001",
|
||||
"user_capacity": {"bytes": 16013942784},
|
||||
"smart_status": {"passed": true},
|
||||
"ata_smart_attributes": {"table": []},
|
||||
"ata_device_statistics": {
|
||||
"pages": [
|
||||
{
|
||||
"number": 5,
|
||||
"name": "Temperature Statistics",
|
||||
"table": [
|
||||
{"name": "Current Temperature", "value": 43, "flags": {"valid": true}},
|
||||
{"name": "Specified Minimum Operating Temperature", "value": -20, "flags": {"valid": true}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}`)
|
||||
|
||||
sm := &SmartManager{SmartDataMap: make(map[string]*smart.SmartData)}
|
||||
hasData, exitStatus := sm.parseSmartForSata(jsonPayload)
|
||||
require.True(t, hasData)
|
||||
assert.Equal(t, 0, exitStatus)
|
||||
|
||||
deviceData, ok := sm.SmartDataMap["lksjfh23lhj"]
|
||||
require.True(t, ok, "expected smart data entry for serial lksjfh23lhj")
|
||||
assert.Equal(t, uint8(43), deviceData.Temperature)
|
||||
}
|
||||
|
||||
func TestParseSmartForSataNegativeDeviceStatistics(t *testing.T) {
|
||||
// Tests that negative values in ata_device_statistics (e.g. min operating temp)
|
||||
// do not cause the entire SAT parser to fail.
|
||||
jsonPayload := []byte(`{
|
||||
"smartctl": {"exit_status": 0},
|
||||
"device": {"name": "/dev/sdb", "type": "sat"},
|
||||
"model_name": "SanDisk SSD U110 16GB",
|
||||
"serial_number": "NEGATIVE123",
|
||||
"firmware_version": "U21B001",
|
||||
"user_capacity": {"bytes": 16013942784},
|
||||
"smart_status": {"passed": true},
|
||||
"temperature": {"current": 38},
|
||||
"ata_smart_attributes": {"table": []},
|
||||
"ata_device_statistics": {
|
||||
"pages": [
|
||||
{
|
||||
"number": 5,
|
||||
"name": "Temperature Statistics",
|
||||
"table": [
|
||||
{"name": "Current Temperature", "value": 38, "flags": {"valid": true}},
|
||||
{"name": "Specified Minimum Operating Temperature", "value": -20, "flags": {"valid": true}}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}`)
|
||||
|
||||
sm := &SmartManager{SmartDataMap: make(map[string]*smart.SmartData)}
|
||||
hasData, exitStatus := sm.parseSmartForSata(jsonPayload)
|
||||
require.True(t, hasData)
|
||||
assert.Equal(t, 0, exitStatus)
|
||||
|
||||
deviceData, ok := sm.SmartDataMap["NEGATIVE123"]
|
||||
require.True(t, ok, "expected smart data entry for serial NEGATIVE123")
|
||||
assert.Equal(t, uint8(38), deviceData.Temperature)
|
||||
}
|
||||
|
||||
func TestParseSmartForSataParentheticalRawValue(t *testing.T) {
|
||||
jsonPayload := []byte(`{
|
||||
"smartctl": {"exit_status": 0},
|
||||
@@ -728,6 +799,182 @@ func TestIsVirtualDeviceScsi(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAtaDeviceStatisticsValue(t *testing.T) {
|
||||
val42 := int64(42)
|
||||
val100 := int64(100)
|
||||
valMinus20 := int64(-20)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
data smart.SmartInfoForSata
|
||||
ataDeviceStats smart.AtaDeviceStatistics
|
||||
entryNumber uint8
|
||||
entryName string
|
||||
minValue int64
|
||||
maxValue int64
|
||||
expectedValue *int64
|
||||
}{
|
||||
{
|
||||
name: "value in ataDeviceStats",
|
||||
ataDeviceStats: smart.AtaDeviceStatistics{
|
||||
Pages: []smart.AtaDeviceStatisticsPage{
|
||||
{
|
||||
Number: 5,
|
||||
Table: []smart.AtaDeviceStatisticsEntry{
|
||||
{Name: "Current Temperature", Value: &val42},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
entryNumber: 5,
|
||||
entryName: "Current Temperature",
|
||||
minValue: 0,
|
||||
maxValue: 100,
|
||||
expectedValue: &val42,
|
||||
},
|
||||
{
|
||||
name: "value unmarshaled from data",
|
||||
data: smart.SmartInfoForSata{
|
||||
AtaDeviceStatistics: []byte(`{"pages":[{"number":5,"table":[{"name":"Current Temperature","value":100}]}]}`),
|
||||
},
|
||||
entryNumber: 5,
|
||||
entryName: "Current Temperature",
|
||||
minValue: 0,
|
||||
maxValue: 255,
|
||||
expectedValue: &val100,
|
||||
},
|
||||
{
|
||||
name: "value out of range (too high)",
|
||||
ataDeviceStats: smart.AtaDeviceStatistics{
|
||||
Pages: []smart.AtaDeviceStatisticsPage{
|
||||
{
|
||||
Number: 5,
|
||||
Table: []smart.AtaDeviceStatisticsEntry{
|
||||
{Name: "Current Temperature", Value: &val100},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
entryNumber: 5,
|
||||
entryName: "Current Temperature",
|
||||
minValue: 0,
|
||||
maxValue: 50,
|
||||
expectedValue: nil,
|
||||
},
|
||||
{
|
||||
name: "value out of range (too low)",
|
||||
ataDeviceStats: smart.AtaDeviceStatistics{
|
||||
Pages: []smart.AtaDeviceStatisticsPage{
|
||||
{
|
||||
Number: 5,
|
||||
Table: []smart.AtaDeviceStatisticsEntry{
|
||||
{Name: "Min Temp", Value: &valMinus20},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
entryNumber: 5,
|
||||
entryName: "Min Temp",
|
||||
minValue: 0,
|
||||
maxValue: 100,
|
||||
expectedValue: nil,
|
||||
},
|
||||
{
|
||||
name: "no statistics available",
|
||||
data: smart.SmartInfoForSata{},
|
||||
entryNumber: 5,
|
||||
entryName: "Current Temperature",
|
||||
minValue: 0,
|
||||
maxValue: 255,
|
||||
expectedValue: nil,
|
||||
},
|
||||
{
|
||||
name: "wrong page number",
|
||||
ataDeviceStats: smart.AtaDeviceStatistics{
|
||||
Pages: []smart.AtaDeviceStatisticsPage{
|
||||
{
|
||||
Number: 1,
|
||||
Table: []smart.AtaDeviceStatisticsEntry{
|
||||
{Name: "Current Temperature", Value: &val42},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
entryNumber: 5,
|
||||
entryName: "Current Temperature",
|
||||
minValue: 0,
|
||||
maxValue: 100,
|
||||
expectedValue: nil,
|
||||
},
|
||||
{
|
||||
name: "wrong entry name",
|
||||
ataDeviceStats: smart.AtaDeviceStatistics{
|
||||
Pages: []smart.AtaDeviceStatisticsPage{
|
||||
{
|
||||
Number: 5,
|
||||
Table: []smart.AtaDeviceStatisticsEntry{
|
||||
{Name: "Other Stat", Value: &val42},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
entryNumber: 5,
|
||||
entryName: "Current Temperature",
|
||||
minValue: 0,
|
||||
maxValue: 100,
|
||||
expectedValue: nil,
|
||||
},
|
||||
{
|
||||
name: "case insensitive name match",
|
||||
ataDeviceStats: smart.AtaDeviceStatistics{
|
||||
Pages: []smart.AtaDeviceStatisticsPage{
|
||||
{
|
||||
Number: 5,
|
||||
Table: []smart.AtaDeviceStatisticsEntry{
|
||||
{Name: "CURRENT TEMPERATURE", Value: &val42},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
entryNumber: 5,
|
||||
entryName: "Current Temperature",
|
||||
minValue: 0,
|
||||
maxValue: 100,
|
||||
expectedValue: &val42,
|
||||
},
|
||||
{
|
||||
name: "entry value is nil",
|
||||
ataDeviceStats: smart.AtaDeviceStatistics{
|
||||
Pages: []smart.AtaDeviceStatisticsPage{
|
||||
{
|
||||
Number: 5,
|
||||
Table: []smart.AtaDeviceStatisticsEntry{
|
||||
{Name: "Current Temperature", Value: nil},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
entryNumber: 5,
|
||||
entryName: "Current Temperature",
|
||||
minValue: 0,
|
||||
maxValue: 100,
|
||||
expectedValue: nil,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := findAtaDeviceStatisticsValue(&tt.data, &tt.ataDeviceStats, tt.entryNumber, tt.entryName, tt.minValue, tt.maxValue)
|
||||
if tt.expectedValue == nil {
|
||||
assert.Nil(t, result)
|
||||
} else {
|
||||
require.NotNil(t, result)
|
||||
assert.Equal(t, *tt.expectedValue, *result)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRefreshExcludedDevices(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
|
||||
@@ -7,12 +7,13 @@ import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel"
|
||||
"github.com/henrygd/beszel/agent/battery"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/agent/zfs"
|
||||
"github.com/henrygd/beszel/internal/entities/container"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
|
||||
@@ -107,7 +108,7 @@ func (a *Agent) refreshSystemDetails() {
|
||||
}
|
||||
|
||||
// zfs
|
||||
if _, err := getARCSize(); err != nil {
|
||||
if _, err := zfs.ARCSize(); err != nil {
|
||||
slog.Debug("Not monitoring ZFS ARC", "err", err)
|
||||
} else {
|
||||
a.zfs = true
|
||||
@@ -127,13 +128,13 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
|
||||
// cpu metrics
|
||||
cpuMetrics, err := getCpuMetrics(cacheTimeMs)
|
||||
if err == nil {
|
||||
systemStats.Cpu = twoDecimals(cpuMetrics.Total)
|
||||
systemStats.Cpu = utils.TwoDecimals(cpuMetrics.Total)
|
||||
systemStats.CpuBreakdown = []float64{
|
||||
twoDecimals(cpuMetrics.User),
|
||||
twoDecimals(cpuMetrics.System),
|
||||
twoDecimals(cpuMetrics.Iowait),
|
||||
twoDecimals(cpuMetrics.Steal),
|
||||
twoDecimals(cpuMetrics.Idle),
|
||||
utils.TwoDecimals(cpuMetrics.User),
|
||||
utils.TwoDecimals(cpuMetrics.System),
|
||||
utils.TwoDecimals(cpuMetrics.Iowait),
|
||||
utils.TwoDecimals(cpuMetrics.Steal),
|
||||
utils.TwoDecimals(cpuMetrics.Idle),
|
||||
}
|
||||
} else {
|
||||
slog.Error("Error getting cpu metrics", "err", err)
|
||||
@@ -157,8 +158,8 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
|
||||
// memory
|
||||
if v, err := mem.VirtualMemory(); err == nil {
|
||||
// swap
|
||||
systemStats.Swap = bytesToGigabytes(v.SwapTotal)
|
||||
systemStats.SwapUsed = bytesToGigabytes(v.SwapTotal - v.SwapFree - v.SwapCached)
|
||||
systemStats.Swap = utils.BytesToGigabytes(v.SwapTotal)
|
||||
systemStats.SwapUsed = utils.BytesToGigabytes(v.SwapTotal - v.SwapFree - v.SwapCached)
|
||||
// cache + buffers value for default mem calculation
|
||||
// note: gopsutil automatically adds SReclaimable to v.Cached
|
||||
cacheBuff := v.Cached + v.Buffers - v.Shared
|
||||
@@ -178,16 +179,16 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
|
||||
// }
|
||||
// subtract ZFS ARC size from used memory and add as its own category
|
||||
if a.zfs {
|
||||
if arcSize, _ := getARCSize(); arcSize > 0 && arcSize < v.Used {
|
||||
if arcSize, _ := zfs.ARCSize(); arcSize > 0 && arcSize < v.Used {
|
||||
v.Used = v.Used - arcSize
|
||||
v.UsedPercent = float64(v.Used) / float64(v.Total) * 100.0
|
||||
systemStats.MemZfsArc = bytesToGigabytes(arcSize)
|
||||
systemStats.MemZfsArc = utils.BytesToGigabytes(arcSize)
|
||||
}
|
||||
}
|
||||
systemStats.Mem = bytesToGigabytes(v.Total)
|
||||
systemStats.MemBuffCache = bytesToGigabytes(cacheBuff)
|
||||
systemStats.MemUsed = bytesToGigabytes(v.Used)
|
||||
systemStats.MemPct = twoDecimals(v.UsedPercent)
|
||||
systemStats.Mem = utils.BytesToGigabytes(v.Total)
|
||||
systemStats.MemBuffCache = utils.BytesToGigabytes(cacheBuff)
|
||||
systemStats.MemUsed = utils.BytesToGigabytes(v.Used)
|
||||
systemStats.MemPct = utils.TwoDecimals(v.UsedPercent)
|
||||
}
|
||||
|
||||
// disk usage
|
||||
@@ -250,32 +251,6 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
|
||||
return systemStats
|
||||
}
|
||||
|
||||
// Returns the size of the ZFS ARC memory cache in bytes
|
||||
func getARCSize() (uint64, error) {
|
||||
file, err := os.Open("/proc/spl/kstat/zfs/arcstats")
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
// Scan the lines
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if strings.HasPrefix(line, "size") {
|
||||
// Example line: size 4 15032385536
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 3 {
|
||||
return 0, err
|
||||
}
|
||||
// Return the size as uint64
|
||||
return strconv.ParseUint(fields[2], 10, 64)
|
||||
}
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("failed to parse size field")
|
||||
}
|
||||
|
||||
// getOsPrettyName attempts to get the pretty OS name from /etc/os-release on Linux systems
|
||||
func getOsPrettyName() (string, error) {
|
||||
file, err := os.Open("/etc/os-release")
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/coreos/go-systemd/v22/dbus"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/henrygd/beszel/internal/entities/systemd"
|
||||
)
|
||||
|
||||
@@ -49,7 +50,7 @@ func isSystemdAvailable() bool {
|
||||
|
||||
// newSystemdManager creates a new systemdManager.
|
||||
func newSystemdManager() (*systemdManager, error) {
|
||||
if skipSystemd, _ := GetEnv("SKIP_SYSTEMD"); skipSystemd == "true" {
|
||||
if skipSystemd, _ := utils.GetEnv("SKIP_SYSTEMD"); skipSystemd == "true" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@@ -294,7 +295,7 @@ func unescapeServiceName(name string) string {
|
||||
// otherwise defaults to "*service".
|
||||
func getServicePatterns() []string {
|
||||
patterns := []string{}
|
||||
if envPatterns, _ := GetEnv("SERVICE_PATTERNS"); envPatterns != "" {
|
||||
if envPatterns, _ := utils.GetEnv("SERVICE_PATTERNS"); envPatterns != "" {
|
||||
for pattern := range strings.SplitSeq(envPatterns, ",") {
|
||||
pattern = strings.TrimSpace(pattern)
|
||||
if pattern == "" {
|
||||
|
||||
700
agent/test-data/amdgpu.ids
Normal file
700
agent/test-data/amdgpu.ids
Normal file
@@ -0,0 +1,700 @@
|
||||
# List of AMDGPU IDs
|
||||
#
|
||||
# Syntax:
|
||||
# device_id, revision_id, product_name <-- single tab after comma
|
||||
|
||||
1.0.0
|
||||
1114, C2, AMD Radeon 860M Graphics
|
||||
1114, C3, AMD Radeon 840M Graphics
|
||||
1114, D2, AMD Radeon 860M Graphics
|
||||
1114, D3, AMD Radeon 840M Graphics
|
||||
1309, 00, AMD Radeon R7 Graphics
|
||||
130A, 00, AMD Radeon R6 Graphics
|
||||
130B, 00, AMD Radeon R4 Graphics
|
||||
130C, 00, AMD Radeon R7 Graphics
|
||||
130D, 00, AMD Radeon R6 Graphics
|
||||
130E, 00, AMD Radeon R5 Graphics
|
||||
130F, 00, AMD Radeon R7 Graphics
|
||||
130F, D4, AMD Radeon R7 Graphics
|
||||
130F, D5, AMD Radeon R7 Graphics
|
||||
130F, D6, AMD Radeon R7 Graphics
|
||||
130F, D7, AMD Radeon R7 Graphics
|
||||
1313, 00, AMD Radeon R7 Graphics
|
||||
1313, D4, AMD Radeon R7 Graphics
|
||||
1313, D5, AMD Radeon R7 Graphics
|
||||
1313, D6, AMD Radeon R7 Graphics
|
||||
1315, 00, AMD Radeon R5 Graphics
|
||||
1315, D4, AMD Radeon R5 Graphics
|
||||
1315, D5, AMD Radeon R5 Graphics
|
||||
1315, D6, AMD Radeon R5 Graphics
|
||||
1315, D7, AMD Radeon R5 Graphics
|
||||
1316, 00, AMD Radeon R5 Graphics
|
||||
1318, 00, AMD Radeon R5 Graphics
|
||||
131B, 00, AMD Radeon R4 Graphics
|
||||
131C, 00, AMD Radeon R7 Graphics
|
||||
131D, 00, AMD Radeon R6 Graphics
|
||||
1435, AE, AMD Custom GPU 0932
|
||||
1506, C1, AMD Radeon 610M
|
||||
1506, C2, AMD Radeon 610M
|
||||
1506, C3, AMD Radeon 610M
|
||||
1506, C4, AMD Radeon 610M
|
||||
150E, C1, AMD Radeon 890M Graphics
|
||||
150E, C4, AMD Radeon 890M Graphics
|
||||
150E, C5, AMD Radeon 890M Graphics
|
||||
150E, C6, AMD Radeon 890M Graphics
|
||||
150E, D1, AMD Radeon 890M Graphics
|
||||
150E, D2, AMD Radeon 890M Graphics
|
||||
150E, D3, AMD Radeon 890M Graphics
|
||||
1586, C1, Radeon 8060S Graphics
|
||||
1586, C2, Radeon 8050S Graphics
|
||||
1586, C4, Radeon 8050S Graphics
|
||||
1586, D1, Radeon 8060S Graphics
|
||||
1586, D2, Radeon 8050S Graphics
|
||||
1586, D4, Radeon 8050S Graphics
|
||||
1586, D5, Radeon 8040S Graphics
|
||||
15BF, 00, AMD Radeon 780M Graphics
|
||||
15BF, 01, AMD Radeon 760M Graphics
|
||||
15BF, 02, AMD Radeon 780M Graphics
|
||||
15BF, 03, AMD Radeon 760M Graphics
|
||||
15BF, C1, AMD Radeon 780M Graphics
|
||||
15BF, C2, AMD Radeon 780M Graphics
|
||||
15BF, C3, AMD Radeon 760M Graphics
|
||||
15BF, C4, AMD Radeon 780M Graphics
|
||||
15BF, C5, AMD Radeon 740M Graphics
|
||||
15BF, C6, AMD Radeon 780M Graphics
|
||||
15BF, C7, AMD Radeon 780M Graphics
|
||||
15BF, C8, AMD Radeon 760M Graphics
|
||||
15BF, C9, AMD Radeon 780M Graphics
|
||||
15BF, CA, AMD Radeon 740M Graphics
|
||||
15BF, CB, AMD Radeon 760M Graphics
|
||||
15BF, CC, AMD Radeon 740M Graphics
|
||||
15BF, CD, AMD Radeon 760M Graphics
|
||||
15BF, CF, AMD Radeon 780M Graphics
|
||||
15BF, D0, AMD Radeon 780M Graphics
|
||||
15BF, D1, AMD Radeon 780M Graphics
|
||||
15BF, D2, AMD Radeon 780M Graphics
|
||||
15BF, D3, AMD Radeon 780M Graphics
|
||||
15BF, D4, AMD Radeon 780M Graphics
|
||||
15BF, D5, AMD Radeon 760M Graphics
|
||||
15BF, D6, AMD Radeon 760M Graphics
|
||||
15BF, D7, AMD Radeon 780M Graphics
|
||||
15BF, D8, AMD Radeon 740M Graphics
|
||||
15BF, D9, AMD Radeon 780M Graphics
|
||||
15BF, DA, AMD Radeon 780M Graphics
|
||||
15BF, DB, AMD Radeon 760M Graphics
|
||||
15BF, DC, AMD Radeon 760M Graphics
|
||||
15BF, DD, AMD Radeon 780M Graphics
|
||||
15BF, DE, AMD Radeon 740M Graphics
|
||||
15BF, DF, AMD Radeon 760M Graphics
|
||||
15BF, F0, AMD Radeon 760M Graphics
|
||||
15C8, C1, AMD Radeon 740M Graphics
|
||||
15C8, C2, AMD Radeon 740M Graphics
|
||||
15C8, C3, AMD Radeon 740M Graphics
|
||||
15C8, C4, AMD Radeon 740M Graphics
|
||||
15C8, D1, AMD Radeon 740M Graphics
|
||||
15C8, D2, AMD Radeon 740M Graphics
|
||||
15C8, D3, AMD Radeon 740M Graphics
|
||||
15C8, D4, AMD Radeon 740M Graphics
|
||||
15D8, 00, AMD Radeon RX Vega 8 Graphics WS
|
||||
15D8, 91, AMD Radeon Vega 3 Graphics
|
||||
15D8, 91, AMD Ryzen Embedded R1606G with Radeon Vega Gfx
|
||||
15D8, 92, AMD Radeon Vega 3 Graphics
|
||||
15D8, 92, AMD Ryzen Embedded R1505G with Radeon Vega Gfx
|
||||
15D8, 93, AMD Radeon Vega 1 Graphics
|
||||
15D8, A1, AMD Radeon Vega 10 Graphics
|
||||
15D8, A2, AMD Radeon Vega 8 Graphics
|
||||
15D8, A3, AMD Radeon Vega 6 Graphics
|
||||
15D8, A4, AMD Radeon Vega 3 Graphics
|
||||
15D8, B1, AMD Radeon Vega 10 Graphics
|
||||
15D8, B2, AMD Radeon Vega 8 Graphics
|
||||
15D8, B3, AMD Radeon Vega 6 Graphics
|
||||
15D8, B4, AMD Radeon Vega 3 Graphics
|
||||
15D8, C1, AMD Radeon Vega 10 Graphics
|
||||
15D8, C2, AMD Radeon Vega 8 Graphics
|
||||
15D8, C3, AMD Radeon Vega 6 Graphics
|
||||
15D8, C4, AMD Radeon Vega 3 Graphics
|
||||
15D8, C5, AMD Radeon Vega 3 Graphics
|
||||
15D8, C8, AMD Radeon Vega 11 Graphics
|
||||
15D8, C9, AMD Radeon Vega 8 Graphics
|
||||
15D8, CA, AMD Radeon Vega 11 Graphics
|
||||
15D8, CB, AMD Radeon Vega 8 Graphics
|
||||
15D8, CC, AMD Radeon Vega 3 Graphics
|
||||
15D8, CE, AMD Radeon Vega 3 Graphics
|
||||
15D8, CF, AMD Ryzen Embedded R1305G with Radeon Vega Gfx
|
||||
15D8, D1, AMD Radeon Vega 10 Graphics
|
||||
15D8, D2, AMD Radeon Vega 8 Graphics
|
||||
15D8, D3, AMD Radeon Vega 6 Graphics
|
||||
15D8, D4, AMD Radeon Vega 3 Graphics
|
||||
15D8, D8, AMD Radeon Vega 11 Graphics
|
||||
15D8, D9, AMD Radeon Vega 8 Graphics
|
||||
15D8, DA, AMD Radeon Vega 11 Graphics
|
||||
15D8, DB, AMD Radeon Vega 3 Graphics
|
||||
15D8, DB, AMD Radeon Vega 8 Graphics
|
||||
15D8, DC, AMD Radeon Vega 3 Graphics
|
||||
15D8, DD, AMD Radeon Vega 3 Graphics
|
||||
15D8, DE, AMD Radeon Vega 3 Graphics
|
||||
15D8, DF, AMD Radeon Vega 3 Graphics
|
||||
15D8, E3, AMD Radeon Vega 3 Graphics
|
||||
15D8, E4, AMD Ryzen Embedded R1102G with Radeon Vega Gfx
|
||||
15DD, 81, AMD Ryzen Embedded V1807B with Radeon Vega Gfx
|
||||
15DD, 82, AMD Ryzen Embedded V1756B with Radeon Vega Gfx
|
||||
15DD, 83, AMD Ryzen Embedded V1605B with Radeon Vega Gfx
|
||||
15DD, 84, AMD Radeon Vega 6 Graphics
|
||||
15DD, 85, AMD Ryzen Embedded V1202B with Radeon Vega Gfx
|
||||
15DD, 86, AMD Radeon Vega 11 Graphics
|
||||
15DD, 88, AMD Radeon Vega 8 Graphics
|
||||
15DD, C1, AMD Radeon Vega 11 Graphics
|
||||
15DD, C2, AMD Radeon Vega 8 Graphics
|
||||
15DD, C3, AMD Radeon Vega 3 / 10 Graphics
|
||||
15DD, C4, AMD Radeon Vega 8 Graphics
|
||||
15DD, C5, AMD Radeon Vega 3 Graphics
|
||||
15DD, C6, AMD Radeon Vega 11 Graphics
|
||||
15DD, C8, AMD Radeon Vega 8 Graphics
|
||||
15DD, C9, AMD Radeon Vega 11 Graphics
|
||||
15DD, CA, AMD Radeon Vega 8 Graphics
|
||||
15DD, CB, AMD Radeon Vega 3 Graphics
|
||||
15DD, CC, AMD Radeon Vega 6 Graphics
|
||||
15DD, CE, AMD Radeon Vega 3 Graphics
|
||||
15DD, CF, AMD Radeon Vega 3 Graphics
|
||||
15DD, D0, AMD Radeon Vega 10 Graphics
|
||||
15DD, D1, AMD Radeon Vega 8 Graphics
|
||||
15DD, D3, AMD Radeon Vega 11 Graphics
|
||||
15DD, D5, AMD Radeon Vega 8 Graphics
|
||||
15DD, D6, AMD Radeon Vega 11 Graphics
|
||||
15DD, D7, AMD Radeon Vega 8 Graphics
|
||||
15DD, D8, AMD Radeon Vega 3 Graphics
|
||||
15DD, D9, AMD Radeon Vega 6 Graphics
|
||||
15DD, E1, AMD Radeon Vega 3 Graphics
|
||||
15DD, E2, AMD Radeon Vega 3 Graphics
|
||||
163F, AE, AMD Custom GPU 0405
|
||||
163F, E1, AMD Custom GPU 0405
|
||||
164E, D8, AMD Radeon 610M
|
||||
164E, D9, AMD Radeon 610M
|
||||
164E, DA, AMD Radeon 610M
|
||||
164E, DB, AMD Radeon 610M
|
||||
164E, DC, AMD Radeon 610M
|
||||
1681, 06, AMD Radeon 680M
|
||||
1681, 07, AMD Radeon 660M
|
||||
1681, 0A, AMD Radeon 680M
|
||||
1681, 0B, AMD Radeon 660M
|
||||
1681, C7, AMD Radeon 680M
|
||||
1681, C8, AMD Radeon 680M
|
||||
1681, C9, AMD Radeon 660M
|
||||
1900, 01, AMD Radeon 780M Graphics
|
||||
1900, 02, AMD Radeon 760M Graphics
|
||||
1900, 03, AMD Radeon 780M Graphics
|
||||
1900, 04, AMD Radeon 760M Graphics
|
||||
1900, 05, AMD Radeon 780M Graphics
|
||||
1900, 06, AMD Radeon 780M Graphics
|
||||
1900, 07, AMD Radeon 760M Graphics
|
||||
1900, B0, AMD Radeon 780M Graphics
|
||||
1900, B1, AMD Radeon 780M Graphics
|
||||
1900, B2, AMD Radeon 780M Graphics
|
||||
1900, B3, AMD Radeon 780M Graphics
|
||||
1900, B4, AMD Radeon 780M Graphics
|
||||
1900, B5, AMD Radeon 780M Graphics
|
||||
1900, B6, AMD Radeon 780M Graphics
|
||||
1900, B7, AMD Radeon 760M Graphics
|
||||
1900, B8, AMD Radeon 760M Graphics
|
||||
1900, B9, AMD Radeon 780M Graphics
|
||||
1900, BA, AMD Radeon 780M Graphics
|
||||
1900, BB, AMD Radeon 780M Graphics
|
||||
1900, C0, AMD Radeon 780M Graphics
|
||||
1900, C1, AMD Radeon 760M Graphics
|
||||
1900, C2, AMD Radeon 780M Graphics
|
||||
1900, C3, AMD Radeon 760M Graphics
|
||||
1900, C4, AMD Radeon 780M Graphics
|
||||
1900, C5, AMD Radeon 780M Graphics
|
||||
1900, C6, AMD Radeon 760M Graphics
|
||||
1900, C7, AMD Radeon 780M Graphics
|
||||
1900, C8, AMD Radeon 760M Graphics
|
||||
1900, C9, AMD Radeon 780M Graphics
|
||||
1900, CA, AMD Radeon 760M Graphics
|
||||
1900, CB, AMD Radeon 780M Graphics
|
||||
1900, CC, AMD Radeon 780M Graphics
|
||||
1900, CD, AMD Radeon 760M Graphics
|
||||
1900, CE, AMD Radeon 780M Graphics
|
||||
1900, CF, AMD Radeon 760M Graphics
|
||||
1900, D0, AMD Radeon 780M Graphics
|
||||
1900, D1, AMD Radeon 760M Graphics
|
||||
1900, D2, AMD Radeon 780M Graphics
|
||||
1900, D3, AMD Radeon 760M Graphics
|
||||
1900, D4, AMD Radeon 780M Graphics
|
||||
1900, D5, AMD Radeon 780M Graphics
|
||||
1900, D6, AMD Radeon 760M Graphics
|
||||
1900, D7, AMD Radeon 780M Graphics
|
||||
1900, D8, AMD Radeon 760M Graphics
|
||||
1900, D9, AMD Radeon 780M Graphics
|
||||
1900, DA, AMD Radeon 760M Graphics
|
||||
1900, DB, AMD Radeon 780M Graphics
|
||||
1900, DC, AMD Radeon 780M Graphics
|
||||
1900, DD, AMD Radeon 760M Graphics
|
||||
1900, DE, AMD Radeon 780M Graphics
|
||||
1900, DF, AMD Radeon 760M Graphics
|
||||
1900, F0, AMD Radeon 780M Graphics
|
||||
1900, F1, AMD Radeon 780M Graphics
|
||||
1900, F2, AMD Radeon 780M Graphics
|
||||
1901, C1, AMD Radeon 740M Graphics
|
||||
1901, C2, AMD Radeon 740M Graphics
|
||||
1901, C3, AMD Radeon 740M Graphics
|
||||
1901, C6, AMD Radeon 740M Graphics
|
||||
1901, C7, AMD Radeon 740M Graphics
|
||||
1901, C8, AMD Radeon 740M Graphics
|
||||
1901, C9, AMD Radeon 740M Graphics
|
||||
1901, CA, AMD Radeon 740M Graphics
|
||||
1901, D1, AMD Radeon 740M Graphics
|
||||
1901, D2, AMD Radeon 740M Graphics
|
||||
1901, D3, AMD Radeon 740M Graphics
|
||||
1901, D4, AMD Radeon 740M Graphics
|
||||
1901, D5, AMD Radeon 740M Graphics
|
||||
1901, D6, AMD Radeon 740M Graphics
|
||||
1901, D7, AMD Radeon 740M Graphics
|
||||
1901, D8, AMD Radeon 740M Graphics
|
||||
6600, 00, AMD Radeon HD 8600 / 8700M
|
||||
6600, 81, AMD Radeon R7 M370
|
||||
6601, 00, AMD Radeon HD 8500M / 8700M
|
||||
6604, 00, AMD Radeon R7 M265 Series
|
||||
6604, 81, AMD Radeon R7 M350
|
||||
6605, 00, AMD Radeon R7 M260 Series
|
||||
6605, 81, AMD Radeon R7 M340
|
||||
6606, 00, AMD Radeon HD 8790M
|
||||
6607, 00, AMD Radeon R5 M240
|
||||
6608, 00, AMD FirePro W2100
|
||||
6610, 00, AMD Radeon R7 200 Series
|
||||
6610, 81, AMD Radeon R7 350
|
||||
6610, 83, AMD Radeon R5 340
|
||||
6610, 87, AMD Radeon R7 200 Series
|
||||
6611, 00, AMD Radeon R7 200 Series
|
||||
6611, 87, AMD Radeon R7 200 Series
|
||||
6613, 00, AMD Radeon R7 200 Series
|
||||
6617, 00, AMD Radeon R7 240 Series
|
||||
6617, 87, AMD Radeon R7 200 Series
|
||||
6617, C7, AMD Radeon R7 240 Series
|
||||
6640, 00, AMD Radeon HD 8950
|
||||
6640, 80, AMD Radeon R9 M380
|
||||
6646, 00, AMD Radeon R9 M280X
|
||||
6646, 80, AMD Radeon R9 M385
|
||||
6646, 80, AMD Radeon R9 M470X
|
||||
6647, 00, AMD Radeon R9 M200X Series
|
||||
6647, 80, AMD Radeon R9 M380
|
||||
6649, 00, AMD FirePro W5100
|
||||
6658, 00, AMD Radeon R7 200 Series
|
||||
665C, 00, AMD Radeon HD 7700 Series
|
||||
665D, 00, AMD Radeon R7 200 Series
|
||||
665F, 81, AMD Radeon R7 360 Series
|
||||
6660, 00, AMD Radeon HD 8600M Series
|
||||
6660, 81, AMD Radeon R5 M335
|
||||
6660, 83, AMD Radeon R5 M330
|
||||
6663, 00, AMD Radeon HD 8500M Series
|
||||
6663, 83, AMD Radeon R5 M320
|
||||
6664, 00, AMD Radeon R5 M200 Series
|
||||
6665, 00, AMD Radeon R5 M230 Series
|
||||
6665, 83, AMD Radeon R5 M320
|
||||
6665, C3, AMD Radeon R5 M435
|
||||
6666, 00, AMD Radeon R5 M200 Series
|
||||
6667, 00, AMD Radeon R5 M200 Series
|
||||
666F, 00, AMD Radeon HD 8500M
|
||||
66A1, 02, AMD Instinct MI60 / MI50
|
||||
66A1, 06, AMD Radeon Pro VII
|
||||
66AF, C1, AMD Radeon VII
|
||||
6780, 00, AMD FirePro W9000
|
||||
6784, 00, ATI FirePro V (FireGL V) Graphics Adapter
|
||||
6788, 00, ATI FirePro V (FireGL V) Graphics Adapter
|
||||
678A, 00, AMD FirePro W8000
|
||||
6798, 00, AMD Radeon R9 200 / HD 7900 Series
|
||||
6799, 00, AMD Radeon HD 7900 Series
|
||||
679A, 00, AMD Radeon HD 7900 Series
|
||||
679B, 00, AMD Radeon HD 7900 Series
|
||||
679E, 00, AMD Radeon HD 7800 Series
|
||||
67A0, 00, AMD Radeon FirePro W9100
|
||||
67A1, 00, AMD Radeon FirePro W8100
|
||||
67B0, 00, AMD Radeon R9 200 Series
|
||||
67B0, 80, AMD Radeon R9 390 Series
|
||||
67B1, 00, AMD Radeon R9 200 Series
|
||||
67B1, 80, AMD Radeon R9 390 Series
|
||||
67B9, 00, AMD Radeon R9 200 Series
|
||||
67C0, 00, AMD Radeon Pro WX 7100 Graphics
|
||||
67C0, 80, AMD Radeon E9550
|
||||
67C2, 01, AMD Radeon Pro V7350x2
|
||||
67C2, 02, AMD Radeon Pro V7300X
|
||||
67C4, 00, AMD Radeon Pro WX 7100 Graphics
|
||||
67C4, 80, AMD Radeon E9560 / E9565 Graphics
|
||||
67C7, 00, AMD Radeon Pro WX 5100 Graphics
|
||||
67C7, 80, AMD Radeon E9390 Graphics
|
||||
67D0, 01, AMD Radeon Pro V7350x2
|
||||
67D0, 02, AMD Radeon Pro V7300X
|
||||
67DF, C0, AMD Radeon Pro 580X
|
||||
67DF, C1, AMD Radeon RX 580 Series
|
||||
67DF, C2, AMD Radeon RX 570 Series
|
||||
67DF, C3, AMD Radeon RX 580 Series
|
||||
67DF, C4, AMD Radeon RX 480 Graphics
|
||||
67DF, C5, AMD Radeon RX 470 Graphics
|
||||
67DF, C6, AMD Radeon RX 570 Series
|
||||
67DF, C7, AMD Radeon RX 480 Graphics
|
||||
67DF, CF, AMD Radeon RX 470 Graphics
|
||||
67DF, D7, AMD Radeon RX 470 Graphics
|
||||
67DF, E0, AMD Radeon RX 470 Series
|
||||
67DF, E1, AMD Radeon RX 590 Series
|
||||
67DF, E3, AMD Radeon RX Series
|
||||
67DF, E7, AMD Radeon RX 580 Series
|
||||
67DF, EB, AMD Radeon Pro 580X
|
||||
67DF, EF, AMD Radeon RX 570 Series
|
||||
67DF, F7, AMD Radeon RX P30PH
|
||||
67DF, FF, AMD Radeon RX 470 Series
|
||||
67E0, 00, AMD Radeon Pro WX Series
|
||||
67E3, 00, AMD Radeon Pro WX 4100
|
||||
67E8, 00, AMD Radeon Pro WX Series
|
||||
67E8, 01, AMD Radeon Pro WX Series
|
||||
67E8, 80, AMD Radeon E9260 Graphics
|
||||
67EB, 00, AMD Radeon Pro V5300X
|
||||
67EF, C0, AMD Radeon RX Graphics
|
||||
67EF, C1, AMD Radeon RX 460 Graphics
|
||||
67EF, C2, AMD Radeon Pro Series
|
||||
67EF, C3, AMD Radeon RX Series
|
||||
67EF, C5, AMD Radeon RX 460 Graphics
|
||||
67EF, C7, AMD Radeon RX Graphics
|
||||
67EF, CF, AMD Radeon RX 460 Graphics
|
||||
67EF, E0, AMD Radeon RX 560 Series
|
||||
67EF, E1, AMD Radeon RX Series
|
||||
67EF, E2, AMD Radeon RX 560X
|
||||
67EF, E3, AMD Radeon RX Series
|
||||
67EF, E5, AMD Radeon RX 560 Series
|
||||
67EF, E7, AMD Radeon RX 560 Series
|
||||
67EF, EF, AMD Radeon 550 Series
|
||||
67EF, FF, AMD Radeon RX 460 Graphics
|
||||
67FF, C0, AMD Radeon Pro 465
|
||||
67FF, C1, AMD Radeon RX 560 Series
|
||||
67FF, CF, AMD Radeon RX 560 Series
|
||||
67FF, EF, AMD Radeon RX 560 Series
|
||||
67FF, FF, AMD Radeon RX 550 Series
|
||||
6800, 00, AMD Radeon HD 7970M
|
||||
6801, 00, AMD Radeon HD 8970M
|
||||
6806, 00, AMD Radeon R9 M290X
|
||||
6808, 00, AMD FirePro W7000
|
||||
6808, 00, ATI FirePro V (FireGL V) Graphics Adapter
|
||||
6809, 00, ATI FirePro W5000
|
||||
6810, 00, AMD Radeon R9 200 Series
|
||||
6810, 81, AMD Radeon R9 370 Series
|
||||
6811, 00, AMD Radeon R9 200 Series
|
||||
6811, 81, AMD Radeon R7 370 Series
|
||||
6818, 00, AMD Radeon HD 7800 Series
|
||||
6819, 00, AMD Radeon HD 7800 Series
|
||||
6820, 00, AMD Radeon R9 M275X
|
||||
6820, 81, AMD Radeon R9 M375
|
||||
6820, 83, AMD Radeon R9 M375X
|
||||
6821, 00, AMD Radeon R9 M200X Series
|
||||
6821, 83, AMD Radeon R9 M370X
|
||||
6821, 87, AMD Radeon R7 M380
|
||||
6822, 00, AMD Radeon E8860
|
||||
6823, 00, AMD Radeon R9 M200X Series
|
||||
6825, 00, AMD Radeon HD 7800M Series
|
||||
6826, 00, AMD Radeon HD 7700M Series
|
||||
6827, 00, AMD Radeon HD 7800M Series
|
||||
6828, 00, AMD FirePro W600
|
||||
682B, 00, AMD Radeon HD 8800M Series
|
||||
682B, 87, AMD Radeon R9 M360
|
||||
682C, 00, AMD FirePro W4100
|
||||
682D, 00, AMD Radeon HD 7700M Series
|
||||
682F, 00, AMD Radeon HD 7700M Series
|
||||
6830, 00, AMD Radeon 7800M Series
|
||||
6831, 00, AMD Radeon 7700M Series
|
||||
6835, 00, AMD Radeon R7 Series / HD 9000 Series
|
||||
6837, 00, AMD Radeon HD 7700 Series
|
||||
683D, 00, AMD Radeon HD 7700 Series
|
||||
683F, 00, AMD Radeon HD 7700 Series
|
||||
684C, 00, ATI FirePro V (FireGL V) Graphics Adapter
|
||||
6860, 00, AMD Radeon Instinct MI25
|
||||
6860, 01, AMD Radeon Instinct MI25
|
||||
6860, 02, AMD Radeon Instinct MI25
|
||||
6860, 03, AMD Radeon Pro V340
|
||||
6860, 04, AMD Radeon Instinct MI25x2
|
||||
6860, 07, AMD Radeon Pro V320
|
||||
6861, 00, AMD Radeon Pro WX 9100
|
||||
6862, 00, AMD Radeon Pro SSG
|
||||
6863, 00, AMD Radeon Vega Frontier Edition
|
||||
6864, 03, AMD Radeon Pro V340
|
||||
6864, 04, AMD Radeon Instinct MI25x2
|
||||
6864, 05, AMD Radeon Pro V340
|
||||
6868, 00, AMD Radeon Pro WX 8200
|
||||
686C, 00, AMD Radeon Instinct MI25 MxGPU
|
||||
686C, 01, AMD Radeon Instinct MI25 MxGPU
|
||||
686C, 02, AMD Radeon Instinct MI25 MxGPU
|
||||
686C, 03, AMD Radeon Pro V340 MxGPU
|
||||
686C, 04, AMD Radeon Instinct MI25x2 MxGPU
|
||||
686C, 05, AMD Radeon Pro V340L MxGPU
|
||||
686C, 06, AMD Radeon Instinct MI25 MxGPU
|
||||
687F, 01, AMD Radeon RX Vega
|
||||
687F, C0, AMD Radeon RX Vega
|
||||
687F, C1, AMD Radeon RX Vega
|
||||
687F, C3, AMD Radeon RX Vega
|
||||
687F, C7, AMD Radeon RX Vega
|
||||
6900, 00, AMD Radeon R7 M260
|
||||
6900, 81, AMD Radeon R7 M360
|
||||
6900, 83, AMD Radeon R7 M340
|
||||
6900, C1, AMD Radeon R5 M465 Series
|
||||
6900, C3, AMD Radeon R5 M445 Series
|
||||
6900, D1, AMD Radeon 530 Series
|
||||
6900, D3, AMD Radeon 530 Series
|
||||
6901, 00, AMD Radeon R5 M255
|
||||
6902, 00, AMD Radeon Series
|
||||
6907, 00, AMD Radeon R5 M255
|
||||
6907, 87, AMD Radeon R5 M315
|
||||
6920, 00, AMD Radeon R9 M395X
|
||||
6920, 01, AMD Radeon R9 M390X
|
||||
6921, 00, AMD Radeon R9 M390X
|
||||
6929, 00, AMD FirePro S7150
|
||||
6929, 01, AMD FirePro S7100X
|
||||
692B, 00, AMD FirePro W7100
|
||||
6938, 00, AMD Radeon R9 200 Series
|
||||
6938, F0, AMD Radeon R9 200 Series
|
||||
6938, F1, AMD Radeon R9 380 Series
|
||||
6939, 00, AMD Radeon R9 200 Series
|
||||
6939, F0, AMD Radeon R9 200 Series
|
||||
6939, F1, AMD Radeon R9 380 Series
|
||||
694C, C0, AMD Radeon RX Vega M GH Graphics
|
||||
694E, C0, AMD Radeon RX Vega M GL Graphics
|
||||
6980, 00, AMD Radeon Pro WX 3100
|
||||
6981, 00, AMD Radeon Pro WX 3200 Series
|
||||
6981, 01, AMD Radeon Pro WX 3200 Series
|
||||
6981, 10, AMD Radeon Pro WX 3200 Series
|
||||
6985, 00, AMD Radeon Pro WX 3100
|
||||
6986, 00, AMD Radeon Pro WX 2100
|
||||
6987, 80, AMD Embedded Radeon E9171
|
||||
6987, C0, AMD Radeon 550X Series
|
||||
6987, C1, AMD Radeon RX 640
|
||||
6987, C3, AMD Radeon 540X Series
|
||||
6987, C7, AMD Radeon 540
|
||||
6995, 00, AMD Radeon Pro WX 2100
|
||||
6997, 00, AMD Radeon Pro WX 2100
|
||||
699F, 81, AMD Embedded Radeon E9170 Series
|
||||
699F, C0, AMD Radeon 500 Series
|
||||
699F, C1, AMD Radeon 540 Series
|
||||
699F, C3, AMD Radeon 500 Series
|
||||
699F, C7, AMD Radeon RX 550 / 550 Series
|
||||
699F, C9, AMD Radeon 540
|
||||
6FDF, E7, AMD Radeon RX 590 GME
|
||||
6FDF, EF, AMD Radeon RX 580 2048SP
|
||||
7300, C1, AMD FirePro S9300 x2
|
||||
7300, C8, AMD Radeon R9 Fury Series
|
||||
7300, C9, AMD Radeon Pro Duo
|
||||
7300, CA, AMD Radeon R9 Fury Series
|
||||
7300, CB, AMD Radeon R9 Fury Series
|
||||
7312, 00, AMD Radeon Pro W5700
|
||||
731E, C6, AMD Radeon RX 5700XTB
|
||||
731E, C7, AMD Radeon RX 5700B
|
||||
731F, C0, AMD Radeon RX 5700 XT 50th Anniversary
|
||||
731F, C1, AMD Radeon RX 5700 XT
|
||||
731F, C2, AMD Radeon RX 5600M
|
||||
731F, C3, AMD Radeon RX 5700M
|
||||
731F, C4, AMD Radeon RX 5700
|
||||
731F, C5, AMD Radeon RX 5700 XT
|
||||
731F, CA, AMD Radeon RX 5600 XT
|
||||
731F, CB, AMD Radeon RX 5600 OEM
|
||||
7340, C1, AMD Radeon RX 5500M
|
||||
7340, C3, AMD Radeon RX 5300M
|
||||
7340, C5, AMD Radeon RX 5500 XT
|
||||
7340, C7, AMD Radeon RX 5500
|
||||
7340, C9, AMD Radeon RX 5500XTB
|
||||
7340, CF, AMD Radeon RX 5300
|
||||
7341, 00, AMD Radeon Pro W5500
|
||||
7347, 00, AMD Radeon Pro W5500M
|
||||
7360, 41, AMD Radeon Pro 5600M
|
||||
7360, C3, AMD Radeon Pro V520
|
||||
7362, C1, AMD Radeon Pro V540
|
||||
7362, C3, AMD Radeon Pro V520
|
||||
738C, 01, AMD Instinct MI100
|
||||
73A1, 00, AMD Radeon Pro V620
|
||||
73A3, 00, AMD Radeon Pro W6800
|
||||
73A5, C0, AMD Radeon RX 6950 XT
|
||||
73AE, 00, AMD Radeon Pro V620 MxGPU
|
||||
73AF, C0, AMD Radeon RX 6900 XT
|
||||
73BF, C0, AMD Radeon RX 6900 XT
|
||||
73BF, C1, AMD Radeon RX 6800 XT
|
||||
73BF, C3, AMD Radeon RX 6800
|
||||
73DF, C0, AMD Radeon RX 6750 XT
|
||||
73DF, C1, AMD Radeon RX 6700 XT
|
||||
73DF, C2, AMD Radeon RX 6800M
|
||||
73DF, C3, AMD Radeon RX 6800M
|
||||
73DF, C5, AMD Radeon RX 6700 XT
|
||||
73DF, CF, AMD Radeon RX 6700M
|
||||
73DF, D5, AMD Radeon RX 6750 GRE 12GB
|
||||
73DF, D7, AMD TDC-235
|
||||
73DF, DF, AMD Radeon RX 6700
|
||||
73DF, E5, AMD Radeon RX 6750 GRE 12GB
|
||||
73DF, FF, AMD Radeon RX 6700
|
||||
73E0, 00, AMD Radeon RX 6600M
|
||||
73E1, 00, AMD Radeon Pro W6600M
|
||||
73E3, 00, AMD Radeon Pro W6600
|
||||
73EF, C0, AMD Radeon RX 6800S
|
||||
73EF, C1, AMD Radeon RX 6650 XT
|
||||
73EF, C2, AMD Radeon RX 6700S
|
||||
73EF, C3, AMD Radeon RX 6650M
|
||||
73EF, C4, AMD Radeon RX 6650M XT
|
||||
73FF, C1, AMD Radeon RX 6600 XT
|
||||
73FF, C3, AMD Radeon RX 6600M
|
||||
73FF, C7, AMD Radeon RX 6600
|
||||
73FF, CB, AMD Radeon RX 6600S
|
||||
73FF, CF, AMD Radeon RX 6600 LE
|
||||
73FF, DF, AMD Radeon RX 6750 GRE 10GB
|
||||
7408, 00, AMD Instinct MI250X
|
||||
740C, 01, AMD Instinct MI250X / MI250
|
||||
740F, 02, AMD Instinct MI210
|
||||
7421, 00, AMD Radeon Pro W6500M
|
||||
7422, 00, AMD Radeon Pro W6400
|
||||
7423, 00, AMD Radeon Pro W6300M
|
||||
7423, 01, AMD Radeon Pro W6300
|
||||
7424, 00, AMD Radeon RX 6300
|
||||
743F, C1, AMD Radeon RX 6500 XT
|
||||
743F, C3, AMD Radeon RX 6500
|
||||
743F, C3, AMD Radeon RX 6500M
|
||||
743F, C7, AMD Radeon RX 6400
|
||||
743F, C8, AMD Radeon RX 6500M
|
||||
743F, CC, AMD Radeon 6550S
|
||||
743F, CE, AMD Radeon RX 6450M
|
||||
743F, CF, AMD Radeon RX 6300M
|
||||
743F, D3, AMD Radeon RX 6550M
|
||||
743F, D7, AMD Radeon RX 6400
|
||||
7448, 00, AMD Radeon Pro W7900
|
||||
7449, 00, AMD Radeon Pro W7800 48GB
|
||||
744A, 00, AMD Radeon Pro W7900 Dual Slot
|
||||
744B, 00, AMD Radeon Pro W7900D
|
||||
744C, C8, AMD Radeon RX 7900 XTX
|
||||
744C, CC, AMD Radeon RX 7900 XT
|
||||
744C, CE, AMD Radeon RX 7900 GRE
|
||||
744C, CF, AMD Radeon RX 7900M
|
||||
745E, CC, AMD Radeon Pro W7800
|
||||
7460, 00, AMD Radeon Pro V710
|
||||
7461, 00, AMD Radeon Pro V710 MxGPU
|
||||
7470, 00, AMD Radeon Pro W7700
|
||||
747E, C8, AMD Radeon RX 7800 XT
|
||||
747E, D8, AMD Radeon RX 7800M
|
||||
747E, DB, AMD Radeon RX 7700
|
||||
747E, FF, AMD Radeon RX 7700 XT
|
||||
7480, 00, AMD Radeon Pro W7600
|
||||
7480, C0, AMD Radeon RX 7600 XT
|
||||
7480, C1, AMD Radeon RX 7700S
|
||||
7480, C2, AMD Radeon RX 7650 GRE
|
||||
7480, C3, AMD Radeon RX 7600S
|
||||
7480, C7, AMD Radeon RX 7600M XT
|
||||
7480, CF, AMD Radeon RX 7600
|
||||
7481, C7, AMD Steam Machine
|
||||
7483, CF, AMD Radeon RX 7600M
|
||||
7489, 00, AMD Radeon Pro W7500
|
||||
7499, 00, AMD Radeon Pro W7400
|
||||
7499, C0, AMD Radeon RX 7400
|
||||
7499, C1, AMD Radeon RX 7300
|
||||
74A0, 00, AMD Instinct MI300A
|
||||
74A1, 00, AMD Instinct MI300X
|
||||
74A2, 00, AMD Instinct MI308X
|
||||
74A5, 00, AMD Instinct MI325X
|
||||
74A8, 00, AMD Instinct MI308X HF
|
||||
74A9, 00, AMD Instinct MI300X HF
|
||||
74B5, 00, AMD Instinct MI300X VF
|
||||
74B6, 00, AMD Instinct MI308X
|
||||
74BD, 00, AMD Instinct MI300X HF
|
||||
7550, C0, AMD Radeon RX 9070 XT
|
||||
7550, C2, AMD Radeon RX 9070 GRE
|
||||
7550, C3, AMD Radeon RX 9070
|
||||
7551, C0, AMD Radeon AI PRO R9700
|
||||
7590, C0, AMD Radeon RX 9060 XT
|
||||
7590, C7, AMD Radeon RX 9060
|
||||
75A0, C0, AMD Instinct MI350X
|
||||
75A3, C0, AMD Instinct MI355X
|
||||
75B0, C0, AMD Instinct MI350X VF
|
||||
75B3, C0, AMD Instinct MI355X VF
|
||||
9830, 00, AMD Radeon HD 8400 / R3 Series
|
||||
9831, 00, AMD Radeon HD 8400E
|
||||
9832, 00, AMD Radeon HD 8330
|
||||
9833, 00, AMD Radeon HD 8330E
|
||||
9834, 00, AMD Radeon HD 8210
|
||||
9835, 00, AMD Radeon HD 8210E
|
||||
9836, 00, AMD Radeon HD 8200 / R3 Series
|
||||
9837, 00, AMD Radeon HD 8280E
|
||||
9838, 00, AMD Radeon HD 8200 / R3 series
|
||||
9839, 00, AMD Radeon HD 8180
|
||||
983D, 00, AMD Radeon HD 8250
|
||||
9850, 00, AMD Radeon R3 Graphics
|
||||
9850, 03, AMD Radeon R3 Graphics
|
||||
9850, 40, AMD Radeon R2 Graphics
|
||||
9850, 45, AMD Radeon R3 Graphics
|
||||
9851, 00, AMD Radeon R4 Graphics
|
||||
9851, 01, AMD Radeon R5E Graphics
|
||||
9851, 05, AMD Radeon R5 Graphics
|
||||
9851, 06, AMD Radeon R5E Graphics
|
||||
9851, 40, AMD Radeon R4 Graphics
|
||||
9851, 45, AMD Radeon R5 Graphics
|
||||
9852, 00, AMD Radeon R2 Graphics
|
||||
9852, 40, AMD Radeon E1 Graphics
|
||||
9853, 00, AMD Radeon R2 Graphics
|
||||
9853, 01, AMD Radeon R4E Graphics
|
||||
9853, 03, AMD Radeon R2 Graphics
|
||||
9853, 05, AMD Radeon R1E Graphics
|
||||
9853, 06, AMD Radeon R1E Graphics
|
||||
9853, 07, AMD Radeon R1E Graphics
|
||||
9853, 08, AMD Radeon R1E Graphics
|
||||
9853, 40, AMD Radeon R2 Graphics
|
||||
9854, 00, AMD Radeon R3 Graphics
|
||||
9854, 01, AMD Radeon R3E Graphics
|
||||
9854, 02, AMD Radeon R3 Graphics
|
||||
9854, 05, AMD Radeon R2 Graphics
|
||||
9854, 06, AMD Radeon R4 Graphics
|
||||
9854, 07, AMD Radeon R3 Graphics
|
||||
9855, 02, AMD Radeon R6 Graphics
|
||||
9855, 05, AMD Radeon R4 Graphics
|
||||
9856, 00, AMD Radeon R2 Graphics
|
||||
9856, 01, AMD Radeon R2E Graphics
|
||||
9856, 02, AMD Radeon R2 Graphics
|
||||
9856, 05, AMD Radeon R1E Graphics
|
||||
9856, 06, AMD Radeon R2 Graphics
|
||||
9856, 07, AMD Radeon R1E Graphics
|
||||
9856, 08, AMD Radeon R1E Graphics
|
||||
9856, 13, AMD Radeon R1E Graphics
|
||||
9874, 81, AMD Radeon R6 Graphics
|
||||
9874, 84, AMD Radeon R7 Graphics
|
||||
9874, 85, AMD Radeon R6 Graphics
|
||||
9874, 87, AMD Radeon R5 Graphics
|
||||
9874, 88, AMD Radeon R7E Graphics
|
||||
9874, 89, AMD Radeon R6E Graphics
|
||||
9874, C4, AMD Radeon R7 Graphics
|
||||
9874, C5, AMD Radeon R6 Graphics
|
||||
9874, C6, AMD Radeon R6 Graphics
|
||||
9874, C7, AMD Radeon R5 Graphics
|
||||
9874, C8, AMD Radeon R7 Graphics
|
||||
9874, C9, AMD Radeon R7 Graphics
|
||||
9874, CA, AMD Radeon R5 Graphics
|
||||
9874, CB, AMD Radeon R5 Graphics
|
||||
9874, CC, AMD Radeon R7 Graphics
|
||||
9874, CD, AMD Radeon R7 Graphics
|
||||
9874, CE, AMD Radeon R5 Graphics
|
||||
9874, E1, AMD Radeon R7 Graphics
|
||||
9874, E2, AMD Radeon R7 Graphics
|
||||
9874, E3, AMD Radeon R7 Graphics
|
||||
9874, E4, AMD Radeon R7 Graphics
|
||||
9874, E5, AMD Radeon R5 Graphics
|
||||
9874, E6, AMD Radeon R5 Graphics
|
||||
98E4, 80, AMD Radeon R5E Graphics
|
||||
98E4, 81, AMD Radeon R4E Graphics
|
||||
98E4, 83, AMD Radeon R2E Graphics
|
||||
98E4, 84, AMD Radeon R2E Graphics
|
||||
98E4, 86, AMD Radeon R1E Graphics
|
||||
98E4, C0, AMD Radeon R4 Graphics
|
||||
98E4, C1, AMD Radeon R5 Graphics
|
||||
98E4, C2, AMD Radeon R4 Graphics
|
||||
98E4, C4, AMD Radeon R5 Graphics
|
||||
98E4, C6, AMD Radeon R5 Graphics
|
||||
98E4, C8, AMD Radeon R4 Graphics
|
||||
98E4, C9, AMD Radeon R4 Graphics
|
||||
98E4, CA, AMD Radeon R5 Graphics
|
||||
98E4, D0, AMD Radeon R2 Graphics
|
||||
98E4, D1, AMD Radeon R2 Graphics
|
||||
98E4, D2, AMD Radeon R2 Graphics
|
||||
98E4, D4, AMD Radeon R2 Graphics
|
||||
98E4, D9, AMD Radeon R5 Graphics
|
||||
98E4, DA, AMD Radeon R5 Graphics
|
||||
98E4, DB, AMD Radeon R3 Graphics
|
||||
98E4, E1, AMD Radeon R3 Graphics
|
||||
98E4, E2, AMD Radeon R3 Graphics
|
||||
98E4, E9, AMD Radeon R4 Graphics
|
||||
98E4, EA, AMD Radeon R4 Graphics
|
||||
98E4, EB, AMD Radeon R3 Graphics
|
||||
98E4, EB, AMD Radeon R4 Graphics
|
||||
34
agent/test-data/nvtop.json
Normal file
34
agent/test-data/nvtop.json
Normal file
@@ -0,0 +1,34 @@
|
||||
[
|
||||
{
|
||||
"device_name": "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||
"gpu_clock": "1485MHz",
|
||||
"mem_clock": "6001MHz",
|
||||
"temp": "48C",
|
||||
"fan_speed": null,
|
||||
"power_draw": "13W",
|
||||
"gpu_util": "5%",
|
||||
"encode": "0%",
|
||||
"decode": "0%",
|
||||
"mem_util": "8%",
|
||||
"mem_total": "4294967296",
|
||||
"mem_used": "349372416",
|
||||
"mem_free": "3945594880",
|
||||
"processes" : []
|
||||
},
|
||||
{
|
||||
"device_name": "AMD Radeon 680M",
|
||||
"gpu_clock": "2200MHz",
|
||||
"mem_clock": "2400MHz",
|
||||
"temp": "48C",
|
||||
"fan_speed": "CPU Fan",
|
||||
"power_draw": "9W",
|
||||
"gpu_util": "12%",
|
||||
"encode": null,
|
||||
"decode": "0%",
|
||||
"mem_util": "7%",
|
||||
"mem_total": "16929173504",
|
||||
"mem_used": "1213784064",
|
||||
"mem_free": "15715389440",
|
||||
"processes" : []
|
||||
}
|
||||
]
|
||||
@@ -63,9 +63,9 @@ func detectRestarter() restarter {
|
||||
if path, err := exec.LookPath("rc-service"); err == nil {
|
||||
return &openRCRestarter{cmd: path}
|
||||
}
|
||||
if path, err := exec.LookPath("procd"); err == nil {
|
||||
return &openWRTRestarter{cmd: path}
|
||||
}
|
||||
if path, err := exec.LookPath("procd"); err == nil {
|
||||
return &openWRTRestarter{cmd: path}
|
||||
}
|
||||
if path, err := exec.LookPath("service"); err == nil {
|
||||
if runtime.GOOS == "freebsd" {
|
||||
return &freeBSDRestarter{cmd: path}
|
||||
@@ -79,7 +79,7 @@ func detectRestarter() restarter {
|
||||
func Update(useMirror bool) error {
|
||||
exePath, _ := os.Executable()
|
||||
|
||||
dataDir, err := getDataDir()
|
||||
dataDir, err := GetDataDir()
|
||||
if err != nil {
|
||||
dataDir = os.TempDir()
|
||||
}
|
||||
@@ -125,4 +125,3 @@ func Update(useMirror bool) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
package agent
|
||||
|
||||
import "math"
|
||||
|
||||
func bytesToMegabytes(b float64) float64 {
|
||||
return twoDecimals(b / 1048576)
|
||||
}
|
||||
|
||||
func bytesToGigabytes(b uint64) float64 {
|
||||
return twoDecimals(float64(b) / 1073741824)
|
||||
}
|
||||
|
||||
func twoDecimals(value float64) float64 {
|
||||
return math.Round(value*100) / 100
|
||||
}
|
||||
88
agent/utils/utils.go
Normal file
88
agent/utils/utils.go
Normal file
@@ -0,0 +1,88 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"io"
|
||||
"math"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GetEnv retrieves an environment variable with a "BESZEL_AGENT_" prefix, or falls back to the unprefixed key.
|
||||
func GetEnv(key string) (value string, exists bool) {
|
||||
if value, exists = os.LookupEnv("BESZEL_AGENT_" + key); exists {
|
||||
return value, exists
|
||||
}
|
||||
return os.LookupEnv(key)
|
||||
}
|
||||
|
||||
// BytesToMegabytes converts bytes to megabytes and rounds to two decimal places.
|
||||
func BytesToMegabytes(b float64) float64 {
|
||||
return TwoDecimals(b / 1048576)
|
||||
}
|
||||
|
||||
// BytesToGigabytes converts bytes to gigabytes and rounds to two decimal places.
|
||||
func BytesToGigabytes(b uint64) float64 {
|
||||
return TwoDecimals(float64(b) / 1073741824)
|
||||
}
|
||||
|
||||
// TwoDecimals rounds a float64 value to two decimal places.
|
||||
func TwoDecimals(value float64) float64 {
|
||||
return math.Round(value*100) / 100
|
||||
}
|
||||
|
||||
// func RoundFloat(val float64, precision uint) float64 {
|
||||
// ratio := math.Pow(10, float64(precision))
|
||||
// return math.Round(val*ratio) / ratio
|
||||
// }
|
||||
|
||||
// ReadStringFile returns trimmed file contents or empty string on error.
|
||||
func ReadStringFile(path string) string {
|
||||
content, _ := ReadStringFileOK(path)
|
||||
return content
|
||||
}
|
||||
|
||||
// ReadStringFileOK returns trimmed file contents and read success.
|
||||
func ReadStringFileOK(path string) (string, bool) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
return strings.TrimSpace(string(b)), true
|
||||
}
|
||||
|
||||
// ReadStringFileLimited reads a file into a string with a maximum size (in bytes) to avoid
|
||||
// allocating large buffers and potential panics with pseudo-files when the size is misreported.
|
||||
func ReadStringFileLimited(path string, maxSize int) (string, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
buf := make([]byte, maxSize)
|
||||
n, err := f.Read(buf)
|
||||
if err != nil && err != io.EOF {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(string(buf[:n])), nil
|
||||
}
|
||||
|
||||
// FileExists reports whether the given path exists.
|
||||
func FileExists(path string) bool {
|
||||
_, err := os.Stat(path)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// ReadUintFile parses a decimal uint64 value from a file.
|
||||
func ReadUintFile(path string) (uint64, bool) {
|
||||
raw, ok := ReadStringFileOK(path)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
parsed, err := strconv.ParseUint(raw, 10, 64)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return parsed, true
|
||||
}
|
||||
165
agent/utils/utils_test.go
Normal file
165
agent/utils/utils_test.go
Normal file
@@ -0,0 +1,165 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestTwoDecimals(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input float64
|
||||
expected float64
|
||||
}{
|
||||
{"round down", 1.234, 1.23},
|
||||
{"round half up", 1.235, 1.24}, // math.Round rounds half up
|
||||
{"no rounding needed", 1.23, 1.23},
|
||||
{"negative number", -1.235, -1.24}, // math.Round rounds half up (more negative)
|
||||
{"zero", 0.0, 0.0},
|
||||
{"large number", 123.456, 123.46}, // rounds 5 up
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := TwoDecimals(tt.input)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytesToMegabytes(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input float64
|
||||
expected float64
|
||||
}{
|
||||
{"1 MB", 1048576, 1.0},
|
||||
{"512 KB", 524288, 0.5},
|
||||
{"zero", 0, 0},
|
||||
{"large value", 1073741824, 1024}, // 1 GB = 1024 MB
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := BytesToMegabytes(tt.input)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytesToGigabytes(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input uint64
|
||||
expected float64
|
||||
}{
|
||||
{"1 GB", 1073741824, 1.0},
|
||||
{"512 MB", 536870912, 0.5},
|
||||
{"0 GB", 0, 0},
|
||||
{"2 GB", 2147483648, 2.0},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := BytesToGigabytes(tt.input)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFileFunctions(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
testFilePath := filepath.Join(tmpDir, "test.txt")
|
||||
testContent := "hello world"
|
||||
|
||||
// Test FileExists (false)
|
||||
assert.False(t, FileExists(testFilePath))
|
||||
|
||||
// Test ReadStringFileOK (false)
|
||||
content, ok := ReadStringFileOK(testFilePath)
|
||||
assert.False(t, ok)
|
||||
assert.Empty(t, content)
|
||||
|
||||
// Test ReadStringFile (empty)
|
||||
assert.Empty(t, ReadStringFile(testFilePath))
|
||||
|
||||
// Write file
|
||||
err := os.WriteFile(testFilePath, []byte(testContent+"\n "), 0644)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Test FileExists (true)
|
||||
assert.True(t, FileExists(testFilePath))
|
||||
|
||||
// Test ReadStringFileOK (true)
|
||||
content, ok = ReadStringFileOK(testFilePath)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, testContent, content)
|
||||
|
||||
// Test ReadStringFile (content)
|
||||
assert.Equal(t, testContent, ReadStringFile(testFilePath))
|
||||
}
|
||||
|
||||
func TestReadUintFile(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
|
||||
t.Run("valid uint", func(t *testing.T) {
|
||||
path := filepath.Join(tmpDir, "uint.txt")
|
||||
os.WriteFile(path, []byte(" 12345\n"), 0644)
|
||||
val, ok := ReadUintFile(path)
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, uint64(12345), val)
|
||||
})
|
||||
|
||||
t.Run("invalid uint", func(t *testing.T) {
|
||||
path := filepath.Join(tmpDir, "invalid.txt")
|
||||
os.WriteFile(path, []byte("abc"), 0644)
|
||||
val, ok := ReadUintFile(path)
|
||||
assert.False(t, ok)
|
||||
assert.Equal(t, uint64(0), val)
|
||||
})
|
||||
|
||||
t.Run("missing file", func(t *testing.T) {
|
||||
path := filepath.Join(tmpDir, "missing.txt")
|
||||
val, ok := ReadUintFile(path)
|
||||
assert.False(t, ok)
|
||||
assert.Equal(t, uint64(0), val)
|
||||
})
|
||||
}
|
||||
|
||||
func TestGetEnv(t *testing.T) {
|
||||
key := "TEST_VAR"
|
||||
prefixedKey := "BESZEL_AGENT_" + key
|
||||
|
||||
t.Run("prefixed variable exists", func(t *testing.T) {
|
||||
os.Setenv(prefixedKey, "prefixed_val")
|
||||
os.Setenv(key, "unprefixed_val")
|
||||
defer os.Unsetenv(prefixedKey)
|
||||
defer os.Unsetenv(key)
|
||||
|
||||
val, exists := GetEnv(key)
|
||||
assert.True(t, exists)
|
||||
assert.Equal(t, "prefixed_val", val)
|
||||
})
|
||||
|
||||
t.Run("only unprefixed variable exists", func(t *testing.T) {
|
||||
os.Unsetenv(prefixedKey)
|
||||
os.Setenv(key, "unprefixed_val")
|
||||
defer os.Unsetenv(key)
|
||||
|
||||
val, exists := GetEnv(key)
|
||||
assert.True(t, exists)
|
||||
assert.Equal(t, "unprefixed_val", val)
|
||||
})
|
||||
|
||||
t.Run("neither variable exists", func(t *testing.T) {
|
||||
os.Unsetenv(prefixedKey)
|
||||
os.Unsetenv(key)
|
||||
|
||||
val, exists := GetEnv(key)
|
||||
assert.False(t, exists)
|
||||
assert.Empty(t, val)
|
||||
})
|
||||
}
|
||||
11
agent/zfs/zfs_freebsd.go
Normal file
11
agent/zfs/zfs_freebsd.go
Normal file
@@ -0,0 +1,11 @@
|
||||
//go:build freebsd
|
||||
|
||||
package zfs
|
||||
|
||||
import (
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
func ARCSize() (uint64, error) {
|
||||
return unix.SysctlUint64("kstat.zfs.misc.arcstats.size")
|
||||
}
|
||||
34
agent/zfs/zfs_linux.go
Normal file
34
agent/zfs/zfs_linux.go
Normal file
@@ -0,0 +1,34 @@
|
||||
//go:build linux
|
||||
|
||||
// Package zfs provides functions to read ZFS statistics.
|
||||
package zfs
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func ARCSize() (uint64, error) {
|
||||
file, err := os.Open("/proc/spl/kstat/zfs/arcstats")
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if strings.HasPrefix(line, "size") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 3 {
|
||||
return 0, fmt.Errorf("unexpected arcstats size format: %s", line)
|
||||
}
|
||||
return strconv.ParseUint(fields[2], 10, 64)
|
||||
}
|
||||
}
|
||||
|
||||
return 0, fmt.Errorf("size field not found in arcstats")
|
||||
}
|
||||
9
agent/zfs/zfs_unsupported.go
Normal file
9
agent/zfs/zfs_unsupported.go
Normal file
@@ -0,0 +1,9 @@
|
||||
//go:build !linux && !freebsd
|
||||
|
||||
package zfs
|
||||
|
||||
import "errors"
|
||||
|
||||
func ARCSize() (uint64, error) {
|
||||
return 0, errors.ErrUnsupported
|
||||
}
|
||||
@@ -6,7 +6,7 @@ import "github.com/blang/semver"
|
||||
|
||||
const (
|
||||
// Version is the current version of the application.
|
||||
Version = "0.18.3"
|
||||
Version = "0.18.4"
|
||||
// AppName is the name of the application.
|
||||
AppName = "beszel"
|
||||
)
|
||||
|
||||
30
go.mod
30
go.mod
@@ -1,6 +1,6 @@
|
||||
module github.com/henrygd/beszel
|
||||
|
||||
go 1.25.5
|
||||
go 1.26.1
|
||||
|
||||
require (
|
||||
github.com/blang/semver v3.5.1+incompatible
|
||||
@@ -11,17 +11,17 @@ require (
|
||||
github.com/gliderlabs/ssh v0.3.8
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/lxzan/gws v1.8.9
|
||||
github.com/nicholas-fedor/shoutrrr v0.13.1
|
||||
github.com/pocketbase/dbx v1.11.0
|
||||
github.com/pocketbase/pocketbase v0.36.2
|
||||
github.com/nicholas-fedor/shoutrrr v0.13.2
|
||||
github.com/pocketbase/dbx v1.12.0
|
||||
github.com/pocketbase/pocketbase v0.36.4
|
||||
github.com/shirou/gopsutil/v4 v4.26.1
|
||||
github.com/spf13/cast v1.10.0
|
||||
github.com/spf13/cobra v1.10.2
|
||||
github.com/spf13/pflag v1.0.10
|
||||
github.com/stretchr/testify v1.11.1
|
||||
golang.org/x/crypto v0.47.0
|
||||
golang.org/x/exp v0.0.0-20260112195511-716be5621a96
|
||||
golang.org/x/sys v0.40.0
|
||||
golang.org/x/crypto v0.48.0
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa
|
||||
golang.org/x/sys v0.41.0
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
)
|
||||
|
||||
@@ -42,8 +42,8 @@ require (
|
||||
github.com/godbus/dbus/v5 v5.2.2 // indirect
|
||||
github.com/golang-jwt/jwt/v5 v5.3.1 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/klauspost/compress v1.18.3 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3 // indirect
|
||||
github.com/klauspost/compress v1.18.4 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20260216142805-b3301c5f2a88 // indirect
|
||||
github.com/mattn/go-colorable v0.1.14 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
@@ -54,15 +54,15 @@ require (
|
||||
github.com/tklauser/numcpus v0.11.0 // indirect
|
||||
github.com/x448/float16 v0.8.4 // indirect
|
||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
golang.org/x/image v0.35.0 // indirect
|
||||
golang.org/x/net v0.49.0 // indirect
|
||||
golang.org/x/oauth2 v0.34.0 // indirect
|
||||
golang.org/x/image v0.36.0 // indirect
|
||||
golang.org/x/net v0.50.0 // indirect
|
||||
golang.org/x/oauth2 v0.35.0 // indirect
|
||||
golang.org/x/sync v0.19.0 // indirect
|
||||
golang.org/x/term v0.39.0 // indirect
|
||||
golang.org/x/text v0.33.0 // indirect
|
||||
golang.org/x/term v0.40.0 // indirect
|
||||
golang.org/x/text v0.34.0 // indirect
|
||||
howett.net/plist v1.0.1 // indirect
|
||||
modernc.org/libc v1.67.6 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
modernc.org/sqlite v1.44.3 // indirect
|
||||
modernc.org/sqlite v1.45.0 // indirect
|
||||
)
|
||||
|
||||
76
go.sum
76
go.sum
@@ -69,14 +69,14 @@ github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLf
|
||||
github.com/jarcoal/httpmock v1.4.1 h1:0Ju+VCFuARfFlhVXFc2HxlcQkfB+Xq12/EotHko+x2A=
|
||||
github.com/jarcoal/httpmock v1.4.1/go.mod h1:ftW1xULwo+j0R0JJkJIIi7UKigZUXCLLanykgjwBXL0=
|
||||
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
|
||||
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
|
||||
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
|
||||
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3 h1:PwQumkgq4/acIiZhtifTV5OUqqiP82UAl0h87xj/l9k=
|
||||
github.com/lufia/plan9stats v0.0.0-20251013123823-9fd1530e3ec3/go.mod h1:autxFIvghDt3jPTLoqZ9OZ7s9qTGNAWmYCjVFWPX/zg=
|
||||
github.com/lufia/plan9stats v0.0.0-20260216142805-b3301c5f2a88 h1:PTw+yKnXcOFCR6+8hHTyWBeQ/P4Nb7dd4/0ohEcWQuM=
|
||||
github.com/lufia/plan9stats v0.0.0-20260216142805-b3301c5f2a88/go.mod h1:autxFIvghDt3jPTLoqZ9OZ7s9qTGNAWmYCjVFWPX/zg=
|
||||
github.com/lxzan/gws v1.8.9 h1:VU3SGUeWlQrEwfUSfokcZep8mdg/BrUF+y73YYshdBM=
|
||||
github.com/lxzan/gws v1.8.9/go.mod h1:d9yHaR1eDTBHagQC6KY7ycUOaz5KWeqQtP3xu7aMK8Y=
|
||||
github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
|
||||
@@ -85,19 +85,19 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/nicholas-fedor/shoutrrr v0.13.1 h1:llEoHNbnMM4GfQ9+2Ns3n6ssvNfi3NPWluM0AQiicoY=
|
||||
github.com/nicholas-fedor/shoutrrr v0.13.1/go.mod h1:kU4cFJpEAtTzl3iV0l+XUXmM90OlC5T01b7roM4/pYM=
|
||||
github.com/onsi/ginkgo/v2 v2.27.3 h1:ICsZJ8JoYafeXFFlFAG75a7CxMsJHwgKwtO+82SE9L8=
|
||||
github.com/onsi/ginkgo/v2 v2.27.3/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
|
||||
github.com/onsi/gomega v1.38.3 h1:eTX+W6dobAYfFeGC2PV6RwXRu/MyT+cQguijutvkpSM=
|
||||
github.com/onsi/gomega v1.38.3/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4=
|
||||
github.com/nicholas-fedor/shoutrrr v0.13.2 h1:hfsYBIqSFYGg92pZP5CXk/g7/OJIkLYmiUnRl+AD1IA=
|
||||
github.com/nicholas-fedor/shoutrrr v0.13.2/go.mod h1:ZqzV3gY/Wj6AvWs1etlO7+yKbh4iptSbeL8avBpMQbA=
|
||||
github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI=
|
||||
github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE=
|
||||
github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28=
|
||||
github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/pocketbase/dbx v1.11.0 h1:LpZezioMfT3K4tLrqA55wWFw1EtH1pM4tzSVa7kgszU=
|
||||
github.com/pocketbase/dbx v1.11.0/go.mod h1:xXRCIAKTHMgUCyCKZm55pUOdvFziJjQfXaWKhu2vhMs=
|
||||
github.com/pocketbase/pocketbase v0.36.2 h1:mzrxnvXKc3yxKlvZdbwoYXkH8kfIETteD0hWdgj0VI4=
|
||||
github.com/pocketbase/pocketbase v0.36.2/go.mod h1:71vSF8whUDzC8mcLFE10+Qatf9JQdeOGIRWawOuLLKM=
|
||||
github.com/pocketbase/dbx v1.12.0 h1:/oLErM+A0b4xI0PWTGPqSDVjzix48PqI/bng2l0PzoA=
|
||||
github.com/pocketbase/dbx v1.12.0/go.mod h1:xXRCIAKTHMgUCyCKZm55pUOdvFziJjQfXaWKhu2vhMs=
|
||||
github.com/pocketbase/pocketbase v0.36.4 h1:zTjRZbp2WfTOJJfb+pFRWa200UaQwxZYt8RzkFMlAZ4=
|
||||
github.com/pocketbase/pocketbase v0.36.4/go.mod h1:9CiezhRudd9FZGa5xZa53QZBTNxc5vvw/FGG+diAECI=
|
||||
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
|
||||
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
@@ -129,20 +129,20 @@ github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQ
|
||||
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
|
||||
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
|
||||
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
|
||||
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU=
|
||||
golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU=
|
||||
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
|
||||
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa h1:Zt3DZoOFFYkKhDT3v7Lm9FDMEV06GpzjG2jrqW+QTE0=
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA=
|
||||
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
||||
golang.org/x/image v0.35.0 h1:LKjiHdgMtO8z7Fh18nGY6KDcoEtVfsgLDPeLyguqb7I=
|
||||
golang.org/x/image v0.35.0/go.mod h1:MwPLTVgvxSASsxdLzKrl8BRFuyqMyGhLwmC+TO1Sybk=
|
||||
golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c=
|
||||
golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU=
|
||||
golang.org/x/image v0.36.0 h1:Iknbfm1afbgtwPTmHnS2gTM/6PPZfH+z2EFuOkSbqwc=
|
||||
golang.org/x/image v0.36.0/go.mod h1:YsWD2TyyGKiIX1kZlu9QfKIsQ4nAAK9bdgdrIsE7xy4=
|
||||
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
|
||||
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
|
||||
golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
|
||||
golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||
golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60=
|
||||
golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM=
|
||||
golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
|
||||
golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
|
||||
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
@@ -150,20 +150,20 @@ golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7w
|
||||
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
||||
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY=
|
||||
golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww=
|
||||
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
|
||||
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg=
|
||||
golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
|
||||
golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
|
||||
golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
|
||||
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
|
||||
golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
|
||||
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
|
||||
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
|
||||
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
||||
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
@@ -195,8 +195,8 @@ modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||
modernc.org/sqlite v1.44.3 h1:+39JvV/HWMcYslAwRxHb8067w+2zowvFOUrOWIy9PjY=
|
||||
modernc.org/sqlite v1.44.3/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
|
||||
modernc.org/sqlite v1.45.0 h1:r51cSGzKpbptxnby+EIIz5fop4VuE4qFoVEjNvWoObs=
|
||||
modernc.org/sqlite v1.45.0/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
|
||||
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||
|
||||
@@ -21,8 +21,7 @@ type hubLike interface {
|
||||
|
||||
type AlertManager struct {
|
||||
hub hubLike
|
||||
alertQueue chan alertTask
|
||||
stopChan chan struct{}
|
||||
stopOnce sync.Once
|
||||
pendingAlerts sync.Map
|
||||
}
|
||||
|
||||
@@ -40,16 +39,22 @@ type UserNotificationSettings struct {
|
||||
Webhooks []string `json:"webhooks"`
|
||||
}
|
||||
|
||||
type SystemAlertFsStats struct {
|
||||
DiskTotal float64 `json:"d"`
|
||||
DiskUsed float64 `json:"du"`
|
||||
}
|
||||
|
||||
// Values pulled from system_stats.stats that are relevant to alerts.
|
||||
type SystemAlertStats struct {
|
||||
Cpu float64 `json:"cpu"`
|
||||
Mem float64 `json:"mp"`
|
||||
Disk float64 `json:"dp"`
|
||||
NetSent float64 `json:"ns"`
|
||||
NetRecv float64 `json:"nr"`
|
||||
Bandwidth [2]uint64 `json:"b"`
|
||||
GPU map[string]SystemAlertGPUData `json:"g"`
|
||||
Temperatures map[string]float32 `json:"t"`
|
||||
LoadAvg [3]float64 `json:"la"`
|
||||
Battery [2]uint8 `json:"bat"`
|
||||
ExtraFs map[string]SystemAlertFsStats `json:"efs"`
|
||||
}
|
||||
|
||||
type SystemAlertGPUData struct {
|
||||
@@ -92,12 +97,9 @@ var supportsTitle = map[string]struct{}{
|
||||
// NewAlertManager creates a new AlertManager instance.
|
||||
func NewAlertManager(app hubLike) *AlertManager {
|
||||
am := &AlertManager{
|
||||
hub: app,
|
||||
alertQueue: make(chan alertTask, 5),
|
||||
stopChan: make(chan struct{}),
|
||||
hub: app,
|
||||
}
|
||||
am.bindEvents()
|
||||
go am.startWorker()
|
||||
return am
|
||||
}
|
||||
|
||||
@@ -106,6 +108,16 @@ func (am *AlertManager) bindEvents() {
|
||||
am.hub.OnRecordAfterUpdateSuccess("alerts").BindFunc(updateHistoryOnAlertUpdate)
|
||||
am.hub.OnRecordAfterDeleteSuccess("alerts").BindFunc(resolveHistoryOnAlertDelete)
|
||||
am.hub.OnRecordAfterUpdateSuccess("smart_devices").BindFunc(am.handleSmartDeviceAlert)
|
||||
|
||||
am.hub.OnServe().BindFunc(func(e *core.ServeEvent) error {
|
||||
if err := resolveStatusAlerts(e.App); err != nil {
|
||||
e.App.Logger().Error("Failed to resolve stale status alerts", "err", err)
|
||||
}
|
||||
if err := am.restorePendingStatusAlerts(); err != nil {
|
||||
e.App.Logger().Error("Failed to restore pending status alerts", "err", err)
|
||||
}
|
||||
return e.Next()
|
||||
})
|
||||
}
|
||||
|
||||
// IsNotificationSilenced checks if a notification should be silenced based on configured quiet hours
|
||||
@@ -259,13 +271,14 @@ func (am *AlertManager) SendShoutrrrAlert(notificationUrl, title, message, link,
|
||||
}
|
||||
|
||||
// Add link
|
||||
if scheme == "ntfy" {
|
||||
switch scheme {
|
||||
case "ntfy":
|
||||
queryParams.Add("Actions", fmt.Sprintf("view, %s, %s", linkText, link))
|
||||
} else if scheme == "lark" {
|
||||
case "lark":
|
||||
queryParams.Add("link", link)
|
||||
} else if scheme == "bark" {
|
||||
case "bark":
|
||||
queryParams.Add("url", link)
|
||||
} else {
|
||||
default:
|
||||
message += "\n\n" + link
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package alerts_test
|
||||
|
||||
|
||||
155
internal/alerts/alerts_disk_test.go
Normal file
155
internal/alerts/alerts_disk_test.go
Normal file
@@ -0,0 +1,155 @@
|
||||
//go:build testing
|
||||
|
||||
package alerts_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
beszelTests "github.com/henrygd/beszel/internal/tests"
|
||||
|
||||
"github.com/pocketbase/dbx"
|
||||
"github.com/pocketbase/pocketbase/tools/types"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestDiskAlertExtraFsMultiMinute tests that multi-minute disk alerts correctly use
|
||||
// historical per-minute values for extra (non-root) filesystems, not the current live snapshot.
|
||||
func TestDiskAlertExtraFsMultiMinute(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
|
||||
require.NoError(t, err)
|
||||
systemRecord := systems[0]
|
||||
|
||||
// Disk alert: threshold 80%, min=2 (requires historical averaging)
|
||||
diskAlert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Disk",
|
||||
"system": systemRecord.Id,
|
||||
"user": user.Id,
|
||||
"value": 80, // threshold: 80%
|
||||
"min": 2, // 2 minutes - requires historical averaging
|
||||
})
|
||||
require.NoError(t, err)
|
||||
assert.False(t, diskAlert.GetBool("triggered"), "Alert should not be triggered initially")
|
||||
|
||||
am := hub.GetAlertManager()
|
||||
now := time.Now().UTC()
|
||||
|
||||
extraFsHigh := map[string]*system.FsStats{
|
||||
"/mnt/data": {DiskTotal: 1000, DiskUsed: 920}, // 92% - above threshold
|
||||
}
|
||||
|
||||
// Insert 4 historical records spread over 3 minutes (same pattern as battery tests).
|
||||
// The oldest record must predate (now - 2min) so the alert time window is valid.
|
||||
recordTimes := []time.Duration{
|
||||
-180 * time.Second, // 3 min ago - anchors oldest record before alert.time
|
||||
-90 * time.Second,
|
||||
-60 * time.Second,
|
||||
-30 * time.Second,
|
||||
}
|
||||
|
||||
for _, offset := range recordTimes {
|
||||
stats := system.Stats{
|
||||
DiskPct: 30, // root disk at 30% - below threshold
|
||||
ExtraFs: extraFsHigh,
|
||||
}
|
||||
statsJSON, _ := json.Marshal(stats)
|
||||
|
||||
recordTime := now.Add(offset)
|
||||
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
|
||||
"system": systemRecord.Id,
|
||||
"type": "1m",
|
||||
"stats": string(statsJSON),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
|
||||
err = hub.SaveNoValidate(record)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
combinedDataHigh := &system.CombinedData{
|
||||
Stats: system.Stats{
|
||||
DiskPct: 30,
|
||||
ExtraFs: extraFsHigh,
|
||||
},
|
||||
Info: system.Info{
|
||||
DiskPct: 30,
|
||||
},
|
||||
}
|
||||
|
||||
systemRecord.Set("updated", now)
|
||||
err = hub.SaveNoValidate(systemRecord)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = am.HandleSystemAlerts(systemRecord, combinedDataHigh)
|
||||
require.NoError(t, err)
|
||||
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
|
||||
require.NoError(t, err)
|
||||
assert.True(t, diskAlert.GetBool("triggered"),
|
||||
"Alert SHOULD be triggered when extra disk average (92%%) exceeds threshold (80%%)")
|
||||
|
||||
// --- Resolution: extra disk drops to 50%, alert should resolve ---
|
||||
|
||||
extraFsLow := map[string]*system.FsStats{
|
||||
"/mnt/data": {DiskTotal: 1000, DiskUsed: 500}, // 50% - below threshold
|
||||
}
|
||||
|
||||
newNow := now.Add(2 * time.Minute)
|
||||
recordTimesLow := []time.Duration{
|
||||
-180 * time.Second,
|
||||
-90 * time.Second,
|
||||
-60 * time.Second,
|
||||
-30 * time.Second,
|
||||
}
|
||||
|
||||
for _, offset := range recordTimesLow {
|
||||
stats := system.Stats{
|
||||
DiskPct: 30,
|
||||
ExtraFs: extraFsLow,
|
||||
}
|
||||
statsJSON, _ := json.Marshal(stats)
|
||||
|
||||
recordTime := newNow.Add(offset)
|
||||
record, err := beszelTests.CreateRecord(hub, "system_stats", map[string]any{
|
||||
"system": systemRecord.Id,
|
||||
"type": "1m",
|
||||
"stats": string(statsJSON),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
record.SetRaw("created", recordTime.Format(types.DefaultDateLayout))
|
||||
err = hub.SaveNoValidate(record)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
combinedDataLow := &system.CombinedData{
|
||||
Stats: system.Stats{
|
||||
DiskPct: 30,
|
||||
ExtraFs: extraFsLow,
|
||||
},
|
||||
Info: system.Info{
|
||||
DiskPct: 30,
|
||||
},
|
||||
}
|
||||
|
||||
systemRecord.Set("updated", newNow)
|
||||
err = hub.SaveNoValidate(systemRecord)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = am.HandleSystemAlerts(systemRecord, combinedDataLow)
|
||||
require.NoError(t, err)
|
||||
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
diskAlert, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": diskAlert.Id})
|
||||
require.NoError(t, err)
|
||||
assert.False(t, diskAlert.GetBool("triggered"),
|
||||
"Alert should be resolved when extra disk average (50%%) drops below threshold (80%%)")
|
||||
}
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package alerts_test
|
||||
|
||||
@@ -50,7 +49,7 @@ func TestAlertSilencedOneTime(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Test that alert is silenced
|
||||
silenced := am.IsNotificationSilenced(user.Id, system.Id)
|
||||
@@ -107,7 +106,7 @@ func TestAlertSilencedDaily(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Get current hour and create a window that includes current time
|
||||
now := time.Now().UTC()
|
||||
@@ -171,7 +170,7 @@ func TestAlertSilencedDailyMidnightCrossing(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Create a window that crosses midnight: 22:00 - 02:00
|
||||
startTime := time.Date(2000, 1, 1, 22, 0, 0, 0, time.UTC)
|
||||
@@ -212,7 +211,7 @@ func TestAlertSilencedGlobal(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Create a global quiet hours window (no system specified)
|
||||
now := time.Now().UTC()
|
||||
@@ -251,7 +250,7 @@ func TestAlertSilencedSystemSpecific(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Create a system-specific quiet hours window for system1 only
|
||||
now := time.Now().UTC()
|
||||
@@ -297,7 +296,7 @@ func TestAlertSilencedMultiUser(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Create a quiet hours window for user1 only
|
||||
now := time.Now().UTC()
|
||||
@@ -418,7 +417,7 @@ func TestAlertSilencedNoWindows(t *testing.T) {
|
||||
|
||||
// Get alert manager
|
||||
am := alerts.NewAlertManager(hub)
|
||||
defer am.StopWorker()
|
||||
defer am.Stop()
|
||||
|
||||
// Without any quiet hours windows, alert should NOT be silenced
|
||||
silenced := am.IsNotificationSilenced(user.Id, system.Id)
|
||||
|
||||
@@ -2,18 +2,18 @@ package alerts
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
)
|
||||
|
||||
// handleSmartDeviceAlert sends alerts when a SMART device state changes from PASSED to FAILED.
|
||||
// handleSmartDeviceAlert sends alerts when a SMART device state worsens into WARNING/FAILED.
|
||||
// This is automatic and does not require user opt-in.
|
||||
func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
||||
oldState := e.Record.Original().GetString("state")
|
||||
newState := e.Record.GetString("state")
|
||||
|
||||
// Only alert when transitioning from PASSED to FAILED
|
||||
if oldState != "PASSED" || newState != "FAILED" {
|
||||
if !shouldSendSmartDeviceAlert(oldState, newState) {
|
||||
return e.Next()
|
||||
}
|
||||
|
||||
@@ -32,14 +32,15 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
||||
systemName := systemRecord.GetString("name")
|
||||
deviceName := e.Record.GetString("name")
|
||||
model := e.Record.GetString("model")
|
||||
statusLabel := smartStateLabel(newState)
|
||||
|
||||
// Build alert message
|
||||
title := fmt.Sprintf("SMART failure on %s: %s \U0001F534", systemName, deviceName)
|
||||
title := fmt.Sprintf("SMART %s on %s: %s %s", statusLabel, systemName, deviceName, smartStateEmoji(newState))
|
||||
var message string
|
||||
if model != "" {
|
||||
message = fmt.Sprintf("Disk %s (%s) SMART status changed to FAILED", deviceName, model)
|
||||
message = fmt.Sprintf("Disk %s (%s) SMART status changed to %s", deviceName, model, newState)
|
||||
} else {
|
||||
message = fmt.Sprintf("Disk %s SMART status changed to FAILED", deviceName)
|
||||
message = fmt.Sprintf("Disk %s SMART status changed to %s", deviceName, newState)
|
||||
}
|
||||
|
||||
// Get users associated with the system
|
||||
@@ -65,3 +66,42 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
||||
return e.Next()
|
||||
}
|
||||
|
||||
func shouldSendSmartDeviceAlert(oldState, newState string) bool {
|
||||
oldSeverity := smartStateSeverity(oldState)
|
||||
newSeverity := smartStateSeverity(newState)
|
||||
|
||||
// Ignore unknown states and recoveries; only alert on worsening transitions
|
||||
// from known-good/degraded states into WARNING/FAILED.
|
||||
return oldSeverity >= 1 && newSeverity > oldSeverity
|
||||
}
|
||||
|
||||
func smartStateSeverity(state string) int {
|
||||
switch state {
|
||||
case "PASSED":
|
||||
return 1
|
||||
case "WARNING":
|
||||
return 2
|
||||
case "FAILED":
|
||||
return 3
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func smartStateEmoji(state string) string {
|
||||
switch state {
|
||||
case "WARNING":
|
||||
return "\U0001F7E0"
|
||||
default:
|
||||
return "\U0001F534"
|
||||
}
|
||||
}
|
||||
|
||||
func smartStateLabel(state string) string {
|
||||
switch state {
|
||||
case "FAILED":
|
||||
return "failure"
|
||||
default:
|
||||
return strings.ToLower(state)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package alerts_test
|
||||
|
||||
@@ -58,6 +57,74 @@ func TestSmartDeviceAlert(t *testing.T) {
|
||||
assert.Contains(t, lastMessage.Text, "FAILED")
|
||||
}
|
||||
|
||||
func TestSmartDeviceAlertPassedToWarning(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.1",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
|
||||
"system": system.Id,
|
||||
"name": "/dev/mmcblk0",
|
||||
"model": "eMMC",
|
||||
"state": "PASSED",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice.Set("state", "WARNING")
|
||||
err = hub.Save(smartDevice)
|
||||
assert.NoError(t, err)
|
||||
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed to WARNING")
|
||||
lastMessage := hub.TestMailer.LastMessage()
|
||||
assert.Contains(t, lastMessage.Subject, "SMART warning on test-system")
|
||||
assert.Contains(t, lastMessage.Text, "WARNING")
|
||||
}
|
||||
|
||||
func TestSmartDeviceAlertWarningToFailed(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.1",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
|
||||
"system": system.Id,
|
||||
"name": "/dev/mmcblk0",
|
||||
"model": "eMMC",
|
||||
"state": "WARNING",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice.Set("state", "FAILED")
|
||||
err = hub.Save(smartDevice)
|
||||
assert.NoError(t, err)
|
||||
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed from WARNING to FAILED")
|
||||
lastMessage := hub.TestMailer.LastMessage()
|
||||
assert.Contains(t, lastMessage.Subject, "SMART failure on test-system")
|
||||
assert.Contains(t, lastMessage.Text, "FAILED")
|
||||
}
|
||||
|
||||
func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
@@ -83,7 +150,8 @@ func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
|
||||
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Update the state from UNKNOWN to FAILED - should NOT trigger alert
|
||||
// Update the state from UNKNOWN to FAILED - should NOT trigger alert.
|
||||
// We only alert from known healthy/degraded states.
|
||||
smartDevice.Set("state", "FAILED")
|
||||
err = hub.Save(smartDevice)
|
||||
assert.NoError(t, err)
|
||||
|
||||
@@ -9,63 +9,25 @@ import (
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
)
|
||||
|
||||
type alertTask struct {
|
||||
action string // "schedule" or "cancel"
|
||||
systemName string
|
||||
alertRecord *core.Record
|
||||
delay time.Duration
|
||||
}
|
||||
|
||||
type alertInfo struct {
|
||||
systemName string
|
||||
alertRecord *core.Record
|
||||
expireTime time.Time
|
||||
timer *time.Timer
|
||||
}
|
||||
|
||||
// startWorker is a long-running goroutine that processes alert tasks
|
||||
// every x seconds. It must be running to process status alerts.
|
||||
func (am *AlertManager) startWorker() {
|
||||
processPendingAlerts := time.Tick(15 * time.Second)
|
||||
|
||||
// check for status alerts that are not resolved when system comes up
|
||||
// (can be removed if we figure out core bug in #1052)
|
||||
checkStatusAlerts := time.Tick(561 * time.Second)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-am.stopChan:
|
||||
return
|
||||
case task := <-am.alertQueue:
|
||||
switch task.action {
|
||||
case "schedule":
|
||||
am.pendingAlerts.Store(task.alertRecord.Id, &alertInfo{
|
||||
systemName: task.systemName,
|
||||
alertRecord: task.alertRecord,
|
||||
expireTime: time.Now().Add(task.delay),
|
||||
})
|
||||
case "cancel":
|
||||
am.pendingAlerts.Delete(task.alertRecord.Id)
|
||||
// Stop cancels all pending status alert timers.
|
||||
func (am *AlertManager) Stop() {
|
||||
am.stopOnce.Do(func() {
|
||||
am.pendingAlerts.Range(func(key, value any) bool {
|
||||
info := value.(*alertInfo)
|
||||
if info.timer != nil {
|
||||
info.timer.Stop()
|
||||
}
|
||||
case <-checkStatusAlerts:
|
||||
resolveStatusAlerts(am.hub)
|
||||
case <-processPendingAlerts:
|
||||
// Check for expired alerts every tick
|
||||
now := time.Now()
|
||||
for key, value := range am.pendingAlerts.Range {
|
||||
info := value.(*alertInfo)
|
||||
if now.After(info.expireTime) {
|
||||
// Downtime delay has passed, process alert
|
||||
am.sendStatusAlert("down", info.systemName, info.alertRecord)
|
||||
am.pendingAlerts.Delete(key)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// StopWorker shuts down the AlertManager.worker goroutine
|
||||
func (am *AlertManager) StopWorker() {
|
||||
close(am.stopChan)
|
||||
am.pendingAlerts.Delete(key)
|
||||
return true
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// HandleStatusAlerts manages the logic when system status changes.
|
||||
@@ -103,44 +65,82 @@ func (am *AlertManager) getSystemStatusAlerts(systemID string) ([]*core.Record,
|
||||
return alertRecords, nil
|
||||
}
|
||||
|
||||
// Schedules delayed "down" alerts for each alert record.
|
||||
// handleSystemDown manages the logic when a system status changes to "down". It schedules pending alerts for each alert record.
|
||||
func (am *AlertManager) handleSystemDown(systemName string, alertRecords []*core.Record) {
|
||||
for _, alertRecord := range alertRecords {
|
||||
// Continue if alert is already scheduled
|
||||
if _, exists := am.pendingAlerts.Load(alertRecord.Id); exists {
|
||||
continue
|
||||
}
|
||||
// Schedule by adding to queue
|
||||
min := max(1, alertRecord.GetInt("min"))
|
||||
am.alertQueue <- alertTask{
|
||||
action: "schedule",
|
||||
systemName: systemName,
|
||||
alertRecord: alertRecord,
|
||||
delay: time.Duration(min) * time.Minute,
|
||||
}
|
||||
am.schedulePendingStatusAlert(systemName, alertRecord, time.Duration(min)*time.Minute)
|
||||
}
|
||||
}
|
||||
|
||||
// schedulePendingStatusAlert sets up a timer to send a "down" alert after the specified delay if the system is still down.
|
||||
// It returns true if the alert was scheduled, or false if an alert was already pending for the given alert record.
|
||||
func (am *AlertManager) schedulePendingStatusAlert(systemName string, alertRecord *core.Record, delay time.Duration) bool {
|
||||
alert := &alertInfo{
|
||||
systemName: systemName,
|
||||
alertRecord: alertRecord,
|
||||
expireTime: time.Now().Add(delay),
|
||||
}
|
||||
|
||||
storedAlert, loaded := am.pendingAlerts.LoadOrStore(alertRecord.Id, alert)
|
||||
if loaded {
|
||||
return false
|
||||
}
|
||||
|
||||
stored := storedAlert.(*alertInfo)
|
||||
stored.timer = time.AfterFunc(time.Until(stored.expireTime), func() {
|
||||
am.processPendingAlert(alertRecord.Id)
|
||||
})
|
||||
return true
|
||||
}
|
||||
|
||||
// handleSystemUp manages the logic when a system status changes to "up".
|
||||
// It cancels any pending alerts and sends "up" alerts.
|
||||
func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.Record) {
|
||||
for _, alertRecord := range alertRecords {
|
||||
alertRecordID := alertRecord.Id
|
||||
// If alert exists for record, delete and continue (down alert not sent)
|
||||
if _, exists := am.pendingAlerts.Load(alertRecordID); exists {
|
||||
am.alertQueue <- alertTask{
|
||||
action: "cancel",
|
||||
alertRecord: alertRecord,
|
||||
}
|
||||
if am.cancelPendingAlert(alertRecord.Id) {
|
||||
continue
|
||||
}
|
||||
if !alertRecord.GetBool("triggered") {
|
||||
continue
|
||||
}
|
||||
// No alert scheduled for this record, send "up" alert
|
||||
if err := am.sendStatusAlert("up", systemName, alertRecord); err != nil {
|
||||
am.hub.Logger().Error("Failed to send alert", "err", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cancelPendingAlert stops the timer and removes the pending alert for the given alert ID. Returns true if a pending alert was found and cancelled.
|
||||
func (am *AlertManager) cancelPendingAlert(alertID string) bool {
|
||||
value, loaded := am.pendingAlerts.LoadAndDelete(alertID)
|
||||
if !loaded {
|
||||
return false
|
||||
}
|
||||
|
||||
info := value.(*alertInfo)
|
||||
if info.timer != nil {
|
||||
info.timer.Stop()
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// processPendingAlert sends a "down" alert if the pending alert has expired and the system is still down.
|
||||
func (am *AlertManager) processPendingAlert(alertID string) {
|
||||
value, loaded := am.pendingAlerts.LoadAndDelete(alertID)
|
||||
if !loaded {
|
||||
return
|
||||
}
|
||||
|
||||
info := value.(*alertInfo)
|
||||
if info.alertRecord.GetBool("triggered") {
|
||||
return
|
||||
}
|
||||
if err := am.sendStatusAlert("down", info.systemName, info.alertRecord); err != nil {
|
||||
am.hub.Logger().Error("Failed to send alert", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// sendStatusAlert sends a status alert ("up" or "down") to the users associated with the alert records.
|
||||
func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, alertRecord *core.Record) error {
|
||||
switch alertStatus {
|
||||
@@ -174,8 +174,8 @@ func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, a
|
||||
})
|
||||
}
|
||||
|
||||
// resolveStatusAlerts resolves any status alerts that weren't resolved
|
||||
// when system came up (https://github.com/henrygd/beszel/issues/1052)
|
||||
// resolveStatusAlerts resolves any triggered status alerts that weren't resolved
|
||||
// when system came up (https://github.com/henrygd/beszel/issues/1052).
|
||||
func resolveStatusAlerts(app core.App) error {
|
||||
db := app.DB()
|
||||
// Find all active status alerts where the system is actually up
|
||||
@@ -205,3 +205,36 @@ func resolveStatusAlerts(app core.App) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// restorePendingStatusAlerts re-queues untriggered status alerts for systems that
|
||||
// are still down after a hub restart. This rebuilds the lost in-memory timer state.
|
||||
func (am *AlertManager) restorePendingStatusAlerts() error {
|
||||
type pendingStatusAlert struct {
|
||||
AlertID string `db:"alert_id"`
|
||||
SystemName string `db:"system_name"`
|
||||
}
|
||||
|
||||
var pending []pendingStatusAlert
|
||||
err := am.hub.DB().NewQuery(`
|
||||
SELECT a.id AS alert_id, s.name AS system_name
|
||||
FROM alerts a
|
||||
JOIN systems s ON a.system = s.id
|
||||
WHERE a.name = 'Status'
|
||||
AND a.triggered = false
|
||||
AND s.status = 'down'
|
||||
`).All(&pending)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, item := range pending {
|
||||
alertRecord, err := am.hub.FindRecordById("alerts", item.AlertID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
min := max(1, alertRecord.GetInt("min"))
|
||||
am.schedulePendingStatusAlert(item.SystemName, alertRecord, time.Duration(min)*time.Minute)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
628
internal/alerts/alerts_status_test.go
Normal file
628
internal/alerts/alerts_status_test.go
Normal file
@@ -0,0 +1,628 @@
|
||||
//go:build testing
|
||||
|
||||
package alerts_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"testing/synctest"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/alerts"
|
||||
beszelTests "github.com/henrygd/beszel/internal/tests"
|
||||
"github.com/pocketbase/dbx"
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestStatusAlerts(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused")
|
||||
assert.NoError(t, err)
|
||||
|
||||
var alerts []*core.Record
|
||||
for i, system := range systems {
|
||||
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": system.Id,
|
||||
"user": user.Id,
|
||||
"min": i + 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
alerts = append(alerts, alert)
|
||||
}
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
for _, alert := range alerts {
|
||||
assert.False(t, alert.GetBool("triggered"), "Alert should not be triggered immediately")
|
||||
}
|
||||
if hub.TestMailer.TotalSend() != 0 {
|
||||
assert.Zero(t, hub.TestMailer.TotalSend(), "Expected 0 messages, got %d", hub.TestMailer.TotalSend())
|
||||
}
|
||||
for _, system := range systems {
|
||||
assert.EqualValues(t, "paused", system.GetString("status"), "System should be paused")
|
||||
}
|
||||
for _, system := range systems {
|
||||
system.Set("status", "up")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
|
||||
for _, system := range systems {
|
||||
system.Set("status", "down")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
// after 30 seconds, should have 4 alerts in the pendingAlerts map, no triggered alerts
|
||||
time.Sleep(time.Second * 30)
|
||||
assert.EqualValues(t, 4, hub.GetPendingAlertsCount(), "should have 4 alerts in the pendingAlerts map")
|
||||
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 0, triggeredCount, "should have 0 alert triggered")
|
||||
assert.EqualValues(t, 0, hub.TestMailer.TotalSend(), "should have 0 messages sent")
|
||||
// after 1:30 seconds, should have 1 triggered alert and 3 pending alerts
|
||||
time.Sleep(time.Second * 60)
|
||||
assert.EqualValues(t, 3, hub.GetPendingAlertsCount(), "should have 3 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 1, triggeredCount, "should have 1 alert triggered")
|
||||
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 messages sent")
|
||||
// after 2:30 seconds, should have 2 triggered alerts and 2 pending alerts
|
||||
time.Sleep(time.Second * 60)
|
||||
assert.EqualValues(t, 2, hub.GetPendingAlertsCount(), "should have 2 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, triggeredCount, "should have 2 alert triggered")
|
||||
assert.EqualValues(t, 2, hub.TestMailer.TotalSend(), "should have 2 messages sent")
|
||||
// now we will bring the remaning systems back up
|
||||
for _, system := range systems {
|
||||
system.Set("status", "up")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
// should have 0 alerts in the pendingAlerts map and 0 alerts triggered
|
||||
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.Zero(t, triggeredCount, "should have 0 alert triggered")
|
||||
// 4 messages sent, 2 down alerts and 2 up alerts for first 2 systems
|
||||
assert.EqualValues(t, 4, hub.TestMailer.TotalSend(), "should have 4 messages sent")
|
||||
})
|
||||
}
|
||||
func TestStatusAlertRecoveryBeforeDeadline(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// Ensure user settings have an email
|
||||
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
hub.Save(userSettings)
|
||||
|
||||
// Initial email count
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
|
||||
systemCollection, _ := hub.FindCollectionByNameOrId("systems")
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
hub.Save(system)
|
||||
|
||||
alertCollection, _ := hub.FindCollectionByNameOrId("alerts")
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
hub.Save(alert)
|
||||
|
||||
am := hub.AlertManager
|
||||
|
||||
// 1. System goes down
|
||||
am.HandleStatusAlerts("down", system)
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "Alert should be scheduled")
|
||||
|
||||
// 2. System goes up BEFORE delay expires
|
||||
// Triggering HandleStatusAlerts("up") SHOULD NOT send an alert.
|
||||
am.HandleStatusAlerts("up", system)
|
||||
|
||||
assert.Equal(t, 0, am.GetPendingAlertsCount(), "Alert should be canceled if system recovers before delay expires")
|
||||
|
||||
// Verify that NO email was sent.
|
||||
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "Recovery notification should not be sent if system never went down")
|
||||
|
||||
}
|
||||
|
||||
func TestStatusAlertNormalRecovery(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// Ensure user settings have an email
|
||||
userSettings, _ := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
hub.Save(userSettings)
|
||||
|
||||
systemCollection, _ := hub.FindCollectionByNameOrId("systems")
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
hub.Save(system)
|
||||
|
||||
alertCollection, _ := hub.FindCollectionByNameOrId("alerts")
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", true) // System was confirmed DOWN
|
||||
hub.Save(alert)
|
||||
|
||||
am := hub.AlertManager
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
|
||||
// System goes up
|
||||
am.HandleStatusAlerts("up", system)
|
||||
|
||||
// Verify that an email WAS sent (normal recovery).
|
||||
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "Recovery notification should be sent if system was triggered as down")
|
||||
|
||||
}
|
||||
|
||||
func TestHandleStatusAlertsDoesNotSendRecoveryWhileDownIsOnlyPending(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "down transition should register a pending alert immediately")
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("up", system))
|
||||
assert.Zero(t, am.GetPendingAlertsCount(), "recovery should cancel the pending down alert")
|
||||
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "recovery notification should not be sent before a down alert triggers")
|
||||
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.False(t, alertRecord.GetBool("triggered"), "alert should remain untriggered when downtime never matured")
|
||||
}
|
||||
|
||||
func TestStatusAlertTimerCancellationPreventsBoundaryDelivery(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "down transition should register a pending alert immediately")
|
||||
require.True(t, am.ResetPendingAlertTimer(alert.Id, 25*time.Millisecond), "test should shorten the pending alert timer")
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
require.NoError(t, am.HandleStatusAlerts("up", system))
|
||||
assert.Zero(t, am.GetPendingAlertsCount(), "recovery should remove the pending alert before the timer callback runs")
|
||||
|
||||
time.Sleep(40 * time.Millisecond)
|
||||
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "timer callback should not deliver after recovery cancels the pending alert")
|
||||
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.False(t, alertRecord.GetBool("triggered"), "alert should remain untriggered when cancellation wins the timer race")
|
||||
|
||||
time.Sleep(time.Minute)
|
||||
synctest.Wait()
|
||||
})
|
||||
}
|
||||
|
||||
func TestStatusAlertDownFiresAfterDelayExpires(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "alert should be pending after system goes down")
|
||||
|
||||
// Expire the pending alert and process it
|
||||
am.ForceExpirePendingAlerts()
|
||||
processed, err := am.ProcessPendingAlerts()
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, processed, 1, "one alert should have been processed")
|
||||
assert.Equal(t, 0, am.GetPendingAlertsCount(), "pending alert should be consumed after processing")
|
||||
|
||||
// Verify down email was sent
|
||||
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "down notification should be sent after delay expires")
|
||||
|
||||
// Verify triggered flag is set in the DB
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.True(t, alertRecord.GetBool("triggered"), "alert should be marked triggered after downtime matures")
|
||||
}
|
||||
|
||||
func TestStatusAlertDuplicateDownCallIsIdempotent(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 5)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "repeated down calls should not schedule duplicate pending alerts")
|
||||
}
|
||||
|
||||
func TestStatusAlertNoAlertRecord(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systemCollection, err := hub.FindCollectionByNameOrId("systems")
|
||||
require.NoError(t, err)
|
||||
system := core.NewRecord(systemCollection)
|
||||
system.Set("name", "test-system")
|
||||
system.Set("status", "up")
|
||||
system.Set("host", "127.0.0.1")
|
||||
system.Set("users", []string{user.Id})
|
||||
require.NoError(t, hub.Save(system))
|
||||
|
||||
// No Status alert record created for this system
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("down", system))
|
||||
assert.Equal(t, 0, am.GetPendingAlertsCount(), "no pending alert when no alert record exists")
|
||||
|
||||
require.NoError(t, am.HandleStatusAlerts("up", system))
|
||||
assert.Equal(t, initialEmailCount, hub.TestMailer.TotalSend(), "no email when no alert record exists")
|
||||
}
|
||||
|
||||
func TestRestorePendingStatusAlertsRequeuesDownSystemsAfterRestart(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
userSettings, err := hub.FindFirstRecordByFilter("user_settings", "user={:user}", map[string]any{"user": user.Id})
|
||||
require.NoError(t, err)
|
||||
userSettings.Set("settings", `{"emails":["test@example.com"],"webhooks":[]}`)
|
||||
require.NoError(t, hub.Save(userSettings))
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "down")
|
||||
require.NoError(t, err)
|
||||
system := systems[0]
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", false)
|
||||
alert.Set("min", 1)
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
initialEmailCount := hub.TestMailer.TotalSend()
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
|
||||
require.NoError(t, am.RestorePendingStatusAlerts())
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "startup restore should requeue a pending down alert for a system still marked down")
|
||||
|
||||
am.ForceExpirePendingAlerts()
|
||||
processed, err := am.ProcessPendingAlerts()
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, processed, 1, "restored pending alert should be processable after the delay expires")
|
||||
assert.Equal(t, initialEmailCount+1, hub.TestMailer.TotalSend(), "restored pending alert should send the down notification")
|
||||
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.True(t, alertRecord.GetBool("triggered"), "restored pending alert should mark the alert as triggered once delivered")
|
||||
}
|
||||
|
||||
func TestRestorePendingStatusAlertsSkipsNonDownOrAlreadyTriggeredAlerts(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systemsDown, err := beszelTests.CreateSystems(hub, 2, user.Id, "down")
|
||||
require.NoError(t, err)
|
||||
systemDownPending := systemsDown[0]
|
||||
systemDownTriggered := systemsDown[1]
|
||||
|
||||
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "up-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.2",
|
||||
"status": "up",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemDownPending.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
"triggered": false,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemUp.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
"triggered": false,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemDownTriggered.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
"triggered": true,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
require.NoError(t, am.RestorePendingStatusAlerts())
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "only untriggered alerts for currently down systems should be restored")
|
||||
}
|
||||
|
||||
func TestRestorePendingStatusAlertsIsIdempotent(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "down")
|
||||
require.NoError(t, err)
|
||||
system := systems[0]
|
||||
|
||||
_, err = beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": system.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
"triggered": false,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
am := alerts.NewTestAlertManagerWithoutWorker(hub)
|
||||
require.NoError(t, am.RestorePendingStatusAlerts())
|
||||
require.NoError(t, am.RestorePendingStatusAlerts())
|
||||
|
||||
assert.Equal(t, 1, am.GetPendingAlertsCount(), "restoring twice should not create duplicate pending alerts")
|
||||
am.ForceExpirePendingAlerts()
|
||||
processed, err := am.ProcessPendingAlerts()
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, processed, 1, "restored alert should still be processable exactly once")
|
||||
assert.Zero(t, am.GetPendingAlertsCount(), "processing the restored alert should empty the pending map")
|
||||
}
|
||||
|
||||
func TestResolveStatusAlertsFixesStaleTriggered(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// CreateSystems uses SaveNoValidate after initial save to bypass the
|
||||
// onRecordCreate hook that forces status = "pending".
|
||||
systems, err := beszelTests.CreateSystems(hub, 1, user.Id, "up")
|
||||
require.NoError(t, err)
|
||||
system := systems[0]
|
||||
|
||||
alertCollection, err := hub.FindCollectionByNameOrId("alerts")
|
||||
require.NoError(t, err)
|
||||
alert := core.NewRecord(alertCollection)
|
||||
alert.Set("user", user.Id)
|
||||
alert.Set("system", system.Id)
|
||||
alert.Set("name", "Status")
|
||||
alert.Set("triggered", true) // Stale: system is up but alert still says triggered
|
||||
require.NoError(t, hub.Save(alert))
|
||||
|
||||
// resolveStatusAlerts should clear the stale triggered flag
|
||||
require.NoError(t, alerts.ResolveStatusAlerts(hub))
|
||||
|
||||
alertRecord, err := hub.FindRecordById("alerts", alert.Id)
|
||||
require.NoError(t, err)
|
||||
assert.False(t, alertRecord.GetBool("triggered"), "stale triggered flag should be cleared when system is up")
|
||||
}
|
||||
func TestResolveStatusAlerts(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// Create a systemUp
|
||||
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.1",
|
||||
"status": "up",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system-2",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.2",
|
||||
"status": "up",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Create a status alertUp for the system
|
||||
alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemUp.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemDown.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify alert is not triggered initially
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially")
|
||||
|
||||
// Set the system to 'up' (this should not trigger the alert)
|
||||
systemUp.Set("status", "up")
|
||||
err = hub.SaveNoValidate(systemUp)
|
||||
assert.NoError(t, err)
|
||||
|
||||
systemDown.Set("status", "down")
|
||||
err = hub.SaveNoValidate(systemDown)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Wait a moment for any processing
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
// Verify alertUp is still not triggered after setting system to up
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up")
|
||||
|
||||
// Manually set both alerts triggered to true
|
||||
alertUp.Set("triggered", true)
|
||||
err = hub.SaveNoValidate(alertUp)
|
||||
assert.NoError(t, err)
|
||||
alertDown.Set("triggered", true)
|
||||
err = hub.SaveNoValidate(alertDown)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify we have exactly one alert with triggered true
|
||||
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true")
|
||||
|
||||
// Verify the specific alertUp is triggered
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered")
|
||||
|
||||
// Verify we have two unresolved alert history records
|
||||
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records")
|
||||
|
||||
err = alerts.ResolveStatusAlerts(hub)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify alertUp is not triggered after resolving
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving")
|
||||
// Verify alertDown is still triggered
|
||||
alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving")
|
||||
|
||||
// Verify we have one unresolved alert history record
|
||||
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
|
||||
|
||||
}
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"github.com/pocketbase/dbx"
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
"github.com/pocketbase/pocketbase/tools/types"
|
||||
"github.com/spf13/cast"
|
||||
)
|
||||
|
||||
func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *system.CombinedData) error {
|
||||
@@ -38,7 +37,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
|
||||
case "Memory":
|
||||
val = data.Info.MemPct
|
||||
case "Bandwidth":
|
||||
val = data.Info.Bandwidth
|
||||
val = float64(data.Info.BandwidthBytes) / (1024 * 1024)
|
||||
unit = " MB/s"
|
||||
case "Disk":
|
||||
maxUsedPct := data.Info.DiskPct
|
||||
@@ -92,7 +91,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
|
||||
}
|
||||
}
|
||||
|
||||
min := max(1, cast.ToUint8(alertRecord.Get("min")))
|
||||
min := max(1, uint8(alertRecord.GetInt("min")))
|
||||
|
||||
alert := SystemAlertData{
|
||||
systemRecord: systemRecord,
|
||||
@@ -192,22 +191,24 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst
|
||||
case "Memory":
|
||||
alert.val += stats.Mem
|
||||
case "Bandwidth":
|
||||
alert.val += stats.NetSent + stats.NetRecv
|
||||
alert.val += float64(stats.Bandwidth[0]+stats.Bandwidth[1]) / (1024 * 1024)
|
||||
case "Disk":
|
||||
if alert.mapSums == nil {
|
||||
alert.mapSums = make(map[string]float32, len(data.Stats.ExtraFs)+1)
|
||||
alert.mapSums = make(map[string]float32, len(stats.ExtraFs)+1)
|
||||
}
|
||||
// add root disk
|
||||
if _, ok := alert.mapSums["root"]; !ok {
|
||||
alert.mapSums["root"] = 0.0
|
||||
}
|
||||
alert.mapSums["root"] += float32(stats.Disk)
|
||||
// add extra disks
|
||||
for key, fs := range data.Stats.ExtraFs {
|
||||
if _, ok := alert.mapSums[key]; !ok {
|
||||
alert.mapSums[key] = 0.0
|
||||
// add extra disks from historical record
|
||||
for key, fs := range stats.ExtraFs {
|
||||
if fs.DiskTotal > 0 {
|
||||
if _, ok := alert.mapSums[key]; !ok {
|
||||
alert.mapSums[key] = 0.0
|
||||
}
|
||||
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
|
||||
}
|
||||
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
|
||||
}
|
||||
case "Temperature":
|
||||
if alert.mapSums == nil {
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package alerts_test
|
||||
|
||||
@@ -13,7 +12,6 @@ import (
|
||||
"testing/synctest"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/alerts"
|
||||
beszelTests "github.com/henrygd/beszel/internal/tests"
|
||||
|
||||
"github.com/pocketbase/dbx"
|
||||
@@ -370,87 +368,6 @@ func TestUserAlertsApi(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestStatusAlerts(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
systems, err := beszelTests.CreateSystems(hub, 4, user.Id, "paused")
|
||||
assert.NoError(t, err)
|
||||
|
||||
var alerts []*core.Record
|
||||
for i, system := range systems {
|
||||
alert, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": system.Id,
|
||||
"user": user.Id,
|
||||
"min": i + 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
alerts = append(alerts, alert)
|
||||
}
|
||||
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
for _, alert := range alerts {
|
||||
assert.False(t, alert.GetBool("triggered"), "Alert should not be triggered immediately")
|
||||
}
|
||||
if hub.TestMailer.TotalSend() != 0 {
|
||||
assert.Zero(t, hub.TestMailer.TotalSend(), "Expected 0 messages, got %d", hub.TestMailer.TotalSend())
|
||||
}
|
||||
for _, system := range systems {
|
||||
assert.EqualValues(t, "paused", system.GetString("status"), "System should be paused")
|
||||
}
|
||||
for _, system := range systems {
|
||||
system.Set("status", "up")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
|
||||
for _, system := range systems {
|
||||
system.Set("status", "down")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
// after 30 seconds, should have 4 alerts in the pendingAlerts map, no triggered alerts
|
||||
time.Sleep(time.Second * 30)
|
||||
assert.EqualValues(t, 4, hub.GetPendingAlertsCount(), "should have 4 alerts in the pendingAlerts map")
|
||||
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 0, triggeredCount, "should have 0 alert triggered")
|
||||
assert.EqualValues(t, 0, hub.TestMailer.TotalSend(), "should have 0 messages sent")
|
||||
// after 1:30 seconds, should have 1 triggered alert and 3 pending alerts
|
||||
time.Sleep(time.Second * 60)
|
||||
assert.EqualValues(t, 3, hub.GetPendingAlertsCount(), "should have 3 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 1, triggeredCount, "should have 1 alert triggered")
|
||||
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 messages sent")
|
||||
// after 2:30 seconds, should have 2 triggered alerts and 2 pending alerts
|
||||
time.Sleep(time.Second * 60)
|
||||
assert.EqualValues(t, 2, hub.GetPendingAlertsCount(), "should have 2 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, triggeredCount, "should have 2 alert triggered")
|
||||
assert.EqualValues(t, 2, hub.TestMailer.TotalSend(), "should have 2 messages sent")
|
||||
// now we will bring the remaning systems back up
|
||||
for _, system := range systems {
|
||||
system.Set("status", "up")
|
||||
err = hub.SaveNoValidate(system)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
// should have 0 alerts in the pendingAlerts map and 0 alerts triggered
|
||||
assert.EqualValues(t, 0, hub.GetPendingAlertsCount(), "should have 0 alerts in the pendingAlerts map")
|
||||
triggeredCount, err = hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.Zero(t, triggeredCount, "should have 0 alert triggered")
|
||||
// 4 messages sent, 2 down alerts and 2 up alerts for first 2 systems
|
||||
assert.EqualValues(t, 4, hub.TestMailer.TotalSend(), "should have 4 messages sent")
|
||||
})
|
||||
}
|
||||
|
||||
func TestAlertsHistory(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
@@ -579,102 +496,3 @@ func TestAlertsHistory(t *testing.T) {
|
||||
assert.EqualValues(t, 2, totalHistoryCount, "Should have 2 total alert history records")
|
||||
})
|
||||
}
|
||||
func TestResolveStatusAlerts(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
// Create a systemUp
|
||||
systemUp, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.1",
|
||||
"status": "up",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
systemDown, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system-2",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.2",
|
||||
"status": "up",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Create a status alertUp for the system
|
||||
alertUp, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemUp.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
alertDown, err := beszelTests.CreateRecord(hub, "alerts", map[string]any{
|
||||
"name": "Status",
|
||||
"system": systemDown.Id,
|
||||
"user": user.Id,
|
||||
"min": 1,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify alert is not triggered initially
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered initially")
|
||||
|
||||
// Set the system to 'up' (this should not trigger the alert)
|
||||
systemUp.Set("status", "up")
|
||||
err = hub.SaveNoValidate(systemUp)
|
||||
assert.NoError(t, err)
|
||||
|
||||
systemDown.Set("status", "down")
|
||||
err = hub.SaveNoValidate(systemDown)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Wait a moment for any processing
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
// Verify alertUp is still not triggered after setting system to up
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered when system is up")
|
||||
|
||||
// Manually set both alerts triggered to true
|
||||
alertUp.Set("triggered", true)
|
||||
err = hub.SaveNoValidate(alertUp)
|
||||
assert.NoError(t, err)
|
||||
alertDown.Set("triggered", true)
|
||||
err = hub.SaveNoValidate(alertDown)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify we have exactly one alert with triggered true
|
||||
triggeredCount, err := hub.CountRecords("alerts", dbx.HashExp{"triggered": true})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, triggeredCount, "Should have exactly two alerts with triggered true")
|
||||
|
||||
// Verify the specific alertUp is triggered
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, alertUp.GetBool("triggered"), "Alert should be triggered")
|
||||
|
||||
// Verify we have two unresolved alert history records
|
||||
alertHistoryCount, err := hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 2, alertHistoryCount, "Should have exactly two unresolved alert history records")
|
||||
|
||||
err = alerts.ResolveStatusAlerts(hub)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Verify alertUp is not triggered after resolving
|
||||
alertUp, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertUp.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.False(t, alertUp.GetBool("triggered"), "Alert should not be triggered after resolving")
|
||||
// Verify alertDown is still triggered
|
||||
alertDown, err = hub.FindFirstRecordByFilter("alerts", "id={:id}", dbx.Params{"id": alertDown.Id})
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, alertDown.GetBool("triggered"), "Alert should still be triggered after resolving")
|
||||
|
||||
// Verify we have one unresolved alert history record
|
||||
alertHistoryCount, err = hub.CountRecords("alerts_history", dbx.HashExp{"resolved": ""})
|
||||
assert.NoError(t, err)
|
||||
assert.EqualValues(t, 1, alertHistoryCount, "Should have exactly one unresolved alert history record")
|
||||
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package alerts
|
||||
|
||||
@@ -10,6 +9,12 @@ import (
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
)
|
||||
|
||||
func NewTestAlertManagerWithoutWorker(app hubLike) *AlertManager {
|
||||
return &AlertManager{
|
||||
hub: app,
|
||||
}
|
||||
}
|
||||
|
||||
func (am *AlertManager) GetAlertManager() *AlertManager {
|
||||
return am
|
||||
}
|
||||
@@ -35,12 +40,11 @@ func (am *AlertManager) ProcessPendingAlerts() ([]*core.Record, error) {
|
||||
am.pendingAlerts.Range(func(key, value any) bool {
|
||||
info := value.(*alertInfo)
|
||||
if now.After(info.expireTime) {
|
||||
// Downtime delay has passed, process alert
|
||||
if err := am.sendStatusAlert("down", info.systemName, info.alertRecord); err != nil {
|
||||
lastErr = err
|
||||
if info.timer != nil {
|
||||
info.timer.Stop()
|
||||
}
|
||||
am.processPendingAlert(key.(string))
|
||||
processedAlerts = append(processedAlerts, info.alertRecord)
|
||||
am.pendingAlerts.Delete(key)
|
||||
}
|
||||
return true
|
||||
})
|
||||
@@ -57,6 +61,27 @@ func (am *AlertManager) ForceExpirePendingAlerts() {
|
||||
})
|
||||
}
|
||||
|
||||
func (am *AlertManager) ResetPendingAlertTimer(alertID string, delay time.Duration) bool {
|
||||
value, loaded := am.pendingAlerts.Load(alertID)
|
||||
if !loaded {
|
||||
return false
|
||||
}
|
||||
|
||||
info := value.(*alertInfo)
|
||||
if info.timer != nil {
|
||||
info.timer.Stop()
|
||||
}
|
||||
info.expireTime = time.Now().Add(delay)
|
||||
info.timer = time.AfterFunc(delay, func() {
|
||||
am.processPendingAlert(alertID)
|
||||
})
|
||||
return true
|
||||
}
|
||||
|
||||
func ResolveStatusAlerts(app core.App) error {
|
||||
return resolveStatusAlerts(app)
|
||||
}
|
||||
|
||||
func (am *AlertManager) RestorePendingStatusAlerts() error {
|
||||
return am.restorePendingStatusAlerts()
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"github.com/henrygd/beszel"
|
||||
"github.com/henrygd/beszel/agent"
|
||||
"github.com/henrygd/beszel/agent/health"
|
||||
"github.com/henrygd/beszel/agent/utils"
|
||||
"github.com/spf13/pflag"
|
||||
"golang.org/x/crypto/ssh"
|
||||
)
|
||||
@@ -31,9 +32,6 @@ func (opts *cmdOptions) parse() bool {
|
||||
|
||||
// Subcommands that don't require any pflag parsing
|
||||
switch subcommand {
|
||||
case "-v", "version":
|
||||
fmt.Println(beszel.AppName+"-agent", beszel.Version)
|
||||
return true
|
||||
case "health":
|
||||
err := health.Check()
|
||||
if err != nil {
|
||||
@@ -41,6 +39,9 @@ func (opts *cmdOptions) parse() bool {
|
||||
}
|
||||
fmt.Print("ok")
|
||||
return true
|
||||
case "fingerprint":
|
||||
handleFingerprint()
|
||||
return true
|
||||
}
|
||||
|
||||
// pflag.CommandLine.ParseErrorsWhitelist.UnknownFlags = true
|
||||
@@ -49,6 +50,7 @@ func (opts *cmdOptions) parse() bool {
|
||||
pflag.StringVarP(&opts.hubURL, "url", "u", "", "URL of the Beszel hub")
|
||||
pflag.StringVarP(&opts.token, "token", "t", "", "Token to use for authentication")
|
||||
chinaMirrors := pflag.BoolP("china-mirrors", "c", false, "Use mirror for update (gh.beszel.dev) instead of GitHub")
|
||||
version := pflag.BoolP("version", "v", false, "Show version information")
|
||||
help := pflag.BoolP("help", "h", false, "Show this help message")
|
||||
|
||||
// Convert old single-dash long flags to double-dash for backward compatibility
|
||||
@@ -73,9 +75,9 @@ func (opts *cmdOptions) parse() bool {
|
||||
builder.WriteString(os.Args[0])
|
||||
builder.WriteString(" [command] [flags]\n")
|
||||
builder.WriteString("\nCommands:\n")
|
||||
builder.WriteString(" health Check if the agent is running\n")
|
||||
// builder.WriteString(" help Display this help message\n")
|
||||
builder.WriteString(" update Update to the latest version\n")
|
||||
builder.WriteString(" fingerprint View or reset the agent fingerprint\n")
|
||||
builder.WriteString(" health Check if the agent is running\n")
|
||||
builder.WriteString(" update Update to the latest version\n")
|
||||
builder.WriteString("\nFlags:\n")
|
||||
fmt.Print(builder.String())
|
||||
pflag.PrintDefaults()
|
||||
@@ -86,6 +88,9 @@ func (opts *cmdOptions) parse() bool {
|
||||
|
||||
// Must run after pflag.Parse()
|
||||
switch {
|
||||
case *version:
|
||||
fmt.Println(beszel.AppName+"-agent", beszel.Version)
|
||||
return true
|
||||
case *help || subcommand == "help":
|
||||
pflag.Usage()
|
||||
return true
|
||||
@@ -112,12 +117,12 @@ func (opts *cmdOptions) loadPublicKeys() ([]ssh.PublicKey, error) {
|
||||
}
|
||||
|
||||
// Try environment variable
|
||||
if key, ok := agent.GetEnv("KEY"); ok && key != "" {
|
||||
if key, ok := utils.GetEnv("KEY"); ok && key != "" {
|
||||
return agent.ParseKeys(key)
|
||||
}
|
||||
|
||||
// Try key file
|
||||
keyFile, ok := agent.GetEnv("KEY_FILE")
|
||||
keyFile, ok := utils.GetEnv("KEY_FILE")
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no key provided: must set -key flag, KEY env var, or KEY_FILE env var. Use 'beszel-agent help' for usage")
|
||||
}
|
||||
@@ -133,6 +138,38 @@ func (opts *cmdOptions) getAddress() string {
|
||||
return agent.GetAddress(opts.listen)
|
||||
}
|
||||
|
||||
// handleFingerprint handles the "fingerprint" command with subcommands "view" and "reset".
|
||||
func handleFingerprint() {
|
||||
subCmd := ""
|
||||
if len(os.Args) > 2 {
|
||||
subCmd = os.Args[2]
|
||||
}
|
||||
|
||||
switch subCmd {
|
||||
case "", "view":
|
||||
dataDir, _ := agent.GetDataDir()
|
||||
fp := agent.GetFingerprint(dataDir, "", "")
|
||||
fmt.Println(fp)
|
||||
case "help", "-h", "--help":
|
||||
fmt.Print(fingerprintUsage())
|
||||
case "reset":
|
||||
dataDir, err := agent.GetDataDir()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if err := agent.DeleteFingerprint(dataDir); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Println("Fingerprint reset. A new one will be generated on next start.")
|
||||
default:
|
||||
log.Fatalf("Unknown command: %q\n\n%s", subCmd, fingerprintUsage())
|
||||
}
|
||||
}
|
||||
|
||||
func fingerprintUsage() string {
|
||||
return fmt.Sprintf("Usage: %s fingerprint [view|reset]\n\nCommands:\n view Print fingerprint (default)\n reset Reset saved fingerprint\n", os.Args[0])
|
||||
}
|
||||
|
||||
func main() {
|
||||
var opts cmdOptions
|
||||
subcommandHandled := opts.parse()
|
||||
|
||||
@@ -23,6 +23,9 @@ COPY --from=builder /agent /agent
|
||||
# this is so we don't need to create the /tmp directory in the scratch container
|
||||
COPY --from=builder /tmp /tmp
|
||||
|
||||
# AMD GPU name lookup (used by agent on Linux when /usr/share/libdrm/amdgpu.ids is read)
|
||||
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
|
||||
|
||||
# Ensure data persistence across container recreations
|
||||
VOLUME ["/var/lib/beszel-agent"]
|
||||
|
||||
|
||||
@@ -20,6 +20,9 @@ RUN rm -rf /tmp/*
|
||||
FROM alpine:3.23
|
||||
COPY --from=builder /agent /agent
|
||||
|
||||
# AMD GPU name lookup (used by agent on Linux when /usr/share/libdrm/amdgpu.ids is read)
|
||||
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
|
||||
|
||||
RUN apk add --no-cache smartmontools
|
||||
|
||||
# Ensure data persistence across container recreations
|
||||
|
||||
@@ -37,6 +37,9 @@ RUN apt-get update && apt-get install -y \
|
||||
FROM nvidia/cuda:12.2.2-base-ubuntu22.04
|
||||
COPY --from=builder /agent /agent
|
||||
|
||||
# AMD GPU name lookup (used by agent on hybrid laptops when /usr/share/libdrm/amdgpu.ids is read)
|
||||
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
|
||||
|
||||
# Copy smartmontools binaries and config files
|
||||
COPY --from=smartmontools-builder /usr/sbin/smartctl /usr/sbin/smartctl
|
||||
|
||||
|
||||
@@ -143,8 +143,8 @@ type AtaDeviceStatisticsPage struct {
|
||||
}
|
||||
|
||||
type AtaDeviceStatisticsEntry struct {
|
||||
Name string `json:"name"`
|
||||
Value *uint64 `json:"value,omitempty"`
|
||||
Name string `json:"name"`
|
||||
Value *int64 `json:"value,omitempty"`
|
||||
}
|
||||
|
||||
type AtaSmartAttribute struct {
|
||||
@@ -356,8 +356,8 @@ type SmartInfoForSata struct {
|
||||
SmartStatus SmartStatusInfo `json:"smart_status"`
|
||||
// AtaSmartData AtaSmartData `json:"ata_smart_data"`
|
||||
// AtaSctCapabilities AtaSctCapabilities `json:"ata_sct_capabilities"`
|
||||
AtaSmartAttributes AtaSmartAttributes `json:"ata_smart_attributes"`
|
||||
AtaDeviceStatistics AtaDeviceStatistics `json:"ata_device_statistics"`
|
||||
AtaSmartAttributes AtaSmartAttributes `json:"ata_smart_attributes"`
|
||||
AtaDeviceStatistics json.RawMessage `json:"ata_device_statistics"`
|
||||
// PowerOnTime PowerOnTimeInfo `json:"power_on_time"`
|
||||
// PowerCycleCount uint16 `json:"power_cycle_count"`
|
||||
Temperature TemperatureInfo `json:"temperature"`
|
||||
|
||||
@@ -12,8 +12,9 @@ import (
|
||||
|
||||
type Stats struct {
|
||||
Cpu float64 `json:"cpu" cbor:"0,keyasint"`
|
||||
MaxCpu float64 `json:"cpum,omitempty" cbor:"1,keyasint,omitempty"`
|
||||
MaxCpu float64 `json:"cpum,omitempty" cbor:"-"`
|
||||
Mem float64 `json:"m" cbor:"2,keyasint"`
|
||||
MaxMem float64 `json:"mm,omitempty" cbor:"-"`
|
||||
MemUsed float64 `json:"mu" cbor:"3,keyasint"`
|
||||
MemPct float64 `json:"mp" cbor:"4,keyasint"`
|
||||
MemBuffCache float64 `json:"mb" cbor:"5,keyasint"`
|
||||
@@ -23,26 +24,25 @@ type Stats struct {
|
||||
DiskTotal float64 `json:"d" cbor:"9,keyasint"`
|
||||
DiskUsed float64 `json:"du" cbor:"10,keyasint"`
|
||||
DiskPct float64 `json:"dp" cbor:"11,keyasint"`
|
||||
DiskReadPs float64 `json:"dr" cbor:"12,keyasint"`
|
||||
DiskWritePs float64 `json:"dw" cbor:"13,keyasint"`
|
||||
MaxDiskReadPs float64 `json:"drm,omitempty" cbor:"14,keyasint,omitempty"`
|
||||
MaxDiskWritePs float64 `json:"dwm,omitempty" cbor:"15,keyasint,omitempty"`
|
||||
DiskReadPs float64 `json:"dr,omitzero" cbor:"12,keyasint,omitzero"`
|
||||
DiskWritePs float64 `json:"dw,omitzero" cbor:"13,keyasint,omitzero"`
|
||||
MaxDiskReadPs float64 `json:"drm,omitempty" cbor:"-"`
|
||||
MaxDiskWritePs float64 `json:"dwm,omitempty" cbor:"-"`
|
||||
NetworkSent float64 `json:"ns,omitzero" cbor:"16,keyasint,omitzero"`
|
||||
NetworkRecv float64 `json:"nr,omitzero" cbor:"17,keyasint,omitzero"`
|
||||
MaxNetworkSent float64 `json:"nsm,omitempty" cbor:"18,keyasint,omitempty"`
|
||||
MaxNetworkRecv float64 `json:"nrm,omitempty" cbor:"19,keyasint,omitempty"`
|
||||
MaxNetworkSent float64 `json:"nsm,omitempty" cbor:"-"`
|
||||
MaxNetworkRecv float64 `json:"nrm,omitempty" cbor:"-"`
|
||||
Temperatures map[string]float64 `json:"t,omitempty" cbor:"20,keyasint,omitempty"`
|
||||
ExtraFs map[string]*FsStats `json:"efs,omitempty" cbor:"21,keyasint,omitempty"`
|
||||
GPUData map[string]GPUData `json:"g,omitempty" cbor:"22,keyasint,omitempty"`
|
||||
LoadAvg1 float64 `json:"l1,omitempty" cbor:"23,keyasint,omitempty"`
|
||||
LoadAvg5 float64 `json:"l5,omitempty" cbor:"24,keyasint,omitempty"`
|
||||
LoadAvg15 float64 `json:"l15,omitempty" cbor:"25,keyasint,omitempty"`
|
||||
Bandwidth [2]uint64 `json:"b,omitzero" cbor:"26,keyasint,omitzero"` // [sent bytes, recv bytes]
|
||||
MaxBandwidth [2]uint64 `json:"bm,omitzero" cbor:"27,keyasint,omitzero"` // [sent bytes, recv bytes]
|
||||
// LoadAvg1 float64 `json:"l1,omitempty" cbor:"23,keyasint,omitempty"`
|
||||
// LoadAvg5 float64 `json:"l5,omitempty" cbor:"24,keyasint,omitempty"`
|
||||
// LoadAvg15 float64 `json:"l15,omitempty" cbor:"25,keyasint,omitempty"`
|
||||
Bandwidth [2]uint64 `json:"b,omitzero" cbor:"26,keyasint,omitzero"` // [sent bytes, recv bytes]
|
||||
MaxBandwidth [2]uint64 `json:"bm,omitzero" cbor:"-"` // [sent bytes, recv bytes]
|
||||
// TODO: remove other load fields in future release in favor of load avg array
|
||||
LoadAvg [3]float64 `json:"la,omitempty" cbor:"28,keyasint"`
|
||||
Battery [2]uint8 `json:"bat,omitzero" cbor:"29,keyasint,omitzero"` // [percent, charge state, current]
|
||||
MaxMem float64 `json:"mm,omitempty" cbor:"30,keyasint,omitempty"`
|
||||
Battery [2]uint8 `json:"bat,omitzero" cbor:"29,keyasint,omitzero"` // [percent, charge state, current]
|
||||
NetworkInterfaces map[string][4]uint64 `json:"ni,omitempty" cbor:"31,keyasint,omitempty"` // [upload bytes, download bytes, total upload, total download]
|
||||
DiskIO [2]uint64 `json:"dio,omitzero" cbor:"32,keyasint,omitzero"` // [read bytes, write bytes]
|
||||
MaxDiskIO [2]uint64 `json:"diom,omitzero" cbor:"-"` // [max read bytes, max write bytes]
|
||||
@@ -90,8 +90,8 @@ type FsStats struct {
|
||||
TotalWrite uint64 `json:"-"`
|
||||
DiskReadPs float64 `json:"r" cbor:"2,keyasint"`
|
||||
DiskWritePs float64 `json:"w" cbor:"3,keyasint"`
|
||||
MaxDiskReadPS float64 `json:"rm,omitempty" cbor:"4,keyasint,omitempty"`
|
||||
MaxDiskWritePS float64 `json:"wm,omitempty" cbor:"5,keyasint,omitempty"`
|
||||
MaxDiskReadPS float64 `json:"rm,omitempty" cbor:"-"`
|
||||
MaxDiskWritePS float64 `json:"wm,omitempty" cbor:"-"`
|
||||
// TODO: remove DiskReadPs and DiskWritePs in future release in favor of DiskReadBytes and DiskWriteBytes
|
||||
DiskReadBytes uint64 `json:"rb" cbor:"6,keyasint,omitempty"`
|
||||
DiskWriteBytes uint64 `json:"wb" cbor:"7,keyasint,omitempty"`
|
||||
@@ -129,23 +129,23 @@ type Info struct {
|
||||
KernelVersion string `json:"k,omitempty" cbor:"1,keyasint,omitempty"` // deprecated - moved to Details struct
|
||||
Cores int `json:"c,omitzero" cbor:"2,keyasint,omitzero"` // deprecated - moved to Details struct
|
||||
// Threads is needed in Info struct to calculate load average thresholds
|
||||
Threads int `json:"t,omitempty" cbor:"3,keyasint,omitempty"`
|
||||
CpuModel string `json:"m,omitempty" cbor:"4,keyasint,omitempty"` // deprecated - moved to Details struct
|
||||
Uptime uint64 `json:"u" cbor:"5,keyasint"`
|
||||
Cpu float64 `json:"cpu" cbor:"6,keyasint"`
|
||||
MemPct float64 `json:"mp" cbor:"7,keyasint"`
|
||||
DiskPct float64 `json:"dp" cbor:"8,keyasint"`
|
||||
Bandwidth float64 `json:"b" cbor:"9,keyasint"`
|
||||
AgentVersion string `json:"v" cbor:"10,keyasint"`
|
||||
Podman bool `json:"p,omitempty" cbor:"11,keyasint,omitempty"` // deprecated - moved to Details struct
|
||||
GpuPct float64 `json:"g,omitempty" cbor:"12,keyasint,omitempty"`
|
||||
DashboardTemp float64 `json:"dt,omitempty" cbor:"13,keyasint,omitempty"`
|
||||
Os Os `json:"os,omitempty" cbor:"14,keyasint,omitempty"` // deprecated - moved to Details struct
|
||||
LoadAvg1 float64 `json:"l1,omitempty" cbor:"15,keyasint,omitempty"` // deprecated - use `la` array instead
|
||||
LoadAvg5 float64 `json:"l5,omitempty" cbor:"16,keyasint,omitempty"` // deprecated - use `la` array instead
|
||||
LoadAvg15 float64 `json:"l15,omitempty" cbor:"17,keyasint,omitempty"` // deprecated - use `la` array instead
|
||||
BandwidthBytes uint64 `json:"bb" cbor:"18,keyasint"`
|
||||
Threads int `json:"t,omitempty" cbor:"3,keyasint,omitempty"`
|
||||
CpuModel string `json:"m,omitempty" cbor:"4,keyasint,omitempty"` // deprecated - moved to Details struct
|
||||
Uptime uint64 `json:"u" cbor:"5,keyasint"`
|
||||
Cpu float64 `json:"cpu" cbor:"6,keyasint"`
|
||||
MemPct float64 `json:"mp" cbor:"7,keyasint"`
|
||||
DiskPct float64 `json:"dp" cbor:"8,keyasint"`
|
||||
Bandwidth float64 `json:"b,omitzero" cbor:"9,keyasint"` // deprecated in favor of BandwidthBytes
|
||||
AgentVersion string `json:"v" cbor:"10,keyasint"`
|
||||
Podman bool `json:"p,omitempty" cbor:"11,keyasint,omitempty"` // deprecated - moved to Details struct
|
||||
GpuPct float64 `json:"g,omitempty" cbor:"12,keyasint,omitempty"`
|
||||
DashboardTemp float64 `json:"dt,omitempty" cbor:"13,keyasint,omitempty"`
|
||||
Os Os `json:"os,omitempty" cbor:"14,keyasint,omitempty"` // deprecated - moved to Details struct
|
||||
// LoadAvg1 float64 `json:"l1,omitempty" cbor:"15,keyasint,omitempty"` // deprecated - use `la` array instead
|
||||
// LoadAvg5 float64 `json:"l5,omitempty" cbor:"16,keyasint,omitempty"` // deprecated - use `la` array instead
|
||||
// LoadAvg15 float64 `json:"l15,omitempty" cbor:"17,keyasint,omitempty"` // deprecated - use `la` array instead
|
||||
|
||||
BandwidthBytes uint64 `json:"bb" cbor:"18,keyasint"`
|
||||
LoadAvg [3]float64 `json:"la,omitempty" cbor:"19,keyasint"`
|
||||
ConnectionType ConnectionType `json:"ct,omitempty" cbor:"20,keyasint,omitempty,omitzero"`
|
||||
ExtraFsPct map[string]float64 `json:"efs,omitempty" cbor:"21,keyasint,omitempty"`
|
||||
|
||||
@@ -34,7 +34,7 @@ func ColorPrint(color, text string) {
|
||||
fmt.Println(color + text + colorReset)
|
||||
}
|
||||
|
||||
func ColorPrintf(color, format string, args ...interface{}) {
|
||||
func ColorPrintf(color, format string, args ...any) {
|
||||
fmt.Printf(color+format+colorReset+"\n", args...)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package hub
|
||||
|
||||
@@ -10,6 +9,7 @@ import (
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -35,6 +35,26 @@ func createTestHub(t testing.TB) (*Hub, *pbtests.TestApp, error) {
|
||||
return NewHub(testApp), testApp, nil
|
||||
}
|
||||
|
||||
// cleanupTestHub stops background system goroutines before tearing down the app.
|
||||
func cleanupTestHub(hub *Hub, testApp *pbtests.TestApp) {
|
||||
if hub != nil {
|
||||
sm := hub.GetSystemManager()
|
||||
sm.RemoveAllSystems()
|
||||
// Give updater goroutines a brief window to observe cancellation before DB teardown.
|
||||
for range 20 {
|
||||
if sm.GetSystemCount() == 0 {
|
||||
break
|
||||
}
|
||||
runtime.Gosched()
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
if testApp != nil {
|
||||
testApp.Cleanup()
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to create a test record
|
||||
func createTestRecord(app core.App, collection string, data map[string]any) (*core.Record, error) {
|
||||
col, err := app.FindCachedCollectionByNameOrId(collection)
|
||||
@@ -64,7 +84,7 @@ func TestValidateAgentHeaders(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
@@ -145,7 +165,7 @@ func TestGetAllFingerprintRecordsByToken(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// create test user
|
||||
userRecord, err := createTestUser(testApp)
|
||||
@@ -235,7 +255,7 @@ func TestSetFingerprint(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// Create test user
|
||||
userRecord, err := createTestUser(testApp)
|
||||
@@ -315,7 +335,7 @@ func TestCreateSystemFromAgentData(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// Create test user
|
||||
userRecord, err := createTestUser(testApp)
|
||||
@@ -425,7 +445,7 @@ func TestUniversalTokenFlow(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(nil, testApp)
|
||||
|
||||
// Create test user
|
||||
userRecord, err := createTestUser(testApp)
|
||||
@@ -493,7 +513,7 @@ func TestAgentConnect(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// Create test user
|
||||
userRecord, err := createTestUser(testApp)
|
||||
@@ -652,7 +672,7 @@ func TestHandleAgentConnect(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// Create test user
|
||||
userRecord, err := createTestUser(testApp)
|
||||
@@ -737,7 +757,7 @@ func TestAgentWebSocketIntegration(t *testing.T) {
|
||||
// Create hub and test app
|
||||
hub, testApp, err := createTestHub(t)
|
||||
require.NoError(t, err)
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// Get the hub's SSH key
|
||||
hubSigner, err := hub.GetSSHKey("")
|
||||
@@ -897,7 +917,7 @@ func TestAgentWebSocketIntegration(t *testing.T) {
|
||||
|
||||
// Wait for connection result
|
||||
maxWait := 2 * time.Second
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
time.Sleep(40 * time.Millisecond)
|
||||
checkInterval := 20 * time.Millisecond
|
||||
timeout := time.After(maxWait)
|
||||
ticker := time.Tick(checkInterval)
|
||||
@@ -942,6 +962,8 @@ func TestAgentWebSocketIntegration(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
// Verify fingerprint state by re-reading the specific record
|
||||
updatedFingerprintRecord, err := testApp.FindRecordById("fingerprints", fingerprintRecord.Id)
|
||||
require.NoError(t, err)
|
||||
@@ -976,7 +998,7 @@ func TestMultipleSystemsWithSameUniversalToken(t *testing.T) {
|
||||
// Create hub and test app
|
||||
hub, testApp, err := createTestHub(t)
|
||||
require.NoError(t, err)
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// Get the hub's SSH key
|
||||
hubSigner, err := hub.GetSSHKey("")
|
||||
@@ -1144,6 +1166,8 @@ func TestMultipleSystemsWithSameUniversalToken(t *testing.T) {
|
||||
assert.Equal(t, systemCount, systemsAfterCount, "Total system count should remain the same")
|
||||
}
|
||||
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
// Verify that a fingerprint record exists for this fingerprint
|
||||
fingerprints, err := testApp.FindRecordsByFilter("fingerprints", "token = {:token} && fingerprint = {:fingerprint}", "", -1, 0, map[string]any{
|
||||
"token": universalToken,
|
||||
@@ -1176,7 +1200,7 @@ func TestPermanentUniversalTokenFromDB(t *testing.T) {
|
||||
// Create hub and test app
|
||||
hub, testApp, err := createTestHub(t)
|
||||
require.NoError(t, err)
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// Get the hub's SSH key
|
||||
hubSigner, err := hub.GetSSHKey("")
|
||||
@@ -1273,7 +1297,7 @@ verify:
|
||||
func TestFindOrCreateSystemForToken(t *testing.T) {
|
||||
hub, testApp, err := createTestHub(t)
|
||||
require.NoError(t, err)
|
||||
defer testApp.Cleanup()
|
||||
defer cleanupTestHub(hub, testApp)
|
||||
|
||||
// Create test user
|
||||
userRecord, err := createTestUser(testApp)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package config_test
|
||||
|
||||
|
||||
@@ -1,35 +1,39 @@
|
||||
// Package expirymap provides a thread-safe map with expiring entries.
|
||||
// It supports TTL-based expiration with both lazy cleanup on access
|
||||
// and periodic background cleanup.
|
||||
package expirymap
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/pocketbase/pocketbase/tools/store"
|
||||
)
|
||||
|
||||
type val[T any] struct {
|
||||
type val[T comparable] struct {
|
||||
value T
|
||||
expires time.Time
|
||||
}
|
||||
|
||||
type ExpiryMap[T any] struct {
|
||||
store *store.Store[string, *val[T]]
|
||||
cleanupInterval time.Duration
|
||||
type ExpiryMap[T comparable] struct {
|
||||
store store.Store[string, val[T]]
|
||||
stopChan chan struct{}
|
||||
stopOnce sync.Once
|
||||
}
|
||||
|
||||
// New creates a new expiry map with custom cleanup interval
|
||||
func New[T any](cleanupInterval time.Duration) *ExpiryMap[T] {
|
||||
func New[T comparable](cleanupInterval time.Duration) *ExpiryMap[T] {
|
||||
m := &ExpiryMap[T]{
|
||||
store: store.New(map[string]*val[T]{}),
|
||||
cleanupInterval: cleanupInterval,
|
||||
store: *store.New(map[string]val[T]{}),
|
||||
stopChan: make(chan struct{}),
|
||||
}
|
||||
m.startCleaner()
|
||||
go m.startCleaner(cleanupInterval)
|
||||
return m
|
||||
}
|
||||
|
||||
// Set stores a value with the given TTL
|
||||
func (m *ExpiryMap[T]) Set(key string, value T, ttl time.Duration) {
|
||||
m.store.Set(key, &val[T]{
|
||||
m.store.Set(key, val[T]{
|
||||
value: value,
|
||||
expires: time.Now().Add(ttl),
|
||||
})
|
||||
@@ -55,7 +59,7 @@ func (m *ExpiryMap[T]) GetOk(key string) (T, bool) {
|
||||
// GetByValue retrieves a value by value
|
||||
func (m *ExpiryMap[T]) GetByValue(val T) (key string, value T, ok bool) {
|
||||
for key, v := range m.store.GetAll() {
|
||||
if reflect.DeepEqual(v.value, val) {
|
||||
if v.value == val {
|
||||
// check if expired
|
||||
if v.expires.Before(time.Now()) {
|
||||
m.store.Remove(key)
|
||||
@@ -75,7 +79,7 @@ func (m *ExpiryMap[T]) Remove(key string) {
|
||||
// RemovebyValue removes a value by value
|
||||
func (m *ExpiryMap[T]) RemovebyValue(value T) (T, bool) {
|
||||
for key, val := range m.store.GetAll() {
|
||||
if reflect.DeepEqual(val.value, value) {
|
||||
if val.value == value {
|
||||
m.store.Remove(key)
|
||||
return val.value, true
|
||||
}
|
||||
@@ -84,13 +88,23 @@ func (m *ExpiryMap[T]) RemovebyValue(value T) (T, bool) {
|
||||
}
|
||||
|
||||
// startCleaner runs the background cleanup process
|
||||
func (m *ExpiryMap[T]) startCleaner() {
|
||||
go func() {
|
||||
tick := time.Tick(m.cleanupInterval)
|
||||
for range tick {
|
||||
func (m *ExpiryMap[T]) startCleaner(interval time.Duration) {
|
||||
tick := time.Tick(interval)
|
||||
for {
|
||||
select {
|
||||
case <-tick:
|
||||
m.cleanup()
|
||||
case <-m.stopChan:
|
||||
return
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// StopCleaner stops the background cleanup process
|
||||
func (m *ExpiryMap[T]) StopCleaner() {
|
||||
m.stopOnce.Do(func() {
|
||||
close(m.stopChan)
|
||||
})
|
||||
}
|
||||
|
||||
// cleanup removes all expired entries
|
||||
@@ -102,3 +116,12 @@ func (m *ExpiryMap[T]) cleanup() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateExpiration updates the expiration time of a key
|
||||
func (m *ExpiryMap[T]) UpdateExpiration(key string, ttl time.Duration) {
|
||||
value, ok := m.store.GetOk(key)
|
||||
if ok {
|
||||
value.expires = time.Now().Add(ttl)
|
||||
m.store.Set(key, value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package expirymap
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"testing/synctest"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
@@ -178,6 +178,33 @@ func TestExpiryMap_GenericTypes(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestExpiryMap_UpdateExpiration(t *testing.T) {
|
||||
em := New[string](time.Hour)
|
||||
|
||||
// Set a value with short TTL
|
||||
em.Set("key1", "value1", time.Millisecond*50)
|
||||
|
||||
// Verify it exists
|
||||
assert.True(t, em.Has("key1"))
|
||||
|
||||
// Update expiration to a longer TTL
|
||||
em.UpdateExpiration("key1", time.Hour)
|
||||
|
||||
// Wait for the original TTL to pass
|
||||
time.Sleep(time.Millisecond * 100)
|
||||
|
||||
// Should still exist because expiration was updated
|
||||
assert.True(t, em.Has("key1"))
|
||||
value, ok := em.GetOk("key1")
|
||||
assert.True(t, ok)
|
||||
assert.Equal(t, "value1", value)
|
||||
|
||||
// Try updating non-existent key (should not panic)
|
||||
assert.NotPanics(t, func() {
|
||||
em.UpdateExpiration("nonexistent", time.Hour)
|
||||
})
|
||||
}
|
||||
|
||||
func TestExpiryMap_ZeroValues(t *testing.T) {
|
||||
em := New[string](time.Hour)
|
||||
|
||||
@@ -474,3 +501,52 @@ func TestExpiryMap_ValueOperations_Integration(t *testing.T) {
|
||||
assert.Equal(t, "unique", value)
|
||||
assert.Equal(t, "key2", key)
|
||||
}
|
||||
|
||||
func TestExpiryMap_Cleaner(t *testing.T) {
|
||||
synctest.Test(t, func(t *testing.T) {
|
||||
em := New[string](time.Second)
|
||||
defer em.StopCleaner()
|
||||
|
||||
em.Set("test", "value", 500*time.Millisecond)
|
||||
|
||||
// Wait 600ms, value is expired but cleaner hasn't run yet (interval is 1s)
|
||||
time.Sleep(600 * time.Millisecond)
|
||||
synctest.Wait()
|
||||
|
||||
// Map should still hold the value in its internal store before lazy access or cleaner
|
||||
assert.Equal(t, 1, len(em.store.GetAll()), "store should still have 1 item before cleaner runs")
|
||||
|
||||
// Wait another 500ms so cleaner (1s interval) runs
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
synctest.Wait() // Wait for background goroutine to process the tick
|
||||
|
||||
assert.Equal(t, 0, len(em.store.GetAll()), "store should be empty after cleaner runs")
|
||||
})
|
||||
}
|
||||
|
||||
func TestExpiryMap_StopCleaner(t *testing.T) {
|
||||
em := New[string](time.Hour)
|
||||
|
||||
// Initially, stopChan is open, reading would block
|
||||
select {
|
||||
case <-em.stopChan:
|
||||
t.Fatal("stopChan should be open initially")
|
||||
default:
|
||||
// success
|
||||
}
|
||||
|
||||
em.StopCleaner()
|
||||
|
||||
// After StopCleaner, stopChan is closed, reading returns immediately
|
||||
select {
|
||||
case <-em.stopChan:
|
||||
// success
|
||||
default:
|
||||
t.Fatal("stopChan was not closed by StopCleaner")
|
||||
}
|
||||
|
||||
// Calling StopCleaner again should NOT panic thanks to sync.Once
|
||||
assert.NotPanics(t, func() {
|
||||
em.StopCleaner()
|
||||
})
|
||||
}
|
||||
|
||||
303
internal/hub/heartbeat/heartbeat.go
Normal file
303
internal/hub/heartbeat/heartbeat.go
Normal file
@@ -0,0 +1,303 @@
|
||||
// Package heartbeat sends periodic outbound pings to an external monitoring
|
||||
// endpoint (e.g. BetterStack, Uptime Kuma, Healthchecks.io) so operators can
|
||||
// monitor Beszel without exposing it to the internet.
|
||||
package heartbeat
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel"
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
)
|
||||
|
||||
// Default values for heartbeat configuration.
|
||||
const (
|
||||
defaultInterval = 60 // seconds
|
||||
httpTimeout = 10 * time.Second
|
||||
)
|
||||
|
||||
// Payload is the JSON body sent with each heartbeat request.
|
||||
type Payload struct {
|
||||
// Status is "ok" when all non-paused systems are up, "warn" when alerts
|
||||
// are triggered but no systems are down, and "error" when any system is down.
|
||||
Status string `json:"status"`
|
||||
Timestamp string `json:"timestamp"`
|
||||
Msg string `json:"msg"`
|
||||
Systems SystemsSummary `json:"systems"`
|
||||
Down []SystemInfo `json:"down_systems,omitempty"`
|
||||
Alerts []AlertInfo `json:"triggered_alerts,omitempty"`
|
||||
Version string `json:"beszel_version"`
|
||||
}
|
||||
|
||||
// SystemsSummary contains counts of systems by status.
|
||||
type SystemsSummary struct {
|
||||
Total int `json:"total"`
|
||||
Up int `json:"up"`
|
||||
Down int `json:"down"`
|
||||
Paused int `json:"paused"`
|
||||
Pending int `json:"pending"`
|
||||
}
|
||||
|
||||
// SystemInfo identifies a system that is currently down.
|
||||
type SystemInfo struct {
|
||||
ID string `json:"id" db:"id"`
|
||||
Name string `json:"name" db:"name"`
|
||||
Host string `json:"host" db:"host"`
|
||||
}
|
||||
|
||||
// AlertInfo describes a currently triggered alert.
|
||||
type AlertInfo struct {
|
||||
SystemID string `json:"system_id"`
|
||||
SystemName string `json:"system_name"`
|
||||
AlertName string `json:"alert_name"`
|
||||
Threshold float64 `json:"threshold"`
|
||||
}
|
||||
|
||||
// Config holds heartbeat settings read from environment variables.
|
||||
type Config struct {
|
||||
URL string // endpoint to ping
|
||||
Interval int // seconds between pings
|
||||
Method string // HTTP method (GET or POST, default POST)
|
||||
}
|
||||
|
||||
// Heartbeat manages the periodic outbound health check.
|
||||
type Heartbeat struct {
|
||||
app core.App
|
||||
config Config
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
// New creates a Heartbeat if configuration is present.
|
||||
// Returns nil if HEARTBEAT_URL is not set (feature disabled).
|
||||
func New(app core.App, getEnv func(string) (string, bool)) *Heartbeat {
|
||||
url, _ := getEnv("HEARTBEAT_URL")
|
||||
url = strings.TrimSpace(url)
|
||||
if app == nil || url == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
interval := defaultInterval
|
||||
if v, ok := getEnv("HEARTBEAT_INTERVAL"); ok {
|
||||
if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
|
||||
interval = parsed
|
||||
}
|
||||
}
|
||||
|
||||
method := http.MethodPost
|
||||
if v, ok := getEnv("HEARTBEAT_METHOD"); ok {
|
||||
v = strings.ToUpper(strings.TrimSpace(v))
|
||||
if v == http.MethodGet || v == http.MethodHead {
|
||||
method = v
|
||||
}
|
||||
}
|
||||
|
||||
return &Heartbeat{
|
||||
app: app,
|
||||
config: Config{
|
||||
URL: url,
|
||||
Interval: interval,
|
||||
Method: method,
|
||||
},
|
||||
client: &http.Client{Timeout: httpTimeout},
|
||||
}
|
||||
}
|
||||
|
||||
// Start begins the heartbeat loop. It blocks and should be called in a goroutine.
|
||||
// The loop runs until the provided stop channel is closed.
|
||||
func (hb *Heartbeat) Start(stop <-chan struct{}) {
|
||||
sanitizedURL := sanitizeHeartbeatURL(hb.config.URL)
|
||||
hb.app.Logger().Info("Heartbeat enabled",
|
||||
"url", sanitizedURL,
|
||||
"interval", fmt.Sprintf("%ds", hb.config.Interval),
|
||||
"method", hb.config.Method,
|
||||
)
|
||||
|
||||
// Send an initial heartbeat immediately on startup.
|
||||
hb.send()
|
||||
|
||||
ticker := time.NewTicker(time.Duration(hb.config.Interval) * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-stop:
|
||||
return
|
||||
case <-ticker.C:
|
||||
hb.send()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send performs a single heartbeat ping. Exposed for the test-heartbeat API endpoint.
|
||||
func (hb *Heartbeat) Send() error {
|
||||
return hb.send()
|
||||
}
|
||||
|
||||
// GetConfig returns the current heartbeat configuration.
|
||||
func (hb *Heartbeat) GetConfig() Config {
|
||||
return hb.config
|
||||
}
|
||||
|
||||
func (hb *Heartbeat) send() error {
|
||||
var req *http.Request
|
||||
var err error
|
||||
method := normalizeMethod(hb.config.Method)
|
||||
|
||||
if method == http.MethodGet || method == http.MethodHead {
|
||||
req, err = http.NewRequest(method, hb.config.URL, nil)
|
||||
} else {
|
||||
payload, payloadErr := hb.buildPayload()
|
||||
if payloadErr != nil {
|
||||
hb.app.Logger().Error("Heartbeat: failed to build payload", "err", payloadErr)
|
||||
return payloadErr
|
||||
}
|
||||
|
||||
body, jsonErr := json.Marshal(payload)
|
||||
if jsonErr != nil {
|
||||
hb.app.Logger().Error("Heartbeat: failed to marshal payload", "err", jsonErr)
|
||||
return jsonErr
|
||||
}
|
||||
req, err = http.NewRequest(http.MethodPost, hb.config.URL, bytes.NewReader(body))
|
||||
if err == nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
hb.app.Logger().Error("Heartbeat: failed to create request", "err", err)
|
||||
return err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", "Beszel-Heartbeat")
|
||||
|
||||
resp, err := hb.client.Do(req)
|
||||
if err != nil {
|
||||
hb.app.Logger().Error("Heartbeat: request failed", "url", sanitizeHeartbeatURL(hb.config.URL), "err", err)
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
hb.app.Logger().Warn("Heartbeat: non-success response",
|
||||
"url", sanitizeHeartbeatURL(hb.config.URL),
|
||||
"status", resp.StatusCode,
|
||||
)
|
||||
return fmt.Errorf("heartbeat endpoint returned status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (hb *Heartbeat) buildPayload() (*Payload, error) {
|
||||
db := hb.app.DB()
|
||||
|
||||
// Count systems by status.
|
||||
var systemCounts []struct {
|
||||
Status string `db:"status"`
|
||||
Count int `db:"cnt"`
|
||||
}
|
||||
err := db.NewQuery("SELECT status, COUNT(*) as cnt FROM systems GROUP BY status").All(&systemCounts)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query system counts: %w", err)
|
||||
}
|
||||
|
||||
summary := SystemsSummary{}
|
||||
for _, sc := range systemCounts {
|
||||
switch sc.Status {
|
||||
case "up":
|
||||
summary.Up = sc.Count
|
||||
case "down":
|
||||
summary.Down = sc.Count
|
||||
case "paused":
|
||||
summary.Paused = sc.Count
|
||||
case "pending":
|
||||
summary.Pending = sc.Count
|
||||
}
|
||||
summary.Total += sc.Count
|
||||
}
|
||||
|
||||
// Get names of down systems.
|
||||
var downSystems []SystemInfo
|
||||
if summary.Down > 0 {
|
||||
err = db.NewQuery("SELECT id, name, host FROM systems WHERE status = 'down'").All(&downSystems)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query down systems: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Get triggered alerts with system names.
|
||||
var triggeredAlerts []struct {
|
||||
SystemID string `db:"system"`
|
||||
SystemName string `db:"system_name"`
|
||||
AlertName string `db:"name"`
|
||||
Value float64 `db:"value"`
|
||||
}
|
||||
err = db.NewQuery(`
|
||||
SELECT a.system, s.name as system_name, a.name, a.value
|
||||
FROM alerts a
|
||||
JOIN systems s ON a.system = s.id
|
||||
WHERE a.triggered = true
|
||||
`).All(&triggeredAlerts)
|
||||
if err != nil {
|
||||
// Non-fatal: alerts info is supplementary.
|
||||
triggeredAlerts = nil
|
||||
}
|
||||
|
||||
alerts := make([]AlertInfo, 0, len(triggeredAlerts))
|
||||
for _, ta := range triggeredAlerts {
|
||||
alerts = append(alerts, AlertInfo{
|
||||
SystemID: ta.SystemID,
|
||||
SystemName: ta.SystemName,
|
||||
AlertName: ta.AlertName,
|
||||
Threshold: ta.Value,
|
||||
})
|
||||
}
|
||||
|
||||
// Determine overall status.
|
||||
status := "ok"
|
||||
msg := "All systems operational"
|
||||
if summary.Down > 0 {
|
||||
status = "error"
|
||||
names := make([]string, len(downSystems))
|
||||
for i, ds := range downSystems {
|
||||
names[i] = ds.Name
|
||||
}
|
||||
msg = fmt.Sprintf("%d system(s) down: %s", summary.Down, strings.Join(names, ", "))
|
||||
} else if len(alerts) > 0 {
|
||||
status = "warn"
|
||||
msg = fmt.Sprintf("%d alert(s) triggered", len(alerts))
|
||||
}
|
||||
|
||||
return &Payload{
|
||||
Status: status,
|
||||
Timestamp: time.Now().UTC().Format(time.RFC3339),
|
||||
Msg: msg,
|
||||
Systems: summary,
|
||||
Down: downSystems,
|
||||
Alerts: alerts,
|
||||
Version: beszel.Version,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func normalizeMethod(method string) string {
|
||||
upper := strings.ToUpper(strings.TrimSpace(method))
|
||||
if upper == http.MethodGet || upper == http.MethodHead || upper == http.MethodPost {
|
||||
return upper
|
||||
}
|
||||
return http.MethodPost
|
||||
}
|
||||
|
||||
func sanitizeHeartbeatURL(rawURL string) string {
|
||||
parsed, err := url.Parse(strings.TrimSpace(rawURL))
|
||||
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
|
||||
return "<invalid-url>"
|
||||
}
|
||||
return parsed.Scheme + "://" + parsed.Host
|
||||
}
|
||||
257
internal/hub/heartbeat/heartbeat_test.go
Normal file
257
internal/hub/heartbeat/heartbeat_test.go
Normal file
@@ -0,0 +1,257 @@
|
||||
//go:build testing
|
||||
|
||||
package heartbeat_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"github.com/henrygd/beszel/internal/hub/heartbeat"
|
||||
beszeltests "github.com/henrygd/beszel/internal/tests"
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestNew(t *testing.T) {
|
||||
t.Run("returns nil when app is missing", func(t *testing.T) {
|
||||
hb := heartbeat.New(nil, envGetter(map[string]string{
|
||||
"HEARTBEAT_URL": "https://heartbeat.example.com/ping",
|
||||
}))
|
||||
assert.Nil(t, hb)
|
||||
})
|
||||
|
||||
t.Run("returns nil when URL is missing", func(t *testing.T) {
|
||||
app := newTestHub(t)
|
||||
hb := heartbeat.New(app.App, func(string) (string, bool) {
|
||||
return "", false
|
||||
})
|
||||
assert.Nil(t, hb)
|
||||
})
|
||||
|
||||
t.Run("parses and normalizes config values", func(t *testing.T) {
|
||||
app := newTestHub(t)
|
||||
env := map[string]string{
|
||||
"HEARTBEAT_URL": " https://heartbeat.example.com/ping ",
|
||||
"HEARTBEAT_INTERVAL": "90",
|
||||
"HEARTBEAT_METHOD": "head",
|
||||
}
|
||||
getEnv := func(key string) (string, bool) {
|
||||
v, ok := env[key]
|
||||
return v, ok
|
||||
}
|
||||
|
||||
hb := heartbeat.New(app.App, getEnv)
|
||||
require.NotNil(t, hb)
|
||||
cfg := hb.GetConfig()
|
||||
assert.Equal(t, "https://heartbeat.example.com/ping", cfg.URL)
|
||||
assert.Equal(t, 90, cfg.Interval)
|
||||
assert.Equal(t, http.MethodHead, cfg.Method)
|
||||
})
|
||||
}
|
||||
|
||||
func TestSendGETDoesNotRequireAppOrDB(t *testing.T) {
|
||||
app := newTestHub(t)
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
assert.Equal(t, http.MethodGet, r.Method)
|
||||
assert.Equal(t, "Beszel-Heartbeat", r.Header.Get("User-Agent"))
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
hb := heartbeat.New(app.App, envGetter(map[string]string{
|
||||
"HEARTBEAT_URL": server.URL,
|
||||
"HEARTBEAT_METHOD": "GET",
|
||||
}))
|
||||
require.NotNil(t, hb)
|
||||
|
||||
require.NoError(t, hb.Send())
|
||||
}
|
||||
|
||||
func TestSendReturnsErrorOnHTTPFailureStatus(t *testing.T) {
|
||||
app := newTestHub(t)
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
hb := heartbeat.New(app.App, envGetter(map[string]string{
|
||||
"HEARTBEAT_URL": server.URL,
|
||||
"HEARTBEAT_METHOD": "GET",
|
||||
}))
|
||||
require.NotNil(t, hb)
|
||||
|
||||
err := hb.Send()
|
||||
require.Error(t, err)
|
||||
assert.ErrorContains(t, err, "heartbeat endpoint returned status 500")
|
||||
}
|
||||
|
||||
func TestSendPOSTBuildsExpectedStatuses(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
setup func(t *testing.T, app *beszeltests.TestHub, user *core.Record)
|
||||
expectStatus string
|
||||
expectMsgPart string
|
||||
expectDown int
|
||||
expectAlerts int
|
||||
expectTotal int
|
||||
expectUp int
|
||||
expectPaused int
|
||||
expectPending int
|
||||
expectDownSumm int
|
||||
}{
|
||||
{
|
||||
name: "error when at least one system is down",
|
||||
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
|
||||
downSystem := createTestSystem(t, app, user.Id, "db-1", "10.0.0.1", "down")
|
||||
_ = createTestSystem(t, app, user.Id, "web-1", "10.0.0.2", "up")
|
||||
createTriggeredAlert(t, app, user.Id, downSystem.Id, "CPU", 95)
|
||||
},
|
||||
expectStatus: "error",
|
||||
expectMsgPart: "1 system(s) down",
|
||||
expectDown: 1,
|
||||
expectAlerts: 1,
|
||||
expectTotal: 2,
|
||||
expectUp: 1,
|
||||
expectDownSumm: 1,
|
||||
},
|
||||
{
|
||||
name: "warn when only alerts are triggered",
|
||||
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
|
||||
system := createTestSystem(t, app, user.Id, "api-1", "10.1.0.1", "up")
|
||||
createTriggeredAlert(t, app, user.Id, system.Id, "CPU", 90)
|
||||
},
|
||||
expectStatus: "warn",
|
||||
expectMsgPart: "1 alert(s) triggered",
|
||||
expectDown: 0,
|
||||
expectAlerts: 1,
|
||||
expectTotal: 1,
|
||||
expectUp: 1,
|
||||
expectDownSumm: 0,
|
||||
},
|
||||
{
|
||||
name: "ok when no down systems and no alerts",
|
||||
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
|
||||
_ = createTestSystem(t, app, user.Id, "node-1", "10.2.0.1", "up")
|
||||
_ = createTestSystem(t, app, user.Id, "node-2", "10.2.0.2", "paused")
|
||||
_ = createTestSystem(t, app, user.Id, "node-3", "10.2.0.3", "pending")
|
||||
},
|
||||
expectStatus: "ok",
|
||||
expectMsgPart: "All systems operational",
|
||||
expectDown: 0,
|
||||
expectAlerts: 0,
|
||||
expectTotal: 3,
|
||||
expectUp: 1,
|
||||
expectPaused: 1,
|
||||
expectPending: 1,
|
||||
expectDownSumm: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
app := newTestHub(t)
|
||||
user := createTestUser(t, app)
|
||||
tt.setup(t, app, user)
|
||||
|
||||
type requestCapture struct {
|
||||
method string
|
||||
userAgent string
|
||||
contentType string
|
||||
payload heartbeat.Payload
|
||||
}
|
||||
|
||||
captured := make(chan requestCapture, 1)
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
defer r.Body.Close()
|
||||
body, err := io.ReadAll(r.Body)
|
||||
require.NoError(t, err)
|
||||
|
||||
var payload heartbeat.Payload
|
||||
require.NoError(t, json.Unmarshal(body, &payload))
|
||||
captured <- requestCapture{
|
||||
method: r.Method,
|
||||
userAgent: r.Header.Get("User-Agent"),
|
||||
contentType: r.Header.Get("Content-Type"),
|
||||
payload: payload,
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
hb := heartbeat.New(app.App, envGetter(map[string]string{
|
||||
"HEARTBEAT_URL": server.URL,
|
||||
"HEARTBEAT_METHOD": "POST",
|
||||
}))
|
||||
require.NotNil(t, hb)
|
||||
require.NoError(t, hb.Send())
|
||||
|
||||
req := <-captured
|
||||
assert.Equal(t, http.MethodPost, req.method)
|
||||
assert.Equal(t, "Beszel-Heartbeat", req.userAgent)
|
||||
assert.Equal(t, "application/json", req.contentType)
|
||||
|
||||
assert.Equal(t, tt.expectStatus, req.payload.Status)
|
||||
assert.Contains(t, req.payload.Msg, tt.expectMsgPart)
|
||||
assert.Equal(t, tt.expectDown, len(req.payload.Down))
|
||||
assert.Equal(t, tt.expectAlerts, len(req.payload.Alerts))
|
||||
assert.Equal(t, tt.expectTotal, req.payload.Systems.Total)
|
||||
assert.Equal(t, tt.expectUp, req.payload.Systems.Up)
|
||||
assert.Equal(t, tt.expectDownSumm, req.payload.Systems.Down)
|
||||
assert.Equal(t, tt.expectPaused, req.payload.Systems.Paused)
|
||||
assert.Equal(t, tt.expectPending, req.payload.Systems.Pending)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func newTestHub(t *testing.T) *beszeltests.TestHub {
|
||||
t.Helper()
|
||||
app, err := beszeltests.NewTestHub(t.TempDir())
|
||||
require.NoError(t, err)
|
||||
t.Cleanup(app.Cleanup)
|
||||
return app
|
||||
}
|
||||
|
||||
func createTestUser(t *testing.T, app *beszeltests.TestHub) *core.Record {
|
||||
t.Helper()
|
||||
user, err := beszeltests.CreateUser(app.App, "admin@example.com", "password123")
|
||||
require.NoError(t, err)
|
||||
return user
|
||||
}
|
||||
|
||||
func createTestSystem(t *testing.T, app *beszeltests.TestHub, userID, name, host, status string) *core.Record {
|
||||
t.Helper()
|
||||
system, err := beszeltests.CreateRecord(app.App, "systems", map[string]any{
|
||||
"name": name,
|
||||
"host": host,
|
||||
"port": "45876",
|
||||
"users": []string{userID},
|
||||
"status": status,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return system
|
||||
}
|
||||
|
||||
func createTriggeredAlert(t *testing.T, app *beszeltests.TestHub, userID, systemID, name string, threshold float64) *core.Record {
|
||||
t.Helper()
|
||||
alert, err := beszeltests.CreateRecord(app.App, "alerts", map[string]any{
|
||||
"name": name,
|
||||
"system": systemID,
|
||||
"user": userID,
|
||||
"value": threshold,
|
||||
"min": 0,
|
||||
"triggered": true,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
return alert
|
||||
}
|
||||
|
||||
func envGetter(values map[string]string) func(string) (string, bool) {
|
||||
return func(key string) (string, bool) {
|
||||
v, ok := values[key]
|
||||
return v, ok
|
||||
}
|
||||
}
|
||||
@@ -9,12 +9,14 @@ import (
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel"
|
||||
"github.com/henrygd/beszel/internal/alerts"
|
||||
"github.com/henrygd/beszel/internal/hub/config"
|
||||
"github.com/henrygd/beszel/internal/hub/heartbeat"
|
||||
"github.com/henrygd/beszel/internal/hub/systems"
|
||||
"github.com/henrygd/beszel/internal/records"
|
||||
"github.com/henrygd/beszel/internal/users"
|
||||
@@ -33,11 +35,15 @@ type Hub struct {
|
||||
um *users.UserManager
|
||||
rm *records.RecordManager
|
||||
sm *systems.SystemManager
|
||||
hb *heartbeat.Heartbeat
|
||||
hbStop chan struct{}
|
||||
pubKey string
|
||||
signer ssh.Signer
|
||||
appURL string
|
||||
}
|
||||
|
||||
var containerIDPattern = regexp.MustCompile(`^[a-fA-F0-9]{12,64}$`)
|
||||
|
||||
// NewHub creates a new Hub instance with default configuration
|
||||
func NewHub(app core.App) *Hub {
|
||||
hub := &Hub{}
|
||||
@@ -48,6 +54,10 @@ func NewHub(app core.App) *Hub {
|
||||
hub.rm = records.NewRecordManager(hub)
|
||||
hub.sm = systems.NewSystemManager(hub)
|
||||
hub.appURL, _ = GetEnv("APP_URL")
|
||||
hub.hb = heartbeat.New(app, GetEnv)
|
||||
if hub.hb != nil {
|
||||
hub.hbStop = make(chan struct{})
|
||||
}
|
||||
return hub
|
||||
}
|
||||
|
||||
@@ -88,6 +98,10 @@ func (h *Hub) StartHub() error {
|
||||
if err := h.sm.Initialize(); err != nil {
|
||||
return err
|
||||
}
|
||||
// start heartbeat if configured
|
||||
if h.hb != nil {
|
||||
go h.hb.Start(h.hbStop)
|
||||
}
|
||||
return e.Next()
|
||||
})
|
||||
|
||||
@@ -287,6 +301,9 @@ func (h *Hub) registerApiRoutes(se *core.ServeEvent) error {
|
||||
})
|
||||
// send test notification
|
||||
apiAuth.POST("/test-notification", h.SendTestNotification)
|
||||
// heartbeat status and test
|
||||
apiAuth.GET("/heartbeat-status", h.getHeartbeatStatus)
|
||||
apiAuth.POST("/test-heartbeat", h.testHeartbeat)
|
||||
// get config.yml content
|
||||
apiAuth.GET("/config-yaml", config.GetYamlConfig)
|
||||
// handle agent websocket connection
|
||||
@@ -403,6 +420,42 @@ func (h *Hub) getUniversalToken(e *core.RequestEvent) error {
|
||||
return e.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// getHeartbeatStatus returns current heartbeat configuration and whether it's enabled
|
||||
func (h *Hub) getHeartbeatStatus(e *core.RequestEvent) error {
|
||||
if e.Auth.GetString("role") != "admin" {
|
||||
return e.ForbiddenError("Requires admin role", nil)
|
||||
}
|
||||
if h.hb == nil {
|
||||
return e.JSON(http.StatusOK, map[string]any{
|
||||
"enabled": false,
|
||||
"msg": "Set HEARTBEAT_URL to enable outbound heartbeat monitoring",
|
||||
})
|
||||
}
|
||||
cfg := h.hb.GetConfig()
|
||||
return e.JSON(http.StatusOK, map[string]any{
|
||||
"enabled": true,
|
||||
"url": cfg.URL,
|
||||
"interval": cfg.Interval,
|
||||
"method": cfg.Method,
|
||||
})
|
||||
}
|
||||
|
||||
// testHeartbeat triggers a single heartbeat ping and returns the result
|
||||
func (h *Hub) testHeartbeat(e *core.RequestEvent) error {
|
||||
if e.Auth.GetString("role") != "admin" {
|
||||
return e.ForbiddenError("Requires admin role", nil)
|
||||
}
|
||||
if h.hb == nil {
|
||||
return e.JSON(http.StatusOK, map[string]any{
|
||||
"err": "Heartbeat not configured. Set HEARTBEAT_URL environment variable.",
|
||||
})
|
||||
}
|
||||
if err := h.hb.Send(); err != nil {
|
||||
return e.JSON(http.StatusOK, map[string]any{"err": err.Error()})
|
||||
}
|
||||
return e.JSON(http.StatusOK, map[string]any{"err": false})
|
||||
}
|
||||
|
||||
// containerRequestHandler handles both container logs and info requests
|
||||
func (h *Hub) containerRequestHandler(e *core.RequestEvent, fetchFunc func(*systems.System, string) (string, error), responseKey string) error {
|
||||
systemID := e.Request.URL.Query().Get("system")
|
||||
@@ -411,6 +464,9 @@ func (h *Hub) containerRequestHandler(e *core.RequestEvent, fetchFunc func(*syst
|
||||
if systemID == "" || containerID == "" {
|
||||
return e.JSON(http.StatusBadRequest, map[string]string{"error": "system and container parameters are required"})
|
||||
}
|
||||
if !containerIDPattern.MatchString(containerID) {
|
||||
return e.JSON(http.StatusBadRequest, map[string]string{"error": "invalid container parameter"})
|
||||
}
|
||||
|
||||
system, err := h.sm.GetSystem(systemID)
|
||||
if err != nil {
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package hub_test
|
||||
|
||||
@@ -362,6 +361,58 @@ func TestApiRoutesAuthentication(t *testing.T) {
|
||||
ExpectedContent: []string{"test-system"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "GET /heartbeat-status - no auth should fail",
|
||||
Method: http.MethodGet,
|
||||
URL: "/api/beszel/heartbeat-status",
|
||||
ExpectedStatus: 401,
|
||||
ExpectedContent: []string{"requires valid"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "GET /heartbeat-status - with user auth should fail",
|
||||
Method: http.MethodGet,
|
||||
URL: "/api/beszel/heartbeat-status",
|
||||
Headers: map[string]string{
|
||||
"Authorization": userToken,
|
||||
},
|
||||
ExpectedStatus: 403,
|
||||
ExpectedContent: []string{"Requires admin role"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "GET /heartbeat-status - with admin auth should succeed",
|
||||
Method: http.MethodGet,
|
||||
URL: "/api/beszel/heartbeat-status",
|
||||
Headers: map[string]string{
|
||||
"Authorization": adminUserToken,
|
||||
},
|
||||
ExpectedStatus: 200,
|
||||
ExpectedContent: []string{`"enabled":false`},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "POST /test-heartbeat - with user auth should fail",
|
||||
Method: http.MethodPost,
|
||||
URL: "/api/beszel/test-heartbeat",
|
||||
Headers: map[string]string{
|
||||
"Authorization": userToken,
|
||||
},
|
||||
ExpectedStatus: 403,
|
||||
ExpectedContent: []string{"Requires admin role"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "POST /test-heartbeat - with admin auth should report disabled state",
|
||||
Method: http.MethodPost,
|
||||
URL: "/api/beszel/test-heartbeat",
|
||||
Headers: map[string]string{
|
||||
"Authorization": adminUserToken,
|
||||
},
|
||||
ExpectedStatus: 200,
|
||||
ExpectedContent: []string{"Heartbeat not configured"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "GET /universal-token - no auth should fail",
|
||||
Method: http.MethodGet,
|
||||
@@ -493,7 +544,7 @@ func TestApiRoutesAuthentication(t *testing.T) {
|
||||
{
|
||||
Name: "GET /containers/logs - with auth but invalid system should fail",
|
||||
Method: http.MethodGet,
|
||||
URL: "/api/beszel/containers/logs?system=invalid-system&container=test-container",
|
||||
URL: "/api/beszel/containers/logs?system=invalid-system&container=0123456789ab",
|
||||
Headers: map[string]string{
|
||||
"Authorization": userToken,
|
||||
},
|
||||
@@ -501,6 +552,39 @@ func TestApiRoutesAuthentication(t *testing.T) {
|
||||
ExpectedContent: []string{"system not found"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "GET /containers/logs - traversal container should fail validation",
|
||||
Method: http.MethodGet,
|
||||
URL: "/api/beszel/containers/logs?system=" + system.Id + "&container=..%2F..%2Fversion",
|
||||
Headers: map[string]string{
|
||||
"Authorization": userToken,
|
||||
},
|
||||
ExpectedStatus: 400,
|
||||
ExpectedContent: []string{"invalid container parameter"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "GET /containers/info - traversal container should fail validation",
|
||||
Method: http.MethodGet,
|
||||
URL: "/api/beszel/containers/info?system=" + system.Id + "&container=../../version?x=",
|
||||
Headers: map[string]string{
|
||||
"Authorization": userToken,
|
||||
},
|
||||
ExpectedStatus: 400,
|
||||
ExpectedContent: []string{"invalid container parameter"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
{
|
||||
Name: "GET /containers/info - non-hex container should fail validation",
|
||||
Method: http.MethodGet,
|
||||
URL: "/api/beszel/containers/info?system=" + system.Id + "&container=container_name",
|
||||
Headers: map[string]string{
|
||||
"Authorization": userToken,
|
||||
},
|
||||
ExpectedStatus: 400,
|
||||
ExpectedContent: []string{"invalid container parameter"},
|
||||
TestAppFactory: testAppFactory,
|
||||
},
|
||||
|
||||
// Auth Optional Routes - Should work without authentication
|
||||
{
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package hub
|
||||
|
||||
|
||||
@@ -48,7 +48,6 @@ type System struct {
|
||||
detailsFetched atomic.Bool // True if static system details have been fetched and saved
|
||||
smartFetching atomic.Bool // True if SMART devices are currently being fetched
|
||||
smartInterval time.Duration // Interval for periodic SMART data updates
|
||||
lastSmartFetch atomic.Int64 // Unix milliseconds of last SMART data fetch
|
||||
}
|
||||
|
||||
func (sm *SystemManager) NewSystem(systemId string) *System {
|
||||
@@ -134,19 +133,34 @@ func (sys *System) update() error {
|
||||
return err
|
||||
}
|
||||
|
||||
// ensure deprecated fields from older agents are migrated to current fields
|
||||
migrateDeprecatedFields(data, !sys.detailsFetched.Load())
|
||||
|
||||
// create system records
|
||||
_, err = sys.createRecords(data)
|
||||
|
||||
// if details were included and fetched successfully, mark details as fetched and update smart interval if set by agent
|
||||
if err == nil && data.Details != nil {
|
||||
sys.detailsFetched.Store(true)
|
||||
// update smart interval if it's set on the agent side
|
||||
if data.Details.SmartInterval > 0 {
|
||||
sys.smartInterval = data.Details.SmartInterval
|
||||
// make sure we reset expiration of lastFetch to remain as long as the new smart interval
|
||||
// to prevent premature expiration leading to new fetch if interval is different.
|
||||
sys.manager.smartFetchMap.UpdateExpiration(sys.Id, sys.smartInterval+time.Minute)
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch and save SMART devices when system first comes online or at intervals
|
||||
if backgroundSmartFetchEnabled() {
|
||||
if backgroundSmartFetchEnabled() && sys.detailsFetched.Load() {
|
||||
if sys.smartInterval <= 0 {
|
||||
sys.smartInterval = time.Hour
|
||||
}
|
||||
lastFetch := sys.lastSmartFetch.Load()
|
||||
if time.Since(time.UnixMilli(lastFetch)) >= sys.smartInterval && sys.smartFetching.CompareAndSwap(false, true) {
|
||||
lastFetch, _ := sys.manager.smartFetchMap.GetOk(sys.Id)
|
||||
if time.Since(time.UnixMilli(lastFetch-1e4)) >= sys.smartInterval && sys.smartFetching.CompareAndSwap(false, true) {
|
||||
go func() {
|
||||
defer sys.smartFetching.Store(false)
|
||||
sys.lastSmartFetch.Store(time.Now().UnixMilli())
|
||||
sys.manager.smartFetchMap.Set(sys.Id, time.Now().UnixMilli(), sys.smartInterval+time.Minute)
|
||||
_ = sys.FetchAndSaveSmartDevices()
|
||||
}()
|
||||
}
|
||||
@@ -221,11 +235,6 @@ func (sys *System) createRecords(data *system.CombinedData) (*core.Record, error
|
||||
if err := createSystemDetailsRecord(txApp, data.Details, sys.Id); err != nil {
|
||||
return err
|
||||
}
|
||||
sys.detailsFetched.Store(true)
|
||||
// update smart interval if it's set on the agent side
|
||||
if data.Details.SmartInterval > 0 {
|
||||
sys.smartInterval = data.Details.SmartInterval
|
||||
}
|
||||
}
|
||||
|
||||
// update system record (do this last because it triggers alerts and we need above records to be inserted first)
|
||||
@@ -703,3 +712,50 @@ func getJitter() <-chan time.Time {
|
||||
msDelay := (interval * minPercent / 100) + rand.Intn(interval*jitterRange/100)
|
||||
return time.After(time.Duration(msDelay) * time.Millisecond)
|
||||
}
|
||||
|
||||
// migrateDeprecatedFields moves values from deprecated fields to their new locations if the new
|
||||
// fields are not already populated. Deprecated fields and refs may be removed at least 30 days
|
||||
// and one minor version release after the release that includes the migration.
|
||||
//
|
||||
// This is run when processing incoming system data from agents, which may be on older versions.
|
||||
func migrateDeprecatedFields(cd *system.CombinedData, createDetails bool) {
|
||||
// migration added 0.19.0
|
||||
if cd.Stats.Bandwidth[0] == 0 && cd.Stats.Bandwidth[1] == 0 {
|
||||
cd.Stats.Bandwidth[0] = uint64(cd.Stats.NetworkSent * 1024 * 1024)
|
||||
cd.Stats.Bandwidth[1] = uint64(cd.Stats.NetworkRecv * 1024 * 1024)
|
||||
cd.Stats.NetworkSent, cd.Stats.NetworkRecv = 0, 0
|
||||
}
|
||||
// migration added 0.19.0
|
||||
if cd.Info.BandwidthBytes == 0 {
|
||||
cd.Info.BandwidthBytes = uint64(cd.Info.Bandwidth * 1024 * 1024)
|
||||
cd.Info.Bandwidth = 0
|
||||
}
|
||||
// migration added 0.19.0
|
||||
if cd.Stats.DiskIO[0] == 0 && cd.Stats.DiskIO[1] == 0 {
|
||||
cd.Stats.DiskIO[0] = uint64(cd.Stats.DiskReadPs * 1024 * 1024)
|
||||
cd.Stats.DiskIO[1] = uint64(cd.Stats.DiskWritePs * 1024 * 1024)
|
||||
cd.Stats.DiskReadPs, cd.Stats.DiskWritePs = 0, 0
|
||||
}
|
||||
// migration added 0.19.0 - Move deprecated Info fields to Details struct
|
||||
if cd.Details == nil && cd.Info.Hostname != "" {
|
||||
if createDetails {
|
||||
cd.Details = &system.Details{
|
||||
Hostname: cd.Info.Hostname,
|
||||
Kernel: cd.Info.KernelVersion,
|
||||
Cores: cd.Info.Cores,
|
||||
Threads: cd.Info.Threads,
|
||||
CpuModel: cd.Info.CpuModel,
|
||||
Podman: cd.Info.Podman,
|
||||
Os: cd.Info.Os,
|
||||
MemoryTotal: uint64(cd.Stats.Mem * 1024 * 1024 * 1024),
|
||||
}
|
||||
}
|
||||
// zero the deprecated fields to prevent saving them in systems.info DB json payload
|
||||
cd.Info.Hostname = ""
|
||||
cd.Info.KernelVersion = ""
|
||||
cd.Info.Cores = 0
|
||||
cd.Info.CpuModel = ""
|
||||
cd.Info.Podman = false
|
||||
cd.Info.Os = 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"github.com/henrygd/beszel/internal/hub/ws"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
"github.com/henrygd/beszel/internal/hub/expirymap"
|
||||
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
|
||||
@@ -40,9 +41,10 @@ var errSystemExists = errors.New("system exists")
|
||||
// SystemManager manages a collection of monitored systems and their connections.
|
||||
// It handles system lifecycle, status updates, and maintains both SSH and WebSocket connections.
|
||||
type SystemManager struct {
|
||||
hub hubLike // Hub interface for database and alert operations
|
||||
systems *store.Store[string, *System] // Thread-safe store of active systems
|
||||
sshConfig *ssh.ClientConfig // SSH client configuration for system connections
|
||||
hub hubLike // Hub interface for database and alert operations
|
||||
systems *store.Store[string, *System] // Thread-safe store of active systems
|
||||
sshConfig *ssh.ClientConfig // SSH client configuration for system connections
|
||||
smartFetchMap *expirymap.ExpiryMap[int64] // Stores last SMART fetch time per system ID
|
||||
}
|
||||
|
||||
// hubLike defines the interface requirements for the hub dependency.
|
||||
@@ -58,8 +60,9 @@ type hubLike interface {
|
||||
// The hub must implement the hubLike interface to provide database and alert functionality.
|
||||
func NewSystemManager(hub hubLike) *SystemManager {
|
||||
return &SystemManager{
|
||||
systems: store.New(map[string]*System{}),
|
||||
hub: hub,
|
||||
systems: store.New(map[string]*System{}),
|
||||
hub: hub,
|
||||
smartFetchMap: expirymap.New[int64](time.Hour),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
159
internal/hub/systems/system_test.go
Normal file
159
internal/hub/systems/system_test.go
Normal file
@@ -0,0 +1,159 @@
|
||||
//go:build testing
|
||||
|
||||
package systems
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
)
|
||||
|
||||
func TestCombinedData_MigrateDeprecatedFields(t *testing.T) {
|
||||
t.Run("Migrate NetworkSent and NetworkRecv to Bandwidth", func(t *testing.T) {
|
||||
cd := &system.CombinedData{
|
||||
Stats: system.Stats{
|
||||
NetworkSent: 1.5, // 1.5 MB
|
||||
NetworkRecv: 2.5, // 2.5 MB
|
||||
},
|
||||
}
|
||||
migrateDeprecatedFields(cd, true)
|
||||
|
||||
expectedSent := uint64(1.5 * 1024 * 1024)
|
||||
expectedRecv := uint64(2.5 * 1024 * 1024)
|
||||
|
||||
if cd.Stats.Bandwidth[0] != expectedSent {
|
||||
t.Errorf("expected Bandwidth[0] %d, got %d", expectedSent, cd.Stats.Bandwidth[0])
|
||||
}
|
||||
if cd.Stats.Bandwidth[1] != expectedRecv {
|
||||
t.Errorf("expected Bandwidth[1] %d, got %d", expectedRecv, cd.Stats.Bandwidth[1])
|
||||
}
|
||||
if cd.Stats.NetworkSent != 0 || cd.Stats.NetworkRecv != 0 {
|
||||
t.Errorf("expected NetworkSent and NetworkRecv to be reset, got %f, %f", cd.Stats.NetworkSent, cd.Stats.NetworkRecv)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Migrate Info.Bandwidth to Info.BandwidthBytes", func(t *testing.T) {
|
||||
cd := &system.CombinedData{
|
||||
Info: system.Info{
|
||||
Bandwidth: 10.0, // 10 MB
|
||||
},
|
||||
}
|
||||
migrateDeprecatedFields(cd, true)
|
||||
|
||||
expected := uint64(10 * 1024 * 1024)
|
||||
if cd.Info.BandwidthBytes != expected {
|
||||
t.Errorf("expected BandwidthBytes %d, got %d", expected, cd.Info.BandwidthBytes)
|
||||
}
|
||||
if cd.Info.Bandwidth != 0 {
|
||||
t.Errorf("expected Info.Bandwidth to be reset, got %f", cd.Info.Bandwidth)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Migrate DiskReadPs and DiskWritePs to DiskIO", func(t *testing.T) {
|
||||
cd := &system.CombinedData{
|
||||
Stats: system.Stats{
|
||||
DiskReadPs: 3.0, // 3 MB
|
||||
DiskWritePs: 4.0, // 4 MB
|
||||
},
|
||||
}
|
||||
migrateDeprecatedFields(cd, true)
|
||||
|
||||
expectedRead := uint64(3 * 1024 * 1024)
|
||||
expectedWrite := uint64(4 * 1024 * 1024)
|
||||
|
||||
if cd.Stats.DiskIO[0] != expectedRead {
|
||||
t.Errorf("expected DiskIO[0] %d, got %d", expectedRead, cd.Stats.DiskIO[0])
|
||||
}
|
||||
if cd.Stats.DiskIO[1] != expectedWrite {
|
||||
t.Errorf("expected DiskIO[1] %d, got %d", expectedWrite, cd.Stats.DiskIO[1])
|
||||
}
|
||||
if cd.Stats.DiskReadPs != 0 || cd.Stats.DiskWritePs != 0 {
|
||||
t.Errorf("expected DiskReadPs and DiskWritePs to be reset, got %f, %f", cd.Stats.DiskReadPs, cd.Stats.DiskWritePs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Migrate Info fields to Details struct", func(t *testing.T) {
|
||||
cd := &system.CombinedData{
|
||||
Stats: system.Stats{
|
||||
Mem: 16.0, // 16 GB
|
||||
},
|
||||
Info: system.Info{
|
||||
Hostname: "test-host",
|
||||
KernelVersion: "6.8.0",
|
||||
Cores: 8,
|
||||
Threads: 16,
|
||||
CpuModel: "Intel i7",
|
||||
Podman: true,
|
||||
Os: system.Linux,
|
||||
},
|
||||
}
|
||||
migrateDeprecatedFields(cd, true)
|
||||
|
||||
if cd.Details == nil {
|
||||
t.Fatal("expected Details struct to be created")
|
||||
}
|
||||
if cd.Details.Hostname != "test-host" {
|
||||
t.Errorf("expected Hostname 'test-host', got '%s'", cd.Details.Hostname)
|
||||
}
|
||||
if cd.Details.Kernel != "6.8.0" {
|
||||
t.Errorf("expected Kernel '6.8.0', got '%s'", cd.Details.Kernel)
|
||||
}
|
||||
if cd.Details.Cores != 8 {
|
||||
t.Errorf("expected Cores 8, got %d", cd.Details.Cores)
|
||||
}
|
||||
if cd.Details.Threads != 16 {
|
||||
t.Errorf("expected Threads 16, got %d", cd.Details.Threads)
|
||||
}
|
||||
if cd.Details.CpuModel != "Intel i7" {
|
||||
t.Errorf("expected CpuModel 'Intel i7', got '%s'", cd.Details.CpuModel)
|
||||
}
|
||||
if cd.Details.Podman != true {
|
||||
t.Errorf("expected Podman true, got %v", cd.Details.Podman)
|
||||
}
|
||||
if cd.Details.Os != system.Linux {
|
||||
t.Errorf("expected Os Linux, got %d", cd.Details.Os)
|
||||
}
|
||||
expectedMem := uint64(16 * 1024 * 1024 * 1024)
|
||||
if cd.Details.MemoryTotal != expectedMem {
|
||||
t.Errorf("expected MemoryTotal %d, got %d", expectedMem, cd.Details.MemoryTotal)
|
||||
}
|
||||
|
||||
if cd.Info.Hostname != "" || cd.Info.KernelVersion != "" || cd.Info.Cores != 0 || cd.Info.CpuModel != "" || cd.Info.Podman != false || cd.Info.Os != 0 {
|
||||
t.Errorf("expected Info fields to be reset, got %+v", cd.Info)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Do not migrate if Details already exists", func(t *testing.T) {
|
||||
cd := &system.CombinedData{
|
||||
Details: &system.Details{Hostname: "existing-host"},
|
||||
Info: system.Info{
|
||||
Hostname: "deprecated-host",
|
||||
},
|
||||
}
|
||||
migrateDeprecatedFields(cd, true)
|
||||
|
||||
if cd.Details.Hostname != "existing-host" {
|
||||
t.Errorf("expected Hostname 'existing-host', got '%s'", cd.Details.Hostname)
|
||||
}
|
||||
if cd.Info.Hostname != "deprecated-host" {
|
||||
t.Errorf("expected Info.Hostname to remain 'deprecated-host', got '%s'", cd.Info.Hostname)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Do not create details if migrateDetails is false", func(t *testing.T) {
|
||||
cd := &system.CombinedData{
|
||||
Info: system.Info{
|
||||
Hostname: "deprecated-host",
|
||||
},
|
||||
}
|
||||
migrateDeprecatedFields(cd, false)
|
||||
|
||||
if cd.Details != nil {
|
||||
t.Fatal("expected Details struct to not be created")
|
||||
}
|
||||
|
||||
if cd.Info.Hostname != "" {
|
||||
t.Errorf("expected Info.Hostname to be reset, got '%s'", cd.Info.Hostname)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build !testing
|
||||
// +build !testing
|
||||
|
||||
package systems
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package systems_test
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package systems
|
||||
|
||||
@@ -114,4 +113,5 @@ func (sm *SystemManager) RemoveAllSystems() {
|
||||
for _, system := range sm.systems.GetAll() {
|
||||
sm.RemoveSystem(system.Id)
|
||||
}
|
||||
sm.smartFetchMap.StopCleaner()
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package ws
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package ws
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package ws
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package records_test
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package records
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"$schema": "https://biomejs.dev/schemas/2.2.3/schema.json",
|
||||
"$schema": "https://biomejs.dev/schemas/2.2.4/schema.json",
|
||||
"vcs": {
|
||||
"enabled": true,
|
||||
"clientKind": "git",
|
||||
@@ -12,7 +12,7 @@
|
||||
"lineWidth": 120,
|
||||
"formatWithErrors": true
|
||||
},
|
||||
"assist": { "actions": { "source": { "organizeImports": "on" } } },
|
||||
"assist": { "actions": { "source": { "organizeImports": "off" } } },
|
||||
"linter": {
|
||||
"enabled": true,
|
||||
"rules": {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user