mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-21 21:26:16 +01:00
SMART: add eMMC health via sysfs (#1736)
* SMART: add eMMC health via sysfs Read eMMC wear/EOL indicators from /sys/class/block/mmcblk*/device and expose in SMART device list. Includes mocked sysfs tests and UI tweaks for unknown temps. * small optimizations for emmc scan and parsing * smart: keep smartctl optional only for Linux hosts with eMMC * update smart alerts to handle warning state * refactor: rename binPath to smartctlPath and replace hasSmartctl with smartctlPath checks --------- Co-authored-by: henrygd <hank@henrygd.me>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -10,6 +10,7 @@ dist
|
|||||||
*.exe
|
*.exe
|
||||||
internal/cmd/hub/hub
|
internal/cmd/hub/hub
|
||||||
internal/cmd/agent/agent
|
internal/cmd/agent/agent
|
||||||
|
agent.test
|
||||||
node_modules
|
node_modules
|
||||||
build
|
build
|
||||||
*timestamp*
|
*timestamp*
|
||||||
|
|||||||
95
agent/emmc_common.go
Normal file
95
agent/emmc_common.go
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// isEmmcBlockName reports whether name is a whole-device MMC block name:
// the literal prefix "mmcblk" followed by one or more ASCII digits.
// Partition names such as "mmcblk0p1" and the bare prefix are rejected.
func isEmmcBlockName(name string) bool {
	const prefix = "mmcblk"
	if !strings.HasPrefix(name, prefix) || len(name) == len(prefix) {
		return false
	}
	// Everything after the prefix must be a decimal digit.
	index := name[len(prefix):]
	notDigit := func(r rune) bool { return r < '0' || r > '9' }
	return strings.IndexFunc(index, notDigit) == -1
}
|
||||||
|
|
||||||
|
// parseHexOrDecByte parses s as a single unsigned byte. Surrounding
// whitespace is ignored. A leading "0x"/"0X" selects hexadecimal; otherwise
// the text is read as decimal. The boolean is false when the input is empty,
// malformed, or does not fit in 8 bits.
func parseHexOrDecByte(s string) (uint8, bool) {
	text := strings.TrimSpace(s)
	if text == "" {
		return 0, false
	}

	radix := 10
	if len(text) >= 2 && text[0] == '0' && (text[1] == 'x' || text[1] == 'X') {
		// Strip the prefix; ParseUint with an explicit base does not accept it.
		text, radix = text[2:], 16
	}

	value, err := strconv.ParseUint(text, radix, 8)
	if err != nil {
		return 0, false
	}
	return uint8(value), true
}
|
||||||
|
|
||||||
|
func parseHexBytePair(s string) (uint8, uint8, bool) {
|
||||||
|
fields := strings.Fields(s)
|
||||||
|
if len(fields) < 2 {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
a, okA := parseHexOrDecByte(fields[0])
|
||||||
|
b, okB := parseHexOrDecByte(fields[1])
|
||||||
|
if !okA && !okB {
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
return a, b, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// emmcSmartStatus maps an eMMC pre-EOL value to a SMART-style status string:
// 0x01 -> "PASSED", 0x02 -> "WARNING", 0x03 -> "FAILED"; any other value
// yields "UNKNOWN".
func emmcSmartStatus(preEOL uint8) string {
	statuses := [...]string{0x01: "PASSED", 0x02: "WARNING", 0x03: "FAILED"}
	if preEOL >= 0x01 && int(preEOL) < len(statuses) {
		return statuses[preEOL]
	}
	return "UNKNOWN"
}
|
||||||
|
|
||||||
|
// emmcPreEOLString renders a pre-EOL value as two-digit hex. The values
// 0x01, 0x02 and 0x03 additionally carry the labels "normal", "warning" and
// "urgent" in parentheses; other values are shown as bare hex.
func emmcPreEOLString(preEOL uint8) string {
	var label string
	switch preEOL {
	case 0x01:
		label = "normal"
	case 0x02:
		label = "warning"
	case 0x03:
		label = "urgent"
	}
	if label == "" {
		return fmt.Sprintf("0x%02x", preEOL)
	}
	return fmt.Sprintf("0x%02x (%s)", preEOL, label)
}
|
||||||
|
|
||||||
|
// emmcLifeTimeString renders a device-life-time estimate byte as hex plus a
// human-readable range. Per the JEDEC eMMC encoding, 0x01..0x0A describe
// 10%-wide usage bands from 0-10% up to 90-100%, and 0x0B means the estimate
// has been exceeded. Zero is shown as "not reported"; anything above 0x0B is
// shown as bare hex.
func emmcLifeTimeString(v uint8) string {
	if v == 0 {
		return "0x00 (not reported)"
	}
	if v <= 0x0A {
		// Band n covers (n-1)*10 .. n*10 percent.
		upper := int(v) * 10
		return fmt.Sprintf("0x%02x (%d-%d%% used)", v, upper-10, upper)
	}
	if v == 0x0B {
		return "0x0b (>100% used)"
	}
	return fmt.Sprintf("0x%02x", v)
}
|
||||||
78
agent/emmc_common_test.go
Normal file
78
agent/emmc_common_test.go
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
package agent
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestParseHexOrDecByte(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
in string
|
||||||
|
want uint8
|
||||||
|
ok bool
|
||||||
|
}{
|
||||||
|
{"0x01", 1, true},
|
||||||
|
{"0X0b", 11, true},
|
||||||
|
{"01", 1, true},
|
||||||
|
{" 3 ", 3, true},
|
||||||
|
{"", 0, false},
|
||||||
|
{"0x", 0, false},
|
||||||
|
{"nope", 0, false},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
got, ok := parseHexOrDecByte(tt.in)
|
||||||
|
if ok != tt.ok || got != tt.want {
|
||||||
|
t.Fatalf("parseHexOrDecByte(%q) = (%d,%v), want (%d,%v)", tt.in, got, ok, tt.want, tt.ok)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseHexBytePair(t *testing.T) {
|
||||||
|
a, b, ok := parseHexBytePair("0x01 0x02\n")
|
||||||
|
if !ok || a != 1 || b != 2 {
|
||||||
|
t.Fatalf("parseHexBytePair hex = (%d,%d,%v), want (1,2,true)", a, b, ok)
|
||||||
|
}
|
||||||
|
|
||||||
|
a, b, ok = parseHexBytePair("01 02")
|
||||||
|
if !ok || a != 1 || b != 2 {
|
||||||
|
t.Fatalf("parseHexBytePair dec = (%d,%d,%v), want (1,2,true)", a, b, ok)
|
||||||
|
}
|
||||||
|
|
||||||
|
_, _, ok = parseHexBytePair("0x01")
|
||||||
|
if ok {
|
||||||
|
t.Fatalf("parseHexBytePair short input ok=true, want false")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEmmcSmartStatus(t *testing.T) {
|
||||||
|
if got := emmcSmartStatus(0x01); got != "PASSED" {
|
||||||
|
t.Fatalf("emmcSmartStatus(0x01) = %q, want PASSED", got)
|
||||||
|
}
|
||||||
|
if got := emmcSmartStatus(0x02); got != "WARNING" {
|
||||||
|
t.Fatalf("emmcSmartStatus(0x02) = %q, want WARNING", got)
|
||||||
|
}
|
||||||
|
if got := emmcSmartStatus(0x03); got != "FAILED" {
|
||||||
|
t.Fatalf("emmcSmartStatus(0x03) = %q, want FAILED", got)
|
||||||
|
}
|
||||||
|
if got := emmcSmartStatus(0x00); got != "UNKNOWN" {
|
||||||
|
t.Fatalf("emmcSmartStatus(0x00) = %q, want UNKNOWN", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIsEmmcBlockName(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
ok bool
|
||||||
|
}{
|
||||||
|
{"mmcblk0", true},
|
||||||
|
{"mmcblk1", true},
|
||||||
|
{"mmcblk10", true},
|
||||||
|
{"mmcblk0p1", false},
|
||||||
|
{"sda", false},
|
||||||
|
{"mmcblk", false},
|
||||||
|
{"mmcblkA", false},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
if got := isEmmcBlockName(c.name); got != c.ok {
|
||||||
|
t.Fatalf("isEmmcBlockName(%q) = %v, want %v", c.name, got, c.ok)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
227
agent/emmc_linux.go
Normal file
227
agent/emmc_linux.go
Normal file
@@ -0,0 +1,227 @@
|
|||||||
|
//go:build linux
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/entities/smart"
|
||||||
|
)
|
||||||
|
|
||||||
|
// emmcSysfsRoot is a test hook; production value is "/sys".
|
||||||
|
var emmcSysfsRoot = "/sys"
|
||||||
|
|
||||||
|
// emmcHealth is the set of identity and wear values read from sysfs for one
// eMMC block device.
type emmcHealth struct {
	// Identity strings read from the device directory ("name", "serial", "prv").
	model    string
	serial   string
	revision string

	// capacity is the device size in bytes; zero when it could not be read.
	capacity uint64

	// preEOL is the raw pre_eol_info byte.
	preEOL uint8
	// lifeA and lifeB are the raw life-time estimate bytes (type A and type B).
	lifeA, lifeB uint8
}
|
||||||
|
|
||||||
|
func scanEmmcDevices() []*DeviceInfo {
|
||||||
|
blockDir := filepath.Join(emmcSysfsRoot, "class", "block")
|
||||||
|
entries, err := os.ReadDir(blockDir)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
devices := make([]*DeviceInfo, 0, 2)
|
||||||
|
for _, ent := range entries {
|
||||||
|
name := ent.Name()
|
||||||
|
if !isEmmcBlockName(name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
deviceDir := filepath.Join(blockDir, name, "device")
|
||||||
|
if !hasEmmcHealthFiles(deviceDir) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
devPath := filepath.Join("/dev", name)
|
||||||
|
devices = append(devices, &DeviceInfo{
|
||||||
|
Name: devPath,
|
||||||
|
Type: "emmc",
|
||||||
|
InfoName: devPath + " [eMMC]",
|
||||||
|
Protocol: "MMC",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return devices
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sm *SmartManager) collectEmmcHealth(deviceInfo *DeviceInfo) (bool, error) {
|
||||||
|
if deviceInfo == nil || deviceInfo.Name == "" {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
base := filepath.Base(deviceInfo.Name)
|
||||||
|
if !isEmmcBlockName(base) && !strings.EqualFold(deviceInfo.Type, "emmc") && !strings.EqualFold(deviceInfo.Type, "mmc") {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
health, ok := readEmmcHealth(base)
|
||||||
|
if !ok {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalize the device type to keep pruning logic stable across refreshes.
|
||||||
|
deviceInfo.Type = "emmc"
|
||||||
|
|
||||||
|
key := health.serial
|
||||||
|
if key == "" {
|
||||||
|
key = filepath.Join("/dev", base)
|
||||||
|
}
|
||||||
|
|
||||||
|
status := emmcSmartStatus(health.preEOL)
|
||||||
|
|
||||||
|
attrs := []*smart.SmartAttribute{
|
||||||
|
{
|
||||||
|
Name: "PreEOLInfo",
|
||||||
|
RawValue: uint64(health.preEOL),
|
||||||
|
RawString: emmcPreEOLString(health.preEOL),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "DeviceLifeTimeEstA",
|
||||||
|
RawValue: uint64(health.lifeA),
|
||||||
|
RawString: emmcLifeTimeString(health.lifeA),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "DeviceLifeTimeEstB",
|
||||||
|
RawValue: uint64(health.lifeB),
|
||||||
|
RawString: emmcLifeTimeString(health.lifeB),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
sm.Lock()
|
||||||
|
defer sm.Unlock()
|
||||||
|
|
||||||
|
if _, exists := sm.SmartDataMap[key]; !exists {
|
||||||
|
sm.SmartDataMap[key] = &smart.SmartData{}
|
||||||
|
}
|
||||||
|
|
||||||
|
data := sm.SmartDataMap[key]
|
||||||
|
data.ModelName = health.model
|
||||||
|
data.SerialNumber = health.serial
|
||||||
|
data.FirmwareVersion = health.revision
|
||||||
|
data.Capacity = health.capacity
|
||||||
|
data.Temperature = 0
|
||||||
|
data.SmartStatus = status
|
||||||
|
data.DiskName = filepath.Join("/dev", base)
|
||||||
|
data.DiskType = "emmc"
|
||||||
|
data.Attributes = attrs
|
||||||
|
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readEmmcHealth(blockName string) (emmcHealth, bool) {
|
||||||
|
var out emmcHealth
|
||||||
|
|
||||||
|
if !isEmmcBlockName(blockName) {
|
||||||
|
return out, false
|
||||||
|
}
|
||||||
|
|
||||||
|
deviceDir := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "device")
|
||||||
|
preEOL, okPre := readHexByteFile(filepath.Join(deviceDir, "pre_eol_info"))
|
||||||
|
|
||||||
|
// Some kernels expose EXT_CSD lifetime via "life_time" (two bytes), others as
|
||||||
|
// separate files. Support both.
|
||||||
|
lifeA, lifeB, okLife := readLifeTime(deviceDir)
|
||||||
|
|
||||||
|
if !okPre && !okLife {
|
||||||
|
return out, false
|
||||||
|
}
|
||||||
|
|
||||||
|
out.preEOL = preEOL
|
||||||
|
out.lifeA = lifeA
|
||||||
|
out.lifeB = lifeB
|
||||||
|
|
||||||
|
out.model = readStringFile(filepath.Join(deviceDir, "name"))
|
||||||
|
out.serial = readStringFile(filepath.Join(deviceDir, "serial"))
|
||||||
|
out.revision = readStringFile(filepath.Join(deviceDir, "prv"))
|
||||||
|
|
||||||
|
if capBytes, ok := readBlockCapacityBytes(blockName); ok {
|
||||||
|
out.capacity = capBytes
|
||||||
|
}
|
||||||
|
|
||||||
|
return out, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func readLifeTime(deviceDir string) (uint8, uint8, bool) {
|
||||||
|
if content, ok := readStringFileOK(filepath.Join(deviceDir, "life_time")); ok {
|
||||||
|
a, b, ok := parseHexBytePair(content)
|
||||||
|
return a, b, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
a, okA := readHexByteFile(filepath.Join(deviceDir, "device_life_time_est_typ_a"))
|
||||||
|
b, okB := readHexByteFile(filepath.Join(deviceDir, "device_life_time_est_typ_b"))
|
||||||
|
if okA || okB {
|
||||||
|
return a, b, true
|
||||||
|
}
|
||||||
|
return 0, 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func readBlockCapacityBytes(blockName string) (uint64, bool) {
|
||||||
|
sizePath := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "size")
|
||||||
|
lbsPath := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "queue", "logical_block_size")
|
||||||
|
|
||||||
|
sizeStr, ok := readStringFileOK(sizePath)
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
sectors, err := strconv.ParseUint(sizeStr, 10, 64)
|
||||||
|
if err != nil || sectors == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
lbsStr, ok := readStringFileOK(lbsPath)
|
||||||
|
logicalBlockSize := uint64(512)
|
||||||
|
if ok {
|
||||||
|
if parsed, err := strconv.ParseUint(lbsStr, 10, 64); err == nil && parsed > 0 {
|
||||||
|
logicalBlockSize = parsed
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sectors * logicalBlockSize, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func readHexByteFile(path string) (uint8, bool) {
|
||||||
|
content, ok := readStringFileOK(path)
|
||||||
|
if !ok {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
b, ok := parseHexOrDecByte(content)
|
||||||
|
return b, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func readStringFile(path string) string {
|
||||||
|
content, _ := readStringFileOK(path)
|
||||||
|
return content
|
||||||
|
}
|
||||||
|
|
||||||
|
// readStringFileOK reads the whole file at path and returns its content with
// leading and trailing whitespace removed. The boolean is false, and the
// string empty, when the file cannot be read.
func readStringFileOK(path string) (string, bool) {
	raw, err := os.ReadFile(path)
	if err == nil {
		return strings.TrimSpace(string(raw)), true
	}
	return "", false
}
|
||||||
|
|
||||||
|
// hasEmmcHealthFiles reports whether deviceDir contains at least one entry
// named after a known eMMC health attribute: "pre_eol_info", "life_time",
// "device_life_time_est_typ_a" or "device_life_time_est_typ_b". It returns
// false when the directory cannot be listed.
func hasEmmcHealthFiles(deviceDir string) bool {
	listing, err := os.ReadDir(deviceDir)
	if err != nil {
		return false
	}

	for i := range listing {
		entryName := listing[i].Name()
		if entryName == "pre_eol_info" ||
			entryName == "life_time" ||
			entryName == "device_life_time_est_typ_a" ||
			entryName == "device_life_time_est_typ_b" {
			return true
		}
	}
	return false
}
|
||||||
80
agent/emmc_linux_test.go
Normal file
80
agent/emmc_linux_test.go
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
//go:build linux
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/entities/smart"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEmmcMockSysfsScanAndCollect(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
prev := emmcSysfsRoot
|
||||||
|
emmcSysfsRoot = tmp
|
||||||
|
t.Cleanup(func() { emmcSysfsRoot = prev })
|
||||||
|
|
||||||
|
// Fake: /sys/class/block/mmcblk0
|
||||||
|
mmcDeviceDir := filepath.Join(tmp, "class", "block", "mmcblk0", "device")
|
||||||
|
mmcQueueDir := filepath.Join(tmp, "class", "block", "mmcblk0", "queue")
|
||||||
|
if err := os.MkdirAll(mmcDeviceDir, 0o755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(mmcQueueDir, 0o755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
write := func(path, content string) {
|
||||||
|
t.Helper()
|
||||||
|
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
write(filepath.Join(mmcDeviceDir, "pre_eol_info"), "0x02\n")
|
||||||
|
write(filepath.Join(mmcDeviceDir, "life_time"), "0x04 0x05\n")
|
||||||
|
write(filepath.Join(mmcDeviceDir, "name"), "H26M52103FMR\n")
|
||||||
|
write(filepath.Join(mmcDeviceDir, "serial"), "01234567\n")
|
||||||
|
write(filepath.Join(mmcDeviceDir, "prv"), "0x08\n")
|
||||||
|
write(filepath.Join(mmcQueueDir, "logical_block_size"), "512\n")
|
||||||
|
write(filepath.Join(tmp, "class", "block", "mmcblk0", "size"), "1024\n") // sectors
|
||||||
|
|
||||||
|
devs := scanEmmcDevices()
|
||||||
|
if len(devs) != 1 {
|
||||||
|
t.Fatalf("scanEmmcDevices() = %d devices, want 1", len(devs))
|
||||||
|
}
|
||||||
|
if devs[0].Name != "/dev/mmcblk0" || devs[0].Type != "emmc" {
|
||||||
|
t.Fatalf("scanEmmcDevices()[0] = %+v, want Name=/dev/mmcblk0 Type=emmc", devs[0])
|
||||||
|
}
|
||||||
|
|
||||||
|
sm := &SmartManager{SmartDataMap: map[string]*smart.SmartData{}}
|
||||||
|
ok, err := sm.collectEmmcHealth(devs[0])
|
||||||
|
if err != nil || !ok {
|
||||||
|
t.Fatalf("collectEmmcHealth() = (ok=%v, err=%v), want (true,nil)", ok, err)
|
||||||
|
}
|
||||||
|
if len(sm.SmartDataMap) != 1 {
|
||||||
|
t.Fatalf("SmartDataMap len=%d, want 1", len(sm.SmartDataMap))
|
||||||
|
}
|
||||||
|
var got *smart.SmartData
|
||||||
|
for _, v := range sm.SmartDataMap {
|
||||||
|
got = v
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if got == nil {
|
||||||
|
t.Fatalf("SmartDataMap value nil")
|
||||||
|
}
|
||||||
|
if got.DiskType != "emmc" || got.DiskName != "/dev/mmcblk0" {
|
||||||
|
t.Fatalf("disk fields = (type=%q name=%q), want (emmc,/dev/mmcblk0)", got.DiskType, got.DiskName)
|
||||||
|
}
|
||||||
|
if got.SmartStatus != "WARNING" {
|
||||||
|
t.Fatalf("SmartStatus=%q, want WARNING", got.SmartStatus)
|
||||||
|
}
|
||||||
|
if got.SerialNumber != "01234567" || got.ModelName == "" || got.Capacity == 0 {
|
||||||
|
t.Fatalf("identity fields = (model=%q serial=%q cap=%d), want non-empty model, serial 01234567, cap>0", got.ModelName, got.SerialNumber, got.Capacity)
|
||||||
|
}
|
||||||
|
if len(got.Attributes) < 3 {
|
||||||
|
t.Fatalf("attributes len=%d, want >= 3", len(got.Attributes))
|
||||||
|
}
|
||||||
|
}
|
||||||
14
agent/emmc_stub.go
Normal file
14
agent/emmc_stub.go
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
//go:build !linux
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
// Non-Linux builds: eMMC health via sysfs is not available.
|
||||||
|
|
||||||
|
func scanEmmcDevices() []*DeviceInfo {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sm *SmartManager) collectEmmcHealth(deviceInfo *DeviceInfo) (bool, error) {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
@@ -28,7 +28,7 @@ type SmartManager struct {
|
|||||||
SmartDevices []*DeviceInfo
|
SmartDevices []*DeviceInfo
|
||||||
refreshMutex sync.Mutex
|
refreshMutex sync.Mutex
|
||||||
lastScanTime time.Time
|
lastScanTime time.Time
|
||||||
binPath string
|
smartctlPath string
|
||||||
excludedDevices map[string]struct{}
|
excludedDevices map[string]struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -170,27 +170,35 @@ func (sm *SmartManager) ScanDevices(force bool) error {
|
|||||||
configuredDevices = parsedDevices
|
configuredDevices = parsedDevices
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
cmd := exec.CommandContext(ctx, sm.binPath, "--scan", "-j")
|
|
||||||
output, err := cmd.Output()
|
|
||||||
|
|
||||||
var (
|
var (
|
||||||
scanErr error
|
scanErr error
|
||||||
scannedDevices []*DeviceInfo
|
scannedDevices []*DeviceInfo
|
||||||
hasValidScan bool
|
hasValidScan bool
|
||||||
)
|
)
|
||||||
|
|
||||||
if err != nil {
|
if sm.smartctlPath != "" {
|
||||||
scanErr = err
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
} else {
|
defer cancel()
|
||||||
scannedDevices, hasValidScan = sm.parseScan(output)
|
|
||||||
if !hasValidScan {
|
cmd := exec.CommandContext(ctx, sm.smartctlPath, "--scan", "-j")
|
||||||
scanErr = errNoValidSmartData
|
output, err := cmd.Output()
|
||||||
|
if err != nil {
|
||||||
|
scanErr = err
|
||||||
|
} else {
|
||||||
|
scannedDevices, hasValidScan = sm.parseScan(output)
|
||||||
|
if !hasValidScan {
|
||||||
|
scanErr = errNoValidSmartData
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add eMMC devices (Linux only) by reading sysfs health fields. This does not
|
||||||
|
// require smartctl and does not scan the whole device.
|
||||||
|
if emmcDevices := scanEmmcDevices(); len(emmcDevices) > 0 {
|
||||||
|
scannedDevices = append(scannedDevices, emmcDevices...)
|
||||||
|
hasValidScan = true
|
||||||
|
}
|
||||||
|
|
||||||
finalDevices := mergeDeviceLists(currentDevices, scannedDevices, configuredDevices)
|
finalDevices := mergeDeviceLists(currentDevices, scannedDevices, configuredDevices)
|
||||||
finalDevices = sm.filterExcludedDevices(finalDevices)
|
finalDevices = sm.filterExcludedDevices(finalDevices)
|
||||||
sm.updateSmartDevices(finalDevices)
|
sm.updateSmartDevices(finalDevices)
|
||||||
@@ -442,6 +450,18 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
|||||||
return errNoValidSmartData
|
return errNoValidSmartData
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// eMMC health is not exposed via SMART on Linux, but the kernel provides
|
||||||
|
// wear / EOL indicators via sysfs. Prefer that path when available.
|
||||||
|
if deviceInfo != nil {
|
||||||
|
if ok, err := sm.collectEmmcHealth(deviceInfo); ok {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if sm.smartctlPath == "" {
|
||||||
|
return errNoValidSmartData
|
||||||
|
}
|
||||||
|
|
||||||
// slog.Info("collecting SMART data", "device", deviceInfo.Name, "type", deviceInfo.Type, "has_existing_data", sm.hasDataForDevice(deviceInfo.Name))
|
// slog.Info("collecting SMART data", "device", deviceInfo.Name, "type", deviceInfo.Type, "has_existing_data", sm.hasDataForDevice(deviceInfo.Name))
|
||||||
|
|
||||||
// Check if we have any existing data for this device
|
// Check if we have any existing data for this device
|
||||||
@@ -452,7 +472,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
|||||||
|
|
||||||
// Try with -n standby first if we have existing data
|
// Try with -n standby first if we have existing data
|
||||||
args := sm.smartctlArgs(deviceInfo, hasExistingData)
|
args := sm.smartctlArgs(deviceInfo, hasExistingData)
|
||||||
cmd := exec.CommandContext(ctx, sm.binPath, args...)
|
cmd := exec.CommandContext(ctx, sm.smartctlPath, args...)
|
||||||
output, err := cmd.CombinedOutput()
|
output, err := cmd.CombinedOutput()
|
||||||
|
|
||||||
// Check if device is in standby (exit status 2)
|
// Check if device is in standby (exit status 2)
|
||||||
@@ -465,7 +485,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
|||||||
ctx2, cancel2 := context.WithTimeout(context.Background(), 15*time.Second)
|
ctx2, cancel2 := context.WithTimeout(context.Background(), 15*time.Second)
|
||||||
defer cancel2()
|
defer cancel2()
|
||||||
args = sm.smartctlArgs(deviceInfo, false)
|
args = sm.smartctlArgs(deviceInfo, false)
|
||||||
cmd = exec.CommandContext(ctx2, sm.binPath, args...)
|
cmd = exec.CommandContext(ctx2, sm.smartctlPath, args...)
|
||||||
output, err = cmd.CombinedOutput()
|
output, err = cmd.CombinedOutput()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -482,7 +502,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
|||||||
ctx3, cancel3 := context.WithTimeout(context.Background(), 15*time.Second)
|
ctx3, cancel3 := context.WithTimeout(context.Background(), 15*time.Second)
|
||||||
defer cancel3()
|
defer cancel3()
|
||||||
args = sm.smartctlArgs(deviceInfo, false)
|
args = sm.smartctlArgs(deviceInfo, false)
|
||||||
cmd = exec.CommandContext(ctx3, sm.binPath, args...)
|
cmd = exec.CommandContext(ctx3, sm.smartctlPath, args...)
|
||||||
output, err = cmd.CombinedOutput()
|
output, err = cmd.CombinedOutput()
|
||||||
hasValidData = sm.parseSmartOutput(deviceInfo, output)
|
hasValidData = sm.parseSmartOutput(deviceInfo, output)
|
||||||
|
|
||||||
@@ -1123,10 +1143,15 @@ func NewSmartManager() (*SmartManager, error) {
|
|||||||
}
|
}
|
||||||
sm.refreshExcludedDevices()
|
sm.refreshExcludedDevices()
|
||||||
path, err := sm.detectSmartctl()
|
path, err := sm.detectSmartctl()
|
||||||
|
slog.Debug("smartctl", "path", path, "err", err)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// Keep the previous fail-fast behavior unless this Linux host exposes
|
||||||
|
// eMMC health via sysfs, in which case smartctl is optional.
|
||||||
|
if runtime.GOOS == "linux" && len(scanEmmcDevices()) > 0 {
|
||||||
|
return sm, nil
|
||||||
|
}
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
slog.Debug("smartctl", "path", path)
|
sm.smartctlPath = path
|
||||||
sm.binPath = path
|
|
||||||
return sm, nil
|
return sm, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,18 +2,18 @@ package alerts
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/pocketbase/pocketbase/core"
|
"github.com/pocketbase/pocketbase/core"
|
||||||
)
|
)
|
||||||
|
|
||||||
// handleSmartDeviceAlert sends alerts when a SMART device state changes from PASSED to FAILED.
|
// handleSmartDeviceAlert sends alerts when a SMART device state worsens into WARNING/FAILED.
|
||||||
// This is automatic and does not require user opt-in.
|
// This is automatic and does not require user opt-in.
|
||||||
func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
||||||
oldState := e.Record.Original().GetString("state")
|
oldState := e.Record.Original().GetString("state")
|
||||||
newState := e.Record.GetString("state")
|
newState := e.Record.GetString("state")
|
||||||
|
|
||||||
// Only alert when transitioning from PASSED to FAILED
|
if !shouldSendSmartDeviceAlert(oldState, newState) {
|
||||||
if oldState != "PASSED" || newState != "FAILED" {
|
|
||||||
return e.Next()
|
return e.Next()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,14 +32,15 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
|||||||
systemName := systemRecord.GetString("name")
|
systemName := systemRecord.GetString("name")
|
||||||
deviceName := e.Record.GetString("name")
|
deviceName := e.Record.GetString("name")
|
||||||
model := e.Record.GetString("model")
|
model := e.Record.GetString("model")
|
||||||
|
statusLabel := smartStateLabel(newState)
|
||||||
|
|
||||||
// Build alert message
|
// Build alert message
|
||||||
title := fmt.Sprintf("SMART failure on %s: %s \U0001F534", systemName, deviceName)
|
title := fmt.Sprintf("SMART %s on %s: %s %s", statusLabel, systemName, deviceName, smartStateEmoji(newState))
|
||||||
var message string
|
var message string
|
||||||
if model != "" {
|
if model != "" {
|
||||||
message = fmt.Sprintf("Disk %s (%s) SMART status changed to FAILED", deviceName, model)
|
message = fmt.Sprintf("Disk %s (%s) SMART status changed to %s", deviceName, model, newState)
|
||||||
} else {
|
} else {
|
||||||
message = fmt.Sprintf("Disk %s SMART status changed to FAILED", deviceName)
|
message = fmt.Sprintf("Disk %s SMART status changed to %s", deviceName, newState)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get users associated with the system
|
// Get users associated with the system
|
||||||
@@ -65,3 +66,42 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
|||||||
return e.Next()
|
return e.Next()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldSendSmartDeviceAlert(oldState, newState string) bool {
|
||||||
|
oldSeverity := smartStateSeverity(oldState)
|
||||||
|
newSeverity := smartStateSeverity(newState)
|
||||||
|
|
||||||
|
// Ignore unknown states and recoveries; only alert on worsening transitions
|
||||||
|
// from known-good/degraded states into WARNING/FAILED.
|
||||||
|
return oldSeverity >= 1 && newSeverity > oldSeverity
|
||||||
|
}
|
||||||
|
|
||||||
|
// smartStateSeverity ranks a SMART state string: "PASSED" is 1, "WARNING"
// is 2, "FAILED" is 3. Any other string ranks 0.
func smartStateSeverity(state string) int {
	if state == "PASSED" {
		return 1
	}
	if state == "WARNING" {
		return 2
	}
	if state == "FAILED" {
		return 3
	}
	return 0
}
|
||||||
|
|
||||||
|
// smartStateEmoji returns the emoji used in alert titles: U+1F7E0 for
// "WARNING" and U+1F534 for every other state.
func smartStateEmoji(state string) string {
	if state == "WARNING" {
		return "\U0001F7E0"
	}
	return "\U0001F534"
}
|
||||||
|
|
||||||
|
// smartStateLabel returns the word used for state in alert titles:
// "failure" for "FAILED", and the lower-cased state string for anything else.
func smartStateLabel(state string) string {
	if state == "FAILED" {
		return "failure"
	}
	return strings.ToLower(state)
}
|
||||||
|
|||||||
@@ -58,6 +58,74 @@ func TestSmartDeviceAlert(t *testing.T) {
|
|||||||
assert.Contains(t, lastMessage.Text, "FAILED")
|
assert.Contains(t, lastMessage.Text, "FAILED")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSmartDeviceAlertPassedToWarning(t *testing.T) {
|
||||||
|
hub, user := beszelTests.GetHubWithUser(t)
|
||||||
|
defer hub.Cleanup()
|
||||||
|
|
||||||
|
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||||
|
"name": "test-system",
|
||||||
|
"users": []string{user.Id},
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
})
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
|
||||||
|
"system": system.Id,
|
||||||
|
"name": "/dev/mmcblk0",
|
||||||
|
"model": "eMMC",
|
||||||
|
"state": "PASSED",
|
||||||
|
})
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
smartDevice.Set("state", "WARNING")
|
||||||
|
err = hub.Save(smartDevice)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
|
||||||
|
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed to WARNING")
|
||||||
|
lastMessage := hub.TestMailer.LastMessage()
|
||||||
|
assert.Contains(t, lastMessage.Subject, "SMART warning on test-system")
|
||||||
|
assert.Contains(t, lastMessage.Text, "WARNING")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSmartDeviceAlertWarningToFailed(t *testing.T) {
|
||||||
|
hub, user := beszelTests.GetHubWithUser(t)
|
||||||
|
defer hub.Cleanup()
|
||||||
|
|
||||||
|
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||||
|
"name": "test-system",
|
||||||
|
"users": []string{user.Id},
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
})
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
|
||||||
|
"system": system.Id,
|
||||||
|
"name": "/dev/mmcblk0",
|
||||||
|
"model": "eMMC",
|
||||||
|
"state": "WARNING",
|
||||||
|
})
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
smartDevice.Set("state", "FAILED")
|
||||||
|
err = hub.Save(smartDevice)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
|
||||||
|
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed from WARNING to FAILED")
|
||||||
|
lastMessage := hub.TestMailer.LastMessage()
|
||||||
|
assert.Contains(t, lastMessage.Subject, "SMART failure on test-system")
|
||||||
|
assert.Contains(t, lastMessage.Text, "FAILED")
|
||||||
|
}
|
||||||
|
|
||||||
func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
|
func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
|
||||||
hub, user := beszelTests.GetHubWithUser(t)
|
hub, user := beszelTests.GetHubWithUser(t)
|
||||||
defer hub.Cleanup()
|
defer hub.Cleanup()
|
||||||
@@ -83,7 +151,8 @@ func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
|
|||||||
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
|
|
||||||
// Update the state from UNKNOWN to FAILED - should NOT trigger alert
|
// Update the state from UNKNOWN to FAILED - should NOT trigger alert.
|
||||||
|
// We only alert from known healthy/degraded states.
|
||||||
smartDevice.Set("state", "FAILED")
|
smartDevice.Set("state", "FAILED")
|
||||||
err = hub.Save(smartDevice)
|
err = hub.Save(smartDevice)
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
|
|||||||
@@ -206,7 +206,12 @@ export const columns: ColumnDef<SmartDeviceRecord>[] = [
|
|||||||
invertSorting: true,
|
invertSorting: true,
|
||||||
header: ({ column }) => <HeaderButton column={column} name={t`Temp`} Icon={ThermometerIcon} />,
|
header: ({ column }) => <HeaderButton column={column} name={t`Temp`} Icon={ThermometerIcon} />,
|
||||||
cell: ({ getValue }) => {
|
cell: ({ getValue }) => {
|
||||||
const { value, unit } = formatTemperature(getValue() as number)
|
const temp = getValue() as number | undefined | null
|
||||||
|
// Most devices won't report a real 0C temperature; treat 0 as "unknown".
|
||||||
|
if (temp == null || temp === 0) {
|
||||||
|
return <div className="text-muted-foreground ms-1.5">N/A</div>
|
||||||
|
}
|
||||||
|
const { value, unit } = formatTemperature(temp)
|
||||||
return <span className="ms-1.5">{`${value} ${unit}`}</span>
|
return <span className="ms-1.5">{`${value} ${unit}`}</span>
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ The [quick start guide](https://beszel.dev/guide/getting-started) and other docu
|
|||||||
- **GPU usage / power draw** - Nvidia, AMD, and Intel.
|
- **GPU usage / power draw** - Nvidia, AMD, and Intel.
|
||||||
- **Battery** - Host system battery charge.
|
- **Battery** - Host system battery charge.
|
||||||
- **Containers** - Status and metrics of all running Docker / Podman containers.
|
- **Containers** - Status and metrics of all running Docker / Podman containers.
|
||||||
- **S.M.A.R.T.** - Host system disk health.
|
- **S.M.A.R.T.** - Host system disk health (includes eMMC wear/EOL via Linux sysfs when available).
|
||||||
|
|
||||||
## Help and discussion
|
## Help and discussion
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user