mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-23 14:06:18 +01:00
SMART: add eMMC health via sysfs (#1736)
* SMART: add eMMC health via sysfs Read eMMC wear/EOL indicators from /sys/class/block/mmcblk*/device and expose in SMART device list. Includes mocked sysfs tests and UI tweaks for unknown temps. * small optimizations for emmc scan and parsing * smart: keep smartctl optional only for Linux hosts with eMMC * update smart alerts to handle warning state * refactor: rename binPath to smartctlPath and replace hasSmartctl with smartctlPath checks --------- Co-authored-by: henrygd <hank@henrygd.me>
This commit is contained in:
@@ -2,18 +2,18 @@ package alerts
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
)
|
||||
|
||||
// handleSmartDeviceAlert sends alerts when a SMART device state changes from PASSED to FAILED.
|
||||
// handleSmartDeviceAlert sends alerts when a SMART device state worsens into WARNING/FAILED.
|
||||
// This is automatic and does not require user opt-in.
|
||||
func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
||||
oldState := e.Record.Original().GetString("state")
|
||||
newState := e.Record.GetString("state")
|
||||
|
||||
// Only alert when transitioning from PASSED to FAILED
|
||||
if oldState != "PASSED" || newState != "FAILED" {
|
||||
if !shouldSendSmartDeviceAlert(oldState, newState) {
|
||||
return e.Next()
|
||||
}
|
||||
|
||||
@@ -32,14 +32,15 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
||||
systemName := systemRecord.GetString("name")
|
||||
deviceName := e.Record.GetString("name")
|
||||
model := e.Record.GetString("model")
|
||||
statusLabel := smartStateLabel(newState)
|
||||
|
||||
// Build alert message
|
||||
title := fmt.Sprintf("SMART failure on %s: %s \U0001F534", systemName, deviceName)
|
||||
title := fmt.Sprintf("SMART %s on %s: %s %s", statusLabel, systemName, deviceName, smartStateEmoji(newState))
|
||||
var message string
|
||||
if model != "" {
|
||||
message = fmt.Sprintf("Disk %s (%s) SMART status changed to FAILED", deviceName, model)
|
||||
message = fmt.Sprintf("Disk %s (%s) SMART status changed to %s", deviceName, model, newState)
|
||||
} else {
|
||||
message = fmt.Sprintf("Disk %s SMART status changed to FAILED", deviceName)
|
||||
message = fmt.Sprintf("Disk %s SMART status changed to %s", deviceName, newState)
|
||||
}
|
||||
|
||||
// Get users associated with the system
|
||||
@@ -65,3 +66,42 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
|
||||
return e.Next()
|
||||
}
|
||||
|
||||
func shouldSendSmartDeviceAlert(oldState, newState string) bool {
|
||||
oldSeverity := smartStateSeverity(oldState)
|
||||
newSeverity := smartStateSeverity(newState)
|
||||
|
||||
// Ignore unknown states and recoveries; only alert on worsening transitions
|
||||
// from known-good/degraded states into WARNING/FAILED.
|
||||
return oldSeverity >= 1 && newSeverity > oldSeverity
|
||||
}
|
||||
|
||||
func smartStateSeverity(state string) int {
|
||||
switch state {
|
||||
case "PASSED":
|
||||
return 1
|
||||
case "WARNING":
|
||||
return 2
|
||||
case "FAILED":
|
||||
return 3
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func smartStateEmoji(state string) string {
|
||||
switch state {
|
||||
case "WARNING":
|
||||
return "\U0001F7E0"
|
||||
default:
|
||||
return "\U0001F534"
|
||||
}
|
||||
}
|
||||
|
||||
func smartStateLabel(state string) string {
|
||||
switch state {
|
||||
case "FAILED":
|
||||
return "failure"
|
||||
default:
|
||||
return strings.ToLower(state)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,6 +58,74 @@ func TestSmartDeviceAlert(t *testing.T) {
|
||||
assert.Contains(t, lastMessage.Text, "FAILED")
|
||||
}
|
||||
|
||||
func TestSmartDeviceAlertPassedToWarning(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.1",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
|
||||
"system": system.Id,
|
||||
"name": "/dev/mmcblk0",
|
||||
"model": "eMMC",
|
||||
"state": "PASSED",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice.Set("state", "WARNING")
|
||||
err = hub.Save(smartDevice)
|
||||
assert.NoError(t, err)
|
||||
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed to WARNING")
|
||||
lastMessage := hub.TestMailer.LastMessage()
|
||||
assert.Contains(t, lastMessage.Subject, "SMART warning on test-system")
|
||||
assert.Contains(t, lastMessage.Text, "WARNING")
|
||||
}
|
||||
|
||||
func TestSmartDeviceAlertWarningToFailed(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
|
||||
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
|
||||
"name": "test-system",
|
||||
"users": []string{user.Id},
|
||||
"host": "127.0.0.1",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
|
||||
"system": system.Id,
|
||||
"name": "/dev/mmcblk0",
|
||||
"model": "eMMC",
|
||||
"state": "WARNING",
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||
assert.NoError(t, err)
|
||||
|
||||
smartDevice.Set("state", "FAILED")
|
||||
err = hub.Save(smartDevice)
|
||||
assert.NoError(t, err)
|
||||
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed from WARNING to FAILED")
|
||||
lastMessage := hub.TestMailer.LastMessage()
|
||||
assert.Contains(t, lastMessage.Subject, "SMART failure on test-system")
|
||||
assert.Contains(t, lastMessage.Text, "FAILED")
|
||||
}
|
||||
|
||||
func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
|
||||
hub, user := beszelTests.GetHubWithUser(t)
|
||||
defer hub.Cleanup()
|
||||
@@ -83,7 +151,8 @@ func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
|
||||
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
|
||||
assert.NoError(t, err)
|
||||
|
||||
// Update the state from UNKNOWN to FAILED - should NOT trigger alert
|
||||
// Update the state from UNKNOWN to FAILED - should NOT trigger alert.
|
||||
// We only alert from known healthy/degraded states.
|
||||
smartDevice.Set("state", "FAILED")
|
||||
err = hub.Save(smartDevice)
|
||||
assert.NoError(t, err)
|
||||
|
||||
Reference in New Issue
Block a user