mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-21 21:26:16 +01:00
Add Linux mdraid health monitoring (#1750)
This commit is contained in:
@@ -199,19 +199,6 @@ func readHexByteFile(path string) (uint8, bool) {
|
||||
return b, ok
|
||||
}
|
||||
|
||||
func readStringFile(path string) string {
|
||||
content, _ := readStringFileOK(path)
|
||||
return content
|
||||
}
|
||||
|
||||
func readStringFileOK(path string) (string, bool) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
return strings.TrimSpace(string(b)), true
|
||||
}
|
||||
|
||||
func hasEmmcHealthFiles(deviceDir string) bool {
|
||||
entries, err := os.ReadDir(deviceDir)
|
||||
if err != nil {
|
||||
|
||||
24
agent/file_utils.go
Normal file
24
agent/file_utils.go
Normal file
@@ -0,0 +1,24 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func readStringFile(path string) string {
|
||||
content, _ := readStringFileOK(path)
|
||||
return content
|
||||
}
|
||||
|
||||
func readStringFileOK(path string) (string, bool) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
return strings.TrimSpace(string(b)), true
|
||||
}
|
||||
|
||||
func fileExists(path string) bool {
|
||||
_, err := os.Stat(path)
|
||||
return err == nil
|
||||
}
|
||||
239
agent/mdraid_linux.go
Normal file
239
agent/mdraid_linux.go
Normal file
@@ -0,0 +1,239 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/smart"
|
||||
)
|
||||
|
||||
// mdraidSysfsRoot is a test hook; production value is "/sys".
|
||||
var mdraidSysfsRoot = "/sys"
|
||||
|
||||
type mdraidHealth struct {
|
||||
name string
|
||||
level string
|
||||
arrayState string
|
||||
degraded uint64
|
||||
raidDisks uint64
|
||||
syncAction string
|
||||
syncCompleted string
|
||||
syncSpeed string
|
||||
mismatchCnt uint64
|
||||
capacity uint64
|
||||
}
|
||||
|
||||
func scanMdraidDevices() []*DeviceInfo {
|
||||
blockDir := filepath.Join(mdraidSysfsRoot, "block")
|
||||
entries, err := os.ReadDir(blockDir)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
devices := make([]*DeviceInfo, 0, 2)
|
||||
for _, ent := range entries {
|
||||
name := ent.Name()
|
||||
if !isMdraidBlockName(name) {
|
||||
continue
|
||||
}
|
||||
mdDir := filepath.Join(blockDir, name, "md")
|
||||
if !fileExists(filepath.Join(mdDir, "array_state")) {
|
||||
continue
|
||||
}
|
||||
|
||||
devPath := filepath.Join("/dev", name)
|
||||
devices = append(devices, &DeviceInfo{
|
||||
Name: devPath,
|
||||
Type: "mdraid",
|
||||
InfoName: devPath + " [mdraid]",
|
||||
Protocol: "MD",
|
||||
})
|
||||
}
|
||||
|
||||
return devices
|
||||
}
|
||||
|
||||
func (sm *SmartManager) collectMdraidHealth(deviceInfo *DeviceInfo) (bool, error) {
|
||||
if deviceInfo == nil || deviceInfo.Name == "" {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
base := filepath.Base(deviceInfo.Name)
|
||||
if !isMdraidBlockName(base) && !strings.EqualFold(deviceInfo.Type, "mdraid") {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
health, ok := readMdraidHealth(base)
|
||||
if !ok {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
deviceInfo.Type = "mdraid"
|
||||
key := fmt.Sprintf("mdraid:%s", base)
|
||||
status := mdraidSmartStatus(health)
|
||||
|
||||
attrs := make([]*smart.SmartAttribute, 0, 10)
|
||||
if health.arrayState != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "ArrayState", RawString: health.arrayState})
|
||||
}
|
||||
if health.level != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "RaidLevel", RawString: health.level})
|
||||
}
|
||||
if health.raidDisks > 0 {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "RaidDisks", RawValue: health.raidDisks})
|
||||
}
|
||||
if health.degraded > 0 {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "Degraded", RawValue: health.degraded})
|
||||
}
|
||||
if health.syncAction != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncAction", RawString: health.syncAction})
|
||||
}
|
||||
if health.syncCompleted != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncCompleted", RawString: health.syncCompleted})
|
||||
}
|
||||
if health.syncSpeed != "" {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "SyncSpeed", RawString: health.syncSpeed})
|
||||
}
|
||||
if health.mismatchCnt > 0 {
|
||||
attrs = append(attrs, &smart.SmartAttribute{Name: "MismatchCount", RawValue: health.mismatchCnt})
|
||||
}
|
||||
|
||||
sm.Lock()
|
||||
defer sm.Unlock()
|
||||
|
||||
if _, exists := sm.SmartDataMap[key]; !exists {
|
||||
sm.SmartDataMap[key] = &smart.SmartData{}
|
||||
}
|
||||
|
||||
data := sm.SmartDataMap[key]
|
||||
data.ModelName = "Linux MD RAID"
|
||||
if health.level != "" {
|
||||
data.ModelName = "Linux MD RAID (" + health.level + ")"
|
||||
}
|
||||
data.SerialNumber = ""
|
||||
data.FirmwareVersion = ""
|
||||
data.Capacity = health.capacity
|
||||
data.Temperature = 0
|
||||
data.SmartStatus = status
|
||||
data.DiskName = filepath.Join("/dev", base)
|
||||
data.DiskType = "mdraid"
|
||||
data.Attributes = attrs
|
||||
sm.SmartDataMap[key] = data
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func readMdraidHealth(blockName string) (mdraidHealth, bool) {
|
||||
var out mdraidHealth
|
||||
|
||||
if !isMdraidBlockName(blockName) {
|
||||
return out, false
|
||||
}
|
||||
|
||||
mdDir := filepath.Join(mdraidSysfsRoot, "block", blockName, "md")
|
||||
arrayState, okState := readStringFileOK(filepath.Join(mdDir, "array_state"))
|
||||
if !okState {
|
||||
return out, false
|
||||
}
|
||||
|
||||
out.name = blockName
|
||||
out.arrayState = arrayState
|
||||
out.level = readStringFile(filepath.Join(mdDir, "level"))
|
||||
out.syncAction = readStringFile(filepath.Join(mdDir, "sync_action"))
|
||||
out.syncCompleted = readStringFile(filepath.Join(mdDir, "sync_completed"))
|
||||
out.syncSpeed = readStringFile(filepath.Join(mdDir, "sync_speed"))
|
||||
|
||||
if val, ok := readUintFile(filepath.Join(mdDir, "raid_disks")); ok {
|
||||
out.raidDisks = val
|
||||
}
|
||||
if val, ok := readUintFile(filepath.Join(mdDir, "degraded")); ok {
|
||||
out.degraded = val
|
||||
}
|
||||
if val, ok := readUintFile(filepath.Join(mdDir, "mismatch_cnt")); ok {
|
||||
out.mismatchCnt = val
|
||||
}
|
||||
|
||||
if capBytes, ok := readMdraidBlockCapacityBytes(blockName, mdraidSysfsRoot); ok {
|
||||
out.capacity = capBytes
|
||||
}
|
||||
|
||||
return out, true
|
||||
}
|
||||
|
||||
func mdraidSmartStatus(health mdraidHealth) string {
|
||||
state := strings.ToLower(strings.TrimSpace(health.arrayState))
|
||||
switch state {
|
||||
case "inactive", "faulty", "broken", "stopped":
|
||||
return "FAILED"
|
||||
}
|
||||
if health.degraded > 0 {
|
||||
return "FAILED"
|
||||
}
|
||||
|
||||
switch strings.ToLower(strings.TrimSpace(health.syncAction)) {
|
||||
case "resync", "recover", "reshape", "check", "repair":
|
||||
return "WARNING"
|
||||
}
|
||||
|
||||
if state == "clean" || state == "active" || state == "readonly" {
|
||||
return "PASSED"
|
||||
}
|
||||
return "UNKNOWN"
|
||||
}
|
||||
|
||||
func isMdraidBlockName(name string) bool {
|
||||
if !strings.HasPrefix(name, "md") {
|
||||
return false
|
||||
}
|
||||
suffix := strings.TrimPrefix(name, "md")
|
||||
if suffix == "" {
|
||||
return false
|
||||
}
|
||||
for _, c := range suffix {
|
||||
if c < '0' || c > '9' {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func readUintFile(path string) (uint64, bool) {
|
||||
raw, ok := readStringFileOK(path)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
parsed, err := strconv.ParseUint(strings.TrimSpace(raw), 10, 64)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return parsed, true
|
||||
}
|
||||
|
||||
func readMdraidBlockCapacityBytes(blockName, root string) (uint64, bool) {
|
||||
sizePath := filepath.Join(root, "block", blockName, "size")
|
||||
lbsPath := filepath.Join(root, "block", blockName, "queue", "logical_block_size")
|
||||
|
||||
sizeStr, ok := readStringFileOK(sizePath)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
sectors, err := strconv.ParseUint(strings.TrimSpace(sizeStr), 10, 64)
|
||||
if err != nil || sectors == 0 {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
lbsStr, ok := readStringFileOK(lbsPath)
|
||||
logicalBlockSize := uint64(512)
|
||||
if ok {
|
||||
if parsed, err := strconv.ParseUint(strings.TrimSpace(lbsStr), 10, 64); err == nil && parsed > 0 {
|
||||
logicalBlockSize = parsed
|
||||
}
|
||||
}
|
||||
|
||||
return sectors * logicalBlockSize, true
|
||||
}
|
||||
100
agent/mdraid_linux_test.go
Normal file
100
agent/mdraid_linux_test.go
Normal file
@@ -0,0 +1,100 @@
|
||||
//go:build linux
|
||||
|
||||
package agent
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/smart"
|
||||
)
|
||||
|
||||
func TestMdraidMockSysfsScanAndCollect(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
prev := mdraidSysfsRoot
|
||||
mdraidSysfsRoot = tmp
|
||||
t.Cleanup(func() { mdraidSysfsRoot = prev })
|
||||
|
||||
mdDir := filepath.Join(tmp, "block", "md0", "md")
|
||||
queueDir := filepath.Join(tmp, "block", "md0", "queue")
|
||||
if err := os.MkdirAll(mdDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(queueDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
write := func(path, content string) {
|
||||
t.Helper()
|
||||
if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
write(filepath.Join(mdDir, "array_state"), "active\n")
|
||||
write(filepath.Join(mdDir, "level"), "raid1\n")
|
||||
write(filepath.Join(mdDir, "raid_disks"), "2\n")
|
||||
write(filepath.Join(mdDir, "degraded"), "0\n")
|
||||
write(filepath.Join(mdDir, "sync_action"), "resync\n")
|
||||
write(filepath.Join(mdDir, "sync_completed"), "10%\n")
|
||||
write(filepath.Join(mdDir, "sync_speed"), "100M\n")
|
||||
write(filepath.Join(mdDir, "mismatch_cnt"), "0\n")
|
||||
write(filepath.Join(queueDir, "logical_block_size"), "512\n")
|
||||
write(filepath.Join(tmp, "block", "md0", "size"), "2048\n")
|
||||
|
||||
devs := scanMdraidDevices()
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("scanMdraidDevices() = %d devices, want 1", len(devs))
|
||||
}
|
||||
if devs[0].Name != "/dev/md0" || devs[0].Type != "mdraid" {
|
||||
t.Fatalf("scanMdraidDevices()[0] = %+v, want Name=/dev/md0 Type=mdraid", devs[0])
|
||||
}
|
||||
|
||||
sm := &SmartManager{SmartDataMap: map[string]*smart.SmartData{}}
|
||||
ok, err := sm.collectMdraidHealth(devs[0])
|
||||
if err != nil || !ok {
|
||||
t.Fatalf("collectMdraidHealth() = (ok=%v, err=%v), want (true,nil)", ok, err)
|
||||
}
|
||||
if len(sm.SmartDataMap) != 1 {
|
||||
t.Fatalf("SmartDataMap len=%d, want 1", len(sm.SmartDataMap))
|
||||
}
|
||||
var got *smart.SmartData
|
||||
for _, v := range sm.SmartDataMap {
|
||||
got = v
|
||||
break
|
||||
}
|
||||
if got == nil {
|
||||
t.Fatalf("SmartDataMap value nil")
|
||||
}
|
||||
if got.DiskType != "mdraid" || got.DiskName != "/dev/md0" {
|
||||
t.Fatalf("disk fields = (type=%q name=%q), want (mdraid,/dev/md0)", got.DiskType, got.DiskName)
|
||||
}
|
||||
if got.SmartStatus != "WARNING" {
|
||||
t.Fatalf("SmartStatus=%q, want WARNING", got.SmartStatus)
|
||||
}
|
||||
if got.ModelName == "" || got.Capacity == 0 {
|
||||
t.Fatalf("identity fields = (model=%q cap=%d), want non-empty model and cap>0", got.ModelName, got.Capacity)
|
||||
}
|
||||
if len(got.Attributes) < 5 {
|
||||
t.Fatalf("attributes len=%d, want >= 5", len(got.Attributes))
|
||||
}
|
||||
}
|
||||
|
||||
func TestMdraidSmartStatus(t *testing.T) {
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "inactive"}); got != "FAILED" {
|
||||
t.Fatalf("mdraidSmartStatus(inactive) = %q, want FAILED", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", degraded: 1}); got != "FAILED" {
|
||||
t.Fatalf("mdraidSmartStatus(degraded) = %q, want FAILED", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "active", syncAction: "recover"}); got != "WARNING" {
|
||||
t.Fatalf("mdraidSmartStatus(recover) = %q, want WARNING", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "clean"}); got != "PASSED" {
|
||||
t.Fatalf("mdraidSmartStatus(clean) = %q, want PASSED", got)
|
||||
}
|
||||
if got := mdraidSmartStatus(mdraidHealth{arrayState: "unknown"}); got != "UNKNOWN" {
|
||||
t.Fatalf("mdraidSmartStatus(unknown) = %q, want UNKNOWN", got)
|
||||
}
|
||||
}
|
||||
11
agent/mdraid_stub.go
Normal file
11
agent/mdraid_stub.go
Normal file
@@ -0,0 +1,11 @@
|
||||
//go:build !linux
|
||||
|
||||
package agent
|
||||
|
||||
func scanMdraidDevices() []*DeviceInfo {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (sm *SmartManager) collectMdraidHealth(deviceInfo *DeviceInfo) (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
@@ -199,6 +199,13 @@ func (sm *SmartManager) ScanDevices(force bool) error {
|
||||
hasValidScan = true
|
||||
}
|
||||
|
||||
// Add Linux mdraid arrays by reading sysfs health fields. This does not
|
||||
// require smartctl and does not scan the whole device.
|
||||
if raidDevices := scanMdraidDevices(); len(raidDevices) > 0 {
|
||||
scannedDevices = append(scannedDevices, raidDevices...)
|
||||
hasValidScan = true
|
||||
}
|
||||
|
||||
finalDevices := mergeDeviceLists(currentDevices, scannedDevices, configuredDevices)
|
||||
finalDevices = sm.filterExcludedDevices(finalDevices)
|
||||
sm.updateSmartDevices(finalDevices)
|
||||
@@ -450,6 +457,12 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
||||
return errNoValidSmartData
|
||||
}
|
||||
|
||||
// mdraid health is not exposed via SMART; Linux exposes array state in sysfs.
|
||||
if deviceInfo != nil {
|
||||
if ok, err := sm.collectMdraidHealth(deviceInfo); ok {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// eMMC health is not exposed via SMART on Linux, but the kernel provides
|
||||
// wear / EOL indicators via sysfs. Prefer that path when available.
|
||||
if deviceInfo != nil {
|
||||
@@ -1146,9 +1159,11 @@ func NewSmartManager() (*SmartManager, error) {
|
||||
slog.Debug("smartctl", "path", path, "err", err)
|
||||
if err != nil {
|
||||
// Keep the previous fail-fast behavior unless this Linux host exposes
|
||||
// eMMC health via sysfs, in which case smartctl is optional.
|
||||
if runtime.GOOS == "linux" && len(scanEmmcDevices()) > 0 {
|
||||
return sm, nil
|
||||
// eMMC or mdraid health via sysfs, in which case smartctl is optional.
|
||||
if runtime.GOOS == "linux" {
|
||||
if len(scanEmmcDevices()) > 0 || len(scanMdraidDevices()) > 0 {
|
||||
return sm, nil
|
||||
}
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ The [quick start guide](https://beszel.dev/guide/getting-started) and other docu
|
||||
- **GPU usage / power draw** - Nvidia, AMD, and Intel.
|
||||
- **Battery** - Host system battery charge.
|
||||
- **Containers** - Status and metrics of all running Docker / Podman containers.
|
||||
- **S.M.A.R.T.** - Host system disk health (includes eMMC wear/EOL via Linux sysfs when available).
|
||||
- **S.M.A.R.T.** - Host system disk health (includes eMMC wear/EOL and Linux mdraid array health via sysfs when available).
|
||||
|
||||
## Help and discussion
|
||||
|
||||
|
||||
Reference in New Issue
Block a user