Add initial S.M.A.R.T. support

- Implement SmartManager for collecting SMART data from SATA and NVMe drives
- Add smartctl-based data collection with standby mode detection
- Support comprehensive SMART attributes parsing and storage
- Add hub API endpoint for fetching SMART data from agents
- Create SMART table UI with detailed disk information

Co-authored-by: geekifan <i@ifan.dev>
This commit is contained in:
henrygd
2025-10-24 18:54:51 -04:00
parent 92b1f236e3
commit 962613df7c
13 changed files with 1476 additions and 10 deletions

View File

@@ -42,6 +42,7 @@ type Agent struct {
server *ssh.Server // SSH server
dataDir string // Directory for persisting data
keys []gossh.PublicKey // SSH public keys
smartManager *SmartManager // Manages SMART data
}
// NewAgent creates a new agent with the given data directory for persisting data.
@@ -100,11 +101,15 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
// initialize docker manager
agent.dockerManager = newDockerManager(agent)
agent.smartManager, err = NewSmartManager()
if err != nil {
slog.Debug("SMART", "err", err)
}
// initialize GPU manager
if gm, err := NewGPUManager(); err != nil {
agent.gpuManager, err = NewGPUManager()
if err != nil {
slog.Debug("GPU", "err", err)
} else {
agent.gpuManager = gm
}
// if debugging, print stats

View File

@@ -15,6 +15,7 @@ import (
"github.com/henrygd/beszel"
"github.com/henrygd/beszel/internal/common"
"github.com/henrygd/beszel/internal/entities/smart"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/fxamacker/cbor/v2"
@@ -273,6 +274,8 @@ func (client *WebSocketClient) sendResponse(data any, requestID *uint32) error {
response.Fingerprint = v
case string:
response.String = &v
case map[string]smart.SmartData:
response.SmartData = v
// case []byte:
// response.RawBytes = v
// case string:

View File

@@ -7,6 +7,9 @@ import (
"github.com/fxamacker/cbor/v2"
"github.com/henrygd/beszel/internal/common"
"github.com/henrygd/beszel/internal/entities/smart"
"golang.org/x/exp/slog"
)
// HandlerContext provides context for request handlers
@@ -46,6 +49,7 @@ func NewHandlerRegistry() *HandlerRegistry {
registry.Register(common.CheckFingerprint, &CheckFingerprintHandler{})
registry.Register(common.GetContainerLogs, &GetContainerLogsHandler{})
registry.Register(common.GetContainerInfo, &GetContainerInfoHandler{})
registry.Register(common.GetSmartData, &GetSmartDataHandler{})
return registry
}
@@ -152,3 +156,21 @@ func (h *GetContainerInfoHandler) Handle(hctx *HandlerContext) error {
return hctx.SendResponse(info, hctx.RequestID)
}
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
// GetSmartDataHandler handles SMART data requests
type GetSmartDataHandler struct{}
func (h *GetSmartDataHandler) Handle(hctx *HandlerContext) error {
if hctx.Agent.smartManager == nil {
// return empty map to indicate no data
return hctx.SendResponse(map[string]smart.SmartData{}, hctx.RequestID)
}
if err := hctx.Agent.smartManager.Refresh(); err != nil {
slog.Debug("smart refresh failed", "err", err)
}
data := hctx.Agent.smartManager.GetCurrentData()
return hctx.SendResponse(data, hctx.RequestID)
}

402
agent/smart.go Normal file
View File

@@ -0,0 +1,402 @@
package agent
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"sync"
"time"
"github.com/henrygd/beszel/internal/entities/smart"
"golang.org/x/exp/slog"
)
// SmartManager manages data collection for SMART devices
type SmartManager struct {
sync.Mutex
SmartDataMap map[string]*smart.SmartData
SmartDevices []*DeviceInfo
refreshMutex sync.Mutex
}
type scanOutput struct {
Devices []struct {
Name string `json:"name"`
Type string `json:"type"`
InfoName string `json:"info_name"`
Protocol string `json:"protocol"`
} `json:"devices"`
}
type DeviceInfo struct {
Name string `json:"name"`
Type string `json:"type"`
InfoName string `json:"info_name"`
Protocol string `json:"protocol"`
}
var errNoValidSmartData = fmt.Errorf("no valid SMART data found") // Error for missing data
// Refresh updates SMART data for all known devices on demand.
func (sm *SmartManager) Refresh() error {
sm.refreshMutex.Lock()
defer sm.refreshMutex.Unlock()
scanErr := sm.ScanDevices()
if scanErr != nil {
slog.Warn("smartctl scan failed", "err", scanErr)
}
devices := sm.devicesSnapshot()
var collectErr error
for _, deviceInfo := range devices {
if deviceInfo == nil {
continue
}
if err := sm.CollectSmart(deviceInfo); err != nil {
slog.Info("smartctl collect failed for device, skipping", "device", deviceInfo.Name, "err", err)
collectErr = err
}
}
return sm.resolveRefreshError(scanErr, collectErr)
}
// devicesSnapshot returns a copy of the current device slice to avoid iterating
// while holding the primary mutex for longer than necessary.
func (sm *SmartManager) devicesSnapshot() []*DeviceInfo {
sm.Lock()
defer sm.Unlock()
devices := make([]*DeviceInfo, len(sm.SmartDevices))
copy(devices, sm.SmartDevices)
return devices
}
// hasSmartData reports whether any SMART data has been collected.
// func (sm *SmartManager) hasSmartData() bool {
// sm.Lock()
// defer sm.Unlock()
// return len(sm.SmartDataMap) > 0
// }
// resolveRefreshError determines the proper error to return after a refresh.
func (sm *SmartManager) resolveRefreshError(scanErr, collectErr error) error {
sm.Lock()
noDevices := len(sm.SmartDevices) == 0
noData := len(sm.SmartDataMap) == 0
sm.Unlock()
if noDevices {
if scanErr != nil {
return scanErr
}
}
if !noData {
return nil
}
if collectErr != nil {
return collectErr
}
if scanErr != nil {
return scanErr
}
return errNoValidSmartData
}
// GetCurrentData returns the current SMART data
func (sm *SmartManager) GetCurrentData() map[string]smart.SmartData {
sm.Lock()
defer sm.Unlock()
result := make(map[string]smart.SmartData, len(sm.SmartDataMap))
for key, value := range sm.SmartDataMap {
if value != nil {
result[key] = *value
}
}
return result
}
// ScanDevices scans for SMART devices
// Scan devices using `smartctl --scan -j`
// If scan fails, return error
// If scan succeeds, parse the output and update the SmartDevices slice
func (sm *SmartManager) ScanDevices() error {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "smartctl", "--scan", "-j")
output, err := cmd.Output()
if err != nil {
return err
}
hasValidData := sm.parseScan(output)
if !hasValidData {
return errNoValidSmartData
}
return nil
}
// CollectSmart collects SMART data for a device
// Collect data using `smartctl --all -j /dev/sdX` or `smartctl --all -j /dev/nvmeX`
// Always attempts to parse output even if command fails, as some data may still be available
// If collect fails, return error
// If collect succeeds, parse the output and update the SmartDataMap
// Uses -n standby to avoid waking up sleeping disks, but bypasses standby mode
// for initial data collection when no cached data exists
func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
// Check if we have any existing data for this device
hasExistingData := sm.hasDataForDevice(deviceInfo.Name)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// Try with -n standby first if we have existing data
cmd := exec.CommandContext(ctx, "smartctl", "-aj", "-n", "standby", deviceInfo.Name)
output, err := cmd.CombinedOutput()
// Check if device is in standby (exit status 2)
if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 2 {
if hasExistingData {
// Device is in standby and we have cached data, keep using cache
slog.Debug("device in standby mode, using cached data", "device", deviceInfo.Name)
return nil
}
// No cached data, need to collect initial data by bypassing standby
slog.Debug("device in standby but no cached data, collecting initial data", "device", deviceInfo.Name)
ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel2()
cmd = exec.CommandContext(ctx2, "smartctl", "-aj", deviceInfo.Name)
output, err = cmd.CombinedOutput()
}
hasValidData := false
switch deviceInfo.Type {
case "scsi", "sat", "ata":
// parse SATA/SCSI/ATA devices
hasValidData, _ = sm.parseSmartForSata(output)
case "nvme":
// parse nvme devices
hasValidData, _ = sm.parseSmartForNvme(output)
}
if !hasValidData {
if err != nil {
return err
}
return errNoValidSmartData
}
return nil
}
// hasDataForDevice checks if we have cached SMART data for a specific device
func (sm *SmartManager) hasDataForDevice(deviceName string) bool {
sm.Lock()
defer sm.Unlock()
// Check if any cached data has this device name
for _, data := range sm.SmartDataMap {
if data != nil && data.DiskName == deviceName {
return true
}
}
return false
}
// parseScan parses the output of smartctl --scan -j and updates the SmartDevices slice
func (sm *SmartManager) parseScan(output []byte) bool {
sm.Lock()
defer sm.Unlock()
sm.SmartDevices = make([]*DeviceInfo, 0)
scan := &scanOutput{}
if err := json.Unmarshal(output, scan); err != nil {
slog.Warn("Failed to parse smartctl scan JSON", "err", err)
return false
}
if len(scan.Devices) == 0 {
return false
}
scannedDeviceNameMap := make(map[string]bool, len(scan.Devices))
for _, device := range scan.Devices {
deviceInfo := &DeviceInfo{
Name: device.Name,
Type: device.Type,
InfoName: device.InfoName,
Protocol: device.Protocol,
}
sm.SmartDevices = append(sm.SmartDevices, deviceInfo)
scannedDeviceNameMap[device.Name] = true
}
// remove devices that are not in the scan
for key := range sm.SmartDataMap {
if _, ok := scannedDeviceNameMap[key]; !ok {
delete(sm.SmartDataMap, key)
}
}
return true
}
// parseSmartForSata parses the output of smartctl --all -j for SATA/ATA devices and updates the SmartDataMap
// Returns hasValidData and exitStatus
func (sm *SmartManager) parseSmartForSata(output []byte) (bool, int) {
var data smart.SmartInfoForSata
if err := json.Unmarshal(output, &data); err != nil {
return false, 0
}
if data.SerialNumber == "" {
slog.Warn("device has no serial number, skipping", "device", data.Device.Name)
return false, data.Smartctl.ExitStatus
}
sm.Lock()
defer sm.Unlock()
// get device name (e.g. /dev/sda)
keyName := data.SerialNumber
// if device does not exist in SmartDataMap, initialize it
if _, ok := sm.SmartDataMap[keyName]; !ok {
sm.SmartDataMap[keyName] = &smart.SmartData{}
}
// update SmartData
smartData := sm.SmartDataMap[keyName]
// smartData.ModelFamily = data.ModelFamily
smartData.ModelName = data.ModelName
smartData.SerialNumber = data.SerialNumber
smartData.FirmwareVersion = data.FirmwareVersion
smartData.Capacity = data.UserCapacity.Bytes
smartData.Temperature = data.Temperature.Current
smartData.SmartStatus = getSmartStatus(smartData.Temperature, data.SmartStatus.Passed)
smartData.DiskName = data.Device.Name
smartData.DiskType = data.Device.Type
// update SmartAttributes
smartData.Attributes = make([]*smart.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
for _, attr := range data.AtaSmartAttributes.Table {
smartAttr := &smart.SmartAttribute{
ID: attr.ID,
Name: attr.Name,
Value: attr.Value,
Worst: attr.Worst,
Threshold: attr.Thresh,
RawValue: attr.Raw.Value,
RawString: attr.Raw.String,
WhenFailed: attr.WhenFailed,
}
smartData.Attributes = append(smartData.Attributes, smartAttr)
}
sm.SmartDataMap[keyName] = smartData
return true, data.Smartctl.ExitStatus
}
func getSmartStatus(temperature uint8, passed bool) string {
if passed {
return "PASSED"
} else if temperature > 0 {
return "FAILED"
} else {
return "UNKNOWN"
}
}
// parseSmartForNvme parses the output of smartctl --all -j /dev/nvmeX and updates the SmartDataMap
// Returns hasValidData and exitStatus
func (sm *SmartManager) parseSmartForNvme(output []byte) (bool, int) {
data := &smart.SmartInfoForNvme{}
if err := json.Unmarshal(output, &data); err != nil {
return false, 0
}
if data.SerialNumber == "" {
slog.Warn("device has no serial number, skipping", "device", data.Device.Name)
return false, data.Smartctl.ExitStatus
}
sm.Lock()
defer sm.Unlock()
// get device name (e.g. /dev/nvme0)
keyName := data.SerialNumber
// if device does not exist in SmartDataMap, initialize it
if _, ok := sm.SmartDataMap[keyName]; !ok {
sm.SmartDataMap[keyName] = &smart.SmartData{}
}
// update SmartData
smartData := sm.SmartDataMap[keyName]
smartData.ModelName = data.ModelName
smartData.SerialNumber = data.SerialNumber
smartData.FirmwareVersion = data.FirmwareVersion
smartData.Capacity = data.UserCapacity.Bytes
smartData.Temperature = data.NVMeSmartHealthInformationLog.Temperature
smartData.SmartStatus = getSmartStatus(smartData.Temperature, data.SmartStatus.Passed)
smartData.DiskName = data.Device.Name
smartData.DiskType = data.Device.Type
// nvme attributes does not follow the same format as ata attributes,
// so we manually map each field to SmartAttributes
log := data.NVMeSmartHealthInformationLog
smartData.Attributes = []*smart.SmartAttribute{
{Name: "CriticalWarning", RawValue: uint64(log.CriticalWarning)},
{Name: "Temperature", RawValue: uint64(log.Temperature)},
{Name: "AvailableSpare", RawValue: uint64(log.AvailableSpare)},
{Name: "AvailableSpareThreshold", RawValue: uint64(log.AvailableSpareThreshold)},
{Name: "PercentageUsed", RawValue: uint64(log.PercentageUsed)},
{Name: "DataUnitsRead", RawValue: log.DataUnitsRead},
{Name: "DataUnitsWritten", RawValue: log.DataUnitsWritten},
{Name: "HostReads", RawValue: uint64(log.HostReads)},
{Name: "HostWrites", RawValue: uint64(log.HostWrites)},
{Name: "ControllerBusyTime", RawValue: uint64(log.ControllerBusyTime)},
{Name: "PowerCycles", RawValue: uint64(log.PowerCycles)},
{Name: "PowerOnHours", RawValue: uint64(log.PowerOnHours)},
{Name: "UnsafeShutdowns", RawValue: uint64(log.UnsafeShutdowns)},
{Name: "MediaErrors", RawValue: uint64(log.MediaErrors)},
{Name: "NumErrLogEntries", RawValue: uint64(log.NumErrLogEntries)},
{Name: "WarningTempTime", RawValue: uint64(log.WarningTempTime)},
{Name: "CriticalCompTime", RawValue: uint64(log.CriticalCompTime)},
}
sm.SmartDataMap[keyName] = smartData
return true, data.Smartctl.ExitStatus
}
// detectSmartctl checks if smartctl is installed, returns an error if not
func (sm *SmartManager) detectSmartctl() error {
if _, err := exec.LookPath("smartctl"); err == nil {
return nil
}
return fmt.Errorf("no smartctl found - install smartctl")
}
// NewSmartManager creates and initializes a new SmartManager
func NewSmartManager() (*SmartManager, error) {
sm := &SmartManager{
SmartDataMap: make(map[string]*smart.SmartData),
}
if err := sm.detectSmartctl(); err != nil {
return nil, err
}
return sm, nil
}