S.M.A.R.T support (#614)

* add agent smart support

* refactor(system): update JSON tags in SmartData struct

* refactor(agent): use serial number as the key of SmartDataMap

Updated the SmartManager's methods to use the device's serial number as the key in the SmartDataMap instead of the device name.

* refactor: use raw values in smart attributes for nvme devices

* feat: add S.M.A.R.T. data display in web ui

Introduced a new Disks tab in the SystemDetail component to display disk information and S.M.A.R.T. data. The tab includes a table for visualizing disk attributes and their statuses.

Also added SmartData and SmartAttribute interfaces to support the new functionality.
This commit is contained in:
Yifan
2025-10-24 21:49:04 +08:00
committed by GitHub
parent ca7642cc91
commit 16d5ec267d
9 changed files with 1335 additions and 25 deletions

View File

@@ -25,6 +25,7 @@ type Agent struct {
systemInfo system.Info // Host system info
gpuManager *GPUManager // Manages GPU data
cache *SessionCache // Cache for system stats based on primary session ID
smartManager *SmartManager // Manages SMART data
}
func NewAgent() *Agent {
@@ -62,6 +63,12 @@ func NewAgent() *Agent {
agent.gpuManager = gm
}
if sm, err := NewSmartManager(); err != nil {
slog.Debug("SMART", "err", err)
} else {
agent.smartManager = sm
}
// if debugging, print stats
if agent.debug {
slog.Debug("Stats", "data", agent.gatherStats(""))

View File

@@ -0,0 +1,304 @@
package agent
import (
"beszel/internal/entities/smart"
"beszel/internal/entities/system"
"context"
"encoding/json"
"fmt"
"os/exec"
"reflect"
"sync"
"time"
"golang.org/x/exp/slog"
)
// SmartManager manages data collection for SMART devices
// TODO: add retry argument
// TODO: add timeout argument
type SmartManager struct {
SmartDataMap map[string]*system.SmartData
SmartDevices []*DeviceInfo
mutex sync.Mutex
}
type scanOutput struct {
Devices []struct {
Name string `json:"name"`
Type string `json:"type"`
InfoName string `json:"info_name"`
Protocol string `json:"protocol"`
} `json:"devices"`
}
type DeviceInfo struct {
Name string `json:"name"`
Type string `json:"type"`
InfoName string `json:"info_name"`
Protocol string `json:"protocol"`
}
var errNoValidSmartData = fmt.Errorf("no valid GPU data found") // Error for missing data
// Starts the SmartManager
func (sm *SmartManager) Start() {
sm.SmartDataMap = make(map[string]*system.SmartData)
for {
err := sm.ScanDevices()
if err != nil {
slog.Warn("smartctl scan failed, stopping", "err", err)
return
}
// TODO: add retry logic
for _, deviceInfo := range sm.SmartDevices {
err := sm.CollectSmart(deviceInfo)
if err != nil {
slog.Warn("smartctl collect failed, stopping", "err", err)
return
}
}
// Sleep for 10 seconds before next scan
time.Sleep(10 * time.Second)
}
}
// GetCurrentData returns the current SMART data
func (sm *SmartManager) GetCurrentData() map[string]system.SmartData {
sm.mutex.Lock()
defer sm.mutex.Unlock()
result := make(map[string]system.SmartData)
for key, value := range sm.SmartDataMap {
result[key] = *value
}
return result
}
// ScanDevices scans for SMART devices
// Scan devices using `smartctl --scan -j`
// If scan fails, return error
// If scan succeeds, parse the output and update the SmartDevices slice
func (sm *SmartManager) ScanDevices() error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "smartctl", "--scan", "-j")
output, err := cmd.Output()
if err != nil {
return err
}
hasValidData := sm.parseScan(output)
if !hasValidData {
return errNoValidSmartData
}
return nil
}
// CollectSmart collects SMART data for a device
// Collect data using `smartctl --all -j /dev/sdX` or `smartctl --all -j /dev/nvmeX`
// If collect fails, return error
// If collect succeeds, parse the output and update the SmartDataMap
func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "smartctl", "--all", "-j", deviceInfo.Name)
output, err := cmd.Output()
if err != nil {
return err
}
hasValidData := false
if deviceInfo.Type == "scsi" {
// parse scsi devices
hasValidData = sm.parseSmartForScsi(output)
} else if deviceInfo.Type == "nvme" {
// parse nvme devices
hasValidData = sm.parseSmartForNvme(output)
}
if !hasValidData {
return errNoValidSmartData
}
return nil
}
// parseScan parses the output of smartctl --scan -j and updates the SmartDevices slice
func (sm *SmartManager) parseScan(output []byte) bool {
sm.mutex.Lock()
defer sm.mutex.Unlock()
sm.SmartDevices = make([]*DeviceInfo, 0)
scan := &scanOutput{}
if err := json.Unmarshal(output, scan); err != nil {
fmt.Printf("Failed to parse JSON: %v\n", err)
return false
}
scannedDeviceNameMap := make(map[string]bool)
for _, device := range scan.Devices {
deviceInfo := &DeviceInfo{
Name: device.Name,
Type: device.Type,
InfoName: device.InfoName,
Protocol: device.Protocol,
}
sm.SmartDevices = append(sm.SmartDevices, deviceInfo)
scannedDeviceNameMap[device.Name] = true
}
// remove devices that are not in the scan
for key := range sm.SmartDataMap {
if _, ok := scannedDeviceNameMap[key]; !ok {
delete(sm.SmartDataMap, key)
}
}
devicesString := ""
for _, device := range sm.SmartDevices {
devicesString += device.Name + " "
}
return true
}
// parseSmartForScsi parses the output of smartctl --all -j /dev/sdX and updates the SmartDataMap
func (sm *SmartManager) parseSmartForScsi(output []byte) bool {
data := &smart.SmartInfoForSata{}
if err := json.Unmarshal(output, &data); err != nil {
return false
}
sm.mutex.Lock()
defer sm.mutex.Unlock()
// get device name (e.g. /dev/sda)
keyName := data.SerialNumber
// if device does not exist in SmartDataMap, initialize it
if _, ok := sm.SmartDataMap[keyName]; !ok {
sm.SmartDataMap[keyName] = &system.SmartData{}
}
// update SmartData
smartData := sm.SmartDataMap[keyName]
smartData.ModelFamily = data.ModelFamily
smartData.ModelName = data.ModelName
smartData.SerialNumber = data.SerialNumber
smartData.FirmwareVersion = data.FirmwareVersion
smartData.Capacity = data.UserCapacity.Bytes
if data.SmartStatus.Passed {
smartData.SmartStatus = "PASSED"
} else {
smartData.SmartStatus = "FAILED"
}
smartData.DiskName = data.Device.Name
smartData.DiskType = data.Device.Type
// update SmartAttributes
smartData.Attributes = make([]*system.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
for _, attr := range data.AtaSmartAttributes.Table {
smartAttr := &system.SmartAttribute{
Id: attr.ID,
Name: attr.Name,
Value: attr.Value,
Worst: attr.Worst,
Threshold: attr.Thresh,
RawValue: attr.Raw.Value,
RawString: attr.Raw.String,
Flags: attr.Flags.String,
WhenFailed: attr.WhenFailed,
}
smartData.Attributes = append(smartData.Attributes, smartAttr)
}
smartData.Temperature = data.Temperature.Current
sm.SmartDataMap[keyName] = smartData
return true
}
// parseSmartForNvme parses the output of smartctl --all -j /dev/nvmeX and updates the SmartDataMap
func (sm *SmartManager) parseSmartForNvme(output []byte) bool {
data := &smart.SmartInfoForNvme{}
if err := json.Unmarshal(output, &data); err != nil {
return false
}
sm.mutex.Lock()
defer sm.mutex.Unlock()
// get device name (e.g. /dev/nvme0)
keyName := data.SerialNumber
// if device does not exist in SmartDataMap, initialize it
if _, ok := sm.SmartDataMap[keyName]; !ok {
sm.SmartDataMap[keyName] = &system.SmartData{}
}
// update SmartData
smartData := sm.SmartDataMap[keyName]
smartData.ModelName = data.ModelName
smartData.SerialNumber = data.SerialNumber
smartData.FirmwareVersion = data.FirmwareVersion
smartData.Capacity = data.UserCapacity.Bytes
if data.SmartStatus.Passed {
smartData.SmartStatus = "PASSED"
} else {
smartData.SmartStatus = "FAILED"
}
smartData.DiskName = data.Device.Name
smartData.DiskType = data.Device.Type
v := reflect.ValueOf(data.NVMeSmartHealthInformationLog)
t := v.Type()
smartData.Attributes = make([]*system.SmartAttribute, 0, v.NumField())
// nvme attributes does not follow the same format as ata attributes,
// so we have to manually iterate over the fields and update SmartAttributes
for i := 0; i < v.NumField(); i++ {
field := t.Field(i)
value := v.Field(i)
key := field.Name
val := value.Interface()
// drop non int values
if _, ok := val.(int); !ok {
continue
}
smartAttr := &system.SmartAttribute{
Name: key,
RawValue: val.(int),
}
smartData.Attributes = append(smartData.Attributes, smartAttr)
}
smartData.Temperature = data.NVMeSmartHealthInformationLog.Temperature
sm.SmartDataMap[keyName] = smartData
return true
}
// detectSmartctl checks if smartctl is installed, returns an error if not
func (sm *SmartManager) detectSmartctl() error {
if _, err := exec.LookPath("smartctl"); err == nil {
return nil
}
return fmt.Errorf("no smartctl found - install smartctl")
}
// NewGPUManager creates and initializes a new GPUManager
func NewSmartManager() (*SmartManager, error) {
var sm SmartManager
if err := sm.detectSmartctl(); err != nil {
return nil, err
}
go sm.Start()
return &sm, nil
}

View File

@@ -237,6 +237,17 @@ func (a *Agent) getSystemStats() system.Stats {
}
}
}
if a.smartManager != nil {
if smartData := a.smartManager.GetCurrentData(); len(smartData) > 0 {
systemStats.SmartData = smartData
if systemStats.Temperatures == nil {
systemStats.Temperatures = make(map[string]float64, len(a.smartManager.SmartDataMap))
}
for key, value := range a.smartManager.SmartDataMap {
systemStats.Temperatures[key] = float64(value.Temperature)
}
}
}
// update base system info
a.systemInfo.Cpu = systemStats.Cpu

View File

@@ -0,0 +1,269 @@
package smart
type SmartInfoForSata struct {
JSONFormatVersion []int `json:"json_format_version"`
Smartctl struct {
Version []int `json:"version"`
SvnRevision string `json:"svn_revision"`
PlatformInfo string `json:"platform_info"`
BuildInfo string `json:"build_info"`
Argv []string `json:"argv"`
ExitStatus int `json:"exit_status"`
} `json:"smartctl"`
Device struct {
Name string `json:"name"`
InfoName string `json:"info_name"`
Type string `json:"type"`
Protocol string `json:"protocol"`
} `json:"device"`
ModelFamily string `json:"model_family"`
ModelName string `json:"model_name"`
SerialNumber string `json:"serial_number"`
Wwn struct {
Naa int `json:"naa"`
Oui int `json:"oui"`
ID int `json:"id"`
} `json:"wwn"`
FirmwareVersion string `json:"firmware_version"`
UserCapacity struct {
Blocks uint64 `json:"blocks"`
Bytes uint64 `json:"bytes"`
} `json:"user_capacity"`
LogicalBlockSize int `json:"logical_block_size"`
PhysicalBlockSize int `json:"physical_block_size"`
RotationRate int `json:"rotation_rate"`
FormFactor struct {
AtaValue int `json:"ata_value"`
Name string `json:"name"`
} `json:"form_factor"`
Trim struct {
Supported bool `json:"supported"`
} `json:"trim"`
InSmartctlDatabase bool `json:"in_smartctl_database"`
AtaVersion struct {
String string `json:"string"`
MajorValue int `json:"major_value"`
MinorValue int `json:"minor_value"`
} `json:"ata_version"`
SataVersion struct {
String string `json:"string"`
Value int `json:"value"`
} `json:"sata_version"`
InterfaceSpeed struct {
Max struct {
SataValue int `json:"sata_value"`
String string `json:"string"`
UnitsPerSecond int `json:"units_per_second"`
BitsPerUnit int `json:"bits_per_unit"`
} `json:"max"`
Current struct {
SataValue int `json:"sata_value"`
String string `json:"string"`
UnitsPerSecond int `json:"units_per_second"`
BitsPerUnit int `json:"bits_per_unit"`
} `json:"current"`
} `json:"interface_speed"`
LocalTime struct {
TimeT int `json:"time_t"`
Asctime string `json:"asctime"`
} `json:"local_time"`
SmartStatus struct {
Passed bool `json:"passed"`
} `json:"smart_status"`
AtaSmartData struct {
OfflineDataCollection struct {
Status struct {
Value int `json:"value"`
String string `json:"string"`
Passed bool `json:"passed"`
} `json:"status"`
CompletionSeconds int `json:"completion_seconds"`
} `json:"offline_data_collection"`
SelfTest struct {
Status struct {
Value int `json:"value"`
String string `json:"string"`
Passed bool `json:"passed"`
} `json:"status"`
PollingMinutes struct {
Short int `json:"short"`
Extended int `json:"extended"`
} `json:"polling_minutes"`
} `json:"self_test"`
Capabilities struct {
Values []int `json:"values"`
ExecOfflineImmediateSupported bool `json:"exec_offline_immediate_supported"`
OfflineIsAbortedUponNewCmd bool `json:"offline_is_aborted_upon_new_cmd"`
OfflineSurfaceScanSupported bool `json:"offline_surface_scan_supported"`
SelfTestsSupported bool `json:"self_tests_supported"`
ConveyanceSelfTestSupported bool `json:"conveyance_self_test_supported"`
SelectiveSelfTestSupported bool `json:"selective_self_test_supported"`
AttributeAutosaveEnabled bool `json:"attribute_autosave_enabled"`
ErrorLoggingSupported bool `json:"error_logging_supported"`
GpLoggingSupported bool `json:"gp_logging_supported"`
} `json:"capabilities"`
} `json:"ata_smart_data"`
AtaSctCapabilities struct {
Value int `json:"value"`
ErrorRecoveryControlSupported bool `json:"error_recovery_control_supported"`
FeatureControlSupported bool `json:"feature_control_supported"`
DataTableSupported bool `json:"data_table_supported"`
} `json:"ata_sct_capabilities"`
AtaSmartAttributes struct {
Revision int `json:"revision"`
Table []struct {
ID int `json:"id"`
Name string `json:"name"`
Value int `json:"value"`
Worst int `json:"worst"`
Thresh int `json:"thresh"`
WhenFailed string `json:"when_failed"`
Flags struct {
Value int `json:"value"`
String string `json:"string"`
Prefailure bool `json:"prefailure"`
UpdatedOnline bool `json:"updated_online"`
Performance bool `json:"performance"`
ErrorRate bool `json:"error_rate"`
EventCount bool `json:"event_count"`
AutoKeep bool `json:"auto_keep"`
} `json:"flags"`
Raw struct {
Value int `json:"value"`
String string `json:"string"`
} `json:"raw"`
} `json:"table"`
} `json:"ata_smart_attributes"`
PowerOnTime struct {
Hours int `json:"hours"`
} `json:"power_on_time"`
PowerCycleCount int `json:"power_cycle_count"`
Temperature struct {
Current int `json:"current"`
} `json:"temperature"`
AtaSmartErrorLog struct {
Summary struct {
Revision int `json:"revision"`
Count int `json:"count"`
} `json:"summary"`
} `json:"ata_smart_error_log"`
AtaSmartSelfTestLog struct {
Standard struct {
Revision int `json:"revision"`
Count int `json:"count"`
} `json:"standard"`
} `json:"ata_smart_self_test_log"`
AtaSmartSelectiveSelfTestLog struct {
Revision int `json:"revision"`
Table []struct {
LbaMin int `json:"lba_min"`
LbaMax int `json:"lba_max"`
Status struct {
Value int `json:"value"`
String string `json:"string"`
} `json:"status"`
} `json:"table"`
Flags struct {
Value int `json:"value"`
RemainderScanEnabled bool `json:"remainder_scan_enabled"`
} `json:"flags"`
PowerUpScanResumeMinutes int `json:"power_up_scan_resume_minutes"`
} `json:"ata_smart_selective_self_test_log"`
}
type SmartInfoForNvme struct {
JSONFormatVersion [2]int `json:"json_format_version"`
Smartctl struct {
Version [2]int `json:"version"`
SVNRevision string `json:"svn_revision"`
PlatformInfo string `json:"platform_info"`
BuildInfo string `json:"build_info"`
Argv []string `json:"argv"`
ExitStatus int `json:"exit_status"`
} `json:"smartctl"`
Device struct {
Name string `json:"name"`
InfoName string `json:"info_name"`
Type string `json:"type"`
Protocol string `json:"protocol"`
} `json:"device"`
ModelName string `json:"model_name"`
SerialNumber string `json:"serial_number"`
FirmwareVersion string `json:"firmware_version"`
NVMePCIVendor struct {
ID int `json:"id"`
SubsystemID int `json:"subsystem_id"`
} `json:"nvme_pci_vendor"`
NVMeIEEEOUIIdentifier int `json:"nvme_ieee_oui_identifier"`
NVMeTotalCapacity int `json:"nvme_total_capacity"`
NVMeUnallocatedCapacity int `json:"nvme_unallocated_capacity"`
NVMeControllerID int `json:"nvme_controller_id"`
NVMeVersion struct {
String string `json:"string"`
Value int `json:"value"`
} `json:"nvme_version"`
NVMeNumberOfNamespaces int `json:"nvme_number_of_namespaces"`
NVMeNamespaces []struct {
ID int `json:"id"`
Size struct {
Blocks int `json:"blocks"`
Bytes int `json:"bytes"`
} `json:"size"`
Capacity struct {
Blocks int `json:"blocks"`
Bytes int `json:"bytes"`
} `json:"capacity"`
Utilization struct {
Blocks int `json:"blocks"`
Bytes int `json:"bytes"`
} `json:"utilization"`
FormattedLBASize int `json:"formatted_lba_size"`
EUI64 struct {
OUI int `json:"oui"`
ExtID int `json:"ext_id"`
} `json:"eui64"`
} `json:"nvme_namespaces"`
UserCapacity struct {
Blocks uint64 `json:"blocks"`
Bytes uint64 `json:"bytes"`
} `json:"user_capacity"`
LogicalBlockSize int `json:"logical_block_size"`
LocalTime struct {
TimeT int64 `json:"time_t"`
Asctime string `json:"asctime"`
} `json:"local_time"`
SmartStatus struct {
Passed bool `json:"passed"`
NVMe struct {
Value int `json:"value"`
} `json:"nvme"`
} `json:"smart_status"`
NVMeSmartHealthInformationLog struct {
CriticalWarning int `json:"critical_warning"`
Temperature int `json:"temperature"`
AvailableSpare int `json:"available_spare"`
AvailableSpareThreshold int `json:"available_spare_threshold"`
PercentageUsed int `json:"percentage_used"`
DataUnitsRead int `json:"data_units_read"`
DataUnitsWritten int `json:"data_units_written"`
HostReads int `json:"host_reads"`
HostWrites int `json:"host_writes"`
ControllerBusyTime int `json:"controller_busy_time"`
PowerCycles int `json:"power_cycles"`
PowerOnHours int `json:"power_on_hours"`
UnsafeShutdowns int `json:"unsafe_shutdowns"`
MediaErrors int `json:"media_errors"`
NumErrLogEntries int `json:"num_err_log_entries"`
WarningTempTime int `json:"warning_temp_time"`
CriticalCompTime int `json:"critical_comp_time"`
TemperatureSensors []int `json:"temperature_sensors"`
} `json:"nvme_smart_health_information_log"`
Temperature struct {
Current int `json:"current"`
} `json:"temperature"`
PowerCycleCount int `json:"power_cycle_count"`
PowerOnTime struct {
Hours int `json:"hours"`
} `json:"power_on_time"`
}

View File

@@ -8,29 +8,30 @@ import (
)
type Stats struct {
Cpu float64 `json:"cpu"`
MaxCpu float64 `json:"cpum,omitempty"`
Mem float64 `json:"m"`
MemUsed float64 `json:"mu"`
MemPct float64 `json:"mp"`
MemBuffCache float64 `json:"mb"`
MemZfsArc float64 `json:"mz,omitempty"` // ZFS ARC memory
Swap float64 `json:"s,omitempty"`
SwapUsed float64 `json:"su,omitempty"`
DiskTotal float64 `json:"d"`
DiskUsed float64 `json:"du"`
DiskPct float64 `json:"dp"`
DiskReadPs float64 `json:"dr"`
DiskWritePs float64 `json:"dw"`
MaxDiskReadPs float64 `json:"drm,omitempty"`
MaxDiskWritePs float64 `json:"dwm,omitempty"`
NetworkSent float64 `json:"ns"`
NetworkRecv float64 `json:"nr"`
MaxNetworkSent float64 `json:"nsm,omitempty"`
MaxNetworkRecv float64 `json:"nrm,omitempty"`
Temperatures map[string]float64 `json:"t,omitempty"`
ExtraFs map[string]*FsStats `json:"efs,omitempty"`
GPUData map[string]GPUData `json:"g,omitempty"`
Cpu float64 `json:"cpu"`
MaxCpu float64 `json:"cpum,omitempty"`
Mem float64 `json:"m"`
MemUsed float64 `json:"mu"`
MemPct float64 `json:"mp"`
MemBuffCache float64 `json:"mb"`
MemZfsArc float64 `json:"mz,omitempty"` // ZFS ARC memory
Swap float64 `json:"s,omitempty"`
SwapUsed float64 `json:"su,omitempty"`
DiskTotal float64 `json:"d"`
DiskUsed float64 `json:"du"`
DiskPct float64 `json:"dp"`
DiskReadPs float64 `json:"dr"`
DiskWritePs float64 `json:"dw"`
MaxDiskReadPs float64 `json:"drm,omitempty"`
MaxDiskWritePs float64 `json:"dwm,omitempty"`
NetworkSent float64 `json:"ns"`
NetworkRecv float64 `json:"nr"`
MaxNetworkSent float64 `json:"nsm,omitempty"`
MaxNetworkRecv float64 `json:"nrm,omitempty"`
Temperatures map[string]float64 `json:"t,omitempty"`
ExtraFs map[string]*FsStats `json:"efs,omitempty"`
GPUData map[string]GPUData `json:"g,omitempty"`
SmartData map[string]SmartData `json:"sm,omitempty"`
}
type GPUData struct {
@@ -73,6 +74,31 @@ const (
Freebsd
)
type SmartData struct {
ModelFamily string `json:"mf,omitempty"`
ModelName string `json:"mn,omitempty"`
SerialNumber string `json:"sn,omitempty"`
FirmwareVersion string `json:"fv,omitempty"`
Capacity uint64 `json:"c,omitempty"`
SmartStatus string `json:"s,omitempty"`
DiskName string `json:"dn,omitempty"` // something like /dev/sda
DiskType string `json:"dt,omitempty"`
Temperature int `json:"t,omitempty"`
Attributes []*SmartAttribute `json:"a,omitempty"`
}
type SmartAttribute struct {
Id int `json:"id,omitempty"`
Name string `json:"n"`
Value int `json:"v,omitempty"`
Worst int `json:"w,omitempty"`
Threshold int `json:"t,omitempty"`
RawValue int `json:"rv"`
RawString string `json:"rs,omitempty"`
Flags string `json:"f,omitempty"`
WhenFailed string `json:"wf,omitempty"`
}
type Info struct {
Hostname string `json:"h"`
KernelVersion string `json:"k,omitempty"`