Files
beszel-ipv6/beszel/internal/agent/smart.go
Yifan 16d5ec267d S.M.A.R.T support (#614)
* add agent smart support

* refactor(system): update JSON tags in SmartData struct

* refactor(agent): use serial number as the key of SmartDataMap

Updated the SmartManager's methods to use the device's serial number as the key in the SmartDataMap instead of the device name.

* refactor: use raw values in smart attributes for nvme devices

* feat: add S.M.A.R.T. data display in web ui

Introduced a new Disks tab in the SystemDetail component to display disk information and S.M.A.R.T. data. The tab includes a table for visualizing disk attributes and their statuses.

Also added SmartData and SmartAttribute interfaces to support the new functionality.
2025-10-24 09:49:04 -04:00

305 lines
7.8 KiB
Go

package agent
import (
"beszel/internal/entities/smart"
"beszel/internal/entities/system"
"context"
"encoding/json"
"fmt"
"os/exec"
"reflect"
"sync"
"time"
"golang.org/x/exp/slog"
)
// SmartManager manages data collection for SMART devices
// TODO: add retry argument
// TODO: add timeout argument
type SmartManager struct {
SmartDataMap map[string]*system.SmartData
SmartDevices []*DeviceInfo
mutex sync.Mutex
}
type scanOutput struct {
Devices []struct {
Name string `json:"name"`
Type string `json:"type"`
InfoName string `json:"info_name"`
Protocol string `json:"protocol"`
} `json:"devices"`
}
type DeviceInfo struct {
Name string `json:"name"`
Type string `json:"type"`
InfoName string `json:"info_name"`
Protocol string `json:"protocol"`
}
var errNoValidSmartData = fmt.Errorf("no valid GPU data found") // Error for missing data
// Starts the SmartManager
func (sm *SmartManager) Start() {
sm.SmartDataMap = make(map[string]*system.SmartData)
for {
err := sm.ScanDevices()
if err != nil {
slog.Warn("smartctl scan failed, stopping", "err", err)
return
}
// TODO: add retry logic
for _, deviceInfo := range sm.SmartDevices {
err := sm.CollectSmart(deviceInfo)
if err != nil {
slog.Warn("smartctl collect failed, stopping", "err", err)
return
}
}
// Sleep for 10 seconds before next scan
time.Sleep(10 * time.Second)
}
}
// GetCurrentData returns the current SMART data
func (sm *SmartManager) GetCurrentData() map[string]system.SmartData {
sm.mutex.Lock()
defer sm.mutex.Unlock()
result := make(map[string]system.SmartData)
for key, value := range sm.SmartDataMap {
result[key] = *value
}
return result
}
// ScanDevices scans for SMART devices
// Scan devices using `smartctl --scan -j`
// If scan fails, return error
// If scan succeeds, parse the output and update the SmartDevices slice
func (sm *SmartManager) ScanDevices() error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "smartctl", "--scan", "-j")
output, err := cmd.Output()
if err != nil {
return err
}
hasValidData := sm.parseScan(output)
if !hasValidData {
return errNoValidSmartData
}
return nil
}
// CollectSmart collects SMART data for a device
// Collect data using `smartctl --all -j /dev/sdX` or `smartctl --all -j /dev/nvmeX`
// If collect fails, return error
// If collect succeeds, parse the output and update the SmartDataMap
func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "smartctl", "--all", "-j", deviceInfo.Name)
output, err := cmd.Output()
if err != nil {
return err
}
hasValidData := false
if deviceInfo.Type == "scsi" {
// parse scsi devices
hasValidData = sm.parseSmartForScsi(output)
} else if deviceInfo.Type == "nvme" {
// parse nvme devices
hasValidData = sm.parseSmartForNvme(output)
}
if !hasValidData {
return errNoValidSmartData
}
return nil
}
// parseScan parses the output of smartctl --scan -j and updates the SmartDevices slice
func (sm *SmartManager) parseScan(output []byte) bool {
sm.mutex.Lock()
defer sm.mutex.Unlock()
sm.SmartDevices = make([]*DeviceInfo, 0)
scan := &scanOutput{}
if err := json.Unmarshal(output, scan); err != nil {
fmt.Printf("Failed to parse JSON: %v\n", err)
return false
}
scannedDeviceNameMap := make(map[string]bool)
for _, device := range scan.Devices {
deviceInfo := &DeviceInfo{
Name: device.Name,
Type: device.Type,
InfoName: device.InfoName,
Protocol: device.Protocol,
}
sm.SmartDevices = append(sm.SmartDevices, deviceInfo)
scannedDeviceNameMap[device.Name] = true
}
// remove devices that are not in the scan
for key := range sm.SmartDataMap {
if _, ok := scannedDeviceNameMap[key]; !ok {
delete(sm.SmartDataMap, key)
}
}
devicesString := ""
for _, device := range sm.SmartDevices {
devicesString += device.Name + " "
}
return true
}
// parseSmartForScsi parses the output of smartctl --all -j /dev/sdX and updates the SmartDataMap
func (sm *SmartManager) parseSmartForScsi(output []byte) bool {
data := &smart.SmartInfoForSata{}
if err := json.Unmarshal(output, &data); err != nil {
return false
}
sm.mutex.Lock()
defer sm.mutex.Unlock()
// get device name (e.g. /dev/sda)
keyName := data.SerialNumber
// if device does not exist in SmartDataMap, initialize it
if _, ok := sm.SmartDataMap[keyName]; !ok {
sm.SmartDataMap[keyName] = &system.SmartData{}
}
// update SmartData
smartData := sm.SmartDataMap[keyName]
smartData.ModelFamily = data.ModelFamily
smartData.ModelName = data.ModelName
smartData.SerialNumber = data.SerialNumber
smartData.FirmwareVersion = data.FirmwareVersion
smartData.Capacity = data.UserCapacity.Bytes
if data.SmartStatus.Passed {
smartData.SmartStatus = "PASSED"
} else {
smartData.SmartStatus = "FAILED"
}
smartData.DiskName = data.Device.Name
smartData.DiskType = data.Device.Type
// update SmartAttributes
smartData.Attributes = make([]*system.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
for _, attr := range data.AtaSmartAttributes.Table {
smartAttr := &system.SmartAttribute{
Id: attr.ID,
Name: attr.Name,
Value: attr.Value,
Worst: attr.Worst,
Threshold: attr.Thresh,
RawValue: attr.Raw.Value,
RawString: attr.Raw.String,
Flags: attr.Flags.String,
WhenFailed: attr.WhenFailed,
}
smartData.Attributes = append(smartData.Attributes, smartAttr)
}
smartData.Temperature = data.Temperature.Current
sm.SmartDataMap[keyName] = smartData
return true
}
// parseSmartForNvme parses the output of smartctl --all -j /dev/nvmeX and updates the SmartDataMap
func (sm *SmartManager) parseSmartForNvme(output []byte) bool {
data := &smart.SmartInfoForNvme{}
if err := json.Unmarshal(output, &data); err != nil {
return false
}
sm.mutex.Lock()
defer sm.mutex.Unlock()
// get device name (e.g. /dev/nvme0)
keyName := data.SerialNumber
// if device does not exist in SmartDataMap, initialize it
if _, ok := sm.SmartDataMap[keyName]; !ok {
sm.SmartDataMap[keyName] = &system.SmartData{}
}
// update SmartData
smartData := sm.SmartDataMap[keyName]
smartData.ModelName = data.ModelName
smartData.SerialNumber = data.SerialNumber
smartData.FirmwareVersion = data.FirmwareVersion
smartData.Capacity = data.UserCapacity.Bytes
if data.SmartStatus.Passed {
smartData.SmartStatus = "PASSED"
} else {
smartData.SmartStatus = "FAILED"
}
smartData.DiskName = data.Device.Name
smartData.DiskType = data.Device.Type
v := reflect.ValueOf(data.NVMeSmartHealthInformationLog)
t := v.Type()
smartData.Attributes = make([]*system.SmartAttribute, 0, v.NumField())
// nvme attributes does not follow the same format as ata attributes,
// so we have to manually iterate over the fields and update SmartAttributes
for i := 0; i < v.NumField(); i++ {
field := t.Field(i)
value := v.Field(i)
key := field.Name
val := value.Interface()
// drop non int values
if _, ok := val.(int); !ok {
continue
}
smartAttr := &system.SmartAttribute{
Name: key,
RawValue: val.(int),
}
smartData.Attributes = append(smartData.Attributes, smartAttr)
}
smartData.Temperature = data.NVMeSmartHealthInformationLog.Temperature
sm.SmartDataMap[keyName] = smartData
return true
}
// detectSmartctl checks if smartctl is installed, returns an error if not
func (sm *SmartManager) detectSmartctl() error {
if _, err := exec.LookPath("smartctl"); err == nil {
return nil
}
return fmt.Errorf("no smartctl found - install smartctl")
}
// NewGPUManager creates and initializes a new GPUManager
func NewSmartManager() (*SmartManager, error) {
var sm SmartManager
if err := sm.detectSmartctl(); err != nil {
return nil, err
}
go sm.Start()
return &sm, nil
}