mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-24 22:46:16 +01:00
S.M.A.R.T support (#614)
* add agent smart support * refactor(system): update JSON tags in SmartData struct * refactor(agent): use serial number as the key of SmartDataMap Updated the SmartManager's methods to use the device's serial number as the key in the SmartDataMap instead of the device name. * refactor: use raw values in smart attributes for nvme devices * feat: add S.M.A.R.T. data display in web ui Introduced a new Disks tab in the SystemDetail component to display disk information and S.M.A.R.T. data. The tab includes a table for visualizing disk attributes and their statuses. Also added SmartData and SmartAttribute interfaces to support the new functionality.
This commit is contained in:
@@ -25,6 +25,7 @@ type Agent struct {
|
||||
systemInfo system.Info // Host system info
|
||||
gpuManager *GPUManager // Manages GPU data
|
||||
cache *SessionCache // Cache for system stats based on primary session ID
|
||||
smartManager *SmartManager // Manages SMART data
|
||||
}
|
||||
|
||||
func NewAgent() *Agent {
|
||||
@@ -62,6 +63,12 @@ func NewAgent() *Agent {
|
||||
agent.gpuManager = gm
|
||||
}
|
||||
|
||||
if sm, err := NewSmartManager(); err != nil {
|
||||
slog.Debug("SMART", "err", err)
|
||||
} else {
|
||||
agent.smartManager = sm
|
||||
}
|
||||
|
||||
// if debugging, print stats
|
||||
if agent.debug {
|
||||
slog.Debug("Stats", "data", agent.gatherStats(""))
|
||||
|
||||
304
beszel/internal/agent/smart.go
Normal file
304
beszel/internal/agent/smart.go
Normal file
@@ -0,0 +1,304 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"beszel/internal/entities/smart"
|
||||
"beszel/internal/entities/system"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"reflect"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/exp/slog"
|
||||
)
|
||||
|
||||
// SmartManager manages data collection for SMART devices
|
||||
// TODO: add retry argument
|
||||
// TODO: add timeout argument
|
||||
type SmartManager struct {
|
||||
SmartDataMap map[string]*system.SmartData
|
||||
SmartDevices []*DeviceInfo
|
||||
mutex sync.Mutex
|
||||
}
|
||||
|
||||
// scanOutput mirrors the part of the `smartctl --scan -j` JSON document that
// the agent consumes: the list of detected devices.
type scanOutput struct {
	Devices []DeviceInfo `json:"devices"`
}

// DeviceInfo describes a single device entry reported by `smartctl --scan -j`.
type DeviceInfo struct {
	// Name is the device path handed to smartctl (e.g. /dev/sda, /dev/nvme0).
	Name string `json:"name"`
	// Type is the smartctl device type; it selects the parser ("scsi" or "nvme").
	Type string `json:"type"`
	// InfoName is the "info_name" value of the scan entry.
	InfoName string `json:"info_name"`
	// Protocol is the "protocol" value of the scan entry.
	Protocol string `json:"protocol"`
}
|
||||
|
||||
// errNoValidSmartData is returned when smartctl output could not be turned
// into usable SMART data.
var errNoValidSmartData = fmt.Errorf("no valid SMART data found")
|
||||
|
||||
// Starts the SmartManager
|
||||
func (sm *SmartManager) Start() {
|
||||
sm.SmartDataMap = make(map[string]*system.SmartData)
|
||||
for {
|
||||
err := sm.ScanDevices()
|
||||
if err != nil {
|
||||
slog.Warn("smartctl scan failed, stopping", "err", err)
|
||||
return
|
||||
}
|
||||
// TODO: add retry logic
|
||||
for _, deviceInfo := range sm.SmartDevices {
|
||||
err := sm.CollectSmart(deviceInfo)
|
||||
if err != nil {
|
||||
slog.Warn("smartctl collect failed, stopping", "err", err)
|
||||
return
|
||||
}
|
||||
}
|
||||
// Sleep for 10 seconds before next scan
|
||||
time.Sleep(10 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
// GetCurrentData returns the current SMART data
|
||||
func (sm *SmartManager) GetCurrentData() map[string]system.SmartData {
|
||||
sm.mutex.Lock()
|
||||
defer sm.mutex.Unlock()
|
||||
result := make(map[string]system.SmartData)
|
||||
for key, value := range sm.SmartDataMap {
|
||||
result[key] = *value
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ScanDevices scans for SMART devices
|
||||
// Scan devices using `smartctl --scan -j`
|
||||
// If scan fails, return error
|
||||
// If scan succeeds, parse the output and update the SmartDevices slice
|
||||
func (sm *SmartManager) ScanDevices() error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "smartctl", "--scan", "-j")
|
||||
output, err := cmd.Output()
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
hasValidData := sm.parseScan(output)
|
||||
if !hasValidData {
|
||||
return errNoValidSmartData
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CollectSmart collects SMART data for a device
|
||||
// Collect data using `smartctl --all -j /dev/sdX` or `smartctl --all -j /dev/nvmeX`
|
||||
// If collect fails, return error
|
||||
// If collect succeeds, parse the output and update the SmartDataMap
|
||||
func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "smartctl", "--all", "-j", deviceInfo.Name)
|
||||
|
||||
output, err := cmd.Output()
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
hasValidData := false
|
||||
if deviceInfo.Type == "scsi" {
|
||||
// parse scsi devices
|
||||
hasValidData = sm.parseSmartForScsi(output)
|
||||
} else if deviceInfo.Type == "nvme" {
|
||||
// parse nvme devices
|
||||
hasValidData = sm.parseSmartForNvme(output)
|
||||
}
|
||||
|
||||
if !hasValidData {
|
||||
return errNoValidSmartData
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseScan parses the output of smartctl --scan -j and updates the SmartDevices slice
|
||||
func (sm *SmartManager) parseScan(output []byte) bool {
|
||||
sm.mutex.Lock()
|
||||
defer sm.mutex.Unlock()
|
||||
|
||||
sm.SmartDevices = make([]*DeviceInfo, 0)
|
||||
scan := &scanOutput{}
|
||||
|
||||
if err := json.Unmarshal(output, scan); err != nil {
|
||||
fmt.Printf("Failed to parse JSON: %v\n", err)
|
||||
return false
|
||||
}
|
||||
|
||||
scannedDeviceNameMap := make(map[string]bool)
|
||||
|
||||
for _, device := range scan.Devices {
|
||||
deviceInfo := &DeviceInfo{
|
||||
Name: device.Name,
|
||||
Type: device.Type,
|
||||
InfoName: device.InfoName,
|
||||
Protocol: device.Protocol,
|
||||
}
|
||||
sm.SmartDevices = append(sm.SmartDevices, deviceInfo)
|
||||
scannedDeviceNameMap[device.Name] = true
|
||||
}
|
||||
// remove devices that are not in the scan
|
||||
for key := range sm.SmartDataMap {
|
||||
if _, ok := scannedDeviceNameMap[key]; !ok {
|
||||
delete(sm.SmartDataMap, key)
|
||||
}
|
||||
}
|
||||
devicesString := ""
|
||||
for _, device := range sm.SmartDevices {
|
||||
devicesString += device.Name + " "
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// parseSmartForScsi parses the output of smartctl --all -j /dev/sdX and updates the SmartDataMap
|
||||
func (sm *SmartManager) parseSmartForScsi(output []byte) bool {
|
||||
data := &smart.SmartInfoForSata{}
|
||||
|
||||
if err := json.Unmarshal(output, &data); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
sm.mutex.Lock()
|
||||
defer sm.mutex.Unlock()
|
||||
|
||||
// get device name (e.g. /dev/sda)
|
||||
keyName := data.SerialNumber
|
||||
|
||||
// if device does not exist in SmartDataMap, initialize it
|
||||
if _, ok := sm.SmartDataMap[keyName]; !ok {
|
||||
sm.SmartDataMap[keyName] = &system.SmartData{}
|
||||
}
|
||||
|
||||
// update SmartData
|
||||
smartData := sm.SmartDataMap[keyName]
|
||||
smartData.ModelFamily = data.ModelFamily
|
||||
smartData.ModelName = data.ModelName
|
||||
smartData.SerialNumber = data.SerialNumber
|
||||
smartData.FirmwareVersion = data.FirmwareVersion
|
||||
smartData.Capacity = data.UserCapacity.Bytes
|
||||
if data.SmartStatus.Passed {
|
||||
smartData.SmartStatus = "PASSED"
|
||||
} else {
|
||||
smartData.SmartStatus = "FAILED"
|
||||
}
|
||||
smartData.DiskName = data.Device.Name
|
||||
smartData.DiskType = data.Device.Type
|
||||
|
||||
// update SmartAttributes
|
||||
smartData.Attributes = make([]*system.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
|
||||
for _, attr := range data.AtaSmartAttributes.Table {
|
||||
smartAttr := &system.SmartAttribute{
|
||||
Id: attr.ID,
|
||||
Name: attr.Name,
|
||||
Value: attr.Value,
|
||||
Worst: attr.Worst,
|
||||
Threshold: attr.Thresh,
|
||||
RawValue: attr.Raw.Value,
|
||||
RawString: attr.Raw.String,
|
||||
Flags: attr.Flags.String,
|
||||
WhenFailed: attr.WhenFailed,
|
||||
}
|
||||
smartData.Attributes = append(smartData.Attributes, smartAttr)
|
||||
}
|
||||
smartData.Temperature = data.Temperature.Current
|
||||
sm.SmartDataMap[keyName] = smartData
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// parseSmartForNvme parses the output of smartctl --all -j /dev/nvmeX and updates the SmartDataMap
|
||||
func (sm *SmartManager) parseSmartForNvme(output []byte) bool {
|
||||
data := &smart.SmartInfoForNvme{}
|
||||
|
||||
if err := json.Unmarshal(output, &data); err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
sm.mutex.Lock()
|
||||
defer sm.mutex.Unlock()
|
||||
|
||||
// get device name (e.g. /dev/nvme0)
|
||||
keyName := data.SerialNumber
|
||||
|
||||
// if device does not exist in SmartDataMap, initialize it
|
||||
if _, ok := sm.SmartDataMap[keyName]; !ok {
|
||||
sm.SmartDataMap[keyName] = &system.SmartData{}
|
||||
}
|
||||
|
||||
// update SmartData
|
||||
smartData := sm.SmartDataMap[keyName]
|
||||
smartData.ModelName = data.ModelName
|
||||
smartData.SerialNumber = data.SerialNumber
|
||||
smartData.FirmwareVersion = data.FirmwareVersion
|
||||
smartData.Capacity = data.UserCapacity.Bytes
|
||||
if data.SmartStatus.Passed {
|
||||
smartData.SmartStatus = "PASSED"
|
||||
} else {
|
||||
smartData.SmartStatus = "FAILED"
|
||||
}
|
||||
smartData.DiskName = data.Device.Name
|
||||
smartData.DiskType = data.Device.Type
|
||||
|
||||
v := reflect.ValueOf(data.NVMeSmartHealthInformationLog)
|
||||
t := v.Type()
|
||||
smartData.Attributes = make([]*system.SmartAttribute, 0, v.NumField())
|
||||
|
||||
// nvme attributes does not follow the same format as ata attributes,
|
||||
// so we have to manually iterate over the fields and update SmartAttributes
|
||||
for i := 0; i < v.NumField(); i++ {
|
||||
field := t.Field(i)
|
||||
value := v.Field(i)
|
||||
key := field.Name
|
||||
val := value.Interface()
|
||||
// drop non int values
|
||||
if _, ok := val.(int); !ok {
|
||||
continue
|
||||
}
|
||||
smartAttr := &system.SmartAttribute{
|
||||
Name: key,
|
||||
RawValue: val.(int),
|
||||
}
|
||||
smartData.Attributes = append(smartData.Attributes, smartAttr)
|
||||
}
|
||||
smartData.Temperature = data.NVMeSmartHealthInformationLog.Temperature
|
||||
|
||||
sm.SmartDataMap[keyName] = smartData
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// detectSmartctl checks if smartctl is installed, returns an error if not
|
||||
func (sm *SmartManager) detectSmartctl() error {
|
||||
if _, err := exec.LookPath("smartctl"); err == nil {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("no smartctl found - install smartctl")
|
||||
}
|
||||
|
||||
// NewGPUManager creates and initializes a new GPUManager
|
||||
func NewSmartManager() (*SmartManager, error) {
|
||||
var sm SmartManager
|
||||
if err := sm.detectSmartctl(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
go sm.Start()
|
||||
|
||||
return &sm, nil
|
||||
}
|
||||
@@ -237,6 +237,17 @@ func (a *Agent) getSystemStats() system.Stats {
|
||||
}
|
||||
}
|
||||
}
|
||||
if a.smartManager != nil {
|
||||
if smartData := a.smartManager.GetCurrentData(); len(smartData) > 0 {
|
||||
systemStats.SmartData = smartData
|
||||
if systemStats.Temperatures == nil {
|
||||
systemStats.Temperatures = make(map[string]float64, len(a.smartManager.SmartDataMap))
|
||||
}
|
||||
for key, value := range a.smartManager.SmartDataMap {
|
||||
systemStats.Temperatures[key] = float64(value.Temperature)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// update base system info
|
||||
a.systemInfo.Cpu = systemStats.Cpu
|
||||
|
||||
Reference in New Issue
Block a user