mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-23 14:06:18 +01:00
* add agent smart support * refactor(system): update JSON tags in SmartData struct * refactor(agent): use serial number as the key of SmartDataMap Updated the SmartManager's methods to use the device's serial number as the key in the SmartDataMap instead of the device name. * refactor: use raw values in smart attributes for nvme devices * feat: add S.M.A.R.T. data display in web ui Introduced a new Disks tab in the SystemDetail component to display disk information and S.M.A.R.T. data. The tab includes a table for visualizing disk attributes and their statuses. Also added SmartData and SmartAttribute interfaces to support the new functionality.
305 lines
7.8 KiB
Go
305 lines
7.8 KiB
Go
package agent
|
|
|
|
import (
|
|
"beszel/internal/entities/smart"
|
|
"beszel/internal/entities/system"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os/exec"
|
|
"reflect"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/exp/slog"
|
|
)
|
|
|
|
// SmartManager manages data collection for SMART devices
|
|
// TODO: add retry argument
|
|
// TODO: add timeout argument
|
|
type SmartManager struct {
|
|
SmartDataMap map[string]*system.SmartData
|
|
SmartDevices []*DeviceInfo
|
|
mutex sync.Mutex
|
|
}
|
|
|
|
type scanOutput struct {
|
|
Devices []struct {
|
|
Name string `json:"name"`
|
|
Type string `json:"type"`
|
|
InfoName string `json:"info_name"`
|
|
Protocol string `json:"protocol"`
|
|
} `json:"devices"`
|
|
}
|
|
|
|
type DeviceInfo struct {
|
|
Name string `json:"name"`
|
|
Type string `json:"type"`
|
|
InfoName string `json:"info_name"`
|
|
Protocol string `json:"protocol"`
|
|
}
|
|
|
|
var errNoValidSmartData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
|
|
|
// Starts the SmartManager
|
|
func (sm *SmartManager) Start() {
|
|
sm.SmartDataMap = make(map[string]*system.SmartData)
|
|
for {
|
|
err := sm.ScanDevices()
|
|
if err != nil {
|
|
slog.Warn("smartctl scan failed, stopping", "err", err)
|
|
return
|
|
}
|
|
// TODO: add retry logic
|
|
for _, deviceInfo := range sm.SmartDevices {
|
|
err := sm.CollectSmart(deviceInfo)
|
|
if err != nil {
|
|
slog.Warn("smartctl collect failed, stopping", "err", err)
|
|
return
|
|
}
|
|
}
|
|
// Sleep for 10 seconds before next scan
|
|
time.Sleep(10 * time.Second)
|
|
}
|
|
}
|
|
|
|
// GetCurrentData returns the current SMART data
|
|
func (sm *SmartManager) GetCurrentData() map[string]system.SmartData {
|
|
sm.mutex.Lock()
|
|
defer sm.mutex.Unlock()
|
|
result := make(map[string]system.SmartData)
|
|
for key, value := range sm.SmartDataMap {
|
|
result[key] = *value
|
|
}
|
|
return result
|
|
}
|
|
|
|
// ScanDevices scans for SMART devices
|
|
// Scan devices using `smartctl --scan -j`
|
|
// If scan fails, return error
|
|
// If scan succeeds, parse the output and update the SmartDevices slice
|
|
func (sm *SmartManager) ScanDevices() error {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(ctx, "smartctl", "--scan", "-j")
|
|
output, err := cmd.Output()
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
hasValidData := sm.parseScan(output)
|
|
if !hasValidData {
|
|
return errNoValidSmartData
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// CollectSmart collects SMART data for a device
|
|
// Collect data using `smartctl --all -j /dev/sdX` or `smartctl --all -j /dev/nvmeX`
|
|
// If collect fails, return error
|
|
// If collect succeeds, parse the output and update the SmartDataMap
|
|
func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(ctx, "smartctl", "--all", "-j", deviceInfo.Name)
|
|
|
|
output, err := cmd.Output()
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
hasValidData := false
|
|
if deviceInfo.Type == "scsi" {
|
|
// parse scsi devices
|
|
hasValidData = sm.parseSmartForScsi(output)
|
|
} else if deviceInfo.Type == "nvme" {
|
|
// parse nvme devices
|
|
hasValidData = sm.parseSmartForNvme(output)
|
|
}
|
|
|
|
if !hasValidData {
|
|
return errNoValidSmartData
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseScan parses the output of smartctl --scan -j and updates the SmartDevices slice
|
|
func (sm *SmartManager) parseScan(output []byte) bool {
|
|
sm.mutex.Lock()
|
|
defer sm.mutex.Unlock()
|
|
|
|
sm.SmartDevices = make([]*DeviceInfo, 0)
|
|
scan := &scanOutput{}
|
|
|
|
if err := json.Unmarshal(output, scan); err != nil {
|
|
fmt.Printf("Failed to parse JSON: %v\n", err)
|
|
return false
|
|
}
|
|
|
|
scannedDeviceNameMap := make(map[string]bool)
|
|
|
|
for _, device := range scan.Devices {
|
|
deviceInfo := &DeviceInfo{
|
|
Name: device.Name,
|
|
Type: device.Type,
|
|
InfoName: device.InfoName,
|
|
Protocol: device.Protocol,
|
|
}
|
|
sm.SmartDevices = append(sm.SmartDevices, deviceInfo)
|
|
scannedDeviceNameMap[device.Name] = true
|
|
}
|
|
// remove devices that are not in the scan
|
|
for key := range sm.SmartDataMap {
|
|
if _, ok := scannedDeviceNameMap[key]; !ok {
|
|
delete(sm.SmartDataMap, key)
|
|
}
|
|
}
|
|
devicesString := ""
|
|
for _, device := range sm.SmartDevices {
|
|
devicesString += device.Name + " "
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// parseSmartForScsi parses the output of smartctl --all -j /dev/sdX and updates the SmartDataMap
|
|
func (sm *SmartManager) parseSmartForScsi(output []byte) bool {
|
|
data := &smart.SmartInfoForSata{}
|
|
|
|
if err := json.Unmarshal(output, &data); err != nil {
|
|
return false
|
|
}
|
|
|
|
sm.mutex.Lock()
|
|
defer sm.mutex.Unlock()
|
|
|
|
// get device name (e.g. /dev/sda)
|
|
keyName := data.SerialNumber
|
|
|
|
// if device does not exist in SmartDataMap, initialize it
|
|
if _, ok := sm.SmartDataMap[keyName]; !ok {
|
|
sm.SmartDataMap[keyName] = &system.SmartData{}
|
|
}
|
|
|
|
// update SmartData
|
|
smartData := sm.SmartDataMap[keyName]
|
|
smartData.ModelFamily = data.ModelFamily
|
|
smartData.ModelName = data.ModelName
|
|
smartData.SerialNumber = data.SerialNumber
|
|
smartData.FirmwareVersion = data.FirmwareVersion
|
|
smartData.Capacity = data.UserCapacity.Bytes
|
|
if data.SmartStatus.Passed {
|
|
smartData.SmartStatus = "PASSED"
|
|
} else {
|
|
smartData.SmartStatus = "FAILED"
|
|
}
|
|
smartData.DiskName = data.Device.Name
|
|
smartData.DiskType = data.Device.Type
|
|
|
|
// update SmartAttributes
|
|
smartData.Attributes = make([]*system.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
|
|
for _, attr := range data.AtaSmartAttributes.Table {
|
|
smartAttr := &system.SmartAttribute{
|
|
Id: attr.ID,
|
|
Name: attr.Name,
|
|
Value: attr.Value,
|
|
Worst: attr.Worst,
|
|
Threshold: attr.Thresh,
|
|
RawValue: attr.Raw.Value,
|
|
RawString: attr.Raw.String,
|
|
Flags: attr.Flags.String,
|
|
WhenFailed: attr.WhenFailed,
|
|
}
|
|
smartData.Attributes = append(smartData.Attributes, smartAttr)
|
|
}
|
|
smartData.Temperature = data.Temperature.Current
|
|
sm.SmartDataMap[keyName] = smartData
|
|
|
|
return true
|
|
}
|
|
|
|
// parseSmartForNvme parses the output of smartctl --all -j /dev/nvmeX and updates the SmartDataMap
|
|
func (sm *SmartManager) parseSmartForNvme(output []byte) bool {
|
|
data := &smart.SmartInfoForNvme{}
|
|
|
|
if err := json.Unmarshal(output, &data); err != nil {
|
|
return false
|
|
}
|
|
|
|
sm.mutex.Lock()
|
|
defer sm.mutex.Unlock()
|
|
|
|
// get device name (e.g. /dev/nvme0)
|
|
keyName := data.SerialNumber
|
|
|
|
// if device does not exist in SmartDataMap, initialize it
|
|
if _, ok := sm.SmartDataMap[keyName]; !ok {
|
|
sm.SmartDataMap[keyName] = &system.SmartData{}
|
|
}
|
|
|
|
// update SmartData
|
|
smartData := sm.SmartDataMap[keyName]
|
|
smartData.ModelName = data.ModelName
|
|
smartData.SerialNumber = data.SerialNumber
|
|
smartData.FirmwareVersion = data.FirmwareVersion
|
|
smartData.Capacity = data.UserCapacity.Bytes
|
|
if data.SmartStatus.Passed {
|
|
smartData.SmartStatus = "PASSED"
|
|
} else {
|
|
smartData.SmartStatus = "FAILED"
|
|
}
|
|
smartData.DiskName = data.Device.Name
|
|
smartData.DiskType = data.Device.Type
|
|
|
|
v := reflect.ValueOf(data.NVMeSmartHealthInformationLog)
|
|
t := v.Type()
|
|
smartData.Attributes = make([]*system.SmartAttribute, 0, v.NumField())
|
|
|
|
// nvme attributes does not follow the same format as ata attributes,
|
|
// so we have to manually iterate over the fields and update SmartAttributes
|
|
for i := 0; i < v.NumField(); i++ {
|
|
field := t.Field(i)
|
|
value := v.Field(i)
|
|
key := field.Name
|
|
val := value.Interface()
|
|
// drop non int values
|
|
if _, ok := val.(int); !ok {
|
|
continue
|
|
}
|
|
smartAttr := &system.SmartAttribute{
|
|
Name: key,
|
|
RawValue: val.(int),
|
|
}
|
|
smartData.Attributes = append(smartData.Attributes, smartAttr)
|
|
}
|
|
smartData.Temperature = data.NVMeSmartHealthInformationLog.Temperature
|
|
|
|
sm.SmartDataMap[keyName] = smartData
|
|
|
|
return true
|
|
}
|
|
|
|
// detectSmartctl checks if smartctl is installed, returns an error if not
|
|
func (sm *SmartManager) detectSmartctl() error {
|
|
if _, err := exec.LookPath("smartctl"); err == nil {
|
|
return nil
|
|
}
|
|
return fmt.Errorf("no smartctl found - install smartctl")
|
|
}
|
|
|
|
// NewGPUManager creates and initializes a new GPUManager
|
|
func NewSmartManager() (*SmartManager, error) {
|
|
var sm SmartManager
|
|
if err := sm.detectSmartctl(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
go sm.Start()
|
|
|
|
return &sm, nil
|
|
}
|