update logs in smart.go

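Also reduce the maximum smartctl execution time (the context timeout in CollectSmart) to 2 seconds; it was previously 5 seconds, or 10 seconds for the standby-bypass retry.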
This commit is contained in:
henrygd
2025-10-28 17:34:49 -04:00
parent 9f969d843c
commit cd9c2bd9ab

View File

@@ -3,6 +3,7 @@ package agent
import ( import (
"context" "context"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"os/exec" "os/exec"
"slices" "slices"
@@ -60,7 +61,7 @@ func (sm *SmartManager) Refresh(forceScan bool) error {
continue continue
} }
if err := sm.CollectSmart(deviceInfo); err != nil { if err := sm.CollectSmart(deviceInfo); err != nil {
slog.Debug("smartctl collect failed, skipping", "device", deviceInfo.Name, "err", err) slog.Debug("smartctl collect failed", "device", deviceInfo.Name, "err", err)
collectErr = err collectErr = err
} }
} }
@@ -131,18 +132,18 @@ func (sm *SmartManager) GetCurrentData() map[string]smart.SmartData {
// If scan fails, return error // If scan fails, return error
// If scan succeeds, parse the output and update the SmartDevices slice // If scan succeeds, parse the output and update the SmartDevices slice
func (sm *SmartManager) ScanDevices(force bool) error { func (sm *SmartManager) ScanDevices(force bool) error {
if !force && time.Since(sm.lastScanTime) < 10*time.Minute { if !force && time.Since(sm.lastScanTime) < 30*time.Minute {
return nil return nil
} }
sm.lastScanTime = time.Now() sm.lastScanTime = time.Now()
var configuredDevices []*DeviceInfo var configuredDevices []*DeviceInfo
if configuredRaw, ok := GetEnv("SMART_DEVICES"); ok { if configuredRaw, ok := GetEnv("SMART_DEVICES"); ok {
slog.Info("SMART_DEVICES", "value", configuredRaw)
config := strings.TrimSpace(configuredRaw) config := strings.TrimSpace(configuredRaw)
if config == "" { if config == "" {
return errNoValidSmartData return errNoValidSmartData
} }
slog.Info("SMART_DEVICES", "config", config)
parsedDevices, err := sm.parseConfiguredDevices(config) parsedDevices, err := sm.parseConfiguredDevices(config)
if err != nil { if err != nil {
@@ -174,8 +175,10 @@ func (sm *SmartManager) ScanDevices(force bool) error {
finalDevices := mergeDeviceLists(scannedDevices, configuredDevices) finalDevices := mergeDeviceLists(scannedDevices, configuredDevices)
sm.updateSmartDevices(finalDevices) sm.updateSmartDevices(finalDevices)
if len(finalDevices) == 0 { if len(finalDevices) == 0 {
if scanErr != nil { if scanErr != nil {
slog.Debug("smartctl scan failed", "err", scanErr)
return scanErr return scanErr
} }
return errNoValidSmartData return errNoValidSmartData
@@ -197,7 +200,7 @@ func (sm *SmartManager) parseConfiguredDevices(config string) ([]*DeviceInfo, er
name := strings.TrimSpace(parts[0]) name := strings.TrimSpace(parts[0])
if name == "" { if name == "" {
return nil, fmt.Errorf("invalid SMART_DEVICES entry %q: device name is required", entry) return nil, fmt.Errorf("invalid SMART_DEVICES entry %q", entry)
} }
devType := "" devType := ""
@@ -269,9 +272,12 @@ func (sm *SmartManager) parseSmartOutput(deviceInfo *DeviceInfo, output []byte)
deviceInfo.Type = parser.Type deviceInfo.Type = parser.Type
} }
return true return true
} else {
slog.Debug("parser failed", "device", deviceInfo.Name, "parser", parser.Type)
} }
} }
slog.Debug("parsing failed", "device", deviceInfo.Name)
return false return false
} }
@@ -283,10 +289,12 @@ func (sm *SmartManager) parseSmartOutput(deviceInfo *DeviceInfo, output []byte)
// Uses -n standby to avoid waking up sleeping disks, but bypasses standby mode // Uses -n standby to avoid waking up sleeping disks, but bypasses standby mode
// for initial data collection when no cached data exists // for initial data collection when no cached data exists
func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error { func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
// slog.Info("collecting SMART data", "device", deviceInfo.Name, "type", deviceInfo.Type, "has_existing_data", sm.hasDataForDevice(deviceInfo.Name))
// Check if we have any existing data for this device // Check if we have any existing data for this device
hasExistingData := sm.hasDataForDevice(deviceInfo.Name) hasExistingData := sm.hasDataForDevice(deviceInfo.Name)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel() defer cancel()
// Try with -n standby first if we have existing data // Try with -n standby first if we have existing data
@@ -298,12 +306,10 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 2 { if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 2 {
if hasExistingData { if hasExistingData {
// Device is in standby and we have cached data, keep using cache // Device is in standby and we have cached data, keep using cache
slog.Debug("device in standby mode, using cached data", "device", deviceInfo.Name)
return nil return nil
} }
// No cached data, need to collect initial data by bypassing standby // No cached data, need to collect initial data by bypassing standby
slog.Debug("device in standby but no cached data, collecting initial data", "device", deviceInfo.Name) ctx2, cancel2 := context.WithTimeout(context.Background(), 2*time.Second)
ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel2() defer cancel2()
args = sm.smartctlArgs(deviceInfo, false) args = sm.smartctlArgs(deviceInfo, false)
cmd = exec.CommandContext(ctx2, "smartctl", args...) cmd = exec.CommandContext(ctx2, "smartctl", args...)
@@ -314,10 +320,13 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
if !hasValidData { if !hasValidData {
if err != nil { if err != nil {
slog.Debug("smartctl failed", "device", deviceInfo.Name, "err", err)
return err return err
} }
slog.Debug("no valid SMART data found", "device", deviceInfo.Name)
return errNoValidSmartData return errNoValidSmartData
} }
return nil return nil
} }
@@ -362,17 +371,17 @@ func (sm *SmartManager) parseScan(output []byte) ([]*DeviceInfo, bool) {
scan := &scanOutput{} scan := &scanOutput{}
if err := json.Unmarshal(output, scan); err != nil { if err := json.Unmarshal(output, scan); err != nil {
slog.Debug("Failed to parse smartctl scan JSON", "err", err)
return nil, false return nil, false
} }
if len(scan.Devices) == 0 { if len(scan.Devices) == 0 {
slog.Debug("no devices found in smartctl scan")
return nil, false return nil, false
} }
devices := make([]*DeviceInfo, 0, len(scan.Devices)) devices := make([]*DeviceInfo, 0, len(scan.Devices))
for _, device := range scan.Devices { for _, device := range scan.Devices {
// slog.Info("found device during scan", "name", device.Name, "type", device.Type, "protocol", device.Protocol) slog.Debug("smartctl scan", "name", device.Name, "type", device.Type, "protocol", device.Protocol)
devices = append(devices, &DeviceInfo{ devices = append(devices, &DeviceInfo{
Name: device.Name, Name: device.Name,
Type: device.Type, Type: device.Type,
@@ -452,7 +461,7 @@ func (sm *SmartManager) updateSmartDevices(devices []*DeviceInfo) {
for key, data := range sm.SmartDataMap { for key, data := range sm.SmartDataMap {
if data == nil { if data == nil {
delete(sm.SmartDataMap, key) delete(sm.SmartDataMap, key)
continue continue
} }
if _, ok := validNames[data.DiskName]; ok { if _, ok := validNames[data.DiskName]; ok {
@@ -496,13 +505,13 @@ func (sm *SmartManager) parseSmartForSata(output []byte) (bool, int) {
} }
if data.SerialNumber == "" { if data.SerialNumber == "" {
slog.Debug("device has no serial number, skipping", "device", data.Device.Name) slog.Debug("no serial number", "device", data.Device.Name)
return false, data.Smartctl.ExitStatus return false, data.Smartctl.ExitStatus
} }
// Skip virtual devices (e.g., Kubernetes PVCs, QEMU, VirtualBox, etc.) // Skip virtual devices (e.g., Kubernetes PVCs, QEMU, VirtualBox, etc.)
if sm.isVirtualDevice(&data) { if sm.isVirtualDevice(&data) {
slog.Debug("skipping virtual device", "device", data.Device.Name, "model", data.ModelName) slog.Debug("skipping smart", "device", data.Device.Name, "model", data.ModelName)
return false, data.Smartctl.ExitStatus return false, data.Smartctl.ExitStatus
} }
@@ -566,7 +575,7 @@ func (sm *SmartManager) parseSmartForScsi(output []byte) (bool, int) {
} }
if data.SerialNumber == "" { if data.SerialNumber == "" {
slog.Debug("scsi device has no serial number, skipping", "device", data.Device.Name) slog.Debug("no serial number", "device", data.Device.Name)
return false, data.Smartctl.ExitStatus return false, data.Smartctl.ExitStatus
} }
@@ -633,7 +642,6 @@ func parseScsiGigabytesProcessed(value string) int64 {
normalized := strings.ReplaceAll(value, ",", "") normalized := strings.ReplaceAll(value, ",", "")
parsed, err := strconv.ParseInt(normalized, 10, 64) parsed, err := strconv.ParseInt(normalized, 10, 64)
if err != nil { if err != nil {
slog.Debug("failed to parse SCSI gigabytes processed", "value", value, "err", err)
return -1 return -1
} }
return parsed return parsed
@@ -649,7 +657,7 @@ func (sm *SmartManager) parseSmartForNvme(output []byte) (bool, int) {
} }
if data.SerialNumber == "" { if data.SerialNumber == "" {
slog.Debug("device has no serial number, skipping", "device", data.Device.Name) slog.Debug("no serial number", "device", data.Device.Name)
return false, data.Smartctl.ExitStatus return false, data.Smartctl.ExitStatus
} }
@@ -705,9 +713,11 @@ func (sm *SmartManager) parseSmartForNvme(output []byte) (bool, int) {
// detectSmartctl checks if smartctl is installed, returns an error if not // detectSmartctl checks if smartctl is installed, returns an error if not
func (sm *SmartManager) detectSmartctl() error { func (sm *SmartManager) detectSmartctl() error {
if _, err := exec.LookPath("smartctl"); err == nil { if _, err := exec.LookPath("smartctl"); err == nil {
slog.Debug("smartctl found")
return nil return nil
} }
return fmt.Errorf("smartctl not found") slog.Debug("smartctl not found")
return errors.New("smartctl not found")
} }
// NewSmartManager creates and initializes a new SmartManager // NewSmartManager creates and initializes a new SmartManager