Compare commits

..

3 Commits

Author SHA1 Message Date
henrygd
ea354ec030 initial nvml collector 2025-12-21 14:33:04 -05:00
henrygd
f6ab5f2af1 refactor: rm diskinfo abstraction from smart-table.tsx 2025-12-21 12:25:12 -05:00
Sven van Ginkel
7d943633a3 fix capacity sorting in smart table (#1551) 2025-12-21 12:21:44 -05:00
6 changed files with 203 additions and 169 deletions

View File

@@ -2,8 +2,6 @@ package agent
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
"unsafe"
@@ -75,10 +73,9 @@ type nvmlCollector struct {
func (c *nvmlCollector) init() error {
slog.Info("NVML: Initializing")
libPath := "libnvidia-ml.so.1"
libPath := getNVMLPath()
// Check for standard locations if necessary, but purego/dlopen usually handles this
lib, err := purego.Dlopen(libPath, purego.RTLD_NOW|purego.RTLD_GLOBAL)
lib, err := openLibrary(libPath)
if err != nil {
return fmt.Errorf("failed to load %s: %w", libPath, err)
}
@@ -90,8 +87,7 @@ func (c *nvmlCollector) init() error {
purego.RegisterLibFunc(&nvmlDeviceGetHandleByIndex, lib, "nvmlDeviceGetHandleByIndex")
purego.RegisterLibFunc(&nvmlDeviceGetName, lib, "nvmlDeviceGetName")
// Try to get v2 memory info, fallback to v1 if not available
_, err = purego.Dlsym(lib, "nvmlDeviceGetMemoryInfo_v2")
if err == nil {
if hasSymbol(lib, "nvmlDeviceGetMemoryInfo_v2") {
c.isV2 = true
purego.RegisterLibFunc(&nvmlDeviceGetMemoryInfo, lib, "nvmlDeviceGetMemoryInfo_v2")
} else {
@@ -204,34 +200,3 @@ func (c *nvmlCollector) collect() {
slog.Info("NVML: Collected data", "gpu", gpu)
}
}
// isGPUActive reports whether the GPU at PCI address bdf appears to be awake,
// judged from sysfs entries under /sys/bus/pci/devices/<bdf>.
//
// The GPU counts as inactive when power/runtime_status is readable and is
// neither "active" nor "resuming", or when the first matching
// drm/card*/device/power_state file is readable and is not "D0". Anything that
// cannot be read is treated as active.
func (c *nvmlCollector) isGPUActive(bdf string) bool {
	// Runtime status check.
	raw, err := os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "power/runtime_status"))
	if err != nil {
		slog.Info("NVML: Can't read runtime_status", "bdf", bdf, "err", err)
		return true // Assume active if we can't read status
	}
	switch runtimeStatus := strings.TrimSpace(string(raw)); runtimeStatus {
	case "active", "resuming":
		// Awake as far as runtime status is concerned; continue with the D0 check.
	default:
		slog.Info("NVML: GPU is not active", "bdf", bdf, "status", runtimeStatus)
		return false
	}

	// Power state (D0) check against the first drm card entry, if any.
	// The Glob error is dropped: no match and a failed match are both
	// treated as "nothing to check".
	candidates, _ := filepath.Glob(filepath.Join("/sys/bus/pci/devices", bdf, "drm/card*/device/power_state"))
	if len(candidates) == 0 {
		return true
	}
	data, err := os.ReadFile(candidates[0])
	if err != nil {
		return true
	}
	return strings.TrimSpace(string(data)) == "D0"
}

56
agent/gpu_nvml_linux.go Normal file
View File

@@ -0,0 +1,56 @@
//go:build linux
package agent
import (
"os"
"path/filepath"
"strings"
"github.com/ebitengine/purego"
"golang.org/x/exp/slog"
)
// openLibrary loads the shared library called name through purego.Dlopen,
// passing RTLD_NOW|RTLD_GLOBAL, and returns the resulting handle and error
// unchanged.
func openLibrary(name string) (uintptr, error) {
	mode := purego.RTLD_NOW | purego.RTLD_GLOBAL
	return purego.Dlopen(name, mode)
}
// getNVMLPath returns the library name handed to openLibrary on Linux.
func getNVMLPath() string {
	const nvmlLibrary = "libnvidia-ml.so.1"
	return nvmlLibrary
}
// hasSymbol reports whether purego.Dlsym can resolve symbol in the library
// identified by lib. Any lookup error is reported as false.
func hasSymbol(lib uintptr, symbol string) bool {
	if _, err := purego.Dlsym(lib, symbol); err != nil {
		return false
	}
	return true
}
// isGPUActive reports whether the GPU at PCI address bdf appears to be awake,
// judged from sysfs entries under /sys/bus/pci/devices/<bdf>.
//
// The GPU counts as inactive when power/runtime_status is readable and is
// neither "active" nor "resuming", or when the first matching
// drm/card*/device/power_state file is readable and is not "D0". Anything that
// cannot be read is treated as active.
func (c *nvmlCollector) isGPUActive(bdf string) bool {
	// Runtime status check.
	raw, err := os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "power/runtime_status"))
	if err != nil {
		slog.Info("NVML: Can't read runtime_status", "bdf", bdf, "err", err)
		return true // Assume active if we can't read status
	}
	switch runtimeStatus := strings.TrimSpace(string(raw)); runtimeStatus {
	case "active", "resuming":
		// Awake as far as runtime status is concerned; continue with the D0 check.
	default:
		slog.Info("NVML: GPU is not active", "bdf", bdf, "status", runtimeStatus)
		return false
	}

	// Power state (D0) check against the first drm card entry, if any.
	// The Glob error is dropped: no match and a failed match are both
	// treated as "nothing to check".
	candidates, _ := filepath.Glob(filepath.Join("/sys/bus/pci/devices", bdf, "drm/card*/device/power_state"))
	if len(candidates) == 0 {
		return true
	}
	data, err := os.ReadFile(candidates[0])
	if err != nil {
		return true
	}
	return strings.TrimSpace(string(data)) == "D0"
}

View File

@@ -0,0 +1,21 @@
//go:build !linux && !windows
package agent
import "fmt"
// openLibrary is the stub used when the build is neither Linux nor Windows.
// It never loads anything: the handle is always 0 and the error is always
// non-nil.
func openLibrary(name string) (uintptr, error) {
	var noHandle uintptr
	unsupported := fmt.Errorf("nvml not supported on this platform")
	return noHandle, unsupported
}
// getNVMLPath returns the empty string in this build, which is selected when
// the target is neither Linux nor Windows.
func getNVMLPath() string {
	const noLibrary = ""
	return noLibrary
}
// hasSymbol always reports false in this build; lib and symbol are ignored.
func hasSymbol(lib uintptr, symbol string) bool {
	const resolved = false
	return resolved
}
// isGPUActive always reports true in this build; bdf is ignored and no
// power-state probing is performed.
func (c *nvmlCollector) isGPUActive(bdf string) bool {
	const active = true
	return active
}

25
agent/gpu_nvml_windows.go Normal file
View File

@@ -0,0 +1,25 @@
//go:build windows
package agent
import (
"golang.org/x/sys/windows"
)
// openLibrary loads the DLL called name with windows.LoadLibrary and returns
// its handle converted to uintptr, together with the load error (if any).
func openLibrary(name string) (uintptr, error) {
	dll, loadErr := windows.LoadLibrary(name)
	raw := uintptr(dll)
	return raw, loadErr
}
// getNVMLPath returns the library name handed to openLibrary on Windows.
func getNVMLPath() string {
	const nvmlLibrary = "nvml.dll"
	return nvmlLibrary
}
// hasSymbol reports whether windows.GetProcAddress can resolve symbol in the
// module identified by lib. Any lookup error is reported as false.
func hasSymbol(lib uintptr, symbol string) bool {
	module := windows.Handle(lib)
	if _, err := windows.GetProcAddress(module, symbol); err != nil {
		return false
	}
	return true
}
// isGPUActive always reports true on Windows; bdf is ignored and no
// power-state probing is performed.
func (c *nvmlCollector) isGPUActive(bdf string) bool {
	const active = true
	return active
}

2
go.mod
View File

@@ -21,6 +21,7 @@ require (
github.com/stretchr/testify v1.11.1
golang.org/x/crypto v0.45.0
golang.org/x/exp v0.0.0-20251125195548-87e1e737ad39
golang.org/x/sys v0.38.0
gopkg.in/yaml.v3 v3.0.1
)
@@ -57,7 +58,6 @@ require (
golang.org/x/net v0.47.0 // indirect
golang.org/x/oauth2 v0.33.0 // indirect
golang.org/x/sync v0.18.0 // indirect
golang.org/x/sys v0.38.0 // indirect
golang.org/x/term v0.37.0 // indirect
golang.org/x/text v0.31.0 // indirect
howett.net/plist v1.0.1 // indirect

View File

@@ -93,51 +93,15 @@ export const smartColumns: ColumnDef<SmartAttribute>[] = [
},
]
// Row shape used by the disks table. Instances are built from
// SmartDeviceRecord values by convertSmartDeviceRecordToDiskInfo.
export type DiskInfo = {
id: string
// Copied from the record's `system` field
system: string
// Record `name`, or "Unknown" when it is empty
device: string
model: string
// Already formatted for display (via formatCapacity), or "Unknown"
capacity: string
// Record `state`, or "Unknown" when it is empty
status: string
// Record `temp`, or 0 when it is missing
temperature: number
// Record `type`, or "Unknown" when it is empty
deviceType: string
// Record `hours`
powerOnHours?: number
// Record `cycles`
powerCycles?: number
attributes?: SmartAttribute[]
updated: string
}
// Build the capacity label "<value> <unit>" from a byte count.
// toFixedFloat receives 1 when the scaled value is >= 10 and 2 otherwise
// (presumably the number of decimals to keep; confirm against toFixedFloat).
function formatCapacity(bytes: number): string {
	const scaled = formatBytes(bytes)
	const digits = scaled.value >= 10 ? 1 : 2
	return `${toFixedFloat(scaled.value, digits)} ${scaled.unit}`
}
// Map each SmartDeviceRecord onto the DiskInfo shape used by the table.
// Empty text fields become "Unknown", a missing temperature becomes 0, and a
// non-zero capacity is formatted with formatCapacity.
function convertSmartDeviceRecordToDiskInfo(records: SmartDeviceRecord[]): DiskInfo[] {
	const unknown = "Unknown"
	const toDiskInfo = (record: SmartDeviceRecord) => {
		const { id, system, attributes, updated } = record
		return {
			id,
			system,
			device: record.name || unknown,
			model: record.model || unknown,
			serialNumber: record.serial || unknown,
			firmwareVersion: record.firmware || unknown,
			capacity: record.capacity ? formatCapacity(record.capacity) : unknown,
			status: record.state || unknown,
			temperature: record.temp || 0,
			deviceType: record.type || unknown,
			attributes,
			updated,
			powerOnHours: record.hours,
			powerCycles: record.cycles,
		}
	}
	return records.map(toDiskInfo)
}
const SMART_DEVICE_FIELDS = "id,system,name,model,state,capacity,temp,type,hours,cycles,updated"
export const columns: ColumnDef<DiskInfo>[] = [
export const columns: ColumnDef<SmartDeviceRecord>[] = [
{
id: "system",
accessorFn: (record) => record.system,
@@ -154,12 +118,12 @@ export const columns: ColumnDef<DiskInfo>[] = [
},
},
{
accessorKey: "device",
sortingFn: (a, b) => a.original.device.localeCompare(b.original.device),
accessorKey: "name",
sortingFn: (a, b) => a.original.name.localeCompare(b.original.name),
header: ({ column }) => <HeaderButton column={column} name={t`Device`} Icon={HardDrive} />,
cell: ({ row }) => (
<div className="font-medium max-w-40 truncate ms-1.5" title={row.getValue("device")}>
{row.getValue("device")}
cell: ({ getValue }) => (
<div className="font-medium max-w-40 truncate ms-1.5" title={getValue() as string}>
{getValue() as string}
</div>
),
},
@@ -167,19 +131,20 @@ export const columns: ColumnDef<DiskInfo>[] = [
accessorKey: "model",
sortingFn: (a, b) => a.original.model.localeCompare(b.original.model),
header: ({ column }) => <HeaderButton column={column} name={t`Model`} Icon={Box} />,
cell: ({ row }) => (
<div className="max-w-48 truncate ms-1.5" title={row.getValue("model")}>
{row.getValue("model")}
cell: ({ getValue }) => (
<div className="max-w-48 truncate ms-1.5" title={getValue() as string}>
{getValue() as string}
</div>
),
},
{
accessorKey: "capacity",
invertSorting: true,
header: ({ column }) => <HeaderButton column={column} name={t`Capacity`} Icon={BinaryIcon} />,
cell: ({ getValue }) => <span className="ms-1.5">{getValue() as string}</span>,
cell: ({ getValue }) => <span className="ms-1.5">{formatCapacity(getValue() as number)}</span>,
},
{
accessorKey: "status",
accessorKey: "state",
header: ({ column }) => <HeaderButton column={column} name={t`Status`} Icon={Activity} />,
cell: ({ getValue }) => {
const status = getValue() as string
@@ -191,8 +156,8 @@ export const columns: ColumnDef<DiskInfo>[] = [
},
},
{
accessorKey: "deviceType",
sortingFn: (a, b) => a.original.deviceType.localeCompare(b.original.deviceType),
accessorKey: "type",
sortingFn: (a, b) => a.original.type.localeCompare(b.original.type),
header: ({ column }) => <HeaderButton column={column} name={t`Type`} Icon={ArrowLeftRightIcon} />,
cell: ({ getValue }) => (
<div className="ms-1.5">
@@ -203,7 +168,7 @@ export const columns: ColumnDef<DiskInfo>[] = [
),
},
{
accessorKey: "powerOnHours",
accessorKey: "hours",
invertSorting: true,
header: ({ column }) => (
<HeaderButton column={column} name={t({ message: "Power On", comment: "Power On Time" })} Icon={Clock} />
@@ -223,7 +188,7 @@ export const columns: ColumnDef<DiskInfo>[] = [
},
},
{
accessorKey: "powerCycles",
accessorKey: "cycles",
invertSorting: true,
header: ({ column }) => (
<HeaderButton column={column} name={t({ message: "Cycles", comment: "Power Cycles" })} Icon={RotateCwIcon} />
@@ -237,7 +202,7 @@ export const columns: ColumnDef<DiskInfo>[] = [
},
},
{
accessorKey: "temperature",
accessorKey: "temp",
invertSorting: true,
header: ({ column }) => <HeaderButton column={column} name={t`Temp`} Icon={ThermometerIcon} />,
cell: ({ getValue }) => {
@@ -246,14 +211,14 @@ export const columns: ColumnDef<DiskInfo>[] = [
},
},
// {
// accessorKey: "serialNumber",
// sortingFn: (a, b) => a.original.serialNumber.localeCompare(b.original.serialNumber),
// accessorKey: "serial",
// sortingFn: (a, b) => a.original.serial.localeCompare(b.original.serial),
// header: ({ column }) => <HeaderButton column={column} name={t`Serial Number`} Icon={HashIcon} />,
// cell: ({ getValue }) => <span className="ms-1.5">{getValue() as string}</span>,
// },
// {
// accessorKey: "firmwareVersion",
// sortingFn: (a, b) => a.original.firmwareVersion.localeCompare(b.original.firmwareVersion),
// accessorKey: "firmware",
// sortingFn: (a, b) => a.original.firmware.localeCompare(b.original.firmware),
// header: ({ column }) => <HeaderButton column={column} name={t`Firmware`} Icon={CpuIcon} />,
// cell: ({ getValue }) => <span className="ms-1.5">{getValue() as string}</span>,
// },
@@ -272,7 +237,15 @@ export const columns: ColumnDef<DiskInfo>[] = [
},
]
function HeaderButton({ column, name, Icon }: { column: Column<DiskInfo>; name: string; Icon: React.ElementType }) {
function HeaderButton({
column,
name,
Icon,
}: {
column: Column<SmartDeviceRecord>
name: string
Icon: React.ElementType
}) {
const isSorted = column.getIsSorted()
return (
<Button
@@ -290,7 +263,7 @@ function HeaderButton({ column, name, Icon }: { column: Column<DiskInfo>; name:
}
export default function DisksTable({ systemId }: { systemId?: string }) {
const [sorting, setSorting] = useState<SortingState>([{ id: systemId ? "device" : "system", desc: false }])
const [sorting, setSorting] = useState<SortingState>([{ id: systemId ? "name" : "system", desc: false }])
const [columnFilters, setColumnFilters] = useState<ColumnFiltersState>([])
const [rowSelection, setRowSelection] = useState({})
const [smartDevices, setSmartDevices] = useState<SmartDeviceRecord[] | undefined>(undefined)
@@ -299,96 +272,95 @@ export default function DisksTable({ systemId }: { systemId?: string }) {
const [rowActionState, setRowActionState] = useState<{ type: "refresh" | "delete"; id: string } | null>(null)
const [globalFilter, setGlobalFilter] = useState("")
const openSheet = (disk: DiskInfo) => {
const openSheet = (disk: SmartDeviceRecord) => {
setActiveDiskId(disk.id)
setSheetOpen(true)
}
// Fetch smart devices from collection (without attributes to save bandwidth)
const fetchSmartDevices = useCallback(() => {
// Fetch smart devices
useEffect(() => {
const controller = new AbortController()
pb.collection<SmartDeviceRecord>("smart_devices")
.getFullList({
filter: systemId ? pb.filter("system = {:system}", { system: systemId }) : undefined,
fields: SMART_DEVICE_FIELDS,
signal: controller.signal,
})
.then((records) => {
setSmartDevices(records)
.then(setSmartDevices)
.catch((err) => {
if (!err.isAbort) {
setSmartDevices([])
}
})
.catch(() => setSmartDevices([]))
return () => controller.abort()
}, [systemId])
// Fetch smart devices when component mounts or systemId changes
useEffect(() => {
fetchSmartDevices()
}, [fetchSmartDevices])
// Subscribe to live updates so rows add/remove without manual refresh/filtering
// Subscribe to updates
useEffect(() => {
let unsubscribe: (() => void) | undefined
const pbOptions = systemId
? { fields: SMART_DEVICE_FIELDS, filter: pb.filter("system = {:system}", { system: systemId }) }
: { fields: SMART_DEVICE_FIELDS }
;(async () => {
try {
unsubscribe = await pb.collection("smart_devices").subscribe(
"*",
(event) => {
const record = event.record as SmartDeviceRecord
setSmartDevices((currentDevices) => {
const devices = currentDevices ?? []
const matchesSystemScope = !systemId || record.system === systemId
; (async () => {
try {
unsubscribe = await pb.collection("smart_devices").subscribe(
"*",
(event) => {
const record = event.record as SmartDeviceRecord
setSmartDevices((currentDevices) => {
const devices = currentDevices ?? []
const matchesSystemScope = !systemId || record.system === systemId
if (event.action === "delete") {
return devices.filter((device) => device.id !== record.id)
}
if (event.action === "delete") {
return devices.filter((device) => device.id !== record.id)
}
if (!matchesSystemScope) {
// Record moved out of scope; ensure it disappears locally.
return devices.filter((device) => device.id !== record.id)
}
if (!matchesSystemScope) {
// Record moved out of scope; ensure it disappears locally.
return devices.filter((device) => device.id !== record.id)
}
const existingIndex = devices.findIndex((device) => device.id === record.id)
if (existingIndex === -1) {
return [record, ...devices]
}
const existingIndex = devices.findIndex((device) => device.id === record.id)
if (existingIndex === -1) {
return [record, ...devices]
}
const next = [...devices]
next[existingIndex] = record
return next
})
},
pbOptions
)
} catch (error) {
console.error("Failed to subscribe to SMART device updates:", error)
}
})()
const next = [...devices]
next[existingIndex] = record
return next
})
},
pbOptions
)
} catch (error) {
console.error("Failed to subscribe to SMART device updates:", error)
}
})()
return () => {
unsubscribe?.()
}
}, [systemId])
const handleRowRefresh = useCallback(
async (disk: DiskInfo) => {
if (!disk.system) return
setRowActionState({ type: "refresh", id: disk.id })
try {
await pb.send("/api/beszel/smart/refresh", {
method: "POST",
query: { system: disk.system },
})
} catch (error) {
console.error("Failed to refresh SMART device:", error)
} finally {
setRowActionState((state) => (state?.id === disk.id ? null : state))
}
},
[fetchSmartDevices]
)
const handleRowRefresh = useCallback(async (disk: SmartDeviceRecord) => {
if (!disk.system) return
setRowActionState({ type: "refresh", id: disk.id })
try {
await pb.send("/api/beszel/smart/refresh", {
method: "POST",
query: { system: disk.system },
})
} catch (error) {
console.error("Failed to refresh SMART device:", error)
} finally {
setRowActionState((state) => (state?.id === disk.id ? null : state))
}
}, [])
const handleDeleteDevice = useCallback(async (disk: DiskInfo) => {
const handleDeleteDevice = useCallback(async (disk: SmartDeviceRecord) => {
setRowActionState({ type: "delete", id: disk.id })
try {
await pb.collection("smart_devices").delete(disk.id)
@@ -400,7 +372,7 @@ export default function DisksTable({ systemId }: { systemId?: string }) {
}
}, [])
const actionColumn = useMemo<ColumnDef<DiskInfo>>(
const actionColumn = useMemo<ColumnDef<SmartDeviceRecord>>(
() => ({
id: "actions",
enableSorting: false,
@@ -468,13 +440,8 @@ export default function DisksTable({ systemId }: { systemId?: string }) {
return [...baseColumns, actionColumn]
}, [systemId, actionColumn])
// Convert SmartDeviceRecord to DiskInfo
const diskData = useMemo(() => {
return smartDevices ? convertSmartDeviceRecordToDiskInfo(smartDevices) : []
}, [smartDevices])
const table = useReactTable({
data: diskData,
data: smartDevices || ([] as SmartDeviceRecord[]),
columns: tableColumns,
onSortingChange: setSorting,
onColumnFiltersChange: setColumnFilters,
@@ -492,10 +459,10 @@ export default function DisksTable({ systemId }: { systemId?: string }) {
globalFilterFn: (row, _columnId, filterValue) => {
const disk = row.original
const systemName = $allSystemsById.get()[disk.system]?.name ?? ""
const device = disk.device ?? ""
const device = disk.name ?? ""
const model = disk.model ?? ""
const status = disk.status ?? ""
const type = disk.deviceType ?? ""
const status = disk.state ?? ""
const type = disk.type ?? ""
const searchString = `${systemName} ${device} ${model} ${status} ${type}`.toLowerCase()
return (filterValue as string)
.toLowerCase()
@@ -505,7 +472,7 @@ export default function DisksTable({ systemId }: { systemId?: string }) {
})
// Hide the table on system pages if there's no data, but always show on global page
if (systemId && !diskData.length && !columnFilters.length) {
if (systemId && !smartDevices?.length && !columnFilters.length) {
return null
}