From 6a562ce03b50e1ad5d088bf010c78a828d2db869 Mon Sep 17 00:00:00 2001 From: henrygd Date: Sat, 1 Nov 2025 12:57:48 -0400 Subject: [PATCH] add more cpu metrics (#1356) - adds monitoring for cpu state time and per-core usage Co-authored-by: Sven van Ginkel --- agent/battery/battery.go | 2 +- agent/cpu.go | 92 +++++++-- agent/system.go | 20 +- internal/entities/system/system.go | 26 ++- .../0_collections_snapshot_0_14_1.go | 4 +- internal/records/records.go | 43 ++++ .../site/src/components/charts/area-chart.tsx | 39 ++-- internal/site/src/components/charts/hooks.ts | 2 +- .../site/src/components/routes/system.tsx | 16 +- .../components/routes/system/cpu-sheet.tsx | 195 ++++++++++++++++++ .../routes/system/network-sheet.tsx | 2 +- internal/site/src/index.css | 6 +- internal/site/src/lib/utils.ts | 2 +- internal/site/src/types.d.ts | 4 + 14 files changed, 407 insertions(+), 46 deletions(-) create mode 100644 internal/site/src/components/routes/system/cpu-sheet.tsx diff --git a/agent/battery/battery.go b/agent/battery/battery.go index 28644652..bd6d494c 100644 --- a/agent/battery/battery.go +++ b/agent/battery/battery.go @@ -56,7 +56,7 @@ func GetBatteryStats() (batteryPercent uint8, batteryState uint8, err error) { // if there were some errors, like missing data, skip it continue } - if bat.Full == 0 { + if bat == nil || bat.Full == 0 { // skip batteries with no capacity. Charge is unlikely to ever be zero, but // we can't guarantee that, so don't skip based on charge. continue diff --git a/agent/cpu.go b/agent/cpu.go index bd4afc21..4ed52f7d 100644 --- a/agent/cpu.go +++ b/agent/cpu.go @@ -4,10 +4,12 @@ import ( "math" "runtime" + "github.com/henrygd/beszel/internal/entities/system" "github.com/shirou/gopsutil/v4/cpu" ) var lastCpuTimes = make(map[uint16]cpu.TimesStat) +var lastPerCoreCpuTimes = make(map[uint16][]cpu.TimesStat) // init initializes the CPU monitoring by storing the initial CPU times // for the default 60-second cache interval. @@ -15,23 +17,92 @@ func init() { if times, err := cpu.Times(false); err == nil { lastCpuTimes[60000] = times[0] } + if perCoreTimes, err := cpu.Times(true); err == nil { + lastPerCoreCpuTimes[60000] = perCoreTimes + } } -// getCpuPercent calculates the CPU usage percentage using cached previous measurements. -// It uses the specified cache time interval to determine the time window for calculation. -// Returns the CPU usage percentage (0-100) and any error encountered. -func getCpuPercent(cacheTimeMs uint16) (float64, error) { +// CpuMetrics contains detailed CPU usage breakdown +type CpuMetrics struct { + Total float64 + User float64 + System float64 + Iowait float64 + Steal float64 + Idle float64 +} + +// getCpuMetrics calculates detailed CPU usage metrics using cached previous measurements. +// It returns percentages for total, user, system, iowait, and steal time. +func getCpuMetrics(cacheTimeMs uint16) (CpuMetrics, error) { times, err := cpu.Times(false) if err != nil || len(times) == 0 { - return 0, err + return CpuMetrics{}, err } // if cacheTimeMs is not in lastCpuTimes, use 60000 as fallback lastCpuTime if _, ok := lastCpuTimes[cacheTimeMs]; !ok { lastCpuTimes[cacheTimeMs] = lastCpuTimes[60000] } - delta := calculateBusy(lastCpuTimes[cacheTimeMs], times[0]) + + t1 := lastCpuTimes[cacheTimeMs] + t2 := times[0] + + t1All, _ := getAllBusy(t1) + t2All, _ := getAllBusy(t2) + + totalDelta := t2All - t1All + if totalDelta <= 0 { + return CpuMetrics{}, nil + } + + metrics := CpuMetrics{ + Total: calculateBusy(t1, t2), + User: clampPercent((t2.User - t1.User) / totalDelta * 100), + System: clampPercent((t2.System - t1.System) / totalDelta * 100), + Iowait: clampPercent((t2.Iowait - t1.Iowait) / totalDelta * 100), + Steal: clampPercent((t2.Steal - t1.Steal) / totalDelta * 100), + Idle: clampPercent((t2.Idle - t1.Idle) / totalDelta * 100), + } + lastCpuTimes[cacheTimeMs] = times[0] - return delta, nil + return metrics, nil +} + +// clampPercent ensures the percentage is between 0 and 100 +func clampPercent(value float64) float64 { + return math.Min(100, math.Max(0, value)) +} + +// getPerCoreCpuUsage calculates per-core CPU busy usage as integer percentages (0-100). +// It uses cached previous measurements for the provided cache interval. +func getPerCoreCpuUsage(cacheTimeMs uint16) (system.Uint8Slice, error) { + perCoreTimes, err := cpu.Times(true) + if err != nil || len(perCoreTimes) == 0 { + return nil, err + } + + // Initialize cache if needed + if _, ok := lastPerCoreCpuTimes[cacheTimeMs]; !ok { + lastPerCoreCpuTimes[cacheTimeMs] = lastPerCoreCpuTimes[60000] + } + + lastTimes := lastPerCoreCpuTimes[cacheTimeMs] + + // Limit to the number of cores available in both samples + length := len(perCoreTimes) + if len(lastTimes) < length { + length = len(lastTimes) + } + + usage := make([]uint8, length) + for i := 0; i < length; i++ { + t1 := lastTimes[i] + t2 := perCoreTimes[i] + usage[i] = uint8(math.Round(calculateBusy(t1, t2))) + } + + lastPerCoreCpuTimes[cacheTimeMs] = perCoreTimes + return usage, nil } // calculateBusy calculates the CPU busy percentage between two time points. @@ -41,13 +112,10 @@ func calculateBusy(t1, t2 cpu.TimesStat) float64 { t1All, t1Busy := getAllBusy(t1) t2All, t2Busy := getAllBusy(t2) - if t2Busy <= t1Busy { + if t2All <= t1All || t2Busy <= t1Busy { return 0 } - if t2All <= t1All { - return 100 - } - return math.Min(100, math.Max(0, (t2Busy-t1Busy)/(t2All-t1All)*100)) + return clampPercent((t2Busy - t1Busy) / (t2All - t1All) * 100) } // getAllBusy calculates the total CPU time and busy CPU time from CPU times statistics. diff --git a/agent/system.go b/agent/system.go index dad48ed2..f6b3c7d7 100644 --- a/agent/system.go +++ b/agent/system.go @@ -83,12 +83,24 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats { systemStats.Battery[1] = batteryState } - // cpu percent - cpuPercent, err := getCpuPercent(cacheTimeMs) + // cpu metrics + cpuMetrics, err := getCpuMetrics(cacheTimeMs) if err == nil { - systemStats.Cpu = twoDecimals(cpuPercent) + systemStats.Cpu = twoDecimals(cpuMetrics.Total) + systemStats.CpuBreakdown = []float64{ + twoDecimals(cpuMetrics.User), + twoDecimals(cpuMetrics.System), + twoDecimals(cpuMetrics.Iowait), + twoDecimals(cpuMetrics.Steal), + twoDecimals(cpuMetrics.Idle), + } } else { - slog.Error("Error getting cpu percent", "err", err) + slog.Error("Error getting cpu metrics", "err", err) + } + + // per-core cpu usage + if perCoreUsage, err := getPerCoreCpuUsage(cacheTimeMs); err == nil { + systemStats.CpuCoresUsage = perCoreUsage } // load average diff --git a/internal/entities/system/system.go b/internal/entities/system/system.go index 5177bbfb..cf9df3bd 100644 --- a/internal/entities/system/system.go +++ b/internal/entities/system/system.go @@ -3,6 +3,7 @@ package system // TODO: this is confusing, make common package with common/types common/helpers etc import ( + "encoding/json" "time" "github.com/henrygd/beszel/internal/entities/container" @@ -41,9 +42,28 @@ type Stats struct { LoadAvg [3]float64 `json:"la,omitempty" cbor:"28,keyasint"` Battery [2]uint8 `json:"bat,omitzero" cbor:"29,keyasint,omitzero"` // [percent, charge state, current] MaxMem float64 `json:"mm,omitempty" cbor:"30,keyasint,omitempty"` - NetworkInterfaces map[string][4]uint64 `json:"ni,omitempty" cbor:"31,keyasint,omitempty"` // [upload bytes, download bytes, total upload, total download] - DiskIO [2]uint64 `json:"dio,omitzero" cbor:"32,keyasint,omitzero"` // [read bytes, write bytes] - MaxDiskIO [2]uint64 `json:"diom,omitzero" cbor:"-"` // [max read bytes, max write bytes] + NetworkInterfaces map[string][4]uint64 `json:"ni,omitempty" cbor:"31,keyasint,omitempty"` // [upload bytes, download bytes, total upload, total download] + DiskIO [2]uint64 `json:"dio,omitzero" cbor:"32,keyasint,omitzero"` // [read bytes, write bytes] + MaxDiskIO [2]uint64 `json:"diom,omitzero" cbor:"-"` // [max read bytes, max write bytes] + CpuBreakdown []float64 `json:"cpub,omitempty" cbor:"33,keyasint,omitempty"` // [user, system, iowait, steal, idle] + CpuCoresUsage Uint8Slice `json:"cpus,omitempty" cbor:"34,keyasint,omitempty"` // per-core busy usage [CPU0..] +} + +// Uint8Slice wraps []uint8 to customize JSON encoding while keeping CBOR efficient. +// JSON: encodes as array of numbers (avoids base64 string). +// CBOR: falls back to default handling for []uint8 (byte string), keeping payload small. +type Uint8Slice []uint8 + +func (s Uint8Slice) MarshalJSON() ([]byte, error) { + if s == nil { + return []byte("null"), nil + } + // Convert to wider ints to force array-of-numbers encoding. + arr := make([]uint16, len(s)) + for i, v := range s { + arr[i] = uint16(v) + } + return json.Marshal(arr) } type GPUData struct { diff --git a/internal/migrations/0_collections_snapshot_0_14_1.go b/internal/migrations/0_collections_snapshot_0_14_1.go index f95a6b35..b04c4c94 100644 --- a/internal/migrations/0_collections_snapshot_0_14_1.go +++ b/internal/migrations/0_collections_snapshot_0_14_1.go @@ -718,7 +718,9 @@ func init() { "type": "autodate" } ], - "indexes": [], + "indexes": [ + "CREATE INDEX ` + "`" + `idx_systems_status` + "`" + ` ON ` + "`" + `systems` + "`" + ` (` + "`" + `status` + "`" + `)" + ], "system": false }, { diff --git a/internal/records/records.go b/internal/records/records.go index 5da41c4e..173f3c76 100644 --- a/internal/records/records.go +++ b/internal/records/records.go @@ -177,6 +177,10 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) * stats := &tempStats // necessary because uint8 is not big enough for the sum batterySum := 0 + // accumulate per-core usage across records + var cpuCoresSums []uint64 + // accumulate cpu breakdown [user, system, iowait, steal, idle] + var cpuBreakdownSums []float64 count := float64(len(records)) tempCount := float64(0) @@ -194,6 +198,15 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) * } sum.Cpu += stats.Cpu + // accumulate cpu time breakdowns if present + if stats.CpuBreakdown != nil { + if len(cpuBreakdownSums) < len(stats.CpuBreakdown) { + cpuBreakdownSums = append(cpuBreakdownSums, make([]float64, len(stats.CpuBreakdown)-len(cpuBreakdownSums))...) + } + for i, v := range stats.CpuBreakdown { + cpuBreakdownSums[i] += v + } + } sum.Mem += stats.Mem sum.MemUsed += stats.MemUsed sum.MemPct += stats.MemPct @@ -217,6 +230,17 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) * sum.DiskIO[1] += stats.DiskIO[1] batterySum += int(stats.Battery[0]) sum.Battery[1] = stats.Battery[1] + + // accumulate per-core usage if present + if stats.CpuCoresUsage != nil { + if len(cpuCoresSums) < len(stats.CpuCoresUsage) { + // extend slices to accommodate core count + cpuCoresSums = append(cpuCoresSums, make([]uint64, len(stats.CpuCoresUsage)-len(cpuCoresSums))...) + } + for i, v := range stats.CpuCoresUsage { + cpuCoresSums[i] += uint64(v) + } + } // Set peak values sum.MaxCpu = max(sum.MaxCpu, stats.MaxCpu, stats.Cpu) sum.MaxMem = max(sum.MaxMem, stats.MaxMem, stats.MemUsed) @@ -385,6 +409,25 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) * sum.GPUData[id] = gpu } } + + // Average per-core usage + if len(cpuCoresSums) > 0 { + avg := make(system.Uint8Slice, len(cpuCoresSums)) + for i := range cpuCoresSums { + v := math.Round(float64(cpuCoresSums[i]) / count) + avg[i] = uint8(v) + } + sum.CpuCoresUsage = avg + } + + // Average CPU breakdown + if len(cpuBreakdownSums) > 0 { + avg := make([]float64, len(cpuBreakdownSums)) + for i := range cpuBreakdownSums { + avg[i] = twoDecimals(cpuBreakdownSums[i] / count) + } + sum.CpuBreakdown = avg + } } return sum diff --git a/internal/site/src/components/charts/area-chart.tsx b/internal/site/src/components/charts/area-chart.tsx index c6f00875..ef4ffdd9 100644 --- a/internal/site/src/components/charts/area-chart.tsx +++ b/internal/site/src/components/charts/area-chart.tsx @@ -17,6 +17,7 @@ export type DataPoint = { dataKey: (data: SystemStatsRecord) => number | undefined color: number | string opacity: number + stackId?: string | number } export default function AreaChartDefault({ @@ -29,19 +30,23 @@ export default function AreaChartDefault({ domain, legend, itemSorter, + reverseStackOrder = false, + hideYAxis = false, }: // logRender = false, -{ - chartData: ChartData - max?: number - maxToggled?: boolean - tickFormatter: (value: number, index: number) => string - contentFormatter: ({ value, payload }: { value: number; payload: SystemStatsRecord }) => string - dataPoints?: DataPoint[] - domain?: [number, number] - legend?: boolean - itemSorter?: (a: any, b: any) => number - // logRender?: boolean -}) { + { + chartData: ChartData + max?: number + maxToggled?: boolean + tickFormatter: (value: number, index: number) => string + contentFormatter: ({ value, payload }: { value: number; payload: SystemStatsRecord }) => string + dataPoints?: DataPoint[] + domain?: [number, number] + legend?: boolean + itemSorter?: (a: any, b: any) => number + reverseStackOrder?: boolean + hideYAxis?: boolean + // logRender?: boolean + }) { const { yAxisWidth, updateYAxisWidth } = useYAxisWidth() // biome-ignore lint/correctness/useExhaustiveDependencies: ignore @@ -56,12 +61,13 @@ export default function AreaChartDefault({
- + - updateYAxisWidth(tickFormatter(value, index))} tickLine={false} axisLine={false} - /> + />} {xAxis(chartData)} ) })} diff --git a/internal/site/src/components/charts/hooks.ts b/internal/site/src/components/charts/hooks.ts index 8bb75076..c619ffb8 100644 --- a/internal/site/src/components/charts/hooks.ts +++ b/internal/site/src/components/charts/hooks.ts @@ -69,7 +69,7 @@ export function useContainerChartConfigs(containerData: ChartData["containerData const hue = ((i * 360) / count) % 360 chartConfig[containerName] = { label: containerName, - color: `hsl(${hue}, 60%, 55%)`, + color: `hsl(${hue}, var(--chart-saturation), var(--chart-lightness))`, } } diff --git a/internal/site/src/components/routes/system.tsx b/internal/site/src/components/routes/system.tsx index 19e6a541..c46e6427 100644 --- a/internal/site/src/components/routes/system.tsx +++ b/internal/site/src/components/routes/system.tsx @@ -1,5 +1,5 @@ import { t } from "@lingui/core/macro" -import { Plural, Trans, useLingui } from "@lingui/react/macro" +import { Trans, useLingui } from "@lingui/react/macro" import { useStore } from "@nanostores/react" import { getPagePath } from "@nanostores/router" import { timeTicks } from "d3-time" @@ -73,6 +73,7 @@ import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from ". import { Separator } from "../ui/separator" import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "../ui/tooltip" import NetworkSheet from "./system/network-sheet" +import CpuCoresSheet from "./system/cpu-sheet" import LineChartDefault from "../charts/line-chart" @@ -97,8 +98,8 @@ function getTimeData(chartTime: ChartTimes, lastCreated: number) { } } - const buffer = chartTime === "1m" ? 400 : 20_000 - const now = new Date(Date.now() + buffer) + // const buffer = chartTime === "1m" ? 400 : 20_000 + const now = new Date(Date.now()) const startTime = chartTimeData[chartTime].getOffset(now) const ticks = timeTicks(startTime, now, chartTimeData[chartTime].ticks ?? 12).map((date) => date.getTime()) const data = { @@ -585,7 +586,12 @@ export default memo(function SystemDetail({ id }: { id: string }) { grid={grid} title={t`CPU Usage`} description={t`Average system-wide CPU utilization`} - cornerEl={maxValSelect} + cornerEl={ +
+ {maxValSelect} + +
+ } > {description} {cornerEl &&
{cornerEl}
} -
+
{ compareSemVer(chartData.agentVersion, minAgentVersion) >= 0, [chartData.agentVersion]) + + if (!supportsBreakdown) { + return null + } + + if (cpuCoresOpen && !hasOpened.current) { + hasOpened.current = true + } + + // Latest stats snapshot + const latest = chartData.systemStats.at(-1)?.stats + const cpus = latest?.cpus ?? [] + const numCores = cpus.length + const hasBreakdown = (latest?.cpub?.length ?? 0) > 0 + + const breakdownDataPoints = [ + { + label: t`Other`, + dataKey: ({ stats }: SystemStatsRecord) => { + const total = stats?.cpub?.reduce((acc, curr) => acc + curr, 0) ?? 0 + return total > 0 ? 100 - total : null + }, + color: `hsl(80, 65%, 52%)`, + opacity: 0.4, + stackId: "a" + }, + { + label: "Steal", + dataKey: ({ stats }: SystemStatsRecord) => stats?.cpub?.[3], + color: 5, + opacity: 0.4, + stackId: "a" + }, + { + label: "Idle", + dataKey: ({ stats }: SystemStatsRecord) => stats?.cpub?.[4], + color: 2, + opacity: 0.4, + stackId: "a" + }, + { + label: "IOWait", + dataKey: ({ stats }: SystemStatsRecord) => stats?.cpub?.[2], + color: 4, + opacity: 0.4, + stackId: "a" + }, + { + label: "User", + dataKey: ({ stats }: SystemStatsRecord) => stats?.cpub?.[0], + color: 1, + opacity: 0.4, + stackId: "a" + }, + { + label: "System", + dataKey: ({ stats }: SystemStatsRecord) => stats?.cpub?.[1], + color: 3, + opacity: 0.4, + stackId: "a" + }, + ] as DataPoint[] + + + return ( + + {t`CPU Usage`} + + + + {hasOpened.current && ( + + + {hasBreakdown && ( + + `${toFixedFloat(val, 2)}%`} + contentFormatter={({ value }) => `${decimalString(value)}%`} + itemSorter={() => 1} + domain={[0, 100]} + /> + + )} + + {numCores > 0 && ( + + ({ + label: `CPU ${i}`, + dataKey: ({ stats }: SystemStatsRecord) => stats?.cpus?.[i] ?? 1 / (stats?.cpus?.length ?? 1), + color: `hsl(${226 + (((i * 360) / Math.max(1, numCores)) % 360)}, var(--chart-saturation), var(--chart-lightness))`, + opacity: 0.35, + stackId: "a" + }))} + tickFormatter={(val) => `${val}%`} + contentFormatter={({ value }) => `${value}%`} + reverseStackOrder={true} + itemSorter={() => 1} + /> + + )} + + {Array.from({ length: numCores }).map((_, i) => ( + + stats?.cpus?.[i], + color: `hsl(${226 + (((i * 360) / Math.max(1, numCores)) % 360)}, 65%, 52%)`, + opacity: 0.35, + }, + ]} + tickFormatter={(val) => `${val}%`} + contentFormatter={({ value }) => `${value}%`} + /> + + ))} + + )} + + ) +}) diff --git a/internal/site/src/components/routes/system/network-sheet.tsx b/internal/site/src/components/routes/system/network-sheet.tsx index 62eb16bc..9e86f348 100644 --- a/internal/site/src/components/routes/system/network-sheet.tsx +++ b/internal/site/src/components/routes/system/network-sheet.tsx @@ -53,7 +53,7 @@ export default memo(function NetworkSheet({ {hasOpened.current && ( - +