From aaa788bc2f8391c660979e4d91714e1e249dc4b8 Mon Sep 17 00:00:00 2001 From: henrygd Date: Tue, 11 Nov 2025 12:38:47 -0500 Subject: [PATCH] add gpu usage alerts --- internal/alerts/alerts.go | 19 +++++++++------ internal/alerts/alerts_system.go | 13 ++++++++++ .../0_collections_snapshot_0_16_0.go | 1 + internal/site/src/lib/alerts.ts | 8 ++++++- internal/site/src/types.d.ts | 24 +++++++++---------- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index e25e830f..cee4258b 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -40,13 +40,18 @@ type UserNotificationSettings struct { } type SystemAlertStats struct { - Cpu float64 `json:"cpu"` - Mem float64 `json:"mp"` - Disk float64 `json:"dp"` - NetSent float64 `json:"ns"` - NetRecv float64 `json:"nr"` - Temperatures map[string]float32 `json:"t"` - LoadAvg [3]float64 `json:"la"` + Cpu float64 `json:"cpu"` + Mem float64 `json:"mp"` + Disk float64 `json:"dp"` + NetSent float64 `json:"ns"` + NetRecv float64 `json:"nr"` + GPU map[string]SystemAlertGPUData `json:"g"` + Temperatures map[string]float32 `json:"t"` + LoadAvg [3]float64 `json:"la"` +} + +type SystemAlertGPUData struct { + Usage float64 `json:"u"` } type SystemAlertData struct { diff --git a/internal/alerts/alerts_system.go b/internal/alerts/alerts_system.go index 4ba2f940..cc506d21 100644 --- a/internal/alerts/alerts_system.go +++ b/internal/alerts/alerts_system.go @@ -64,6 +64,8 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst case "LoadAvg15": val = data.Info.LoadAvg[2] unit = "" + case "GPU": + val = data.Info.GpuPct } triggered := alertRecord.GetBool("triggered") @@ -206,6 +208,17 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *syst alert.val += stats.LoadAvg[1] case "LoadAvg15": alert.val += stats.LoadAvg[2] + case "GPU": + if len(stats.GPU) == 0 { + continue + } + maxUsage := 0.0 + for _, gpu := range stats.GPU { + if gpu.Usage > maxUsage { + maxUsage = gpu.Usage + } + } + alert.val += maxUsage default: continue } diff --git a/internal/migrations/0_collections_snapshot_0_16_0.go b/internal/migrations/0_collections_snapshot_0_16_0.go index a90a011a..56a07c50 100644 --- a/internal/migrations/0_collections_snapshot_0_16_0.go +++ b/internal/migrations/0_collections_snapshot_0_16_0.go @@ -75,6 +75,7 @@ func init() { "Disk", "Temperature", "Bandwidth", + "GPU", "LoadAvg1", "LoadAvg5", "LoadAvg15" diff --git a/internal/site/src/lib/alerts.ts b/internal/site/src/lib/alerts.ts index c5f93be5..79df5b46 100644 --- a/internal/site/src/lib/alerts.ts +++ b/internal/site/src/lib/alerts.ts @@ -1,7 +1,7 @@ import { t } from "@lingui/core/macro" import { CpuIcon, HardDriveIcon, HourglassIcon, MemoryStickIcon, ServerIcon, ThermometerIcon } from "lucide-react" import type { RecordSubscription } from "pocketbase" -import { EthernetIcon } from "@/components/ui/icons" +import { EthernetIcon, GpuIcon } from "@/components/ui/icons" import { $alerts } from "@/lib/stores" import type { AlertInfo, AlertRecord } from "@/types" import { pb } from "./api" @@ -41,6 +41,12 @@ export const alertInfo: Record = { desc: () => t`Triggers when combined up/down exceeds a threshold`, max: 125, }, + GPU: { + name: () => t`GPU Usage`, + unit: "%", + icon: GpuIcon, + desc: () => t`Triggers when GPU usage exceeds a threshold`, + }, Temperature: { name: () => t`Temperature`, unit: "°C", diff --git a/internal/site/src/types.d.ts b/internal/site/src/types.d.ts index ddeb9b36..75b2c51c 100644 --- a/internal/site/src/types.d.ts +++ b/internal/site/src/types.d.ts @@ -301,18 +301,18 @@ export interface ChartData { chartTime: ChartTimes } -// interface AlertInfo { -// name: () => string -// unit: string -// icon: any -// desc: () => string -// max?: number -// min?: number -// step?: number -// start?: number -// /** Single value description (when there's only one value, like status) */ -// singleDesc?: () => string -// } +export interface AlertInfo { + name: () => string + unit: string + icon: any + desc: () => string + max?: number + min?: number + step?: number + start?: number + /** Single value description (when there's only one value, like status) */ + singleDesc?: () => string +} export type AlertMap = Record>