mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-22 05:36:15 +01:00
intel_gpu_top testing
This commit is contained in:
54
agent/gpu.go
54
agent/gpu.go
@@ -27,13 +27,10 @@ const (
|
||||
nvidiaSmiInterval string = "4" // in seconds
|
||||
tegraStatsInterval string = "3700" // in milliseconds
|
||||
rocmSmiInterval time.Duration = 4300 * time.Millisecond
|
||||
|
||||
// Command retry and timeout constants
|
||||
retryWaitTime time.Duration = 5 * time.Second
|
||||
maxFailureRetries int = 5
|
||||
|
||||
cmdBufferSize uint16 = 10 * 1024
|
||||
|
||||
// Unit Conversions
|
||||
mebibytesInAMegabyte float64 = 1.024 // nvidia-smi reports memory in MiB
|
||||
milliwattsInAWatt float64 = 1000.0 // tegrastats reports power in mW
|
||||
@@ -42,10 +39,11 @@ const (
|
||||
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
||||
type GPUManager struct {
|
||||
sync.Mutex
|
||||
nvidiaSmi bool
|
||||
rocmSmi bool
|
||||
tegrastats bool
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
nvidiaSmi bool
|
||||
rocmSmi bool
|
||||
tegrastats bool
|
||||
intelGpuStats bool
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
}
|
||||
|
||||
// RocmSmiJson represents the JSON structure of rocm-smi output
|
||||
@@ -66,6 +64,7 @@ type gpuCollector struct {
|
||||
cmdArgs []string
|
||||
parse func([]byte) bool // returns true if valid data was found
|
||||
buf []byte
|
||||
bufSize uint16
|
||||
}
|
||||
|
||||
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
||||
@@ -99,7 +98,7 @@ func (c *gpuCollector) collect() error {
|
||||
|
||||
scanner := bufio.NewScanner(stdout)
|
||||
if c.buf == nil {
|
||||
c.buf = make([]byte, 0, cmdBufferSize)
|
||||
c.buf = make([]byte, 0, c.bufSize)
|
||||
}
|
||||
scanner.Buffer(c.buf, bufio.MaxScanTokenSize)
|
||||
|
||||
@@ -244,20 +243,24 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
|
||||
// copy / reset the data
|
||||
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
|
||||
for id, gpu := range gm.GpuDataMap {
|
||||
gpuAvg := *gpu
|
||||
// avoid division by zero
|
||||
count := max(gpu.Count, 1)
|
||||
|
||||
// average the data
|
||||
gpuAvg := *gpu
|
||||
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
|
||||
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
|
||||
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
|
||||
|
||||
// avoid division by zero
|
||||
if gpu.Count > 0 {
|
||||
gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count)
|
||||
gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count)
|
||||
gpuAvg.Usage = twoDecimals(gpu.Usage / count)
|
||||
gpuAvg.Power = twoDecimals(gpu.Power / count)
|
||||
gpuAvg.Engines = make(map[string]float64, len(gpu.Engines))
|
||||
for name, engine := range gpu.Engines {
|
||||
gpuAvg.Engines[name] = twoDecimals(engine / count)
|
||||
}
|
||||
|
||||
// reset accumulators in the original
|
||||
gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0
|
||||
// reset accumulators in the original gpu data for next collection
|
||||
gpu.Usage, gpu.Power, gpu.Count = gpuAvg.Usage, gpuAvg.Power, 1
|
||||
gpu.Engines = gpuAvg.Engines
|
||||
|
||||
// append id to the name if there are multiple GPUs with the same name
|
||||
if nameCounts[gpu.Name] > 1 {
|
||||
@@ -284,18 +287,28 @@ func (gm *GPUManager) detectGPUs() error {
|
||||
gm.tegrastats = true
|
||||
gm.nvidiaSmi = false
|
||||
}
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
|
||||
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
|
||||
slog.Info("Intel GPU stats found")
|
||||
gm.intelGpuStats = true
|
||||
}
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
|
||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top")
|
||||
}
|
||||
|
||||
// startCollector starts the appropriate GPU data collector based on the command
|
||||
func (gm *GPUManager) startCollector(command string) {
|
||||
collector := gpuCollector{
|
||||
name: command,
|
||||
name: command,
|
||||
bufSize: 10 * 1024,
|
||||
}
|
||||
switch command {
|
||||
case intelGpuStatsCmd:
|
||||
slog.Info("Starting Intel GPU stats collector")
|
||||
collector.cmdArgs = []string{"-s", intelGpuStatsInterval, "-J"}
|
||||
collector.parse = gm.parseIntelData
|
||||
go collector.start()
|
||||
case nvidiaSmiCmd:
|
||||
collector.cmdArgs = []string{
|
||||
"-l", nvidiaSmiInterval,
|
||||
@@ -344,6 +357,9 @@ func NewGPUManager() (*GPUManager, error) {
|
||||
if gm.tegrastats {
|
||||
gm.startCollector(tegraStatsCmd)
|
||||
}
|
||||
if gm.intelGpuStats {
|
||||
gm.startCollector(intelGpuStatsCmd)
|
||||
}
|
||||
|
||||
return &gm, nil
|
||||
}
|
||||
|
||||
53
agent/gpu_intel.go
Normal file
53
agent/gpu_intel.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
)
|
||||
|
||||
// Settings for the intel_gpu_top based collector.
const (
	// intelGpuStatsCmd is the name of the executable used to sample Intel GPUs.
	intelGpuStatsCmd string = "intel_gpu_top"
	// intelGpuStatsInterval is the sampling period handed to the command,
	// expressed in milliseconds.
	intelGpuStatsInterval string = "3800"
)
|
||||
|
||||
// intelPowerStats mirrors the "power" object of an intel_gpu_top JSON sample.
type intelPowerStats struct {
	// GPU is the power reading for the GPU itself.
	// NOTE(review): presumably watts — confirm against intel_gpu_top output.
	GPU float64 `json:"gpu"`
}

// intelEngineStats mirrors a single entry of the "engines" object of an
// intel_gpu_top JSON sample.
type intelEngineStats struct {
	// Busy is the engine's busy value for the sample.
	// NOTE(review): presumably a percentage — confirm against intel_gpu_top output.
	Busy float64 `json:"busy"`
}

// intelGpuStats is the subset of an intel_gpu_top JSON sample that the agent
// decodes: GPU power and the busy value of every engine, keyed by engine name.
// Any other keys in the sample are ignored by the decoder.
type intelGpuStats struct {
	Power   intelPowerStats             `json:"power"`
	Engines map[string]intelEngineStats `json:"engines"`
}
|
||||
|
||||
func (gm *GPUManager) parseIntelData(output []byte) bool {
|
||||
slog.Info("Parsing Intel GPU stats")
|
||||
var intelGpuStats intelGpuStats
|
||||
if err := json.Unmarshal(output, &intelGpuStats); err != nil {
|
||||
slog.Error("Error parsing Intel GPU stats", "err", err)
|
||||
return false
|
||||
}
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
|
||||
// only one gpu for now - cmd doesn't provide all by default
|
||||
gpuData, ok := gm.GpuDataMap["0"]
|
||||
if !ok {
|
||||
gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64, len(intelGpuStats.Engines))}
|
||||
gm.GpuDataMap["0"] = gpuData
|
||||
}
|
||||
|
||||
if intelGpuStats.Power.GPU > 0 {
|
||||
gpuData.Power += intelGpuStats.Power.GPU
|
||||
}
|
||||
|
||||
for name, engine := range intelGpuStats.Engines {
|
||||
gpuData.Engines[name] += engine.Busy
|
||||
}
|
||||
|
||||
gpuData.Count++
|
||||
|
||||
slog.Info("GPU Data", "gpuData", gpuData)
|
||||
return true
|
||||
}
|
||||
@@ -45,13 +45,14 @@ type Stats struct {
|
||||
}
|
||||
|
||||
type GPUData struct {
|
||||
Name string `json:"n" cbor:"0,keyasint"`
|
||||
Temperature float64 `json:"-"`
|
||||
MemoryUsed float64 `json:"mu,omitempty" cbor:"1,keyasint,omitempty"`
|
||||
MemoryTotal float64 `json:"mt,omitempty" cbor:"2,keyasint,omitempty"`
|
||||
Usage float64 `json:"u" cbor:"3,keyasint"`
|
||||
Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"`
|
||||
Count float64 `json:"-"`
|
||||
Name string `json:"n" cbor:"0,keyasint"`
|
||||
Temperature float64 `json:"-"`
|
||||
MemoryUsed float64 `json:"mu,omitempty" cbor:"1,keyasint,omitempty"`
|
||||
MemoryTotal float64 `json:"mt,omitempty" cbor:"2,keyasint,omitempty"`
|
||||
Usage float64 `json:"u" cbor:"3,keyasint"`
|
||||
Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"`
|
||||
Count float64 `json:"-"`
|
||||
Engines map[string]float64 `json:"e,omitempty" cbor:"5,keyasint,omitempty"`
|
||||
}
|
||||
|
||||
type FsStats struct {
|
||||
|
||||
@@ -115,7 +115,7 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) {
|
||||
data: (index = 3) => {
|
||||
return sortedKeys.map((key) => ({
|
||||
label: key,
|
||||
dataKey: (stats: SystemStatsRecord) => stats.stats?.ni?.[key]?.[index],
|
||||
dataKey: ({ stats }: SystemStatsRecord) => stats?.ni?.[key]?.[index],
|
||||
color: `hsl(${220 + (((sortedKeys.indexOf(key) * 360) / sortedKeys.length) % 360)}, 70%, 50%)`,
|
||||
|
||||
opacity: 0.3,
|
||||
@@ -123,3 +123,15 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) {
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Generates chart configurations for GPU engines.
 *
 * Engine names are taken from the `e` map of GPU `0` in the given record and
 * sorted alphabetically. Each engine gets a label, an accessor that reads its
 * value from a stats record (falling back to 0 when absent), and a hue that is
 * spread evenly over the color wheel starting at 220.
 */
export function useGpuEngines(systemStats?: SystemStatsRecord) {
	const engineNames = Object.keys(systemStats?.stats.g?.[0]?.e ?? {}).sort()
	const total = engineNames.length
	return engineNames.map((engine, position) => {
		// names are unique object keys, so the callback index equals the sorted position
		const hue = 220 + (((position * 360) / total) % 360)
		return {
			label: engine,
			dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engine] ?? 0,
			color: `hsl(${hue}, 65%, 52%)`,
			opacity: 0.35,
		}
	})
}
|
||||
|
||||
@@ -18,7 +18,7 @@ import AreaChartDefault from "@/components/charts/area-chart"
|
||||
import ContainerChart from "@/components/charts/container-chart"
|
||||
import DiskChart from "@/components/charts/disk-chart"
|
||||
import GpuPowerChart from "@/components/charts/gpu-power-chart"
|
||||
import { useContainerChartConfigs } from "@/components/charts/hooks"
|
||||
import { useContainerChartConfigs, useGpuEngines } from "@/components/charts/hooks"
|
||||
import LoadAverageChart from "@/components/charts/load-average-chart"
|
||||
import MemChart from "@/components/charts/mem-chart"
|
||||
import SwapChart from "@/components/charts/swap-chart"
|
||||
@@ -61,6 +61,7 @@ import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from ".
|
||||
import { Separator } from "../ui/separator"
|
||||
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "../ui/tooltip"
|
||||
import NetworkSheet from "./system/network-sheet"
|
||||
import LineChartDefault from "../charts/line-chart"
|
||||
|
||||
type ChartTimeData = {
|
||||
time: number
|
||||
@@ -398,6 +399,7 @@ export default memo(function SystemDetail({ name }: { name: string }) {
|
||||
const lastGpuVals = Object.values(systemStats.at(-1)?.stats.g ?? {})
|
||||
const hasGpuData = lastGpuVals.length > 0
|
||||
const hasGpuPowerData = lastGpuVals.some((gpu) => gpu.p !== undefined)
|
||||
const hasGpuEnginesData = lastGpuVals.some((gpu) => gpu.e !== undefined)
|
||||
|
||||
let translatedStatus: string = system.status
|
||||
if (system.status === SystemStatus.Up) {
|
||||
@@ -770,6 +772,17 @@ export default memo(function SystemDetail({ name }: { name: string }) {
|
||||
<GpuPowerChart chartData={chartData} />
|
||||
</ChartCard>
|
||||
)}
|
||||
|
||||
{hasGpuEnginesData && (
|
||||
<ChartCard
|
||||
empty={dataEmpty}
|
||||
grid={grid}
|
||||
title={t`GPU Engines`}
|
||||
description={t`Average utilization of GPU engines`}
|
||||
>
|
||||
<GpuEnginesChart chartData={chartData} />
|
||||
</ChartCard>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* GPU charts */}
|
||||
@@ -897,6 +910,13 @@ export default memo(function SystemDetail({ name }: { name: string }) {
|
||||
)
|
||||
})
|
||||
|
||||
/**
 * Line chart of GPU engine values over time.
 * The set of series is derived from the engines present in the most recent stats record;
 * tick and tooltip values are rendered with a trailing percent sign.
 */
function GpuEnginesChart({ chartData }: { chartData: ChartData }) {
	const latestRecord = chartData.systemStats.at(-1)
	const dataPoints = useGpuEngines(latestRecord)
	return (
		<LineChartDefault
			legend={true}
			chartData={chartData}
			dataPoints={dataPoints}
			tickFormatter={(val) => `${toFixedFloat(val, 2)}%`}
			contentFormatter={({ value }) => `${decimalString(value)}%`}
		/>
	)
}
|
||||
|
||||
function FilterBar({ store = $containerFilter }: { store?: typeof $containerFilter }) {
|
||||
const containerFilter = useStore(store)
|
||||
const { t } = useLingui()
|
||||
|
||||
2
internal/site/src/types.d.ts
vendored
2
internal/site/src/types.d.ts
vendored
@@ -158,6 +158,8 @@ export interface GPUData {
|
||||
u: number
|
||||
/** power (w) */
|
||||
p?: number
|
||||
/** engines */
|
||||
e?: Record<string, number>
|
||||
}
|
||||
|
||||
export interface ExtraFsStats {
|
||||
|
||||
Reference in New Issue
Block a user