mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-21 21:26:16 +01:00
add nvtop integration and introduce GPU_COLLECTOR env var
This commit is contained in:
354
agent/gpu.go
354
agent/gpu.go
@@ -21,13 +21,10 @@ const (
|
||||
// Commands
|
||||
nvidiaSmiCmd string = "nvidia-smi"
|
||||
rocmSmiCmd string = "rocm-smi"
|
||||
amdgpuCmd string = "amdgpu" // internal cmd for sysfs collection
|
||||
tegraStatsCmd string = "tegrastats"
|
||||
nvtopCmd string = "nvtop"
|
||||
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
|
||||
|
||||
// Polling intervals
|
||||
nvidiaSmiInterval string = "4" // in seconds
|
||||
tegraStatsInterval string = "3700" // in milliseconds
|
||||
rocmSmiInterval time.Duration = 4300 * time.Millisecond
|
||||
// Command retry and timeout constants
|
||||
retryWaitTime time.Duration = 5 * time.Second
|
||||
maxFailureRetries int = 5
|
||||
@@ -40,12 +37,6 @@ const (
|
||||
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
||||
type GPUManager struct {
|
||||
sync.Mutex
|
||||
nvidiaSmi bool
|
||||
rocmSmi bool
|
||||
amdgpu bool
|
||||
tegrastats bool
|
||||
intelGpuStats bool
|
||||
nvml bool
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
// lastAvgData stores the last calculated averages for each GPU
|
||||
// Used when a collection happens before new data arrives (Count == 0)
|
||||
@@ -87,6 +78,51 @@ type gpuCollector struct {
|
||||
|
||||
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
||||
|
||||
// collectorSource identifies a selectable GPU collector in GPU_COLLECTOR.
|
||||
type collectorSource string
|
||||
|
||||
const (
|
||||
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
|
||||
collectorSourceNVML collectorSource = "nvml"
|
||||
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
|
||||
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
|
||||
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
|
||||
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
|
||||
collectorGroupNvidia string = "nvidia"
|
||||
collectorGroupIntel string = "intel"
|
||||
collectorGroupAmd string = "amd"
|
||||
)
|
||||
|
||||
func isValidCollectorSource(source collectorSource) bool {
|
||||
switch source {
|
||||
case collectorSourceNVTop,
|
||||
collectorSourceNVML,
|
||||
collectorSourceNvidiaSMI,
|
||||
collectorSourceIntelGpuTop,
|
||||
collectorSourceAmdSysfs,
|
||||
collectorSourceRocmSMI:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// gpuCapabilities describes detected GPU tooling and sysfs support on the host.
|
||||
type gpuCapabilities struct {
|
||||
hasNvidiaSmi bool
|
||||
hasRocmSmi bool
|
||||
hasAmdSysfs bool
|
||||
hasTegrastats bool
|
||||
hasIntelGpuTop bool
|
||||
hasNvtop bool
|
||||
}
|
||||
|
||||
type collectorDefinition struct {
|
||||
group string
|
||||
available bool
|
||||
start func(onFailure func()) bool
|
||||
deprecationWarning string
|
||||
}
|
||||
|
||||
// starts and manages the ongoing collection of GPU data for the specified GPU management utility
|
||||
func (c *gpuCollector) start() {
|
||||
for {
|
||||
@@ -392,44 +428,35 @@ func (gm *GPUManager) storeSnapshot(id string, gpu *system.GPUData, cacheKey uin
|
||||
gm.lastSnapshots[cacheKey][id] = snapshot
|
||||
}
|
||||
|
||||
// detectGPUs checks for the presence of GPU management tools (nvidia-smi, rocm-smi, tegrastats)
|
||||
// in the system path. It sets the corresponding flags in the GPUManager struct if any of these
|
||||
// tools are found. If none of the tools are found, it returns an error indicating that no GPU
|
||||
// management tools are available.
|
||||
func (gm *GPUManager) detectGPUs() error {
|
||||
// discoverGpuCapabilities checks for available GPU tooling and sysfs support.
|
||||
// It only reports capability presence and does not apply policy decisions.
|
||||
func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
|
||||
caps := gpuCapabilities{
|
||||
hasAmdSysfs: gm.hasAmdSysfs(),
|
||||
}
|
||||
if _, err := exec.LookPath(nvidiaSmiCmd); err == nil {
|
||||
gm.nvidiaSmi = true
|
||||
caps.hasNvidiaSmi = true
|
||||
}
|
||||
if _, err := exec.LookPath(rocmSmiCmd); err == nil {
|
||||
if val, _ := GetEnv("AMD_SYSFS"); val == "true" {
|
||||
gm.amdgpu = true
|
||||
} else {
|
||||
gm.rocmSmi = true
|
||||
}
|
||||
} else if gm.hasAmdSysfs() {
|
||||
gm.amdgpu = true
|
||||
caps.hasRocmSmi = true
|
||||
}
|
||||
if _, err := exec.LookPath(tegraStatsCmd); err == nil {
|
||||
gm.tegrastats = true
|
||||
gm.nvidiaSmi = false
|
||||
caps.hasTegrastats = true
|
||||
}
|
||||
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
|
||||
gm.intelGpuStats = true
|
||||
caps.hasIntelGpuTop = true
|
||||
}
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.amdgpu || gm.tegrastats || gm.intelGpuStats || gm.nvml {
|
||||
return nil
|
||||
if _, err := exec.LookPath(nvtopCmd); err == nil {
|
||||
caps.hasNvtop = true
|
||||
}
|
||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or intel_gpu_top")
|
||||
return caps
|
||||
}
|
||||
|
||||
// startCollector starts the appropriate GPU data collector based on the command
|
||||
func (gm *GPUManager) startCollector(command string) {
|
||||
collector := gpuCollector{
|
||||
name: command,
|
||||
bufSize: 10 * 1024,
|
||||
}
|
||||
switch command {
|
||||
case intelGpuStatsCmd:
|
||||
func hasAnyGpuCollector(caps gpuCapabilities) bool {
|
||||
return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop
|
||||
}
|
||||
|
||||
func (gm *GPUManager) startIntelCollector() {
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
@@ -444,27 +471,39 @@ func (gm *GPUManager) startCollector(command string) {
|
||||
}
|
||||
}
|
||||
}()
|
||||
case nvidiaSmiCmd:
|
||||
collector.cmdArgs = []string{
|
||||
"-l", nvidiaSmiInterval,
|
||||
}
|
||||
|
||||
func (gm *GPUManager) startNvidiaSmiCollector(intervalSeconds string) {
|
||||
collector := gpuCollector{
|
||||
name: nvidiaSmiCmd,
|
||||
bufSize: 10 * 1024,
|
||||
cmdArgs: []string{
|
||||
"-l", intervalSeconds,
|
||||
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
|
||||
"--format=csv,noheader,nounits",
|
||||
},
|
||||
parse: gm.parseNvidiaData,
|
||||
}
|
||||
collector.parse = gm.parseNvidiaData
|
||||
go collector.start()
|
||||
case tegraStatsCmd:
|
||||
collector.cmdArgs = []string{"--interval", tegraStatsInterval}
|
||||
collector.parse = gm.getJetsonParser()
|
||||
go collector.start()
|
||||
case amdgpuCmd:
|
||||
go func() {
|
||||
if err := gm.collectAmdStats(); err != nil {
|
||||
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
|
||||
}
|
||||
|
||||
func (gm *GPUManager) startTegraStatsCollector(intervalMilliseconds string) {
|
||||
collector := gpuCollector{
|
||||
name: tegraStatsCmd,
|
||||
bufSize: 10 * 1024,
|
||||
cmdArgs: []string{"--interval", intervalMilliseconds},
|
||||
parse: gm.getJetsonParser(),
|
||||
}
|
||||
go collector.start()
|
||||
}
|
||||
|
||||
func (gm *GPUManager) startRocmSmiCollector(pollInterval time.Duration) {
|
||||
collector := gpuCollector{
|
||||
name: rocmSmiCmd,
|
||||
bufSize: 10 * 1024,
|
||||
cmdArgs: []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"},
|
||||
parse: gm.parseAmdData,
|
||||
}
|
||||
}()
|
||||
case rocmSmiCmd:
|
||||
collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
|
||||
collector.parse = gm.parseAmdData
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
@@ -475,50 +514,203 @@ func (gm *GPUManager) startCollector(command string) {
|
||||
}
|
||||
slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
|
||||
}
|
||||
time.Sleep(rocmSmiInterval)
|
||||
time.Sleep(pollInterval)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSource]collectorDefinition {
|
||||
return map[collectorSource]collectorDefinition{
|
||||
collectorSourceNVML: {
|
||||
group: collectorGroupNvidia,
|
||||
available: caps.hasNvidiaSmi,
|
||||
start: func(_ func()) bool {
|
||||
return gm.startNvmlCollector()
|
||||
},
|
||||
},
|
||||
collectorSourceNvidiaSMI: {
|
||||
group: collectorGroupNvidia,
|
||||
available: caps.hasNvidiaSmi,
|
||||
start: func(_ func()) bool {
|
||||
gm.startNvidiaSmiCollector("4") // seconds
|
||||
return true
|
||||
},
|
||||
},
|
||||
collectorSourceIntelGpuTop: {
|
||||
group: collectorGroupIntel,
|
||||
available: caps.hasIntelGpuTop,
|
||||
start: func(_ func()) bool {
|
||||
gm.startIntelCollector()
|
||||
return true
|
||||
},
|
||||
},
|
||||
collectorSourceAmdSysfs: {
|
||||
group: collectorGroupAmd,
|
||||
available: caps.hasAmdSysfs,
|
||||
start: func(_ func()) bool {
|
||||
return gm.startAmdSysfsCollector()
|
||||
},
|
||||
},
|
||||
collectorSourceRocmSMI: {
|
||||
group: collectorGroupAmd,
|
||||
available: caps.hasRocmSmi,
|
||||
deprecationWarning: "rocm-smi is deprecated and may be removed in a future release",
|
||||
start: func(_ func()) bool {
|
||||
gm.startRocmSmiCollector(4300 * time.Millisecond)
|
||||
return true
|
||||
},
|
||||
},
|
||||
collectorSourceNVTop: {
|
||||
available: caps.hasNvtop,
|
||||
start: func(onFailure func()) bool {
|
||||
gm.startNvtopCollector("30", onFailure) // tens of milliseconds
|
||||
return true
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// parseCollectorPriority parses GPU_COLLECTOR and returns valid ordered entries.
|
||||
func parseCollectorPriority(value string) []collectorSource {
|
||||
parts := strings.Split(value, ",")
|
||||
priorities := make([]collectorSource, 0, len(parts))
|
||||
for _, raw := range parts {
|
||||
name := collectorSource(strings.TrimSpace(strings.ToLower(raw)))
|
||||
if !isValidCollectorSource(name) {
|
||||
if name != "" {
|
||||
slog.Warn("Ignoring unknown GPU collector", "collector", name)
|
||||
}
|
||||
continue
|
||||
}
|
||||
priorities = append(priorities, name)
|
||||
}
|
||||
return priorities
|
||||
}
|
||||
|
||||
// startNvmlCollector initializes NVML and starts its polling loop.
|
||||
func (gm *GPUManager) startNvmlCollector() bool {
|
||||
collector := &nvmlCollector{gm: gm}
|
||||
if err := collector.init(); err != nil {
|
||||
slog.Warn("Failed to initialize NVML", "err", err)
|
||||
return false
|
||||
}
|
||||
go collector.start()
|
||||
return true
|
||||
}
|
||||
|
||||
// startAmdSysfsCollector starts AMD GPU collection via sysfs.
|
||||
func (gm *GPUManager) startAmdSysfsCollector() bool {
|
||||
go func() {
|
||||
if err := gm.collectAmdStats(); err != nil {
|
||||
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
|
||||
}
|
||||
}()
|
||||
return true
|
||||
}
|
||||
|
||||
// startCollectorsByPriority starts collectors in order with one source per vendor group.
|
||||
func (gm *GPUManager) startCollectorsByPriority(priorities []collectorSource, caps gpuCapabilities) int {
|
||||
definitions := gm.collectorDefinitions(caps)
|
||||
selectedGroups := make(map[string]bool, 3)
|
||||
started := 0
|
||||
for i, source := range priorities {
|
||||
definition, ok := definitions[source]
|
||||
if !ok || !definition.available {
|
||||
continue
|
||||
}
|
||||
// nvtop is not a vendor-specific collector, so should only be used if no other collectors are selected or it is first in GPU_COLLECTOR.
|
||||
if source == collectorSourceNVTop {
|
||||
if len(selectedGroups) > 0 {
|
||||
slog.Warn("Skipping nvtop because other collectors are selected")
|
||||
continue
|
||||
}
|
||||
// if nvtop fails, fall back to remaining collectors.
|
||||
remaining := append([]collectorSource(nil), priorities[i+1:]...)
|
||||
if definition.start(func() {
|
||||
gm.startCollectorsByPriority(remaining, caps)
|
||||
}) {
|
||||
started++
|
||||
return started
|
||||
}
|
||||
}
|
||||
group := definition.group
|
||||
if group == "" || selectedGroups[group] {
|
||||
continue
|
||||
}
|
||||
if definition.deprecationWarning != "" {
|
||||
slog.Warn(definition.deprecationWarning)
|
||||
}
|
||||
if definition.start(nil) {
|
||||
selectedGroups[group] = true
|
||||
started++
|
||||
}
|
||||
}
|
||||
return started
|
||||
}
|
||||
|
||||
// resolveLegacyCollectorPriority builds the default collector order when GPU_COLLECTOR is unset.
|
||||
func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []collectorSource {
|
||||
priorities := make([]collectorSource, 0, 4)
|
||||
|
||||
if caps.hasNvidiaSmi && !caps.hasTegrastats {
|
||||
if nvml, _ := GetEnv("NVML"); nvml == "true" {
|
||||
priorities = append(priorities, collectorSourceNVML, collectorSourceNvidiaSMI)
|
||||
} else {
|
||||
priorities = append(priorities, collectorSourceNvidiaSMI)
|
||||
}
|
||||
}
|
||||
|
||||
if caps.hasRocmSmi {
|
||||
if val, _ := GetEnv("AMD_SYSFS"); val == "true" {
|
||||
priorities = append(priorities, collectorSourceAmdSysfs)
|
||||
} else {
|
||||
priorities = append(priorities, collectorSourceRocmSMI)
|
||||
}
|
||||
} else if caps.hasAmdSysfs {
|
||||
priorities = append(priorities, collectorSourceAmdSysfs)
|
||||
}
|
||||
|
||||
if caps.hasIntelGpuTop {
|
||||
priorities = append(priorities, collectorSourceIntelGpuTop)
|
||||
}
|
||||
|
||||
// Keep nvtop as a legacy last resort only when no vendor collector exists.
|
||||
if len(priorities) == 0 && caps.hasNvtop {
|
||||
priorities = append(priorities, collectorSourceNVTop)
|
||||
}
|
||||
return priorities
|
||||
}
|
||||
|
||||
// NewGPUManager creates and initializes a new GPUManager
|
||||
func NewGPUManager() (*GPUManager, error) {
|
||||
if skipGPU, _ := GetEnv("SKIP_GPU"); skipGPU == "true" {
|
||||
return nil, nil
|
||||
}
|
||||
var gm GPUManager
|
||||
if err := gm.detectGPUs(); err != nil {
|
||||
return nil, err
|
||||
caps := gm.discoverGpuCapabilities()
|
||||
if !hasAnyGpuCollector(caps) {
|
||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
gm.GpuDataMap = make(map[string]*system.GPUData)
|
||||
|
||||
if gm.nvidiaSmi {
|
||||
if nvml, _ := GetEnv("NVML"); nvml == "true" {
|
||||
gm.nvml = true
|
||||
gm.nvidiaSmi = false
|
||||
collector := &nvmlCollector{gm: &gm}
|
||||
if err := collector.init(); err == nil {
|
||||
go collector.start()
|
||||
} else {
|
||||
slog.Warn("Failed to initialize NVML, falling back to nvidia-smi", "err", err)
|
||||
gm.nvidiaSmi = true
|
||||
gm.startCollector(nvidiaSmiCmd)
|
||||
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
|
||||
if caps.hasTegrastats {
|
||||
gm.startTegraStatsCollector("3700")
|
||||
return &gm, nil
|
||||
}
|
||||
} else {
|
||||
gm.startCollector(nvidiaSmiCmd)
|
||||
|
||||
// if GPU_COLLECTOR is set, start user-defined collectors.
|
||||
if collectorConfig, ok := GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
|
||||
priorities := parseCollectorPriority(collectorConfig)
|
||||
if gm.startCollectorsByPriority(priorities, caps) == 0 {
|
||||
return nil, fmt.Errorf("no configured GPU collectors are available")
|
||||
}
|
||||
return &gm, nil
|
||||
}
|
||||
if gm.rocmSmi {
|
||||
gm.startCollector(rocmSmiCmd)
|
||||
}
|
||||
if gm.amdgpu {
|
||||
gm.startCollector(amdgpuCmd)
|
||||
}
|
||||
if gm.tegrastats {
|
||||
gm.startCollector(tegraStatsCmd)
|
||||
}
|
||||
if gm.intelGpuStats {
|
||||
gm.startCollector(intelGpuStatsCmd)
|
||||
|
||||
// auto-detect and start collectors when GPU_COLLECTOR is unset.
|
||||
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
|
||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
|
||||
return &gm, nil
|
||||
|
||||
@@ -32,6 +32,7 @@ func (gm *GPUManager) hasAmdSysfs() bool {
|
||||
|
||||
// collectAmdStats collects AMD GPU metrics directly from sysfs to avoid the overhead of rocm-smi
|
||||
func (gm *GPUManager) collectAmdStats() error {
|
||||
sysfsPollInterval := 3000 * time.Millisecond
|
||||
cards, err := filepath.Glob("/sys/class/drm/card*")
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -70,7 +71,7 @@ func (gm *GPUManager) collectAmdStats() error {
|
||||
continue
|
||||
}
|
||||
failures = 0
|
||||
time.Sleep(rocmSmiInterval)
|
||||
time.Sleep(sysfsPollInterval)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -13,21 +13,3 @@ func (c *nvmlCollector) init() error {
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) start() {}
|
||||
|
||||
func (c *nvmlCollector) collect() {}
|
||||
|
||||
func openLibrary(name string) (uintptr, error) {
|
||||
return 0, fmt.Errorf("nvml not supported on this platform")
|
||||
}
|
||||
|
||||
func getNVMLPath() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func hasSymbol(lib uintptr, symbol string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (c *nvmlCollector) isGPUActive(bdf string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
159
agent/gpu_nvtop.go
Normal file
159
agent/gpu_nvtop.go
Normal file
@@ -0,0 +1,159 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
)
|
||||
|
||||
type nvtopSnapshot struct {
|
||||
DeviceName string `json:"device_name"`
|
||||
Temp *string `json:"temp"`
|
||||
PowerDraw *string `json:"power_draw"`
|
||||
GpuUtil *string `json:"gpu_util"`
|
||||
MemTotal *string `json:"mem_total"`
|
||||
MemUsed *string `json:"mem_used"`
|
||||
}
|
||||
|
||||
// parseNvtopNumber parses nvtop numeric strings with units (C/W/%).
|
||||
func parseNvtopNumber(raw string) float64 {
|
||||
cleaned := strings.TrimSpace(raw)
|
||||
cleaned = strings.TrimSuffix(cleaned, "C")
|
||||
cleaned = strings.TrimSuffix(cleaned, "W")
|
||||
cleaned = strings.TrimSuffix(cleaned, "%")
|
||||
val, _ := strconv.ParseFloat(cleaned, 64)
|
||||
return val
|
||||
}
|
||||
|
||||
// parseNvtopData parses a single nvtop JSON snapshot payload.
|
||||
func (gm *GPUManager) parseNvtopData(output []byte) bool {
|
||||
var snapshots []nvtopSnapshot
|
||||
if err := json.Unmarshal(output, &snapshots); err != nil || len(snapshots) == 0 {
|
||||
return false
|
||||
}
|
||||
return gm.updateNvtopSnapshots(snapshots)
|
||||
}
|
||||
|
||||
// updateNvtopSnapshots applies one decoded nvtop snapshot batch to GPU accumulators.
|
||||
func (gm *GPUManager) updateNvtopSnapshots(snapshots []nvtopSnapshot) bool {
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
|
||||
valid := false
|
||||
usedIDs := make(map[string]struct{}, len(snapshots))
|
||||
for i, sample := range snapshots {
|
||||
if sample.DeviceName == "" {
|
||||
continue
|
||||
}
|
||||
indexID := "n" + strconv.Itoa(i)
|
||||
id := indexID
|
||||
|
||||
// nvtop ordering can change, so prefer reusing an existing slot with matching device name.
|
||||
if existingByIndex, ok := gm.GpuDataMap[indexID]; ok && existingByIndex.Name != "" && existingByIndex.Name != sample.DeviceName {
|
||||
for existingID, gpu := range gm.GpuDataMap {
|
||||
if !strings.HasPrefix(existingID, "n") {
|
||||
continue
|
||||
}
|
||||
if _, taken := usedIDs[existingID]; taken {
|
||||
continue
|
||||
}
|
||||
if gpu.Name == sample.DeviceName {
|
||||
id = existingID
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := gm.GpuDataMap[id]; !ok {
|
||||
gm.GpuDataMap[id] = &system.GPUData{Name: sample.DeviceName}
|
||||
}
|
||||
gpu := gm.GpuDataMap[id]
|
||||
gpu.Name = sample.DeviceName
|
||||
|
||||
if sample.Temp != nil {
|
||||
gpu.Temperature = parseNvtopNumber(*sample.Temp)
|
||||
}
|
||||
if sample.MemUsed != nil {
|
||||
gpu.MemoryUsed = bytesToMegabytes(parseNvtopNumber(*sample.MemUsed))
|
||||
}
|
||||
if sample.MemTotal != nil {
|
||||
gpu.MemoryTotal = bytesToMegabytes(parseNvtopNumber(*sample.MemTotal))
|
||||
}
|
||||
if sample.GpuUtil != nil {
|
||||
gpu.Usage += parseNvtopNumber(*sample.GpuUtil)
|
||||
}
|
||||
if sample.PowerDraw != nil {
|
||||
gpu.Power += parseNvtopNumber(*sample.PowerDraw)
|
||||
}
|
||||
gpu.Count++
|
||||
usedIDs[id] = struct{}{}
|
||||
valid = true
|
||||
}
|
||||
return valid
|
||||
}
|
||||
|
||||
// collectNvtopStats runs nvtop loop mode and continuously decodes JSON snapshots.
|
||||
func (gm *GPUManager) collectNvtopStats(interval string) error {
|
||||
cmd := exec.Command(nvtopCmd, "-lP", "-d", interval)
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() {
|
||||
_ = stdout.Close()
|
||||
if cmd.ProcessState == nil || !cmd.ProcessState.Exited() {
|
||||
_ = cmd.Process.Kill()
|
||||
}
|
||||
_ = cmd.Wait()
|
||||
}()
|
||||
|
||||
decoder := json.NewDecoder(stdout)
|
||||
foundValid := false
|
||||
for {
|
||||
var snapshots []nvtopSnapshot
|
||||
if err := decoder.Decode(&snapshots); err != nil {
|
||||
if err == io.EOF {
|
||||
if foundValid {
|
||||
return nil
|
||||
}
|
||||
return errNoValidData
|
||||
}
|
||||
return err
|
||||
}
|
||||
if gm.updateNvtopSnapshots(snapshots) {
|
||||
foundValid = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// startNvtopCollector starts nvtop collection with retry or fallback callback handling.
|
||||
func (gm *GPUManager) startNvtopCollector(interval string, onFailure func()) {
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := gm.collectNvtopStats(interval); err != nil {
|
||||
if onFailure != nil {
|
||||
slog.Warn("Error collecting GPU data via nvtop", "err", err)
|
||||
onFailure()
|
||||
return
|
||||
}
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting GPU data via nvtop", "err", err)
|
||||
time.Sleep(retryWaitTime)
|
||||
continue
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -250,6 +250,100 @@ func TestParseAmdData(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNvtopData(t *testing.T) {
|
||||
input, err := os.ReadFile("test-data/nvtop.json")
|
||||
require.NoError(t, err)
|
||||
|
||||
gm := &GPUManager{
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
valid := gm.parseNvtopData(input)
|
||||
require.True(t, valid)
|
||||
|
||||
g0, ok := gm.GpuDataMap["n0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", g0.Name)
|
||||
assert.Equal(t, 48.0, g0.Temperature)
|
||||
assert.Equal(t, 5.0, g0.Usage)
|
||||
assert.Equal(t, 13.0, g0.Power)
|
||||
assert.Equal(t, bytesToMegabytes(349372416), g0.MemoryUsed)
|
||||
assert.Equal(t, bytesToMegabytes(4294967296), g0.MemoryTotal)
|
||||
assert.Equal(t, 1.0, g0.Count)
|
||||
|
||||
g1, ok := gm.GpuDataMap["n1"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "AMD Radeon 680M", g1.Name)
|
||||
assert.Equal(t, 48.0, g1.Temperature)
|
||||
assert.Equal(t, 12.0, g1.Usage)
|
||||
assert.Equal(t, 9.0, g1.Power)
|
||||
assert.Equal(t, bytesToMegabytes(1213784064), g1.MemoryUsed)
|
||||
assert.Equal(t, bytesToMegabytes(16929173504), g1.MemoryTotal)
|
||||
assert.Equal(t, 1.0, g1.Count)
|
||||
}
|
||||
|
||||
func TestUpdateNvtopSnapshotsKeepsDeviceAssociationWhenOrderChanges(t *testing.T) {
|
||||
strPtr := func(s string) *string { return &s }
|
||||
|
||||
gm := &GPUManager{
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
|
||||
firstBatch := []nvtopSnapshot{
|
||||
{
|
||||
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||
GpuUtil: strPtr("20%"),
|
||||
PowerDraw: strPtr("10W"),
|
||||
},
|
||||
{
|
||||
DeviceName: "AMD Radeon 680M",
|
||||
GpuUtil: strPtr("30%"),
|
||||
PowerDraw: strPtr("20W"),
|
||||
},
|
||||
}
|
||||
secondBatchSwapped := []nvtopSnapshot{
|
||||
{
|
||||
DeviceName: "AMD Radeon 680M",
|
||||
GpuUtil: strPtr("40%"),
|
||||
PowerDraw: strPtr("25W"),
|
||||
},
|
||||
{
|
||||
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||
GpuUtil: strPtr("50%"),
|
||||
PowerDraw: strPtr("15W"),
|
||||
},
|
||||
}
|
||||
|
||||
require.True(t, gm.updateNvtopSnapshots(firstBatch))
|
||||
require.True(t, gm.updateNvtopSnapshots(secondBatchSwapped))
|
||||
|
||||
nvidia := gm.GpuDataMap["n0"]
|
||||
require.NotNil(t, nvidia)
|
||||
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", nvidia.Name)
|
||||
assert.Equal(t, 70.0, nvidia.Usage)
|
||||
assert.Equal(t, 25.0, nvidia.Power)
|
||||
assert.Equal(t, 2.0, nvidia.Count)
|
||||
|
||||
amd := gm.GpuDataMap["n1"]
|
||||
require.NotNil(t, amd)
|
||||
assert.Equal(t, "AMD Radeon 680M", amd.Name)
|
||||
assert.Equal(t, 70.0, amd.Usage)
|
||||
assert.Equal(t, 45.0, amd.Power)
|
||||
assert.Equal(t, 2.0, amd.Count)
|
||||
}
|
||||
|
||||
func TestParseCollectorPriority(t *testing.T) {
|
||||
got := parseCollectorPriority(" nvml, nvidia-smi, intel_gpu_top, amd_sysfs, nvtop, rocm-smi, bad ")
|
||||
want := []collectorSource{
|
||||
collectorSourceNVML,
|
||||
collectorSourceNvidiaSMI,
|
||||
collectorSourceIntelGpuTop,
|
||||
collectorSourceAmdSysfs,
|
||||
collectorSourceNVTop,
|
||||
collectorSourceRocmSMI,
|
||||
}
|
||||
assert.Equal(t, want, got)
|
||||
}
|
||||
|
||||
func TestParseJetsonData(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -987,36 +1081,35 @@ func TestCalculateGPUAverage(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestDetectGPUs(t *testing.T) {
|
||||
func TestGPUCapabilitiesAndLegacyPriority(t *testing.T) {
|
||||
// Save original PATH
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
// Set up temp dir with the commands
|
||||
tempDir := t.TempDir()
|
||||
os.Setenv("PATH", tempDir)
|
||||
hasAmdSysfs := (&GPUManager{}).hasAmdSysfs()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
setupCommands func() error
|
||||
setupCommands func(string) error
|
||||
wantNvidiaSmi bool
|
||||
wantRocmSmi bool
|
||||
wantTegrastats bool
|
||||
wantNvtop bool
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "nvidia-smi not available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(_ string) error {
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: false,
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "nvidia-smi available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "nvidia-smi")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
@@ -1028,29 +1121,14 @@ echo "test"`
|
||||
wantNvidiaSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantRocmSmi: false,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "rocm-smi available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "rocm-smi")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: true,
|
||||
wantRocmSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "tegrastats available",
|
||||
setupCommands: func() error {
|
||||
path := filepath.Join(tempDir, "tegrastats")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
@@ -1059,12 +1137,47 @@ echo "test"`
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: true,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "tegrastats available",
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "tegrastats")
|
||||
script := `#!/bin/sh
|
||||
echo "test"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: true,
|
||||
wantNvtop: false,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "nvtop available",
|
||||
setupCommands: func(tempDir string) error {
|
||||
path := filepath.Join(tempDir, "nvtop")
|
||||
script := `#!/bin/sh
|
||||
echo "[]"`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
wantNvidiaSmi: false,
|
||||
wantRocmSmi: false,
|
||||
wantTegrastats: false,
|
||||
wantNvtop: true,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "no gpu tools available",
|
||||
setupCommands: func() error {
|
||||
setupCommands: func(_ string) error {
|
||||
os.Setenv("PATH", "")
|
||||
return nil
|
||||
},
|
||||
@@ -1074,29 +1187,53 @@ echo "test"`
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if err := tt.setupCommands(); err != nil {
|
||||
tempDir := t.TempDir()
|
||||
os.Setenv("PATH", tempDir)
|
||||
if err := tt.setupCommands(tempDir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
gm := &GPUManager{}
|
||||
err := gm.detectGPUs()
|
||||
caps := gm.discoverGpuCapabilities()
|
||||
var err error
|
||||
if !hasAnyGpuCollector(caps) {
|
||||
err = fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
priorities := gm.resolveLegacyCollectorPriority(caps)
|
||||
hasPriority := func(source collectorSource) bool {
|
||||
for _, s := range priorities {
|
||||
if s == source {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
gotNvidiaSmi := hasPriority(collectorSourceNvidiaSMI)
|
||||
gotRocmSmi := hasPriority(collectorSourceRocmSMI)
|
||||
gotTegrastats := caps.hasTegrastats
|
||||
gotNvtop := caps.hasNvtop
|
||||
|
||||
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats)
|
||||
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gotNvidiaSmi, gotRocmSmi, gotTegrastats)
|
||||
|
||||
if tt.wantErr {
|
||||
wantErr := tt.wantErr
|
||||
if hasAmdSysfs && (tt.name == "nvidia-smi not available" || tt.name == "no gpu tools available") {
|
||||
wantErr = false
|
||||
}
|
||||
if wantErr {
|
||||
assert.Error(t, err)
|
||||
return
|
||||
}
|
||||
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi)
|
||||
assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi)
|
||||
assert.Equal(t, tt.wantTegrastats, gm.tegrastats)
|
||||
assert.Equal(t, tt.wantNvidiaSmi, gotNvidiaSmi)
|
||||
assert.Equal(t, tt.wantRocmSmi, gotRocmSmi)
|
||||
assert.Equal(t, tt.wantTegrastats, gotTegrastats)
|
||||
assert.Equal(t, tt.wantNvtop, gotNvtop)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStartCollector(t *testing.T) {
|
||||
func TestCollectorStartHelpers(t *testing.T) {
|
||||
// Save original PATH
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
@@ -1181,6 +1318,27 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "nvtop collector",
|
||||
command: "nvtop",
|
||||
setup: func(t *testing.T) error {
|
||||
path := filepath.Join(dir, "nvtop")
|
||||
script := `#!/bin/sh
|
||||
echo '[{"device_name":"NVIDIA Test GPU","temp":"52C","power_draw":"31W","gpu_util":"37%","mem_total":"4294967296","mem_used":"536870912","processes":[]}]'`
|
||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
},
|
||||
validate: func(t *testing.T, gm *GPUManager) {
|
||||
gpu, exists := gm.GpuDataMap["n0"]
|
||||
assert.True(t, exists)
|
||||
if exists {
|
||||
assert.Equal(t, "NVIDIA Test GPU", gpu.Name)
|
||||
assert.Equal(t, 52.0, gpu.Temperature)
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -1193,13 +1351,157 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
|
||||
GpuDataMap: make(map[string]*system.GPUData),
|
||||
}
|
||||
}
|
||||
tt.gm.startCollector(tt.command)
|
||||
switch tt.command {
|
||||
case nvidiaSmiCmd:
|
||||
tt.gm.startNvidiaSmiCollector("4")
|
||||
case rocmSmiCmd:
|
||||
tt.gm.startRocmSmiCollector(4300 * time.Millisecond)
|
||||
case tegraStatsCmd:
|
||||
tt.gm.startTegraStatsCollector("3700")
|
||||
case nvtopCmd:
|
||||
tt.gm.startNvtopCollector("30", nil)
|
||||
default:
|
||||
t.Fatalf("unknown test command %q", tt.command)
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond) // Give collector time to run
|
||||
tt.validate(t, tt.gm)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityNvtopFallback(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvtop,nvidia-smi")
|
||||
|
||||
nvtopPath := filepath.Join(dir, "nvtop")
|
||||
nvtopScript := `#!/bin/sh
|
||||
echo 'not-json'`
|
||||
require.NoError(t, os.WriteFile(nvtopPath, []byte(nvtopScript), 0755))
|
||||
|
||||
nvidiaPath := filepath.Join(dir, "nvidia-smi")
|
||||
nvidiaScript := `#!/bin/sh
|
||||
echo "0, NVIDIA Priority GPU, 45, 512, 2048, 12, 25"`
|
||||
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "Priority GPU", gpu.Name)
|
||||
assert.Equal(t, 45.0, gpu.Temperature)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityMixedCollectors(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "intel_gpu_top,rocm-smi")
|
||||
|
||||
intelPath := filepath.Join(dir, "intel_gpu_top")
|
||||
intelScript := `#!/bin/sh
|
||||
echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS VCS"
|
||||
echo " req act /s % gpu pkg rd wr % se wa % se wa"
|
||||
echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0"
|
||||
echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0"
|
||||
`
|
||||
require.NoError(t, os.WriteFile(intelPath, []byte(intelScript), 0755))
|
||||
|
||||
rocmPath := filepath.Join(dir, "rocm-smi")
|
||||
rocmScript := `#!/bin/sh
|
||||
echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "GUID": "34756"}}'
|
||||
`
|
||||
require.NoError(t, os.WriteFile(rocmPath, []byte(rocmScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
_, intelOk := gm.GpuDataMap["i0"]
|
||||
_, amdOk := gm.GpuDataMap["34756"]
|
||||
assert.True(t, intelOk)
|
||||
assert.True(t, amdOk)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml,nvidia-smi")
|
||||
|
||||
nvidiaPath := filepath.Join(dir, "nvidia-smi")
|
||||
nvidiaScript := `#!/bin/sh
|
||||
echo "0, NVIDIA Fallback GPU, 41, 256, 1024, 8, 14"`
|
||||
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "Fallback GPU", gpu.Name)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
|
||||
t.Run("configured valid collector unavailable", func(t *testing.T) {
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
|
||||
gm, err := NewGPUManager()
|
||||
require.Nil(t, gm)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||
})
|
||||
|
||||
t.Run("configured collector list has only unknown entries", func(t *testing.T) {
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "bad,unknown")
|
||||
gm, err := NewGPUManager()
|
||||
require.Nil(t, gm)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||
})
|
||||
}
|
||||
|
||||
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
|
||||
origPath := os.Getenv("PATH")
|
||||
defer os.Setenv("PATH", origPath)
|
||||
|
||||
dir := t.TempDir()
|
||||
os.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
|
||||
|
||||
tegraPath := filepath.Join(dir, "tegrastats")
|
||||
tegraScript := `#!/bin/sh
|
||||
echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"`
|
||||
require.NoError(t, os.WriteFile(tegraPath, []byte(tegraScript), 0755))
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, gm)
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
gpu, ok := gm.GpuDataMap["0"]
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "GPU", gpu.Name)
|
||||
}
|
||||
|
||||
// TestAccumulationTableDriven tests the accumulation behavior for all three GPU types
|
||||
func TestAccumulation(t *testing.T) {
|
||||
type expectedGPUValues struct {
|
||||
|
||||
34
agent/test-data/nvtop.json
Normal file
34
agent/test-data/nvtop.json
Normal file
@@ -0,0 +1,34 @@
|
||||
[
|
||||
{
|
||||
"device_name": "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||
"gpu_clock": "1485MHz",
|
||||
"mem_clock": "6001MHz",
|
||||
"temp": "48C",
|
||||
"fan_speed": null,
|
||||
"power_draw": "13W",
|
||||
"gpu_util": "5%",
|
||||
"encode": "0%",
|
||||
"decode": "0%",
|
||||
"mem_util": "8%",
|
||||
"mem_total": "4294967296",
|
||||
"mem_used": "349372416",
|
||||
"mem_free": "3945594880",
|
||||
"processes" : []
|
||||
},
|
||||
{
|
||||
"device_name": "AMD Radeon 680M",
|
||||
"gpu_clock": "2200MHz",
|
||||
"mem_clock": "2400MHz",
|
||||
"temp": "48C",
|
||||
"fan_speed": "CPU Fan",
|
||||
"power_draw": "9W",
|
||||
"gpu_util": "12%",
|
||||
"encode": null,
|
||||
"decode": "0%",
|
||||
"mem_util": "7%",
|
||||
"mem_total": "16929173504",
|
||||
"mem_used": "1213784064",
|
||||
"mem_free": "15715389440",
|
||||
"processes" : []
|
||||
}
|
||||
]
|
||||
Reference in New Issue
Block a user