mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-21 21:26:16 +01:00
add nvtop integration and introduce GPU_COLLECTOR env var
This commit is contained in:
412
agent/gpu.go
412
agent/gpu.go
@@ -21,13 +21,10 @@ const (
|
|||||||
// Commands
|
// Commands
|
||||||
nvidiaSmiCmd string = "nvidia-smi"
|
nvidiaSmiCmd string = "nvidia-smi"
|
||||||
rocmSmiCmd string = "rocm-smi"
|
rocmSmiCmd string = "rocm-smi"
|
||||||
amdgpuCmd string = "amdgpu" // internal cmd for sysfs collection
|
|
||||||
tegraStatsCmd string = "tegrastats"
|
tegraStatsCmd string = "tegrastats"
|
||||||
|
nvtopCmd string = "nvtop"
|
||||||
|
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
|
||||||
|
|
||||||
// Polling intervals
|
|
||||||
nvidiaSmiInterval string = "4" // in seconds
|
|
||||||
tegraStatsInterval string = "3700" // in milliseconds
|
|
||||||
rocmSmiInterval time.Duration = 4300 * time.Millisecond
|
|
||||||
// Command retry and timeout constants
|
// Command retry and timeout constants
|
||||||
retryWaitTime time.Duration = 5 * time.Second
|
retryWaitTime time.Duration = 5 * time.Second
|
||||||
maxFailureRetries int = 5
|
maxFailureRetries int = 5
|
||||||
@@ -40,13 +37,7 @@ const (
|
|||||||
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
||||||
type GPUManager struct {
|
type GPUManager struct {
|
||||||
sync.Mutex
|
sync.Mutex
|
||||||
nvidiaSmi bool
|
GpuDataMap map[string]*system.GPUData
|
||||||
rocmSmi bool
|
|
||||||
amdgpu bool
|
|
||||||
tegrastats bool
|
|
||||||
intelGpuStats bool
|
|
||||||
nvml bool
|
|
||||||
GpuDataMap map[string]*system.GPUData
|
|
||||||
// lastAvgData stores the last calculated averages for each GPU
|
// lastAvgData stores the last calculated averages for each GPU
|
||||||
// Used when a collection happens before new data arrives (Count == 0)
|
// Used when a collection happens before new data arrives (Count == 0)
|
||||||
lastAvgData map[string]system.GPUData
|
lastAvgData map[string]system.GPUData
|
||||||
@@ -87,6 +78,51 @@ type gpuCollector struct {
|
|||||||
|
|
||||||
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
||||||
|
|
||||||
|
// collectorSource identifies a selectable GPU collector in GPU_COLLECTOR.
|
||||||
|
type collectorSource string
|
||||||
|
|
||||||
|
const (
|
||||||
|
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
|
||||||
|
collectorSourceNVML collectorSource = "nvml"
|
||||||
|
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
|
||||||
|
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
|
||||||
|
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
|
||||||
|
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
|
||||||
|
collectorGroupNvidia string = "nvidia"
|
||||||
|
collectorGroupIntel string = "intel"
|
||||||
|
collectorGroupAmd string = "amd"
|
||||||
|
)
|
||||||
|
|
||||||
|
func isValidCollectorSource(source collectorSource) bool {
|
||||||
|
switch source {
|
||||||
|
case collectorSourceNVTop,
|
||||||
|
collectorSourceNVML,
|
||||||
|
collectorSourceNvidiaSMI,
|
||||||
|
collectorSourceIntelGpuTop,
|
||||||
|
collectorSourceAmdSysfs,
|
||||||
|
collectorSourceRocmSMI:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// gpuCapabilities records which GPU tools and sysfs interfaces were detected
// on the host. It carries detection results only; which collector actually
// runs is decided elsewhere.
type gpuCapabilities struct {
	// NVIDIA / AMD command-line tools.
	hasNvidiaSmi, hasRocmSmi bool
	// AMD metrics exposed through sysfs.
	hasAmdSysfs bool
	// Jetson, Intel, and nvtop command-line tools.
	hasTegrastats, hasIntelGpuTop, hasNvtop bool
}
|
||||||
|
|
||||||
|
// collectorDefinition describes one selectable GPU collector and how to
// launch it.
type collectorDefinition struct {
	// group is the vendor group the collector belongs to; it is left empty
	// for collectors that are not vendor-specific.
	group string
	// available reports whether the host capabilities allow this collector.
	available bool
	// start launches the collector and reports whether it was started.
	// onFailure, when non-nil, is a callback the collector may invoke if it
	// fails after being started.
	start func(onFailure func()) bool
	// deprecationWarning, when non-empty, is logged before the collector is
	// started.
	deprecationWarning string
}
|
||||||
|
|
||||||
// starts and manages the ongoing collection of GPU data for the specified GPU management utility
|
// starts and manages the ongoing collection of GPU data for the specified GPU management utility
|
||||||
func (c *gpuCollector) start() {
|
func (c *gpuCollector) start() {
|
||||||
for {
|
for {
|
||||||
@@ -392,93 +428,257 @@ func (gm *GPUManager) storeSnapshot(id string, gpu *system.GPUData, cacheKey uin
|
|||||||
gm.lastSnapshots[cacheKey][id] = snapshot
|
gm.lastSnapshots[cacheKey][id] = snapshot
|
||||||
}
|
}
|
||||||
|
|
||||||
// detectGPUs checks for the presence of GPU management tools (nvidia-smi, rocm-smi, tegrastats)
|
// discoverGpuCapabilities checks for available GPU tooling and sysfs support.
|
||||||
// in the system path. It sets the corresponding flags in the GPUManager struct if any of these
|
// It only reports capability presence and does not apply policy decisions.
|
||||||
// tools are found. If none of the tools are found, it returns an error indicating that no GPU
|
func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
|
||||||
// management tools are available.
|
caps := gpuCapabilities{
|
||||||
func (gm *GPUManager) detectGPUs() error {
|
hasAmdSysfs: gm.hasAmdSysfs(),
|
||||||
|
}
|
||||||
if _, err := exec.LookPath(nvidiaSmiCmd); err == nil {
|
if _, err := exec.LookPath(nvidiaSmiCmd); err == nil {
|
||||||
gm.nvidiaSmi = true
|
caps.hasNvidiaSmi = true
|
||||||
}
|
}
|
||||||
if _, err := exec.LookPath(rocmSmiCmd); err == nil {
|
if _, err := exec.LookPath(rocmSmiCmd); err == nil {
|
||||||
if val, _ := GetEnv("AMD_SYSFS"); val == "true" {
|
caps.hasRocmSmi = true
|
||||||
gm.amdgpu = true
|
|
||||||
} else {
|
|
||||||
gm.rocmSmi = true
|
|
||||||
}
|
|
||||||
} else if gm.hasAmdSysfs() {
|
|
||||||
gm.amdgpu = true
|
|
||||||
}
|
}
|
||||||
if _, err := exec.LookPath(tegraStatsCmd); err == nil {
|
if _, err := exec.LookPath(tegraStatsCmd); err == nil {
|
||||||
gm.tegrastats = true
|
caps.hasTegrastats = true
|
||||||
gm.nvidiaSmi = false
|
|
||||||
}
|
}
|
||||||
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
|
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
|
||||||
gm.intelGpuStats = true
|
caps.hasIntelGpuTop = true
|
||||||
}
|
}
|
||||||
if gm.nvidiaSmi || gm.rocmSmi || gm.amdgpu || gm.tegrastats || gm.intelGpuStats || gm.nvml {
|
if _, err := exec.LookPath(nvtopCmd); err == nil {
|
||||||
return nil
|
caps.hasNvtop = true
|
||||||
}
|
}
|
||||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or intel_gpu_top")
|
return caps
|
||||||
}
|
}
|
||||||
|
|
||||||
// startCollector starts the appropriate GPU data collector based on the command
|
func hasAnyGpuCollector(caps gpuCapabilities) bool {
|
||||||
func (gm *GPUManager) startCollector(command string) {
|
return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop
|
||||||
collector := gpuCollector{
|
}
|
||||||
name: command,
|
|
||||||
bufSize: 10 * 1024,
|
func (gm *GPUManager) startIntelCollector() {
|
||||||
}
|
go func() {
|
||||||
switch command {
|
failures := 0
|
||||||
case intelGpuStatsCmd:
|
for {
|
||||||
go func() {
|
if err := gm.collectIntelStats(); err != nil {
|
||||||
failures := 0
|
failures++
|
||||||
for {
|
if failures > maxFailureRetries {
|
||||||
if err := gm.collectIntelStats(); err != nil {
|
break
|
||||||
failures++
|
|
||||||
if failures > maxFailureRetries {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
|
|
||||||
time.Sleep(retryWaitTime)
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
|
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
}()
|
}
|
||||||
case nvidiaSmiCmd:
|
}()
|
||||||
collector.cmdArgs = []string{
|
}
|
||||||
"-l", nvidiaSmiInterval,
|
|
||||||
|
func (gm *GPUManager) startNvidiaSmiCollector(intervalSeconds string) {
|
||||||
|
collector := gpuCollector{
|
||||||
|
name: nvidiaSmiCmd,
|
||||||
|
bufSize: 10 * 1024,
|
||||||
|
cmdArgs: []string{
|
||||||
|
"-l", intervalSeconds,
|
||||||
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
|
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
}
|
},
|
||||||
collector.parse = gm.parseNvidiaData
|
parse: gm.parseNvidiaData,
|
||||||
go collector.start()
|
|
||||||
case tegraStatsCmd:
|
|
||||||
collector.cmdArgs = []string{"--interval", tegraStatsInterval}
|
|
||||||
collector.parse = gm.getJetsonParser()
|
|
||||||
go collector.start()
|
|
||||||
case amdgpuCmd:
|
|
||||||
go func() {
|
|
||||||
if err := gm.collectAmdStats(); err != nil {
|
|
||||||
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
case rocmSmiCmd:
|
|
||||||
collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
|
|
||||||
collector.parse = gm.parseAmdData
|
|
||||||
go func() {
|
|
||||||
failures := 0
|
|
||||||
for {
|
|
||||||
if err := collector.collect(); err != nil {
|
|
||||||
failures++
|
|
||||||
if failures > maxFailureRetries {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
|
|
||||||
}
|
|
||||||
time.Sleep(rocmSmiInterval)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
}
|
}
|
||||||
|
go collector.start()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (gm *GPUManager) startTegraStatsCollector(intervalMilliseconds string) {
|
||||||
|
collector := gpuCollector{
|
||||||
|
name: tegraStatsCmd,
|
||||||
|
bufSize: 10 * 1024,
|
||||||
|
cmdArgs: []string{"--interval", intervalMilliseconds},
|
||||||
|
parse: gm.getJetsonParser(),
|
||||||
|
}
|
||||||
|
go collector.start()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (gm *GPUManager) startRocmSmiCollector(pollInterval time.Duration) {
|
||||||
|
collector := gpuCollector{
|
||||||
|
name: rocmSmiCmd,
|
||||||
|
bufSize: 10 * 1024,
|
||||||
|
cmdArgs: []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"},
|
||||||
|
parse: gm.parseAmdData,
|
||||||
|
}
|
||||||
|
go func() {
|
||||||
|
failures := 0
|
||||||
|
for {
|
||||||
|
if err := collector.collect(); err != nil {
|
||||||
|
failures++
|
||||||
|
if failures > maxFailureRetries {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
|
||||||
|
}
|
||||||
|
time.Sleep(pollInterval)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSource]collectorDefinition {
|
||||||
|
return map[collectorSource]collectorDefinition{
|
||||||
|
collectorSourceNVML: {
|
||||||
|
group: collectorGroupNvidia,
|
||||||
|
available: caps.hasNvidiaSmi,
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
return gm.startNvmlCollector()
|
||||||
|
},
|
||||||
|
},
|
||||||
|
collectorSourceNvidiaSMI: {
|
||||||
|
group: collectorGroupNvidia,
|
||||||
|
available: caps.hasNvidiaSmi,
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
gm.startNvidiaSmiCollector("4") // seconds
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
collectorSourceIntelGpuTop: {
|
||||||
|
group: collectorGroupIntel,
|
||||||
|
available: caps.hasIntelGpuTop,
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
gm.startIntelCollector()
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
collectorSourceAmdSysfs: {
|
||||||
|
group: collectorGroupAmd,
|
||||||
|
available: caps.hasAmdSysfs,
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
return gm.startAmdSysfsCollector()
|
||||||
|
},
|
||||||
|
},
|
||||||
|
collectorSourceRocmSMI: {
|
||||||
|
group: collectorGroupAmd,
|
||||||
|
available: caps.hasRocmSmi,
|
||||||
|
deprecationWarning: "rocm-smi is deprecated and may be removed in a future release",
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
gm.startRocmSmiCollector(4300 * time.Millisecond)
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
collectorSourceNVTop: {
|
||||||
|
available: caps.hasNvtop,
|
||||||
|
start: func(onFailure func()) bool {
|
||||||
|
gm.startNvtopCollector("30", onFailure) // tens of milliseconds
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseCollectorPriority parses GPU_COLLECTOR and returns valid ordered entries.
|
||||||
|
func parseCollectorPriority(value string) []collectorSource {
|
||||||
|
parts := strings.Split(value, ",")
|
||||||
|
priorities := make([]collectorSource, 0, len(parts))
|
||||||
|
for _, raw := range parts {
|
||||||
|
name := collectorSource(strings.TrimSpace(strings.ToLower(raw)))
|
||||||
|
if !isValidCollectorSource(name) {
|
||||||
|
if name != "" {
|
||||||
|
slog.Warn("Ignoring unknown GPU collector", "collector", name)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
priorities = append(priorities, name)
|
||||||
|
}
|
||||||
|
return priorities
|
||||||
|
}
|
||||||
|
|
||||||
|
// startNvmlCollector initializes NVML and starts its polling loop.
|
||||||
|
func (gm *GPUManager) startNvmlCollector() bool {
|
||||||
|
collector := &nvmlCollector{gm: gm}
|
||||||
|
if err := collector.init(); err != nil {
|
||||||
|
slog.Warn("Failed to initialize NVML", "err", err)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
go collector.start()
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// startAmdSysfsCollector starts AMD GPU collection via sysfs.
|
||||||
|
func (gm *GPUManager) startAmdSysfsCollector() bool {
|
||||||
|
go func() {
|
||||||
|
if err := gm.collectAmdStats(); err != nil {
|
||||||
|
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// startCollectorsByPriority starts collectors in order with one source per vendor group.
|
||||||
|
func (gm *GPUManager) startCollectorsByPriority(priorities []collectorSource, caps gpuCapabilities) int {
|
||||||
|
definitions := gm.collectorDefinitions(caps)
|
||||||
|
selectedGroups := make(map[string]bool, 3)
|
||||||
|
started := 0
|
||||||
|
for i, source := range priorities {
|
||||||
|
definition, ok := definitions[source]
|
||||||
|
if !ok || !definition.available {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// nvtop is not a vendor-specific collector, so should only be used if no other collectors are selected or it is first in GPU_COLLECTOR.
|
||||||
|
if source == collectorSourceNVTop {
|
||||||
|
if len(selectedGroups) > 0 {
|
||||||
|
slog.Warn("Skipping nvtop because other collectors are selected")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// if nvtop fails, fall back to remaining collectors.
|
||||||
|
remaining := append([]collectorSource(nil), priorities[i+1:]...)
|
||||||
|
if definition.start(func() {
|
||||||
|
gm.startCollectorsByPriority(remaining, caps)
|
||||||
|
}) {
|
||||||
|
started++
|
||||||
|
return started
|
||||||
|
}
|
||||||
|
}
|
||||||
|
group := definition.group
|
||||||
|
if group == "" || selectedGroups[group] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if definition.deprecationWarning != "" {
|
||||||
|
slog.Warn(definition.deprecationWarning)
|
||||||
|
}
|
||||||
|
if definition.start(nil) {
|
||||||
|
selectedGroups[group] = true
|
||||||
|
started++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return started
|
||||||
|
}
|
||||||
|
|
||||||
|
// resolveLegacyCollectorPriority builds the default collector order when GPU_COLLECTOR is unset.
|
||||||
|
func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []collectorSource {
|
||||||
|
priorities := make([]collectorSource, 0, 4)
|
||||||
|
|
||||||
|
if caps.hasNvidiaSmi && !caps.hasTegrastats {
|
||||||
|
if nvml, _ := GetEnv("NVML"); nvml == "true" {
|
||||||
|
priorities = append(priorities, collectorSourceNVML, collectorSourceNvidiaSMI)
|
||||||
|
} else {
|
||||||
|
priorities = append(priorities, collectorSourceNvidiaSMI)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if caps.hasRocmSmi {
|
||||||
|
if val, _ := GetEnv("AMD_SYSFS"); val == "true" {
|
||||||
|
priorities = append(priorities, collectorSourceAmdSysfs)
|
||||||
|
} else {
|
||||||
|
priorities = append(priorities, collectorSourceRocmSMI)
|
||||||
|
}
|
||||||
|
} else if caps.hasAmdSysfs {
|
||||||
|
priorities = append(priorities, collectorSourceAmdSysfs)
|
||||||
|
}
|
||||||
|
|
||||||
|
if caps.hasIntelGpuTop {
|
||||||
|
priorities = append(priorities, collectorSourceIntelGpuTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keep nvtop as a legacy last resort only when no vendor collector exists.
|
||||||
|
if len(priorities) == 0 && caps.hasNvtop {
|
||||||
|
priorities = append(priorities, collectorSourceNVTop)
|
||||||
|
}
|
||||||
|
return priorities
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewGPUManager creates and initializes a new GPUManager
|
// NewGPUManager creates and initializes a new GPUManager
|
||||||
@@ -487,38 +687,30 @@ func NewGPUManager() (*GPUManager, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
var gm GPUManager
|
var gm GPUManager
|
||||||
if err := gm.detectGPUs(); err != nil {
|
caps := gm.discoverGpuCapabilities()
|
||||||
return nil, err
|
if !hasAnyGpuCollector(caps) {
|
||||||
|
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||||
}
|
}
|
||||||
gm.GpuDataMap = make(map[string]*system.GPUData)
|
gm.GpuDataMap = make(map[string]*system.GPUData)
|
||||||
|
|
||||||
if gm.nvidiaSmi {
|
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
|
||||||
if nvml, _ := GetEnv("NVML"); nvml == "true" {
|
if caps.hasTegrastats {
|
||||||
gm.nvml = true
|
gm.startTegraStatsCollector("3700")
|
||||||
gm.nvidiaSmi = false
|
return &gm, nil
|
||||||
collector := &nvmlCollector{gm: &gm}
|
}
|
||||||
if err := collector.init(); err == nil {
|
|
||||||
go collector.start()
|
// if GPU_COLLECTOR is set, start user-defined collectors.
|
||||||
} else {
|
if collectorConfig, ok := GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
|
||||||
slog.Warn("Failed to initialize NVML, falling back to nvidia-smi", "err", err)
|
priorities := parseCollectorPriority(collectorConfig)
|
||||||
gm.nvidiaSmi = true
|
if gm.startCollectorsByPriority(priorities, caps) == 0 {
|
||||||
gm.startCollector(nvidiaSmiCmd)
|
return nil, fmt.Errorf("no configured GPU collectors are available")
|
||||||
}
|
|
||||||
} else {
|
|
||||||
gm.startCollector(nvidiaSmiCmd)
|
|
||||||
}
|
}
|
||||||
|
return &gm, nil
|
||||||
}
|
}
|
||||||
if gm.rocmSmi {
|
|
||||||
gm.startCollector(rocmSmiCmd)
|
// auto-detect and start collectors when GPU_COLLECTOR is unset.
|
||||||
}
|
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
|
||||||
if gm.amdgpu {
|
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||||
gm.startCollector(amdgpuCmd)
|
|
||||||
}
|
|
||||||
if gm.tegrastats {
|
|
||||||
gm.startCollector(tegraStatsCmd)
|
|
||||||
}
|
|
||||||
if gm.intelGpuStats {
|
|
||||||
gm.startCollector(intelGpuStatsCmd)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &gm, nil
|
return &gm, nil
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ func (gm *GPUManager) hasAmdSysfs() bool {
|
|||||||
|
|
||||||
// collectAmdStats collects AMD GPU metrics directly from sysfs to avoid the overhead of rocm-smi
|
// collectAmdStats collects AMD GPU metrics directly from sysfs to avoid the overhead of rocm-smi
|
||||||
func (gm *GPUManager) collectAmdStats() error {
|
func (gm *GPUManager) collectAmdStats() error {
|
||||||
|
sysfsPollInterval := 3000 * time.Millisecond
|
||||||
cards, err := filepath.Glob("/sys/class/drm/card*")
|
cards, err := filepath.Glob("/sys/class/drm/card*")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -70,7 +71,7 @@ func (gm *GPUManager) collectAmdStats() error {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
failures = 0
|
failures = 0
|
||||||
time.Sleep(rocmSmiInterval)
|
time.Sleep(sysfsPollInterval)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -13,21 +13,3 @@ func (c *nvmlCollector) init() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *nvmlCollector) start() {}
|
func (c *nvmlCollector) start() {}
|
||||||
|
|
||||||
func (c *nvmlCollector) collect() {}
|
|
||||||
|
|
||||||
func openLibrary(name string) (uintptr, error) {
|
|
||||||
return 0, fmt.Errorf("nvml not supported on this platform")
|
|
||||||
}
|
|
||||||
|
|
||||||
func getNVMLPath() string {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
func hasSymbol(lib uintptr, symbol string) bool {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *nvmlCollector) isGPUActive(bdf string) bool {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|||||||
159
agent/gpu_nvtop.go
Normal file
159
agent/gpu_nvtop.go
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/entities/system"
|
||||||
|
)
|
||||||
|
|
||||||
|
// nvtopSnapshot is one device entry decoded from nvtop's JSON output.
// Metric fields are pointers so that a missing or null value can be told
// apart from a present one.
type nvtopSnapshot struct {
	// DeviceName is the GPU's display name; entries without it are ignored.
	DeviceName string `json:"device_name"`
	// Temp, PowerDraw and GpuUtil are numeric strings that may carry a unit
	// suffix (C, W and % respectively).
	Temp      *string `json:"temp"`
	PowerDraw *string `json:"power_draw"`
	GpuUtil   *string `json:"gpu_util"`
	// MemTotal and MemUsed are numeric strings; presumably byte counts, since
	// they are converted with bytesToMegabytes — TODO confirm.
	MemTotal *string `json:"mem_total"`
	MemUsed  *string `json:"mem_used"`
}
|
||||||
|
|
||||||
|
// parseNvtopNumber converts an nvtop numeric string into a float64.
//
// Surrounding whitespace and a single trailing unit suffix (C, W or %) are
// removed before parsing, and whitespace between the value and the unit
// (for example "13 W") is tolerated. Any input that does not parse as a
// float, including an out-of-range value, yields 0.
func parseNvtopNumber(raw string) float64 {
	cleaned := strings.TrimSpace(raw)
	cleaned = strings.TrimSuffix(cleaned, "C")
	cleaned = strings.TrimSuffix(cleaned, "W")
	cleaned = strings.TrimSuffix(cleaned, "%")
	// Stripping the unit can expose whitespace that separated it from the
	// number; ParseFloat rejects that, so trim once more.
	cleaned = strings.TrimSpace(cleaned)

	val, err := strconv.ParseFloat(cleaned, 64)
	if err != nil {
		// Best-effort parsing: report 0 rather than propagating a partial or
		// infinite value from a malformed or out-of-range string.
		return 0
	}
	return val
}
|
||||||
|
|
||||||
|
// parseNvtopData parses a single nvtop JSON snapshot payload.
|
||||||
|
func (gm *GPUManager) parseNvtopData(output []byte) bool {
|
||||||
|
var snapshots []nvtopSnapshot
|
||||||
|
if err := json.Unmarshal(output, &snapshots); err != nil || len(snapshots) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return gm.updateNvtopSnapshots(snapshots)
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateNvtopSnapshots applies one decoded nvtop snapshot batch to GPU accumulators.
|
||||||
|
func (gm *GPUManager) updateNvtopSnapshots(snapshots []nvtopSnapshot) bool {
|
||||||
|
gm.Lock()
|
||||||
|
defer gm.Unlock()
|
||||||
|
|
||||||
|
valid := false
|
||||||
|
usedIDs := make(map[string]struct{}, len(snapshots))
|
||||||
|
for i, sample := range snapshots {
|
||||||
|
if sample.DeviceName == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
indexID := "n" + strconv.Itoa(i)
|
||||||
|
id := indexID
|
||||||
|
|
||||||
|
// nvtop ordering can change, so prefer reusing an existing slot with matching device name.
|
||||||
|
if existingByIndex, ok := gm.GpuDataMap[indexID]; ok && existingByIndex.Name != "" && existingByIndex.Name != sample.DeviceName {
|
||||||
|
for existingID, gpu := range gm.GpuDataMap {
|
||||||
|
if !strings.HasPrefix(existingID, "n") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, taken := usedIDs[existingID]; taken {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if gpu.Name == sample.DeviceName {
|
||||||
|
id = existingID
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := gm.GpuDataMap[id]; !ok {
|
||||||
|
gm.GpuDataMap[id] = &system.GPUData{Name: sample.DeviceName}
|
||||||
|
}
|
||||||
|
gpu := gm.GpuDataMap[id]
|
||||||
|
gpu.Name = sample.DeviceName
|
||||||
|
|
||||||
|
if sample.Temp != nil {
|
||||||
|
gpu.Temperature = parseNvtopNumber(*sample.Temp)
|
||||||
|
}
|
||||||
|
if sample.MemUsed != nil {
|
||||||
|
gpu.MemoryUsed = bytesToMegabytes(parseNvtopNumber(*sample.MemUsed))
|
||||||
|
}
|
||||||
|
if sample.MemTotal != nil {
|
||||||
|
gpu.MemoryTotal = bytesToMegabytes(parseNvtopNumber(*sample.MemTotal))
|
||||||
|
}
|
||||||
|
if sample.GpuUtil != nil {
|
||||||
|
gpu.Usage += parseNvtopNumber(*sample.GpuUtil)
|
||||||
|
}
|
||||||
|
if sample.PowerDraw != nil {
|
||||||
|
gpu.Power += parseNvtopNumber(*sample.PowerDraw)
|
||||||
|
}
|
||||||
|
gpu.Count++
|
||||||
|
usedIDs[id] = struct{}{}
|
||||||
|
valid = true
|
||||||
|
}
|
||||||
|
return valid
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectNvtopStats runs nvtop loop mode and continuously decodes JSON snapshots.
|
||||||
|
func (gm *GPUManager) collectNvtopStats(interval string) error {
|
||||||
|
cmd := exec.Command(nvtopCmd, "-lP", "-d", interval)
|
||||||
|
stdout, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
_ = stdout.Close()
|
||||||
|
if cmd.ProcessState == nil || !cmd.ProcessState.Exited() {
|
||||||
|
_ = cmd.Process.Kill()
|
||||||
|
}
|
||||||
|
_ = cmd.Wait()
|
||||||
|
}()
|
||||||
|
|
||||||
|
decoder := json.NewDecoder(stdout)
|
||||||
|
foundValid := false
|
||||||
|
for {
|
||||||
|
var snapshots []nvtopSnapshot
|
||||||
|
if err := decoder.Decode(&snapshots); err != nil {
|
||||||
|
if err == io.EOF {
|
||||||
|
if foundValid {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return errNoValidData
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if gm.updateNvtopSnapshots(snapshots) {
|
||||||
|
foundValid = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// startNvtopCollector starts nvtop collection with retry or fallback callback handling.
|
||||||
|
func (gm *GPUManager) startNvtopCollector(interval string, onFailure func()) {
|
||||||
|
go func() {
|
||||||
|
failures := 0
|
||||||
|
for {
|
||||||
|
if err := gm.collectNvtopStats(interval); err != nil {
|
||||||
|
if onFailure != nil {
|
||||||
|
slog.Warn("Error collecting GPU data via nvtop", "err", err)
|
||||||
|
onFailure()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
failures++
|
||||||
|
if failures > maxFailureRetries {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
slog.Warn("Error collecting GPU data via nvtop", "err", err)
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
@@ -250,6 +250,100 @@ func TestParseAmdData(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseNvtopData(t *testing.T) {
|
||||||
|
input, err := os.ReadFile("test-data/nvtop.json")
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
valid := gm.parseNvtopData(input)
|
||||||
|
require.True(t, valid)
|
||||||
|
|
||||||
|
g0, ok := gm.GpuDataMap["n0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", g0.Name)
|
||||||
|
assert.Equal(t, 48.0, g0.Temperature)
|
||||||
|
assert.Equal(t, 5.0, g0.Usage)
|
||||||
|
assert.Equal(t, 13.0, g0.Power)
|
||||||
|
assert.Equal(t, bytesToMegabytes(349372416), g0.MemoryUsed)
|
||||||
|
assert.Equal(t, bytesToMegabytes(4294967296), g0.MemoryTotal)
|
||||||
|
assert.Equal(t, 1.0, g0.Count)
|
||||||
|
|
||||||
|
g1, ok := gm.GpuDataMap["n1"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "AMD Radeon 680M", g1.Name)
|
||||||
|
assert.Equal(t, 48.0, g1.Temperature)
|
||||||
|
assert.Equal(t, 12.0, g1.Usage)
|
||||||
|
assert.Equal(t, 9.0, g1.Power)
|
||||||
|
assert.Equal(t, bytesToMegabytes(1213784064), g1.MemoryUsed)
|
||||||
|
assert.Equal(t, bytesToMegabytes(16929173504), g1.MemoryTotal)
|
||||||
|
assert.Equal(t, 1.0, g1.Count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUpdateNvtopSnapshotsKeepsDeviceAssociationWhenOrderChanges(t *testing.T) {
|
||||||
|
strPtr := func(s string) *string { return &s }
|
||||||
|
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
|
||||||
|
firstBatch := []nvtopSnapshot{
|
||||||
|
{
|
||||||
|
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||||
|
GpuUtil: strPtr("20%"),
|
||||||
|
PowerDraw: strPtr("10W"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
DeviceName: "AMD Radeon 680M",
|
||||||
|
GpuUtil: strPtr("30%"),
|
||||||
|
PowerDraw: strPtr("20W"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
secondBatchSwapped := []nvtopSnapshot{
|
||||||
|
{
|
||||||
|
DeviceName: "AMD Radeon 680M",
|
||||||
|
GpuUtil: strPtr("40%"),
|
||||||
|
PowerDraw: strPtr("25W"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||||
|
GpuUtil: strPtr("50%"),
|
||||||
|
PowerDraw: strPtr("15W"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
require.True(t, gm.updateNvtopSnapshots(firstBatch))
|
||||||
|
require.True(t, gm.updateNvtopSnapshots(secondBatchSwapped))
|
||||||
|
|
||||||
|
nvidia := gm.GpuDataMap["n0"]
|
||||||
|
require.NotNil(t, nvidia)
|
||||||
|
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", nvidia.Name)
|
||||||
|
assert.Equal(t, 70.0, nvidia.Usage)
|
||||||
|
assert.Equal(t, 25.0, nvidia.Power)
|
||||||
|
assert.Equal(t, 2.0, nvidia.Count)
|
||||||
|
|
||||||
|
amd := gm.GpuDataMap["n1"]
|
||||||
|
require.NotNil(t, amd)
|
||||||
|
assert.Equal(t, "AMD Radeon 680M", amd.Name)
|
||||||
|
assert.Equal(t, 70.0, amd.Usage)
|
||||||
|
assert.Equal(t, 45.0, amd.Power)
|
||||||
|
assert.Equal(t, 2.0, amd.Count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseCollectorPriority(t *testing.T) {
|
||||||
|
got := parseCollectorPriority(" nvml, nvidia-smi, intel_gpu_top, amd_sysfs, nvtop, rocm-smi, bad ")
|
||||||
|
want := []collectorSource{
|
||||||
|
collectorSourceNVML,
|
||||||
|
collectorSourceNvidiaSMI,
|
||||||
|
collectorSourceIntelGpuTop,
|
||||||
|
collectorSourceAmdSysfs,
|
||||||
|
collectorSourceNVTop,
|
||||||
|
collectorSourceRocmSMI,
|
||||||
|
}
|
||||||
|
assert.Equal(t, want, got)
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseJetsonData(t *testing.T) {
|
func TestParseJetsonData(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
@@ -987,36 +1081,35 @@ func TestCalculateGPUAverage(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestDetectGPUs(t *testing.T) {
|
func TestGPUCapabilitiesAndLegacyPriority(t *testing.T) {
|
||||||
// Save original PATH
|
// Save original PATH
|
||||||
origPath := os.Getenv("PATH")
|
origPath := os.Getenv("PATH")
|
||||||
defer os.Setenv("PATH", origPath)
|
defer os.Setenv("PATH", origPath)
|
||||||
|
hasAmdSysfs := (&GPUManager{}).hasAmdSysfs()
|
||||||
// Set up temp dir with the commands
|
|
||||||
tempDir := t.TempDir()
|
|
||||||
os.Setenv("PATH", tempDir)
|
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
setupCommands func() error
|
setupCommands func(string) error
|
||||||
wantNvidiaSmi bool
|
wantNvidiaSmi bool
|
||||||
wantRocmSmi bool
|
wantRocmSmi bool
|
||||||
wantTegrastats bool
|
wantTegrastats bool
|
||||||
|
wantNvtop bool
|
||||||
wantErr bool
|
wantErr bool
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "nvidia-smi not available",
|
name: "nvidia-smi not available",
|
||||||
setupCommands: func() error {
|
setupCommands: func(_ string) error {
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
wantNvidiaSmi: false,
|
wantNvidiaSmi: false,
|
||||||
wantRocmSmi: false,
|
wantRocmSmi: false,
|
||||||
wantTegrastats: false,
|
wantTegrastats: false,
|
||||||
|
wantNvtop: false,
|
||||||
wantErr: true,
|
wantErr: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "nvidia-smi available",
|
name: "nvidia-smi available",
|
||||||
setupCommands: func() error {
|
setupCommands: func(tempDir string) error {
|
||||||
path := filepath.Join(tempDir, "nvidia-smi")
|
path := filepath.Join(tempDir, "nvidia-smi")
|
||||||
script := `#!/bin/sh
|
script := `#!/bin/sh
|
||||||
echo "test"`
|
echo "test"`
|
||||||
@@ -1028,29 +1121,14 @@ echo "test"`
|
|||||||
wantNvidiaSmi: true,
|
wantNvidiaSmi: true,
|
||||||
wantTegrastats: false,
|
wantTegrastats: false,
|
||||||
wantRocmSmi: false,
|
wantRocmSmi: false,
|
||||||
|
wantNvtop: false,
|
||||||
wantErr: false,
|
wantErr: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "rocm-smi available",
|
name: "rocm-smi available",
|
||||||
setupCommands: func() error {
|
setupCommands: func(tempDir string) error {
|
||||||
path := filepath.Join(tempDir, "rocm-smi")
|
path := filepath.Join(tempDir, "rocm-smi")
|
||||||
script := `#!/bin/sh
|
script := `#!/bin/sh
|
||||||
echo "test"`
|
|
||||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
},
|
|
||||||
wantNvidiaSmi: true,
|
|
||||||
wantRocmSmi: true,
|
|
||||||
wantTegrastats: false,
|
|
||||||
wantErr: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "tegrastats available",
|
|
||||||
setupCommands: func() error {
|
|
||||||
path := filepath.Join(tempDir, "tegrastats")
|
|
||||||
script := `#!/bin/sh
|
|
||||||
echo "test"`
|
echo "test"`
|
||||||
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -1059,12 +1137,47 @@ echo "test"`
|
|||||||
},
|
},
|
||||||
wantNvidiaSmi: false,
|
wantNvidiaSmi: false,
|
||||||
wantRocmSmi: true,
|
wantRocmSmi: true,
|
||||||
|
wantTegrastats: false,
|
||||||
|
wantNvtop: false,
|
||||||
|
wantErr: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "tegrastats available",
|
||||||
|
setupCommands: func(tempDir string) error {
|
||||||
|
path := filepath.Join(tempDir, "tegrastats")
|
||||||
|
script := `#!/bin/sh
|
||||||
|
echo "test"`
|
||||||
|
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
wantNvidiaSmi: false,
|
||||||
|
wantRocmSmi: false,
|
||||||
wantTegrastats: true,
|
wantTegrastats: true,
|
||||||
|
wantNvtop: false,
|
||||||
|
wantErr: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "nvtop available",
|
||||||
|
setupCommands: func(tempDir string) error {
|
||||||
|
path := filepath.Join(tempDir, "nvtop")
|
||||||
|
script := `#!/bin/sh
|
||||||
|
echo "[]"`
|
||||||
|
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
wantNvidiaSmi: false,
|
||||||
|
wantRocmSmi: false,
|
||||||
|
wantTegrastats: false,
|
||||||
|
wantNvtop: true,
|
||||||
wantErr: false,
|
wantErr: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "no gpu tools available",
|
name: "no gpu tools available",
|
||||||
setupCommands: func() error {
|
setupCommands: func(_ string) error {
|
||||||
os.Setenv("PATH", "")
|
os.Setenv("PATH", "")
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
@@ -1074,29 +1187,53 @@ echo "test"`
|
|||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
if err := tt.setupCommands(); err != nil {
|
tempDir := t.TempDir()
|
||||||
|
os.Setenv("PATH", tempDir)
|
||||||
|
if err := tt.setupCommands(tempDir); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
gm := &GPUManager{}
|
gm := &GPUManager{}
|
||||||
err := gm.detectGPUs()
|
caps := gm.discoverGpuCapabilities()
|
||||||
|
var err error
|
||||||
|
if !hasAnyGpuCollector(caps) {
|
||||||
|
err = fmt.Errorf(noGPUFoundMsg)
|
||||||
|
}
|
||||||
|
priorities := gm.resolveLegacyCollectorPriority(caps)
|
||||||
|
hasPriority := func(source collectorSource) bool {
|
||||||
|
for _, s := range priorities {
|
||||||
|
if s == source {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
gotNvidiaSmi := hasPriority(collectorSourceNvidiaSMI)
|
||||||
|
gotRocmSmi := hasPriority(collectorSourceRocmSMI)
|
||||||
|
gotTegrastats := caps.hasTegrastats
|
||||||
|
gotNvtop := caps.hasNvtop
|
||||||
|
|
||||||
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats)
|
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gotNvidiaSmi, gotRocmSmi, gotTegrastats)
|
||||||
|
|
||||||
if tt.wantErr {
|
wantErr := tt.wantErr
|
||||||
|
if hasAmdSysfs && (tt.name == "nvidia-smi not available" || tt.name == "no gpu tools available") {
|
||||||
|
wantErr = false
|
||||||
|
}
|
||||||
|
if wantErr {
|
||||||
assert.Error(t, err)
|
assert.Error(t, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi)
|
assert.Equal(t, tt.wantNvidiaSmi, gotNvidiaSmi)
|
||||||
assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi)
|
assert.Equal(t, tt.wantRocmSmi, gotRocmSmi)
|
||||||
assert.Equal(t, tt.wantTegrastats, gm.tegrastats)
|
assert.Equal(t, tt.wantTegrastats, gotTegrastats)
|
||||||
|
assert.Equal(t, tt.wantNvtop, gotNvtop)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestStartCollector(t *testing.T) {
|
func TestCollectorStartHelpers(t *testing.T) {
|
||||||
// Save original PATH
|
// Save original PATH
|
||||||
origPath := os.Getenv("PATH")
|
origPath := os.Getenv("PATH")
|
||||||
defer os.Setenv("PATH", origPath)
|
defer os.Setenv("PATH", origPath)
|
||||||
@@ -1181,6 +1318,27 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "nvtop collector",
|
||||||
|
command: "nvtop",
|
||||||
|
setup: func(t *testing.T) error {
|
||||||
|
path := filepath.Join(dir, "nvtop")
|
||||||
|
script := `#!/bin/sh
|
||||||
|
echo '[{"device_name":"NVIDIA Test GPU","temp":"52C","power_draw":"31W","gpu_util":"37%","mem_total":"4294967296","mem_used":"536870912","processes":[]}]'`
|
||||||
|
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
validate: func(t *testing.T, gm *GPUManager) {
|
||||||
|
gpu, exists := gm.GpuDataMap["n0"]
|
||||||
|
assert.True(t, exists)
|
||||||
|
if exists {
|
||||||
|
assert.Equal(t, "NVIDIA Test GPU", gpu.Name)
|
||||||
|
assert.Equal(t, 52.0, gpu.Temperature)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
@@ -1193,13 +1351,157 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
|
|||||||
GpuDataMap: make(map[string]*system.GPUData),
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tt.gm.startCollector(tt.command)
|
switch tt.command {
|
||||||
|
case nvidiaSmiCmd:
|
||||||
|
tt.gm.startNvidiaSmiCollector("4")
|
||||||
|
case rocmSmiCmd:
|
||||||
|
tt.gm.startRocmSmiCollector(4300 * time.Millisecond)
|
||||||
|
case tegraStatsCmd:
|
||||||
|
tt.gm.startTegraStatsCollector("3700")
|
||||||
|
case nvtopCmd:
|
||||||
|
tt.gm.startNvtopCollector("30", nil)
|
||||||
|
default:
|
||||||
|
t.Fatalf("unknown test command %q", tt.command)
|
||||||
|
}
|
||||||
time.Sleep(50 * time.Millisecond) // Give collector time to run
|
time.Sleep(50 * time.Millisecond) // Give collector time to run
|
||||||
tt.validate(t, tt.gm)
|
tt.validate(t, tt.gm)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNewGPUManagerPriorityNvtopFallback(t *testing.T) {
|
||||||
|
origPath := os.Getenv("PATH")
|
||||||
|
defer os.Setenv("PATH", origPath)
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
os.Setenv("PATH", dir)
|
||||||
|
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvtop,nvidia-smi")
|
||||||
|
|
||||||
|
nvtopPath := filepath.Join(dir, "nvtop")
|
||||||
|
nvtopScript := `#!/bin/sh
|
||||||
|
echo 'not-json'`
|
||||||
|
require.NoError(t, os.WriteFile(nvtopPath, []byte(nvtopScript), 0755))
|
||||||
|
|
||||||
|
nvidiaPath := filepath.Join(dir, "nvidia-smi")
|
||||||
|
nvidiaScript := `#!/bin/sh
|
||||||
|
echo "0, NVIDIA Priority GPU, 45, 512, 2048, 12, 25"`
|
||||||
|
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
|
||||||
|
|
||||||
|
gm, err := NewGPUManager()
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotNil(t, gm)
|
||||||
|
|
||||||
|
time.Sleep(150 * time.Millisecond)
|
||||||
|
gpu, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "Priority GPU", gpu.Name)
|
||||||
|
assert.Equal(t, 45.0, gpu.Temperature)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewGPUManagerPriorityMixedCollectors(t *testing.T) {
|
||||||
|
origPath := os.Getenv("PATH")
|
||||||
|
defer os.Setenv("PATH", origPath)
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
os.Setenv("PATH", dir)
|
||||||
|
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "intel_gpu_top,rocm-smi")
|
||||||
|
|
||||||
|
intelPath := filepath.Join(dir, "intel_gpu_top")
|
||||||
|
intelScript := `#!/bin/sh
|
||||||
|
echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS VCS"
|
||||||
|
echo " req act /s % gpu pkg rd wr % se wa % se wa"
|
||||||
|
echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0"
|
||||||
|
echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0"
|
||||||
|
`
|
||||||
|
require.NoError(t, os.WriteFile(intelPath, []byte(intelScript), 0755))
|
||||||
|
|
||||||
|
rocmPath := filepath.Join(dir, "rocm-smi")
|
||||||
|
rocmScript := `#!/bin/sh
|
||||||
|
echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "GUID": "34756"}}'
|
||||||
|
`
|
||||||
|
require.NoError(t, os.WriteFile(rocmPath, []byte(rocmScript), 0755))
|
||||||
|
|
||||||
|
gm, err := NewGPUManager()
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotNil(t, gm)
|
||||||
|
|
||||||
|
time.Sleep(150 * time.Millisecond)
|
||||||
|
_, intelOk := gm.GpuDataMap["i0"]
|
||||||
|
_, amdOk := gm.GpuDataMap["34756"]
|
||||||
|
assert.True(t, intelOk)
|
||||||
|
assert.True(t, amdOk)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi(t *testing.T) {
|
||||||
|
origPath := os.Getenv("PATH")
|
||||||
|
defer os.Setenv("PATH", origPath)
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
os.Setenv("PATH", dir)
|
||||||
|
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml,nvidia-smi")
|
||||||
|
|
||||||
|
nvidiaPath := filepath.Join(dir, "nvidia-smi")
|
||||||
|
nvidiaScript := `#!/bin/sh
|
||||||
|
echo "0, NVIDIA Fallback GPU, 41, 256, 1024, 8, 14"`
|
||||||
|
require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))
|
||||||
|
|
||||||
|
gm, err := NewGPUManager()
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotNil(t, gm)
|
||||||
|
|
||||||
|
time.Sleep(150 * time.Millisecond)
|
||||||
|
gpu, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "Fallback GPU", gpu.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
|
||||||
|
origPath := os.Getenv("PATH")
|
||||||
|
defer os.Setenv("PATH", origPath)
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
os.Setenv("PATH", dir)
|
||||||
|
|
||||||
|
t.Run("configured valid collector unavailable", func(t *testing.T) {
|
||||||
|
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
|
||||||
|
gm, err := NewGPUManager()
|
||||||
|
require.Nil(t, gm)
|
||||||
|
require.Error(t, err)
|
||||||
|
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("configured collector list has only unknown entries", func(t *testing.T) {
|
||||||
|
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "bad,unknown")
|
||||||
|
gm, err := NewGPUManager()
|
||||||
|
require.Nil(t, gm)
|
||||||
|
require.Error(t, err)
|
||||||
|
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
|
||||||
|
origPath := os.Getenv("PATH")
|
||||||
|
defer os.Setenv("PATH", origPath)
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
os.Setenv("PATH", dir)
|
||||||
|
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
|
||||||
|
|
||||||
|
tegraPath := filepath.Join(dir, "tegrastats")
|
||||||
|
tegraScript := `#!/bin/sh
|
||||||
|
echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"`
|
||||||
|
require.NoError(t, os.WriteFile(tegraPath, []byte(tegraScript), 0755))
|
||||||
|
|
||||||
|
gm, err := NewGPUManager()
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotNil(t, gm)
|
||||||
|
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
gpu, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "GPU", gpu.Name)
|
||||||
|
}
|
||||||
|
|
||||||
// TestAccumulationTableDriven tests the accumulation behavior for all three GPU types
|
// TestAccumulationTableDriven tests the accumulation behavior for all three GPU types
|
||||||
func TestAccumulation(t *testing.T) {
|
func TestAccumulation(t *testing.T) {
|
||||||
type expectedGPUValues struct {
|
type expectedGPUValues struct {
|
||||||
|
|||||||
34
agent/test-data/nvtop.json
Normal file
34
agent/test-data/nvtop.json
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"device_name": "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
|
||||||
|
"gpu_clock": "1485MHz",
|
||||||
|
"mem_clock": "6001MHz",
|
||||||
|
"temp": "48C",
|
||||||
|
"fan_speed": null,
|
||||||
|
"power_draw": "13W",
|
||||||
|
"gpu_util": "5%",
|
||||||
|
"encode": "0%",
|
||||||
|
"decode": "0%",
|
||||||
|
"mem_util": "8%",
|
||||||
|
"mem_total": "4294967296",
|
||||||
|
"mem_used": "349372416",
|
||||||
|
"mem_free": "3945594880",
|
||||||
|
"processes" : []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"device_name": "AMD Radeon 680M",
|
||||||
|
"gpu_clock": "2200MHz",
|
||||||
|
"mem_clock": "2400MHz",
|
||||||
|
"temp": "48C",
|
||||||
|
"fan_speed": "CPU Fan",
|
||||||
|
"power_draw": "9W",
|
||||||
|
"gpu_util": "12%",
|
||||||
|
"encode": null,
|
||||||
|
"decode": "0%",
|
||||||
|
"mem_util": "7%",
|
||||||
|
"mem_total": "16929173504",
|
||||||
|
"mem_used": "1213784064",
|
||||||
|
"mem_free": "15715389440",
|
||||||
|
"processes" : []
|
||||||
|
}
|
||||||
|
]
|
||||||
Reference in New Issue
Block a user