mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-29 09:56:17 +02:00
fix(agent): allow GPU_COLLECTOR=nvml without nvidia-smi (#1849)
This commit is contained in:
11
agent/gpu.go
11
agent/gpu.go
@@ -542,7 +542,7 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo
|
||||
return map[collectorSource]collectorDefinition{
|
||||
collectorSourceNVML: {
|
||||
group: collectorGroupNvidia,
|
||||
available: caps.hasNvidiaSmi,
|
||||
available: true,
|
||||
start: func(_ func()) bool {
|
||||
return gm.startNvmlCollector()
|
||||
},
|
||||
@@ -734,9 +734,6 @@ func NewGPUManager() (*GPUManager, error) {
|
||||
}
|
||||
var gm GPUManager
|
||||
caps := gm.discoverGpuCapabilities()
|
||||
if !hasAnyGpuCollector(caps) {
|
||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
gm.GpuDataMap = make(map[string]*system.GPUData)
|
||||
|
||||
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
|
||||
@@ -745,7 +742,7 @@ func NewGPUManager() (*GPUManager, error) {
|
||||
return &gm, nil
|
||||
}
|
||||
|
||||
// if GPU_COLLECTOR is set, start user-defined collectors.
|
||||
// Respect explicit collector selection before capability auto-detection.
|
||||
if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
|
||||
priorities := parseCollectorPriority(collectorConfig)
|
||||
if gm.startCollectorsByPriority(priorities, caps) == 0 {
|
||||
@@ -754,6 +751,10 @@ func NewGPUManager() (*GPUManager, error) {
|
||||
return &gm, nil
|
||||
}
|
||||
|
||||
if !hasAnyGpuCollector(caps) {
|
||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||
}
|
||||
|
||||
// auto-detect and start collectors when GPU_COLLECTOR is unset.
|
||||
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
|
||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||
|
||||
@@ -1461,6 +1461,25 @@ func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestCollectorDefinitionsNvmlDoesNotRequireNvidiaSmi(t *testing.T) {
|
||||
gm := &GPUManager{}
|
||||
definitions := gm.collectorDefinitions(gpuCapabilities{})
|
||||
require.Contains(t, definitions, collectorSourceNVML)
|
||||
assert.True(t, definitions[collectorSourceNVML].available)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerConfiguredNvmlBypassesCapabilityGate(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
t.Setenv("PATH", dir)
|
||||
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml")
|
||||
|
||||
gm, err := NewGPUManager()
|
||||
require.Nil(t, gm)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||
assert.NotContains(t, err.Error(), noGPUFoundMsg)
|
||||
}
|
||||
|
||||
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
t.Setenv("PATH", dir)
|
||||
|
||||
Reference in New Issue
Block a user