fix(agent): allow GPU_COLLECTOR=nvml without nvidia-smi (#1849)

This commit is contained in:
henrygd
2026-03-28 18:58:16 -04:00
parent a227c77526
commit afdc3f7779
2 changed files with 25 additions and 5 deletions

View File

@@ -542,7 +542,7 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo
return map[collectorSource]collectorDefinition{
collectorSourceNVML: {
group: collectorGroupNvidia,
available: caps.hasNvidiaSmi,
available: true,
start: func(_ func()) bool {
return gm.startNvmlCollector()
},
@@ -734,9 +734,6 @@ func NewGPUManager() (*GPUManager, error) {
}
var gm GPUManager
caps := gm.discoverGpuCapabilities()
if !hasAnyGpuCollector(caps) {
return nil, fmt.Errorf(noGPUFoundMsg)
}
gm.GpuDataMap = make(map[string]*system.GPUData)
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
@@ -745,7 +742,7 @@ func NewGPUManager() (*GPUManager, error) {
return &gm, nil
}
// if GPU_COLLECTOR is set, start user-defined collectors.
// Respect explicit collector selection before capability auto-detection.
if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
priorities := parseCollectorPriority(collectorConfig)
if gm.startCollectorsByPriority(priorities, caps) == 0 {
@@ -754,6 +751,10 @@ func NewGPUManager() (*GPUManager, error) {
return &gm, nil
}
if !hasAnyGpuCollector(caps) {
return nil, fmt.Errorf(noGPUFoundMsg)
}
// auto-detect and start collectors when GPU_COLLECTOR is unset.
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
return nil, fmt.Errorf(noGPUFoundMsg)

View File

@@ -1461,6 +1461,25 @@ func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
})
}
func TestCollectorDefinitionsNvmlDoesNotRequireNvidiaSmi(t *testing.T) {
gm := &GPUManager{}
definitions := gm.collectorDefinitions(gpuCapabilities{})
require.Contains(t, definitions, collectorSourceNVML)
assert.True(t, definitions[collectorSourceNVML].available)
}
func TestNewGPUManagerConfiguredNvmlBypassesCapabilityGate(t *testing.T) {
dir := t.TempDir()
t.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml")
gm, err := NewGPUManager()
require.Nil(t, gm)
require.Error(t, err)
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
assert.NotContains(t, err.Error(), noGPUFoundMsg)
}
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
dir := t.TempDir()
t.Setenv("PATH", dir)