From afdc3f777927ac8c8be5f362b74b8f793d7bc1bd Mon Sep 17 00:00:00 2001 From: henrygd Date: Sat, 28 Mar 2026 18:58:16 -0400 Subject: [PATCH] fix(agent): allow GPU_COLLECTOR=nvml without nvidia-smi (#1849) --- agent/gpu.go | 11 ++++++----- agent/gpu_test.go | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/agent/gpu.go b/agent/gpu.go index 03c0783e..6e043e6f 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -542,7 +542,7 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo return map[collectorSource]collectorDefinition{ collectorSourceNVML: { group: collectorGroupNvidia, - available: caps.hasNvidiaSmi, + available: true, start: func(_ func()) bool { return gm.startNvmlCollector() }, @@ -734,9 +734,6 @@ func NewGPUManager() (*GPUManager, error) { } var gm GPUManager caps := gm.discoverGpuCapabilities() - if !hasAnyGpuCollector(caps) { - return nil, fmt.Errorf(noGPUFoundMsg) - } gm.GpuDataMap = make(map[string]*system.GPUData) // Jetson devices should always use tegrastats (ignore GPU_COLLECTOR). @@ -745,7 +742,7 @@ func NewGPUManager() (*GPUManager, error) { return &gm, nil } - // if GPU_COLLECTOR is set, start user-defined collectors. + // Respect explicit collector selection before capability auto-detection. if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" { priorities := parseCollectorPriority(collectorConfig) if gm.startCollectorsByPriority(priorities, caps) == 0 { @@ -754,6 +751,10 @@ func NewGPUManager() (*GPUManager, error) { return &gm, nil } + if !hasAnyGpuCollector(caps) { + return nil, fmt.Errorf(noGPUFoundMsg) + } + // auto-detect and start collectors when GPU_COLLECTOR is unset. if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 { return nil, fmt.Errorf(noGPUFoundMsg) diff --git a/agent/gpu_test.go b/agent/gpu_test.go index ade17d91..d595ab1b 100644 --- a/agent/gpu_test.go +++ b/agent/gpu_test.go @@ -1461,6 +1461,25 @@ func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) { }) } +func TestCollectorDefinitionsNvmlDoesNotRequireNvidiaSmi(t *testing.T) { + gm := &GPUManager{} + definitions := gm.collectorDefinitions(gpuCapabilities{}) + require.Contains(t, definitions, collectorSourceNVML) + assert.True(t, definitions[collectorSourceNVML].available) +} + +func TestNewGPUManagerConfiguredNvmlBypassesCapabilityGate(t *testing.T) { + dir := t.TempDir() + t.Setenv("PATH", dir) + t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml") + + gm, err := NewGPUManager() + require.Nil(t, gm) + require.Error(t, err) + assert.Contains(t, err.Error(), "no configured GPU collectors are available") + assert.NotContains(t, err.Error(), noGPUFoundMsg) +} + func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) { dir := t.TempDir() t.Setenv("PATH", dir)