diff --git a/agent/gpu.go b/agent/gpu.go index 03c0783e..6e043e6f 100644 --- a/agent/gpu.go +++ b/agent/gpu.go @@ -542,7 +542,7 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo return map[collectorSource]collectorDefinition{ collectorSourceNVML: { group: collectorGroupNvidia, - available: caps.hasNvidiaSmi, + available: true, start: func(_ func()) bool { return gm.startNvmlCollector() }, @@ -734,9 +734,6 @@ func NewGPUManager() (*GPUManager, error) { } var gm GPUManager caps := gm.discoverGpuCapabilities() - if !hasAnyGpuCollector(caps) { - return nil, fmt.Errorf(noGPUFoundMsg) - } gm.GpuDataMap = make(map[string]*system.GPUData) // Jetson devices should always use tegrastats (ignore GPU_COLLECTOR). @@ -745,7 +742,7 @@ func NewGPUManager() (*GPUManager, error) { return &gm, nil } - // if GPU_COLLECTOR is set, start user-defined collectors. + // Respect explicit collector selection before capability auto-detection. if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" { priorities := parseCollectorPriority(collectorConfig) if gm.startCollectorsByPriority(priorities, caps) == 0 { @@ -754,6 +751,10 @@ func NewGPUManager() (*GPUManager, error) { return &gm, nil } + if !hasAnyGpuCollector(caps) { + return nil, fmt.Errorf(noGPUFoundMsg) + } + // auto-detect and start collectors when GPU_COLLECTOR is unset. if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 { return nil, fmt.Errorf(noGPUFoundMsg) diff --git a/agent/gpu_test.go b/agent/gpu_test.go index ade17d91..d595ab1b 100644 --- a/agent/gpu_test.go +++ b/agent/gpu_test.go @@ -1461,6 +1461,25 @@ func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) { }) } +func TestCollectorDefinitionsNvmlDoesNotRequireNvidiaSmi(t *testing.T) { + gm := &GPUManager{} + definitions := gm.collectorDefinitions(gpuCapabilities{}) + require.Contains(t, definitions, collectorSourceNVML) + assert.True(t, definitions[collectorSourceNVML].available) +} + +func TestNewGPUManagerConfiguredNvmlBypassesCapabilityGate(t *testing.T) { + dir := t.TempDir() + t.Setenv("PATH", dir) + t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml") + + gm, err := NewGPUManager() + require.Nil(t, gm) + require.Error(t, err) + assert.Contains(t, err.Error(), "no configured GPU collectors are available") + assert.NotContains(t, err.Error(), noGPUFoundMsg) +} + func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) { dir := t.TempDir() t.Setenv("PATH", dir)