mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-31 18:56:41 +02:00
fix(agent): allow GPU_COLLECTOR=nvml without nvidia-smi (#1849)
This commit is contained in:
11
agent/gpu.go
11
agent/gpu.go
@@ -542,7 +542,7 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo
|
|||||||
return map[collectorSource]collectorDefinition{
|
return map[collectorSource]collectorDefinition{
|
||||||
collectorSourceNVML: {
|
collectorSourceNVML: {
|
||||||
group: collectorGroupNvidia,
|
group: collectorGroupNvidia,
|
||||||
available: caps.hasNvidiaSmi,
|
available: true,
|
||||||
start: func(_ func()) bool {
|
start: func(_ func()) bool {
|
||||||
return gm.startNvmlCollector()
|
return gm.startNvmlCollector()
|
||||||
},
|
},
|
||||||
@@ -734,9 +734,6 @@ func NewGPUManager() (*GPUManager, error) {
|
|||||||
}
|
}
|
||||||
var gm GPUManager
|
var gm GPUManager
|
||||||
caps := gm.discoverGpuCapabilities()
|
caps := gm.discoverGpuCapabilities()
|
||||||
if !hasAnyGpuCollector(caps) {
|
|
||||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
|
||||||
}
|
|
||||||
gm.GpuDataMap = make(map[string]*system.GPUData)
|
gm.GpuDataMap = make(map[string]*system.GPUData)
|
||||||
|
|
||||||
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
|
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
|
||||||
@@ -745,7 +742,7 @@ func NewGPUManager() (*GPUManager, error) {
|
|||||||
return &gm, nil
|
return &gm, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// if GPU_COLLECTOR is set, start user-defined collectors.
|
// Respect explicit collector selection before capability auto-detection.
|
||||||
if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
|
if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
|
||||||
priorities := parseCollectorPriority(collectorConfig)
|
priorities := parseCollectorPriority(collectorConfig)
|
||||||
if gm.startCollectorsByPriority(priorities, caps) == 0 {
|
if gm.startCollectorsByPriority(priorities, caps) == 0 {
|
||||||
@@ -754,6 +751,10 @@ func NewGPUManager() (*GPUManager, error) {
|
|||||||
return &gm, nil
|
return &gm, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if !hasAnyGpuCollector(caps) {
|
||||||
|
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||||
|
}
|
||||||
|
|
||||||
// auto-detect and start collectors when GPU_COLLECTOR is unset.
|
// auto-detect and start collectors when GPU_COLLECTOR is unset.
|
||||||
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
|
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
|
||||||
return nil, fmt.Errorf(noGPUFoundMsg)
|
return nil, fmt.Errorf(noGPUFoundMsg)
|
||||||
|
|||||||
@@ -1461,6 +1461,25 @@ func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCollectorDefinitionsNvmlDoesNotRequireNvidiaSmi(t *testing.T) {
|
||||||
|
gm := &GPUManager{}
|
||||||
|
definitions := gm.collectorDefinitions(gpuCapabilities{})
|
||||||
|
require.Contains(t, definitions, collectorSourceNVML)
|
||||||
|
assert.True(t, definitions[collectorSourceNVML].available)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewGPUManagerConfiguredNvmlBypassesCapabilityGate(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
t.Setenv("PATH", dir)
|
||||||
|
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml")
|
||||||
|
|
||||||
|
gm, err := NewGPUManager()
|
||||||
|
require.Nil(t, gm)
|
||||||
|
require.Error(t, err)
|
||||||
|
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
|
||||||
|
assert.NotContains(t, err.Error(), noGPUFoundMsg)
|
||||||
|
}
|
||||||
|
|
||||||
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
|
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
t.Setenv("PATH", dir)
|
t.Setenv("PATH", dir)
|
||||||
|
|||||||
Reference in New Issue
Block a user