Compare commits

..

9 Commits

Author SHA1 Message Date
henrygd
5db4eb4346 0.18.6 release 2026-03-29 13:03:48 -04:00
Yi Zhou
f6c5e2928a Add apple-touch-icon link to index.html (#1850) 2026-03-29 12:39:38 -04:00
henrygd
6a207c33fa agent: change disk.Partitions(false) to true - likely fixes empty partition list in docker as of gopsutil 4.26.2 2026-03-29 12:33:45 -04:00
henrygd
9f19afccde hub: reset smart interval on agent reconnect if agent hasn't successfully saved smart devices
this is so people trying to get smart working can see the config changes immediately. not need to wait for the smart interval.
2026-03-29 12:30:39 -04:00
henrygd
f25f2469e3 hub: add debug logs for smart behavior (#1800) 2026-03-28 21:16:26 -04:00
henrygd
5bd43ed461 hub: reset smart interval on agent reconnect if agent hasn't successfully saved smart devices
this is so people trying to get smart working can see the config changes immediately. not need to wait for the smart interval.
2026-03-28 20:47:16 -04:00
henrygd
afdc3f7779 fix(agent): allow GPU_COLLECTOR=nvml without nvidia-smi (#1849) 2026-03-28 18:58:16 -04:00
henrygd
a227c77526 agent: detect podman correctly when using socket proxy (#1846) 2026-03-28 17:43:29 -04:00
henrygd
8202d746af fix(hub): ui bug where charts didn't display 1m max until next update 2026-03-28 12:16:12 -04:00
20 changed files with 638 additions and 148 deletions

View File

@@ -19,6 +19,8 @@ import (
gossh "golang.org/x/crypto/ssh"
)
const defaultDataCacheTimeMs uint16 = 60_000
type Agent struct {
sync.Mutex // Used to lock agent while collecting data
debug bool // true if LOG_LEVEL is set to debug
@@ -36,6 +38,7 @@ type Agent struct {
sensorConfig *SensorConfig // Sensors config
systemInfo system.Info // Host system info (dynamic)
systemDetails system.Details // Host system details (static, once-per-connection)
detailsDirty bool // Whether system details have changed and need to be resent
gpuManager *GPUManager // Manages GPU data
cache *systemDataCache // Cache for system stats based on cache time
connectionManager *ConnectionManager // Channel to signal connection events
@@ -97,7 +100,7 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
slog.Debug(beszel.Version)
// initialize docker manager
agent.dockerManager = newDockerManager()
agent.dockerManager = newDockerManager(agent)
// initialize system info
agent.refreshSystemDetails()
@@ -142,7 +145,7 @@ func NewAgent(dataDir ...string) (agent *Agent, err error) {
// if debugging, print stats
if agent.debug {
slog.Debug("Stats", "data", agent.gatherStats(common.DataRequestOptions{CacheTimeMs: 60_000, IncludeDetails: true}))
slog.Debug("Stats", "data", agent.gatherStats(common.DataRequestOptions{CacheTimeMs: defaultDataCacheTimeMs, IncludeDetails: true}))
}
return agent, nil
@@ -164,11 +167,6 @@ func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedD
Info: a.systemInfo,
}
// Include static system details only when requested
if options.IncludeDetails {
data.Details = &a.systemDetails
}
// slog.Info("System data", "data", data, "cacheTimeMs", cacheTimeMs)
if a.dockerManager != nil {
@@ -181,7 +179,7 @@ func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedD
}
// skip updating systemd services if cache time is not the default 60sec interval
if a.systemdManager != nil && cacheTimeMs == 60_000 {
if a.systemdManager != nil && cacheTimeMs == defaultDataCacheTimeMs {
totalCount := uint16(a.systemdManager.getServiceStatsCount())
if totalCount > 0 {
numFailed := a.systemdManager.getFailedServiceCount()
@@ -212,7 +210,8 @@ func (a *Agent) gatherStats(options common.DataRequestOptions) *system.CombinedD
slog.Debug("Extra FS", "data", data.Stats.ExtraFs)
a.cache.Set(data, cacheTimeMs)
return data
return a.attachSystemDetails(data, cacheTimeMs, options.IncludeDetails)
}
// Start initializes and starts the agent with optional WebSocket connection

View File

@@ -1,6 +1,7 @@
package agent
import (
"context"
"log/slog"
"os"
"path/filepath"
@@ -273,7 +274,7 @@ func (a *Agent) initializeDiskInfo() {
hasRoot := false
isWindows := runtime.GOOS == "windows"
partitions, err := disk.Partitions(false)
partitions, err := disk.PartitionsWithContext(context.Background(), true)
if err != nil {
slog.Error("Error getting disk partitions", "err", err)
}

View File

@@ -25,6 +25,7 @@ import (
"github.com/henrygd/beszel/agent/deltatracker"
"github.com/henrygd/beszel/agent/utils"
"github.com/henrygd/beszel/internal/entities/container"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/blang/semver"
)
@@ -52,20 +53,22 @@ const (
)
type dockerManager struct {
client *http.Client // Client to query Docker API
wg sync.WaitGroup // WaitGroup to wait for all goroutines to finish
sem chan struct{} // Semaphore to limit concurrent container requests
containerStatsMutex sync.RWMutex // Mutex to prevent concurrent access to containerStatsMap
apiContainerList []*container.ApiInfo // List of containers from Docker API
containerStatsMap map[string]*container.Stats // Keeps track of container stats
validIds map[string]struct{} // Map of valid container ids, used to prune invalid containers from containerStatsMap
goodDockerVersion bool // Whether docker version is at least 25.0.0 (one-shot works correctly)
isWindows bool // Whether the Docker Engine API is running on Windows
buf *bytes.Buffer // Buffer to store and read response bodies
decoder *json.Decoder // Reusable JSON decoder that reads from buf
apiStats *container.ApiStats // Reusable API stats object
excludeContainers []string // Patterns to exclude containers by name
usingPodman bool // Whether the Docker Engine API is running on Podman
agent *Agent // Used to propagate system detail changes back to the agent
client *http.Client // Client to query Docker API
wg sync.WaitGroup // WaitGroup to wait for all goroutines to finish
sem chan struct{} // Semaphore to limit concurrent container requests
containerStatsMutex sync.RWMutex // Mutex to prevent concurrent access to containerStatsMap
apiContainerList []*container.ApiInfo // List of containers from Docker API
containerStatsMap map[string]*container.Stats // Keeps track of container stats
validIds map[string]struct{} // Map of valid container ids, used to prune invalid containers from containerStatsMap
goodDockerVersion bool // Whether docker version is at least 25.0.0 (one-shot works correctly)
dockerVersionChecked bool // Whether a version probe has completed successfully
isWindows bool // Whether the Docker Engine API is running on Windows
buf *bytes.Buffer // Buffer to store and read response bodies
decoder *json.Decoder // Reusable JSON decoder that reads from buf
apiStats *container.ApiStats // Reusable API stats object
excludeContainers []string // Patterns to exclude containers by name
usingPodman bool // Whether the Docker Engine API is running on Podman
// Cache-time-aware tracking for CPU stats (similar to cpu.go)
// Maps cache time intervals to container-specific CPU usage tracking
@@ -78,7 +81,6 @@ type dockerManager struct {
networkSentTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
networkRecvTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
lastNetworkReadTime map[uint16]map[string]time.Time // cacheTimeMs -> containerId -> last network read time
retrySleep func(time.Duration)
}
// userAgentRoundTripper is a custom http.RoundTripper that adds a User-Agent header to all requests
@@ -87,6 +89,14 @@ type userAgentRoundTripper struct {
userAgent string
}
// dockerVersionResponse contains the /version fields used for engine checks.
type dockerVersionResponse struct {
Version string `json:"Version"`
Components []struct {
Name string `json:"Name"`
} `json:"Components"`
}
// RoundTrip implements the http.RoundTripper interface
func (u *userAgentRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
req.Header.Set("User-Agent", u.userAgent)
@@ -134,7 +144,14 @@ func (dm *dockerManager) getDockerStats(cacheTimeMs uint16) ([]*container.Stats,
return nil, err
}
dm.isWindows = strings.Contains(resp.Header.Get("Server"), "windows")
// Detect Podman and Windows from Server header
serverHeader := resp.Header.Get("Server")
if !dm.usingPodman && detectPodmanFromHeader(serverHeader) {
dm.setIsPodman()
}
dm.isWindows = strings.Contains(serverHeader, "windows")
dm.ensureDockerVersionChecked()
containersLength := len(dm.apiContainerList)
@@ -588,7 +605,7 @@ func (dm *dockerManager) deleteContainerStatsSync(id string) {
}
// Creates a new http client for Docker or Podman API
func newDockerManager() *dockerManager {
func newDockerManager(agent *Agent) *dockerManager {
dockerHost, exists := utils.GetEnv("DOCKER_HOST")
if exists {
// return nil if set to empty string
@@ -654,6 +671,7 @@ func newDockerManager() *dockerManager {
}
manager := &dockerManager{
agent: agent,
client: &http.Client{
Timeout: timeout,
Transport: userAgentTransport,
@@ -671,51 +689,54 @@ func newDockerManager() *dockerManager {
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
lastNetworkReadTime: make(map[uint16]map[string]time.Time),
retrySleep: time.Sleep,
}
// If using podman, return client
if strings.Contains(dockerHost, "podman") {
manager.usingPodman = true
manager.goodDockerVersion = true
return manager
}
// run version check in goroutine to avoid blocking (server may not be ready and requires retries)
go manager.checkDockerVersion()
// give version check a chance to complete before returning
time.Sleep(50 * time.Millisecond)
// Best-effort startup probe. If the engine is not ready yet, getDockerStats will
// retry after the first successful /containers/json request.
_, _ = manager.checkDockerVersion()
return manager
}
// checkDockerVersion checks Docker version and sets goodDockerVersion if at least 25.0.0.
// Versions before 25.0.0 have a bug with one-shot which requires all requests to be made in one batch.
func (dm *dockerManager) checkDockerVersion() {
var err error
var resp *http.Response
var versionInfo struct {
Version string `json:"Version"`
func (dm *dockerManager) checkDockerVersion() (bool, error) {
resp, err := dm.client.Get("http://localhost/version")
if err != nil {
return false, err
}
const versionMaxTries = 2
for i := 1; i <= versionMaxTries; i++ {
resp, err = dm.client.Get("http://localhost/version")
if err == nil && resp.StatusCode == http.StatusOK {
break
}
if resp != nil {
resp.Body.Close()
}
if i < versionMaxTries {
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "err", err, "response", resp)
dm.retrySleep(5 * time.Second)
}
if resp.StatusCode != http.StatusOK {
status := resp.Status
resp.Body.Close()
return false, fmt.Errorf("docker version request failed: %s", status)
}
if err != nil || resp.StatusCode != http.StatusOK {
var versionInfo dockerVersionResponse
serverHeader := resp.Header.Get("Server")
if err := dm.decode(resp, &versionInfo); err != nil {
return false, err
}
dm.applyDockerVersionInfo(serverHeader, &versionInfo)
dm.dockerVersionChecked = true
return true, nil
}
// ensureDockerVersionChecked retries the version probe after a successful
// container list request.
func (dm *dockerManager) ensureDockerVersionChecked() {
if dm.dockerVersionChecked {
return
}
if err := dm.decode(resp, &versionInfo); err != nil {
if _, err := dm.checkDockerVersion(); err != nil {
slog.Debug("Failed to get Docker version", "err", err)
}
}
// applyDockerVersionInfo updates version-dependent behavior from engine metadata.
func (dm *dockerManager) applyDockerVersionInfo(serverHeader string, versionInfo *dockerVersionResponse) {
if detectPodmanEngine(serverHeader, versionInfo) {
dm.setIsPodman()
return
}
// if version > 24, one-shot works correctly and we can limit concurrent operations
@@ -941,3 +962,46 @@ func (dm *dockerManager) GetHostInfo() (info container.HostInfo, err error) {
func (dm *dockerManager) IsPodman() bool {
return dm.usingPodman
}
// setIsPodman sets the manager to Podman mode and updates system details accordingly.
func (dm *dockerManager) setIsPodman() {
if dm.usingPodman {
return
}
dm.usingPodman = true
dm.goodDockerVersion = true
dm.dockerVersionChecked = true
// keep system details updated - this may be detected late if server isn't ready when
// agent starts, so make sure we notify the hub if this happens later.
if dm.agent != nil {
dm.agent.updateSystemDetails(func(details *system.Details) {
details.Podman = true
})
}
}
// detectPodmanFromHeader identifies Podman from the Docker API server header.
func detectPodmanFromHeader(server string) bool {
return strings.HasPrefix(server, "Libpod")
}
// detectPodmanFromVersion identifies Podman from the version payload.
func detectPodmanFromVersion(versionInfo *dockerVersionResponse) bool {
if versionInfo == nil {
return false
}
for _, component := range versionInfo.Components {
if strings.HasPrefix(component.Name, "Podman") {
return true
}
}
return false
}
// detectPodmanEngine checks both header and version metadata for Podman.
func detectPodmanEngine(serverHeader string, versionInfo *dockerVersionResponse) bool {
if detectPodmanFromHeader(serverHeader) {
return true
}
return detectPodmanFromVersion(versionInfo)
}

View File

@@ -539,59 +539,53 @@ func TestDockerManagerCreation(t *testing.T) {
func TestCheckDockerVersion(t *testing.T) {
tests := []struct {
name string
responses []struct {
statusCode int
body string
}
expectedGood bool
expectedRequests int
name string
statusCode int
body string
server string
expectSuccess bool
expectedGood bool
expectedPodman bool
expectError bool
expectedRequest string
}{
{
name: "200 with good version on first try",
responses: []struct {
statusCode int
body string
}{
{http.StatusOK, `{"Version":"25.0.1"}`},
},
expectedGood: true,
expectedRequests: 1,
name: "good docker version",
statusCode: http.StatusOK,
body: `{"Version":"25.0.1"}`,
expectSuccess: true,
expectedGood: true,
expectedPodman: false,
expectedRequest: "/version",
},
{
name: "200 with old version on first try",
responses: []struct {
statusCode int
body string
}{
{http.StatusOK, `{"Version":"24.0.7"}`},
},
expectedGood: false,
expectedRequests: 1,
name: "old docker version",
statusCode: http.StatusOK,
body: `{"Version":"24.0.7"}`,
expectSuccess: true,
expectedGood: false,
expectedPodman: false,
expectedRequest: "/version",
},
{
name: "non-200 then 200 with good version",
responses: []struct {
statusCode int
body string
}{
{http.StatusServiceUnavailable, `"not ready"`},
{http.StatusOK, `{"Version":"25.1.0"}`},
},
expectedGood: true,
expectedRequests: 2,
name: "podman from server header",
statusCode: http.StatusOK,
body: `{"Version":"5.5.0"}`,
server: "Libpod/5.5.0",
expectSuccess: true,
expectedGood: true,
expectedPodman: true,
expectedRequest: "/version",
},
{
name: "non-200 on all retries",
responses: []struct {
statusCode int
body string
}{
{http.StatusInternalServerError, `"error"`},
{http.StatusUnauthorized, `"error"`},
},
expectedGood: false,
expectedRequests: 2,
name: "non-200 response",
statusCode: http.StatusServiceUnavailable,
body: `"not ready"`,
expectSuccess: false,
expectedGood: false,
expectedPodman: false,
expectError: true,
expectedRequest: "/version",
},
}
@@ -599,13 +593,13 @@ func TestCheckDockerVersion(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
requestCount := 0
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
idx := requestCount
requestCount++
if idx >= len(tt.responses) {
idx = len(tt.responses) - 1
assert.Equal(t, tt.expectedRequest, r.URL.EscapedPath())
if tt.server != "" {
w.Header().Set("Server", tt.server)
}
w.WriteHeader(tt.responses[idx].statusCode)
fmt.Fprint(w, tt.responses[idx].body)
w.WriteHeader(tt.statusCode)
fmt.Fprint(w, tt.body)
}))
defer server.Close()
@@ -617,17 +611,24 @@ func TestCheckDockerVersion(t *testing.T) {
},
},
},
retrySleep: func(time.Duration) {},
}
dm.checkDockerVersion()
success, err := dm.checkDockerVersion()
assert.Equal(t, tt.expectSuccess, success)
assert.Equal(t, tt.expectSuccess, dm.dockerVersionChecked)
assert.Equal(t, tt.expectedGood, dm.goodDockerVersion)
assert.Equal(t, tt.expectedRequests, requestCount)
assert.Equal(t, tt.expectedPodman, dm.usingPodman)
assert.Equal(t, 1, requestCount)
if tt.expectError {
require.Error(t, err)
} else {
require.NoError(t, err)
}
})
}
t.Run("request error on all retries", func(t *testing.T) {
t.Run("request error", func(t *testing.T) {
requestCount := 0
dm := &dockerManager{
client: &http.Client{
@@ -638,16 +639,171 @@ func TestCheckDockerVersion(t *testing.T) {
},
},
},
retrySleep: func(time.Duration) {},
}
dm.checkDockerVersion()
success, err := dm.checkDockerVersion()
assert.False(t, success)
require.Error(t, err)
assert.False(t, dm.dockerVersionChecked)
assert.False(t, dm.goodDockerVersion)
assert.Equal(t, 2, requestCount)
assert.False(t, dm.usingPodman)
assert.Equal(t, 1, requestCount)
})
}
// newDockerManagerForVersionTest creates a dockerManager wired to a test server.
func newDockerManagerForVersionTest(server *httptest.Server) *dockerManager {
return &dockerManager{
client: &http.Client{
Transport: &http.Transport{
DialContext: func(_ context.Context, network, _ string) (net.Conn, error) {
return net.Dial(network, server.Listener.Addr().String())
},
},
},
containerStatsMap: make(map[string]*container.Stats),
lastCpuContainer: make(map[uint16]map[string]uint64),
lastCpuSystem: make(map[uint16]map[string]uint64),
lastCpuReadTime: make(map[uint16]map[string]time.Time),
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
lastNetworkReadTime: make(map[uint16]map[string]time.Time),
}
}
func TestGetDockerStatsChecksDockerVersionAfterContainerList(t *testing.T) {
tests := []struct {
name string
containerServer string
versionServer string
versionBody string
expectedGood bool
expectedPodman bool
}{
{
name: "200 with good version on first try",
versionBody: `{"Version":"25.0.1"}`,
expectedGood: true,
expectedPodman: false,
},
{
name: "200 with old version on first try",
versionBody: `{"Version":"24.0.7"}`,
expectedGood: false,
expectedPodman: false,
},
{
name: "podman detected from server header",
containerServer: "Libpod/5.5.0",
expectedGood: true,
expectedPodman: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
requestCounts := map[string]int{}
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestCounts[r.URL.EscapedPath()]++
switch r.URL.EscapedPath() {
case "/containers/json":
if tt.containerServer != "" {
w.Header().Set("Server", tt.containerServer)
}
w.WriteHeader(http.StatusOK)
fmt.Fprint(w, `[]`)
case "/version":
if tt.versionServer != "" {
w.Header().Set("Server", tt.versionServer)
}
w.WriteHeader(http.StatusOK)
fmt.Fprint(w, tt.versionBody)
default:
t.Fatalf("unexpected path: %s", r.URL.EscapedPath())
}
}))
defer server.Close()
dm := newDockerManagerForVersionTest(server)
stats, err := dm.getDockerStats(defaultCacheTimeMs)
require.NoError(t, err)
assert.Empty(t, stats)
assert.True(t, dm.dockerVersionChecked)
assert.Equal(t, tt.expectedGood, dm.goodDockerVersion)
assert.Equal(t, tt.expectedPodman, dm.usingPodman)
assert.Equal(t, 1, requestCounts["/containers/json"])
if tt.expectedPodman {
assert.Equal(t, 0, requestCounts["/version"])
} else {
assert.Equal(t, 1, requestCounts["/version"])
}
stats, err = dm.getDockerStats(defaultCacheTimeMs)
require.NoError(t, err)
assert.Empty(t, stats)
assert.Equal(t, tt.expectedGood, dm.goodDockerVersion)
assert.Equal(t, tt.expectedPodman, dm.usingPodman)
assert.Equal(t, 2, requestCounts["/containers/json"])
if tt.expectedPodman {
assert.Equal(t, 0, requestCounts["/version"])
} else {
assert.Equal(t, 1, requestCounts["/version"])
}
})
}
}
func TestGetDockerStatsRetriesVersionCheckUntilSuccess(t *testing.T) {
requestCounts := map[string]int{}
versionStatuses := []int{http.StatusServiceUnavailable, http.StatusOK}
versionBodies := []string{`"not ready"`, `{"Version":"25.1.0"}`}
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestCounts[r.URL.EscapedPath()]++
switch r.URL.EscapedPath() {
case "/containers/json":
w.WriteHeader(http.StatusOK)
fmt.Fprint(w, `[]`)
case "/version":
idx := requestCounts["/version"] - 1
if idx >= len(versionStatuses) {
idx = len(versionStatuses) - 1
}
w.WriteHeader(versionStatuses[idx])
fmt.Fprint(w, versionBodies[idx])
default:
t.Fatalf("unexpected path: %s", r.URL.EscapedPath())
}
}))
defer server.Close()
dm := newDockerManagerForVersionTest(server)
stats, err := dm.getDockerStats(defaultCacheTimeMs)
require.NoError(t, err)
assert.Empty(t, stats)
assert.False(t, dm.dockerVersionChecked)
assert.False(t, dm.goodDockerVersion)
assert.Equal(t, 1, requestCounts["/version"])
stats, err = dm.getDockerStats(defaultCacheTimeMs)
require.NoError(t, err)
assert.Empty(t, stats)
assert.True(t, dm.dockerVersionChecked)
assert.True(t, dm.goodDockerVersion)
assert.Equal(t, 2, requestCounts["/containers/json"])
assert.Equal(t, 2, requestCounts["/version"])
stats, err = dm.getDockerStats(defaultCacheTimeMs)
require.NoError(t, err)
assert.Empty(t, stats)
assert.Equal(t, 3, requestCounts["/containers/json"])
assert.Equal(t, 2, requestCounts["/version"])
}
func TestCycleCpuDeltas(t *testing.T) {
dm := &dockerManager{
lastCpuContainer: map[uint16]map[string]uint64{

View File

@@ -542,7 +542,7 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo
return map[collectorSource]collectorDefinition{
collectorSourceNVML: {
group: collectorGroupNvidia,
available: caps.hasNvidiaSmi,
available: true,
start: func(_ func()) bool {
return gm.startNvmlCollector()
},
@@ -734,9 +734,6 @@ func NewGPUManager() (*GPUManager, error) {
}
var gm GPUManager
caps := gm.discoverGpuCapabilities()
if !hasAnyGpuCollector(caps) {
return nil, fmt.Errorf(noGPUFoundMsg)
}
gm.GpuDataMap = make(map[string]*system.GPUData)
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
@@ -745,7 +742,7 @@ func NewGPUManager() (*GPUManager, error) {
return &gm, nil
}
// if GPU_COLLECTOR is set, start user-defined collectors.
// Respect explicit collector selection before capability auto-detection.
if collectorConfig, ok := utils.GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
priorities := parseCollectorPriority(collectorConfig)
if gm.startCollectorsByPriority(priorities, caps) == 0 {
@@ -754,6 +751,10 @@ func NewGPUManager() (*GPUManager, error) {
return &gm, nil
}
if !hasAnyGpuCollector(caps) {
return nil, fmt.Errorf(noGPUFoundMsg)
}
// auto-detect and start collectors when GPU_COLLECTOR is unset.
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
return nil, fmt.Errorf(noGPUFoundMsg)

View File

@@ -1461,6 +1461,25 @@ func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
})
}
func TestCollectorDefinitionsNvmlDoesNotRequireNvidiaSmi(t *testing.T) {
gm := &GPUManager{}
definitions := gm.collectorDefinitions(gpuCapabilities{})
require.Contains(t, definitions, collectorSourceNVML)
assert.True(t, definitions[collectorSourceNVML].available)
}
func TestNewGPUManagerConfiguredNvmlBypassesCapabilityGate(t *testing.T) {
dir := t.TempDir()
t.Setenv("PATH", dir)
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml")
gm, err := NewGPUManager()
require.Nil(t, gm)
require.Error(t, err)
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
assert.NotContains(t, err.Error(), noGPUFoundMsg)
}
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
dir := t.TempDir()
t.Setenv("PATH", dir)

View File

@@ -193,7 +193,7 @@ func (a *Agent) handleSSHRequest(w io.Writer, req *common.HubRequest[cbor.RawMes
// handleLegacyStats serves the legacy one-shot stats payload for older hubs
func (a *Agent) handleLegacyStats(w io.Writer, hubVersion semver.Version) error {
stats := a.gatherStats(common.DataRequestOptions{CacheTimeMs: 60_000})
stats := a.gatherStats(common.DataRequestOptions{CacheTimeMs: defaultDataCacheTimeMs})
return a.writeToSession(w, stats, hubVersion)
}

View File

@@ -115,6 +115,26 @@ func (a *Agent) refreshSystemDetails() {
}
}
// attachSystemDetails returns details only for fresh default-interval responses.
func (a *Agent) attachSystemDetails(data *system.CombinedData, cacheTimeMs uint16, includeRequested bool) *system.CombinedData {
if cacheTimeMs != defaultDataCacheTimeMs || (!includeRequested && !a.detailsDirty) {
return data
}
// copy data to avoid adding details to the original cached struct
response := *data
response.Details = &a.systemDetails
a.detailsDirty = false
return &response
}
// updateSystemDetails applies a mutation to the static details payload and marks
// it for inclusion on the next fresh default-interval response.
func (a *Agent) updateSystemDetails(updateFunc func(details *system.Details)) {
updateFunc(&a.systemDetails)
a.detailsDirty = true
}
// Returns current info, stats about the host system
func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
var systemStats system.Stats

61
agent/system_test.go Normal file
View File

@@ -0,0 +1,61 @@
package agent
import (
"testing"
"github.com/henrygd/beszel/internal/common"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestGatherStatsDoesNotAttachDetailsToCachedRequests(t *testing.T) {
agent := &Agent{
cache: NewSystemDataCache(),
systemDetails: system.Details{Hostname: "updated-host", Podman: true},
detailsDirty: true,
}
cached := &system.CombinedData{
Info: system.Info{Hostname: "cached-host"},
}
agent.cache.Set(cached, defaultDataCacheTimeMs)
response := agent.gatherStats(common.DataRequestOptions{CacheTimeMs: defaultDataCacheTimeMs})
assert.Same(t, cached, response)
assert.Nil(t, response.Details)
assert.True(t, agent.detailsDirty)
assert.Equal(t, "cached-host", response.Info.Hostname)
assert.Nil(t, cached.Details)
secondResponse := agent.gatherStats(common.DataRequestOptions{CacheTimeMs: defaultDataCacheTimeMs})
assert.Same(t, cached, secondResponse)
assert.Nil(t, secondResponse.Details)
}
func TestUpdateSystemDetailsMarksDetailsDirty(t *testing.T) {
agent := &Agent{}
agent.updateSystemDetails(func(details *system.Details) {
details.Hostname = "updated-host"
details.Podman = true
})
assert.True(t, agent.detailsDirty)
assert.Equal(t, "updated-host", agent.systemDetails.Hostname)
assert.True(t, agent.systemDetails.Podman)
original := &system.CombinedData{}
realTimeResponse := agent.attachSystemDetails(original, 1000, true)
assert.Same(t, original, realTimeResponse)
assert.Nil(t, realTimeResponse.Details)
assert.True(t, agent.detailsDirty)
response := agent.attachSystemDetails(original, defaultDataCacheTimeMs, false)
require.NotNil(t, response.Details)
assert.NotSame(t, original, response)
assert.Equal(t, "updated-host", response.Details.Hostname)
assert.True(t, response.Details.Podman)
assert.False(t, agent.detailsDirty)
assert.Nil(t, original.Details)
}

View File

@@ -6,7 +6,7 @@ import "github.com/blang/semver"
const (
// Version is the current version of the application.
Version = "0.18.5"
Version = "0.18.6"
// AppName is the name of the application.
AppName = "beszel"
)

View File

@@ -145,6 +145,7 @@ func (sys *System) update() error {
// update smart interval if it's set on the agent side
if data.Details.SmartInterval > 0 {
sys.smartInterval = data.Details.SmartInterval
sys.manager.hub.Logger().Info("SMART interval updated from agent details", "system", sys.Id, "interval", sys.smartInterval.String())
// make sure we reset expiration of lastFetch to remain as long as the new smart interval
// to prevent premature expiration leading to new fetch if interval is different.
sys.manager.smartFetchMap.UpdateExpiration(sys.Id, sys.smartInterval+time.Minute)
@@ -156,11 +157,10 @@ func (sys *System) update() error {
if sys.smartInterval <= 0 {
sys.smartInterval = time.Hour
}
lastFetch, _ := sys.manager.smartFetchMap.GetOk(sys.Id)
if time.Since(time.UnixMilli(lastFetch-1e4)) >= sys.smartInterval && sys.smartFetching.CompareAndSwap(false, true) {
if sys.shouldFetchSmart() && sys.smartFetching.CompareAndSwap(false, true) {
sys.manager.hub.Logger().Info("SMART fetch", "system", sys.Id, "interval", sys.smartInterval.String())
go func() {
defer sys.smartFetching.Store(false)
sys.manager.smartFetchMap.Set(sys.Id, time.Now().UnixMilli(), sys.smartInterval+time.Minute)
_ = sys.FetchAndSaveSmartDevices()
}()
}
@@ -643,6 +643,7 @@ func (s *System) createSSHClient() error {
return err
}
s.agentVersion, _ = extractAgentVersion(string(s.client.Conn.ServerVersion()))
s.manager.resetFailedSmartFetchState(s.Id)
return nil
}

View File

@@ -41,10 +41,10 @@ var errSystemExists = errors.New("system exists")
// SystemManager manages a collection of monitored systems and their connections.
// It handles system lifecycle, status updates, and maintains both SSH and WebSocket connections.
type SystemManager struct {
hub hubLike // Hub interface for database and alert operations
systems *store.Store[string, *System] // Thread-safe store of active systems
sshConfig *ssh.ClientConfig // SSH client configuration for system connections
smartFetchMap *expirymap.ExpiryMap[int64] // Stores last SMART fetch time per system ID
hub hubLike // Hub interface for database and alert operations
systems *store.Store[string, *System] // Thread-safe store of active systems
sshConfig *ssh.ClientConfig // SSH client configuration for system connections
smartFetchMap *expirymap.ExpiryMap[smartFetchState] // Stores last SMART fetch time/result; TTL is only for cleanup
}
// hubLike defines the interface requirements for the hub dependency.
@@ -62,7 +62,7 @@ func NewSystemManager(hub hubLike) *SystemManager {
return &SystemManager{
systems: store.New(map[string]*System{}),
hub: hub,
smartFetchMap: expirymap.New[int64](time.Hour),
smartFetchMap: expirymap.New[smartFetchState](time.Hour),
}
}
@@ -306,6 +306,7 @@ func (sm *SystemManager) AddWebSocketSystem(systemId string, agentVersion semver
if err != nil {
return err
}
sm.resetFailedSmartFetchState(systemId)
system := sm.NewSystem(systemId)
system.WsConn = wsConn
@@ -317,6 +318,15 @@ func (sm *SystemManager) AddWebSocketSystem(systemId string, agentVersion semver
return nil
}
// resetFailedSmartFetchState clears only failed SMART cooldown entries so a fresh
// agent reconnect retries SMART discovery immediately after configuration changes.
func (sm *SystemManager) resetFailedSmartFetchState(systemID string) {
state, ok := sm.smartFetchMap.GetOk(systemID)
if ok && !state.Successful {
sm.smartFetchMap.Remove(systemID)
}
}
// createSSHClientConfig initializes the SSH client configuration for connecting to an agent's server
func (sm *SystemManager) createSSHClientConfig() error {
privateKey, err := sm.hub.GetSSHKey("")

View File

@@ -4,18 +4,61 @@ import (
"database/sql"
"errors"
"strings"
"time"
"github.com/henrygd/beszel/internal/entities/smart"
"github.com/pocketbase/pocketbase/core"
)
type smartFetchState struct {
LastAttempt int64
Successful bool
}
// FetchAndSaveSmartDevices fetches SMART data from the agent and saves it to the database
func (sys *System) FetchAndSaveSmartDevices() error {
smartData, err := sys.FetchSmartDataFromAgent()
if err != nil || len(smartData) == 0 {
if err != nil {
sys.recordSmartFetchResult(err, 0)
return err
}
return sys.saveSmartDevices(smartData)
err = sys.saveSmartDevices(smartData)
sys.recordSmartFetchResult(err, len(smartData))
return err
}
// recordSmartFetchResult stores a cooldown entry for the SMART interval and marks
// whether the last fetch produced any devices, so failed setup can retry on reconnect.
func (sys *System) recordSmartFetchResult(err error, deviceCount int) {
if sys.manager == nil {
return
}
interval := sys.smartFetchInterval()
success := err == nil && deviceCount > 0
if sys.manager.hub != nil {
sys.manager.hub.Logger().Info("SMART fetch result", "system", sys.Id, "success", success, "devices", deviceCount, "interval", interval.String(), "err", err)
}
sys.manager.smartFetchMap.Set(sys.Id, smartFetchState{LastAttempt: time.Now().UnixMilli(), Successful: success}, interval+time.Minute)
}
// shouldFetchSmart returns true when there is no active SMART cooldown entry for this system.
func (sys *System) shouldFetchSmart() bool {
if sys.manager == nil {
return true
}
state, ok := sys.manager.smartFetchMap.GetOk(sys.Id)
if !ok {
return true
}
return !time.UnixMilli(state.LastAttempt).Add(sys.smartFetchInterval()).After(time.Now())
}
// smartFetchInterval returns the agent-provided SMART interval or the default when unset.
func (sys *System) smartFetchInterval() time.Duration {
if sys.smartInterval > 0 {
return sys.smartInterval
}
return time.Hour
}
// saveSmartDevices saves SMART device data to the smart_devices collection

View File

@@ -0,0 +1,94 @@
//go:build testing
package systems
import (
"errors"
"testing"
"time"
"github.com/henrygd/beszel/internal/hub/expirymap"
"github.com/stretchr/testify/assert"
)
func TestRecordSmartFetchResult(t *testing.T) {
sm := &SystemManager{smartFetchMap: expirymap.New[smartFetchState](time.Hour)}
t.Cleanup(sm.smartFetchMap.StopCleaner)
sys := &System{
Id: "system-1",
manager: sm,
smartInterval: time.Hour,
}
// Successful fetch with devices
sys.recordSmartFetchResult(nil, 5)
state, ok := sm.smartFetchMap.GetOk(sys.Id)
assert.True(t, ok, "expected smart fetch result to be stored")
assert.True(t, state.Successful, "expected successful fetch state to be recorded")
// Failed fetch
sys.recordSmartFetchResult(errors.New("failed"), 0)
state, ok = sm.smartFetchMap.GetOk(sys.Id)
assert.True(t, ok, "expected failed smart fetch state to be stored")
assert.False(t, state.Successful, "expected failed smart fetch state to be marked unsuccessful")
// Successful fetch but no devices
sys.recordSmartFetchResult(nil, 0)
state, ok = sm.smartFetchMap.GetOk(sys.Id)
assert.True(t, ok, "expected fetch with zero devices to be stored")
assert.False(t, state.Successful, "expected fetch with zero devices to be marked unsuccessful")
}
func TestShouldFetchSmart(t *testing.T) {
sm := &SystemManager{smartFetchMap: expirymap.New[smartFetchState](time.Hour)}
t.Cleanup(sm.smartFetchMap.StopCleaner)
sys := &System{
Id: "system-1",
manager: sm,
smartInterval: time.Hour,
}
assert.True(t, sys.shouldFetchSmart(), "expected initial smart fetch to be allowed")
sys.recordSmartFetchResult(errors.New("failed"), 0)
assert.False(t, sys.shouldFetchSmart(), "expected smart fetch to be blocked while interval entry exists")
sm.smartFetchMap.Remove(sys.Id)
assert.True(t, sys.shouldFetchSmart(), "expected smart fetch to be allowed after interval entry is cleared")
}
func TestShouldFetchSmart_IgnoresExtendedTTLWhenFetchIsDue(t *testing.T) {
sm := &SystemManager{smartFetchMap: expirymap.New[smartFetchState](time.Hour)}
t.Cleanup(sm.smartFetchMap.StopCleaner)
sys := &System{
Id: "system-1",
manager: sm,
smartInterval: time.Hour,
}
sm.smartFetchMap.Set(sys.Id, smartFetchState{
LastAttempt: time.Now().Add(-2 * time.Hour).UnixMilli(),
Successful: true,
}, 10*time.Minute)
sm.smartFetchMap.UpdateExpiration(sys.Id, 3*time.Hour)
assert.True(t, sys.shouldFetchSmart(), "expected fetch time to take precedence over updated TTL")
}
func TestResetFailedSmartFetchState(t *testing.T) {
sm := &SystemManager{smartFetchMap: expirymap.New[smartFetchState](time.Hour)}
t.Cleanup(sm.smartFetchMap.StopCleaner)
sm.smartFetchMap.Set("system-1", smartFetchState{LastAttempt: time.Now().UnixMilli(), Successful: false}, time.Hour)
sm.resetFailedSmartFetchState("system-1")
_, ok := sm.smartFetchMap.GetOk("system-1")
assert.False(t, ok, "expected failed smart fetch state to be cleared on reconnect")
sm.smartFetchMap.Set("system-1", smartFetchState{LastAttempt: time.Now().UnixMilli(), Successful: true}, time.Hour)
sm.resetFailedSmartFetchState("system-1")
_, ok = sm.smartFetchMap.GetOk("system-1")
assert.True(t, ok, "expected successful smart fetch state to be preserved")
}

View File

@@ -4,6 +4,7 @@
<meta charset="UTF-8" />
<link rel="manifest" href="./static/manifest.json" crossorigin="use-credentials" />
<link rel="icon" type="image/svg+xml" href="./static/icon.svg" />
<link rel="apple-touch-icon" href="./static/icon.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0,maximum-scale=1.0, user-scalable=no, viewport-fit=cover" />
<meta name="robots" content="noindex, nofollow" />
<title>Beszel</title>

View File

@@ -1,7 +1,7 @@
{
"name": "beszel",
"private": true,
"version": "0.18.5",
"version": "0.18.6",
"type": "module",
"scripts": {
"dev": "vite --host",

View File

@@ -67,8 +67,8 @@ export default function AreaChartDefault({
const { yAxisWidth, updateYAxisWidth } = useYAxisWidth()
const { isIntersecting, ref } = useIntersectionObserver({ freeze: false })
const sourceData = customData ?? chartData.systemStats
// Only update the rendered data while the chart is visible
const [displayData, setDisplayData] = useState(sourceData)
const [displayMaxToggled, setDisplayMaxToggled] = useState(maxToggled)
// Reduce chart redraws by only updating while visible or when chart time changes
useEffect(() => {
@@ -78,7 +78,10 @@ export default function AreaChartDefault({
if (shouldUpdate) {
setDisplayData(sourceData)
}
}, [displayData, isIntersecting, sourceData])
if (isIntersecting && maxToggled !== displayMaxToggled) {
setDisplayMaxToggled(maxToggled)
}
}, [displayData, displayMaxToggled, isIntersecting, maxToggled, sourceData])
// Use a stable key derived from data point identities and visual properties
const areasKey = dataPoints?.map((d) => `${d.label}:${d.opacity}`).join("\0")
@@ -106,14 +109,14 @@ export default function AreaChartDefault({
/>
)
})
}, [areasKey, maxToggled])
}, [areasKey, displayMaxToggled])
return useMemo(() => {
if (displayData.length === 0) {
return null
}
// if (logRender) {
// console.log("Rendered at", new Date(), "for", dataPoints?.at(0)?.label)
// console.log("Rendered", dataPoints?.map((d) => d.label).join(", "), new Date())
// }
return (
<ChartContainer
@@ -163,5 +166,5 @@ export default function AreaChartDefault({
</AreaChart>
</ChartContainer>
)
}, [displayData, yAxisWidth, showTotal, filter])
}, [displayData, yAxisWidth, filter, Areas])
}

View File

@@ -66,8 +66,8 @@ export default function LineChartDefault({
const { yAxisWidth, updateYAxisWidth } = useYAxisWidth()
const { isIntersecting, ref } = useIntersectionObserver({ freeze: false })
const sourceData = customData ?? chartData.systemStats
// Only update the rendered data while the chart is visible
const [displayData, setDisplayData] = useState(sourceData)
const [displayMaxToggled, setDisplayMaxToggled] = useState(maxToggled)
// Reduce chart redraws by only updating while visible or when chart time changes
useEffect(() => {
@@ -77,7 +77,10 @@ export default function LineChartDefault({
if (shouldUpdate) {
setDisplayData(sourceData)
}
}, [displayData, isIntersecting, sourceData])
if (isIntersecting && maxToggled !== displayMaxToggled) {
setDisplayMaxToggled(maxToggled)
}
}, [displayData, displayMaxToggled, isIntersecting, maxToggled, sourceData])
// Use a stable key derived from data point identities and visual properties
const linesKey = dataPoints?.map((d) => `${d.label}:${d.strokeOpacity ?? ""}`).join("\0")
@@ -105,14 +108,14 @@ export default function LineChartDefault({
/>
)
})
}, [linesKey, maxToggled])
}, [linesKey, displayMaxToggled])
return useMemo(() => {
if (displayData.length === 0) {
return null
}
// if (logRender) {
// console.log("Rendered at", new Date(), "for", dataPoints?.at(0)?.label)
// console.log("Rendered", dataPoints?.map((d) => d.label).join(", "), new Date())
// }
return (
<ChartContainer
@@ -162,5 +165,5 @@ export default function LineChartDefault({
</LineChart>
</ChartContainer>
)
}, [displayData, yAxisWidth, showTotal, filter, chartData.chartTime])
}, [displayData, yAxisWidth, filter, Lines])
}

View File

@@ -18,7 +18,7 @@ import { listenKeys } from "nanostores"
import { memo, type ReactNode, useEffect, useMemo, useRef, useState } from "react"
import { getStatusColor, systemdTableCols } from "@/components/systemd-table/systemd-table-columns"
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"
import { Card, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
import { Card, CardHeader, CardTitle } from "@/components/ui/card"
import { Input } from "@/components/ui/input"
import { Sheet, SheetContent, SheetHeader, SheetTitle } from "@/components/ui/sheet"
import { TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
@@ -161,13 +161,13 @@ export default function SystemdTable({ systemId }: { systemId?: string }) {
<CardTitle className="mb-2">
<Trans>Systemd Services</Trans>
</CardTitle>
<CardDescription className="flex items-center">
<div className="text-sm text-muted-foreground flex items-center flex-wrap">
<Trans>Total: {data.length}</Trans>
<Separator orientation="vertical" className="h-4 mx-2 bg-primary/40" />
<Trans>Failed: {statusTotals[ServiceStatus.Failed]}</Trans>
<Separator orientation="vertical" className="h-4 mx-2 bg-primary/40" />
<Trans>Updated every 10 minutes.</Trans>
</CardDescription>
</div>
</div>
<Input
placeholder={t`Filter...`}

View File

@@ -1,3 +1,17 @@
## 0.18.6
- Add apple-touch-icon link to index.html (#1850)
- Fix UI bug where charts did not display 1m max until next update
- Fix regression in partition discovery on Docker (#1847)
- Fix agent detection of Podman when using socket proxy (#1846)
- Fix NVML GPU collection being disabled when `nvidia-smi` is not in PATH (#1849)
- Reset SMART interval on agent reconnect if the agent hasn't collected SMART data, allowing config changes to take effect immediately
## 0.18.5
- Add "update available" notification in hub web UI with `CHECK_UPDATES=true` (#1830)