mirror of
https://github.com/henrygd/beszel.git
synced 2026-03-24 06:26:17 +01:00
Compare commits
6 Commits
283fa9d5c2
...
apple-gpu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c561aef409 | ||
|
|
f792f9b102 | ||
|
|
1def7d8d3a | ||
|
|
ef92b254bf | ||
|
|
10d853c004 | ||
|
|
cdfd116da0 |
@@ -72,6 +72,7 @@ type dockerManager struct {
|
|||||||
// cacheTimeMs -> DeltaTracker for network bytes sent/received
|
// cacheTimeMs -> DeltaTracker for network bytes sent/received
|
||||||
networkSentTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
networkSentTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
||||||
networkRecvTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
networkRecvTrackers map[uint16]*deltatracker.DeltaTracker[string, uint64]
|
||||||
|
retrySleep func(time.Duration)
|
||||||
}
|
}
|
||||||
|
|
||||||
// userAgentRoundTripper is a custom http.RoundTripper that adds a User-Agent header to all requests
|
// userAgentRoundTripper is a custom http.RoundTripper that adds a User-Agent header to all requests
|
||||||
@@ -565,6 +566,7 @@ func newDockerManager() *dockerManager {
|
|||||||
lastCpuReadTime: make(map[uint16]map[string]time.Time),
|
lastCpuReadTime: make(map[uint16]map[string]time.Time),
|
||||||
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
networkSentTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
||||||
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
networkRecvTrackers: make(map[uint16]*deltatracker.DeltaTracker[string, uint64]),
|
||||||
|
retrySleep: time.Sleep,
|
||||||
}
|
}
|
||||||
|
|
||||||
// If using podman, return client
|
// If using podman, return client
|
||||||
@@ -574,7 +576,7 @@ func newDockerManager() *dockerManager {
|
|||||||
return manager
|
return manager
|
||||||
}
|
}
|
||||||
|
|
||||||
// this can take up to 5 seconds with retry, so run in goroutine
|
// run version check in goroutine to avoid blocking (server may not be ready and requires retries)
|
||||||
go manager.checkDockerVersion()
|
go manager.checkDockerVersion()
|
||||||
|
|
||||||
// give version check a chance to complete before returning
|
// give version check a chance to complete before returning
|
||||||
@@ -594,18 +596,18 @@ func (dm *dockerManager) checkDockerVersion() {
|
|||||||
const versionMaxTries = 2
|
const versionMaxTries = 2
|
||||||
for i := 1; i <= versionMaxTries; i++ {
|
for i := 1; i <= versionMaxTries; i++ {
|
||||||
resp, err = dm.client.Get("http://localhost/version")
|
resp, err = dm.client.Get("http://localhost/version")
|
||||||
if err == nil {
|
if err == nil && resp.StatusCode == http.StatusOK {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if resp != nil {
|
if resp != nil {
|
||||||
resp.Body.Close()
|
resp.Body.Close()
|
||||||
}
|
}
|
||||||
if i < versionMaxTries {
|
if i < versionMaxTries {
|
||||||
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "error", err)
|
slog.Debug("Failed to get Docker version; retrying", "attempt", i, "err", err, "response", resp)
|
||||||
time.Sleep(5 * time.Second)
|
dm.retrySleep(5 * time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil || resp.StatusCode != http.StatusOK {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if err := dm.decode(resp, &versionInfo); err != nil {
|
if err := dm.decode(resp, &versionInfo); err != nil {
|
||||||
|
|||||||
@@ -5,7 +5,13 @@ package agent
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -379,6 +385,117 @@ func TestDockerManagerCreation(t *testing.T) {
|
|||||||
assert.NotNil(t, dm.networkRecvTrackers)
|
assert.NotNil(t, dm.networkRecvTrackers)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCheckDockerVersion(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
responses []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}
|
||||||
|
expectedGood bool
|
||||||
|
expectedRequests int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "200 with good version on first try",
|
||||||
|
responses: []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}{
|
||||||
|
{http.StatusOK, `{"Version":"25.0.1"}`},
|
||||||
|
},
|
||||||
|
expectedGood: true,
|
||||||
|
expectedRequests: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "200 with old version on first try",
|
||||||
|
responses: []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}{
|
||||||
|
{http.StatusOK, `{"Version":"24.0.7"}`},
|
||||||
|
},
|
||||||
|
expectedGood: false,
|
||||||
|
expectedRequests: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "non-200 then 200 with good version",
|
||||||
|
responses: []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}{
|
||||||
|
{http.StatusServiceUnavailable, `"not ready"`},
|
||||||
|
{http.StatusOK, `{"Version":"25.1.0"}`},
|
||||||
|
},
|
||||||
|
expectedGood: true,
|
||||||
|
expectedRequests: 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "non-200 on all retries",
|
||||||
|
responses: []struct {
|
||||||
|
statusCode int
|
||||||
|
body string
|
||||||
|
}{
|
||||||
|
{http.StatusInternalServerError, `"error"`},
|
||||||
|
{http.StatusUnauthorized, `"error"`},
|
||||||
|
},
|
||||||
|
expectedGood: false,
|
||||||
|
expectedRequests: 2,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
requestCount := 0
|
||||||
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
idx := requestCount
|
||||||
|
requestCount++
|
||||||
|
if idx >= len(tt.responses) {
|
||||||
|
idx = len(tt.responses) - 1
|
||||||
|
}
|
||||||
|
w.WriteHeader(tt.responses[idx].statusCode)
|
||||||
|
fmt.Fprint(w, tt.responses[idx].body)
|
||||||
|
}))
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
dm := &dockerManager{
|
||||||
|
client: &http.Client{
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: func(_ context.Context, network, _ string) (net.Conn, error) {
|
||||||
|
return net.Dial(network, server.Listener.Addr().String())
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
retrySleep: func(time.Duration) {},
|
||||||
|
}
|
||||||
|
|
||||||
|
dm.checkDockerVersion()
|
||||||
|
|
||||||
|
assert.Equal(t, tt.expectedGood, dm.goodDockerVersion)
|
||||||
|
assert.Equal(t, tt.expectedRequests, requestCount)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("request error on all retries", func(t *testing.T) {
|
||||||
|
requestCount := 0
|
||||||
|
dm := &dockerManager{
|
||||||
|
client: &http.Client{
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
|
||||||
|
requestCount++
|
||||||
|
return nil, errors.New("connection refused")
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
retrySleep: func(time.Duration) {},
|
||||||
|
}
|
||||||
|
|
||||||
|
dm.checkDockerVersion()
|
||||||
|
|
||||||
|
assert.False(t, dm.goodDockerVersion)
|
||||||
|
assert.Equal(t, 2, requestCount)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func TestCycleCpuDeltas(t *testing.T) {
|
func TestCycleCpuDeltas(t *testing.T) {
|
||||||
dm := &dockerManager{
|
dm := &dockerManager{
|
||||||
lastCpuContainer: map[uint16]map[string]uint64{
|
lastCpuContainer: map[uint16]map[string]uint64{
|
||||||
|
|||||||
83
agent/gpu.go
83
agent/gpu.go
@@ -9,6 +9,7 @@ import (
|
|||||||
"maps"
|
"maps"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"runtime"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -19,11 +20,13 @@ import (
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
// Commands
|
// Commands
|
||||||
nvidiaSmiCmd string = "nvidia-smi"
|
nvidiaSmiCmd string = "nvidia-smi"
|
||||||
rocmSmiCmd string = "rocm-smi"
|
rocmSmiCmd string = "rocm-smi"
|
||||||
tegraStatsCmd string = "tegrastats"
|
tegraStatsCmd string = "tegrastats"
|
||||||
nvtopCmd string = "nvtop"
|
nvtopCmd string = "nvtop"
|
||||||
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
|
powermetricsCmd string = "powermetrics"
|
||||||
|
macmonCmd string = "macmon"
|
||||||
|
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
|
||||||
|
|
||||||
// Command retry and timeout constants
|
// Command retry and timeout constants
|
||||||
retryWaitTime time.Duration = 5 * time.Second
|
retryWaitTime time.Duration = 5 * time.Second
|
||||||
@@ -82,15 +85,18 @@ var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing
|
|||||||
type collectorSource string
|
type collectorSource string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
|
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
|
||||||
collectorSourceNVML collectorSource = "nvml"
|
collectorSourceNVML collectorSource = "nvml"
|
||||||
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
|
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
|
||||||
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
|
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
|
||||||
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
|
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
|
||||||
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
|
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
|
||||||
collectorGroupNvidia string = "nvidia"
|
collectorSourceMacmon collectorSource = collectorSource(macmonCmd)
|
||||||
collectorGroupIntel string = "intel"
|
collectorSourcePowermetrics collectorSource = collectorSource(powermetricsCmd)
|
||||||
collectorGroupAmd string = "amd"
|
collectorGroupNvidia string = "nvidia"
|
||||||
|
collectorGroupIntel string = "intel"
|
||||||
|
collectorGroupAmd string = "amd"
|
||||||
|
collectorGroupApple string = "apple"
|
||||||
)
|
)
|
||||||
|
|
||||||
func isValidCollectorSource(source collectorSource) bool {
|
func isValidCollectorSource(source collectorSource) bool {
|
||||||
@@ -100,7 +106,9 @@ func isValidCollectorSource(source collectorSource) bool {
|
|||||||
collectorSourceNvidiaSMI,
|
collectorSourceNvidiaSMI,
|
||||||
collectorSourceIntelGpuTop,
|
collectorSourceIntelGpuTop,
|
||||||
collectorSourceAmdSysfs,
|
collectorSourceAmdSysfs,
|
||||||
collectorSourceRocmSMI:
|
collectorSourceRocmSMI,
|
||||||
|
collectorSourceMacmon,
|
||||||
|
collectorSourcePowermetrics:
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
@@ -108,12 +116,14 @@ func isValidCollectorSource(source collectorSource) bool {
|
|||||||
|
|
||||||
// gpuCapabilities describes detected GPU tooling and sysfs support on the host.
|
// gpuCapabilities describes detected GPU tooling and sysfs support on the host.
|
||||||
type gpuCapabilities struct {
|
type gpuCapabilities struct {
|
||||||
hasNvidiaSmi bool
|
hasNvidiaSmi bool
|
||||||
hasRocmSmi bool
|
hasRocmSmi bool
|
||||||
hasAmdSysfs bool
|
hasAmdSysfs bool
|
||||||
hasTegrastats bool
|
hasTegrastats bool
|
||||||
hasIntelGpuTop bool
|
hasIntelGpuTop bool
|
||||||
hasNvtop bool
|
hasNvtop bool
|
||||||
|
hasMacmon bool
|
||||||
|
hasPowermetrics bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type collectorDefinition struct {
|
type collectorDefinition struct {
|
||||||
@@ -449,11 +459,19 @@ func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
|
|||||||
if _, err := exec.LookPath(nvtopCmd); err == nil {
|
if _, err := exec.LookPath(nvtopCmd); err == nil {
|
||||||
caps.hasNvtop = true
|
caps.hasNvtop = true
|
||||||
}
|
}
|
||||||
|
if runtime.GOOS == "darwin" {
|
||||||
|
if _, err := exec.LookPath(macmonCmd); err == nil {
|
||||||
|
caps.hasMacmon = true
|
||||||
|
}
|
||||||
|
if _, err := exec.LookPath(powermetricsCmd); err == nil {
|
||||||
|
caps.hasPowermetrics = true
|
||||||
|
}
|
||||||
|
}
|
||||||
return caps
|
return caps
|
||||||
}
|
}
|
||||||
|
|
||||||
func hasAnyGpuCollector(caps gpuCapabilities) bool {
|
func hasAnyGpuCollector(caps gpuCapabilities) bool {
|
||||||
return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop
|
return caps.hasNvidiaSmi || caps.hasRocmSmi || caps.hasAmdSysfs || caps.hasTegrastats || caps.hasIntelGpuTop || caps.hasNvtop || caps.hasMacmon || caps.hasPowermetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
func (gm *GPUManager) startIntelCollector() {
|
func (gm *GPUManager) startIntelCollector() {
|
||||||
@@ -567,6 +585,22 @@ func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSo
|
|||||||
return true
|
return true
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
collectorSourceMacmon: {
|
||||||
|
group: collectorGroupApple,
|
||||||
|
available: caps.hasMacmon,
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
gm.startMacmonCollector()
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
collectorSourcePowermetrics: {
|
||||||
|
group: collectorGroupApple,
|
||||||
|
available: caps.hasPowermetrics,
|
||||||
|
start: func(_ func()) bool {
|
||||||
|
gm.startPowermetricsCollector()
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -674,7 +708,10 @@ func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []col
|
|||||||
priorities = append(priorities, collectorSourceIntelGpuTop)
|
priorities = append(priorities, collectorSourceIntelGpuTop)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keep nvtop as a legacy last resort only when no vendor collector exists.
|
// Apple collectors are currently opt-in only.
|
||||||
|
// Enable them with GPU_COLLECTOR=macmon or GPU_COLLECTOR=powermetrics.
|
||||||
|
|
||||||
|
// Keep nvtop as a last resort only when no vendor collector exists.
|
||||||
if len(priorities) == 0 && caps.hasNvtop {
|
if len(priorities) == 0 && caps.hasNvtop {
|
||||||
priorities = append(priorities, collectorSourceNVTop)
|
priorities = append(priorities, collectorSourceNVTop)
|
||||||
}
|
}
|
||||||
|
|||||||
9
agent/gpu_apple_unsupported.go
Normal file
9
agent/gpu_apple_unsupported.go
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
//go:build !darwin
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
// startPowermetricsCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
|
||||||
|
func (gm *GPUManager) startPowermetricsCollector() {}
|
||||||
|
|
||||||
|
// startMacmonCollector is a no-op on non-darwin platforms; the real implementation is in gpu_darwin.go.
|
||||||
|
func (gm *GPUManager) startMacmonCollector() {}
|
||||||
252
agent/gpu_darwin.go
Normal file
252
agent/gpu_darwin.go
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
//go:build darwin
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/entities/system"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// powermetricsSampleIntervalMs is the sampling interval passed to powermetrics (-i).
|
||||||
|
powermetricsSampleIntervalMs = 500
|
||||||
|
// powermetricsPollInterval is how often we run powermetrics to collect a new sample.
|
||||||
|
powermetricsPollInterval = 2 * time.Second
|
||||||
|
// macmonIntervalMs is the sampling interval passed to macmon pipe (-i), in milliseconds.
|
||||||
|
macmonIntervalMs = 2500
|
||||||
|
)
|
||||||
|
|
||||||
|
const appleGPUID = "0"
|
||||||
|
|
||||||
|
// startPowermetricsCollector runs powermetrics --samplers gpu_power in a loop and updates
|
||||||
|
// GPU usage and power. Requires root (sudo) on macOS. A single logical GPU is reported as id "0".
|
||||||
|
func (gm *GPUManager) startPowermetricsCollector() {
|
||||||
|
// Ensure single GPU entry for Apple GPU
|
||||||
|
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||||
|
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
failures := 0
|
||||||
|
for {
|
||||||
|
if err := gm.collectPowermetrics(); err != nil {
|
||||||
|
failures++
|
||||||
|
if failures > maxFailureRetries {
|
||||||
|
slog.Warn("powermetrics GPU collector failed repeatedly, stopping", "err", err)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
slog.Warn("Error collecting macOS GPU data via powermetrics (may require sudo)", "err", err)
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
failures = 0
|
||||||
|
time.Sleep(powermetricsPollInterval)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectPowermetrics runs powermetrics once and parses GPU usage and power from its output.
|
||||||
|
func (gm *GPUManager) collectPowermetrics() error {
|
||||||
|
interval := strconv.Itoa(powermetricsSampleIntervalMs)
|
||||||
|
cmd := exec.Command(powermetricsCmd, "--samplers", "gpu_power", "-i", interval, "-n", "1")
|
||||||
|
cmd.Stderr = nil
|
||||||
|
out, err := cmd.Output()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !gm.parsePowermetricsData(out) {
|
||||||
|
return errNoValidData
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parsePowermetricsData parses powermetrics gpu_power output and updates GpuDataMap["0"].
|
||||||
|
// Example output:
|
||||||
|
//
|
||||||
|
// **** GPU usage ****
|
||||||
|
// GPU HW active frequency: 444 MHz
|
||||||
|
// GPU HW active residency: 0.97% (444 MHz: .97% ...
|
||||||
|
// GPU idle residency: 99.03%
|
||||||
|
// GPU Power: 4 mW
|
||||||
|
func (gm *GPUManager) parsePowermetricsData(output []byte) bool {
|
||||||
|
var idleResidency, powerMW float64
|
||||||
|
var gotIdle, gotPower bool
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(bytes.NewReader(output))
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := strings.TrimSpace(scanner.Text())
|
||||||
|
if strings.HasPrefix(line, "GPU idle residency:") {
|
||||||
|
// "GPU idle residency: 99.03%"
|
||||||
|
fields := strings.Fields(strings.TrimPrefix(line, "GPU idle residency:"))
|
||||||
|
if len(fields) >= 1 {
|
||||||
|
pct := strings.TrimSuffix(fields[0], "%")
|
||||||
|
if v, err := strconv.ParseFloat(pct, 64); err == nil {
|
||||||
|
idleResidency = v
|
||||||
|
gotIdle = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if strings.HasPrefix(line, "GPU Power:") {
|
||||||
|
// "GPU Power: 4 mW"
|
||||||
|
fields := strings.Fields(strings.TrimPrefix(line, "GPU Power:"))
|
||||||
|
if len(fields) >= 1 {
|
||||||
|
if v, err := strconv.ParseFloat(fields[0], 64); err == nil {
|
||||||
|
powerMW = v
|
||||||
|
gotPower = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !gotIdle && !gotPower {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
gm.Lock()
|
||||||
|
defer gm.Unlock()
|
||||||
|
|
||||||
|
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||||
|
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||||
|
}
|
||||||
|
gpu := gm.GpuDataMap[appleGPUID]
|
||||||
|
|
||||||
|
if gotIdle {
|
||||||
|
// Usage = 100 - idle residency (e.g. 100 - 99.03 = 0.97%)
|
||||||
|
gpu.Usage += 100 - idleResidency
|
||||||
|
}
|
||||||
|
if gotPower {
|
||||||
|
// mW -> W
|
||||||
|
gpu.Power += powerMW / milliwattsInAWatt
|
||||||
|
}
|
||||||
|
gpu.Count++
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// startMacmonCollector runs `macmon pipe` in a loop and parses one JSON object per line.
|
||||||
|
// This collector does not require sudo. A single logical GPU is reported as id "0".
|
||||||
|
func (gm *GPUManager) startMacmonCollector() {
|
||||||
|
if _, ok := gm.GpuDataMap[appleGPUID]; !ok {
|
||||||
|
gm.GpuDataMap[appleGPUID] = &system.GPUData{Name: "Apple GPU"}
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
failures := 0
|
||||||
|
for {
|
||||||
|
if err := gm.collectMacmonPipe(); err != nil {
|
||||||
|
failures++
|
||||||
|
if failures > maxFailureRetries {
|
||||||
|
slog.Warn("macmon GPU collector failed repeatedly, stopping", "err", err)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
slog.Warn("Error collecting macOS GPU data via macmon", "err", err)
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
failures = 0
|
||||||
|
// `macmon pipe` is long-running; if it returns, wait a bit before restarting.
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
type macmonTemp struct {
|
||||||
|
GPUTempAvg float64 `json:"gpu_temp_avg"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type macmonSample struct {
|
||||||
|
GPUPower float64 `json:"gpu_power"` // watts (macmon reports fractional values)
|
||||||
|
GPURAMPower float64 `json:"gpu_ram_power"` // watts
|
||||||
|
GPUUsage []float64 `json:"gpu_usage"` // [freq_mhz, usage] where usage is typically 0..1
|
||||||
|
Temp macmonTemp `json:"temp"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (gm *GPUManager) collectMacmonPipe() (err error) {
|
||||||
|
cmd := exec.Command(macmonCmd, "pipe", "-i", strconv.Itoa(macmonIntervalMs))
|
||||||
|
// Avoid blocking if macmon writes to stderr.
|
||||||
|
cmd.Stderr = io.Discard
|
||||||
|
stdout, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure we always reap the child to avoid zombies on any return path and
|
||||||
|
// propagate a non-zero exit code if no other error was set.
|
||||||
|
defer func() {
|
||||||
|
_ = stdout.Close()
|
||||||
|
if cmd.ProcessState == nil || !cmd.ProcessState.Exited() {
|
||||||
|
_ = cmd.Process.Kill()
|
||||||
|
}
|
||||||
|
if waitErr := cmd.Wait(); err == nil && waitErr != nil {
|
||||||
|
err = waitErr
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(stdout)
|
||||||
|
var hadSample bool
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := bytes.TrimSpace(scanner.Bytes())
|
||||||
|
if len(line) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if gm.parseMacmonLine(line) {
|
||||||
|
hadSample = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if scanErr := scanner.Err(); scanErr != nil {
|
||||||
|
return scanErr
|
||||||
|
}
|
||||||
|
if !hadSample {
|
||||||
|
return errNoValidData
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseMacmonLine parses a single macmon JSON line and updates Apple GPU metrics.
|
||||||
|
func (gm *GPUManager) parseMacmonLine(line []byte) bool {
|
||||||
|
var sample macmonSample
|
||||||
|
if err := json.Unmarshal(line, &sample); err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
usage := 0.0
|
||||||
|
if len(sample.GPUUsage) >= 2 {
|
||||||
|
usage = sample.GPUUsage[1]
|
||||||
|
// Heuristic: macmon typically reports 0..1; convert to percentage.
|
||||||
|
if usage <= 1.0 {
|
||||||
|
usage *= 100
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consider the line valid if it contains at least one GPU metric.
|
||||||
|
if usage == 0 && sample.GPUPower == 0 && sample.Temp.GPUTempAvg == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
gm.Lock()
|
||||||
|
defer gm.Unlock()
|
||||||
|
|
||||||
|
gpu, ok := gm.GpuDataMap[appleGPUID]
|
||||||
|
if !ok {
|
||||||
|
gpu = &system.GPUData{Name: "Apple GPU"}
|
||||||
|
gm.GpuDataMap[appleGPUID] = gpu
|
||||||
|
}
|
||||||
|
gpu.Temperature = sample.Temp.GPUTempAvg
|
||||||
|
gpu.Usage += usage
|
||||||
|
// macmon reports power in watts; include VRAM power if present.
|
||||||
|
gpu.Power += sample.GPUPower + sample.GPURAMPower
|
||||||
|
gpu.Count++
|
||||||
|
return true
|
||||||
|
}
|
||||||
81
agent/gpu_darwin_test.go
Normal file
81
agent/gpu_darwin_test.go
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
//go:build darwin
|
||||||
|
|
||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/entities/system"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParsePowermetricsData(t *testing.T) {
|
||||||
|
input := `
|
||||||
|
Machine model: Mac14,10
|
||||||
|
OS version: 25D125
|
||||||
|
|
||||||
|
*** Sampled system activity (Sat Feb 14 00:42:06 2026 -0500) (503.05ms elapsed) ***
|
||||||
|
|
||||||
|
**** GPU usage ****
|
||||||
|
|
||||||
|
GPU HW active frequency: 444 MHz
|
||||||
|
GPU HW active residency: 0.97% (444 MHz: .97% 612 MHz: 0% 808 MHz: 0% 968 MHz: 0% 1110 MHz: 0% 1236 MHz: 0% 1338 MHz: 0% 1398 MHz: 0%)
|
||||||
|
GPU SW requested state: (P1 : 100% P2 : 0% P3 : 0% P4 : 0% P5 : 0% P6 : 0% P7 : 0% P8 : 0%)
|
||||||
|
GPU idle residency: 99.03%
|
||||||
|
GPU Power: 4 mW
|
||||||
|
`
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
valid := gm.parsePowermetricsData([]byte(input))
|
||||||
|
require.True(t, valid)
|
||||||
|
|
||||||
|
g0, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "Apple GPU", g0.Name)
|
||||||
|
// Usage = 100 - 99.03 = 0.97
|
||||||
|
assert.InDelta(t, 0.97, g0.Usage, 0.01)
|
||||||
|
// 4 mW -> 0.004 W
|
||||||
|
assert.InDelta(t, 0.004, g0.Power, 0.0001)
|
||||||
|
assert.Equal(t, 1.0, g0.Count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParsePowermetricsDataPartial(t *testing.T) {
|
||||||
|
// Only power line (e.g. older macOS or different sampler output)
|
||||||
|
input := `
|
||||||
|
**** GPU usage ****
|
||||||
|
GPU Power: 120 mW
|
||||||
|
`
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
valid := gm.parsePowermetricsData([]byte(input))
|
||||||
|
require.True(t, valid)
|
||||||
|
|
||||||
|
g0, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "Apple GPU", g0.Name)
|
||||||
|
assert.InDelta(t, 0.12, g0.Power, 0.001)
|
||||||
|
assert.Equal(t, 1.0, g0.Count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseMacmonLine(t *testing.T) {
|
||||||
|
input := `{"all_power":0.6468324661254883,"ane_power":0.0,"cpu_power":0.6359732151031494,"ecpu_usage":[2061,0.1726151406764984],"gpu_power":0.010859241709113121,"gpu_ram_power":0.000965250947047025,"gpu_usage":[503,0.013633215799927711],"memory":{"ram_total":17179869184,"ram_usage":12322914304,"swap_total":0,"swap_usage":0},"pcpu_usage":[1248,0.11792058497667313],"ram_power":0.14885640144348145,"sys_power":10.4955415725708,"temp":{"cpu_temp_avg":23.041261672973633,"gpu_temp_avg":29.44516944885254},"timestamp":"2026-02-17T19:34:27.942556+00:00"}`
|
||||||
|
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
valid := gm.parseMacmonLine([]byte(input))
|
||||||
|
require.True(t, valid)
|
||||||
|
|
||||||
|
g0, ok := gm.GpuDataMap["0"]
|
||||||
|
require.True(t, ok)
|
||||||
|
assert.Equal(t, "Apple GPU", g0.Name)
|
||||||
|
// macmon reports usage fraction 0..1; expect percent conversion.
|
||||||
|
assert.InDelta(t, 1.3633, g0.Usage, 0.05)
|
||||||
|
// power includes gpu_power + gpu_ram_power
|
||||||
|
assert.InDelta(t, 0.011824, g0.Power, 0.0005)
|
||||||
|
assert.InDelta(t, 29.445, g0.Temperature, 0.01)
|
||||||
|
assert.Equal(t, 1.0, g0.Count)
|
||||||
|
}
|
||||||
303
internal/hub/heartbeat/heartbeat.go
Normal file
303
internal/hub/heartbeat/heartbeat.go
Normal file
@@ -0,0 +1,303 @@
|
|||||||
|
// Package heartbeat sends periodic outbound pings to an external monitoring
|
||||||
|
// endpoint (e.g. BetterStack, Uptime Kuma, Healthchecks.io) so operators can
|
||||||
|
// monitor Beszel without exposing it to the internet.
|
||||||
|
package heartbeat
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"net/url"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel"
|
||||||
|
"github.com/pocketbase/pocketbase/core"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Default values for heartbeat configuration.
|
||||||
|
const (
|
||||||
|
defaultInterval = 60 // seconds
|
||||||
|
httpTimeout = 10 * time.Second
|
||||||
|
)
|
||||||
|
|
||||||
|
// Payload is the JSON body sent with each heartbeat request.
|
||||||
|
type Payload struct {
|
||||||
|
// Status is "ok" when all non-paused systems are up, "warn" when alerts
|
||||||
|
// are triggered but no systems are down, and "error" when any system is down.
|
||||||
|
Status string `json:"status"`
|
||||||
|
Timestamp string `json:"timestamp"`
|
||||||
|
Msg string `json:"msg"`
|
||||||
|
Systems SystemsSummary `json:"systems"`
|
||||||
|
Down []SystemInfo `json:"down_systems,omitempty"`
|
||||||
|
Alerts []AlertInfo `json:"triggered_alerts,omitempty"`
|
||||||
|
Version string `json:"beszel_version"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// SystemsSummary contains counts of systems by status.
|
||||||
|
type SystemsSummary struct {
|
||||||
|
Total int `json:"total"`
|
||||||
|
Up int `json:"up"`
|
||||||
|
Down int `json:"down"`
|
||||||
|
Paused int `json:"paused"`
|
||||||
|
Pending int `json:"pending"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// SystemInfo identifies a system that is currently down.
|
||||||
|
type SystemInfo struct {
|
||||||
|
ID string `json:"id" db:"id"`
|
||||||
|
Name string `json:"name" db:"name"`
|
||||||
|
Host string `json:"host" db:"host"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// AlertInfo describes a currently triggered alert.
|
||||||
|
type AlertInfo struct {
|
||||||
|
SystemID string `json:"system_id"`
|
||||||
|
SystemName string `json:"system_name"`
|
||||||
|
AlertName string `json:"alert_name"`
|
||||||
|
Threshold float64 `json:"threshold"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Config holds heartbeat settings read from environment variables.
|
||||||
|
type Config struct {
|
||||||
|
URL string // endpoint to ping
|
||||||
|
Interval int // seconds between pings
|
||||||
|
Method string // HTTP method (GET or POST, default POST)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Heartbeat manages the periodic outbound health check.
|
||||||
|
type Heartbeat struct {
|
||||||
|
app core.App
|
||||||
|
config Config
|
||||||
|
client *http.Client
|
||||||
|
}
|
||||||
|
|
||||||
|
// New creates a Heartbeat if configuration is present.
|
||||||
|
// Returns nil if HEARTBEAT_URL is not set (feature disabled).
|
||||||
|
func New(app core.App, getEnv func(string) (string, bool)) *Heartbeat {
|
||||||
|
url, _ := getEnv("HEARTBEAT_URL")
|
||||||
|
url = strings.TrimSpace(url)
|
||||||
|
if app == nil || url == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
interval := defaultInterval
|
||||||
|
if v, ok := getEnv("HEARTBEAT_INTERVAL"); ok {
|
||||||
|
if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
|
||||||
|
interval = parsed
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
method := http.MethodPost
|
||||||
|
if v, ok := getEnv("HEARTBEAT_METHOD"); ok {
|
||||||
|
v = strings.ToUpper(strings.TrimSpace(v))
|
||||||
|
if v == http.MethodGet || v == http.MethodHead {
|
||||||
|
method = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return &Heartbeat{
|
||||||
|
app: app,
|
||||||
|
config: Config{
|
||||||
|
URL: url,
|
||||||
|
Interval: interval,
|
||||||
|
Method: method,
|
||||||
|
},
|
||||||
|
client: &http.Client{Timeout: httpTimeout},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start begins the heartbeat loop. It blocks and should be called in a goroutine.
|
||||||
|
// The loop runs until the provided stop channel is closed.
|
||||||
|
func (hb *Heartbeat) Start(stop <-chan struct{}) {
|
||||||
|
sanitizedURL := sanitizeHeartbeatURL(hb.config.URL)
|
||||||
|
hb.app.Logger().Info("Heartbeat enabled",
|
||||||
|
"url", sanitizedURL,
|
||||||
|
"interval", fmt.Sprintf("%ds", hb.config.Interval),
|
||||||
|
"method", hb.config.Method,
|
||||||
|
)
|
||||||
|
|
||||||
|
// Send an initial heartbeat immediately on startup.
|
||||||
|
hb.send()
|
||||||
|
|
||||||
|
ticker := time.NewTicker(time.Duration(hb.config.Interval) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stop:
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
hb.send()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send performs a single heartbeat ping. Exposed for the test-heartbeat API endpoint.
|
||||||
|
func (hb *Heartbeat) Send() error {
|
||||||
|
return hb.send()
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetConfig returns the current heartbeat configuration.
|
||||||
|
func (hb *Heartbeat) GetConfig() Config {
|
||||||
|
return hb.config
|
||||||
|
}
|
||||||
|
|
||||||
|
func (hb *Heartbeat) send() error {
|
||||||
|
var req *http.Request
|
||||||
|
var err error
|
||||||
|
method := normalizeMethod(hb.config.Method)
|
||||||
|
|
||||||
|
if method == http.MethodGet || method == http.MethodHead {
|
||||||
|
req, err = http.NewRequest(method, hb.config.URL, nil)
|
||||||
|
} else {
|
||||||
|
payload, payloadErr := hb.buildPayload()
|
||||||
|
if payloadErr != nil {
|
||||||
|
hb.app.Logger().Error("Heartbeat: failed to build payload", "err", payloadErr)
|
||||||
|
return payloadErr
|
||||||
|
}
|
||||||
|
|
||||||
|
body, jsonErr := json.Marshal(payload)
|
||||||
|
if jsonErr != nil {
|
||||||
|
hb.app.Logger().Error("Heartbeat: failed to marshal payload", "err", jsonErr)
|
||||||
|
return jsonErr
|
||||||
|
}
|
||||||
|
req, err = http.NewRequest(http.MethodPost, hb.config.URL, bytes.NewReader(body))
|
||||||
|
if err == nil {
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
hb.app.Logger().Error("Heartbeat: failed to create request", "err", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
req.Header.Set("User-Agent", "Beszel-Heartbeat")
|
||||||
|
|
||||||
|
resp, err := hb.client.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
hb.app.Logger().Error("Heartbeat: request failed", "url", sanitizeHeartbeatURL(hb.config.URL), "err", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
if resp.StatusCode >= 400 {
|
||||||
|
hb.app.Logger().Warn("Heartbeat: non-success response",
|
||||||
|
"url", sanitizeHeartbeatURL(hb.config.URL),
|
||||||
|
"status", resp.StatusCode,
|
||||||
|
)
|
||||||
|
return fmt.Errorf("heartbeat endpoint returned status %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (hb *Heartbeat) buildPayload() (*Payload, error) {
|
||||||
|
db := hb.app.DB()
|
||||||
|
|
||||||
|
// Count systems by status.
|
||||||
|
var systemCounts []struct {
|
||||||
|
Status string `db:"status"`
|
||||||
|
Count int `db:"cnt"`
|
||||||
|
}
|
||||||
|
err := db.NewQuery("SELECT status, COUNT(*) as cnt FROM systems GROUP BY status").All(&systemCounts)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("query system counts: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
summary := SystemsSummary{}
|
||||||
|
for _, sc := range systemCounts {
|
||||||
|
switch sc.Status {
|
||||||
|
case "up":
|
||||||
|
summary.Up = sc.Count
|
||||||
|
case "down":
|
||||||
|
summary.Down = sc.Count
|
||||||
|
case "paused":
|
||||||
|
summary.Paused = sc.Count
|
||||||
|
case "pending":
|
||||||
|
summary.Pending = sc.Count
|
||||||
|
}
|
||||||
|
summary.Total += sc.Count
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get names of down systems.
|
||||||
|
var downSystems []SystemInfo
|
||||||
|
if summary.Down > 0 {
|
||||||
|
err = db.NewQuery("SELECT id, name, host FROM systems WHERE status = 'down'").All(&downSystems)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("query down systems: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get triggered alerts with system names.
|
||||||
|
var triggeredAlerts []struct {
|
||||||
|
SystemID string `db:"system"`
|
||||||
|
SystemName string `db:"system_name"`
|
||||||
|
AlertName string `db:"name"`
|
||||||
|
Value float64 `db:"value"`
|
||||||
|
}
|
||||||
|
err = db.NewQuery(`
|
||||||
|
SELECT a.system, s.name as system_name, a.name, a.value
|
||||||
|
FROM alerts a
|
||||||
|
JOIN systems s ON a.system = s.id
|
||||||
|
WHERE a.triggered = true
|
||||||
|
`).All(&triggeredAlerts)
|
||||||
|
if err != nil {
|
||||||
|
// Non-fatal: alerts info is supplementary.
|
||||||
|
triggeredAlerts = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
alerts := make([]AlertInfo, 0, len(triggeredAlerts))
|
||||||
|
for _, ta := range triggeredAlerts {
|
||||||
|
alerts = append(alerts, AlertInfo{
|
||||||
|
SystemID: ta.SystemID,
|
||||||
|
SystemName: ta.SystemName,
|
||||||
|
AlertName: ta.AlertName,
|
||||||
|
Threshold: ta.Value,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine overall status.
|
||||||
|
status := "ok"
|
||||||
|
msg := "All systems operational"
|
||||||
|
if summary.Down > 0 {
|
||||||
|
status = "error"
|
||||||
|
names := make([]string, len(downSystems))
|
||||||
|
for i, ds := range downSystems {
|
||||||
|
names[i] = ds.Name
|
||||||
|
}
|
||||||
|
msg = fmt.Sprintf("%d system(s) down: %s", summary.Down, strings.Join(names, ", "))
|
||||||
|
} else if len(alerts) > 0 {
|
||||||
|
status = "warn"
|
||||||
|
msg = fmt.Sprintf("%d alert(s) triggered", len(alerts))
|
||||||
|
}
|
||||||
|
|
||||||
|
return &Payload{
|
||||||
|
Status: status,
|
||||||
|
Timestamp: time.Now().UTC().Format(time.RFC3339),
|
||||||
|
Msg: msg,
|
||||||
|
Systems: summary,
|
||||||
|
Down: downSystems,
|
||||||
|
Alerts: alerts,
|
||||||
|
Version: beszel.Version,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeMethod(method string) string {
|
||||||
|
upper := strings.ToUpper(strings.TrimSpace(method))
|
||||||
|
if upper == http.MethodGet || upper == http.MethodHead || upper == http.MethodPost {
|
||||||
|
return upper
|
||||||
|
}
|
||||||
|
return http.MethodPost
|
||||||
|
}
|
||||||
|
|
||||||
|
func sanitizeHeartbeatURL(rawURL string) string {
|
||||||
|
parsed, err := url.Parse(strings.TrimSpace(rawURL))
|
||||||
|
if err != nil || parsed.Scheme == "" || parsed.Host == "" {
|
||||||
|
return "<invalid-url>"
|
||||||
|
}
|
||||||
|
return parsed.Scheme + "://" + parsed.Host
|
||||||
|
}
|
||||||
258
internal/hub/heartbeat/heartbeat_test.go
Normal file
258
internal/hub/heartbeat/heartbeat_test.go
Normal file
@@ -0,0 +1,258 @@
|
|||||||
|
//go:build testing
|
||||||
|
// +build testing
|
||||||
|
|
||||||
|
package heartbeat_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/hub/heartbeat"
|
||||||
|
beszeltests "github.com/henrygd/beszel/internal/tests"
|
||||||
|
"github.com/pocketbase/pocketbase/core"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestNew(t *testing.T) {
|
||||||
|
t.Run("returns nil when app is missing", func(t *testing.T) {
|
||||||
|
hb := heartbeat.New(nil, envGetter(map[string]string{
|
||||||
|
"HEARTBEAT_URL": "https://heartbeat.example.com/ping",
|
||||||
|
}))
|
||||||
|
assert.Nil(t, hb)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("returns nil when URL is missing", func(t *testing.T) {
|
||||||
|
app := newTestHub(t)
|
||||||
|
hb := heartbeat.New(app.App, func(string) (string, bool) {
|
||||||
|
return "", false
|
||||||
|
})
|
||||||
|
assert.Nil(t, hb)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("parses and normalizes config values", func(t *testing.T) {
|
||||||
|
app := newTestHub(t)
|
||||||
|
env := map[string]string{
|
||||||
|
"HEARTBEAT_URL": " https://heartbeat.example.com/ping ",
|
||||||
|
"HEARTBEAT_INTERVAL": "90",
|
||||||
|
"HEARTBEAT_METHOD": "head",
|
||||||
|
}
|
||||||
|
getEnv := func(key string) (string, bool) {
|
||||||
|
v, ok := env[key]
|
||||||
|
return v, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
hb := heartbeat.New(app.App, getEnv)
|
||||||
|
require.NotNil(t, hb)
|
||||||
|
cfg := hb.GetConfig()
|
||||||
|
assert.Equal(t, "https://heartbeat.example.com/ping", cfg.URL)
|
||||||
|
assert.Equal(t, 90, cfg.Interval)
|
||||||
|
assert.Equal(t, http.MethodHead, cfg.Method)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSendGETDoesNotRequireAppOrDB(t *testing.T) {
|
||||||
|
app := newTestHub(t)
|
||||||
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
assert.Equal(t, http.MethodGet, r.Method)
|
||||||
|
assert.Equal(t, "Beszel-Heartbeat", r.Header.Get("User-Agent"))
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
}))
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
hb := heartbeat.New(app.App, envGetter(map[string]string{
|
||||||
|
"HEARTBEAT_URL": server.URL,
|
||||||
|
"HEARTBEAT_METHOD": "GET",
|
||||||
|
}))
|
||||||
|
require.NotNil(t, hb)
|
||||||
|
|
||||||
|
require.NoError(t, hb.Send())
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSendReturnsErrorOnHTTPFailureStatus(t *testing.T) {
|
||||||
|
app := newTestHub(t)
|
||||||
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusInternalServerError)
|
||||||
|
}))
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
hb := heartbeat.New(app.App, envGetter(map[string]string{
|
||||||
|
"HEARTBEAT_URL": server.URL,
|
||||||
|
"HEARTBEAT_METHOD": "GET",
|
||||||
|
}))
|
||||||
|
require.NotNil(t, hb)
|
||||||
|
|
||||||
|
err := hb.Send()
|
||||||
|
require.Error(t, err)
|
||||||
|
assert.ErrorContains(t, err, "heartbeat endpoint returned status 500")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSendPOSTBuildsExpectedStatuses(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
setup func(t *testing.T, app *beszeltests.TestHub, user *core.Record)
|
||||||
|
expectStatus string
|
||||||
|
expectMsgPart string
|
||||||
|
expectDown int
|
||||||
|
expectAlerts int
|
||||||
|
expectTotal int
|
||||||
|
expectUp int
|
||||||
|
expectPaused int
|
||||||
|
expectPending int
|
||||||
|
expectDownSumm int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "error when at least one system is down",
|
||||||
|
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
|
||||||
|
downSystem := createTestSystem(t, app, user.Id, "db-1", "10.0.0.1", "down")
|
||||||
|
_ = createTestSystem(t, app, user.Id, "web-1", "10.0.0.2", "up")
|
||||||
|
createTriggeredAlert(t, app, user.Id, downSystem.Id, "CPU", 95)
|
||||||
|
},
|
||||||
|
expectStatus: "error",
|
||||||
|
expectMsgPart: "1 system(s) down",
|
||||||
|
expectDown: 1,
|
||||||
|
expectAlerts: 1,
|
||||||
|
expectTotal: 2,
|
||||||
|
expectUp: 1,
|
||||||
|
expectDownSumm: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "warn when only alerts are triggered",
|
||||||
|
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
|
||||||
|
system := createTestSystem(t, app, user.Id, "api-1", "10.1.0.1", "up")
|
||||||
|
createTriggeredAlert(t, app, user.Id, system.Id, "CPU", 90)
|
||||||
|
},
|
||||||
|
expectStatus: "warn",
|
||||||
|
expectMsgPart: "1 alert(s) triggered",
|
||||||
|
expectDown: 0,
|
||||||
|
expectAlerts: 1,
|
||||||
|
expectTotal: 1,
|
||||||
|
expectUp: 1,
|
||||||
|
expectDownSumm: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ok when no down systems and no alerts",
|
||||||
|
setup: func(t *testing.T, app *beszeltests.TestHub, user *core.Record) {
|
||||||
|
_ = createTestSystem(t, app, user.Id, "node-1", "10.2.0.1", "up")
|
||||||
|
_ = createTestSystem(t, app, user.Id, "node-2", "10.2.0.2", "paused")
|
||||||
|
_ = createTestSystem(t, app, user.Id, "node-3", "10.2.0.3", "pending")
|
||||||
|
},
|
||||||
|
expectStatus: "ok",
|
||||||
|
expectMsgPart: "All systems operational",
|
||||||
|
expectDown: 0,
|
||||||
|
expectAlerts: 0,
|
||||||
|
expectTotal: 3,
|
||||||
|
expectUp: 1,
|
||||||
|
expectPaused: 1,
|
||||||
|
expectPending: 1,
|
||||||
|
expectDownSumm: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
app := newTestHub(t)
|
||||||
|
user := createTestUser(t, app)
|
||||||
|
tt.setup(t, app, user)
|
||||||
|
|
||||||
|
type requestCapture struct {
|
||||||
|
method string
|
||||||
|
userAgent string
|
||||||
|
contentType string
|
||||||
|
payload heartbeat.Payload
|
||||||
|
}
|
||||||
|
|
||||||
|
captured := make(chan requestCapture, 1)
|
||||||
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
defer r.Body.Close()
|
||||||
|
body, err := io.ReadAll(r.Body)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var payload heartbeat.Payload
|
||||||
|
require.NoError(t, json.Unmarshal(body, &payload))
|
||||||
|
captured <- requestCapture{
|
||||||
|
method: r.Method,
|
||||||
|
userAgent: r.Header.Get("User-Agent"),
|
||||||
|
contentType: r.Header.Get("Content-Type"),
|
||||||
|
payload: payload,
|
||||||
|
}
|
||||||
|
w.WriteHeader(http.StatusNoContent)
|
||||||
|
}))
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
hb := heartbeat.New(app.App, envGetter(map[string]string{
|
||||||
|
"HEARTBEAT_URL": server.URL,
|
||||||
|
"HEARTBEAT_METHOD": "POST",
|
||||||
|
}))
|
||||||
|
require.NotNil(t, hb)
|
||||||
|
require.NoError(t, hb.Send())
|
||||||
|
|
||||||
|
req := <-captured
|
||||||
|
assert.Equal(t, http.MethodPost, req.method)
|
||||||
|
assert.Equal(t, "Beszel-Heartbeat", req.userAgent)
|
||||||
|
assert.Equal(t, "application/json", req.contentType)
|
||||||
|
|
||||||
|
assert.Equal(t, tt.expectStatus, req.payload.Status)
|
||||||
|
assert.Contains(t, req.payload.Msg, tt.expectMsgPart)
|
||||||
|
assert.Equal(t, tt.expectDown, len(req.payload.Down))
|
||||||
|
assert.Equal(t, tt.expectAlerts, len(req.payload.Alerts))
|
||||||
|
assert.Equal(t, tt.expectTotal, req.payload.Systems.Total)
|
||||||
|
assert.Equal(t, tt.expectUp, req.payload.Systems.Up)
|
||||||
|
assert.Equal(t, tt.expectDownSumm, req.payload.Systems.Down)
|
||||||
|
assert.Equal(t, tt.expectPaused, req.payload.Systems.Paused)
|
||||||
|
assert.Equal(t, tt.expectPending, req.payload.Systems.Pending)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func newTestHub(t *testing.T) *beszeltests.TestHub {
|
||||||
|
t.Helper()
|
||||||
|
app, err := beszeltests.NewTestHub(t.TempDir())
|
||||||
|
require.NoError(t, err)
|
||||||
|
t.Cleanup(app.Cleanup)
|
||||||
|
return app
|
||||||
|
}
|
||||||
|
|
||||||
|
func createTestUser(t *testing.T, app *beszeltests.TestHub) *core.Record {
|
||||||
|
t.Helper()
|
||||||
|
user, err := beszeltests.CreateUser(app.App, "admin@example.com", "password123")
|
||||||
|
require.NoError(t, err)
|
||||||
|
return user
|
||||||
|
}
|
||||||
|
|
||||||
|
func createTestSystem(t *testing.T, app *beszeltests.TestHub, userID, name, host, status string) *core.Record {
|
||||||
|
t.Helper()
|
||||||
|
system, err := beszeltests.CreateRecord(app.App, "systems", map[string]any{
|
||||||
|
"name": name,
|
||||||
|
"host": host,
|
||||||
|
"port": "45876",
|
||||||
|
"users": []string{userID},
|
||||||
|
"status": status,
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
return system
|
||||||
|
}
|
||||||
|
|
||||||
|
func createTriggeredAlert(t *testing.T, app *beszeltests.TestHub, userID, systemID, name string, threshold float64) *core.Record {
|
||||||
|
t.Helper()
|
||||||
|
alert, err := beszeltests.CreateRecord(app.App, "alerts", map[string]any{
|
||||||
|
"name": name,
|
||||||
|
"system": systemID,
|
||||||
|
"user": userID,
|
||||||
|
"value": threshold,
|
||||||
|
"min": 0,
|
||||||
|
"triggered": true,
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
return alert
|
||||||
|
}
|
||||||
|
|
||||||
|
func envGetter(values map[string]string) func(string) (string, bool) {
|
||||||
|
return func(key string) (string, bool) {
|
||||||
|
v, ok := values[key]
|
||||||
|
return v, ok
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -15,6 +15,7 @@ import (
|
|||||||
"github.com/henrygd/beszel"
|
"github.com/henrygd/beszel"
|
||||||
"github.com/henrygd/beszel/internal/alerts"
|
"github.com/henrygd/beszel/internal/alerts"
|
||||||
"github.com/henrygd/beszel/internal/hub/config"
|
"github.com/henrygd/beszel/internal/hub/config"
|
||||||
|
"github.com/henrygd/beszel/internal/hub/heartbeat"
|
||||||
"github.com/henrygd/beszel/internal/hub/systems"
|
"github.com/henrygd/beszel/internal/hub/systems"
|
||||||
"github.com/henrygd/beszel/internal/records"
|
"github.com/henrygd/beszel/internal/records"
|
||||||
"github.com/henrygd/beszel/internal/users"
|
"github.com/henrygd/beszel/internal/users"
|
||||||
@@ -33,6 +34,8 @@ type Hub struct {
|
|||||||
um *users.UserManager
|
um *users.UserManager
|
||||||
rm *records.RecordManager
|
rm *records.RecordManager
|
||||||
sm *systems.SystemManager
|
sm *systems.SystemManager
|
||||||
|
hb *heartbeat.Heartbeat
|
||||||
|
hbStop chan struct{}
|
||||||
pubKey string
|
pubKey string
|
||||||
signer ssh.Signer
|
signer ssh.Signer
|
||||||
appURL string
|
appURL string
|
||||||
@@ -48,6 +51,10 @@ func NewHub(app core.App) *Hub {
|
|||||||
hub.rm = records.NewRecordManager(hub)
|
hub.rm = records.NewRecordManager(hub)
|
||||||
hub.sm = systems.NewSystemManager(hub)
|
hub.sm = systems.NewSystemManager(hub)
|
||||||
hub.appURL, _ = GetEnv("APP_URL")
|
hub.appURL, _ = GetEnv("APP_URL")
|
||||||
|
hub.hb = heartbeat.New(app, GetEnv)
|
||||||
|
if hub.hb != nil {
|
||||||
|
hub.hbStop = make(chan struct{})
|
||||||
|
}
|
||||||
return hub
|
return hub
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -88,6 +95,10 @@ func (h *Hub) StartHub() error {
|
|||||||
if err := h.sm.Initialize(); err != nil {
|
if err := h.sm.Initialize(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
// start heartbeat if configured
|
||||||
|
if h.hb != nil {
|
||||||
|
go h.hb.Start(h.hbStop)
|
||||||
|
}
|
||||||
return e.Next()
|
return e.Next()
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -287,6 +298,9 @@ func (h *Hub) registerApiRoutes(se *core.ServeEvent) error {
|
|||||||
})
|
})
|
||||||
// send test notification
|
// send test notification
|
||||||
apiAuth.POST("/test-notification", h.SendTestNotification)
|
apiAuth.POST("/test-notification", h.SendTestNotification)
|
||||||
|
// heartbeat status and test
|
||||||
|
apiAuth.GET("/heartbeat-status", h.getHeartbeatStatus)
|
||||||
|
apiAuth.POST("/test-heartbeat", h.testHeartbeat)
|
||||||
// get config.yml content
|
// get config.yml content
|
||||||
apiAuth.GET("/config-yaml", config.GetYamlConfig)
|
apiAuth.GET("/config-yaml", config.GetYamlConfig)
|
||||||
// handle agent websocket connection
|
// handle agent websocket connection
|
||||||
@@ -403,6 +417,42 @@ func (h *Hub) getUniversalToken(e *core.RequestEvent) error {
|
|||||||
return e.JSON(http.StatusOK, response)
|
return e.JSON(http.StatusOK, response)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getHeartbeatStatus returns current heartbeat configuration and whether it's enabled
|
||||||
|
func (h *Hub) getHeartbeatStatus(e *core.RequestEvent) error {
|
||||||
|
if e.Auth.GetString("role") != "admin" {
|
||||||
|
return e.ForbiddenError("Requires admin role", nil)
|
||||||
|
}
|
||||||
|
if h.hb == nil {
|
||||||
|
return e.JSON(http.StatusOK, map[string]any{
|
||||||
|
"enabled": false,
|
||||||
|
"msg": "Set HEARTBEAT_URL to enable outbound heartbeat monitoring",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
cfg := h.hb.GetConfig()
|
||||||
|
return e.JSON(http.StatusOK, map[string]any{
|
||||||
|
"enabled": true,
|
||||||
|
"url": cfg.URL,
|
||||||
|
"interval": cfg.Interval,
|
||||||
|
"method": cfg.Method,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// testHeartbeat triggers a single heartbeat ping and returns the result
|
||||||
|
func (h *Hub) testHeartbeat(e *core.RequestEvent) error {
|
||||||
|
if e.Auth.GetString("role") != "admin" {
|
||||||
|
return e.ForbiddenError("Requires admin role", nil)
|
||||||
|
}
|
||||||
|
if h.hb == nil {
|
||||||
|
return e.JSON(http.StatusOK, map[string]any{
|
||||||
|
"err": "Heartbeat not configured. Set HEARTBEAT_URL environment variable.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if err := h.hb.Send(); err != nil {
|
||||||
|
return e.JSON(http.StatusOK, map[string]any{"err": err.Error()})
|
||||||
|
}
|
||||||
|
return e.JSON(http.StatusOK, map[string]any{"err": false})
|
||||||
|
}
|
||||||
|
|
||||||
// containerRequestHandler handles both container logs and info requests
|
// containerRequestHandler handles both container logs and info requests
|
||||||
func (h *Hub) containerRequestHandler(e *core.RequestEvent, fetchFunc func(*systems.System, string) (string, error), responseKey string) error {
|
func (h *Hub) containerRequestHandler(e *core.RequestEvent, fetchFunc func(*systems.System, string) (string, error), responseKey string) error {
|
||||||
systemID := e.Request.URL.Query().Get("system")
|
systemID := e.Request.URL.Query().Get("system")
|
||||||
|
|||||||
@@ -362,6 +362,58 @@ func TestApiRoutesAuthentication(t *testing.T) {
|
|||||||
ExpectedContent: []string{"test-system"},
|
ExpectedContent: []string{"test-system"},
|
||||||
TestAppFactory: testAppFactory,
|
TestAppFactory: testAppFactory,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
Name: "GET /heartbeat-status - no auth should fail",
|
||||||
|
Method: http.MethodGet,
|
||||||
|
URL: "/api/beszel/heartbeat-status",
|
||||||
|
ExpectedStatus: 401,
|
||||||
|
ExpectedContent: []string{"requires valid"},
|
||||||
|
TestAppFactory: testAppFactory,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "GET /heartbeat-status - with user auth should fail",
|
||||||
|
Method: http.MethodGet,
|
||||||
|
URL: "/api/beszel/heartbeat-status",
|
||||||
|
Headers: map[string]string{
|
||||||
|
"Authorization": userToken,
|
||||||
|
},
|
||||||
|
ExpectedStatus: 403,
|
||||||
|
ExpectedContent: []string{"Requires admin role"},
|
||||||
|
TestAppFactory: testAppFactory,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "GET /heartbeat-status - with admin auth should succeed",
|
||||||
|
Method: http.MethodGet,
|
||||||
|
URL: "/api/beszel/heartbeat-status",
|
||||||
|
Headers: map[string]string{
|
||||||
|
"Authorization": adminUserToken,
|
||||||
|
},
|
||||||
|
ExpectedStatus: 200,
|
||||||
|
ExpectedContent: []string{`"enabled":false`},
|
||||||
|
TestAppFactory: testAppFactory,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "POST /test-heartbeat - with user auth should fail",
|
||||||
|
Method: http.MethodPost,
|
||||||
|
URL: "/api/beszel/test-heartbeat",
|
||||||
|
Headers: map[string]string{
|
||||||
|
"Authorization": userToken,
|
||||||
|
},
|
||||||
|
ExpectedStatus: 403,
|
||||||
|
ExpectedContent: []string{"Requires admin role"},
|
||||||
|
TestAppFactory: testAppFactory,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "POST /test-heartbeat - with admin auth should report disabled state",
|
||||||
|
Method: http.MethodPost,
|
||||||
|
URL: "/api/beszel/test-heartbeat",
|
||||||
|
Headers: map[string]string{
|
||||||
|
"Authorization": adminUserToken,
|
||||||
|
},
|
||||||
|
ExpectedStatus: 200,
|
||||||
|
ExpectedContent: []string{"Heartbeat not configured"},
|
||||||
|
TestAppFactory: testAppFactory,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
Name: "GET /universal-token - no auth should fail",
|
Name: "GET /universal-token - no auth should fail",
|
||||||
Method: http.MethodGet,
|
Method: http.MethodGet,
|
||||||
|
|||||||
205
internal/site/src/components/routes/settings/heartbeat.tsx
Normal file
205
internal/site/src/components/routes/settings/heartbeat.tsx
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
import { t } from "@lingui/core/macro"
|
||||||
|
import { Trans } from "@lingui/react/macro"
|
||||||
|
import { redirectPage } from "@nanostores/router"
|
||||||
|
import clsx from "clsx"
|
||||||
|
import { LoaderCircleIcon, SendIcon } from "lucide-react"
|
||||||
|
import { useEffect, useState } from "react"
|
||||||
|
import { $router } from "@/components/router"
|
||||||
|
import { Badge } from "@/components/ui/badge"
|
||||||
|
import { Button } from "@/components/ui/button"
|
||||||
|
import { Separator } from "@/components/ui/separator"
|
||||||
|
import { toast } from "@/components/ui/use-toast"
|
||||||
|
import { isAdmin, pb } from "@/lib/api"
|
||||||
|
|
||||||
|
interface HeartbeatStatus {
|
||||||
|
enabled: boolean
|
||||||
|
url?: string
|
||||||
|
interval?: number
|
||||||
|
method?: string
|
||||||
|
msg?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export default function HeartbeatSettings() {
|
||||||
|
const [status, setStatus] = useState<HeartbeatStatus | null>(null)
|
||||||
|
const [isLoading, setIsLoading] = useState(true)
|
||||||
|
const [isTesting, setIsTesting] = useState(false)
|
||||||
|
|
||||||
|
if (!isAdmin()) {
|
||||||
|
redirectPage($router, "settings", { name: "general" })
|
||||||
|
}
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
fetchStatus()
|
||||||
|
}, [])
|
||||||
|
|
||||||
|
async function fetchStatus() {
|
||||||
|
try {
|
||||||
|
setIsLoading(true)
|
||||||
|
const res = await pb.send<HeartbeatStatus>("/api/beszel/heartbeat-status", {})
|
||||||
|
setStatus(res)
|
||||||
|
} catch (error: any) {
|
||||||
|
toast({
|
||||||
|
title: t`Error`,
|
||||||
|
description: error.message,
|
||||||
|
variant: "destructive",
|
||||||
|
})
|
||||||
|
} finally {
|
||||||
|
setIsLoading(false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function sendTestHeartbeat() {
|
||||||
|
setIsTesting(true)
|
||||||
|
try {
|
||||||
|
const res = await pb.send<{ err: string | false }>("/api/beszel/test-heartbeat", {
|
||||||
|
method: "POST",
|
||||||
|
})
|
||||||
|
if ("err" in res && !res.err) {
|
||||||
|
toast({
|
||||||
|
title: t`Heartbeat sent successfully`,
|
||||||
|
description: t`Check your monitoring service`,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
toast({
|
||||||
|
title: t`Error`,
|
||||||
|
description: (res.err as string) ?? t`Failed to send heartbeat`,
|
||||||
|
variant: "destructive",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
} catch (error: any) {
|
||||||
|
toast({
|
||||||
|
title: t`Error`,
|
||||||
|
description: error.message,
|
||||||
|
variant: "destructive",
|
||||||
|
})
|
||||||
|
} finally {
|
||||||
|
setIsTesting(false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const TestIcon = isTesting ? LoaderCircleIcon : SendIcon
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div>
|
||||||
|
<div>
|
||||||
|
<h3 className="text-xl font-medium mb-2">
|
||||||
|
<Trans>Heartbeat Monitoring</Trans>
|
||||||
|
</h3>
|
||||||
|
<p className="text-sm text-muted-foreground leading-relaxed">
|
||||||
|
<Trans>
|
||||||
|
Send periodic outbound pings to an external monitoring service so you can monitor Beszel without exposing it
|
||||||
|
to the internet.
|
||||||
|
</Trans>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Separator className="my-4" />
|
||||||
|
|
||||||
|
{isLoading ? (
|
||||||
|
<div className="flex items-center gap-2 text-muted-foreground py-4">
|
||||||
|
<LoaderCircleIcon className="h-4 w-4 animate-spin" />
|
||||||
|
<Trans>Loading heartbeat status...</Trans>
|
||||||
|
</div>
|
||||||
|
) : status?.enabled ? (
|
||||||
|
<div className="space-y-5">
|
||||||
|
<div className="flex items-center gap-2">
|
||||||
|
<Badge variant="success">
|
||||||
|
<Trans>Active</Trans>
|
||||||
|
</Badge>
|
||||||
|
</div>
|
||||||
|
<div className="grid gap-4 sm:grid-cols-2">
|
||||||
|
<ConfigItem label={t`Endpoint URL`} value={status.url ?? ""} mono />
|
||||||
|
<ConfigItem label={t`Interval`} value={`${status.interval}s`} />
|
||||||
|
<ConfigItem label={t`HTTP Method`} value={status.method ?? "POST"} />
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<Separator />
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<h4 className="text-base font-medium mb-1">
|
||||||
|
<Trans>Test heartbeat</Trans>
|
||||||
|
</h4>
|
||||||
|
<p className="text-sm text-muted-foreground leading-relaxed mb-3">
|
||||||
|
<Trans>Send a single heartbeat ping to verify your endpoint is working.</Trans>
|
||||||
|
</p>
|
||||||
|
<Button
|
||||||
|
type="button"
|
||||||
|
variant="outline"
|
||||||
|
className="flex items-center gap-1.5"
|
||||||
|
onClick={sendTestHeartbeat}
|
||||||
|
disabled={isTesting}
|
||||||
|
>
|
||||||
|
<TestIcon className={clsx("h-4 w-4", isTesting && "animate-spin")} />
|
||||||
|
<Trans>Send test heartbeat</Trans>
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<Separator />
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<h4 className="text-base font-medium mb-2">
|
||||||
|
<Trans>Payload format</Trans>
|
||||||
|
</h4>
|
||||||
|
<p className="text-sm text-muted-foreground leading-relaxed mb-2">
|
||||||
|
<Trans>
|
||||||
|
When using POST, each heartbeat includes a JSON payload with system status summary, list of down
|
||||||
|
systems, and triggered alerts.
|
||||||
|
</Trans>
|
||||||
|
</p>
|
||||||
|
<p className="text-sm text-muted-foreground leading-relaxed">
|
||||||
|
<Trans>
|
||||||
|
The overall status is <code className="bg-muted rounded-sm px-1 text-primary">ok</code> when all systems
|
||||||
|
are up, <code className="bg-muted rounded-sm px-1 text-primary">warn</code> when alerts are triggered,
|
||||||
|
and <code className="bg-muted rounded-sm px-1 text-primary">error</code> when any system is down.
|
||||||
|
</Trans>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="grid gap-4">
|
||||||
|
<div>
|
||||||
|
<p className="text-sm text-muted-foreground leading-relaxed mb-3">
|
||||||
|
<Trans>Set the following environment variables on your Beszel hub to enable heartbeat monitoring:</Trans>
|
||||||
|
</p>
|
||||||
|
<div className="grid gap-2.5">
|
||||||
|
<EnvVarItem
|
||||||
|
name="HEARTBEAT_URL"
|
||||||
|
description={t`Endpoint URL to ping (required)`}
|
||||||
|
example="https://uptime.betterstack.com/api/v1/heartbeat/xxxx"
|
||||||
|
/>
|
||||||
|
<EnvVarItem name="HEARTBEAT_INTERVAL" description={t`Seconds between pings (default: 60)`} example="60" />
|
||||||
|
<EnvVarItem
|
||||||
|
name="HEARTBEAT_METHOD"
|
||||||
|
description={t`HTTP method: POST, GET, or HEAD (default: POST)`}
|
||||||
|
example="POST"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p className="text-sm text-muted-foreground leading-relaxed">
|
||||||
|
<Trans>After setting the environment variables, restart your Beszel hub for changes to take effect.</Trans>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
function ConfigItem({ label, value, mono }: { label: string; value: string; mono?: boolean }) {
|
||||||
|
return (
|
||||||
|
<div>
|
||||||
|
<p className="text-sm font-medium mb-0.5">{label}</p>
|
||||||
|
<p className={clsx("text-sm text-muted-foreground break-all", mono && "font-mono")}>{value}</p>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
function EnvVarItem({ name, description, example }: { name: string; description: string; example: string }) {
|
||||||
|
return (
|
||||||
|
<div className="bg-muted/50 rounded-md px-3 py-2 grid gap-1.5">
|
||||||
|
<code className="text-sm font-mono text-primary font-medium leading-tight">{name}</code>
|
||||||
|
<p className="text-sm text-muted-foreground">{description}</p>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
<Trans>Example:</Trans> <code className="font-mono">{example}</code>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
@@ -2,7 +2,14 @@ import { t } from "@lingui/core/macro"
|
|||||||
import { Trans, useLingui } from "@lingui/react/macro"
|
import { Trans, useLingui } from "@lingui/react/macro"
|
||||||
import { useStore } from "@nanostores/react"
|
import { useStore } from "@nanostores/react"
|
||||||
import { getPagePath, redirectPage } from "@nanostores/router"
|
import { getPagePath, redirectPage } from "@nanostores/router"
|
||||||
import { AlertOctagonIcon, BellIcon, FileSlidersIcon, FingerprintIcon, SettingsIcon } from "lucide-react"
|
import {
|
||||||
|
AlertOctagonIcon,
|
||||||
|
BellIcon,
|
||||||
|
FileSlidersIcon,
|
||||||
|
FingerprintIcon,
|
||||||
|
HeartPulseIcon,
|
||||||
|
SettingsIcon,
|
||||||
|
} from "lucide-react"
|
||||||
import { lazy, useEffect } from "react"
|
import { lazy, useEffect } from "react"
|
||||||
import { $router } from "@/components/router.tsx"
|
import { $router } from "@/components/router.tsx"
|
||||||
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card.tsx"
|
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card.tsx"
|
||||||
@@ -18,12 +25,14 @@ const notificationsSettingsImport = () => import("./notifications.tsx")
|
|||||||
const configYamlSettingsImport = () => import("./config-yaml.tsx")
|
const configYamlSettingsImport = () => import("./config-yaml.tsx")
|
||||||
const fingerprintsSettingsImport = () => import("./tokens-fingerprints.tsx")
|
const fingerprintsSettingsImport = () => import("./tokens-fingerprints.tsx")
|
||||||
const alertsHistoryDataTableSettingsImport = () => import("./alerts-history-data-table.tsx")
|
const alertsHistoryDataTableSettingsImport = () => import("./alerts-history-data-table.tsx")
|
||||||
|
const heartbeatSettingsImport = () => import("./heartbeat.tsx")
|
||||||
|
|
||||||
const GeneralSettings = lazy(generalSettingsImport)
|
const GeneralSettings = lazy(generalSettingsImport)
|
||||||
const NotificationsSettings = lazy(notificationsSettingsImport)
|
const NotificationsSettings = lazy(notificationsSettingsImport)
|
||||||
const ConfigYamlSettings = lazy(configYamlSettingsImport)
|
const ConfigYamlSettings = lazy(configYamlSettingsImport)
|
||||||
const FingerprintsSettings = lazy(fingerprintsSettingsImport)
|
const FingerprintsSettings = lazy(fingerprintsSettingsImport)
|
||||||
const AlertsHistoryDataTableSettings = lazy(alertsHistoryDataTableSettingsImport)
|
const AlertsHistoryDataTableSettings = lazy(alertsHistoryDataTableSettingsImport)
|
||||||
|
const HeartbeatSettings = lazy(heartbeatSettingsImport)
|
||||||
|
|
||||||
export async function saveSettings(newSettings: Partial<UserSettings>) {
|
export async function saveSettings(newSettings: Partial<UserSettings>) {
|
||||||
try {
|
try {
|
||||||
@@ -88,6 +97,13 @@ export default function SettingsLayout() {
|
|||||||
admin: true,
|
admin: true,
|
||||||
preload: configYamlSettingsImport,
|
preload: configYamlSettingsImport,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
title: t`Heartbeat`,
|
||||||
|
href: getPagePath($router, "settings", { name: "heartbeat" }),
|
||||||
|
icon: HeartPulseIcon,
|
||||||
|
admin: true,
|
||||||
|
preload: heartbeatSettingsImport,
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
const page = useStore($router)
|
const page = useStore($router)
|
||||||
@@ -141,5 +157,7 @@ function SettingsContent({ name }: { name: string }) {
|
|||||||
return <FingerprintsSettings />
|
return <FingerprintsSettings />
|
||||||
case "alert-history":
|
case "alert-history":
|
||||||
return <AlertsHistoryDataTableSettings />
|
return <AlertsHistoryDataTableSettings />
|
||||||
|
case "heartbeat":
|
||||||
|
return <HeartbeatSettings />
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
## Unreleased
|
||||||
|
|
||||||
|
- Add outbound heartbeat monitoring to external services (BetterStack, Uptime Kuma, Healthchecks.io, etc.) with system status summary payload. Configured via `BESZEL_HUB_HEARTBEAT_URL`, `BESZEL_HUB_HEARTBEAT_INTERVAL`, and `BESZEL_HUB_HEARTBEAT_METHOD` environment variables.
|
||||||
|
|
||||||
|
- Add Heartbeat settings page to the admin UI with status display, configuration reference, and test button.
|
||||||
|
|
||||||
## 0.18.3
|
## 0.18.3
|
||||||
|
|
||||||
- Add experimental sysfs AMD GPU collector. (#737, #1569)
|
- Add experimental sysfs AMD GPU collector. (#737, #1569)
|
||||||
|
|||||||
Reference in New Issue
Block a user