This commit is contained in:
henrygd
2026-04-23 01:13:01 -04:00
parent 5fc774666f
commit 0d440e5fb9
8 changed files with 390 additions and 127 deletions

View File

@@ -213,11 +213,14 @@ func (h *GetSystemdInfoHandler) Handle(hctx *HandlerContext) error {
// SyncNetworkProbesHandler handles network probe sync requests received by the agent.
type SyncNetworkProbesHandler struct{}
// Handle decodes the CBOR payload in hctx.Request.Data, hands it to the agent's
// probe manager, and replies on the same request ID. A decode failure or an
// error from the probe manager is returned to the caller without a reply.
func (h *SyncNetworkProbesHandler) Handle(hctx *HandlerContext) error {
var configs []probe.Config
if err := cbor.Unmarshal(hctx.Request.Data, &configs); err != nil {
var req probe.SyncRequest
if err := cbor.Unmarshal(hctx.Request.Data, &req); err != nil {
return err
}
hctx.Agent.probeManager.SyncProbes(configs)
slog.Info("network probes synced", "count", len(configs))
return hctx.SendResponse("ok", hctx.RequestID)
// ApplySync dispatches on req.Action and may return an immediate probe result.
resp, err := hctx.Agent.probeManager.ApplySync(req)
if err != nil {
return err
}
slog.Info("network probes synced", "action", req.Action)
return hctx.SendResponse(resp, hctx.RequestID)
}

View File

@@ -1,6 +1,7 @@
package agent
import (
"errors"
"fmt"
"math"
"net"
@@ -77,6 +78,14 @@ func newProbeManager() *ProbeManager {
}
}
// newProbeTask builds a probe task for config with its own cancel channel
// and an empty sample slice that has room for 64 samples before growing.
func newProbeTask(config probe.Config) *probeTask {
	task := new(probeTask)
	task.config = config
	task.cancel = make(chan struct{})
	task.samples = make([]probeSample, 0, 64)
	return task
}
// newProbeAggregate initializes an aggregate with an unset minimum value.
func newProbeAggregate() probeAggregate {
return probeAggregate{minMs: math.MaxFloat64}
@@ -175,18 +184,94 @@ func (pm *ProbeManager) SyncProbes(configs []probe.Config) {
}
}
// Start new probes (skip existing ones with same key)
// Start new probes and restart tasks whose config changed.
for key, cfg := range newKeys {
if _, exists := pm.probes[key]; exists {
task, exists := pm.probes[key]
if exists && task.config == cfg {
continue
}
task := &probeTask{
config: cfg,
cancel: make(chan struct{}),
samples: make([]probeSample, 0, 64),
if exists {
close(task.cancel)
}
task = newProbeTask(cfg)
pm.probes[key] = task
go pm.runProbe(task)
go pm.runProbe(task, true)
}
}
// ApplySync applies a full or incremental probe sync request.
//
// Replace hands req.Configs to SyncProbes, upsert forwards req.Config and
// req.RunNow to UpsertProbe, and delete removes the probe named by
// req.Config.ID. Only an upsert that produced a result fills the response;
// every other successful path returns the zero SyncResponse. A delete without
// an ID and an unrecognised action are reported as errors.
func (pm *ProbeManager) ApplySync(req probe.SyncRequest) (probe.SyncResponse, error) {
	var resp probe.SyncResponse

	switch req.Action {
	case probe.SyncActionReplace:
		pm.SyncProbes(req.Configs)

	case probe.SyncActionUpsert:
		result, err := pm.UpsertProbe(req.Config, req.RunNow)
		if err != nil {
			return resp, err
		}
		if result != nil {
			resp.Result = *result
		}

	case probe.SyncActionDelete:
		if req.Config.ID == "" {
			return resp, errors.New("missing probe ID for delete action")
		}
		pm.DeleteProbe(req.Config.ID)

	default:
		return resp, fmt.Errorf("unknown probe sync action: %d", req.Action)
	}

	return resp, nil
}
// UpsertProbe creates or replaces a single probe task.
//
// A config without an ID is rejected. When a task with the same ID and an
// identical config is already registered it is left running; with runNow it is
// probed once synchronously and that result is returned. Otherwise any
// existing task is cancelled, a new task is registered, and its loop is
// started in a goroutine. With runNow the first probe runs synchronously here,
// so the loop is started without its own immediate run.
//
// The returned result is nil when runNow is false or when the immediate run
// produced no data.
func (pm *ProbeManager) UpsertProbe(config probe.Config, runNow bool) (*probe.Result, error) {
	if config.ID == "" {
		return nil, errors.New("missing probe ID")
	}

	pm.mu.Lock()
	current, found := pm.probes[config.ID]
	if found && current.config == config {
		// Unchanged config: keep the running task, release the lock before probing.
		pm.mu.Unlock()
		if runNow {
			return pm.runProbeNow(current), nil
		}
		return nil, nil
	}
	if found {
		close(current.cancel)
	}
	fresh := newProbeTask(config)
	pm.probes[config.ID] = fresh
	pm.mu.Unlock()

	if !runNow {
		go pm.runProbe(fresh, true)
		return nil, nil
	}

	// Probe synchronously first so the caller gets a result, then start the loop.
	result := pm.runProbeNow(fresh)
	go pm.runProbe(fresh, false)
	return result, nil
}
// DeleteProbe stops and removes a single probe task.
// An empty or unknown id is a no-op.
func (pm *ProbeManager) DeleteProbe(id string) {
	if id == "" {
		return
	}

	pm.mu.Lock()
	defer pm.mu.Unlock()

	task, ok := pm.probes[id]
	if !ok {
		return
	}
	// Closing cancel signals the task's run loop to exit.
	close(task.cancel)
	delete(pm.probes, id)
}
@@ -201,28 +286,12 @@ func (pm *ProbeManager) GetResults(durationMs uint16) map[string]probe.Result {
for _, task := range pm.probes {
task.mu.Lock()
agg := task.aggregateLocked(duration, now)
hourAgg := task.aggregateLocked(time.Hour, now)
result, ok := task.resultLocked(duration, now)
task.mu.Unlock()
if !agg.hasData() {
if !ok {
continue
}
result := agg.result()
hourAvg := hourAgg.avgResponse()
hourLoss := hourAgg.lossPercentage()
if hourAgg.successCount > 0 {
result = probe.Result{
result[0],
hourAvg,
math.Round(hourAgg.minMs*100) / 100,
math.Round(hourAgg.maxMs*100) / 100,
hourLoss,
}
} else {
result = probe.Result{result[0], hourAvg, 0, 0, hourLoss}
}
results[task.config.ID] = result
}
@@ -240,26 +309,39 @@ func (pm *ProbeManager) Stop() {
}
// runProbe executes a single probe task in a loop until task.cancel fires.
// The period is task.config.Interval seconds; anything below one second
// falls back to 10 seconds. When runImmediately is true the probe also runs
// once before the first tick.
func (pm *ProbeManager) runProbe(task *probeTask) {
func (pm *ProbeManager) runProbe(task *probeTask, runImmediately bool) {
interval := time.Duration(task.config.Interval) * time.Second
if interval < time.Second {
interval = 10 * time.Second
}
ticker := time.Tick(interval)
ticker := time.NewTicker(interval)
// release the ticker when the loop exits
defer ticker.Stop()
// Run immediately on start
if runImmediately {
pm.executeProbe(task)
}
for {
select {
case <-task.cancel:
return
case <-ticker:
case <-ticker.C:
pm.executeProbe(task)
}
}
}
// runProbeNow executes the task's probe once, synchronously, and returns the
// task's result over the last minute. It returns nil when that window holds
// no data.
func (pm *ProbeManager) runProbeNow(task *probeTask) *probe.Result {
	pm.executeProbe(task)

	task.mu.Lock()
	defer task.mu.Unlock()

	if snapshot, ok := task.resultLocked(time.Minute, time.Now()); ok {
		return &snapshot
	}
	return nil
}
// aggregateLocked collects probe data for the requested time window.
func (task *probeTask) aggregateLocked(duration time.Duration, now time.Time) probeAggregate {
cutoff := now.Add(-duration)
@@ -270,6 +352,28 @@ func (task *probeTask) aggregateLocked(duration time.Duration, now time.Time) pr
return aggregateBucketsSince(task.buckets[:], cutoff, now)
}
// resultLocked builds the probe.Result for this task.
//
// Element 0 comes from the aggregate over the requested duration; elements
// 1-4 (average, min, max, loss) come from the aggregate over the last hour.
// Min and max are rounded to two decimals and reported as 0 when the hour
// holds no successful sample. The second return value is false when the
// requested window has no data.
//
// The "Locked" suffix suggests the caller must hold task.mu; the visible
// callers do so.
func (task *probeTask) resultLocked(duration time.Duration, now time.Time) (probe.Result, bool) {
	window := task.aggregateLocked(duration, now)
	lastHour := task.aggregateLocked(time.Hour, now)
	if !window.hasData() {
		return nil, false
	}

	windowResult := window.result()
	avg1h := lastHour.avgResponse()
	loss1h := lastHour.lossPercentage()

	var min1h, max1h float64
	if lastHour.successCount > 0 {
		min1h = math.Round(lastHour.minMs*100) / 100
		max1h = math.Round(lastHour.maxMs*100) / 100
	}
	return probe.Result{windowResult[0], avg1h, min1h, max1h, loss1h}, true
}
// aggregateSamplesSince aggregates raw samples newer than the cutoff.
func aggregateSamplesSince(samples []probeSample, cutoff time.Time) probeAggregate {
agg := newProbeAggregate()
@@ -374,6 +478,9 @@ func probeTCP(target string, port uint16) float64 {
// probeHTTP measures HTTP GET request response. Returns -1 on failure.
func probeHTTP(client *http.Client, url string) float64 {
if client == nil {
client = http.DefaultClient
}
start := time.Now()
resp, err := client.Get(url)
if err != nil {

View File

@@ -117,8 +117,8 @@ func TestProbeConfigResultKeyUsesSyncedID(t *testing.T) {
}
func TestProbeManagerSyncProbesSkipsConfigsWithoutStableID(t *testing.T) {
validCfg := probe.Config{ID: "probe-1", Target: "https://example.com", Protocol: "http", Interval: 10}
invalidCfg := probe.Config{Target: "1.1.1.1", Protocol: "icmp", Interval: 10}
validCfg := probe.Config{ID: "probe-1", Target: "ignored", Protocol: "noop", Interval: 10}
invalidCfg := probe.Config{Target: "ignored", Protocol: "noop", Interval: 10}
pm := newProbeManager()
pm.SyncProbes([]probe.Config{validCfg, invalidCfg})
@@ -131,8 +131,8 @@ func TestProbeManagerSyncProbesSkipsConfigsWithoutStableID(t *testing.T) {
}
func TestProbeManagerSyncProbesStopsRemovedTasksButKeepsExisting(t *testing.T) {
keepCfg := probe.Config{ID: "probe-1", Target: "https://example.com", Protocol: "http", Interval: 10}
removeCfg := probe.Config{ID: "probe-2", Target: "1.1.1.1", Protocol: "icmp", Interval: 10}
keepCfg := probe.Config{ID: "probe-1", Target: "ignored", Protocol: "noop", Interval: 10}
removeCfg := probe.Config{ID: "probe-2", Target: "ignored", Protocol: "noop", Interval: 10}
keptTask := &probeTask{config: keepCfg, cancel: make(chan struct{})}
removedTask := &probeTask{config: removeCfg, cancel: make(chan struct{})}
@@ -162,6 +162,83 @@ func TestProbeManagerSyncProbesStopsRemovedTasksButKeepsExisting(t *testing.T) {
}
}
// TestProbeManagerSyncProbesRestartsChangedConfig checks that syncing a config
// with the same ID but different settings swaps in a new task and cancels the
// old one.
func TestProbeManagerSyncProbesRestartsChangedConfig(t *testing.T) {
	before := probe.Config{ID: "probe-1", Target: "ignored-a", Protocol: "noop", Interval: 10}
	after := probe.Config{ID: "probe-1", Target: "ignored-b", Protocol: "noop", Interval: 10}

	stale := &probeTask{config: before, cancel: make(chan struct{})}
	pm := &ProbeManager{probes: map[string]*probeTask{before.ID: stale}}

	pm.SyncProbes([]probe.Config{after})
	defer pm.Stop()

	current := pm.probes[after.ID]
	assert.NotSame(t, stale, current)
	assert.Equal(t, after, current.config)

	// A closed cancel channel is immediately readable; an open one hits default.
	select {
	case <-stale.cancel:
	default:
		t.Fatal("expected changed probe task to be cancelled")
	}
}
// TestProbeManagerApplySyncUpsertRunsImmediatelyAndReturnsResult checks that an
// upsert with RunNow probes a local HTTP server once, returns a five-element
// result with zero loss, and records exactly one sample on the new task.
func TestProbeManagerApplySyncUpsertRunsImmediatelyAndReturnsResult(t *testing.T) {
	noContent := func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusNoContent)
	}
	server := httptest.NewServer(http.HandlerFunc(noContent))
	defer server.Close()

	pm := &ProbeManager{
		probes:     map[string]*probeTask{},
		httpClient: server.Client(),
	}

	request := probe.SyncRequest{
		Action: probe.SyncActionUpsert,
		Config: probe.Config{ID: "probe-1", Target: server.URL, Protocol: "http", Interval: 10},
		RunNow: true,
	}
	resp, err := pm.ApplySync(request)
	defer pm.Stop()
	require.NoError(t, err)

	require.Len(t, resp.Result, 5)
	assert.GreaterOrEqual(t, resp.Result[0], 0.0)
	assert.Equal(t, 0.0, resp.Result[4])

	created := pm.probes["probe-1"]
	require.NotNil(t, created)

	created.mu.Lock()
	defer created.mu.Unlock()
	require.Len(t, created.samples, 1)
}
// TestProbeManagerApplySyncDeleteRemovesTask checks that a delete action drops
// the task from the manager and cancels it.
func TestProbeManagerApplySyncDeleteRemovesTask(t *testing.T) {
	cfg := probe.Config{ID: "probe-1", Target: "1.1.1.1", Protocol: "icmp", Interval: 10}
	existing := &probeTask{config: cfg, cancel: make(chan struct{})}
	pm := &ProbeManager{probes: map[string]*probeTask{cfg.ID: existing}}

	request := probe.SyncRequest{
		Action: probe.SyncActionDelete,
		Config: probe.Config{ID: cfg.ID},
	}
	_, err := pm.ApplySync(request)
	require.NoError(t, err)

	_, stillThere := pm.probes[cfg.ID]
	assert.False(t, stillThere)

	// A closed cancel channel is immediately readable; an open one hits default.
	select {
	case <-existing.cancel:
	default:
		t.Fatal("expected deleted probe task to be cancelled")
	}
}
func TestProbeHTTP(t *testing.T) {
t.Run("success", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

View File

@@ -1,5 +1,16 @@
package probe
// SyncAction identifies the kind of probe sync operation a request carries.
type SyncAction uint8

// The numeric values are spelled out because SyncAction is a CBOR-encoded
// request field; reordering the declarations must not renumber them.
const (
	// SyncActionReplace indicates a full sync where the provided configs should replace all existing probes for the system.
	SyncActionReplace SyncAction = 0
	// SyncActionUpsert indicates an incremental sync where the provided config should be added or updated.
	SyncActionUpsert SyncAction = 1
	// SyncActionDelete indicates an incremental sync where the provided config should be removed.
	SyncActionDelete SyncAction = 2
)
// Config defines a network probe task sent from hub to agent.
type Config struct {
// ID is the stable network_probes record ID generated by the hub.
@@ -10,6 +21,19 @@ type Config struct {
Interval uint16 `cbor:"4,keyasint"` // seconds
}
// SyncRequest defines an incremental or full probe sync request sent to the agent.
// Action selects which of the other fields the agent consults.
type SyncRequest struct {
// Action is the sync operation to perform.
Action SyncAction `cbor:"0,keyasint"`
// Config is the single probe targeted by an upsert or delete action.
Config Config `cbor:"1,keyasint,omitempty"`
// Configs is the complete probe list used by a replace action.
Configs []Config `cbor:"2,keyasint,omitempty"`
// RunNow asks an upsert to run the probe once immediately and return its result.
RunNow bool `cbor:"3,keyasint,omitempty"`
}
// SyncResponse returns the immediate result for an upsert when requested.
type SyncResponse struct {
// Result is left empty when no immediate result was produced.
Result Result `cbor:"0,keyasint,omitempty"`
}
// Result holds aggregated probe results for a single target.
//
// 0: avg response in ms
@@ -22,3 +46,11 @@ type Config struct {
//
// 4: packet loss percentage over the last hour (0-100)
type Result []float64

// Get returns the value at the specified index or 0 if the index is out of range.
// Both ends are checked, so a negative index yields 0 instead of panicking.
func (r Result) Get(index int) float64 {
	if index >= 0 && index < len(r) {
		return r[index]
	}
	return 0
}

View File

@@ -15,48 +15,106 @@ func generateProbeID(systemId string, config probe.Config) string {
return systems.MakeStableHashId(systemId, config.Protocol, config.Target, portStr, intervalStr)
}
func bindNetworkProbesEvents(h *Hub) {
// bindNetworkProbesEvents keeps probe records and agent probe state in sync.
// It registers hooks on the "network_probes" collection for create, update
// and delete. Failures talking to the agent are logged as warnings and do not
// fail the originating request.
func bindNetworkProbesEvents(hub *Hub) {
// on create, make sure the id is set to a stable hash
h.OnRecordCreate("network_probes").BindFunc(func(e *core.RecordEvent) error {
hub.OnRecordCreate("network_probes").BindFunc(func(e *core.RecordEvent) error {
systemID := e.Record.GetString("system")
config := &probe.Config{
Target: e.Record.GetString("target"),
Protocol: e.Record.GetString("protocol"),
Port: uint16(e.Record.GetInt("port")),
Interval: uint16(e.Record.GetInt("interval")),
}
config := probeConfigFromRecord(e.Record)
id := generateProbeID(systemID, *config)
e.Record.Set("id", id)
return e.Next()
})
// sync probe to agent on creation
h.OnRecordAfterCreateSuccess("network_probes").BindFunc(func(e *core.RecordEvent) error {
systemID := e.Record.GetString("system")
h.syncProbesToAgent(systemID)
return e.Next()
// sync probe to agent on creation and persist the first result immediately when available
hub.OnRecordCreateRequest("network_probes").BindFunc(func(e *core.RecordRequestEvent) error {
// run the rest of the hook chain first; only contact the agent if it succeeded
err := e.Next()
if err != nil {
return err
}
// disabled probes are not sent to the agent
if !e.Record.GetBool("enabled") {
return nil
}
result, err := hub.upsertNetworkProbe(e.Record, true)
if err != nil {
hub.Logger().Warn("failed to sync probe to agent", "system", e.Record.GetString("system"), "probe", e.Record.Id, "err", err)
return nil
}
// no immediate result came back, so there is nothing to persist
if result == nil {
return nil
}
setProbeResultFields(e.Record, *result)
if err := e.App.SaveNoValidate(e.Record); err != nil {
hub.Logger().Warn("failed to save initial probe result", "system", e.Record.GetString("system"), "probe", e.Record.Id, "err", err)
}
return nil
})
// sync probe to agent on update: upsert when enabled, remove from the agent when disabled
hub.OnRecordUpdateRequest("network_probes").BindFunc(func(e *core.RecordRequestEvent) error {
err := e.Next()
if err != nil {
return err
}
if e.Record.GetBool("enabled") {
_, err = hub.upsertNetworkProbe(e.Record, false)
} else {
err = hub.deleteNetworkProbe(e.Record)
}
if err != nil {
hub.Logger().Warn("failed to sync updated probe to agent", "system", e.Record.GetString("system"), "probe", e.Record.Id, "err", err)
}
return nil
})
// sync probe to agent on delete
h.OnRecordAfterDeleteSuccess("network_probes").BindFunc(func(e *core.RecordEvent) error {
systemID := e.Record.GetString("system")
h.syncProbesToAgent(systemID)
return e.Next()
hub.OnRecordDeleteRequest("network_probes").BindFunc(func(e *core.RecordRequestEvent) error {
err := e.Next()
if err != nil {
return err
}
if err := hub.deleteNetworkProbe(e.Record); err != nil {
hub.Logger().Warn("failed to delete probe on agent", "system", e.Record.GetString("system"), "probe", e.Record.Id, "err", err)
}
return nil
})
// TODO: if enabled changes, sync to agent
}
// syncProbesToAgent fetches enabled probes for a system and sends them to the agent.
func (h *Hub) syncProbesToAgent(systemID string) {
// probeConfigFromRecord builds a probe config from a network_probes record.
// The record's "port" and "interval" integers are converted to uint16.
func probeConfigFromRecord(record *core.Record) *probe.Config {
	cfg := probe.Config{ID: record.Id}
	cfg.Target = record.GetString("target")
	cfg.Protocol = record.GetString("protocol")
	cfg.Port = uint16(record.GetInt("port"))
	cfg.Interval = uint16(record.GetInt("interval"))
	return &cfg
}
// setProbeResultFields stores the latest probe result values on the record.
// Each record field takes the result element at the matching index; elements
// missing from a short result are read through Result.Get.
func setProbeResultFields(record *core.Record, result probe.Result) {
	fields := [...]string{"res", "resAvg1h", "resMin1h", "resMax1h", "loss1h"}
	for i, field := range fields {
		record.Set(field, result.Get(i))
	}
}
// upsertNetworkProbe applies the record's probe config to the target system.
// The system is looked up from the record's "system" field and runNow is
// forwarded with the upsert. A failed system lookup is returned as the error.
func (h *Hub) upsertNetworkProbe(record *core.Record, runNow bool) (*probe.Result, error) {
systemID := record.GetString("system")
system, err := h.sm.GetSystem(systemID)
if err != nil {
return
return nil, err
}
configs := h.sm.GetProbeConfigsForSystem(systemID)
go func() {
if err := system.SyncNetworkProbes(configs); err != nil {
h.Logger().Warn("failed to sync probes to agent", "system", systemID, "err", err)
}
}()
return system.UpsertNetworkProbe(*probeConfigFromRecord(record), runNow)
}
// deleteNetworkProbe removes the record's probe from the target system.
// The system is resolved from the record's "system" field; a failed lookup
// is returned unchanged.
func (h *Hub) deleteNetworkProbe(record *core.Record) error {
	system, err := h.sm.GetSystem(record.GetString("system"))
	if err != nil {
		return err
	}
	return system.DeleteNetworkProbe(record.Id)
}

View File

@@ -371,21 +371,21 @@ func updateNetworkProbesRecords(app core.App, data map[string]probe.Result, syst
var record *core.Record
record, err = app.FindRecordById(collectionName, id)
if err == nil {
record.Set("res", probeMetric(values, 0))
record.Set("resAvg1h", probeMetric(values, 1))
record.Set("resMin1h", probeMetric(values, 2))
record.Set("resMax1h", probeMetric(values, 3))
record.Set("loss1h", probeMetric(values, 4))
record.Set("res", values.Get(0))
record.Set("resAvg1h", values.Get(1))
record.Set("resMin1h", values.Get(2))
record.Set("resMax1h", values.Get(3))
record.Set("loss1h", values.Get(4))
err = app.SaveNoValidate(record)
}
default:
_, err = updateQuery.Bind(dbx.Params{
"id": id,
"res": probeMetric(values, 0),
"resAvg1h": probeMetric(values, 1),
"resMin1h": probeMetric(values, 2),
"resMax1h": probeMetric(values, 3),
"loss1h": probeMetric(values, 4),
"res": values.Get(0),
"resAvg1h": values.Get(1),
"resMin1h": values.Get(2),
"resMax1h": values.Get(3),
"loss1h": values.Get(4),
"updated": nowString,
}).Execute()
}
@@ -397,13 +397,6 @@ func updateNetworkProbesRecords(app core.App, data map[string]probe.Result, syst
return nil
}
func probeMetric(values probe.Result, index int) float64 {
if index < len(values) {
return values[index]
}
return 0
}
// createContainerRecords creates container records
func createContainerRecords(app core.App, data []*container.Stats, systemId string) error {
if len(data) == 0 {

View File

@@ -10,48 +10,39 @@ import (
// SyncNetworkProbes sends the given probe configurations to the agent
// and returns any error from the request.
func (sys *System) SyncNetworkProbes(configs []probe.Config) error {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
var result string
return sys.request(ctx, common.SyncNetworkProbes, configs, &result)
_, err := sys.syncNetworkProbes(probe.SyncRequest{Action: probe.SyncActionReplace, Configs: configs})
return err
}
// FetchNetworkProbeResults fetches probe results from the agent.
// func (sys *System) FetchNetworkProbeResults() (map[string]probe.Result, error) {
// ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
// defer cancel()
// var results map[string]probe.Result
// err := sys.request(ctx, common.GetNetworkProbeResults, nil, &results)
// return results, err
// }
// UpsertNetworkProbe sends a single probe configuration change to the agent.
// With runNow the agent is asked to run the probe once and report back. The
// returned result is nil when the request fails or the response carries no
// result values.
func (sys *System) UpsertNetworkProbe(config probe.Config, runNow bool) (*probe.Result, error) {
	req := probe.SyncRequest{
		Action: probe.SyncActionUpsert,
		Config: config,
		RunNow: runNow,
	}

	resp, err := sys.syncNetworkProbes(req)
	switch {
	case err != nil:
		return nil, err
	case len(resp.Result) == 0:
		return nil, nil
	}

	immediate := resp.Result
	return &immediate, nil
}
// hasEnabledProbes returns true if this system has any enabled network probes.
// func (sys *System) hasEnabledProbes() bool {
// count, err := sys.manager.hub.CountRecords("network_probes",
// dbx.NewExp("system = {:system} AND enabled = true", dbx.Params{"system": sys.Id}))
// return err == nil && count > 0
// }
// DeleteNetworkProbe removes a single probe task from the agent.
// Only the probe ID is sent; the response body is ignored.
func (sys *System) DeleteNetworkProbe(id string) error {
	req := probe.SyncRequest{Action: probe.SyncActionDelete}
	req.Config.ID = id

	_, err := sys.syncNetworkProbes(req)
	return err
}
// fetchAndSaveProbeResults fetches probe results and saves them to the database.
// func (sys *System) fetchAndSaveProbeResults() {
// hub := sys.manager.hub
// results, err := sys.FetchNetworkProbeResults()
// if err != nil || len(results) == 0 {
// return
// }
// collection, err := hub.FindCachedCollectionByNameOrId("network_probe_stats")
// if err != nil {
// return
// }
// record := core.NewRecord(collection)
// record.Set("system", sys.Id)
// record.Set("stats", results)
// record.Set("type", "1m")
// if err := hub.SaveNoValidate(record); err != nil {
// hub.Logger().Warn("failed to save probe stats", "system", sys.Id, "err", err)
// }
// }
// syncNetworkProbes sends req to the agent as a SyncNetworkProbes request and
// returns the decoded response. The request is bounded by a 5 second timeout.
func (sys *System) syncNetworkProbes(req probe.SyncRequest) (probe.SyncResponse, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	var resp probe.SyncResponse
	// Finish the request before reading resp. Writing
	// `return resp, sys.request(..., &resp)` would leave the order between
	// copying resp and the call that fills it unspecified by the language.
	err := sys.request(ctx, common.SyncNetworkProbes, req, &resp)
	return resp, err
}

View File

@@ -66,8 +66,10 @@ export function AddProbeDialog({ systemId }: { systemId?: string }) {
interval: probeInterval,
enabled: true,
})
if (name && name !== target) {
if (name) {
payload.name = name
} else if (targetName !== target) {
payload.name = targetName
}
await pb.collection("network_probes").create(payload)
resetForm()