mirror of
https://github.com/henrygd/beszel.git
synced 2025-12-17 02:36:17 +01:00
add one minute chart + refactor rpc
- add one minute charts - update disk io to use bytes - update hub and agent connection interfaces / handlers to be more flexible - change agent cache to use cache time instead of session id - refactor collection of metrics which require deltas to track separately per cache time
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package hub
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net"
|
||||
"net/http"
|
||||
@@ -93,7 +94,7 @@ func (acr *agentConnectRequest) agentConnect() (err error) {
|
||||
// verifyWsConn verifies the WebSocket connection using the agent's fingerprint and
|
||||
// SSH key signature, then adds the system to the system manager.
|
||||
func (acr *agentConnectRequest) verifyWsConn(conn *gws.Conn, fpRecords []ws.FingerprintRecord) (err error) {
|
||||
wsConn := ws.NewWsConnection(conn)
|
||||
wsConn := ws.NewWsConnection(conn, acr.agentSemVer)
|
||||
|
||||
// must set wsConn in connection store before the read loop
|
||||
conn.Session().Store("wsConn", wsConn)
|
||||
@@ -112,7 +113,7 @@ func (acr *agentConnectRequest) verifyWsConn(conn *gws.Conn, fpRecords []ws.Fing
|
||||
return err
|
||||
}
|
||||
|
||||
agentFingerprint, err := wsConn.GetFingerprint(acr.token, signer, acr.isUniversalToken)
|
||||
agentFingerprint, err := wsConn.GetFingerprint(context.Background(), acr.token, signer, acr.isUniversalToken)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
"github.com/henrygd/beszel/internal/hub/ws"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
@@ -107,7 +108,7 @@ func (sys *System) update() error {
|
||||
sys.handlePaused()
|
||||
return nil
|
||||
}
|
||||
data, err := sys.fetchDataFromAgent()
|
||||
data, err := sys.fetchDataFromAgent(common.DataRequestOptions{CacheTimeMs: uint16(interval)})
|
||||
if err == nil {
|
||||
_, err = sys.createRecords(data)
|
||||
}
|
||||
@@ -209,13 +210,13 @@ func (sys *System) getContext() (context.Context, context.CancelFunc) {
|
||||
|
||||
// fetchDataFromAgent attempts to fetch data from the agent,
|
||||
// prioritizing WebSocket if available.
|
||||
func (sys *System) fetchDataFromAgent() (*system.CombinedData, error) {
|
||||
func (sys *System) fetchDataFromAgent(options common.DataRequestOptions) (*system.CombinedData, error) {
|
||||
if sys.data == nil {
|
||||
sys.data = &system.CombinedData{}
|
||||
}
|
||||
|
||||
if sys.WsConn != nil && sys.WsConn.IsConnected() {
|
||||
wsData, err := sys.fetchDataViaWebSocket()
|
||||
wsData, err := sys.fetchDataViaWebSocket(options)
|
||||
if err == nil {
|
||||
return wsData, nil
|
||||
}
|
||||
@@ -223,18 +224,18 @@ func (sys *System) fetchDataFromAgent() (*system.CombinedData, error) {
|
||||
sys.closeWebSocketConnection()
|
||||
}
|
||||
|
||||
sshData, err := sys.fetchDataViaSSH()
|
||||
sshData, err := sys.fetchDataViaSSH(options)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return sshData, nil
|
||||
}
|
||||
|
||||
func (sys *System) fetchDataViaWebSocket() (*system.CombinedData, error) {
|
||||
func (sys *System) fetchDataViaWebSocket(options common.DataRequestOptions) (*system.CombinedData, error) {
|
||||
if sys.WsConn == nil || !sys.WsConn.IsConnected() {
|
||||
return nil, errors.New("no websocket connection")
|
||||
}
|
||||
err := sys.WsConn.RequestSystemData(sys.data)
|
||||
err := sys.WsConn.RequestSystemData(context.Background(), sys.data, options)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -244,7 +245,7 @@ func (sys *System) fetchDataViaWebSocket() (*system.CombinedData, error) {
|
||||
// fetchDataViaSSH handles fetching data using SSH.
|
||||
// This function encapsulates the original SSH logic.
|
||||
// It updates sys.data directly upon successful fetch.
|
||||
func (sys *System) fetchDataViaSSH() (*system.CombinedData, error) {
|
||||
func (sys *System) fetchDataViaSSH(options common.DataRequestOptions) (*system.CombinedData, error) {
|
||||
maxRetries := 1
|
||||
for attempt := 0; attempt <= maxRetries; attempt++ {
|
||||
if sys.client == nil || sys.Status == down {
|
||||
@@ -269,12 +270,31 @@ func (sys *System) fetchDataViaSSH() (*system.CombinedData, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stdin, stdinErr := session.StdinPipe()
|
||||
if err := session.Shell(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
*sys.data = system.CombinedData{}
|
||||
|
||||
if sys.agentVersion.GTE(beszel.MinVersionAgentResponse) && stdinErr == nil {
|
||||
req := common.HubRequest[any]{Action: common.GetData, Data: options}
|
||||
_ = cbor.NewEncoder(stdin).Encode(req)
|
||||
// Close write side to signal end of request
|
||||
_ = stdin.Close()
|
||||
|
||||
var resp common.AgentResponse
|
||||
if decErr := cbor.NewDecoder(stdout).Decode(&resp); decErr == nil && resp.SystemData != nil {
|
||||
*sys.data = *resp.SystemData
|
||||
// wait for the session to complete
|
||||
if err := session.Wait(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return sys.data, nil
|
||||
}
|
||||
// If decoding failed, fall back below
|
||||
}
|
||||
|
||||
if sys.agentVersion.GTE(beszel.MinVersionCbor) {
|
||||
err = cbor.NewDecoder(stdout).Decode(sys.data)
|
||||
} else {
|
||||
@@ -379,11 +399,11 @@ func extractAgentVersion(versionString string) (semver.Version, error) {
|
||||
}
|
||||
|
||||
// getJitter returns a channel that will be triggered after a random delay
|
||||
// between 40% and 90% of the interval.
|
||||
// between 51% and 95% of the interval.
|
||||
// This is used to stagger the initial WebSocket connections to prevent clustering.
|
||||
func getJitter() <-chan time.Time {
|
||||
minPercent := 40
|
||||
maxPercent := 90
|
||||
minPercent := 51
|
||||
maxPercent := 95
|
||||
jitterRange := maxPercent - minPercent
|
||||
msDelay := (interval * minPercent / 100) + rand.Intn(interval*jitterRange/100)
|
||||
return time.After(time.Duration(msDelay) * time.Millisecond)
|
||||
|
||||
@@ -106,6 +106,8 @@ func (sm *SystemManager) bindEventHooks() {
|
||||
sm.hub.OnRecordAfterUpdateSuccess("systems").BindFunc(sm.onRecordAfterUpdateSuccess)
|
||||
sm.hub.OnRecordAfterDeleteSuccess("systems").BindFunc(sm.onRecordAfterDeleteSuccess)
|
||||
sm.hub.OnRecordAfterUpdateSuccess("fingerprints").BindFunc(sm.onTokenRotated)
|
||||
sm.hub.OnRealtimeSubscribeRequest().BindFunc(sm.onRealtimeSubscribeRequest)
|
||||
sm.hub.OnRealtimeConnectRequest().BindFunc(sm.onRealtimeConnectRequest)
|
||||
}
|
||||
|
||||
// onTokenRotated handles fingerprint token rotation events.
|
||||
|
||||
187
internal/hub/systems/system_realtime.go
Normal file
187
internal/hub/systems/system_realtime.go
Normal file
@@ -0,0 +1,187 @@
|
||||
package systems
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
"github.com/pocketbase/pocketbase/tools/subscriptions"
|
||||
)
|
||||
|
||||
type subscriptionInfo struct {
|
||||
subscription string
|
||||
connectedClients uint8
|
||||
}
|
||||
|
||||
var (
|
||||
activeSubscriptions = make(map[string]*subscriptionInfo)
|
||||
workerRunning bool
|
||||
realtimeTicker *time.Ticker
|
||||
tickerStopChan chan struct{}
|
||||
realtimeMutex sync.Mutex
|
||||
)
|
||||
|
||||
// onRealtimeConnectRequest handles client connection events for realtime subscriptions.
|
||||
// It cleans up existing subscriptions when a client connects.
|
||||
func (sm *SystemManager) onRealtimeConnectRequest(e *core.RealtimeConnectRequestEvent) error {
|
||||
// after e.Next() is the client disconnection
|
||||
e.Next()
|
||||
subscriptions := e.Client.Subscriptions()
|
||||
for k := range subscriptions {
|
||||
sm.removeRealtimeSubscription(k, subscriptions[k])
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// onRealtimeSubscribeRequest handles client subscription events for realtime metrics.
|
||||
// It tracks new subscriptions and unsubscriptions to manage the realtime worker lifecycle.
|
||||
func (sm *SystemManager) onRealtimeSubscribeRequest(e *core.RealtimeSubscribeRequestEvent) error {
|
||||
oldSubs := e.Client.Subscriptions()
|
||||
// after e.Next() is the result of the subscribe request
|
||||
err := e.Next()
|
||||
newSubs := e.Client.Subscriptions()
|
||||
|
||||
// handle new subscriptions
|
||||
for k, options := range newSubs {
|
||||
if _, ok := oldSubs[k]; !ok {
|
||||
if strings.HasPrefix(k, "rt_metrics") {
|
||||
systemId := options.Query["system"]
|
||||
if _, ok := activeSubscriptions[systemId]; !ok {
|
||||
activeSubscriptions[systemId] = &subscriptionInfo{
|
||||
subscription: k,
|
||||
}
|
||||
}
|
||||
activeSubscriptions[systemId].connectedClients += 1
|
||||
sm.onRealtimeSubscriptionAdded()
|
||||
}
|
||||
}
|
||||
}
|
||||
// handle unsubscriptions
|
||||
for k := range oldSubs {
|
||||
if _, ok := newSubs[k]; !ok {
|
||||
sm.removeRealtimeSubscription(k, oldSubs[k])
|
||||
}
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
// onRealtimeSubscriptionAdded initializes or starts the realtime worker when the first subscription is added.
|
||||
// It ensures only one worker runs at a time and creates the ticker for periodic data fetching.
|
||||
func (sm *SystemManager) onRealtimeSubscriptionAdded() {
|
||||
realtimeMutex.Lock()
|
||||
defer realtimeMutex.Unlock()
|
||||
|
||||
// Start the worker if it's not already running
|
||||
if !workerRunning {
|
||||
workerRunning = true
|
||||
// Create a new stop channel for this worker instance
|
||||
tickerStopChan = make(chan struct{})
|
||||
go sm.startRealtimeWorker()
|
||||
}
|
||||
|
||||
// If no ticker exists, create one
|
||||
if realtimeTicker == nil {
|
||||
realtimeTicker = time.NewTicker(1 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
// checkSubscriptions stops the realtime worker when there are no active subscriptions.
|
||||
// This prevents unnecessary resource usage when no clients are listening for realtime data.
|
||||
func (sm *SystemManager) checkSubscriptions() {
|
||||
if !workerRunning || len(activeSubscriptions) > 0 {
|
||||
return
|
||||
}
|
||||
|
||||
realtimeMutex.Lock()
|
||||
defer realtimeMutex.Unlock()
|
||||
|
||||
// Signal the worker to stop
|
||||
if tickerStopChan != nil {
|
||||
select {
|
||||
case tickerStopChan <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
if realtimeTicker != nil {
|
||||
realtimeTicker.Stop()
|
||||
realtimeTicker = nil
|
||||
}
|
||||
|
||||
// Mark worker as stopped (will be reset when next subscription comes in)
|
||||
workerRunning = false
|
||||
}
|
||||
|
||||
// removeRealtimeSubscription removes a realtime subscription and checks if the worker should be stopped.
|
||||
// It only processes subscriptions with the "rt_metrics" prefix and triggers cleanup when subscriptions are removed.
|
||||
func (sm *SystemManager) removeRealtimeSubscription(subscription string, options subscriptions.SubscriptionOptions) {
|
||||
if strings.HasPrefix(subscription, "rt_metrics") {
|
||||
systemId := options.Query["system"]
|
||||
if info, ok := activeSubscriptions[systemId]; ok {
|
||||
info.connectedClients -= 1
|
||||
if info.connectedClients <= 0 {
|
||||
delete(activeSubscriptions, systemId)
|
||||
}
|
||||
}
|
||||
sm.checkSubscriptions()
|
||||
}
|
||||
}
|
||||
|
||||
// startRealtimeWorker runs the main loop for fetching realtime data from agents.
|
||||
// It continuously fetches system data and broadcasts it to subscribed clients via WebSocket.
|
||||
func (sm *SystemManager) startRealtimeWorker() {
|
||||
sm.fetchRealtimeDataAndNotify()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-tickerStopChan:
|
||||
return
|
||||
case <-realtimeTicker.C:
|
||||
// Check if ticker is still valid (might have been stopped)
|
||||
if realtimeTicker == nil || len(activeSubscriptions) == 0 {
|
||||
return
|
||||
}
|
||||
// slog.Debug("activeSubscriptions", "count", len(activeSubscriptions))
|
||||
sm.fetchRealtimeDataAndNotify()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fetchRealtimeDataAndNotify fetches realtime data for all active subscriptions and notifies the clients.
|
||||
func (sm *SystemManager) fetchRealtimeDataAndNotify() {
|
||||
for systemId, info := range activeSubscriptions {
|
||||
system, ok := sm.systems.GetOk(systemId)
|
||||
if ok {
|
||||
go func() {
|
||||
data, err := system.fetchDataFromAgent(common.DataRequestOptions{CacheTimeMs: 1000})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
bytes, err := json.Marshal(data)
|
||||
if err == nil {
|
||||
notify(sm.hub, info.subscription, bytes)
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// notify broadcasts realtime data to all clients subscribed to a specific subscription.
|
||||
// It iterates through all connected clients and sends the data only to those with matching subscriptions.
|
||||
func notify(app core.App, subscription string, data []byte) error {
|
||||
message := subscriptions.Message{
|
||||
Name: subscription,
|
||||
Data: data,
|
||||
}
|
||||
for _, client := range app.SubscriptionsBroker().Clients() {
|
||||
if !client.HasSubscription(subscription) {
|
||||
continue
|
||||
}
|
||||
client.Send(message)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
107
internal/hub/ws/handlers.go
Normal file
107
internal/hub/ws/handlers.go
Normal file
@@ -0,0 +1,107 @@
|
||||
package ws
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
"github.com/fxamacker/cbor/v2"
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
"github.com/lxzan/gws"
|
||||
"golang.org/x/crypto/ssh"
|
||||
)
|
||||
|
||||
// ResponseHandler defines interface for handling agent responses
|
||||
type ResponseHandler interface {
|
||||
Handle(agentResponse common.AgentResponse) error
|
||||
HandleLegacy(rawData []byte) error
|
||||
}
|
||||
|
||||
// BaseHandler provides a default implementation that can be embedded to make HandleLegacy optional
|
||||
// type BaseHandler struct{}
|
||||
|
||||
// func (h *BaseHandler) HandleLegacy(rawData []byte) error {
|
||||
// return errors.New("legacy format not supported")
|
||||
// }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// systemDataHandler implements ResponseHandler for system data requests
|
||||
type systemDataHandler struct {
|
||||
data *system.CombinedData
|
||||
}
|
||||
|
||||
func (h *systemDataHandler) HandleLegacy(rawData []byte) error {
|
||||
return cbor.Unmarshal(rawData, h.data)
|
||||
}
|
||||
|
||||
func (h *systemDataHandler) Handle(agentResponse common.AgentResponse) error {
|
||||
if agentResponse.SystemData != nil {
|
||||
*h.data = *agentResponse.SystemData
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RequestSystemData requests system metrics from the agent and unmarshals the response.
|
||||
func (ws *WsConn) RequestSystemData(ctx context.Context, data *system.CombinedData, options common.DataRequestOptions) error {
|
||||
if !ws.IsConnected() {
|
||||
return gws.ErrConnClosed
|
||||
}
|
||||
|
||||
req, err := ws.requestManager.SendRequest(ctx, common.GetData, options)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
handler := &systemDataHandler{data: data}
|
||||
return ws.handleAgentRequest(req, handler)
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// fingerprintHandler implements ResponseHandler for fingerprint requests
|
||||
type fingerprintHandler struct {
|
||||
result *common.FingerprintResponse
|
||||
}
|
||||
|
||||
func (h *fingerprintHandler) HandleLegacy(rawData []byte) error {
|
||||
return cbor.Unmarshal(rawData, h.result)
|
||||
}
|
||||
|
||||
func (h *fingerprintHandler) Handle(agentResponse common.AgentResponse) error {
|
||||
if agentResponse.Fingerprint != nil {
|
||||
*h.result = *agentResponse.Fingerprint
|
||||
return nil
|
||||
}
|
||||
return errors.New("no fingerprint data in response")
|
||||
}
|
||||
|
||||
// GetFingerprint authenticates with the agent using SSH signature and returns the agent's fingerprint.
|
||||
func (ws *WsConn) GetFingerprint(ctx context.Context, token string, signer ssh.Signer, needSysInfo bool) (common.FingerprintResponse, error) {
|
||||
if !ws.IsConnected() {
|
||||
return common.FingerprintResponse{}, gws.ErrConnClosed
|
||||
}
|
||||
|
||||
challenge := []byte(token)
|
||||
signature, err := signer.Sign(nil, challenge)
|
||||
if err != nil {
|
||||
return common.FingerprintResponse{}, err
|
||||
}
|
||||
|
||||
req, err := ws.requestManager.SendRequest(ctx, common.CheckFingerprint, common.FingerprintRequest{
|
||||
Signature: signature.Blob,
|
||||
NeedSysInfo: needSysInfo,
|
||||
})
|
||||
if err != nil {
|
||||
return common.FingerprintResponse{}, err
|
||||
}
|
||||
|
||||
var result common.FingerprintResponse
|
||||
handler := &fingerprintHandler{result: &result}
|
||||
err = ws.handleAgentRequest(req, handler)
|
||||
return result, err
|
||||
}
|
||||
186
internal/hub/ws/request_manager.go
Normal file
186
internal/hub/ws/request_manager.go
Normal file
@@ -0,0 +1,186 @@
|
||||
package ws
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/fxamacker/cbor/v2"
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
"github.com/lxzan/gws"
|
||||
)
|
||||
|
||||
// RequestID uniquely identifies a request
|
||||
type RequestID uint32
|
||||
|
||||
// PendingRequest tracks an in-flight request
|
||||
type PendingRequest struct {
|
||||
ID RequestID
|
||||
ResponseCh chan *gws.Message
|
||||
Context context.Context
|
||||
Cancel context.CancelFunc
|
||||
CreatedAt time.Time
|
||||
}
|
||||
|
||||
// RequestManager handles concurrent requests to an agent
|
||||
type RequestManager struct {
|
||||
sync.RWMutex
|
||||
conn *gws.Conn
|
||||
pendingReqs map[RequestID]*PendingRequest
|
||||
nextID atomic.Uint32
|
||||
}
|
||||
|
||||
// NewRequestManager creates a new request manager for a WebSocket connection
|
||||
func NewRequestManager(conn *gws.Conn) *RequestManager {
|
||||
rm := &RequestManager{
|
||||
conn: conn,
|
||||
pendingReqs: make(map[RequestID]*PendingRequest),
|
||||
}
|
||||
return rm
|
||||
}
|
||||
|
||||
// SendRequest sends a request and returns a channel for the response
|
||||
func (rm *RequestManager) SendRequest(ctx context.Context, action common.WebSocketAction, data any) (*PendingRequest, error) {
|
||||
reqID := RequestID(rm.nextID.Add(1))
|
||||
|
||||
reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
|
||||
req := &PendingRequest{
|
||||
ID: reqID,
|
||||
ResponseCh: make(chan *gws.Message, 1),
|
||||
Context: reqCtx,
|
||||
Cancel: cancel,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
|
||||
rm.Lock()
|
||||
rm.pendingReqs[reqID] = req
|
||||
rm.Unlock()
|
||||
|
||||
hubReq := common.HubRequest[any]{
|
||||
Id: (*uint32)(&reqID),
|
||||
Action: action,
|
||||
Data: data,
|
||||
}
|
||||
|
||||
// Send the request
|
||||
if err := rm.sendMessage(hubReq); err != nil {
|
||||
rm.cancelRequest(reqID)
|
||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
|
||||
// Start cleanup watcher for timeout/cancellation
|
||||
go rm.cleanupRequest(req)
|
||||
|
||||
return req, nil
|
||||
}
|
||||
|
||||
// sendMessage encodes and sends a message over WebSocket
|
||||
func (rm *RequestManager) sendMessage(data any) error {
|
||||
if rm.conn == nil {
|
||||
return gws.ErrConnClosed
|
||||
}
|
||||
|
||||
bytes, err := cbor.Marshal(data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
return rm.conn.WriteMessage(gws.OpcodeBinary, bytes)
|
||||
}
|
||||
|
||||
// handleResponse processes a single response message
|
||||
func (rm *RequestManager) handleResponse(message *gws.Message) {
|
||||
var response common.AgentResponse
|
||||
if err := cbor.Unmarshal(message.Data.Bytes(), &response); err != nil {
|
||||
// Legacy response without ID - route to first pending request of any type
|
||||
rm.routeLegacyResponse(message)
|
||||
return
|
||||
}
|
||||
|
||||
reqID := RequestID(*response.Id)
|
||||
|
||||
rm.RLock()
|
||||
req, exists := rm.pendingReqs[reqID]
|
||||
rm.RUnlock()
|
||||
|
||||
if !exists {
|
||||
// Request not found (might have timed out) - close the message
|
||||
message.Close()
|
||||
return
|
||||
}
|
||||
|
||||
select {
|
||||
case req.ResponseCh <- message:
|
||||
// Message successfully delivered - the receiver will close it
|
||||
rm.deleteRequest(reqID)
|
||||
case <-req.Context.Done():
|
||||
// Request was cancelled/timed out - close the message
|
||||
message.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// routeLegacyResponse handles responses that don't have request IDs (backwards compatibility)
|
||||
func (rm *RequestManager) routeLegacyResponse(message *gws.Message) {
|
||||
// Snapshot the oldest pending request without holding the lock during send
|
||||
rm.RLock()
|
||||
var oldestReq *PendingRequest
|
||||
for _, req := range rm.pendingReqs {
|
||||
if oldestReq == nil || req.CreatedAt.Before(oldestReq.CreatedAt) {
|
||||
oldestReq = req
|
||||
}
|
||||
}
|
||||
rm.RUnlock()
|
||||
|
||||
if oldestReq != nil {
|
||||
select {
|
||||
case oldestReq.ResponseCh <- message:
|
||||
// Message successfully delivered - the receiver will close it
|
||||
rm.deleteRequest(oldestReq.ID)
|
||||
case <-oldestReq.Context.Done():
|
||||
// Request was cancelled - close the message
|
||||
message.Close()
|
||||
}
|
||||
} else {
|
||||
// No pending requests - close the message
|
||||
message.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupRequest handles request timeout and cleanup
|
||||
func (rm *RequestManager) cleanupRequest(req *PendingRequest) {
|
||||
<-req.Context.Done()
|
||||
rm.cancelRequest(req.ID)
|
||||
}
|
||||
|
||||
// cancelRequest removes a request and cancels its context
|
||||
func (rm *RequestManager) cancelRequest(reqID RequestID) {
|
||||
rm.Lock()
|
||||
defer rm.Unlock()
|
||||
|
||||
if req, exists := rm.pendingReqs[reqID]; exists {
|
||||
req.Cancel()
|
||||
delete(rm.pendingReqs, reqID)
|
||||
}
|
||||
}
|
||||
|
||||
// deleteRequest removes a request from the pending map without cancelling its context.
|
||||
func (rm *RequestManager) deleteRequest(reqID RequestID) {
|
||||
rm.Lock()
|
||||
defer rm.Unlock()
|
||||
delete(rm.pendingReqs, reqID)
|
||||
}
|
||||
|
||||
// Close shuts down the request manager
|
||||
func (rm *RequestManager) Close() {
|
||||
rm.Lock()
|
||||
defer rm.Unlock()
|
||||
|
||||
// Cancel all pending requests
|
||||
for _, req := range rm.pendingReqs {
|
||||
req.Cancel()
|
||||
}
|
||||
rm.pendingReqs = make(map[RequestID]*PendingRequest)
|
||||
}
|
||||
81
internal/hub/ws/request_manager_test.go
Normal file
81
internal/hub/ws/request_manager_test.go
Normal file
@@ -0,0 +1,81 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package ws
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestRequestManager_BasicFunctionality tests the request manager without mocking gws.Conn
|
||||
func TestRequestManager_BasicFunctionality(t *testing.T) {
|
||||
// We'll test the core logic without mocking the connection
|
||||
// since the gws.Conn interface is complex to mock properly
|
||||
|
||||
t.Run("request ID generation", func(t *testing.T) {
|
||||
// Test that request IDs are generated sequentially and uniquely
|
||||
rm := &RequestManager{}
|
||||
|
||||
// Simulate multiple ID generations
|
||||
id1 := rm.nextID.Add(1)
|
||||
id2 := rm.nextID.Add(1)
|
||||
id3 := rm.nextID.Add(1)
|
||||
|
||||
assert.NotEqual(t, id1, id2)
|
||||
assert.NotEqual(t, id2, id3)
|
||||
assert.Greater(t, id2, id1)
|
||||
assert.Greater(t, id3, id2)
|
||||
})
|
||||
|
||||
t.Run("pending request tracking", func(t *testing.T) {
|
||||
rm := &RequestManager{
|
||||
pendingReqs: make(map[RequestID]*PendingRequest),
|
||||
}
|
||||
|
||||
// Initially no pending requests
|
||||
assert.Equal(t, 0, rm.GetPendingCount())
|
||||
|
||||
// Add some fake pending requests
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
req1 := &PendingRequest{
|
||||
ID: RequestID(1),
|
||||
Context: ctx,
|
||||
Cancel: cancel,
|
||||
}
|
||||
req2 := &PendingRequest{
|
||||
ID: RequestID(2),
|
||||
Context: ctx,
|
||||
Cancel: cancel,
|
||||
}
|
||||
|
||||
rm.pendingReqs[req1.ID] = req1
|
||||
rm.pendingReqs[req2.ID] = req2
|
||||
|
||||
assert.Equal(t, 2, rm.GetPendingCount())
|
||||
|
||||
// Remove one
|
||||
delete(rm.pendingReqs, req1.ID)
|
||||
assert.Equal(t, 1, rm.GetPendingCount())
|
||||
|
||||
// Remove all
|
||||
delete(rm.pendingReqs, req2.ID)
|
||||
assert.Equal(t, 0, rm.GetPendingCount())
|
||||
})
|
||||
|
||||
t.Run("context cancellation", func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
// Wait for context to timeout
|
||||
<-ctx.Done()
|
||||
|
||||
// Verify context was cancelled
|
||||
assert.Equal(t, context.DeadlineExceeded, ctx.Err())
|
||||
})
|
||||
}
|
||||
@@ -5,13 +5,13 @@ import (
|
||||
"time"
|
||||
"weak"
|
||||
|
||||
"github.com/henrygd/beszel/internal/entities/system"
|
||||
"github.com/blang/semver"
|
||||
"github.com/henrygd/beszel"
|
||||
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
|
||||
"github.com/fxamacker/cbor/v2"
|
||||
"github.com/lxzan/gws"
|
||||
"golang.org/x/crypto/ssh"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -25,9 +25,10 @@ type Handler struct {
|
||||
|
||||
// WsConn represents a WebSocket connection to an agent.
|
||||
type WsConn struct {
|
||||
conn *gws.Conn
|
||||
responseChan chan *gws.Message
|
||||
DownChan chan struct{}
|
||||
conn *gws.Conn
|
||||
requestManager *RequestManager
|
||||
DownChan chan struct{}
|
||||
agentVersion semver.Version
|
||||
}
|
||||
|
||||
// FingerprintRecord is fingerprints collection record data in the hub
|
||||
@@ -50,21 +51,22 @@ func GetUpgrader() *gws.Upgrader {
|
||||
return upgrader
|
||||
}
|
||||
|
||||
// NewWsConnection creates a new WebSocket connection wrapper.
|
||||
func NewWsConnection(conn *gws.Conn) *WsConn {
|
||||
// NewWsConnection creates a new WebSocket connection wrapper with agent version.
|
||||
func NewWsConnection(conn *gws.Conn, agentVersion semver.Version) *WsConn {
|
||||
return &WsConn{
|
||||
conn: conn,
|
||||
responseChan: make(chan *gws.Message, 1),
|
||||
DownChan: make(chan struct{}, 1),
|
||||
conn: conn,
|
||||
requestManager: NewRequestManager(conn),
|
||||
DownChan: make(chan struct{}, 1),
|
||||
agentVersion: agentVersion,
|
||||
}
|
||||
}
|
||||
|
||||
// OnOpen sets a deadline for the WebSocket connection.
|
||||
// OnOpen sets a deadline for the WebSocket connection and extracts agent version.
|
||||
func (h *Handler) OnOpen(conn *gws.Conn) {
|
||||
conn.SetDeadline(time.Now().Add(deadline))
|
||||
}
|
||||
|
||||
// OnMessage routes incoming WebSocket messages to the response channel.
|
||||
// OnMessage routes incoming WebSocket messages to the request manager.
|
||||
func (h *Handler) OnMessage(conn *gws.Conn, message *gws.Message) {
|
||||
conn.SetDeadline(time.Now().Add(deadline))
|
||||
if message.Opcode != gws.OpcodeBinary || message.Data.Len() == 0 {
|
||||
@@ -75,12 +77,7 @@ func (h *Handler) OnMessage(conn *gws.Conn, message *gws.Message) {
|
||||
_ = conn.WriteClose(1000, nil)
|
||||
return
|
||||
}
|
||||
select {
|
||||
case wsConn.(*WsConn).responseChan <- message:
|
||||
default:
|
||||
// close if the connection is not expecting a response
|
||||
wsConn.(*WsConn).Close(nil)
|
||||
}
|
||||
wsConn.(*WsConn).requestManager.handleResponse(message)
|
||||
}
|
||||
|
||||
// OnClose handles WebSocket connection closures and triggers system down status after delay.
|
||||
@@ -106,6 +103,9 @@ func (ws *WsConn) Close(msg []byte) {
|
||||
if ws.IsConnected() {
|
||||
ws.conn.WriteClose(1000, msg)
|
||||
}
|
||||
if ws.requestManager != nil {
|
||||
ws.requestManager.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// Ping sends a ping frame to keep the connection alive.
|
||||
@@ -115,6 +115,7 @@ func (ws *WsConn) Ping() error {
|
||||
}
|
||||
|
||||
// sendMessage encodes data to CBOR and sends it as a binary message to the agent.
|
||||
// This is kept for backwards compatibility but new actions should use RequestManager.
|
||||
func (ws *WsConn) sendMessage(data common.HubRequest[any]) error {
|
||||
if ws.conn == nil {
|
||||
return gws.ErrConnClosed
|
||||
@@ -126,54 +127,34 @@ func (ws *WsConn) sendMessage(data common.HubRequest[any]) error {
|
||||
return ws.conn.WriteMessage(gws.OpcodeBinary, bytes)
|
||||
}
|
||||
|
||||
// RequestSystemData requests system metrics from the agent and unmarshals the response.
|
||||
func (ws *WsConn) RequestSystemData(data *system.CombinedData) error {
|
||||
var message *gws.Message
|
||||
|
||||
ws.sendMessage(common.HubRequest[any]{
|
||||
Action: common.GetData,
|
||||
})
|
||||
// handleAgentRequest processes a request to the agent, handling both legacy and new formats.
|
||||
func (ws *WsConn) handleAgentRequest(req *PendingRequest, handler ResponseHandler) error {
|
||||
// Wait for response
|
||||
select {
|
||||
case <-time.After(10 * time.Second):
|
||||
ws.Close(nil)
|
||||
return gws.ErrConnClosed
|
||||
case message = <-ws.responseChan:
|
||||
case message := <-req.ResponseCh:
|
||||
defer message.Close()
|
||||
// Cancel request context to stop timeout watcher promptly
|
||||
defer req.Cancel()
|
||||
data := message.Data.Bytes()
|
||||
|
||||
// Legacy format - unmarshal directly
|
||||
if ws.agentVersion.LT(beszel.MinVersionAgentResponse) {
|
||||
return handler.HandleLegacy(data)
|
||||
}
|
||||
|
||||
// New format with AgentResponse wrapper
|
||||
var agentResponse common.AgentResponse
|
||||
if err := cbor.Unmarshal(data, &agentResponse); err != nil {
|
||||
return err
|
||||
}
|
||||
if agentResponse.Error != "" {
|
||||
return errors.New(agentResponse.Error)
|
||||
}
|
||||
return handler.Handle(agentResponse)
|
||||
|
||||
case <-req.Context.Done():
|
||||
return req.Context.Err()
|
||||
}
|
||||
defer message.Close()
|
||||
return cbor.Unmarshal(message.Data.Bytes(), data)
|
||||
}
|
||||
|
||||
// GetFingerprint authenticates with the agent using SSH signature and returns the agent's fingerprint.
|
||||
func (ws *WsConn) GetFingerprint(token string, signer ssh.Signer, needSysInfo bool) (common.FingerprintResponse, error) {
|
||||
var clientFingerprint common.FingerprintResponse
|
||||
challenge := []byte(token)
|
||||
|
||||
signature, err := signer.Sign(nil, challenge)
|
||||
if err != nil {
|
||||
return clientFingerprint, err
|
||||
}
|
||||
|
||||
err = ws.sendMessage(common.HubRequest[any]{
|
||||
Action: common.CheckFingerprint,
|
||||
Data: common.FingerprintRequest{
|
||||
Signature: signature.Blob,
|
||||
NeedSysInfo: needSysInfo,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return clientFingerprint, err
|
||||
}
|
||||
|
||||
var message *gws.Message
|
||||
select {
|
||||
case message = <-ws.responseChan:
|
||||
case <-time.After(10 * time.Second):
|
||||
return clientFingerprint, errors.New("request expired")
|
||||
}
|
||||
defer message.Close()
|
||||
|
||||
err = cbor.Unmarshal(message.Data.Bytes(), &clientFingerprint)
|
||||
return clientFingerprint, err
|
||||
}
|
||||
|
||||
// IsConnected returns true if the WebSocket connection is active.
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/blang/semver"
|
||||
"github.com/henrygd/beszel/internal/common"
|
||||
|
||||
"github.com/fxamacker/cbor/v2"
|
||||
@@ -36,26 +37,25 @@ func TestGetUpgrader(t *testing.T) {
|
||||
// TestNewWsConnection tests WebSocket connection creation
|
||||
func TestNewWsConnection(t *testing.T) {
|
||||
// We can't easily mock gws.Conn, so we'll pass nil and test the structure
|
||||
wsConn := NewWsConnection(nil)
|
||||
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
|
||||
|
||||
assert.NotNil(t, wsConn, "WebSocket connection should not be nil")
|
||||
assert.Nil(t, wsConn.conn, "Connection should be nil as passed")
|
||||
assert.NotNil(t, wsConn.responseChan, "Response channel should be initialized")
|
||||
assert.NotNil(t, wsConn.requestManager, "Request manager should be initialized")
|
||||
assert.NotNil(t, wsConn.DownChan, "Down channel should be initialized")
|
||||
assert.Equal(t, 1, cap(wsConn.responseChan), "Response channel should have capacity of 1")
|
||||
assert.Equal(t, 1, cap(wsConn.DownChan), "Down channel should have capacity of 1")
|
||||
}
|
||||
|
||||
// TestWsConn_IsConnected tests the connection status check
|
||||
func TestWsConn_IsConnected(t *testing.T) {
|
||||
// Test with nil connection
|
||||
wsConn := NewWsConnection(nil)
|
||||
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
|
||||
assert.False(t, wsConn.IsConnected(), "Should not be connected when conn is nil")
|
||||
}
|
||||
|
||||
// TestWsConn_Close tests the connection closing with nil connection
|
||||
func TestWsConn_Close(t *testing.T) {
|
||||
wsConn := NewWsConnection(nil)
|
||||
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
|
||||
|
||||
// Should handle nil connection gracefully
|
||||
assert.NotPanics(t, func() {
|
||||
@@ -65,7 +65,7 @@ func TestWsConn_Close(t *testing.T) {
|
||||
|
||||
// TestWsConn_SendMessage_CBOR tests CBOR encoding in sendMessage
|
||||
func TestWsConn_SendMessage_CBOR(t *testing.T) {
|
||||
wsConn := NewWsConnection(nil)
|
||||
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
|
||||
|
||||
testData := common.HubRequest[any]{
|
||||
Action: common.GetData,
|
||||
@@ -194,7 +194,7 @@ func TestHandler(t *testing.T) {
|
||||
|
||||
// TestWsConnChannelBehavior tests channel behavior without WebSocket connections
|
||||
func TestWsConnChannelBehavior(t *testing.T) {
|
||||
wsConn := NewWsConnection(nil)
|
||||
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
|
||||
|
||||
// Test that channels are properly initialized and can be used
|
||||
select {
|
||||
@@ -212,11 +212,6 @@ func TestWsConnChannelBehavior(t *testing.T) {
|
||||
t.Error("Should be able to read from DownChan")
|
||||
}
|
||||
|
||||
// Response channel should be empty initially
|
||||
select {
|
||||
case <-wsConn.responseChan:
|
||||
t.Error("Response channel should be empty initially")
|
||||
default:
|
||||
// Expected - channel should be empty
|
||||
}
|
||||
// Request manager should have no pending requests initially
|
||||
assert.Equal(t, 0, wsConn.requestManager.GetPendingCount(), "Should have no pending requests initially")
|
||||
}
|
||||
|
||||
11
internal/hub/ws/ws_test_helpers.go
Normal file
11
internal/hub/ws/ws_test_helpers.go
Normal file
@@ -0,0 +1,11 @@
|
||||
//go:build testing
|
||||
// +build testing
|
||||
|
||||
package ws
|
||||
|
||||
// GetPendingCount returns the number of pending requests (for monitoring)
|
||||
func (rm *RequestManager) GetPendingCount() int {
|
||||
rm.RLock()
|
||||
defer rm.RUnlock()
|
||||
return len(rm.pendingReqs)
|
||||
}
|
||||
Reference in New Issue
Block a user