add one minute chart + refactor rpc

- add one minute charts
- update disk io to use bytes
- update hub and agent connection interfaces / handlers to be more
flexible
- change agent cache to use cache time instead of session id
- refactor collection of metrics which require deltas to track
separately per cache time
This commit is contained in:
henrygd
2025-10-02 17:56:51 -04:00
parent f9a39c6004
commit 7d6230de74
44 changed files with 3892 additions and 551 deletions

107
internal/hub/ws/handlers.go Normal file
View File

@@ -0,0 +1,107 @@
package ws
import (
"context"
"errors"
"github.com/fxamacker/cbor/v2"
"github.com/henrygd/beszel/internal/common"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/lxzan/gws"
"golang.org/x/crypto/ssh"
)
// ResponseHandler defines interface for handling agent responses
type ResponseHandler interface {
Handle(agentResponse common.AgentResponse) error
HandleLegacy(rawData []byte) error
}
// BaseHandler provides a default implementation that can be embedded to make HandleLegacy optional
// type BaseHandler struct{}
// func (h *BaseHandler) HandleLegacy(rawData []byte) error {
// return errors.New("legacy format not supported")
// }
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
// systemDataHandler implements ResponseHandler for system data requests
type systemDataHandler struct {
data *system.CombinedData
}
func (h *systemDataHandler) HandleLegacy(rawData []byte) error {
return cbor.Unmarshal(rawData, h.data)
}
func (h *systemDataHandler) Handle(agentResponse common.AgentResponse) error {
if agentResponse.SystemData != nil {
*h.data = *agentResponse.SystemData
}
return nil
}
// RequestSystemData requests system metrics from the agent and unmarshals the response.
func (ws *WsConn) RequestSystemData(ctx context.Context, data *system.CombinedData, options common.DataRequestOptions) error {
if !ws.IsConnected() {
return gws.ErrConnClosed
}
req, err := ws.requestManager.SendRequest(ctx, common.GetData, options)
if err != nil {
return err
}
handler := &systemDataHandler{data: data}
return ws.handleAgentRequest(req, handler)
}
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
// fingerprintHandler implements ResponseHandler for fingerprint requests
type fingerprintHandler struct {
result *common.FingerprintResponse
}
func (h *fingerprintHandler) HandleLegacy(rawData []byte) error {
return cbor.Unmarshal(rawData, h.result)
}
func (h *fingerprintHandler) Handle(agentResponse common.AgentResponse) error {
if agentResponse.Fingerprint != nil {
*h.result = *agentResponse.Fingerprint
return nil
}
return errors.New("no fingerprint data in response")
}
// GetFingerprint authenticates with the agent using SSH signature and returns the agent's fingerprint.
func (ws *WsConn) GetFingerprint(ctx context.Context, token string, signer ssh.Signer, needSysInfo bool) (common.FingerprintResponse, error) {
if !ws.IsConnected() {
return common.FingerprintResponse{}, gws.ErrConnClosed
}
challenge := []byte(token)
signature, err := signer.Sign(nil, challenge)
if err != nil {
return common.FingerprintResponse{}, err
}
req, err := ws.requestManager.SendRequest(ctx, common.CheckFingerprint, common.FingerprintRequest{
Signature: signature.Blob,
NeedSysInfo: needSysInfo,
})
if err != nil {
return common.FingerprintResponse{}, err
}
var result common.FingerprintResponse
handler := &fingerprintHandler{result: &result}
err = ws.handleAgentRequest(req, handler)
return result, err
}

View File

@@ -0,0 +1,186 @@
package ws
import (
"context"
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/fxamacker/cbor/v2"
"github.com/henrygd/beszel/internal/common"
"github.com/lxzan/gws"
)
// RequestID uniquely identifies a request
type RequestID uint32
// PendingRequest tracks an in-flight request
type PendingRequest struct {
ID RequestID
ResponseCh chan *gws.Message
Context context.Context
Cancel context.CancelFunc
CreatedAt time.Time
}
// RequestManager handles concurrent requests to an agent
type RequestManager struct {
sync.RWMutex
conn *gws.Conn
pendingReqs map[RequestID]*PendingRequest
nextID atomic.Uint32
}
// NewRequestManager creates a new request manager for a WebSocket connection
func NewRequestManager(conn *gws.Conn) *RequestManager {
rm := &RequestManager{
conn: conn,
pendingReqs: make(map[RequestID]*PendingRequest),
}
return rm
}
// SendRequest sends a request and returns a channel for the response
func (rm *RequestManager) SendRequest(ctx context.Context, action common.WebSocketAction, data any) (*PendingRequest, error) {
reqID := RequestID(rm.nextID.Add(1))
reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
req := &PendingRequest{
ID: reqID,
ResponseCh: make(chan *gws.Message, 1),
Context: reqCtx,
Cancel: cancel,
CreatedAt: time.Now(),
}
rm.Lock()
rm.pendingReqs[reqID] = req
rm.Unlock()
hubReq := common.HubRequest[any]{
Id: (*uint32)(&reqID),
Action: action,
Data: data,
}
// Send the request
if err := rm.sendMessage(hubReq); err != nil {
rm.cancelRequest(reqID)
return nil, fmt.Errorf("failed to send request: %w", err)
}
// Start cleanup watcher for timeout/cancellation
go rm.cleanupRequest(req)
return req, nil
}
// sendMessage encodes and sends a message over WebSocket
func (rm *RequestManager) sendMessage(data any) error {
if rm.conn == nil {
return gws.ErrConnClosed
}
bytes, err := cbor.Marshal(data)
if err != nil {
return fmt.Errorf("failed to marshal request: %w", err)
}
return rm.conn.WriteMessage(gws.OpcodeBinary, bytes)
}
// handleResponse processes a single response message
func (rm *RequestManager) handleResponse(message *gws.Message) {
var response common.AgentResponse
if err := cbor.Unmarshal(message.Data.Bytes(), &response); err != nil {
// Legacy response without ID - route to first pending request of any type
rm.routeLegacyResponse(message)
return
}
reqID := RequestID(*response.Id)
rm.RLock()
req, exists := rm.pendingReqs[reqID]
rm.RUnlock()
if !exists {
// Request not found (might have timed out) - close the message
message.Close()
return
}
select {
case req.ResponseCh <- message:
// Message successfully delivered - the receiver will close it
rm.deleteRequest(reqID)
case <-req.Context.Done():
// Request was cancelled/timed out - close the message
message.Close()
}
}
// routeLegacyResponse handles responses that don't have request IDs (backwards compatibility)
func (rm *RequestManager) routeLegacyResponse(message *gws.Message) {
// Snapshot the oldest pending request without holding the lock during send
rm.RLock()
var oldestReq *PendingRequest
for _, req := range rm.pendingReqs {
if oldestReq == nil || req.CreatedAt.Before(oldestReq.CreatedAt) {
oldestReq = req
}
}
rm.RUnlock()
if oldestReq != nil {
select {
case oldestReq.ResponseCh <- message:
// Message successfully delivered - the receiver will close it
rm.deleteRequest(oldestReq.ID)
case <-oldestReq.Context.Done():
// Request was cancelled - close the message
message.Close()
}
} else {
// No pending requests - close the message
message.Close()
}
}
// cleanupRequest handles request timeout and cleanup
func (rm *RequestManager) cleanupRequest(req *PendingRequest) {
<-req.Context.Done()
rm.cancelRequest(req.ID)
}
// cancelRequest removes a request and cancels its context
func (rm *RequestManager) cancelRequest(reqID RequestID) {
rm.Lock()
defer rm.Unlock()
if req, exists := rm.pendingReqs[reqID]; exists {
req.Cancel()
delete(rm.pendingReqs, reqID)
}
}
// deleteRequest removes a request from the pending map without cancelling its context.
func (rm *RequestManager) deleteRequest(reqID RequestID) {
rm.Lock()
defer rm.Unlock()
delete(rm.pendingReqs, reqID)
}
// Close shuts down the request manager
func (rm *RequestManager) Close() {
rm.Lock()
defer rm.Unlock()
// Cancel all pending requests
for _, req := range rm.pendingReqs {
req.Cancel()
}
rm.pendingReqs = make(map[RequestID]*PendingRequest)
}

View File

@@ -0,0 +1,81 @@
//go:build testing
// +build testing
package ws
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
// TestRequestManager_BasicFunctionality tests the request manager without mocking gws.Conn
func TestRequestManager_BasicFunctionality(t *testing.T) {
// We'll test the core logic without mocking the connection
// since the gws.Conn interface is complex to mock properly
t.Run("request ID generation", func(t *testing.T) {
// Test that request IDs are generated sequentially and uniquely
rm := &RequestManager{}
// Simulate multiple ID generations
id1 := rm.nextID.Add(1)
id2 := rm.nextID.Add(1)
id3 := rm.nextID.Add(1)
assert.NotEqual(t, id1, id2)
assert.NotEqual(t, id2, id3)
assert.Greater(t, id2, id1)
assert.Greater(t, id3, id2)
})
t.Run("pending request tracking", func(t *testing.T) {
rm := &RequestManager{
pendingReqs: make(map[RequestID]*PendingRequest),
}
// Initially no pending requests
assert.Equal(t, 0, rm.GetPendingCount())
// Add some fake pending requests
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
req1 := &PendingRequest{
ID: RequestID(1),
Context: ctx,
Cancel: cancel,
}
req2 := &PendingRequest{
ID: RequestID(2),
Context: ctx,
Cancel: cancel,
}
rm.pendingReqs[req1.ID] = req1
rm.pendingReqs[req2.ID] = req2
assert.Equal(t, 2, rm.GetPendingCount())
// Remove one
delete(rm.pendingReqs, req1.ID)
assert.Equal(t, 1, rm.GetPendingCount())
// Remove all
delete(rm.pendingReqs, req2.ID)
assert.Equal(t, 0, rm.GetPendingCount())
})
t.Run("context cancellation", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
defer cancel()
// Wait for context to timeout
<-ctx.Done()
// Verify context was cancelled
assert.Equal(t, context.DeadlineExceeded, ctx.Err())
})
}

View File

@@ -5,13 +5,13 @@ import (
"time"
"weak"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/blang/semver"
"github.com/henrygd/beszel"
"github.com/henrygd/beszel/internal/common"
"github.com/fxamacker/cbor/v2"
"github.com/lxzan/gws"
"golang.org/x/crypto/ssh"
)
const (
@@ -25,9 +25,10 @@ type Handler struct {
// WsConn represents a WebSocket connection to an agent.
type WsConn struct {
conn *gws.Conn
responseChan chan *gws.Message
DownChan chan struct{}
conn *gws.Conn
requestManager *RequestManager
DownChan chan struct{}
agentVersion semver.Version
}
// FingerprintRecord is fingerprints collection record data in the hub
@@ -50,21 +51,22 @@ func GetUpgrader() *gws.Upgrader {
return upgrader
}
// NewWsConnection creates a new WebSocket connection wrapper.
func NewWsConnection(conn *gws.Conn) *WsConn {
// NewWsConnection creates a new WebSocket connection wrapper with agent version.
func NewWsConnection(conn *gws.Conn, agentVersion semver.Version) *WsConn {
return &WsConn{
conn: conn,
responseChan: make(chan *gws.Message, 1),
DownChan: make(chan struct{}, 1),
conn: conn,
requestManager: NewRequestManager(conn),
DownChan: make(chan struct{}, 1),
agentVersion: agentVersion,
}
}
// OnOpen sets a deadline for the WebSocket connection.
// OnOpen sets a deadline for the WebSocket connection and extracts agent version.
func (h *Handler) OnOpen(conn *gws.Conn) {
conn.SetDeadline(time.Now().Add(deadline))
}
// OnMessage routes incoming WebSocket messages to the response channel.
// OnMessage routes incoming WebSocket messages to the request manager.
func (h *Handler) OnMessage(conn *gws.Conn, message *gws.Message) {
conn.SetDeadline(time.Now().Add(deadline))
if message.Opcode != gws.OpcodeBinary || message.Data.Len() == 0 {
@@ -75,12 +77,7 @@ func (h *Handler) OnMessage(conn *gws.Conn, message *gws.Message) {
_ = conn.WriteClose(1000, nil)
return
}
select {
case wsConn.(*WsConn).responseChan <- message:
default:
// close if the connection is not expecting a response
wsConn.(*WsConn).Close(nil)
}
wsConn.(*WsConn).requestManager.handleResponse(message)
}
// OnClose handles WebSocket connection closures and triggers system down status after delay.
@@ -106,6 +103,9 @@ func (ws *WsConn) Close(msg []byte) {
if ws.IsConnected() {
ws.conn.WriteClose(1000, msg)
}
if ws.requestManager != nil {
ws.requestManager.Close()
}
}
// Ping sends a ping frame to keep the connection alive.
@@ -115,6 +115,7 @@ func (ws *WsConn) Ping() error {
}
// sendMessage encodes data to CBOR and sends it as a binary message to the agent.
// This is kept for backwards compatibility but new actions should use RequestManager.
func (ws *WsConn) sendMessage(data common.HubRequest[any]) error {
if ws.conn == nil {
return gws.ErrConnClosed
@@ -126,54 +127,34 @@ func (ws *WsConn) sendMessage(data common.HubRequest[any]) error {
return ws.conn.WriteMessage(gws.OpcodeBinary, bytes)
}
// RequestSystemData requests system metrics from the agent and unmarshals the response.
func (ws *WsConn) RequestSystemData(data *system.CombinedData) error {
var message *gws.Message
ws.sendMessage(common.HubRequest[any]{
Action: common.GetData,
})
// handleAgentRequest processes a request to the agent, handling both legacy and new formats.
func (ws *WsConn) handleAgentRequest(req *PendingRequest, handler ResponseHandler) error {
// Wait for response
select {
case <-time.After(10 * time.Second):
ws.Close(nil)
return gws.ErrConnClosed
case message = <-ws.responseChan:
case message := <-req.ResponseCh:
defer message.Close()
// Cancel request context to stop timeout watcher promptly
defer req.Cancel()
data := message.Data.Bytes()
// Legacy format - unmarshal directly
if ws.agentVersion.LT(beszel.MinVersionAgentResponse) {
return handler.HandleLegacy(data)
}
// New format with AgentResponse wrapper
var agentResponse common.AgentResponse
if err := cbor.Unmarshal(data, &agentResponse); err != nil {
return err
}
if agentResponse.Error != "" {
return errors.New(agentResponse.Error)
}
return handler.Handle(agentResponse)
case <-req.Context.Done():
return req.Context.Err()
}
defer message.Close()
return cbor.Unmarshal(message.Data.Bytes(), data)
}
// GetFingerprint authenticates with the agent using SSH signature and returns the agent's fingerprint.
func (ws *WsConn) GetFingerprint(token string, signer ssh.Signer, needSysInfo bool) (common.FingerprintResponse, error) {
var clientFingerprint common.FingerprintResponse
challenge := []byte(token)
signature, err := signer.Sign(nil, challenge)
if err != nil {
return clientFingerprint, err
}
err = ws.sendMessage(common.HubRequest[any]{
Action: common.CheckFingerprint,
Data: common.FingerprintRequest{
Signature: signature.Blob,
NeedSysInfo: needSysInfo,
},
})
if err != nil {
return clientFingerprint, err
}
var message *gws.Message
select {
case message = <-ws.responseChan:
case <-time.After(10 * time.Second):
return clientFingerprint, errors.New("request expired")
}
defer message.Close()
err = cbor.Unmarshal(message.Data.Bytes(), &clientFingerprint)
return clientFingerprint, err
}
// IsConnected returns true if the WebSocket connection is active.

View File

@@ -8,6 +8,7 @@ import (
"testing"
"time"
"github.com/blang/semver"
"github.com/henrygd/beszel/internal/common"
"github.com/fxamacker/cbor/v2"
@@ -36,26 +37,25 @@ func TestGetUpgrader(t *testing.T) {
// TestNewWsConnection tests WebSocket connection creation
func TestNewWsConnection(t *testing.T) {
// We can't easily mock gws.Conn, so we'll pass nil and test the structure
wsConn := NewWsConnection(nil)
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
assert.NotNil(t, wsConn, "WebSocket connection should not be nil")
assert.Nil(t, wsConn.conn, "Connection should be nil as passed")
assert.NotNil(t, wsConn.responseChan, "Response channel should be initialized")
assert.NotNil(t, wsConn.requestManager, "Request manager should be initialized")
assert.NotNil(t, wsConn.DownChan, "Down channel should be initialized")
assert.Equal(t, 1, cap(wsConn.responseChan), "Response channel should have capacity of 1")
assert.Equal(t, 1, cap(wsConn.DownChan), "Down channel should have capacity of 1")
}
// TestWsConn_IsConnected tests the connection status check
func TestWsConn_IsConnected(t *testing.T) {
// Test with nil connection
wsConn := NewWsConnection(nil)
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
assert.False(t, wsConn.IsConnected(), "Should not be connected when conn is nil")
}
// TestWsConn_Close tests the connection closing with nil connection
func TestWsConn_Close(t *testing.T) {
wsConn := NewWsConnection(nil)
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
// Should handle nil connection gracefully
assert.NotPanics(t, func() {
@@ -65,7 +65,7 @@ func TestWsConn_Close(t *testing.T) {
// TestWsConn_SendMessage_CBOR tests CBOR encoding in sendMessage
func TestWsConn_SendMessage_CBOR(t *testing.T) {
wsConn := NewWsConnection(nil)
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
testData := common.HubRequest[any]{
Action: common.GetData,
@@ -194,7 +194,7 @@ func TestHandler(t *testing.T) {
// TestWsConnChannelBehavior tests channel behavior without WebSocket connections
func TestWsConnChannelBehavior(t *testing.T) {
wsConn := NewWsConnection(nil)
wsConn := NewWsConnection(nil, semver.MustParse("0.12.10"))
// Test that channels are properly initialized and can be used
select {
@@ -212,11 +212,6 @@ func TestWsConnChannelBehavior(t *testing.T) {
t.Error("Should be able to read from DownChan")
}
// Response channel should be empty initially
select {
case <-wsConn.responseChan:
t.Error("Response channel should be empty initially")
default:
// Expected - channel should be empty
}
// Request manager should have no pending requests initially
assert.Equal(t, 0, wsConn.requestManager.GetPendingCount(), "Should have no pending requests initially")
}

View File

@@ -0,0 +1,11 @@
//go:build testing
// +build testing
package ws
// GetPendingCount returns the number of pending requests (for monitoring)
func (rm *RequestManager) GetPendingCount() int {
rm.RLock()
defer rm.RUnlock()
return len(rm.pendingReqs)
}