mirror of
https://github.com/henrygd/beszel.git
synced 2025-12-17 02:36:17 +01:00
initial support for one intel gpu with intel_gpu_top
This commit is contained in:
76
agent/gpu.go
76
agent/gpu.go
@@ -27,13 +27,10 @@ const (
|
|||||||
nvidiaSmiInterval string = "4" // in seconds
|
nvidiaSmiInterval string = "4" // in seconds
|
||||||
tegraStatsInterval string = "3700" // in milliseconds
|
tegraStatsInterval string = "3700" // in milliseconds
|
||||||
rocmSmiInterval time.Duration = 4300 * time.Millisecond
|
rocmSmiInterval time.Duration = 4300 * time.Millisecond
|
||||||
|
|
||||||
// Command retry and timeout constants
|
// Command retry and timeout constants
|
||||||
retryWaitTime time.Duration = 5 * time.Second
|
retryWaitTime time.Duration = 5 * time.Second
|
||||||
maxFailureRetries int = 5
|
maxFailureRetries int = 5
|
||||||
|
|
||||||
cmdBufferSize uint16 = 10 * 1024
|
|
||||||
|
|
||||||
// Unit Conversions
|
// Unit Conversions
|
||||||
mebibytesInAMegabyte float64 = 1.024 // nvidia-smi reports memory in MiB
|
mebibytesInAMegabyte float64 = 1.024 // nvidia-smi reports memory in MiB
|
||||||
milliwattsInAWatt float64 = 1000.0 // tegrastats reports power in mW
|
milliwattsInAWatt float64 = 1000.0 // tegrastats reports power in mW
|
||||||
@@ -42,10 +39,11 @@ const (
|
|||||||
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
||||||
type GPUManager struct {
|
type GPUManager struct {
|
||||||
sync.Mutex
|
sync.Mutex
|
||||||
nvidiaSmi bool
|
nvidiaSmi bool
|
||||||
rocmSmi bool
|
rocmSmi bool
|
||||||
tegrastats bool
|
tegrastats bool
|
||||||
GpuDataMap map[string]*system.GPUData
|
intelGpuStats bool
|
||||||
|
GpuDataMap map[string]*system.GPUData
|
||||||
}
|
}
|
||||||
|
|
||||||
// RocmSmiJson represents the JSON structure of rocm-smi output
|
// RocmSmiJson represents the JSON structure of rocm-smi output
|
||||||
@@ -66,6 +64,7 @@ type gpuCollector struct {
|
|||||||
cmdArgs []string
|
cmdArgs []string
|
||||||
parse func([]byte) bool // returns true if valid data was found
|
parse func([]byte) bool // returns true if valid data was found
|
||||||
buf []byte
|
buf []byte
|
||||||
|
bufSize uint16
|
||||||
}
|
}
|
||||||
|
|
||||||
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
||||||
@@ -99,7 +98,7 @@ func (c *gpuCollector) collect() error {
|
|||||||
|
|
||||||
scanner := bufio.NewScanner(stdout)
|
scanner := bufio.NewScanner(stdout)
|
||||||
if c.buf == nil {
|
if c.buf == nil {
|
||||||
c.buf = make([]byte, 0, cmdBufferSize)
|
c.buf = make([]byte, 0, c.bufSize)
|
||||||
}
|
}
|
||||||
scanner.Buffer(c.buf, bufio.MaxScanTokenSize)
|
scanner.Buffer(c.buf, bufio.MaxScanTokenSize)
|
||||||
|
|
||||||
@@ -244,20 +243,31 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
|
|||||||
// copy / reset the data
|
// copy / reset the data
|
||||||
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
|
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
|
||||||
for id, gpu := range gm.GpuDataMap {
|
for id, gpu := range gm.GpuDataMap {
|
||||||
gpuAvg := *gpu
|
|
||||||
|
|
||||||
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
|
|
||||||
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
|
|
||||||
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
|
|
||||||
|
|
||||||
// avoid division by zero
|
// avoid division by zero
|
||||||
if gpu.Count > 0 {
|
count := max(gpu.Count, 1)
|
||||||
gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count)
|
|
||||||
gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count)
|
// average the data
|
||||||
|
gpuAvg := *gpu
|
||||||
|
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
|
||||||
|
gpuAvg.Power = twoDecimals(gpu.Power / count)
|
||||||
|
|
||||||
|
// intel gpu stats doesn't provide usage, memory used, or memory total
|
||||||
|
if gpu.Engines != nil {
|
||||||
|
maxEngineUsage := 0.0
|
||||||
|
for name, engine := range gpu.Engines {
|
||||||
|
gpuAvg.Engines[name] = twoDecimals(engine / count)
|
||||||
|
maxEngineUsage = max(maxEngineUsage, engine/count)
|
||||||
|
}
|
||||||
|
gpuAvg.Usage = twoDecimals(maxEngineUsage)
|
||||||
|
} else {
|
||||||
|
gpuAvg.Usage = twoDecimals(gpu.Usage / count)
|
||||||
|
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
|
||||||
|
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
|
||||||
}
|
}
|
||||||
|
|
||||||
// reset accumulators in the original
|
// reset accumulators in the original gpu data for next collection
|
||||||
gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0
|
gpu.Usage, gpu.Power, gpu.Count = gpuAvg.Usage, gpuAvg.Power, 1
|
||||||
|
gpu.Engines = gpuAvg.Engines
|
||||||
|
|
||||||
// append id to the name if there are multiple GPUs with the same name
|
// append id to the name if there are multiple GPUs with the same name
|
||||||
if nameCounts[gpu.Name] > 1 {
|
if nameCounts[gpu.Name] > 1 {
|
||||||
@@ -284,18 +294,37 @@ func (gm *GPUManager) detectGPUs() error {
|
|||||||
gm.tegrastats = true
|
gm.tegrastats = true
|
||||||
gm.nvidiaSmi = false
|
gm.nvidiaSmi = false
|
||||||
}
|
}
|
||||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
|
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
|
||||||
|
gm.intelGpuStats = true
|
||||||
|
}
|
||||||
|
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelGpuStats {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
|
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, tegrastats, or intel_gpu_top")
|
||||||
}
|
}
|
||||||
|
|
||||||
// startCollector starts the appropriate GPU data collector based on the command
|
// startCollector starts the appropriate GPU data collector based on the command
|
||||||
func (gm *GPUManager) startCollector(command string) {
|
func (gm *GPUManager) startCollector(command string) {
|
||||||
collector := gpuCollector{
|
collector := gpuCollector{
|
||||||
name: command,
|
name: command,
|
||||||
|
bufSize: 10 * 1024,
|
||||||
}
|
}
|
||||||
switch command {
|
switch command {
|
||||||
|
case intelGpuStatsCmd:
|
||||||
|
go func() {
|
||||||
|
failures := 0
|
||||||
|
for {
|
||||||
|
if err := gm.collectIntelStats(); err != nil {
|
||||||
|
failures++
|
||||||
|
if failures > maxFailureRetries {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
|
||||||
|
time.Sleep(retryWaitTime)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
case nvidiaSmiCmd:
|
case nvidiaSmiCmd:
|
||||||
collector.cmdArgs = []string{
|
collector.cmdArgs = []string{
|
||||||
"-l", nvidiaSmiInterval,
|
"-l", nvidiaSmiInterval,
|
||||||
@@ -344,6 +373,9 @@ func NewGPUManager() (*GPUManager, error) {
|
|||||||
if gm.tegrastats {
|
if gm.tegrastats {
|
||||||
gm.startCollector(tegraStatsCmd)
|
gm.startCollector(tegraStatsCmd)
|
||||||
}
|
}
|
||||||
|
if gm.intelGpuStats {
|
||||||
|
gm.startCollector(intelGpuStatsCmd)
|
||||||
|
}
|
||||||
|
|
||||||
return &gm, nil
|
return &gm, nil
|
||||||
}
|
}
|
||||||
|
|||||||
102
agent/gpu_intel.go
Normal file
102
agent/gpu_intel.go
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
|
||||||
|
"github.com/henrygd/beszel/internal/entities/system"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Settings used when invoking intel_gpu_top.
const (
	// intelGpuStatsInterval is the value passed to the -s flag, in milliseconds.
	intelGpuStatsInterval string = "3300"
	// intelGpuStatsCmd is the executable that is looked up on PATH and run to
	// sample the Intel GPU.
	intelGpuStatsCmd string = "intel_gpu_top"
)
|
||||||
|
|
||||||
|
// intelGpuStats holds the fields decoded from one JSON sample emitted by
// intel_gpu_top. Keys of the sample that have no matching field are ignored.
type intelGpuStats struct {
	// Engines maps an engine name to its "busy" reading.
	Engines map[string]struct {
		Busy float64 `json:"busy"`
	} `json:"engines"`
	// Power carries the "GPU" reading from the sample's "power" object.
	Power struct {
		GPU float64 `json:"GPU"`
	} `json:"power"`
}
|
||||||
|
|
||||||
|
// updateIntelFromStats updates aggregated GPU data from a single intelGpuStats sample
|
||||||
|
func (gm *GPUManager) updateIntelFromStats(sample *intelGpuStats) bool {
|
||||||
|
gm.Lock()
|
||||||
|
defer gm.Unlock()
|
||||||
|
|
||||||
|
// only one gpu for now - cmd doesn't provide all by default
|
||||||
|
gpuData, ok := gm.GpuDataMap["0"]
|
||||||
|
if !ok {
|
||||||
|
gpuData = &system.GPUData{Name: "GPU", Engines: make(map[string]float64)}
|
||||||
|
gm.GpuDataMap["0"] = gpuData
|
||||||
|
}
|
||||||
|
|
||||||
|
if sample.Power.GPU > 0 {
|
||||||
|
gpuData.Power += sample.Power.GPU
|
||||||
|
}
|
||||||
|
|
||||||
|
if gpuData.Engines == nil {
|
||||||
|
gpuData.Engines = make(map[string]float64, len(sample.Engines))
|
||||||
|
}
|
||||||
|
for name, engine := range sample.Engines {
|
||||||
|
gpuData.Engines[name] += engine.Busy
|
||||||
|
}
|
||||||
|
|
||||||
|
gpuData.Count++
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectIntelStats executes intel_gpu_top in JSON mode and stream-decodes the array of samples
|
||||||
|
func (gm *GPUManager) collectIntelStats() error {
|
||||||
|
cmd := exec.Command(intelGpuStatsCmd, "-s", intelGpuStatsInterval, "-J")
|
||||||
|
stdout, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
dec := json.NewDecoder(stdout)
|
||||||
|
|
||||||
|
// Expect a JSON array stream: [ { ... }, { ... }, ... ]
|
||||||
|
tok, err := dec.Token()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if delim, ok := tok.(json.Delim); !ok || delim != '[' {
|
||||||
|
return fmt.Errorf("unexpected JSON start token: %v", tok)
|
||||||
|
}
|
||||||
|
|
||||||
|
var sample intelGpuStats
|
||||||
|
for {
|
||||||
|
if dec.More() {
|
||||||
|
// Clear the engines map before decoding
|
||||||
|
if sample.Engines != nil {
|
||||||
|
for k := range sample.Engines {
|
||||||
|
delete(sample.Engines, k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := dec.Decode(&sample); err != nil {
|
||||||
|
return fmt.Errorf("decode intel gpu: %w", err)
|
||||||
|
}
|
||||||
|
gm.updateIntelFromStats(&sample)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Attempt to read closing bracket (will only be present when process exits)
|
||||||
|
tok, err = dec.Token()
|
||||||
|
if err != nil {
|
||||||
|
// When the process is still running, decoder will block in More/Decode; any error here is terminal
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if delim, ok := tok.(json.Delim); ok && delim == ']' {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return cmd.Wait()
|
||||||
|
}
|
||||||
@@ -379,12 +379,12 @@ func TestGetCurrentData(t *testing.T) {
|
|||||||
assert.InDelta(t, 60.0, result["1"].Power, 0.01)
|
assert.InDelta(t, 60.0, result["1"].Power, 0.01)
|
||||||
|
|
||||||
// Verify that accumulators in the original map are reset
|
// Verify that accumulators in the original map are reset
|
||||||
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset")
|
assert.EqualValues(t, float64(1), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset")
|
||||||
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset")
|
assert.EqualValues(t, float64(50.0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset")
|
||||||
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset")
|
assert.Equal(t, float64(100.0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset")
|
||||||
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset")
|
assert.Equal(t, float64(1), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset")
|
||||||
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset")
|
assert.Equal(t, float64(30), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset")
|
||||||
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset")
|
assert.Equal(t, float64(60), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset")
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("handles zero count without panicking", func(t *testing.T) {
|
t.Run("handles zero count without panicking", func(t *testing.T) {
|
||||||
@@ -409,7 +409,7 @@ func TestGetCurrentData(t *testing.T) {
|
|||||||
assert.Equal(t, 0.0, result["0"].Power)
|
assert.Equal(t, 0.0, result["0"].Power)
|
||||||
|
|
||||||
// Verify reset count
|
// Verify reset count
|
||||||
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count)
|
assert.EqualValues(t, 1, gm.GpuDataMap["0"].Count)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -779,16 +779,109 @@ func TestAccumulation(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Verify that accumulators in the original map are reset
|
// Verify that accumulators in the original map are reset
|
||||||
for id := range tt.expectedValues {
|
for id, expected := range tt.expectedValues {
|
||||||
gpu, exists := gm.GpuDataMap[id]
|
gpu, exists := gm.GpuDataMap[id]
|
||||||
assert.True(t, exists, "GPU with ID %s should still exist after GetCurrentData", id)
|
assert.True(t, exists, "GPU with ID %s should still exist after GetCurrentData", id)
|
||||||
if !exists {
|
if !exists {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
assert.Equal(t, float64(0), gpu.Count, "Count should be reset for GPU ID %s", id)
|
assert.EqualValues(t, 1, gpu.Count, "Count should be reset for GPU ID %s", id)
|
||||||
assert.Equal(t, float64(0), gpu.Usage, "Usage should be reset for GPU ID %s", id)
|
assert.EqualValues(t, expected.avgUsage, gpu.Usage, "Usage should be reset for GPU ID %s", id)
|
||||||
assert.Equal(t, float64(0), gpu.Power, "Power should be reset for GPU ID %s", id)
|
assert.EqualValues(t, expected.avgPower, gpu.Power, "Power should be reset for GPU ID %s", id)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestIntelUpdateFromStats(t *testing.T) {
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
|
||||||
|
// First sample with power and two engines
|
||||||
|
sample1 := intelGpuStats{
|
||||||
|
Engines: map[string]struct {
|
||||||
|
Busy float64 `json:"busy"`
|
||||||
|
}{
|
||||||
|
"Render/3D": {Busy: 20.0},
|
||||||
|
"Video": {Busy: 5.0},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
sample1.Power.GPU = 10.5
|
||||||
|
|
||||||
|
ok := gm.updateIntelFromStats(&sample1)
|
||||||
|
assert.True(t, ok)
|
||||||
|
|
||||||
|
gpu := gm.GpuDataMap["0"]
|
||||||
|
require.NotNil(t, gpu)
|
||||||
|
assert.Equal(t, "GPU", gpu.Name)
|
||||||
|
assert.InDelta(t, 10.5, gpu.Power, 0.001)
|
||||||
|
assert.InDelta(t, 20.0, gpu.Engines["Render/3D"], 0.001)
|
||||||
|
assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001)
|
||||||
|
assert.Equal(t, float64(1), gpu.Count)
|
||||||
|
|
||||||
|
// Second sample with zero power (should not add) and additional engine busy
|
||||||
|
sample2 := intelGpuStats{
|
||||||
|
Engines: map[string]struct {
|
||||||
|
Busy float64 `json:"busy"`
|
||||||
|
}{
|
||||||
|
"Render/3D": {Busy: 10.0},
|
||||||
|
"Video": {Busy: 2.5},
|
||||||
|
"Blitter": {Busy: 1.0},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
// zero power should not increment power accumulator
|
||||||
|
sample2.Power.GPU = 0.0
|
||||||
|
|
||||||
|
ok = gm.updateIntelFromStats(&sample2)
|
||||||
|
assert.True(t, ok)
|
||||||
|
|
||||||
|
gpu = gm.GpuDataMap["0"]
|
||||||
|
require.NotNil(t, gpu)
|
||||||
|
assert.InDelta(t, 10.5, gpu.Power, 0.001)
|
||||||
|
assert.InDelta(t, 30.0, gpu.Engines["Render/3D"], 0.001) // 20 + 10
|
||||||
|
assert.InDelta(t, 7.5, gpu.Engines["Video"], 0.001) // 5 + 2.5
|
||||||
|
assert.InDelta(t, 1.0, gpu.Engines["Blitter"], 0.001)
|
||||||
|
assert.Equal(t, float64(2), gpu.Count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIntelCollectorStreaming(t *testing.T) {
|
||||||
|
// Save and override PATH
|
||||||
|
origPath := os.Getenv("PATH")
|
||||||
|
defer os.Setenv("PATH", origPath)
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
os.Setenv("PATH", dir)
|
||||||
|
|
||||||
|
// Create a fake intel_gpu_top that prints a JSON array with two samples and exits
|
||||||
|
scriptPath := filepath.Join(dir, "intel_gpu_top")
|
||||||
|
script := `#!/bin/sh
|
||||||
|
# Ignore args -s and -J
|
||||||
|
# Emit a JSON array with two objects, separated by a comma, then exit
|
||||||
|
(echo '['; \
|
||||||
|
echo '{"power":{"GPU":1.5},"engines":{"Render/3D":{"busy":12.34}}},'; \
|
||||||
|
echo '{"power":{"GPU":2.0},"engines":{"Video":{"busy":5}}}'; \
|
||||||
|
echo ']')`
|
||||||
|
if err := os.WriteFile(scriptPath, []byte(script), 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
gm := &GPUManager{
|
||||||
|
GpuDataMap: make(map[string]*system.GPUData),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run the collector once; it should read two samples and return
|
||||||
|
if err := gm.collectIntelStats(); err != nil {
|
||||||
|
t.Fatalf("collectIntelStats error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
gpu := gm.GpuDataMap["0"]
|
||||||
|
require.NotNil(t, gpu)
|
||||||
|
// Power should be sum of non-zero samples: 1.5 + 2.0 = 3.5
|
||||||
|
assert.InDelta(t, 3.5, gpu.Power, 0.001)
|
||||||
|
// Engines aggregated
|
||||||
|
assert.InDelta(t, 12.34, gpu.Engines["Render/3D"], 0.001)
|
||||||
|
assert.InDelta(t, 5.0, gpu.Engines["Video"], 0.001)
|
||||||
|
// Count should be 2 samples
|
||||||
|
assert.Equal(t, float64(2), gpu.Count)
|
||||||
|
}
|
||||||
|
|||||||
@@ -45,13 +45,14 @@ type Stats struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type GPUData struct {
|
type GPUData struct {
|
||||||
Name string `json:"n" cbor:"0,keyasint"`
|
Name string `json:"n" cbor:"0,keyasint"`
|
||||||
Temperature float64 `json:"-"`
|
Temperature float64 `json:"-"`
|
||||||
MemoryUsed float64 `json:"mu,omitempty" cbor:"1,keyasint,omitempty"`
|
MemoryUsed float64 `json:"mu,omitempty,omitzero" cbor:"1,keyasint,omitempty,omitzero"`
|
||||||
MemoryTotal float64 `json:"mt,omitempty" cbor:"2,keyasint,omitempty"`
|
MemoryTotal float64 `json:"mt,omitempty,omitzero" cbor:"2,keyasint,omitempty,omitzero"`
|
||||||
Usage float64 `json:"u" cbor:"3,keyasint"`
|
Usage float64 `json:"u" cbor:"3,keyasint,omitempty"`
|
||||||
Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"`
|
Power float64 `json:"p,omitempty" cbor:"4,keyasint,omitempty"`
|
||||||
Count float64 `json:"-"`
|
Count float64 `json:"-"`
|
||||||
|
Engines map[string]float64 `json:"e,omitempty" cbor:"5,keyasint,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type FsStats struct {
|
type FsStats struct {
|
||||||
|
|||||||
@@ -284,6 +284,16 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) *
|
|||||||
gpu.Usage += value.Usage
|
gpu.Usage += value.Usage
|
||||||
gpu.Power += value.Power
|
gpu.Power += value.Power
|
||||||
gpu.Count += value.Count
|
gpu.Count += value.Count
|
||||||
|
|
||||||
|
if value.Engines != nil {
|
||||||
|
if gpu.Engines == nil {
|
||||||
|
gpu.Engines = make(map[string]float64, len(value.Engines))
|
||||||
|
}
|
||||||
|
for engineKey, engineValue := range value.Engines {
|
||||||
|
gpu.Engines[engineKey] += engineValue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sum.GPUData[id] = gpu
|
sum.GPUData[id] = gpu
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -353,6 +363,13 @@ func (rm *RecordManager) AverageSystemStats(db dbx.Builder, records RecordIds) *
|
|||||||
gpu.Usage = twoDecimals(gpu.Usage / count)
|
gpu.Usage = twoDecimals(gpu.Usage / count)
|
||||||
gpu.Power = twoDecimals(gpu.Power / count)
|
gpu.Power = twoDecimals(gpu.Power / count)
|
||||||
gpu.Count = twoDecimals(gpu.Count / count)
|
gpu.Count = twoDecimals(gpu.Count / count)
|
||||||
|
|
||||||
|
if gpu.Engines != nil {
|
||||||
|
for engineKey := range gpu.Engines {
|
||||||
|
gpu.Engines[engineKey] = twoDecimals(gpu.Engines[engineKey] / count)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sum.GPUData[id] = gpu
|
sum.GPUData[id] = gpu
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -115,11 +115,11 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) {
|
|||||||
data: (index = 3) => {
|
data: (index = 3) => {
|
||||||
return sortedKeys.map((key) => ({
|
return sortedKeys.map((key) => ({
|
||||||
label: key,
|
label: key,
|
||||||
dataKey: (stats: SystemStatsRecord) => stats.stats?.ni?.[key]?.[index],
|
dataKey: ({ stats }: SystemStatsRecord) => stats?.ni?.[key]?.[index],
|
||||||
color: `hsl(${220 + (((sortedKeys.indexOf(key) * 360) / sortedKeys.length) % 360)}, 70%, 50%)`,
|
color: `hsl(${220 + (((sortedKeys.indexOf(key) * 360) / sortedKeys.length) % 360)}, 70%, 50%)`,
|
||||||
|
|
||||||
opacity: 0.3,
|
opacity: 0.3,
|
||||||
}))
|
}))
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -61,6 +61,7 @@ import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from ".
|
|||||||
import { Separator } from "../ui/separator"
|
import { Separator } from "../ui/separator"
|
||||||
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "../ui/tooltip"
|
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "../ui/tooltip"
|
||||||
import NetworkSheet from "./system/network-sheet"
|
import NetworkSheet from "./system/network-sheet"
|
||||||
|
import LineChartDefault from "../charts/line-chart"
|
||||||
|
|
||||||
type ChartTimeData = {
|
type ChartTimeData = {
|
||||||
time: number
|
time: number
|
||||||
@@ -398,6 +399,7 @@ export default memo(function SystemDetail({ name }: { name: string }) {
|
|||||||
const lastGpuVals = Object.values(systemStats.at(-1)?.stats.g ?? {})
|
const lastGpuVals = Object.values(systemStats.at(-1)?.stats.g ?? {})
|
||||||
const hasGpuData = lastGpuVals.length > 0
|
const hasGpuData = lastGpuVals.length > 0
|
||||||
const hasGpuPowerData = lastGpuVals.some((gpu) => gpu.p !== undefined)
|
const hasGpuPowerData = lastGpuVals.some((gpu) => gpu.p !== undefined)
|
||||||
|
const hasGpuEnginesData = lastGpuVals.some((gpu) => gpu.e !== undefined)
|
||||||
|
|
||||||
let translatedStatus: string = system.status
|
let translatedStatus: string = system.status
|
||||||
if (system.status === SystemStatus.Up) {
|
if (system.status === SystemStatus.Up) {
|
||||||
@@ -758,7 +760,6 @@ export default memo(function SystemDetail({ name }: { name: string }) {
|
|||||||
/>
|
/>
|
||||||
</ChartCard>
|
</ChartCard>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{/* GPU power draw chart */}
|
{/* GPU power draw chart */}
|
||||||
{hasGpuPowerData && (
|
{hasGpuPowerData && (
|
||||||
<ChartCard
|
<ChartCard
|
||||||
@@ -772,9 +773,20 @@ export default memo(function SystemDetail({ name }: { name: string }) {
|
|||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* GPU charts */}
|
{/* Non-power GPU charts */}
|
||||||
{hasGpuData && (
|
{hasGpuData && (
|
||||||
<div className="grid xl:grid-cols-2 gap-4">
|
<div className="grid xl:grid-cols-2 gap-4">
|
||||||
|
{hasGpuEnginesData && (
|
||||||
|
<ChartCard
|
||||||
|
className="!col-span-1"
|
||||||
|
empty={dataEmpty}
|
||||||
|
grid={grid}
|
||||||
|
title={t`GPU Engines`}
|
||||||
|
description={t`Average utilization of GPU engines`}
|
||||||
|
>
|
||||||
|
<GpuEnginesChart chartData={chartData} />
|
||||||
|
</ChartCard>
|
||||||
|
)}
|
||||||
{Object.keys(systemStats.at(-1)?.stats.g ?? {}).map((id) => {
|
{Object.keys(systemStats.at(-1)?.stats.g ?? {}).map((id) => {
|
||||||
const gpu = systemStats.at(-1)?.stats.g?.[id] as GPUData
|
const gpu = systemStats.at(-1)?.stats.g?.[id] as GPUData
|
||||||
return (
|
return (
|
||||||
@@ -799,33 +811,36 @@ export default memo(function SystemDetail({ name }: { name: string }) {
|
|||||||
contentFormatter={({ value }) => `${decimalString(value)}%`}
|
contentFormatter={({ value }) => `${decimalString(value)}%`}
|
||||||
/>
|
/>
|
||||||
</ChartCard>
|
</ChartCard>
|
||||||
<ChartCard
|
|
||||||
empty={dataEmpty}
|
{(gpu.mt ?? 0) > 0 && (
|
||||||
grid={grid}
|
<ChartCard
|
||||||
title={`${gpu.n} VRAM`}
|
empty={dataEmpty}
|
||||||
description={t`Precise utilization at the recorded time`}
|
grid={grid}
|
||||||
>
|
title={`${gpu.n} VRAM`}
|
||||||
<AreaChartDefault
|
description={t`Precise utilization at the recorded time`}
|
||||||
chartData={chartData}
|
>
|
||||||
dataPoints={[
|
<AreaChartDefault
|
||||||
{
|
chartData={chartData}
|
||||||
label: t`Usage`,
|
dataPoints={[
|
||||||
dataKey: ({ stats }) => stats?.g?.[id]?.mu ?? 0,
|
{
|
||||||
color: 2,
|
label: t`Usage`,
|
||||||
opacity: 0.25,
|
dataKey: ({ stats }) => stats?.g?.[id]?.mu ?? 0,
|
||||||
},
|
color: 2,
|
||||||
]}
|
opacity: 0.25,
|
||||||
max={gpu.mt}
|
},
|
||||||
tickFormatter={(val) => {
|
]}
|
||||||
const { value, unit } = formatBytes(val, false, Unit.Bytes, true)
|
max={gpu.mt}
|
||||||
return `${toFixedFloat(value, value >= 10 ? 0 : 1)} ${unit}`
|
tickFormatter={(val) => {
|
||||||
}}
|
const { value, unit } = formatBytes(val, false, Unit.Bytes, true)
|
||||||
contentFormatter={({ value }) => {
|
return `${toFixedFloat(value, value >= 10 ? 0 : 1)} ${unit}`
|
||||||
const { value: convertedValue, unit } = formatBytes(value, false, Unit.Bytes, true)
|
}}
|
||||||
return `${decimalString(convertedValue)} ${unit}`
|
contentFormatter={({ value }) => {
|
||||||
}}
|
const { value: convertedValue, unit } = formatBytes(value, false, Unit.Bytes, true)
|
||||||
/>
|
return `${decimalString(convertedValue)} ${unit}`
|
||||||
</ChartCard>
|
}}
|
||||||
|
/>
|
||||||
|
</ChartCard>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
})}
|
})}
|
||||||
@@ -897,6 +912,28 @@ export default memo(function SystemDetail({ name }: { name: string }) {
|
|||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
/** Line chart with one series per engine of GPU `0`, using the engine names found in the newest stats record. */
function GpuEnginesChart({ chartData }: { chartData: ChartData }) {
	const latestEngines = chartData.systemStats?.at(-1)?.stats.g?.[0]?.e ?? {}
	const engineNames = Object.keys(latestEngines).sort()
	// hues are spread evenly over the color wheel, offset by 140
	const dataPoints = engineNames.map((engineName, i) => ({
		label: engineName,
		dataKey: ({ stats }: SystemStatsRecord) => stats?.g?.[0]?.e?.[engineName] ?? 0,
		color: `hsl(${140 + (((i * 360) / engineNames.length) % 360)}, 65%, 52%)`,
		opacity: 0.35,
	}))
	return (
		<LineChartDefault
			legend={true}
			chartData={chartData}
			dataPoints={dataPoints}
			tickFormatter={(val) => `${toFixedFloat(val, 2)}%`}
			contentFormatter={({ value }) => `${decimalString(value)}%`}
		/>
	)
}
|
||||||
|
|
||||||
function FilterBar({ store = $containerFilter }: { store?: typeof $containerFilter }) {
|
function FilterBar({ store = $containerFilter }: { store?: typeof $containerFilter }) {
|
||||||
const containerFilter = useStore(store)
|
const containerFilter = useStore(store)
|
||||||
const { t } = useLingui()
|
const { t } = useLingui()
|
||||||
|
|||||||
2
internal/site/src/types.d.ts
vendored
2
internal/site/src/types.d.ts
vendored
@@ -158,6 +158,8 @@ export interface GPUData {
|
|||||||
u: number
|
u: number
|
||||||
/** power (w) */
|
/** power (w) */
|
||||||
p?: number
|
p?: number
|
||||||
|
/** engines */
|
||||||
|
e?: Record<string, number>
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExtraFsStats {
|
export interface ExtraFsStats {
|
||||||
|
|||||||
Reference in New Issue
Block a user