gpu: add tests and refactor to support amd on windows

This commit is contained in:
henrygd
2025-02-21 00:56:40 -05:00
parent b958e9eefe
commit 6b41a98338
2 changed files with 557 additions and 76 deletions

View File

@@ -17,11 +17,11 @@ import (
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
type GPUManager struct {
sync.Mutex
nvidiaSmi bool
rocmSmi bool
tegrastats bool
GpuDataMap map[string]*system.GPUData
mutex sync.Mutex
}
// RocmSmiJson represents the JSON structure of rocm-smi output
@@ -38,9 +38,10 @@ type RocmSmiJson struct {
// gpuCollector defines a collector for a specific GPU management utility (nvidia-smi or rocm-smi)
type gpuCollector struct {
name string
cmd *exec.Cmd
parse func([]byte) bool // returns true if valid data was found
name string
cmdArgs []string
parse func([]byte) bool // returns true if valid data was found
buf []byte
}
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
@@ -63,17 +64,20 @@ func (c *gpuCollector) start() {
// collect executes the command, parses output with the assigned parser function
func (c *gpuCollector) collect() error {
stdout, err := c.cmd.StdoutPipe()
cmd := exec.Command(c.name, c.cmdArgs...)
stdout, err := cmd.StdoutPipe()
if err != nil {
return err
}
if err := c.cmd.Start(); err != nil {
if err := cmd.Start(); err != nil {
return err
}
scanner := bufio.NewScanner(stdout)
buf := make([]byte, 0, 8*1024)
scanner.Buffer(buf, bufio.MaxScanTokenSize)
if c.buf == nil {
c.buf = make([]byte, 0, 4*1024)
}
scanner.Buffer(c.buf, bufio.MaxScanTokenSize)
for scanner.Scan() {
hasValidData := c.parse(scanner.Bytes())
@@ -85,7 +89,7 @@ func (c *gpuCollector) collect() error {
if err := scanner.Err(); err != nil {
return fmt.Errorf("scanner error: %w", err)
}
return c.cmd.Wait()
return cmd.Wait()
}
// getJetsonParser returns a function to parse the output of tegrastats and update the GPUData map
@@ -99,8 +103,8 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
powerPattern := regexp.MustCompile(`(GPU_SOC|CPU_GPU_CV) (\d+)mW`)
return func(output []byte) bool {
gm.mutex.Lock()
defer gm.mutex.Unlock()
gm.Lock()
defer gm.Unlock()
// we get gpu name from the intitial run of nvidia-smi, so return if it hasn't been initialized
gpuData, ok := gm.GpuDataMap["0"]
if !ok {
@@ -126,7 +130,7 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
// Parse power usage
powerMatches := powerPattern.FindStringSubmatch(data)
if powerMatches != nil {
power, _ := strconv.ParseFloat(powerMatches[1], 64)
power, _ := strconv.ParseFloat(powerMatches[2], 64)
gpuData.Power = power / 1000
}
gpuData.Count++
@@ -136,46 +140,42 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
func (gm *GPUManager) parseNvidiaData(output []byte) bool {
fields := strings.Split(string(output), ", ")
if len(fields) < 7 {
return false
}
gm.mutex.Lock()
defer gm.mutex.Unlock()
lines := strings.Split(string(output), "\n")
for _, line := range lines {
if line != "" {
fields := strings.Split(line, ", ")
if len(fields) >= 7 {
id := fields[0]
temp, _ := strconv.ParseFloat(fields[2], 64)
memoryUsage, _ := strconv.ParseFloat(fields[3], 64)
totalMemory, _ := strconv.ParseFloat(fields[4], 64)
usage, _ := strconv.ParseFloat(fields[5], 64)
power, _ := strconv.ParseFloat(fields[6], 64)
// add gpu if not exists
if _, ok := gm.GpuDataMap[id]; !ok {
name := strings.TrimPrefix(fields[1], "NVIDIA ")
gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
// check if tegrastats is active - if so we will only use nvidia-smi to get gpu name
// - nvidia-smi does not provide metrics for tegra / jetson devices
// this will end the nvidia-smi collector
if gm.tegrastats {
return false
}
}
// update gpu data
gpu := gm.GpuDataMap[id]
gpu.Temperature = temp
gpu.MemoryUsed = memoryUsage / 1.024
gpu.MemoryTotal = totalMemory / 1.024
gpu.Usage += usage
gpu.Power += power
gpu.Count++
gm.Lock()
defer gm.Unlock()
var valid bool
for line := range strings.Lines(string(output)) {
fields := strings.Split(strings.TrimSpace(line), ", ")
if len(fields) < 7 {
continue
}
valid = true
id := fields[0]
temp, _ := strconv.ParseFloat(fields[2], 64)
memoryUsage, _ := strconv.ParseFloat(fields[3], 64)
totalMemory, _ := strconv.ParseFloat(fields[4], 64)
usage, _ := strconv.ParseFloat(fields[5], 64)
power, _ := strconv.ParseFloat(fields[6], 64)
// add gpu if not exists
if _, ok := gm.GpuDataMap[id]; !ok {
name := strings.TrimPrefix(fields[1], "NVIDIA ")
gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
// check if tegrastats is active - if so we will only use nvidia-smi to get gpu name
// - nvidia-smi does not provide metrics for tegra / jetson devices
// this will end the nvidia-smi collector
if gm.tegrastats {
return false
}
}
// update gpu data
gpu := gm.GpuDataMap[id]
gpu.Temperature = temp
gpu.MemoryUsed = memoryUsage / 1.024
gpu.MemoryTotal = totalMemory / 1.024
gpu.Usage += usage
gpu.Power += power
gpu.Count++
}
return true
return valid
}
// parseAmdData parses the output of rocm-smi and updates the GPUData map
@@ -184,8 +184,8 @@ func (gm *GPUManager) parseAmdData(output []byte) bool {
if err := json.Unmarshal(output, &rocmSmiInfo); err != nil || len(rocmSmiInfo) == 0 {
return false
}
gm.mutex.Lock()
defer gm.mutex.Unlock()
gm.Lock()
defer gm.Unlock()
for _, v := range rocmSmiInfo {
var power float64
if v.PowerPackage != "" {
@@ -213,8 +213,8 @@ func (gm *GPUManager) parseAmdData(output []byte) bool {
// sums and resets the current GPU utilization data since the last update
func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
gm.mutex.Lock()
defer gm.mutex.Unlock()
gm.Lock()
defer gm.Unlock()
// check for GPUs with the same name
nameCounts := make(map[string]int)
@@ -266,31 +266,36 @@ func (gm *GPUManager) detectGPUs() error {
// startCollector starts the appropriate GPU data collector based on the command
func (gm *GPUManager) startCollector(command string) {
collector := gpuCollector{
name: command,
}
switch command {
case "nvidia-smi":
nvidia := gpuCollector{
name: "nvidia-smi",
cmd: exec.Command("nvidia-smi", "-l", "4",
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
"--format=csv,noheader,nounits"),
parse: gm.parseNvidiaData,
}
go nvidia.start()
case "rocm-smi":
amdCollector := gpuCollector{
name: "rocm-smi",
cmd: exec.Command("/bin/sh", "-c",
"while true; do rocm-smi --showid --showtemp --showuse --showpower --showproductname --showmeminfo vram --json; sleep 4.3; done"),
parse: gm.parseAmdData,
}
go amdCollector.start()
collector.cmdArgs = []string{"-l", "4",
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
"--format=csv,noheader,nounits"}
collector.parse = gm.parseNvidiaData
go collector.start()
case "tegrastats":
jetsonCollector := gpuCollector{
name: "tegrastats",
cmd: exec.Command("tegrastats", "--interval", "3000"),
parse: gm.getJetsonParser(),
}
go jetsonCollector.start()
collector.cmdArgs = []string{"--interval", "3000"}
collector.parse = gm.getJetsonParser()
go collector.start()
case "rocm-smi":
collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
collector.parse = gm.parseAmdData
go func() {
failures := 0
for {
if err := collector.collect(); err != nil {
failures++
if failures > 5 {
break
}
slog.Warn("Error collecting AMD GPU data", "err", err)
}
time.Sleep(4300 * time.Millisecond)
}
}()
}
}
@@ -300,7 +305,7 @@ func NewGPUManager() (*GPUManager, error) {
if err := gm.detectGPUs(); err != nil {
return nil, err
}
gm.GpuDataMap = make(map[string]*system.GPUData, 1)
gm.GpuDataMap = make(map[string]*system.GPUData)
if gm.nvidiaSmi {
gm.startCollector("nvidia-smi")