From ec69f6c6e0f75f61e1bb1a4341868534a9354745 Mon Sep 17 00:00:00 2001 From: henrygd Date: Thu, 26 Feb 2026 16:59:12 -0500 Subject: [PATCH] improve disk I/O device matching for partition-to-disk mismatches (#1772) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit findIoDevice now normalizes device names and falls back to prefix-based matching when partition names differ from IOCounter names (e.g. nda0p2 → nda0 on FreeBSD). The most-active prefix-related device is selected, avoiding the broad "most active of all" heuristic that caused Docker misattribution in #1737. --- agent/disk.go | 103 ++++++++++++++++++++++++++++++++++++++++----- agent/disk_test.go | 80 ++++++++++++++++++++++++++++++++++- 2 files changed, 172 insertions(+), 11 deletions(-) diff --git a/agent/disk.go b/agent/disk.go index 6df5d73f..876f6815 100644 --- a/agent/disk.go +++ b/agent/disk.go @@ -78,14 +78,21 @@ func (a *Agent) initializeDiskInfo() { if _, exists := a.fsStats[key]; !exists { if root { slog.Info("Detected root device", "name", key) - // Check if root device is in /proc/diskstats. Do not guess a - // fallback device for root: that can misattribute root I/O to a - // different disk while usage remains tied to root mountpoint. + // Try to map root device to a diskIoCounters entry. First + // checks for an exact key match, then uses findIoDevice for + // normalized / prefix-based matching (e.g. nda0p2 → nda0), + // and finally falls back to the FILESYSTEM env var. if _, ioMatch = diskIoCounters[key]; !ioMatch { - if matchedKey, match := findIoDevice(filesystem, diskIoCounters); match { + if matchedKey, match := findIoDevice(key, diskIoCounters); match { key = matchedKey ioMatch = true - } else { + } else if filesystem != "" { + if matchedKey, match := findIoDevice(filesystem, diskIoCounters); match { + key = matchedKey + ioMatch = true + } + } + if !ioMatch { slog.Warn("Root I/O unmapped; set FILESYSTEM", "device", device, "mountpoint", mountpoint) } } @@ -114,7 +121,7 @@ func (a *Agent) initializeDiskInfo() { // Use FILESYSTEM env var to find root filesystem if filesystem != "" { for _, p := range partitions { - if strings.HasSuffix(p.Device, filesystem) || p.Mountpoint == filesystem { + if filesystemMatchesPartitionSetting(filesystem, p) { addFsStat(p.Device, p.Mountpoint, true) hasRoot = true break @@ -251,15 +258,91 @@ func withinUsageTolerance(a, b, tolerance uint64) bool { return b-a <= tolerance } -// Returns matching device from /proc/diskstats. -// bool is true if a match was found. +type ioMatchCandidate struct { + name string + bytes uint64 + ops uint64 +} + +// findIoDevice prefers exact device/label matches, then falls back to a +// prefix-related candidate with the highest recent activity. func findIoDevice(filesystem string, diskIoCounters map[string]disk.IOCountersStat) (string, bool) { + filesystem = normalizeDeviceName(filesystem) + if filesystem == "" { + return "", false + } + + candidates := []ioMatchCandidate{} + for _, d := range diskIoCounters { - if d.Name == filesystem || (d.Label != "" && d.Label == filesystem) { + if normalizeDeviceName(d.Name) == filesystem || (d.Label != "" && normalizeDeviceName(d.Label) == filesystem) { return d.Name, true } + if prefixRelated(normalizeDeviceName(d.Name), filesystem) || + (d.Label != "" && prefixRelated(normalizeDeviceName(d.Label), filesystem)) { + candidates = append(candidates, ioMatchCandidate{ + name: d.Name, + bytes: d.ReadBytes + d.WriteBytes, + ops: d.ReadCount + d.WriteCount, + }) + } } - return "", false + + if len(candidates) == 0 { + return "", false + } + + best := candidates[0] + for _, c := range candidates[1:] { + if c.bytes > best.bytes || + (c.bytes == best.bytes && c.ops > best.ops) || + (c.bytes == best.bytes && c.ops == best.ops && c.name < best.name) { + best = c + } + } + + slog.Info("Using disk I/O fallback", "requested", filesystem, "selected", best.name) + return best.name, true +} + +// prefixRelated reports whether either identifier is a prefix of the other. +func prefixRelated(a, b string) bool { + if a == "" || b == "" || a == b { + return false + } + return strings.HasPrefix(a, b) || strings.HasPrefix(b, a) +} + +// filesystemMatchesPartitionSetting checks whether a FILESYSTEM env var value +// matches a partition by mountpoint, exact device name, or prefix relationship +// (e.g. FILESYSTEM=ada0 matches partition /dev/ada0p2). +func filesystemMatchesPartitionSetting(filesystem string, p disk.PartitionStat) bool { + filesystem = strings.TrimSpace(filesystem) + if filesystem == "" { + return false + } + if p.Mountpoint == filesystem { + return true + } + + fsName := normalizeDeviceName(filesystem) + partName := normalizeDeviceName(p.Device) + if fsName == "" || partName == "" { + return false + } + if fsName == partName { + return true + } + return prefixRelated(partName, fsName) +} + +// normalizeDeviceName canonicalizes device strings for comparisons. +func normalizeDeviceName(value string) string { + name := filepath.Base(strings.TrimSpace(value)) + if name == "." { + return "" + } + return name } // Sets start values for disk I/O stats. diff --git a/agent/disk_test.go b/agent/disk_test.go index b4a58624..acfcb751 100644 --- a/agent/disk_test.go +++ b/agent/disk_test.go @@ -116,7 +116,7 @@ func TestFindIoDevice(t *testing.T) { assert.Equal(t, "sda", device) }) - t.Run("returns no fallback when not found", func(t *testing.T) { + t.Run("returns no match when not found", func(t *testing.T) { ioCounters := map[string]disk.IOCountersStat{ "sda": {Name: "sda"}, "sdb": {Name: "sdb"}, @@ -126,6 +126,84 @@ func TestFindIoDevice(t *testing.T) { assert.False(t, ok) assert.Equal(t, "", device) }) + + t.Run("uses uncertain unique prefix fallback", func(t *testing.T) { + ioCounters := map[string]disk.IOCountersStat{ + "nvme0n1": {Name: "nvme0n1"}, + "sda": {Name: "sda"}, + } + + device, ok := findIoDevice("nvme0n1p2", ioCounters) + assert.True(t, ok) + assert.Equal(t, "nvme0n1", device) + }) + + t.Run("uses dominant activity when prefix matches are ambiguous", func(t *testing.T) { + ioCounters := map[string]disk.IOCountersStat{ + "sda": {Name: "sda", ReadBytes: 5000, WriteBytes: 5000, ReadCount: 100, WriteCount: 100}, + "sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 50, WriteCount: 50}, + } + + device, ok := findIoDevice("sd", ioCounters) + assert.True(t, ok) + assert.Equal(t, "sda", device) + }) + + t.Run("uses highest activity when ambiguous without dominance", func(t *testing.T) { + ioCounters := map[string]disk.IOCountersStat{ + "sda": {Name: "sda", ReadBytes: 3000, WriteBytes: 3000, ReadCount: 50, WriteCount: 50}, + "sdb": {Name: "sdb", ReadBytes: 2500, WriteBytes: 2500, ReadCount: 40, WriteCount: 40}, + } + + device, ok := findIoDevice("sd", ioCounters) + assert.True(t, ok) + assert.Equal(t, "sda", device) + }) + + t.Run("matches /dev/-prefixed partition to parent disk", func(t *testing.T) { + ioCounters := map[string]disk.IOCountersStat{ + "nda0": {Name: "nda0", ReadBytes: 1000, WriteBytes: 1000}, + } + + device, ok := findIoDevice("/dev/nda0p2", ioCounters) + assert.True(t, ok) + assert.Equal(t, "nda0", device) + }) + + t.Run("uses deterministic name tie-breaker", func(t *testing.T) { + ioCounters := map[string]disk.IOCountersStat{ + "sdb": {Name: "sdb", ReadBytes: 2000, WriteBytes: 2000, ReadCount: 10, WriteCount: 10}, + "sda": {Name: "sda", ReadBytes: 2000, WriteBytes: 2000, ReadCount: 10, WriteCount: 10}, + } + + device, ok := findIoDevice("sd", ioCounters) + assert.True(t, ok) + assert.Equal(t, "sda", device) + }) +} + +func TestFilesystemMatchesPartitionSetting(t *testing.T) { + p := disk.PartitionStat{Device: "/dev/ada0p2", Mountpoint: "/"} + + t.Run("matches mountpoint setting", func(t *testing.T) { + assert.True(t, filesystemMatchesPartitionSetting("/", p)) + }) + + t.Run("matches exact partition setting", func(t *testing.T) { + assert.True(t, filesystemMatchesPartitionSetting("ada0p2", p)) + assert.True(t, filesystemMatchesPartitionSetting("/dev/ada0p2", p)) + }) + + t.Run("matches prefix-style parent setting", func(t *testing.T) { + assert.True(t, filesystemMatchesPartitionSetting("ada0", p)) + assert.True(t, filesystemMatchesPartitionSetting("/dev/ada0", p)) + }) + + t.Run("does not match unrelated device", func(t *testing.T) { + assert.False(t, filesystemMatchesPartitionSetting("sda", p)) + assert.False(t, filesystemMatchesPartitionSetting("nvme0n1", p)) + assert.False(t, filesystemMatchesPartitionSetting("", p)) + }) } func TestIsDockerSpecialMountpoint(t *testing.T) {