improve disk I/O device matching for partition-to-disk mismatches (#1772)

findIoDevice now normalizes device names and falls back to prefix-based
matching when partition names differ from IOCounter names (e.g. nda0p2 →
nda0 on FreeBSD). The most-active prefix-related device is selected,
avoiding the broad "most active of all" heuristic that caused Docker
misattribution in #1737.
This commit is contained in:
henrygd
2026-02-26 16:59:12 -05:00
parent a86cb91e07
commit ec69f6c6e0
2 changed files with 172 additions and 11 deletions

View File

@@ -78,14 +78,21 @@ func (a *Agent) initializeDiskInfo() {
if _, exists := a.fsStats[key]; !exists {
if root {
slog.Info("Detected root device", "name", key)
// Check if root device is in /proc/diskstats. Do not guess a
// fallback device for root: that can misattribute root I/O to a
// different disk while usage remains tied to root mountpoint.
// Try to map root device to a diskIoCounters entry. First
// checks for an exact key match, then uses findIoDevice for
// normalized / prefix-based matching (e.g. nda0p2 → nda0),
// and finally falls back to the FILESYSTEM env var.
if _, ioMatch = diskIoCounters[key]; !ioMatch {
if matchedKey, match := findIoDevice(filesystem, diskIoCounters); match {
if matchedKey, match := findIoDevice(key, diskIoCounters); match {
key = matchedKey
ioMatch = true
} else {
} else if filesystem != "" {
if matchedKey, match := findIoDevice(filesystem, diskIoCounters); match {
key = matchedKey
ioMatch = true
}
}
if !ioMatch {
slog.Warn("Root I/O unmapped; set FILESYSTEM", "device", device, "mountpoint", mountpoint)
}
}
@@ -114,7 +121,7 @@ func (a *Agent) initializeDiskInfo() {
// Use FILESYSTEM env var to find root filesystem
if filesystem != "" {
for _, p := range partitions {
if strings.HasSuffix(p.Device, filesystem) || p.Mountpoint == filesystem {
if filesystemMatchesPartitionSetting(filesystem, p) {
addFsStat(p.Device, p.Mountpoint, true)
hasRoot = true
break
@@ -251,15 +258,91 @@ func withinUsageTolerance(a, b, tolerance uint64) bool {
return b-a <= tolerance
}
// Returns matching device from /proc/diskstats.
// bool is true if a match was found.
type ioMatchCandidate struct {
name string
bytes uint64
ops uint64
}
// findIoDevice prefers exact device/label matches, then falls back to a
// prefix-related candidate with the highest recent activity.
func findIoDevice(filesystem string, diskIoCounters map[string]disk.IOCountersStat) (string, bool) {
filesystem = normalizeDeviceName(filesystem)
if filesystem == "" {
return "", false
}
candidates := []ioMatchCandidate{}
for _, d := range diskIoCounters {
if d.Name == filesystem || (d.Label != "" && d.Label == filesystem) {
if normalizeDeviceName(d.Name) == filesystem || (d.Label != "" && normalizeDeviceName(d.Label) == filesystem) {
return d.Name, true
}
if prefixRelated(normalizeDeviceName(d.Name), filesystem) ||
(d.Label != "" && prefixRelated(normalizeDeviceName(d.Label), filesystem)) {
candidates = append(candidates, ioMatchCandidate{
name: d.Name,
bytes: d.ReadBytes + d.WriteBytes,
ops: d.ReadCount + d.WriteCount,
})
}
}
return "", false
if len(candidates) == 0 {
return "", false
}
best := candidates[0]
for _, c := range candidates[1:] {
if c.bytes > best.bytes ||
(c.bytes == best.bytes && c.ops > best.ops) ||
(c.bytes == best.bytes && c.ops == best.ops && c.name < best.name) {
best = c
}
}
slog.Info("Using disk I/O fallback", "requested", filesystem, "selected", best.name)
return best.name, true
}
// prefixRelated reports whether either identifier is a prefix of the other.
func prefixRelated(a, b string) bool {
if a == "" || b == "" || a == b {
return false
}
return strings.HasPrefix(a, b) || strings.HasPrefix(b, a)
}
// filesystemMatchesPartitionSetting checks whether a FILESYSTEM env var value
// matches a partition by mountpoint, exact device name, or prefix relationship
// (e.g. FILESYSTEM=ada0 matches partition /dev/ada0p2).
func filesystemMatchesPartitionSetting(filesystem string, p disk.PartitionStat) bool {
filesystem = strings.TrimSpace(filesystem)
if filesystem == "" {
return false
}
if p.Mountpoint == filesystem {
return true
}
fsName := normalizeDeviceName(filesystem)
partName := normalizeDeviceName(p.Device)
if fsName == "" || partName == "" {
return false
}
if fsName == partName {
return true
}
return prefixRelated(partName, fsName)
}
// normalizeDeviceName canonicalizes device strings for comparisons.
func normalizeDeviceName(value string) string {
name := filepath.Base(strings.TrimSpace(value))
if name == "." {
return ""
}
return name
}
// Sets start values for disk I/O stats.

View File

@@ -116,7 +116,7 @@ func TestFindIoDevice(t *testing.T) {
assert.Equal(t, "sda", device)
})
t.Run("returns no fallback when not found", func(t *testing.T) {
t.Run("returns no match when not found", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sda": {Name: "sda"},
"sdb": {Name: "sdb"},
@@ -126,6 +126,84 @@ func TestFindIoDevice(t *testing.T) {
assert.False(t, ok)
assert.Equal(t, "", device)
})
t.Run("uses uncertain unique prefix fallback", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"nvme0n1": {Name: "nvme0n1"},
"sda": {Name: "sda"},
}
device, ok := findIoDevice("nvme0n1p2", ioCounters)
assert.True(t, ok)
assert.Equal(t, "nvme0n1", device)
})
t.Run("uses dominant activity when prefix matches are ambiguous", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sda": {Name: "sda", ReadBytes: 5000, WriteBytes: 5000, ReadCount: 100, WriteCount: 100},
"sdb": {Name: "sdb", ReadBytes: 1000, WriteBytes: 1000, ReadCount: 50, WriteCount: 50},
}
device, ok := findIoDevice("sd", ioCounters)
assert.True(t, ok)
assert.Equal(t, "sda", device)
})
t.Run("uses highest activity when ambiguous without dominance", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sda": {Name: "sda", ReadBytes: 3000, WriteBytes: 3000, ReadCount: 50, WriteCount: 50},
"sdb": {Name: "sdb", ReadBytes: 2500, WriteBytes: 2500, ReadCount: 40, WriteCount: 40},
}
device, ok := findIoDevice("sd", ioCounters)
assert.True(t, ok)
assert.Equal(t, "sda", device)
})
t.Run("matches /dev/-prefixed partition to parent disk", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"nda0": {Name: "nda0", ReadBytes: 1000, WriteBytes: 1000},
}
device, ok := findIoDevice("/dev/nda0p2", ioCounters)
assert.True(t, ok)
assert.Equal(t, "nda0", device)
})
t.Run("uses deterministic name tie-breaker", func(t *testing.T) {
ioCounters := map[string]disk.IOCountersStat{
"sdb": {Name: "sdb", ReadBytes: 2000, WriteBytes: 2000, ReadCount: 10, WriteCount: 10},
"sda": {Name: "sda", ReadBytes: 2000, WriteBytes: 2000, ReadCount: 10, WriteCount: 10},
}
device, ok := findIoDevice("sd", ioCounters)
assert.True(t, ok)
assert.Equal(t, "sda", device)
})
}
func TestFilesystemMatchesPartitionSetting(t *testing.T) {
p := disk.PartitionStat{Device: "/dev/ada0p2", Mountpoint: "/"}
t.Run("matches mountpoint setting", func(t *testing.T) {
assert.True(t, filesystemMatchesPartitionSetting("/", p))
})
t.Run("matches exact partition setting", func(t *testing.T) {
assert.True(t, filesystemMatchesPartitionSetting("ada0p2", p))
assert.True(t, filesystemMatchesPartitionSetting("/dev/ada0p2", p))
})
t.Run("matches prefix-style parent setting", func(t *testing.T) {
assert.True(t, filesystemMatchesPartitionSetting("ada0", p))
assert.True(t, filesystemMatchesPartitionSetting("/dev/ada0", p))
})
t.Run("does not match unrelated device", func(t *testing.T) {
assert.False(t, filesystemMatchesPartitionSetting("sda", p))
assert.False(t, filesystemMatchesPartitionSetting("nvme0n1", p))
assert.False(t, filesystemMatchesPartitionSetting("", p))
})
}
func TestIsDockerSpecialMountpoint(t *testing.T) {