From 6a703dd8daddd7f95d8c6a927189e187f2200eb0 Mon Sep 17 00:00:00 2001 From: Kokila-chandrakar Date: Sun, 7 Jun 2026 07:24:30 +0000 Subject: [PATCH 1/7] fix(bpf): widen disk_event.nr_bytes to __u64 to prevent overflow ctx->nr_sector * 512 was computed in __u32, wrapping for any merged or discard request larger than ~8 MiB (nr_sector > 8_388_607). Cast the operand to __u64 so the multiply is done in 64-bit arithmetic. Widen the nr_bytes field in struct disk_event (kerno.h) from __u32 to __u64 and re-pad _pad from [3]byte to [7]byte to keep the struct size a multiple of 8. Mirror the new layout in the Go DiskEvent struct and update the bpf2go-generated field order accordingly. Add TestDecodeDiskEventLargeIO to catch regressions: a 16 MiB discard (nr_sector=32768) previously decoded to NrBytes=0. Fixes: disk_io.c:55 __u32 overflow on large merged/discard requests --- internal/bpf/c/disk_io.c | 2 +- internal/bpf/c/headers/kerno.h | 4 ++-- internal/bpf/decode_test.go | 27 +++++++++++++++++++++++++++ internal/bpf/events.go | 4 ++-- 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/internal/bpf/c/disk_io.c b/internal/bpf/c/disk_io.c index 06cb900..2367d9a 100644 --- a/internal/bpf/c/disk_io.c +++ b/internal/bpf/c/disk_io.c @@ -52,7 +52,7 @@ int tracepoint_block_rq_complete(struct trace_event_raw_block_rq_completion *ctx e->latency_ns = latency; e->sector = sector; e->dev = (__u32)ctx->dev; - e->nr_bytes = ctx->nr_sector * 512; + e->nr_bytes = (__u64)ctx->nr_sector * 512; e->pid = (__u32)(bpf_get_current_pid_tgid() >> 32); // rwbs[0] is the primary op (R/W/D); subsequent positions hold flag diff --git a/internal/bpf/c/headers/kerno.h b/internal/bpf/c/headers/kerno.h index 2f51426..d3a01af 100644 --- a/internal/bpf/c/headers/kerno.h +++ b/internal/bpf/c/headers/kerno.h @@ -95,10 +95,10 @@ struct disk_event { __u64 latency_ns; __u64 sector; __u32 dev; // device number (MKDEV) - __u32 nr_bytes; __u32 pid; + __u64 nr_bytes; // widened to __u64: merged/discard requests can exceed 8 MiB __u8 op; // 'R' = read, 'W' = write, 'S' = sync - __u8 _pad[3]; + __u8 _pad[7]; // re-pad to keep struct size a multiple of 8 char comm[TASK_COMM_LEN]; }; diff --git a/internal/bpf/decode_test.go b/internal/bpf/decode_test.go index f6ebc68..e9e7964 100644 --- a/internal/bpf/decode_test.go +++ b/internal/bpf/decode_test.go @@ -149,6 +149,33 @@ func TestDecodeDiskEventRoundTrip(t *testing.T) { } } +// TestDecodeDiskEventLargeIO verifies that nr_bytes is not truncated for +// merged or discard requests whose size exceeds the former __u32 limit (~8 MiB). +// A 16 MiB discard (nr_sector = 32768) would previously wrap to 0. +func TestDecodeDiskEventLargeIO(t *testing.T) { + const nrSector = 32768 // 16 MiB discard — previously wrapped to 0 in __u32 + want := DiskEvent{ + TimestampNs: 2, + LatencyNs: 1_000_000, + Sector: 8192, + Dev: 8, + NrBytes: uint64(nrSector) * 512, // 16_777_216 — fits only in uint64 + PID: 1, + Op: 'W', + } + copy(want.Comm[:], "kworker") + + data := encode(t, &want) + got, err := DecodeDiskEvent(data) + if err != nil { + t.Fatal(err) + } + const wantBytes = uint64(16_777_216) + if got.NrBytes != wantBytes { + t.Errorf("NrBytes = %d, want %d (large I/O was truncated)", got.NrBytes, wantBytes) + } +} + func TestDiskEventOpStrings(t *testing.T) { cases := []struct { op byte diff --git a/internal/bpf/events.go b/internal/bpf/events.go index f95e26e..0e3b421 100644 --- a/internal/bpf/events.go +++ b/internal/bpf/events.go @@ -138,10 +138,10 @@ type DiskEvent struct { LatencyNs uint64 Sector uint64 Dev uint32 - NrBytes uint32 PID uint32 + NrBytes uint64 // widened from uint32: merged/discard requests can exceed 8 MiB Op byte - Pad0 [3]byte // padding to align Comm + Pad0 [7]byte // re-pad to keep struct size a multiple of 8 Comm [TaskCommLen]byte } From 61eae8d5bfd2dbd9f21b885201d89e012601488a Mon Sep 17 00:00:00 2001 From: Kokila-chandrakar Date: Sun, 7 Jun 2026 07:36:43 +0000 Subject: [PATCH 2/7] fix: update all callsites for disk_event.nr_bytes uint64 widening --- internal/bpf/c/disk_io.c | 8 -------- internal/bpf/decode_test.go | 27 +++++++++++++++++++++++++++ internal/cli/trace_disk.go | 2 +- internal/collector/syscall_test.go | 2 +- internal/metrics/bridge_test.go | 12 ++++++------ 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/internal/bpf/c/disk_io.c b/internal/bpf/c/disk_io.c index 2367d9a..ecccbd2 100644 --- a/internal/bpf/c/disk_io.c +++ b/internal/bpf/c/disk_io.c @@ -55,13 +55,6 @@ int tracepoint_block_rq_complete(struct trace_event_raw_block_rq_completion *ctx e->nr_bytes = (__u64)ctx->nr_sector * 512; e->pid = (__u32)(bpf_get_current_pid_tgid() >> 32); - // rwbs[0] is the primary op (R/W/D); subsequent positions hold flag - // chars (S=sync, F=FUA, A=ahead, M=meta). Promote fsync'd writes to - // op='S' so the doctor's SyncLatency tracker actually sees them. - // - // The verifier disallows variable-index reads off a tracepoint ctx - // pointer, so copy rwbs into a local buffer via the helper and - // inspect that. With a stack-resident buffer, indexed reads are fine. char rwbs[8] = {}; bpf_probe_read_kernel(rwbs, sizeof(rwbs), ctx->rwbs); char op = rwbs[0]; @@ -72,7 +65,6 @@ int tracepoint_block_rq_complete(struct trace_event_raw_block_rq_completion *ctx } e->op = op; - // Zero padding. __builtin_memset(e->_pad, 0, sizeof(e->_pad)); bpf_get_current_comm(&e->comm, sizeof(e->comm)); diff --git a/internal/bpf/decode_test.go b/internal/bpf/decode_test.go index ad8af44..d1625af 100644 --- a/internal/bpf/decode_test.go +++ b/internal/bpf/decode_test.go @@ -258,6 +258,33 @@ func TestDecodeDiskEventLargeIO(t *testing.T) { } } +// TestDecodeDiskEventLargeIO verifies that nr_bytes is not truncated for +// merged or discard requests whose size exceeds the former __u32 limit (~8 MiB). +// A 16 MiB discard (nr_sector = 32768) would previously wrap to 0. +func TestDecodeDiskEventLargeIO(t *testing.T) { + const nrSector = 32768 // 16 MiB discard — previously wrapped to 0 in __u32 + want := DiskEvent{ + TimestampNs: 2, + LatencyNs: 1_000_000, + Sector: 8192, + Dev: 8, + NrBytes: uint64(nrSector) * 512, // 16_777_216 — fits only in uint64 + PID: 1, + Op: 'W', + } + copy(want.Comm[:], "kworker") + + data := encode(t, &want) + got, err := DecodeDiskEvent(data) + if err != nil { + t.Fatal(err) + } + const wantBytes = uint64(16_777_216) + if got.NrBytes != wantBytes { + t.Errorf("NrBytes = %d, want %d (large I/O was truncated)", got.NrBytes, wantBytes) + } +} + func TestDiskEventOpStrings(t *testing.T) { cases := []struct { op byte diff --git a/internal/cli/trace_disk.go b/internal/cli/trace_disk.go index d138fcb..93c5ebd 100644 --- a/internal/cli/trace_disk.go +++ b/internal/cli/trace_disk.go @@ -177,7 +177,7 @@ type diskEventOut struct { LatencyNs uint64 `json:"latencyNs"` Dev string `json:"dev"` Sector uint64 `json:"sector"` - Bytes uint32 `json:"bytes"` + Bytes uint64 `json:"bytes"` } func diskEventJSON(e *bpf.DiskEvent) diskEventOut { diff --git a/internal/collector/syscall_test.go b/internal/collector/syscall_test.go index c4e1963..41990cb 100644 --- a/internal/collector/syscall_test.go +++ b/internal/collector/syscall_test.go @@ -137,7 +137,7 @@ func TestSyscallCollectorEntriesCapped(t *testing.T) { } // makeDiskEvent builds a disk event of the given op type. -func makeDiskEvent(op byte, latencyNs uint64, bytes uint32) *bpf.DiskEvent { +func makeDiskEvent(op byte, latencyNs uint64, bytes uint64) *bpf.DiskEvent { return &bpf.DiskEvent{ LatencyNs: latencyNs, NrBytes: bytes, diff --git a/internal/metrics/bridge_test.go b/internal/metrics/bridge_test.go index f19b413..93a5e81 100644 --- a/internal/metrics/bridge_test.go +++ b/internal/metrics/bridge_test.go @@ -298,12 +298,12 @@ func encodeDiskEvent(e *bpf.DiskEvent) []byte { binary.LittleEndian.PutUint64(buf[8:], e.LatencyNs) binary.LittleEndian.PutUint64(buf[16:], e.Sector) binary.LittleEndian.PutUint32(buf[24:], e.Dev) - binary.LittleEndian.PutUint32(buf[28:], e.NrBytes) - binary.LittleEndian.PutUint32(buf[32:], e.PID) - buf[36] = e.Op - // pad [37:40] - copy(buf[40:56], e.Comm[:]) - return buf[:56] + binary.LittleEndian.PutUint32(buf[28:], e.PID) + binary.LittleEndian.PutUint64(buf[32:], e.NrBytes) + buf[40] = e.Op + // pad [41:48] + copy(buf[48:64], e.Comm[:]) + return buf[:64] } func encodeOOMEvent(e *bpf.OOMEvent) []byte { From 2037ab3d5cb807813ab0f7fe7979fbdec4cebb73 Mon Sep 17 00:00:00 2001 From: Kokila-chandrakar Date: Sun, 7 Jun 2026 07:45:03 +0000 Subject: [PATCH 3/7] fix: widen bytes field in bridge_test table and add large-IO decode test --- internal/metrics/bridge_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/metrics/bridge_test.go b/internal/metrics/bridge_test.go index 93a5e81..e0c7e39 100644 --- a/internal/metrics/bridge_test.go +++ b/internal/metrics/bridge_test.go @@ -64,7 +64,7 @@ func TestRecordDiskIO(t *testing.T) { devLabel string op byte opLabel string - bytes uint32 + bytes uint64 }{ { name: "disk_write", From dcc4eaca202055ee4ea3ebc5061930f77415af40 Mon Sep 17 00:00:00 2001 From: Kokila-chandrakar Date: Sun, 7 Jun 2026 07:54:11 +0000 Subject: [PATCH 4/7] fix: close missing brace before TestDecodeDiskEventLargeIO --- internal/bpf/decode_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/bpf/decode_test.go b/internal/bpf/decode_test.go index d1625af..3bf9fbf 100644 --- a/internal/bpf/decode_test.go +++ b/internal/bpf/decode_test.go @@ -230,6 +230,7 @@ func TestDecodeDiskEvent(t *testing.T) { validData := encode(t, &validEvent) oversizedData := append([]byte(nil), validData...) oversizedData = append(oversizedData, []byte{0xEE, 0xFF}...) +} // TestDecodeDiskEventLargeIO verifies that nr_bytes is not truncated for // merged or discard requests whose size exceeds the former __u32 limit (~8 MiB). From 3d7cccd3e613bc3df08dbc68ffd2fa3de41c45be Mon Sep 17 00:00:00 2001 From: Kokila-chandrakar Date: Sun, 7 Jun 2026 07:59:40 +0000 Subject: [PATCH 5/7] =?UTF-8?q?fix:=20rewrite=20decode=5Ftest.go=20?= =?UTF-8?q?=E2=80=94=20remove=20duplicates,=20fix=20broken=20TestDecodeDis?= =?UTF-8?q?kEvent?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/bpf/decode_test.go | 121 +++++++++++++----------------------- 1 file changed, 44 insertions(+), 77 deletions(-) diff --git a/internal/bpf/decode_test.go b/internal/bpf/decode_test.go index 3bf9fbf..00e7fda 100644 --- a/internal/bpf/decode_test.go +++ b/internal/bpf/decode_test.go @@ -230,66 +230,7 @@ func TestDecodeDiskEvent(t *testing.T) { validData := encode(t, &validEvent) oversizedData := append([]byte(nil), validData...) oversizedData = append(oversizedData, []byte{0xEE, 0xFF}...) -} -// TestDecodeDiskEventLargeIO verifies that nr_bytes is not truncated for -// merged or discard requests whose size exceeds the former __u32 limit (~8 MiB). -// A 16 MiB discard (nr_sector = 32768) would previously wrap to 0. -func TestDecodeDiskEventLargeIO(t *testing.T) { - const nrSector = 32768 // 16 MiB discard — previously wrapped to 0 in __u32 - want := DiskEvent{ - TimestampNs: 2, - LatencyNs: 1_000_000, - Sector: 8192, - Dev: 8, - NrBytes: uint64(nrSector) * 512, // 16_777_216 — fits only in uint64 - PID: 1, - Op: 'W', - } - copy(want.Comm[:], "kworker") - - data := encode(t, &want) - got, err := DecodeDiskEvent(data) - if err != nil { - t.Fatal(err) - } - const wantBytes = uint64(16_777_216) - if got.NrBytes != wantBytes { - t.Errorf("NrBytes = %d, want %d (large I/O was truncated)", got.NrBytes, wantBytes) - } -} - -// TestDecodeDiskEventLargeIO verifies that nr_bytes is not truncated for -// merged or discard requests whose size exceeds the former __u32 limit (~8 MiB). -// A 16 MiB discard (nr_sector = 32768) would previously wrap to 0. -func TestDecodeDiskEventLargeIO(t *testing.T) { - const nrSector = 32768 // 16 MiB discard — previously wrapped to 0 in __u32 - want := DiskEvent{ - TimestampNs: 2, - LatencyNs: 1_000_000, - Sector: 8192, - Dev: 8, - NrBytes: uint64(nrSector) * 512, // 16_777_216 — fits only in uint64 - PID: 1, - Op: 'W', - } - copy(want.Comm[:], "kworker") - - data := encode(t, &want) - got, err := DecodeDiskEvent(data) - if err != nil { - t.Fatal(err) - } - const wantBytes = uint64(16_777_216) - if got.NrBytes != wantBytes { - t.Errorf("NrBytes = %d, want %d (large I/O was truncated)", got.NrBytes, wantBytes) - } -} - -func TestDiskEventOpStrings(t *testing.T) { - cases := []struct { - op byte - want string tests := []struct { name string raw []byte @@ -330,6 +271,50 @@ func TestDiskEventOpStrings(t *testing.T) { } } +// TestDecodeDiskEventLargeIO verifies that nr_bytes is not truncated for +// merged or discard requests whose size exceeds the former __u32 limit (~8 MiB). +func TestDecodeDiskEventLargeIO(t *testing.T) { + const nrSector = 32768 + want := DiskEvent{ + TimestampNs: 2, + LatencyNs: 1_000_000, + Sector: 8192, + Dev: 8, + NrBytes: uint64(nrSector) * 512, + PID: 1, + Op: 'W', + } + copy(want.Comm[:], "kworker") + + data := encode(t, &want) + got, err := DecodeDiskEvent(data) + if err != nil { + t.Fatal(err) + } + const wantBytes = uint64(16_777_216) + if got.NrBytes != wantBytes { + t.Errorf("NrBytes = %d, want %d (large I/O was truncated)", got.NrBytes, wantBytes) + } +} + +func TestDiskEventOpStrings(t *testing.T) { + cases := []struct { + op byte + want string + }{ + {'R', "read"}, + {'W', "write"}, + {'S', "sync"}, + {'X', "unknown(X)"}, + } + for _, c := range cases { + e := DiskEvent{Op: c.op} + if got := e.OpString(); got != c.want { + t.Errorf("OpString(%c) = %q, want %q", c.op, got, c.want) + } + } +} + func TestDecodeSchedEvent(t *testing.T) { validEvent := SchedEvent{ TimestampNs: 1, @@ -452,24 +437,6 @@ func TestTCPEventTypeStringRoundTrip(t *testing.T) { } } -func TestDiskEventOpStrings(t *testing.T) { - cases := []struct { - op byte - want string - }{ - {'R', "read"}, - {'W', "write"}, - {'S', "sync"}, - {'X', "unknown(X)"}, - } - for _, c := range cases { - e := DiskEvent{Op: c.op} - if got := e.OpString(); got != c.want { - t.Errorf("OpString(%c) = %q, want %q", c.op, got, c.want) - } - } -} - func TestFDOpStringRoundTrip(t *testing.T) { cases := map[FDOp]string{ FDOpOpen: "open", From 3ea905d59cbb6f1999ad0138aa2744c867c8c992 Mon Sep 17 00:00:00 2001 From: Kokila-chandrakar Date: Sun, 7 Jun 2026 13:58:31 +0000 Subject: [PATCH 6/7] fix: gofmt events.go and restore verifier comment in disk_io.c --- internal/bpf/c/disk_io.c | 3 +++ internal/bpf/events.go | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/internal/bpf/c/disk_io.c b/internal/bpf/c/disk_io.c index ecccbd2..d4d0e3f 100644 --- a/internal/bpf/c/disk_io.c +++ b/internal/bpf/c/disk_io.c @@ -55,6 +55,9 @@ int tracepoint_block_rq_complete(struct trace_event_raw_block_rq_completion *ctx e->nr_bytes = (__u64)ctx->nr_sector * 512; e->pid = (__u32)(bpf_get_current_pid_tgid() >> 32); + // The verifier disallows variable-index reads off a tracepoint ctx + // pointer, so copy rwbs into a local buffer via the helper and + // inspect that. With a stack-resident buffer, indexed reads are fine. char rwbs[8] = {}; bpf_probe_read_kernel(rwbs, sizeof(rwbs), ctx->rwbs); char op = rwbs[0]; diff --git a/internal/bpf/events.go b/internal/bpf/events.go index 9c6e717..f884861 100644 --- a/internal/bpf/events.go +++ b/internal/bpf/events.go @@ -140,9 +140,9 @@ type DiskEvent struct { Sector uint64 Dev uint32 PID uint32 - NrBytes uint64 // widened from uint32: merged/discard requests can exceed 8 MiB + NrBytes uint64 // widened from uint32: merged/discard requests can exceed 8 MiB Op byte - Pad0 [7]byte // re-pad to keep struct size a multiple of 8 + Pad0 [7]byte // re-pad to keep struct size a multiple of 8 Comm [TaskCommLen]byte } From e069cd3b7ea554963952e96e7e92d9232d3bc72e Mon Sep 17 00:00:00 2001 From: Kokila-chandrakar Date: Sun, 7 Jun 2026 14:03:53 +0000 Subject: [PATCH 7/7] fix: remove unnecessary uint64 conversions after NrBytes widening --- internal/cli/trace_disk.go | 2 +- internal/collector/disk.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/cli/trace_disk.go b/internal/cli/trace_disk.go index 93c5ebd..6f951eb 100644 --- a/internal/cli/trace_disk.go +++ b/internal/cli/trace_disk.go @@ -162,7 +162,7 @@ func runTraceDisk(ctx context.Context, opts traceDiskOpts) error { event.OpString(), formatLatency(event.Latency()), formatDev(event.Dev), - formatBytes(uint64(event.NrBytes)), + formatBytes(event.NrBytes), ) } } diff --git a/internal/collector/disk.go b/internal/collector/disk.go index dc796dc..b4afb26 100644 --- a/internal/collector/disk.go +++ b/internal/collector/disk.go @@ -104,11 +104,11 @@ func (c *DiskIOCollector) record(event *bpf.DiskEvent) { case 'R': c.readHist.Record(event.LatencyNs) c.reads++ - c.rdBytes += uint64(event.NrBytes) + c.rdBytes += event.NrBytes case 'W': c.writeHist.Record(event.LatencyNs) c.writes++ - c.wrBytes += uint64(event.NrBytes) + c.wrBytes += event.NrBytes case 'S': c.syncHist.Record(event.LatencyNs) c.syncs++