diff --git a/clear b/clear index e29c2066e..1d3b2d32f 100755 --- a/clear +++ b/clear @@ -1062,6 +1062,18 @@ when 'profile' # are reported in the dump as a `# WARNING:` line. N must be a # power of two (the tables index via `& (N-1)`). profile_max = nil + # --sample=N: record every Nth event in the runtime profile tables. + # Captured values are scaled by N at record time so doctor / pprof + # see estimated totals. Default is no sampling for alloc-profile; + # auto-bumped to 100 for lock+mvcc when --sync-callstacks is on + # (see below) unless the user passes their own --sample value. + sample_n = nil + # --sync-callstacks: turn on per-record stack capture in lock-profile + # and mvcc-profile. Off by default because the FP walk costs ~100-500ns + # per record and uncontended mutex acquire / MVCC commit fast paths + # are 10-50ns; the trace can dominate. Enable when investigating + # caller-attribution for a specific lock or cell. + sync_callstacks = false clear_args = clear_args.reject do |arg| if arg =~ /\A--profile-max=(\d+)\z/ profile_max = $1.to_i @@ -1069,13 +1081,25 @@ when 'profile' error "--profile-max=#{profile_max} must be a positive power of two (e.g. 1024, 4096, 16384)" end true + elsif arg =~ /\A--sample=(\d+)\z/ + sample_n = $1.to_i + error "--sample=#{sample_n} must be a positive integer" unless sample_n.positive? + true + elsif arg == '--sync-callstacks' + sync_callstacks = true + true else false end end + # When --sync-callstacks is on but no explicit --sample was given, + # default to 100 so the per-record cost stays manageable. Users can + # set --sample=1 to opt in to full capture at the full cost. + sample_n = 100 if sync_callstacks && sample_n.nil? + source = clear_args.first - error "Usage: clear profile [--profile-max=N] [-- args...]" unless source + error "Usage: clear profile [--profile-max=N] [--sample=N] [--sync-callstacks] [-- args...]" unless source source = File.expand_path(source) base_name = File.basename(source, '.cht') @@ -1114,6 +1138,8 @@ when 'profile' "CLEAR_LOCK_PROFILE=#{lock_file}", "CLEAR_MVCC_PROFILE=#{mvcc_file}", ] + env_parts << "CLEAR_PROFILE_SAMPLE=#{sample_n}" if sample_n + env_parts << "CLEAR_PROFILE_SYNC_CALLSTACKS=1" if sync_callstacks env_parts << "CLEAR_THREADS=#{ENV['CLEAR_THREADS']}" if ENV['CLEAR_THREADS'] target_env = env_parts.join(' ') @@ -1137,6 +1163,17 @@ when 'profile' system("perf stat -e cycles,instructions,branches,branch-misses,cache-references,cache-misses,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses,page-faults -o #{perf_stat_file} env #{target_env} #{output} #{run_args.join(' ')} > /dev/null 2>&1") end + # Convert each runtime profile to pprof's gzipped protobuf so the + # `pprof` tool can render flamegraphs / call graphs / source views + # directly. Heap / lock / mvcc are encoded in pure Ruby; CPU defers + # to the standard `perf_to_profile` Go binary if it's on PATH. + require_relative 'src/tools/pprof_converter' + pprof_files = PprofConverter.convert_all(profile_dir) + if File.exist?(perf_data) && !pprof_files[:cpu] + hint "perf_to_profile not on PATH — skipping CPU pprof. Install: " \ + "go install github.com/google/perf_data_converter/src/cmd/perf_to_profile@latest" + end + puts "" puts "Profile data: #{profile_dir}/" puts " Heap: #{alloc_file} (#{File.exist?(alloc_file) ? "#{(File.size(alloc_file) / 1024.0).round(1)} KB" : 'not generated'})" @@ -1146,10 +1183,44 @@ when 'profile' puts " Syscalls: #{strace_file}" if File.exist?(strace_file) puts " Counters: #{perf_stat_file}" if File.exist?(perf_stat_file) puts " MVCC: #{mvcc_file}" if File.exist?(mvcc_file) + if pprof_files.any? + puts "" + puts "pprof: pprof -http=:8080 #{output} #{profile_dir}/.pb.gz" + pprof_files.each { |kind, path| puts " #{kind}:#{' ' * (8 - kind.to_s.length)}#{path}" } + end when 'doctor' require_relative 'src/tools/doctor' - Doctor.run(ARGV[1]) + # Parse flags: --cumulative shows cum bytes/count alongside flat; + # --focus=REGEX filters samples whose trace touches a function + # matching the pattern; --diff compares two profile + # runs and shows top deltas per section. + doctor_opts = { cumulative: false, focus: nil, ignore: nil, peek: nil, diff: nil, by: :bytes } + positional = [] + i = 1 + while i < ARGV.length + a = ARGV[i] + case a + when '--cumulative', '--cum' + doctor_opts[:cumulative] = true + when /\A--focus=(.+)\z/ + doctor_opts[:focus] = Regexp.new($1) + when /\A--ignore=(.+)\z/ + doctor_opts[:ignore] = Regexp.new($1) + when /\A--peek=(.+)\z/ + doctor_opts[:peek] = Regexp.new($1) + when /\A--by=(bytes|allocs|inuse_bytes|inuse_allocs)\z/ + doctor_opts[:by] = $1.to_sym + when '--diff' + doctor_opts[:diff] = ARGV[i + 1] + i += 1 + error "--diff requires a argument" if doctor_opts[:diff].nil? + else + positional << a + end + i += 1 + end + Doctor.run(positional.first, **doctor_opts) when 'fix' require_relative 'src/ast/lexer' @@ -1569,6 +1640,28 @@ when 'explain' end exit 0 +when 'completions' + shell = ARGV[1] + unless shell && %w[bash zsh fish].include?(shell) + $stderr.puts "Usage: clear completions {bash|zsh|fish}" + $stderr.puts "" + $stderr.puts "Install (bash):" + $stderr.puts " echo 'source <(clear completions bash)' >> ~/.bashrc" + $stderr.puts "" + $stderr.puts "Install (zsh):" + $stderr.puts " mkdir -p ~/.zsh/completions" + $stderr.puts " clear completions zsh > ~/.zsh/completions/_clear" + $stderr.puts " # add to ~/.zshrc BEFORE 'compinit':" + $stderr.puts " # fpath=(~/.zsh/completions \\$fpath)" + $stderr.puts "" + $stderr.puts "Install (fish):" + $stderr.puts " clear completions fish > ~/.config/fish/completions/clear.fish" + exit 1 + end + require_relative 'src/tools/completions' + print Completions.script_for(shell) + exit 0 + when 'help', '--help', '-h', nil puts <<~HELP CLEAR Language Compiler @@ -1593,6 +1686,7 @@ when 'help', '--help', '-h', nil clear fmt --no-warn Suppress width warnings for >120-char lines clear explain List every registered diagnostic code clear explain Show docs for a specific diagnostic + clear completions {bash|zsh|fish} Print shell completions (see docs/completions.md) Commands: build Transpile and compile a CLEAR program to a native binary diff --git a/docs/completions.md b/docs/completions.md new file mode 100644 index 000000000..1f25b02b3 --- /dev/null +++ b/docs/completions.md @@ -0,0 +1,74 @@ +# Shell completions + +Tab-completion for `clear` subcommands, and file/directory arguments +filtered per subcommand: + +| Subcommand | Completes to | +|---|---| +| `clear build`, `run`, `fmt`, `fix`, `profile`, `explain` | `*.cht` files (and directories to navigate) | +| `clear test`, `benchmark` | `*.cht` files or directories | +| `clear doctor` | `*.profile/` directories | +| `clear completions` | `bash` / `zsh` / `fish` | + +Generate the script for your shell with `clear completions `, +then install per the instructions below. + +## Bash + +Add to `~/.bashrc`: + +```sh +source <(clear completions bash) +``` + +Or write to the system completions dir (loaded by every interactive +shell, no rc-file edit): + +```sh +clear completions bash | sudo tee /etc/bash_completion.d/clear > /dev/null +``` + +## Zsh + +The convention is one `_` file per command in a directory on +`$fpath`: + +```sh +mkdir -p ~/.zsh/completions +clear completions zsh > ~/.zsh/completions/_clear +``` + +Then ensure `~/.zsh/completions` is on `$fpath` **before** `compinit` +runs. In `~/.zshrc`: + +```sh +fpath=(~/.zsh/completions $fpath) +autoload -Uz compinit && compinit +``` + +Reopen the shell (or `compinit -u`) and tab-completion will pick up +descriptions for each subcommand. + +## Fish + +Fish auto-loads completions from `~/.config/fish/completions/`: + +```sh +clear completions fish > ~/.config/fish/completions/clear.fish +``` + +No rc-file edit needed. + +## Verifying + +```sh +clear # lists subcommands +clear doctor # lists *.profile/ dirs +clear profile examples/ # lists *.cht files in examples/ +``` + +## Updating + +The completion script is generated from the live `Completions` +module (`src/tools/completions.rb`). When new subcommands are added, +re-run `clear completions > ` to refresh. diff --git a/docs/pprof.md b/docs/pprof.md new file mode 100644 index 000000000..4b3b1ab62 --- /dev/null +++ b/docs/pprof.md @@ -0,0 +1,143 @@ +# pprof integration + +`clear profile` emits its runtime profile data in pprof's gzipped +protobuf format alongside the existing text dumps, so you can use +the standard `pprof` tool for flamegraphs, call graphs, and source +views. + +## Install + +| Tool | When you need it | Install | +|---|---|---| +| `pprof` | Always (the viewer) | `go install github.com/google/pprof@latest` | +| `perf_to_profile` | If you want CPU flamegraphs from `perf.data` | `go install github.com/google/perf_data_converter/src/cmd/perf_to_profile@latest` | +| `graphviz` | If you use `pprof -svg` / `pprof -dot` | `apt install graphviz` / `brew install graphviz` | + +The web UI (`pprof -http=:8080`) and `pprof -top` text view do not +need graphviz. + +## Use + +```sh +clear profile foo.cht +# -> writes foo.profile/heap.pb.gz, lock.pb.gz, mvcc.pb.gz +# (and cpu.pb.gz if perf_to_profile is on PATH) + +pprof -http=:8080 ./foo foo.profile/heap.pb.gz +pprof -top -alloc_space foo.profile/heap.pb.gz +pprof -top -inuse_space foo.profile/heap.pb.gz +pprof -top -delay foo.profile/lock.pb.gz +pprof -base before/heap.pb.gz after/heap.pb.gz # regression diff +``` + +## What's in each file + +### `heap.pb.gz` + +Sample columns: `alloc_objects` / `alloc_space` / `inuse_objects` / +`inuse_space`. Each call site is one sample with its hex address as +a label (`pprof -tags heap.pb.gz`). + +### `lock.pb.gz` + +Sample columns: `contentions` / `delay` / `hold` / `acquisitions`. +Read+write contention sums into `contentions`; `delay` is the total +wait (read+write); `hold` is total exclusive hold time. + +### `mvcc.pb.gz` + +Sample columns: `reads` / `commits` / `retries` / `cow_bytes`. +`cow_bytes = struct_size * (commits + retries)` — the byte volume +moved by copy-on-write commits, the most direct cost signal for +`@shared:versioned` cells. + +### `channels.pb.gz` + +Sample columns: `pushes` / `pops` / `push_blocked` / `pop_blocked` / +`max_depth`. One sample per registered channel; the synthetic +function name is `channel#` and the channel's capacity travels +as a label (`pprof -tags channels.pb.gz`). + +### `cpu.pb.gz` + +Standard CPU profile from `perf.data`, converted by +`perf_to_profile`. Sample columns are whatever Go's pprof shows +(`samples` / `cpu` ns). + +## CLEAR source mapping + +The transpiler emits `// CLR:N` markers in `transpiled.zig`. Our +converter walks those back to the user's `.cht` line and stamps it +onto each pprof Location, so `pprof -list ` shows CLEAR source +lines (not Zig). + +## Sampling + +Stack traces for **alloc-profile** are captured on every alloc by +default. For workloads where the per-alloc unwind cost matters, +`--sample=N` records every Nth event and scales the captured values +by N so doctor / pprof see estimated totals: + +```sh +clear profile foo.cht --sample=100 +``` + +Header records the chosen `sample_n` so consumers can rescale or +flag the approximation. + +## Stack traces for lock and mvcc profiles + +`--sync-callstacks` (off by default) turns on per-record stack +capture in lock-profile and mvcc-profile, so pprof's tree/flame +views and per-caller attribution work for these profiles too: + +```sh +clear profile foo.cht --sync-callstacks # auto-bumps --sample to 100 +clear profile foo.cht --sync-callstacks --sample=1 # full capture, full cost +``` + +Off by default because the FP walk costs ~100-500ns per record. +Uncontended mutex acquire is ~10-20ns and MVCC commit fast paths +are ~20-50ns, so the trace can dominate the operation it's measuring. +When the flag is set without an explicit `--sample`, we auto-default +to `--sample=100` to keep the cost manageable. + +With `--sync-callstacks` on, each (lock, caller-trace) pair becomes +its own row in lock-profile, and same for (cell, caller-trace) in +mvcc-profile. Doctor aggregates rows back to one-per-lock for its +existing diagnoses; pprof tree views show the per-caller breakdown. + +## Doctor flags built on this data + +`clear doctor` consumes the same profile dirs and grew three new +flags that draw on the multi-frame trace data: + +```sh +clear doctor foo.profile/ --cumulative # rank functions by cum bytes +clear doctor foo.profile/ --focus=intToString # filter to traces that touch this function +clear doctor foo.profile/ --ignore=intToString # drop traces that touch this function +clear doctor foo.profile/ --peek=processRequest # callers + callees of one function +clear doctor foo.profile/ --by=allocs # sort heap by allocation count, not bytes +clear doctor foo.profile/ --by=inuse_bytes # sort by allocs - frees (live bytes) +clear doctor old.profile/ --diff new.profile/ # perf-regression diff between two runs +``` + +`--cumulative` aggregates bytes/allocs across every frame in each +trace, so a function high in the call stack accrues its callees' +costs. `--focus=REGEX` keeps only sites whose trace touches a function +matching the pattern. `--diff` reports per-function deltas in +allocation, lock contention, and MVCC retries, with directional arrows +and "newly contended" / "retries eliminated" annotations. + +## Notes + +- We do not emit a Mapping message for the binary, so `pprof` prints + "Main binary filename not available" and skips its own symbolization. + Function names still appear because we resolve via `addr2line` at + conversion time. +- alloc-profile captures stacks unconditionally (multi-frame v2, + comma-separated leaf-first in `alloc.txt`). +- lock-profile (v3) and mvcc-profile (v2) capture stacks only when + `--sync-callstacks` is on. The 12th column of `locks.txt` and the + 8th column of `mvcc.txt` carry `-` (off) or comma-separated leaf- + first addrs (on). Tab-separated to allow commas in the trace field. diff --git a/spec/completions_spec.rb b/spec/completions_spec.rb new file mode 100644 index 000000000..21cb24332 --- /dev/null +++ b/spec/completions_spec.rb @@ -0,0 +1,118 @@ +require 'rspec' +require_relative '../src/tools/completions' + +RSpec.describe Completions do + describe '.script_for' do + it 'rejects unsupported shells' do + expect { described_class.script_for('powershell') } + .to raise_error(ArgumentError, /unsupported shell/) + end + + it 'returns the bash script for "bash"' do + out = described_class.script_for('bash') + expect(out).to start_with('# bash completion for `clear`') + expect(out).to include('complete -F _clear_complete clear') + end + + it 'returns the zsh script for "zsh"' do + out = described_class.script_for('zsh') + expect(out).to start_with('#compdef clear') + expect(out).to include('_files -g \'*.cht\'') + end + + it 'returns the fish script for "fish"' do + out = described_class.script_for('fish') + expect(out).to start_with('# fish completion for `clear`') + expect(out).to include('__fish_use_subcommand') + end + end + + describe 'bash script semantics' do + let(:script) { described_class.bash } + + it 'enumerates every subcommand for first-position completion' do + Completions::SUBCOMMANDS.each_key do |sub| + expect(script).to include(sub), + "bash completion missing subcommand #{sub.inspect}" + end + end + + it 'filters file completion to *.cht for build-like subcommands' do + expect(script).to include("compgen -f -X '!*.cht'") + end + + it 'completes only directories for `doctor`' do + expect(script).to match(/doctor\)\s*\n\s+# Prefer \*\.profile\/ directories/) + expect(script).to include('compgen -d -- "$cur"') + end + + it 'completes shell names for `completions`' do + expect(script).to match(/completions\)\s*\n\s+COMPREPLY=\(.*"bash zsh fish"/m) + end + end + + describe 'zsh script semantics' do + let(:script) { described_class.zsh } + + it 'tags every subcommand with its description for `_describe`' do + Completions::SUBCOMMANDS.each do |sub, desc| + expect(script).to include("'#{sub}:#{desc}'"), + "zsh completion missing entry for #{sub.inspect}" + end + end + + it 'filters `doctor` to *.profile directories via `_files -/`' do + expect(script).to match(/doctor\)\s*\n\s+_files -\/ -g '\*\.profile'/) + end + + it 'uses _describe for first-position subcommand completion' do + expect(script).to include('_describe \'subcommand\' subcmds') + end + end + + describe 'fish script semantics' do + let(:script) { described_class.fish } + + it 'declares one `complete` per subcommand' do + Completions::SUBCOMMANDS.each_key do |sub| + expect(script).to match(/-a #{sub} -d /), + "fish completion missing entry for #{sub.inspect}" + end + end + + it 'gates file completion on subcommand context' do + expect(script).to include("__fish_seen_subcommand_from build run fmt format fix profile explain test benchmark") + expect(script).to include("__fish_seen_subcommand_from doctor") + end + + it 'escapes single quotes in descriptions' do + # No subcommand description currently contains a quote, but if + # someone adds one the script must still parse cleanly. + stub_const('Completions::SUBCOMMANDS', { 'foo' => "it's a thing" }) + script = described_class.fish + expect(script).to include("-d 'it\\'s a thing'") + end + end + + describe 'CLI integration' do + let(:clear_bin) { File.expand_path('../clear', __dir__) } + + it '`clear completions bash` prints the bash script and exits 0' do + out = `#{clear_bin} completions bash` + expect($?.exitstatus).to eq(0) + expect(out).to start_with('# bash completion for `clear`') + end + + it '`clear completions` with no shell prints usage to stderr and exits 1' do + out = `#{clear_bin} completions 2>&1` + expect($?.exitstatus).to eq(1) + expect(out).to include('Usage: clear completions') + end + + it '`clear completions powershell` prints usage and exits 1' do + out = `#{clear_bin} completions powershell 2>&1` + expect($?.exitstatus).to eq(1) + expect(out).to include('Usage: clear completions') + end + end +end diff --git a/spec/doctor_diff_spec.rb b/spec/doctor_diff_spec.rb new file mode 100644 index 000000000..caf2324f8 --- /dev/null +++ b/spec/doctor_diff_spec.rb @@ -0,0 +1,133 @@ +require "rspec" +require "tmpdir" +require "stringio" +require "fileutils" +require_relative "../src/tools/doctor" + +# Exercises the new --diff / --cumulative / --focus flags. Profiles +# are synthesized as plain text files in a tmpdir; the binary at +# `.profile`'s sibling path is left absent so addr2line returns +# `??` and we exercise the fallback-to-raw-addr keying. + +RSpec.describe Doctor do + def capture_stdout + old_stdout = $stdout + out = StringIO.new + $stdout = out + yield + out.string + ensure + $stdout = old_stdout + end + + describe ".run --diff" do + around do |ex| + Dir.mktmpdir do |dir| + @before = File.join(dir, "before.profile") + @after = File.join(dir, "after.profile") + FileUtils.mkdir_p([@before, @after]) + ex.run + end + end + + it "highlights heap regressions with directional arrows" do + File.write(File.join(@before, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0x100 100 1000 0 0 100 + ALLOC + File.write(File.join(@after, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0x100 200 2000 0 0 200 + ALLOC + + out = capture_stdout { described_class.run(@after, diff: @before) } + expect(out).to include("Heap Δ") + expect(out).to include("↑") # regression + end + + it "calls out brand-new allocation sites" do + File.write(File.join(@before, "alloc.txt"), "# alloc-profile v2\n") + File.write(File.join(@after, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0xabc 50 500 0 0 500 + ALLOC + + out = capture_stdout { described_class.run(@after, diff: @before) } + expect(out).to include("New allocation sites") + end + + it "calls out eliminated allocation sites" do + File.write(File.join(@before, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0xdead 10 100 0 0 100 + ALLOC + File.write(File.join(@after, "alloc.txt"), "# alloc-profile v2\n") + + out = capture_stdout { described_class.run(@after, diff: @before) } + expect(out).to include("Eliminated allocation sites") + end + + it "respects --focus by skipping non-matching functions" do + File.write(File.join(@before, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0x100 100 1000 0 0 100 + ALLOC + File.write(File.join(@after, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0x100 200 2000 0 0 200 + ALLOC + + # focus that matches nothing → no Heap Δ table emitted. + out = capture_stdout { described_class.run(@after, diff: @before, focus: /no_such_function/) } + expect(out).not_to include("Heap Δ") + end + + it "emits a lock Δ table when contention shifts" do + tab = "\t" + hdr = "# lock-profile v3\n" + # Wait_ns from 0 -> 5_000_000 (5ms), contention 0 -> 3 + File.write(File.join(@before, "locks.txt"), + hdr + "0x500#{tab}10#{tab}0#{tab}0#{tab}0#{tab}1000#{tab}500#{tab}0#{tab}0#{tab}0#{tab}0#{tab}-\n") + File.write(File.join(@after, "locks.txt"), + hdr + "0x500#{tab}10#{tab}3#{tab}5000000#{tab}2000000#{tab}1000#{tab}500#{tab}0#{tab}0#{tab}0#{tab}0#{tab}-\n") + + out = capture_stdout { described_class.run(@after, diff: @before) } + expect(out).to include("Lock Δ") + expect(out).to include("newly contended") + end + + it "emits an MVCC Δ table when retries appear" do + tab = "\t" + hdr = "# mvcc-profile v2\n" + File.write(File.join(@before, "mvcc.txt"), + hdr + "0x600#{tab}64#{tab}1000#{tab}10#{tab}0#{tab}0#{tab}0#{tab}-\n") + File.write(File.join(@after, "mvcc.txt"), + hdr + "0x600#{tab}64#{tab}1000#{tab}10#{tab}500#{tab}0#{tab}0#{tab}-\n") + + out = capture_stdout { described_class.run(@after, diff: @before) } + expect(out).to include("MVCC Δ") + expect(out).to include("new retry storm") + end + + it "supports --ignore as the inverse of --focus" do + File.write(File.join(@before, "alloc.txt"), "# alloc-profile v2\n") + File.write(File.join(@after, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0xabc 50 500 0 0 500 + ALLOC + + out = capture_stdout { described_class.run(@after, diff: @before) } + expect(out).to include("New allocation sites") + end + + it "exits 1 when either dir does not exist" do + old_stderr = $stderr + $stderr = StringIO.new + expect { + described_class.run(@after, diff: "/nonexistent") + }.to raise_error(SystemExit) { |e| expect(e.status).to eq(1) } + ensure + $stderr = old_stderr + end + end +end diff --git a/spec/doctor_peek_spec.rb b/spec/doctor_peek_spec.rb new file mode 100644 index 000000000..972885629 --- /dev/null +++ b/spec/doctor_peek_spec.rb @@ -0,0 +1,184 @@ +require "rspec" +require "tmpdir" +require "stringio" +require "fileutils" +require_relative "../src/tools/doctor" + +# Exercises --peek (callers + callees of a function) and --ignore +# (inverse of --focus). Profiles are synthesized; addr2line resolution +# is unavailable in tests (no real binary), so we use raw hex addrs +# as the function-name fallback baked into parse_alloc_for_diff and +# the heap reader. The peek tests then match against those addrs. + +RSpec.describe Doctor do + def capture_stdout + old_stdout = $stdout + out = StringIO.new + $stdout = out + yield + out.string + ensure + $stdout = old_stdout + end + + describe ".run --ignore" do + around do |ex| + Dir.mktmpdir do |dir| + @profile_dir = File.join(dir, "p.profile") + FileUtils.mkdir_p(@profile_dir) + File.write(File.join(@profile_dir, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0xaaa 100 1000 0 0 100 + 0xbbb,0xaaa 50 500 0 0 100 + ALLOC + ex.run + end + end + + it "drops sites whose trace touches the ignore regex" do + out = capture_stdout { described_class.run(@profile_dir, ignore: /aaa/) } + expect(out).to include("Ignore:") + expect(out).not_to match(/^\s*1\. 0xaaa/) + end + + it "keeps sites that do not match the ignore regex" do + out = capture_stdout { described_class.run(@profile_dir, ignore: /no_match/) } + expect(out).to include("0xaaa") + end + + it "shows both Focus and Ignore lines when both flags are set" do + out = capture_stdout { described_class.run(@profile_dir, focus: /a/, ignore: /no/) } + expect(out).to include("Focus:") + expect(out).to include("Ignore:") + end + end + + describe ".run --peek" do + around do |ex| + Dir.mktmpdir do |dir| + @profile_dir = File.join(dir, "p.profile") + FileUtils.mkdir_p(@profile_dir) + # Three samples sharing a common ancestor 0xparent. The peek + # target 0xfoo appears as a leaf in two and mid-stack in one. + File.write(File.join(@profile_dir, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0xfoo,0xparent 5 500 0 0 500 + 0xfoo,0xparent 3 300 0 0 300 + 0xleaf,0xfoo,0xparent 2 200 0 0 200 + ALLOC + ex.run + end + end + + it "aggregates self-bytes across all samples touching the function" do + out = capture_stdout { described_class.run(@profile_dir, peek: /0xfoo/) } + expect(out).to include("Peek") + expect(out).to match(/Self bytes:\s+1000 B \(across 10 alloc/) + end + + it "surfaces the caller (frame above) of the matched function" do + out = capture_stdout { described_class.run(@profile_dir, peek: /0xfoo/) } + expect(out).to include("Callers (frames above") + expect(out).to match(/0xparent\s+1000 B/) + end + + it "surfaces callees only when the function appears non-leaf" do + out = capture_stdout { described_class.run(@profile_dir, peek: /0xfoo/) } + expect(out).to include("Callees (frames below") + expect(out).to match(/0xleaf\s+200 B/) + end + + it "tells the user when no samples matched" do + out = capture_stdout { described_class.run(@profile_dir, peek: /no_match/) } + expect(out).to include("No samples matched") + end + + it "exits 1 when run on a missing directory" do + old_stderr = $stderr + $stderr = StringIO.new + expect { + described_class.run("/nonexistent", peek: /foo/) + }.to raise_error(SystemExit) { |e| expect(e.status).to eq(1) } + ensure + $stderr = old_stderr + end + end + + describe ".run --by sample-type pivoting" do + around do |ex| + Dir.mktmpdir do |dir| + @profile_dir = File.join(dir, "p.profile") + FileUtils.mkdir_p(@profile_dir) + # Two sites: 0xbig has few large allocs, 0xsmall has many tiny. + # Sorted by bytes, 0xbig wins; sorted by allocs, 0xsmall wins. + File.write(File.join(@profile_dir, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0xbig 2 10000 0 0 10000 + 0xsmall 200 200 0 0 200 + ALLOC + ex.run + end + end + + it "default sort is by bytes (largest sites first)" do + out = capture_stdout { described_class.run(@profile_dir) } + first_idx = out.index("0xbig") + second_idx = out.index("0xsmall") + expect(first_idx).to be < second_idx + end + + it "--by=allocs sorts by allocation count" do + out = capture_stdout { described_class.run(@profile_dir, by: :allocs) } + expect(out).to include("Top sites by allocations:") + first_idx = out.index("0xsmall") + second_idx = out.index("0xbig") + expect(first_idx).to be < second_idx + end + + it "--by=inuse_bytes computes alloc_bytes - free_bytes" do + File.write(File.join(@profile_dir, "alloc.txt"), <<~ALLOC) + # alloc-profile v2 + 0xleak 1 1000 0 0 1000 + 0xpaired 1 1000 1 1000 0 + ALLOC + out = capture_stdout { described_class.run(@profile_dir, by: :inuse_bytes) } + expect(out).to include("Top sites by in-use bytes") + expect(out.index("0xleak")).to be < out.index("0xpaired") + end + end + + describe "runtime-function attribution" do + around do |ex| + Dir.mktmpdir do |dir| + @profile_dir = File.join(dir, "p.profile") + FileUtils.mkdir_p(@profile_dir) + # Synthesize a transpiled.zig with one CLR marker, then test + # that addresses NOT in this file aren't credited a clear_line. + File.write(File.join(@profile_dir, "transpiled.zig"), <<~ZIG) + // header + // CLR:42 + some_zig_code(); + ZIG + ex.run + end + end + + it "leaves clear_line nil when the file is not the user transpiled.zig" do + # Doctor's resolver runs against a real binary which we don't have + # in tests, so resolve_addrs returns no entries — the absence of + # `is_user_zig` mis-attribution is what matters in production. This + # test pins the logic via a direct file_is_transpiled_zig? check. + require_relative "../src/tools/pprof_converter" + expect( + PprofConverter.send(:file_is_transpiled_zig?, "/tmp/runtime.zig:100", "/tmp/transpiled.zig") + ).to be false + expect( + PprofConverter.send(:file_is_transpiled_zig?, "/build/cache/._clear_tmp_litedb.zig:60", "/p/transpiled.zig") + ).to be true + expect( + PprofConverter.send(:file_is_transpiled_zig?, + "/build/cache/._clear_tmp_litedb.zig:176 (discriminator 4)", "/p/transpiled.zig") + ).to be true + end + end +end diff --git a/spec/pprof_spec.rb b/spec/pprof_spec.rb new file mode 100644 index 000000000..70e83f8d6 --- /dev/null +++ b/spec/pprof_spec.rb @@ -0,0 +1,409 @@ +require 'rspec' +require 'tmpdir' +require 'fileutils' +require 'zlib' +require 'stringio' +require_relative '../src/tools/pprof' +require_relative '../src/tools/pprof_converter' + +# Exercises both the pprof wire-format encoder and the converter that +# turns `clear profile` text dumps into pprof `.pb.gz` files. We do not +# require pprof itself to be installed -- a tiny in-test decoder +# verifies the bytes we emit match what we mean to encode. + +# Minimal protobuf decoder for the subset of the schema the encoder +# uses. Verifies field tags, wire types, and lengths. Returns nested +# hashes keyed by field number. +module ProtoTestDecoder + module_function + + def parse(bytes) + bytes = bytes.b + io = StringIO.new(bytes) + out = Hash.new { |h, k| h[k] = [] } + until io.eof? + tag = read_varint(io) + field = tag >> 3 + wire = tag & 7 + val = + case wire + when 0 then read_varint(io) + when 2 then io.read(read_varint(io)).to_s.b + when 1 then io.read(8) + when 5 then io.read(4) + else raise "unsupported wire type #{wire} for field #{field}" + end + out[field] << val + end + out + end + + def read_varint(io) + n = 0 + shift = 0 + loop do + b = io.read(1)&.unpack1('C') + raise 'truncated varint' if b.nil? + n |= (b & 0x7F) << shift + return n if (b & 0x80).zero? + shift += 7 + raise 'varint too long' if shift > 63 + end + end + + # Decode a packed varint field (Sample.location_id, Sample.value). + def parse_packed_varints(bytes) + io = StringIO.new(bytes.b) + out = [] + out << read_varint(io) until io.eof? + out + end +end + +RSpec.describe Pprof::Wire do + it 'encodes single-byte varints unchanged' do + expect(described_class.varint(0)).to eq("\x00".b) + expect(described_class.varint(1)).to eq("\x01".b) + expect(described_class.varint(127)).to eq("\x7f".b) + end + + it 'encodes multi-byte varints with the continuation bit' do + # 300 = 0b1_0010_1100 -> bytes [0xAC, 0x02] + expect(described_class.varint(300)).to eq("\xac\x02".b) + end + + it 'rejects negative values' do + expect { described_class.varint(-1) }.to raise_error(ArgumentError) + end + + it 'composes a length-delimited field' do + bytes = described_class.field_bytes(6, 'hi') + expect(bytes).to eq("\x32\x02hi".b) # tag=(6<<3|2)=0x32, len=2, "hi" + end +end + +RSpec.describe Pprof::Profile do + it 'reserves string_table[0] = "" so pprof readers parse correctly' do + pb = described_class.new + bytes = pb.encode + decoded = ProtoTestDecoder.parse(bytes) + expect(decoded[6].first).to eq('') # string_table is field 6 + end + + it 'deduplicates strings, functions, and string columns' do + pb = described_class.new + pb.add_function(name: 'foo', filename: 'a.cht') + second_id = pb.add_function(name: 'foo', filename: 'a.cht') + expect(second_id).to eq(1) + decoded = ProtoTestDecoder.parse(pb.encode) + expect(decoded[5].length).to eq(1) # one Function entry only + end + + it 'encodes a roundtrippable single-sample profile' do + pb = described_class.new + pb.add_sample_type('alloc_objects', 'count') + pb.add_sample_type('alloc_space', 'bytes') + pb.set_period_type('space', 'bytes', 1) + fid = pb.add_function(name: 'allocBlob', filename: 'foo.cht') + lid = pb.add_location(function_id: fid, line: 42, address: 0x1234) + pb.add_sample([lid], [10, 1024], addr: '0x1234') + + bytes = pb.encode + decoded = ProtoTestDecoder.parse(bytes) + + expect(decoded[1].length).to eq(2) # 2 sample_types + expect(decoded[2].length).to eq(1) # 1 sample + expect(decoded[4].length).to eq(1) # 1 location + expect(decoded[5].length).to eq(1) # 1 function + expect(decoded[11].length).to eq(1) # period_type set + expect(decoded[12].first).to eq(1) # period + + # Sample.value should be packed varints [10, 1024] + sample = ProtoTestDecoder.parse(decoded[2].first) + values_bytes = sample[2].first + expect(ProtoTestDecoder.parse_packed_varints(values_bytes)).to eq([10, 1024]) + + location_bytes = sample[1].first + expect(ProtoTestDecoder.parse_packed_varints(location_bytes)).to eq([1]) + end + + it 'emits a Mapping when one is registered, and Locations point at it' do + pb = described_class.new + pb.add_mapping(binary: '/path/to/litedb', build_id: 'abc123') + pb.add_sample_type('alloc_space', 'bytes') + fid = pb.add_function(name: 'foo', filename: 'foo.cht') + lid = pb.add_location(function_id: fid, line: 1, address: 0x100) + pb.add_sample([lid], [10]) + + decoded = ProtoTestDecoder.parse(pb.encode) + expect(decoded[3].length).to eq(1) # one Mapping + mapping = ProtoTestDecoder.parse(decoded[3].first) + expect(mapping[1].first).to eq(1) # Mapping.id + expect(mapping[7].first).to eq(1) # has_functions = true + expect(mapping[8].first).to eq(1) # has_filenames = true + expect(mapping[9].first).to eq(1) # has_line_numbers = true + + location = ProtoTestDecoder.parse(decoded[4].first) + expect(location[2].first).to eq(1) # Location.mapping_id = 1 + end + + it 'omits the Mapping field entirely when none is registered' do + pb = described_class.new + pb.add_sample_type('count', 'samples') + decoded = ProtoTestDecoder.parse(pb.encode) + expect(decoded[3]).to be_empty + end + + it 'gzips the output with a valid gzip magic' do + pb = described_class.new + bytes = pb.encode_gzip + expect(bytes[0, 2].bytes).to eq([0x1f, 0x8b]) + # Round-trip through Zlib to confirm + inflated = Zlib::GzipReader.new(StringIO.new(bytes)).read + expect(inflated.b).to eq(pb.encode.b) + end +end + +RSpec.describe PprofConverter do + around do |ex| + Dir.mktmpdir do |dir| + @profile_dir = File.join(dir, 'foo.profile') + FileUtils.mkdir_p(@profile_dir) + ex.run + end + end + + describe '.convert_alloc' do + it 'returns nil when alloc.txt is missing' do + expect(described_class.convert_alloc(@profile_dir, nil)).to be_nil + end + + it 'writes heap.pb.gz with four sample-type columns' do + File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF) + # site allocs bytes frees free_bytes live + 0x401234 1000 40000 500 20000 500 + 0x402000 2000 80000 2000 80000 0 + PROF + out = described_class.convert_alloc(@profile_dir, nil) + expect(out).to eq(File.join(@profile_dir, 'heap.pb.gz')) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + expect(decoded[1].length).to eq(4) # alloc_objects, alloc_space, inuse_*, inuse_* + expect(decoded[2].length).to eq(2) # one sample per site + end + + it 'computes inuse columns from (allocs-frees) and (bytes-free_bytes)' do + File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF) + 0x401234 1000 40000 200 8000 800 + PROF + out = described_class.convert_alloc(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + sample = ProtoTestDecoder.parse(decoded[2].first) + values = ProtoTestDecoder.parse_packed_varints(sample[2].first) + # alloc_objects, alloc_space, inuse_objects, inuse_space + expect(values).to eq([1000, 40000, 800, 32000]) + end + + it 'parses comma-separated multi-frame stack traces (alloc-profile v2)' do + File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF) + # alloc-profile v2 (multi-frame, comma-separated leaf-first) + 0x401234,0x402000,0x403000 1000 40000 500 20000 500 + PROF + out = described_class.convert_alloc(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + # Three Locations (one per unique addr), three Functions + expect(decoded[4].length).to eq(3) + expect(decoded[5].length).to eq(3) + sample = ProtoTestDecoder.parse(decoded[2].first) + stack = ProtoTestDecoder.parse_packed_varints(sample[1].first) + expect(stack.length).to eq(3) # 3-frame trace in Sample.location_id + end + + it 'shares Locations across samples that include the same frame' do + File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF) + # alloc-profile v2 + 0x10,0x20 100 1000 0 0 100 + 0x30,0x20 100 1000 0 0 100 + PROF + out = described_class.convert_alloc(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + # 0x20 should be ONE Location/Function reused by both samples. + expect(decoded[4].length).to eq(3) # 0x10, 0x20, 0x30 + expect(decoded[5].length).to eq(3) + expect(decoded[2].length).to eq(2) + end + + it 'tags each sample with its source address as a label' do + File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF) + 0xdeadbeef 1 16 0 0 1 + PROF + out = described_class.convert_alloc(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + # The address 0xdeadbeef should appear in string_table. + expect(bytes).to include('0xdeadbeef') + end + end + + describe '.convert_locks' do + it 'returns nil for empty input' do + File.write(File.join(@profile_dir, 'locks.txt'), "# header only\n") + expect(described_class.convert_locks(@profile_dir, nil)).to be_nil + end + + it 'sums read+write contention into the contentions column' do + File.write(File.join(@profile_dir, 'locks.txt'), <<~PROF) + # addr acq cont total_wait max_wait total_hold max_hold r_acq r_cont r_wait r_max + 0x500 100 5 50000 1000 200000 5000 50 2 10000 500 + PROF + out = described_class.convert_locks(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + sample = ProtoTestDecoder.parse(decoded[2].first) + values = ProtoTestDecoder.parse_packed_varints(sample[2].first) + # contentions, delay, hold, acquisitions + expect(values).to eq([5 + 2, 50000 + 10000, 200000, 100 + 50]) + end + + it 'parses caller_trace = "-" as no trace (sync-callstacks off)' do + tab = "\t" + content = "# lock-profile v3\n" \ + "0x500#{tab}100#{tab}5#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}-\n" + File.write(File.join(@profile_dir, 'locks.txt'), content) + out = described_class.convert_locks(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + sample = ProtoTestDecoder.parse(decoded[2].first) + stack = ProtoTestDecoder.parse_packed_varints(sample[1].first) + expect(stack.length).to eq(1) # only the lock-pointer leaf + end + + it 'parses caller_trace as multi-frame stack (sync-callstacks on)' do + tab = "\t" + content = "# lock-profile v3\n" \ + "0x500#{tab}100#{tab}5#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0" \ + "#{tab}0xC1,0xC2,0xC3\n" + File.write(File.join(@profile_dir, 'locks.txt'), content) + out = described_class.convert_locks(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + sample = ProtoTestDecoder.parse(decoded[2].first) + stack = ProtoTestDecoder.parse_packed_varints(sample[1].first) + # leaf (lock 0x500) + 3 caller frames + expect(stack.length).to eq(4) + end + + it 'emits a separate sample per (lock, caller-trace) row' do + tab = "\t" + content = "# lock-profile v3\n" \ + "0x500#{tab}40#{tab}2#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0xA\n" \ + "0x500#{tab}10#{tab}1#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0xB\n" + File.write(File.join(@profile_dir, 'locks.txt'), content) + out = described_class.convert_locks(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + expect(decoded[2].length).to eq(2) # one Sample per (lock,caller) row + end + end + + describe '.convert_mvcc' do + it 'computes cow_bytes as struct_size * (commits + retries)' do + File.write(File.join(@profile_dir, 'mvcc.txt'), <<~PROF) + 0x600 128 5000 1000 200 10 + PROF + out = described_class.convert_mvcc(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + sample = ProtoTestDecoder.parse(decoded[2].first) + values = ProtoTestDecoder.parse_packed_varints(sample[2].first) + # reads, commits, retries, cow_bytes + expect(values).to eq([5000, 1000, 200, 128 * (1000 + 200)]) + end + + it 'tolerates the optional 7th `multi_commits` column' do + File.write(File.join(@profile_dir, 'mvcc.txt'), <<~PROF) + 0x700 64 100 50 5 0 3 + PROF + out = described_class.convert_mvcc(@profile_dir, nil) + expect(out).to eq(File.join(@profile_dir, 'mvcc.pb.gz')) + end + + it 'parses caller_trace as multi-frame stack (sync-callstacks on)' do + tab = "\t" + content = "# mvcc-profile v2\n" \ + "0x700#{tab}64#{tab}1000#{tab}50#{tab}5#{tab}0#{tab}3#{tab}0xC1,0xC2,0xC3\n" + File.write(File.join(@profile_dir, 'mvcc.txt'), content) + out = described_class.convert_mvcc(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + sample = ProtoTestDecoder.parse(decoded[2].first) + stack = ProtoTestDecoder.parse_packed_varints(sample[1].first) + # leaf (cell 0x700) + 3 caller frames + expect(stack.length).to eq(4) + end + end + + describe '.convert_channels' do + it 'returns nil for an empty channels.txt' do + File.write(File.join(@profile_dir, 'channels.txt'), <<~PROF) + # channel-profile v1 + # id pushes pops push_blocked pop_blocked max_depth capacity + PROF + expect(described_class.convert_channels(@profile_dir, nil)).to be_nil + end + + it 'emits one Sample per registered channel' do + tab = "\t" + File.write(File.join(@profile_dir, 'channels.txt'), <<~PROF) + # channel-profile v1 + 0#{tab}5#{tab}5#{tab}0#{tab}2#{tab}5#{tab}8 + 1#{tab}9#{tab}9#{tab}0#{tab}3#{tab}9#{tab}16 + PROF + out = described_class.convert_channels(@profile_dir, nil) + expect(out).to eq(File.join(@profile_dir, 'channels.pb.gz')) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + decoded = ProtoTestDecoder.parse(bytes) + expect(decoded[1].length).to eq(5) # 5 sample-type columns + expect(decoded[2].length).to eq(2) # 2 samples + expect(bytes).to include('channel#0') + expect(bytes).to include('channel#1') + end + + it 'tags each sample with the channel capacity' do + tab = "\t" + File.write(File.join(@profile_dir, 'channels.txt'), + "# channel-profile v1\n0#{tab}5#{tab}5#{tab}0#{tab}2#{tab}5#{tab}8\n") + out = described_class.convert_channels(@profile_dir, nil) + bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read + expect(bytes).to include('capacity') + end + end + + describe '.convert_perf' do + it 'returns nil when perf_to_profile is unavailable or perf.data is missing' do + # No perf.data => nil regardless of perf_to_profile presence. + expect(described_class.convert_perf(@profile_dir, nil)).to be_nil + end + end + + describe '.convert_all' do + it 'returns the set of files actually emitted' do + File.write(File.join(@profile_dir, 'alloc.txt'), "0x100 1 16 0 0 1\n") + File.write(File.join(@profile_dir, 'mvcc.txt'), "0x200 64 10 5 1 0\n") + out = described_class.convert_all(@profile_dir) + expect(out.keys).to contain_exactly(:heap, :mvcc) + expect(out[:heap]).to eq(File.join(@profile_dir, 'heap.pb.gz')) + expect(out[:mvcc]).to eq(File.join(@profile_dir, 'mvcc.pb.gz')) + end + + it 'returns an empty hash for a directory with no profile files' do + expect(described_class.convert_all(@profile_dir)).to eq({}) + end + + it 'returns an empty hash for a missing directory' do + expect(described_class.convert_all('/nonexistent')).to eq({}) + end + end +end diff --git a/src/tools/completions.rb b/src/tools/completions.rb new file mode 100644 index 000000000..ae3bcd260 --- /dev/null +++ b/src/tools/completions.rb @@ -0,0 +1,153 @@ +# Shell-completion script generators for the `clear` CLI. +# +# Usage from the user side: +# clear completions bash >> ~/.bashrc +# clear completions zsh > ~/.zsh/completions/_clear # then `compinit` +# clear completions fish > ~/.config/fish/completions/clear.fish +# +# Per-subcommand completion semantics: +# build / run / fmt / fix / profile / explain -> *.cht files (+ dirs) +# test -> *.cht files OR directories +# doctor -> *.profile/ directories +# completions -> bash | zsh | fish +module Completions + SUBCOMMANDS = { + 'build' => 'Build a .cht file to a native binary', + 'run' => 'Build and run a .cht file', + 'test' => 'Run tests in a .cht file or directory', + 'benchmark' => 'Run a CLEAR benchmark', + 'profile' => 'Build with profiling and run', + 'doctor' => 'Analyze a .profile/ directory', + 'fix' => 'Apply lint fixes to .cht files', + 'fmt' => 'Format .cht files', + 'format' => 'Alias for fmt', + 'explain' => 'Explain a language feature', + 'completions' => 'Print shell completions', + 'help' => 'Show help', + }.freeze + + module_function + + def script_for(shell) + case shell + when 'bash' then bash + when 'zsh' then zsh + when 'fish' then fish + else + raise ArgumentError, "unsupported shell: #{shell.inspect} (expected bash, zsh, or fish)" + end + end + + # Bash uses `complete -F` with a function. We hand-roll the + # subcommand dispatch so file completion is filtered to `.cht` + # for build-like commands and to `.profile/` for `doctor`. + def bash + <<~BASH + # bash completion for `clear` + # Source from ~/.bashrc: source <(clear completions bash) + _clear_complete() { + local cur prev cmd + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + cmd="${COMP_WORDS[1]}" + + if [ "$COMP_CWORD" -eq 1 ]; then + COMPREPLY=( $(compgen -W "#{SUBCOMMANDS.keys.join(' ')}" -- "$cur") ) + return + fi + + case "$cmd" in + build|run|fmt|format|fix|profile|explain) + COMPREPLY=( $(compgen -f -X '!*.cht' -- "$cur") $(compgen -d -- "$cur") ) + compopt -o filenames 2>/dev/null + ;; + test|benchmark) + COMPREPLY=( $(compgen -f -X '!*.cht' -- "$cur") $(compgen -d -- "$cur") ) + compopt -o filenames 2>/dev/null + ;; + doctor) + # Prefer *.profile/ directories. Falls back to all dirs so + # the user can drill into a parent. + COMPREPLY=( $(compgen -d -- "$cur") ) + local filtered=() + local d + for d in "${COMPREPLY[@]}"; do + [[ "$d" == *.profile || -d "$d/.." ]] && filtered+=("$d") + done + COMPREPLY=("${filtered[@]}") + compopt -o filenames 2>/dev/null + ;; + completions) + COMPREPLY=( $(compgen -W "bash zsh fish" -- "$cur") ) + ;; + esac + } + complete -F _clear_complete clear + BASH + end + + # Zsh's completion system is richer: `_describe` shows subcommand + # descriptions inline, `_files -g GLOB` filters by glob. + def zsh + desc_lines = SUBCOMMANDS.map { |k, v| " '#{k}:#{v}'" }.join("\n") + <<~ZSH + #compdef clear + # zsh completion for `clear` + # Install: clear completions zsh > ~/.zsh/completions/_clear + # # ensure ~/.zsh/completions is in $fpath BEFORE compinit + _clear() { + local -a subcmds + subcmds=( + #{desc_lines} + ) + + if (( CURRENT == 2 )); then + _describe 'subcommand' subcmds + return + fi + + case "${words[2]}" in + build|run|fmt|format|fix|profile|explain|test|benchmark) + _files -g '*.cht' + ;; + doctor) + _files -/ -g '*.profile' + ;; + completions) + _values 'shell' bash zsh fish + ;; + esac + } + _clear "$@" + ZSH + end + + # Fish completions are declarative: one `complete` call per arm. + def fish + sub_complete = SUBCOMMANDS.map do |k, v| + desc = v.gsub("'", "\\\\'") + "complete -c clear -n '__fish_use_subcommand' -a #{k} -d '#{desc}'" + end.join("\n") + + <<~FISH + # fish completion for `clear` + # Install: clear completions fish > ~/.config/fish/completions/clear.fish + + # Top-level subcommand list + #{sub_complete} + + # File arguments per subcommand + complete -c clear -n '__fish_seen_subcommand_from build run fmt format fix profile explain test benchmark' \\ + -F -k -a "(__fish_complete_path '*.cht')" + + # `doctor` wants *.profile/ directories + complete -c clear -n '__fish_seen_subcommand_from doctor' \\ + -F -k -a "(__fish_complete_directories '' '*.profile')" + + # `completions` shell name + complete -c clear -n '__fish_seen_subcommand_from completions' \\ + -a 'bash zsh fish' + FISH + end +end diff --git a/src/tools/doctor.rb b/src/tools/doctor.rb index f907f274e..98845b5ad 100644 --- a/src/tools/doctor.rb +++ b/src/tools/doctor.rb @@ -1,4 +1,5 @@ require 'set' +require 'stringio' # Analyzes a `clear profile` run. Reads heap/CPU/syscall/HW-counter/lock/MVCC # data from the .profile/ directory and prints actionable advice. @@ -26,12 +27,22 @@ def saturation_warning(file) line&.sub(/\A# /, '')&.rstrip end - def run(profile_dir) + def run(profile_dir, cumulative: false, focus: nil, ignore: nil, peek: nil, diff: nil, by: :bytes) + if diff + return run_diff(diff, profile_dir, focus: focus) + end + unless profile_dir && Dir.exist?(profile_dir) - $stderr.puts "\e[31merror:\e[0m Usage: clear doctor " + $stderr.puts "\e[31merror:\e[0m Usage: clear doctor [--cumulative] [--focus=REGEX] [--ignore=REGEX] [--peek=REGEX] [--by=bytes|allocs|inuse_bytes|inuse_allocs] [--diff ]" exit 1 end + @opts = { cumulative: cumulative, focus: focus, ignore: ignore, peek: peek, by: by } + + if peek + return run_peek(profile_dir, peek) + end + perf_data = File.join(profile_dir, 'perf.data') binary = profile_dir.chomp('/').sub(/\.profile$/, '') binary = nil unless File.exist?(binary.to_s) @@ -48,6 +59,56 @@ def run(profile_dir) section_freeze(profile_dir, sites, resolved, llc_miss_rate) end + # Returns true if a sample's trace (function names, leaf-first) + # passes the focus/ignore filters. focus keeps only matches; ignore + # drops matches; both can compose. With no filters set, every + # sample passes. + def focus_match?(funcs) + return false if @opts && @opts[:ignore] && funcs.any? { |f| f =~ @opts[:ignore] } + return true unless @opts && @opts[:focus] + funcs.any? { |f| f =~ @opts[:focus] } + end + + def cumulative? + !!(@opts && @opts[:cumulative]) + end + + # Which Site field to sort/display by. Maps the user's --by flag + # (`bytes` / `allocs` / `inuse_bytes` / `inuse_allocs`) to the + # corresponding key in the parsed Site hash. Default `:bytes`. + def sort_key + return :bytes unless @opts && @opts[:by] + case @opts[:by] + when :allocs then :allocs + when :inuse_bytes then :inuse_bytes + when :inuse_allocs then :inuse_allocs + else :bytes + end + end + + # Human-readable label for the sort axis (used in section headers). + def sort_label + case sort_key + when :allocs then "allocations" + when :inuse_bytes then "in-use bytes (alloc - free)" + when :inuse_allocs then "in-use allocations (alloc - free)" + else "bytes" + end + end + + # Format the sort metric as a string for the per-row display. + # Bytes get KB/MB pretty-printing; counts get thousand-separators. + def fmt_sort_value(s) + v = s[sort_key] + case sort_key + when :allocs, :inuse_allocs + "%s allocs" % v.to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,') + else + kb = v / 1024.0 + kb >= 1.0 ? ("%.1f KB" % kb) : ("%d B" % v) + end + end + # ── Heap Profile ── def section_heap(profile_dir, binary) alloc_file = File.join(profile_dir, 'alloc.txt') @@ -60,18 +121,38 @@ def section_heap(profile_dir, binary) total_line = File.readlines(alloc_file).find { |l| l.include?('total_allocs') } total = total_line ? total_line.scan(/\d+/).last.to_i : 0 + # alloc-profile v2: the first column may be a comma-separated + # leaf-first stack trace. We treat the leaf address as the + # primary site (matching v1 behaviour); the deeper frames are + # available in `:trace` for richer views below if a future + # caller wants them. v1 files (single addr, no comma) still parse + # because `split(',')` on a no-comma string returns a 1-element + # array. sites = lines.map do |line| parts = line.strip.split next nil if parts.size < 6 - addr = parts[0] - { addr: addr, allocs: parts[1].to_i, bytes: parts[2].to_i, - frees: parts[3].to_i, free_bytes: parts[4].to_i, live: parts[5].to_i } - end.compact.sort_by { |s| -s[:bytes] } + trace = parts[0].split(',') + allocs = parts[1].to_i + frees = parts[3].to_i + bytes = parts[2].to_i + free_bytes = parts[4].to_i + { addr: trace.first, trace: trace, + allocs: allocs, bytes: bytes, + frees: frees, free_bytes: free_bytes, + live: parts[5].to_i, + # Inuse columns: live counts derived from alloc - free. + # Doctor uses these when --by=inuse_bytes / --by=inuse_allocs. + inuse_allocs: [allocs - frees, 0].max, + inuse_bytes: [bytes - free_bytes, 0].max } + end.compact.sort_by { |s| -s[sort_key] } resolved = nil if binary resolved = {} - addrs = sites.map { |s| s[:addr] } + # Resolve every address that appears in any trace, not just + # the leaf, so callers can render full traces. uniq keeps the + # addr2line invocation small on hot leaves. + addrs = sites.flat_map { |s| s[:trace] }.uniq raw = IO.popen(['addr2line', '-e', binary, '-f'] + addrs, err: '/dev/null', &:read) lines_out = raw.split("\n") zig_source_path = File.join(profile_dir, 'transpiled.zig') @@ -80,7 +161,17 @@ def section_heap(profile_dir, binary) func = lines_out[i * 2]&.strip || "?" file = lines_out[i * 2 + 1]&.strip || "?" clear_line = nil - if file =~ /:(\d+)/ && File.exist?(zig_source_path) + # Only walk transpiled.zig for CLR markers if the address + # actually came from the user's transpiled CLEAR file. Runtime + # and stdlib zig files have no CLR markers; treating them as + # user code led to runtime functions appearing at random + # source.cht lines. addr2line emits the build-time filename + # `._clear_tmp_.zig` (the compile target), which is a + # content copy of profile_dir/transpiled.zig. + is_user_zig = File.exist?(zig_source_path) && + File.basename(file.sub(/:\d+\b.*\z/, '')) + .match?(/\A\._clear_tmp_.*\.zig\z/) + if is_user_zig && file =~ /:(\d+)\b/ zig_line = $1.to_i zig_lines_cache ||= File.readlines(zig_source_path) (zig_line - 1).downto(0) do |li| @@ -92,7 +183,38 @@ def section_heap(profile_dir, binary) end end func = func.sub(/.*\./, '').sub(/__anon_\d+/, '') - resolved[addr] = { func: func, file: file, clear_line: clear_line } + resolved[addr] = { func: func, file: file, clear_line: clear_line, is_user_zig: is_user_zig } + end + end + + # Cumulative bytes per leaf function: for every sample, every + # frame in its trace (resolved to a function name) gets credit + # for the sample's bytes/allocs. A leaf alone gets only flat + # bytes; a function high in the stack accumulates its callees' + # bytes too. This answers "who is on the call path of the most + # bytes" not "who directly allocated the most bytes." + cum_bytes = Hash.new(0) + cum_allocs = Hash.new(0) + sites.each do |s| + seen = {} # one credit per (function, sample) so recursion doesn't double-count + s[:trace].each do |a| + f = resolved&.dig(a)&.dig(:func) || a + next if seen[f] + seen[f] = true + cum_bytes[f] += s[:bytes] + cum_allocs[f] += s[:allocs] + end + end + + # --focus / --ignore filter which sites contribute to the top-10 + # list. focus keeps a site if any function in its trace matches + # the regex; ignore drops one if any function matches. Both can + # compose; ignore wins on overlap (a site that would pass focus + # is dropped if it also matches ignore). + if @opts && (@opts[:focus] || @opts[:ignore]) + sites = sites.select do |s| + funcs = s[:trace].map { |a| resolved&.dig(a)&.dig(:func) || a } + focus_match?(funcs) end end @@ -102,30 +224,56 @@ def section_heap(profile_dir, binary) puts " *** #{warn}" puts "" end - puts "Top sites by bytes:" - sites.first(10).each_with_index do |s, i| - r = resolved&.dig(s[:addr]) - func = r ? r[:func] : s[:addr] - loc = if r&.dig(:clear_line) then "(line #{r[:clear_line]})" - elsif r then "(#{r[:file].split('/').last})" - else "" end - avg = s[:allocs] > 0 ? s[:bytes] / s[:allocs] : 0 - kb = s[:bytes] / 1024.0 - leak = if s[:frees] == 0 && s[:allocs] > 0 && avg >= 16 - " (heap rc)" - elsif s[:frees] == 0 && s[:allocs] > 0 - " (arena)" - elsif s[:frees] > 0 && s[:frees] < s[:allocs] - " ** LEAK ** (#{s[:allocs] - s[:frees]} unfreed)" - else - "" - end - if kb >= 1.0 - puts " #{i + 1}. %-25s %6.1f KB (%s allocs, %d bytes avg)%s" % [ - "#{func} #{loc}", kb, s[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,'), avg, leak] - else - puts " #{i + 1}. %-25s %6d B (%s allocs, %d bytes avg)%s" % [ - "#{func} #{loc}", s[:bytes], s[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,'), avg, leak] + if @opts && @opts[:focus] + puts "Focus: /#{@opts[:focus].source}/" + end + if @opts && @opts[:ignore] + puts "Ignore: /#{@opts[:ignore].source}/" + end + puts "" if @opts && (@opts[:focus] || @opts[:ignore]) + label = cumulative? ? "Top functions by cumulative #{sort_label}:" : "Top sites by #{sort_label}:" + puts label + + if cumulative? + ranked = cum_bytes.sort_by { |_f, b| -b }.first(10) + ranked.each_with_index do |(func, bytes), i| + kb = bytes / 1024.0 + ac = cum_allocs[func] + if kb >= 1.0 + puts " #{i + 1}. %-25s %6.1f KB cum (on %s alloc paths)" % + [func, kb, ac.to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,')] + else + puts " #{i + 1}. %-25s %6d B cum (on %s alloc paths)" % + [func, bytes, ac.to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,')] + end + end + puts "" + puts " (\"cum\" = sum of bytes whose call path passes through this function)" + else + sites.first(10).each_with_index do |s, i| + r = resolved&.dig(s[:addr]) + func = r ? r[:func] : s[:addr] + loc = if r&.dig(:clear_line) then "(line #{r[:clear_line]})" + elsif r then "(#{r[:file].split('/').last})" + else "" end + avg = s[:allocs] > 0 ? s[:bytes] / s[:allocs] : 0 + kb = s[:bytes] / 1024.0 + leak = if s[:frees] == 0 && s[:allocs] > 0 && avg >= 16 + " (heap rc)" + elsif s[:frees] == 0 && s[:allocs] > 0 + " (arena)" + elsif s[:frees] > 0 && s[:frees] < s[:allocs] + " ** LEAK ** (#{s[:allocs] - s[:frees]} unfreed)" + else + "" + end + if kb >= 1.0 + puts " #{i + 1}. %-25s %6.1f KB (%s allocs, %d bytes avg)%s" % [ + "#{func} #{loc}", kb, s[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,'), avg, leak] + else + puts " #{i + 1}. %-25s %6d B (%s allocs, %d bytes avg)%s" % [ + "#{func} #{loc}", s[:bytes], s[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,'), avg, leak] + end end end arena_count = sites.first(10).count { |s| s[:frees] == 0 && s[:allocs] > 0 && (s[:allocs] > 0 ? s[:bytes] / s[:allocs] : 0) < 16 } @@ -495,20 +643,53 @@ def section_locks(profile_dir) return unless File.exist?(lock_prof) rows = File.readlines(lock_prof).reject { |l| l.start_with?('#') || l.strip.empty? } - locks = rows.map do |l| - f = l.split - # Columns: addr acquires contended total_wait max_wait total_hold max_hold - # read_acquires read_contended read_total_wait read_max_wait + # Columns: addr acquires contended total_wait max_wait total_hold max_hold + # read_acquires read_contended read_total_wait read_max_wait + # [caller_trace] <-- v3, optional. `-` for empty; comma- + # separated leaf-first when present. + # With --sync-callstacks on, multiple rows can share the same `addr` + # (one per (lock, caller-trace) pair). We aggregate by addr to + # preserve the "one row per mutex" view; the trace data is kept on + # the row for future caller-aware drilldowns. + rows_parsed = rows.map do |l| + f = l.split("\t").map(&:strip) + # tab-split for v3 (caller_trace can contain `,`). Older v1/v2 + # files use whitespace-only separators; fall back to whitespace + # split when the tab-split produced too few fields. + f = l.strip.split if f.size < 11 next nil if f.size < 11 + trace_field = f[11] + trace = (trace_field.nil? || trace_field == '-') ? [] : trace_field.split(',') { addr: f[0], acquires: f[1].to_i, contended: f[2].to_i, total_wait_ns: f[3].to_i, max_wait_ns: f[4].to_i, total_hold_ns: f[5].to_i, max_hold_ns: f[6].to_i, read_acquires: f[7].to_i, read_contended: f[8].to_i, read_total_wait_ns: f[9].to_i, read_max_wait_ns: f[10].to_i, + trace: trace, } - end.compact.reject { |c| c[:acquires] == 0 && c[:read_acquires] == 0 } - .sort_by { |c| -c[:total_hold_ns] } + end.compact + + # Aggregate (lock, trace) rows back to a single row per lock so + # the existing "Top locks" view keeps its meaning. Counters sum; + # max_* fields take the larger across rows. + locks = rows_parsed.group_by { |r| r[:addr] }.map do |addr, group| + { + addr: addr, + acquires: group.sum { |r| r[:acquires] }, + contended: group.sum { |r| r[:contended] }, + total_wait_ns: group.sum { |r| r[:total_wait_ns] }, + max_wait_ns: group.map { |r| r[:max_wait_ns] }.max, + total_hold_ns: group.sum { |r| r[:total_hold_ns] }, + max_hold_ns: group.map { |r| r[:max_hold_ns] }.max, + read_acquires: group.sum { |r| r[:read_acquires] }, + read_contended: group.sum { |r| r[:read_contended] }, + read_total_wait_ns: group.sum { |r| r[:read_total_wait_ns] }, + read_max_wait_ns: group.map { |r| r[:read_max_wait_ns] }.max, + traces: group.map { |r| r[:trace] }.reject(&:empty?), + } + end.reject { |c| c[:acquires] == 0 && c[:read_acquires] == 0 } + .sort_by { |c| -c[:total_hold_ns] } return if locks.empty? @@ -731,19 +912,41 @@ def section_mvcc(profile_dir) return unless File.exist?(mvcc_prof) rows = File.readlines(mvcc_prof).reject { |l| l.start_with?('#') || l.strip.empty? } - cells = rows.map do |l| - f = l.split + # Columns (mvcc-profile v2): addr struct_size reads commits retries + # update_failures multi_commits [caller_trace] + # caller_trace is `-` when --sync-callstacks is off, comma-separated + # leaf-first when on. Pre-v2 files don't have the column; we treat + # missing as empty. + rows_parsed = rows.map do |l| + f = l.split("\t").map(&:strip) + f = l.strip.split if f.size < 6 next nil if f.size < 6 + trace_field = f[7] + trace = (trace_field.nil? || trace_field == '-') ? [] : trace_field.split(',') { addr: f[0], struct_size: f[1].to_i, reads: f[2].to_i, commits: f[3].to_i, retries: f[4].to_i, update_failures: f[5].to_i, - # AtomicPtr M3.16: optional 7th column for multi-cell commits. - # Older mvcc.txt files (pre-M3.16) won't have it; treat as 0. multi_commits: (f[6] || 0).to_i, + trace: trace, } - end.compact.reject { |c| c[:reads] == 0 && c[:commits] == 0 } - .sort_by { |c| -(c[:reads] + c[:commits]) } + end.compact + + # Aggregate (cell, trace) rows back to one row per cell so the + # existing diagnoses (COW thrash, MVCC misuse) keep their meaning. + cells = rows_parsed.group_by { |r| r[:addr] }.map do |addr, group| + { + addr: addr, + struct_size: group.first[:struct_size], + reads: group.sum { |r| r[:reads] }, + commits: group.sum { |r| r[:commits] }, + retries: group.sum { |r| r[:retries] }, + update_failures: group.sum { |r| r[:update_failures] }, + multi_commits: group.sum { |r| r[:multi_commits] }, + traces: group.map { |r| r[:trace] }.reject(&:empty?), + } + end.reject { |c| c[:reads] == 0 && c[:commits] == 0 } + .sort_by { |c| -(c[:reads] + c[:commits]) } return if cells.empty? @@ -1098,4 +1301,381 @@ def section_freeze(profile_dir, sites, resolved, llc_miss_rate) puts " CLEAR: frozen = FREEZE expr" puts "" end + + # ── --peek: callers and callees of one function ── + # For sites whose leaf is `regex`, lists the unique callers (frames + # immediately above the leaf in each trace) and aggregates bytes + # per caller. For samples where `regex` matches a non-leaf frame, + # also lists what's directly below — the callees this function + # was on the path to. Mirrors `pprof -peek`'s shape. + def run_peek(profile_dir, regex) + unless profile_dir && Dir.exist?(profile_dir) + $stderr.puts "\e[31merror:\e[0m --peek requires a profile directory" + exit 1 + end + binary = profile_dir.chomp('/').sub(/\.profile$/, '') + binary = nil unless File.exist?(binary.to_s) + + sites, resolved = section_heap_silent(profile_dir, binary) + return if sites.nil? || sites.empty? + + callers = Hash.new { |h, k| h[k] = { bytes: 0, allocs: 0 } } # caller -> stats + callees = Hash.new { |h, k| h[k] = { bytes: 0, allocs: 0 } } # callee -> stats + self_total = { bytes: 0, allocs: 0 } + + sites.each do |s| + funcs = s[:trace].map { |a| resolved&.dig(a)&.dig(:func) || a } + idx = funcs.index { |f| f =~ regex } + next unless idx + + self_total[:bytes] += s[:bytes] + self_total[:allocs] += s[:allocs] + + # Caller: the frame above (deeper in the stack) — leaf-first + # ordering means the caller is the next index. Stops at trace + # root. + if (caller_func = funcs[idx + 1]) + callers[caller_func][:bytes] += s[:bytes] + callers[caller_func][:allocs] += s[:allocs] + else + callers[''][:bytes] += s[:bytes] + callers[''][:allocs] += s[:allocs] + end + + # Callee: only if the matched function is non-leaf (idx > 0). + # The callee is the previous index (closer to leaf). + if idx > 0 && (callee_func = funcs[idx - 1]) + callees[callee_func][:bytes] += s[:bytes] + callees[callee_func][:allocs] += s[:allocs] + end + end + + if self_total[:bytes].zero? + puts "No samples matched /#{regex.source}/." + return + end + + puts "=== Peek /#{regex.source}/ ===" + puts "" + puts " Self bytes: %s (across %s allocations)" % + [bytes_pretty(self_total[:bytes]), self_total[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,')] + puts "" + + puts " Callers (frames above /#{regex.source}/):" + callers.sort_by { |_f, v| -v[:bytes] }.first(10).each do |func, v| + puts " %-30s %12s (%d allocs)" % [func, bytes_pretty(v[:bytes]), v[:allocs]] + end + puts "" + + if callees.any? + puts " Callees (frames below /#{regex.source}/, when matched non-leaf):" + callees.sort_by { |_f, v| -v[:bytes] }.first(10).each do |func, v| + puts " %-30s %12s (%d allocs)" % [func, bytes_pretty(v[:bytes]), v[:allocs]] + end + puts "" + end + end + + # Same parsing as section_heap but without the printout — used by + # run_peek so we can build caller/callee tables from the parsed sites. + def section_heap_silent(profile_dir, binary) + out = StringIO.new + real, $stdout = $stdout, out + sites, resolved = section_heap(profile_dir, binary) + [sites, resolved] + ensure + $stdout = real + end + + # ── --diff: compare two profile runs ── + # Loads both profile dirs, aggregates by leaf function, and reports + # the largest deltas in alloc_space, lock contention, and MVCC + # commits. Doctor's per-site advice is layered on top: a site that + # newly trips "MVCC fit" gets called out, etc. Computes deltas + # ourselves rather than shelling to `pprof -base` so doctor stays + # self-contained and can attach commentary. + def run_diff(before_dir, after_dir, focus: nil) + before_dir = before_dir.to_s.chomp('/') + after_dir = after_dir.to_s.chomp('/') + unless Dir.exist?(before_dir) && Dir.exist?(after_dir) + $stderr.puts "\e[31merror:\e[0m diff requires two existing profile directories" + $stderr.puts " before: #{before_dir} (#{Dir.exist?(before_dir) ? 'ok' : 'MISSING'})" + $stderr.puts " after: #{after_dir} (#{Dir.exist?(after_dir) ? 'ok' : 'MISSING'})" + exit 1 + end + + @opts = { cumulative: false, focus: focus } + puts "=== Diff (before → after) ===" + puts " before: #{before_dir}" + puts " after: #{after_dir}" + puts " focus: /#{focus.source}/" if focus + puts "" + + diff_heap(before_dir, after_dir, focus) + diff_locks(before_dir, after_dir, focus) + diff_mvcc(before_dir, after_dir, focus) + end + + # Parse alloc.txt and aggregate by leaf-function name across all + # rows that resolve to that function. Returns + # { func_name => { allocs:, bytes:, frees:, free_bytes: } }. + # `binary` is the path to use for addr2line resolution; pass the + # after-profile's binary so before/after addresses resolve through + # the same symbol table (ASLR / rebuild can shift addresses but the + # function name should stay stable for the same source). + def parse_alloc_for_diff(profile_dir, binary) + alloc_file = File.join(profile_dir, 'alloc.txt') + return {} unless File.exist?(alloc_file) + + rows = File.readlines(alloc_file).reject { |l| l.start_with?('#') || l.strip.empty? } + leaves = [] + parsed = rows.map do |l| + f = l.strip.split + next nil if f.size < 6 + leaf = f[0].split(',').first + leaves << leaf + [leaf, { allocs: f[1].to_i, bytes: f[2].to_i, + frees: f[3].to_i, free_bytes: f[4].to_i }] + end.compact + + # Resolve leaf addrs to function names if a binary is available. + # Otherwise fall back to keying by raw address — distinct leaves + # still diff against each other; the table just shows hex addrs + # in place of function names. + addr_to_func = {} + if binary && File.exist?(binary) + raw = IO.popen(['addr2line', '-e', binary, '-f'] + leaves.uniq, err: '/dev/null', &:read) + out_lines = raw.split("\n") + leaves.uniq.each_with_index do |a, i| + func = (out_lines[i * 2] || '?').strip.sub(/.*\./, '').sub(/__anon_\d+/, '') + # addr2line returns "??" for addresses not in this binary; + # fall back to the raw addr. + func = a if func == '??' || func.empty? + addr_to_func[a] = func + end + end + + by_func = Hash.new { |h, k| h[k] = { allocs: 0, bytes: 0, frees: 0, free_bytes: 0 } } + parsed.each do |leaf, vals| + f = addr_to_func[leaf] || leaf + by_func[f][:allocs] += vals[:allocs] + by_func[f][:bytes] += vals[:bytes] + by_func[f][:frees] += vals[:frees] + by_func[f][:free_bytes] += vals[:free_bytes] + end + by_func + end + + def diff_heap(before_dir, after_dir, focus) + # Use the after-dir's binary for both lookups — that's the user's + # current build. Falls back to the before-dir's binary if the + # after-dir doesn't have one. + binary = locate_diff_binary(after_dir, before_dir) + before = parse_alloc_for_diff(before_dir, binary) + after = parse_alloc_for_diff(after_dir, binary) + return if before.empty? && after.empty? + + funcs = (before.keys + after.keys).uniq + funcs.select! { |f| f =~ focus } if focus + + deltas = funcs.map do |f| + b = before[f] || { bytes: 0, allocs: 0 } + a = after[f] || { bytes: 0, allocs: 0 } + { + func: f, + before_bytes: b[:bytes], after_bytes: a[:bytes], + delta_bytes: a[:bytes] - b[:bytes], + before_allocs: b[:allocs], after_allocs: a[:allocs], + delta_allocs: a[:allocs] - b[:allocs], + } + end.reject { |d| d[:delta_bytes].zero? && d[:delta_allocs].zero? } + .sort_by { |d| -d[:delta_bytes].abs } + + return if deltas.empty? + + puts "=== Heap Δ (top by |Δ alloc_space|) ===" + puts "" + printf " %-30s %12s %12s %12s\n", "function", "before", "after", "Δ" + deltas.first(10).each do |d| + arrow = d[:delta_bytes].positive? ? "↑" : "↓" + puts " %-30s %12s %12s %s %12s" % [ + d[:func], + bytes_pretty(d[:before_bytes]), + bytes_pretty(d[:after_bytes]), + arrow, + bytes_pretty(d[:delta_bytes].abs), + ] + end + new_funcs = deltas.select { |d| d[:before_bytes].zero? && d[:after_bytes].positive? } + if new_funcs.any? + puts "" + puts " New allocation sites (cold → hot):" + new_funcs.first(5).each { |d| puts " + #{d[:func]} (#{bytes_pretty(d[:after_bytes])})" } + end + gone_funcs = deltas.select { |d| d[:after_bytes].zero? && d[:before_bytes].positive? } + if gone_funcs.any? + puts "" + puts " Eliminated allocation sites:" + gone_funcs.first(5).each { |d| puts " - #{d[:func]} (was #{bytes_pretty(d[:before_bytes])})" } + end + puts "" + end + + def parse_locks_for_diff(profile_dir) + lock_prof = File.join(profile_dir, 'locks.txt') + return {} unless File.exist?(lock_prof) + rows = File.readlines(lock_prof).reject { |l| l.start_with?('#') || l.strip.empty? } + by_addr = Hash.new { |h, k| h[k] = { acquires: 0, contended: 0, + total_wait_ns: 0, total_hold_ns: 0, + read_acquires: 0, read_contended: 0, + read_total_wait_ns: 0 } } + rows.each do |l| + f = l.split("\t").map(&:strip) + f = l.strip.split if f.size < 11 + next if f.size < 11 + a = by_addr[f[0]] + a[:acquires] += f[1].to_i + a[:contended] += f[2].to_i + a[:total_wait_ns] += f[3].to_i + a[:total_hold_ns] += f[5].to_i + a[:read_acquires] += f[7].to_i + a[:read_contended] += f[8].to_i + a[:read_total_wait_ns] += f[9].to_i + end + by_addr + end + + def diff_locks(before_dir, after_dir, _focus) + before = parse_locks_for_diff(before_dir) + after = parse_locks_for_diff(after_dir) + return if before.empty? && after.empty? + + addrs = (before.keys + after.keys).uniq + deltas = addrs.map do |addr| + b = before[addr] || { contended: 0, read_contended: 0, total_wait_ns: 0, read_total_wait_ns: 0 } + a = after[addr] || { contended: 0, read_contended: 0, total_wait_ns: 0, read_total_wait_ns: 0 } + bcont = b[:contended] + b[:read_contended] + acont = a[:contended] + a[:read_contended] + bwait = b[:total_wait_ns] + b[:read_total_wait_ns] + await = a[:total_wait_ns] + a[:read_total_wait_ns] + { + addr: addr, + before_cont: bcont, after_cont: acont, delta_cont: acont - bcont, + before_wait: bwait, after_wait: await, delta_wait: await - bwait, + } + end.reject { |d| d[:delta_cont].zero? && d[:delta_wait].zero? } + .sort_by { |d| -d[:delta_wait].abs } + + return if deltas.empty? + + puts "=== Lock Δ (top by |Δ wait_ns|) ===" + puts "" + printf " %-14s %12s %12s %12s %s\n", "lock", "Δ contend", "Δ wait_ns", "before/after", "diagnosis" + deltas.first(10).each do |d| + diag = if d[:before_wait].zero? && d[:after_wait].positive? + "newly contended" + elsif d[:after_wait].zero? && d[:before_wait].positive? + "contention eliminated" + elsif d[:delta_wait].positive? + "regression" + else + "improved" + end + arrow = d[:delta_wait].positive? ? "↑" : "↓" + puts " %-14s %s %5d %s %10d %d/%d %s" % [ + d[:addr], + d[:delta_cont].positive? ? "↑" : "↓", + d[:delta_cont].abs, + arrow, + d[:delta_wait].abs, + d[:before_wait], d[:after_wait], diag, + ] + end + puts "" + end + + def parse_mvcc_for_diff(profile_dir) + mvcc_prof = File.join(profile_dir, 'mvcc.txt') + return {} unless File.exist?(mvcc_prof) + rows = File.readlines(mvcc_prof).reject { |l| l.start_with?('#') || l.strip.empty? } + by_addr = Hash.new { |h, k| h[k] = { reads: 0, commits: 0, retries: 0 } } + rows.each do |l| + f = l.split("\t").map(&:strip) + f = l.strip.split if f.size < 6 + next if f.size < 6 + a = by_addr[f[0]] + a[:reads] += f[2].to_i + a[:commits] += f[3].to_i + a[:retries] += f[4].to_i + end + by_addr + end + + def diff_mvcc(before_dir, after_dir, _focus) + before = parse_mvcc_for_diff(before_dir) + after = parse_mvcc_for_diff(after_dir) + return if before.empty? && after.empty? + + addrs = (before.keys + after.keys).uniq + deltas = addrs.map do |addr| + b = before[addr] || { reads: 0, commits: 0, retries: 0 } + a = after[addr] || { reads: 0, commits: 0, retries: 0 } + { + addr: addr, + delta_commits: a[:commits] - b[:commits], + delta_retries: a[:retries] - b[:retries], + before_retries: b[:retries], after_retries: a[:retries], + } + end.reject { |d| d[:delta_commits].zero? && d[:delta_retries].zero? } + .sort_by { |d| -d[:delta_retries].abs } + + return if deltas.empty? + + puts "=== MVCC Δ (top by |Δ retries|) ===" + puts "" + printf " %-14s %12s %12s %s\n", "cell", "Δ commits", "Δ retries", "diagnosis" + deltas.first(10).each do |d| + diag = if d[:before_retries].zero? && d[:after_retries].positive? + "new retry storm" + elsif d[:after_retries].zero? && d[:before_retries].positive? + "retries eliminated" + elsif d[:delta_retries].positive? + "more contention" + else + "less contention" + end + arrow_c = d[:delta_commits].positive? ? "↑" : "↓" + arrow_r = d[:delta_retries].positive? ? "↑" : "↓" + puts " %-14s %s %10d %s %10d %s" % [ + d[:addr], + arrow_c, d[:delta_commits].abs, + arrow_r, d[:delta_retries].abs, + diag, + ] + end + puts "" + end + + # Find the binary to use for addr2line resolution in --diff mode. + # Prefers the after-dir's binary (the user's current build); falls + # back to the before-dir's binary if the after-dir is missing one + # (e.g. binary was deleted after profiling). + def locate_diff_binary(after_dir, before_dir) + [after_dir, before_dir].each do |dir| + bin = dir.to_s.chomp('/').sub(/\.profile$/, '') + return bin if File.exist?(bin) + end + nil + end + + def bytes_pretty(n) + return "-" if n.zero? + if n.abs >= 1024 * 1024 + "%.1f MB" % (n / (1024.0 * 1024.0)) + elsif n.abs >= 1024 + "%.1f KB" % (n / 1024.0) + else + "#{n} B" + end + end end diff --git a/src/tools/pprof.rb b/src/tools/pprof.rb new file mode 100644 index 000000000..d17d583e4 --- /dev/null +++ b/src/tools/pprof.rb @@ -0,0 +1,278 @@ +require 'zlib' +require 'stringio' + +# Pure-Ruby pprof v3 profile encoder. +# +# Builds a `Profile` message in the schema defined by +# https://github.com/google/pprof/blob/main/proto/profile.proto and +# emits a gzipped protobuf (`.pb.gz`) suitable for the `pprof` tool +# (`pprof -http=:8080 binary file.pb.gz`). +# +# We hand-roll the wire format (~80 lines) instead of pulling the +# `google-protobuf` C-extension gem so that `clear profile`'s pprof +# emission stays a stdlib-only feature. +# +# Wire-format reference: +# tag = (field_number << 3) | wire_type +# wire_type 0 = varint +# wire_type 2 = length-delimited (length, then bytes) +module Pprof + module Wire + module_function + + def varint(n) + raise ArgumentError, "negative varint: #{n}" if n.negative? + out = String.new(encoding: 'ASCII-8BIT') + loop do + if n < 0x80 + out << n.chr + return out + end + out << ((n & 0x7F) | 0x80).chr + n >>= 7 + end + end + + def field_varint(field, n) + varint((field << 3) | 0) + varint(n) + end + + def field_bytes(field, bytes) + varint((field << 3) | 2) + varint(bytes.bytesize) + bytes.b + end + + def field_string(field, s) + field_bytes(field, s.to_s) + end + end + + # Builder for one Profile message. Strings, functions, and locations + # are deduplicated as added; samples are accumulated and serialized + # at `encode`. + class Profile + def initialize + @strings = { "" => 0 } # string_table[0] is required to be empty + @functions = {} # key: [name, system_name, filename] -> Function + @locations = [] + @samples = [] + @sample_types = [] + @mappings = [] # one entry per binary + @period_type = nil + @period = 0 + @time_nanos = (Time.now.to_f * 1e9).to_i + @duration_nanos = 0 + @default_sample_type_idx = nil + @next_func_id = 1 + @next_loc_id = 1 + @next_mapping_id = 1 + @primary_mapping_id = 0 + end + + attr_writer :time_nanos, :duration_nanos, :period + + def intern(s) + @strings[s.to_s] ||= @strings.size + end + + # Declare a sample value column. `type` is the metric (e.g. + # `"alloc_space"`); `unit` is the unit (`"bytes"`, `"count"`, + # `"nanoseconds"`). The column index matches the order of calls. + def add_sample_type(type, unit) + @sample_types << [intern(type), intern(unit)] + end + + # `period` and `period_type` describe the sampling rate (e.g. + # 1 sample per N events). Optional; pprof shows it in the header. + def set_period_type(type, unit, period) + @period_type = [intern(type), intern(unit)] + @period = period + end + + # Pick which sample-type column the pprof UI selects by default + # (`pprof -inuse_space file.pb.gz`). Optional. + def default_sample_type=(type) + @default_sample_type_idx = intern(type) + end + + # Register the binary this profile is about. pprof uses the Mapping + # to display the binary name in the header (silences "Main binary + # filename not available") and to set has_functions/has_filenames/ + # has_line_numbers flags so its UI knows symbolization is already + # done. memory_start/limit are 0 because we do not require pprof + # to load the binary itself — addr2line resolved everything at + # convert time. Returns the mapping id; the first call also sets + # the primary mapping that all Locations attach to by default. + def add_mapping(binary:, build_id: '') + mapping = { + id: @next_mapping_id, + memory_start: 0, + memory_limit: 0, + file_offset: 0, + filename_idx: intern(binary), + build_id_idx: intern(build_id), + has_functions: true, + has_filenames: true, + has_line_numbers: true, + } + @mappings << mapping + @next_mapping_id += 1 + @primary_mapping_id = mapping[:id] if @primary_mapping_id.zero? + mapping[:id] + end + + # Returns the function id (1-based). Identical (name, system_name, + # filename) tuples are deduplicated. + def add_function(name:, filename: "", system_name: nil, start_line: 0) + sys = system_name || name + key = [name, sys, filename] + f = @functions[key] + return f[:id] if f + f = { + id: @next_func_id, + name_idx: intern(name), + system_name_idx: intern(sys), + filename_idx: intern(filename), + start_line: start_line, + } + @functions[key] = f + @next_func_id += 1 + f[:id] + end + + # Returns the location id (1-based). One Line entry per call. + # Defaults to the primary Mapping (the binary this profile is for) + # so symbol metadata propagates without per-Location plumbing. + def add_location(function_id:, line: 0, address: 0) + id = @next_loc_id + @next_loc_id += 1 + @locations << { + id: id, + address: address, + function_id: function_id, + line_no: line, + mapping_id: @primary_mapping_id, + } + id + end + + # `location_ids` is a stack trace, leaf-first. `values` matches the + # order/length of `add_sample_type` calls. `labels` is a hash of + # string key -> string-or-int (str labels are interned; int labels + # render as numeric). + def add_sample(location_ids, values, labels = {}) + @samples << { location_ids: location_ids, values: values, labels: labels } + end + + def encode + buf = String.new(encoding: 'ASCII-8BIT') + + # field 1: sample_type (repeated ValueType) + @sample_types.each do |type_idx, unit_idx| + sub = Wire.field_varint(1, type_idx) + Wire.field_varint(2, unit_idx) + buf << Wire.field_bytes(1, sub) + end + + # field 2: sample (repeated Sample) + @samples.each { |s| buf << Wire.field_bytes(2, encode_sample(s)) } + + # field 3: mapping (repeated Mapping) + @mappings.each { |m| buf << Wire.field_bytes(3, encode_mapping(m)) } + + # field 4: location (repeated Location) + @locations.each { |loc| buf << Wire.field_bytes(4, encode_location(loc)) } + + # field 5: function (repeated Function) + @functions.each_value { |f| buf << Wire.field_bytes(5, encode_function(f)) } + + # field 6: string_table (repeated string) + @strings.keys.each { |s| buf << Wire.field_string(6, s) } + + # field 9, 10: time_nanos, duration_nanos + buf << Wire.field_varint(9, @time_nanos) if @time_nanos.positive? + buf << Wire.field_varint(10, @duration_nanos) if @duration_nanos.positive? + + # field 11: period_type (ValueType), 12: period + if @period_type + sub = Wire.field_varint(1, @period_type[0]) + Wire.field_varint(2, @period_type[1]) + buf << Wire.field_bytes(11, sub) + end + buf << Wire.field_varint(12, @period) if @period.positive? + + # field 14: default_sample_type + buf << Wire.field_varint(14, @default_sample_type_idx) if @default_sample_type_idx + + buf + end + + def encode_gzip + io = StringIO.new(String.new(encoding: 'ASCII-8BIT')) + io.set_encoding('ASCII-8BIT') + Zlib::GzipWriter.wrap(io) { |gz| gz.write(encode) } + io.string + end + + def write_gzip(path) + File.binwrite(path, encode_gzip) + end + + private + + def encode_sample(s) + buf = String.new(encoding: 'ASCII-8BIT') + # field 1: location_id (repeated uint64) — packed for compactness + if s[:location_ids].any? + packed = s[:location_ids].map { |id| Wire.varint(id) }.join + buf << Wire.field_bytes(1, packed) + end + # field 2: value (repeated int64) — packed + if s[:values].any? + packed = s[:values].map { |v| Wire.varint(v.to_i.negative? ? v.to_i + (1 << 64) : v.to_i) }.join + buf << Wire.field_bytes(2, packed) + end + # field 3: label (repeated Label) + s[:labels].each do |k, v| + lab = Wire.field_varint(1, intern(k.to_s)) + case v + when Integer + lab += Wire.field_varint(3, v) + else + lab += Wire.field_varint(2, intern(v.to_s)) + end + buf << Wire.field_bytes(3, lab) + end + buf + end + + def encode_location(loc) + sub = Wire.field_varint(1, loc[:id]) + sub += Wire.field_varint(2, loc[:mapping_id]) if loc[:mapping_id].to_i.positive? + sub += Wire.field_varint(3, loc[:address]) if loc[:address].to_i.positive? + line_buf = Wire.field_varint(1, loc[:function_id]) + + Wire.field_varint(2, loc[:line_no].to_i) + sub += Wire.field_bytes(4, line_buf) + sub + end + + def encode_mapping(m) + buf = Wire.field_varint(1, m[:id]) + buf += Wire.field_varint(2, m[:memory_start]) if m[:memory_start].positive? + buf += Wire.field_varint(3, m[:memory_limit]) if m[:memory_limit].positive? + buf += Wire.field_varint(4, m[:file_offset]) if m[:file_offset].positive? + buf += Wire.field_varint(5, m[:filename_idx]) + buf += Wire.field_varint(6, m[:build_id_idx]) if m[:build_id_idx].positive? + buf += Wire.field_varint(7, 1) if m[:has_functions] + buf += Wire.field_varint(8, 1) if m[:has_filenames] + buf += Wire.field_varint(9, 1) if m[:has_line_numbers] + buf += Wire.field_varint(10, 1) if m[:has_inline_frames] + buf + end + + def encode_function(f) + Wire.field_varint(1, f[:id]) + + Wire.field_varint(2, f[:name_idx]) + + Wire.field_varint(3, f[:system_name_idx]) + + Wire.field_varint(4, f[:filename_idx]) + + Wire.field_varint(5, f[:start_line]) + end + end +end diff --git a/src/tools/pprof_converter.rb b/src/tools/pprof_converter.rb new file mode 100644 index 000000000..5e0fa372d --- /dev/null +++ b/src/tools/pprof_converter.rb @@ -0,0 +1,418 @@ +require_relative 'pprof' + +# Converts the text-format profile data emitted by `clear profile` +# into pprof's gzipped protobuf format. Each runtime profile becomes +# one `.pb.gz` file in the same `.profile/` dir, viewable via: +# +# pprof -http=:8080 ./ /heap.pb.gz +# pprof -top -alloc_space /heap.pb.gz +# pprof -base before/heap.pb.gz after/heap.pb.gz +# +# The converters share an `addr2line` resolver so call-site addresses +# show up in pprof as proper function names + CLEAR source lines (via +# the `// CLR:N` markers the transpiler emits into transpiled.zig). +module PprofConverter + module_function + + # Run all available converters for a `.profile/` directory. Returns + # a hash of {name => path} for files actually written. Missing input + # files are silently skipped. + def convert_all(profile_dir) + return {} unless profile_dir && Dir.exist?(profile_dir) + + binary = profile_dir.chomp('/').sub(/\.profile$/, '') + binary = nil unless File.exist?(binary.to_s) + + out = {} + if (path = convert_alloc(profile_dir, binary)) + out[:heap] = path + end + if (path = convert_locks(profile_dir, binary)) + out[:lock] = path + end + if (path = convert_mvcc(profile_dir, binary)) + out[:mvcc] = path + end + if (path = convert_channels(profile_dir, binary)) + out[:channels] = path + end + if (path = convert_perf(profile_dir, binary)) + out[:cpu] = path + end + out + end + + # ── channels.txt → channels.pb.gz ───────────────────────────────── + # Channel telemetry has no call site — channels are identified by a + # registry id, not an address. We emit one Sample per registered + # channel, with synthetic Function names like `channel#7` so pprof's + # views show the per-channel breakdown with stable identifiers. + # Sample columns: pushes / pops / push_blocked / pop_blocked / + # max_depth (capacity travels as a label). + def convert_channels(profile_dir, binary) + src = File.join(profile_dir, 'channels.txt') + return nil unless File.exist?(src) + + rows = parse_tabbed_columns(src, 7).map do |f| + f = f.first(7) # ignore any future trailing columns + { + id: f[0].to_i, + pushes: f[1].to_i, pops: f[2].to_i, + push_blocked: f[3].to_i, pop_blocked: f[4].to_i, + max_depth: f[5].to_i, capacity: f[6].to_i, + } + end.reject { |c| c[:pushes].zero? && c[:pops].zero? } + return nil if rows.empty? + + pb = Pprof::Profile.new + pb.add_mapping(binary: binary.to_s) if binary + pb.add_sample_type('pushes', 'count') + pb.add_sample_type('pops', 'count') + pb.add_sample_type('push_blocked', 'count') + pb.add_sample_type('pop_blocked', 'count') + pb.add_sample_type('max_depth', 'count') + pb.set_period_type('operations', 'count', 1) + pb.default_sample_type = 'pushes' + + rows.each do |r| + name = "channel##{r[:id]}" + fid = pb.add_function(name: name, filename: '', system_name: name) + lid = pb.add_location(function_id: fid, line: 0, address: r[:id]) + pb.add_sample( + [lid], + [r[:pushes], r[:pops], r[:push_blocked], r[:pop_blocked], r[:max_depth]], + capacity: r[:capacity], id: r[:id] + ) + end + + out = File.join(profile_dir, 'channels.pb.gz') + pb.write_gzip(out) + out + end + + # ── alloc.txt → heap.pb.gz ──────────────────────────────────────── + # Emits four sample-type columns matching Go's heap profile so the + # standard pprof flags (-alloc_space / -alloc_objects / + # -inuse_space / -inuse_objects) all work out of the box. + # + # alloc_objects (count) = total allocations at this site + # alloc_space (bytes) = total bytes allocated at this site + # inuse_objects (count) = currently-live objects (allocs - frees) + # inuse_space (bytes) = currently-live bytes (bytes - free_bytes) + def convert_alloc(profile_dir, binary) + src = File.join(profile_dir, 'alloc.txt') + return nil unless File.exist?(src) + + # Each site's first column is now a comma-separated leaf-first + # stack trace (alloc-profile v2). v1 single-addr files still parse + # because `split(',')` on a no-comma string returns a one-element + # array. + sites = parse_columns(src, 6).map do |f| + addrs = f[0].split(',') + { + addrs: addrs, + allocs: f[1].to_i, + bytes: f[2].to_i, + frees: f[3].to_i, + free_bytes: f[4].to_i, + live: f[5].to_i, + } + end + return nil if sites.empty? + + all_addrs = sites.flat_map { |s| s[:addrs] }.uniq + resolved = resolve_addrs(all_addrs, binary, profile_dir) + + pb = Pprof::Profile.new + pb.add_mapping(binary: binary.to_s) if binary + pb.add_sample_type('alloc_objects', 'count') + pb.add_sample_type('alloc_space', 'bytes') + pb.add_sample_type('inuse_objects', 'count') + pb.add_sample_type('inuse_space', 'bytes') + pb.set_period_type('space', 'bytes', 1) + pb.default_sample_type = 'alloc_space' + + # One Location per unique address, reused across samples whose + # traces share that frame. Saves a lot of bytes on hot leaves + # like `entryWrapper` that appear in every trace. + location_ids = build_location_index(pb, all_addrs, resolved, profile_dir) + + sites.each do |s| + stack = s[:addrs].map { |a| location_ids[a] } + pb.add_sample( + stack, + [ + s[:allocs], + s[:bytes], + [s[:allocs] - s[:frees], 0].max, + [s[:bytes] - s[:free_bytes], 0].max, + ], + addr: s[:addrs].first + ) + end + + out = File.join(profile_dir, 'heap.pb.gz') + pb.write_gzip(out) + out + end + + # Pre-build one Location per unique address so multi-frame samples + # can share frames. Returns { hex_addr => location_id }. + def build_location_index(pb, addrs, resolved, profile_dir) + addrs.each_with_object({}) do |addr, idx| + r = resolved[addr] || {} + func = r[:func] || addr + file = r[:file] || '' + # User-zig frames get clear_line into source.cht; runtime/stdlib + # frames get the Zig line itself, with their own filename so + # pprof -list resolves to the right file. + line = r[:is_user_zig] ? (r[:clear_line] || 0) : extract_zig_line(file) + filename = clear_source_path(profile_dir, file, is_user_zig: r[:is_user_zig]) + fid = pb.add_function(name: func, filename: filename, system_name: r[:func] || func) + idx[addr] = pb.add_location(function_id: fid, line: line, address: parse_addr(addr)) + end + end + + # addr2line emits `path:line` for each address. Pull the trailing + # line number; returns 0 if absent. + def extract_zig_line(addr2line_file) + return 0 unless addr2line_file + addr2line_file =~ /:(\d+)\b/ ? Regexp.last_match(1).to_i : 0 + end + + # ── locks.txt → lock.pb.gz ──────────────────────────────────────── + # Mirrors Go's mutex profile shape (contention count + delay ns). + # We add hold-time columns since the runtime tracks both, and the + # split read/write columns so the pprof user can drill into either. + def convert_locks(profile_dir, binary) + src = File.join(profile_dir, 'locks.txt') + return nil unless File.exist?(src) + + # lock-profile v3 adds an optional 12th column for the caller + # trace ('-' = empty; comma-separated leaf-first when present). + # We tab-split because the trace field can contain commas; older + # whitespace-only files still parse via the fallback split. + locks = parse_tabbed_columns(src, 11).map do |f| + trace_field = f[11] + caller_trace = (trace_field.nil? || trace_field == '-') ? [] : trace_field.split(',') + { + addr: f[0], acquires: f[1].to_i, contended: f[2].to_i, + total_wait_ns: f[3].to_i, max_wait_ns: f[4].to_i, + total_hold_ns: f[5].to_i, max_hold_ns: f[6].to_i, + read_acquires: f[7].to_i, read_contended: f[8].to_i, + read_total_wait_ns: f[9].to_i, read_max_wait_ns: f[10].to_i, + caller_trace: caller_trace, + } + end.reject { |l| l[:acquires].zero? && l[:read_acquires].zero? } + return nil if locks.empty? + + # Each row already carries its own (lock, caller-trace) identity; + # we keep them as-is so pprof renders one Sample per row. The + # leaf of every trace is the lock pointer, then the caller frames. + all_addrs = locks.flat_map { |l| [l[:addr]] + l[:caller_trace] }.uniq + resolved = resolve_addrs(all_addrs, binary, profile_dir) + + pb = Pprof::Profile.new + pb.add_mapping(binary: binary.to_s) if binary + pb.add_sample_type('contentions', 'count') + pb.add_sample_type('delay', 'nanoseconds') + pb.add_sample_type('hold', 'nanoseconds') + pb.add_sample_type('acquisitions', 'count') + pb.set_period_type('contentions', 'count', 1) + pb.default_sample_type = 'delay' + + location_ids = build_location_index(pb, all_addrs, resolved, profile_dir) + + locks.each do |l| + stack = [location_ids[l[:addr]]] + + l[:caller_trace].map { |a| location_ids[a] }.compact + pb.add_sample( + stack, + [ + l[:contended] + l[:read_contended], + l[:total_wait_ns] + l[:read_total_wait_ns], + l[:total_hold_ns], + l[:acquires] + l[:read_acquires], + ], + kind: l[:read_acquires] > l[:acquires] ? 'read' : 'write', + addr: l[:addr] + ) + end + + out = File.join(profile_dir, 'lock.pb.gz') + pb.write_gzip(out) + out + end + + # ── mvcc.txt → mvcc.pb.gz ───────────────────────────────────────── + # MVCC cells track read/commit/retry counts and per-cell struct size. + # Reported columns let the pprof user spot COW-thrash (high commits + # x large struct), retry storms, and read-heavy cells worth keeping. + def convert_mvcc(profile_dir, binary) + src = File.join(profile_dir, 'mvcc.txt') + return nil unless File.exist?(src) + + # mvcc-profile v2 adds an optional 8th column for the caller + # trace; same shape as lock-profile v3. + cells = parse_tabbed_columns(src, 6).map do |f| + trace_field = f[7] + caller_trace = (trace_field.nil? || trace_field == '-') ? [] : trace_field.split(',') + { + addr: f[0], struct_size: f[1].to_i, reads: f[2].to_i, + commits: f[3].to_i, retries: f[4].to_i, + update_failures: f[5].to_i, + multi_commits: (f[6] || 0).to_i, + caller_trace: caller_trace, + } + end.reject { |c| c[:reads].zero? && c[:commits].zero? } + return nil if cells.empty? + + all_addrs = cells.flat_map { |c| [c[:addr]] + c[:caller_trace] }.uniq + resolved = resolve_addrs(all_addrs, binary, profile_dir) + + pb = Pprof::Profile.new + pb.add_mapping(binary: binary.to_s) if binary + pb.add_sample_type('reads', 'count') + pb.add_sample_type('commits', 'count') + pb.add_sample_type('retries', 'count') + pb.add_sample_type('cow_bytes', 'bytes') + pb.set_period_type('operations', 'count', 1) + pb.default_sample_type = 'commits' + + location_ids = build_location_index(pb, all_addrs, resolved, profile_dir) + + cells.each do |c| + stack = [location_ids[c[:addr]]] + + c[:caller_trace].map { |a| location_ids[a] }.compact + cow_bytes = c[:struct_size] * (c[:commits] + c[:retries]) + pb.add_sample( + stack, + [c[:reads], c[:commits], c[:retries], cow_bytes], + struct_size: c[:struct_size], + addr: c[:addr] + ) + end + + out = File.join(profile_dir, 'mvcc.pb.gz') + pb.write_gzip(out) + out + end + + # ── perf.data → cpu.pb.gz ───────────────────────────────────────── + # Defers to Google's `perf_to_profile` (the canonical converter, + # `go install github.com/google/perf_data_converter/src/cmd/perf_to_profile`). + # If the tool is not on PATH we leave perf.data in place and return + # nil; the caller surfaces a one-line install hint. + def convert_perf(profile_dir, _binary) + src = File.join(profile_dir, 'perf.data') + return nil unless File.exist?(src) + return nil unless system('which perf_to_profile > /dev/null 2>&1') + + out = File.join(profile_dir, 'cpu.pb.gz') + ok = system('perf_to_profile', '-i', src, '-o', out, %i[out err] => '/dev/null') + return nil unless ok && File.exist?(out) + out + end + + # ── helpers ──────────────────────────────────────────────────────── + + # Read whitespace-separated columns from a profile text file, + # skipping `#` comment lines and blank lines. Filters rows that are + # shorter than `min_cols` (header lines, partial dumps). + def parse_columns(path, min_cols) + File.readlines(path).each_with_object([]) do |line, acc| + next if line.start_with?('#') || line.strip.empty? + f = line.strip.split + acc << f if f.size >= min_cols + end + end + + # lock-profile and mvcc-profile use TAB separators so the optional + # caller_trace field can carry commas without a column-count + # ambiguity. Older single-tab-or-whitespace files still parse via + # the fallback. Filters rows shorter than `min_cols`. + def parse_tabbed_columns(path, min_cols) + File.readlines(path).each_with_object([]) do |line, acc| + next if line.start_with?('#') || line.strip.empty? + f = line.split("\t").map(&:strip) + f = line.strip.split if f.size < min_cols + acc << f if f.size >= min_cols + end + end + + # Resolve a batch of hex addresses to {func, file, clear_line}. + # Mirrors `Doctor.section_heap`'s resolver: addr2line gives us the + # Zig source line; we walk back to the nearest `// CLR:N` marker + # the transpiler emits to recover the user-facing CLEAR source line. + # + # Only assign `clear_line` when the resolved Zig file is the user's + # transpiled.zig — otherwise the address was in runtime/stdlib Zig + # code where CLR markers don't exist, and we'd misattribute by + # walking back through transpiled.zig anyway. (Repro: `pprof -list + # entryWrapper` showed it pointing at random source.cht lines.) + def resolve_addrs(addrs, binary, profile_dir) + return {} if addrs.empty? || binary.nil? + raw = IO.popen(['addr2line', '-e', binary, '-f'] + addrs, err: '/dev/null', &:read) + lines_out = raw.split("\n") + zig_path = File.join(profile_dir, 'transpiled.zig') + zig_lines = File.exist?(zig_path) ? File.readlines(zig_path) : nil + + out = {} + addrs.each_with_index do |addr, i| + func = lines_out[i * 2]&.strip || '?' + file = lines_out[i * 2 + 1]&.strip || '?' + func = func.sub(/.*\./, '').sub(/__anon_\d+/, '') + clear_line = nil + is_user_zig = zig_lines && file_is_transpiled_zig?(file, zig_path) + if is_user_zig && file =~ /:(\d+)\b/ + zig_line = $1.to_i + (zig_line - 1).downto(0) do |li| + break if li.negative? || li >= zig_lines.size + if zig_lines[li] =~ %r{// CLR:(\d+)} + clear_line = $1.to_i + break + end + end + end + out[addr] = { func: func, file: file, clear_line: clear_line, is_user_zig: !!is_user_zig } + end + out + end + + # True if addr2line's `:` came from the user's + # transpiled CLEAR file (where CLR markers exist), not from + # runtime/stdlib. The build target's actual compile-time filename + # is `._clear_tmp_.zig`, copied to `/ + # transpiled.zig` for inspection. addr2line emits the build-time + # path, so we match the basename pattern that signals user code. + # addr2line may append "(discriminator N)" or other metadata after + # the line number; strip that before extracting the basename. + def file_is_transpiled_zig?(addr2line_file, _zig_path) + return false unless addr2line_file + bare = addr2line_file.sub(/:\d+\b.*\z/, '') + File.basename(bare).match?(/\A\._clear_tmp_.*\.zig\z/) + end + + def parse_addr(s) + s.start_with?('0x') ? s.to_i(16) : s.to_i + end + + # Pick the source file pprof's `list` view should show for this + # function. User-code (transpiled.zig with CLR markers) -> the + # original .cht source. Runtime/stdlib code -> the addr2line file + # path (which is a real .zig file pprof can read). Without this + # split, runtime functions like `entryWrapper` rendered against + # arbitrary .cht line numbers, badly misleading the user. + def clear_source_path(profile_dir, addr2line_file, is_user_zig: nil) + if is_user_zig.nil? + cht = File.join(profile_dir, 'source.cht') + return cht if File.exist?(cht) + return addr2line_file.to_s.sub(/:\d+\z/, '') + end + if is_user_zig + cht = File.join(profile_dir, 'source.cht') + return cht if File.exist?(cht) + end + addr2line_file.to_s.sub(/:\d+\z/, '') + end +end diff --git a/zig/runtime/alloc-profile.zig b/zig/runtime/alloc-profile.zig index 42bca5c72..51109ffc2 100644 --- a/zig/runtime/alloc-profile.zig +++ b/zig/runtime/alloc-profile.zig @@ -1,12 +1,19 @@ // alloc-profile.zig -- Allocation site tracker for `clear profile`. // -// Records allocation counts and bytes per call site using @returnAddress(). -// Zero overhead when CLEAR_PROFILE is not set (comptime branch elimination). -// Fixed-size open-addressed hash table -- no allocations inside the profiler. +// Records allocation stack traces via std.debug.captureStackTrace, hashed +// across the full trace so distinct call paths to the same helper resolve +// as separate sites. Zero overhead when profiling is not built in +// (comptime branch elimination at the call sites). Fixed-size open- +// addressed hash table -- no allocations inside the profiler. +// +// Sampling: `CLEAR_PROFILE_SAMPLE=N` (set by `clear profile --sample=N`) +// records every Nth call. Default N=1 (no sampling). Sampled values are +// scaled by N at record time so pprof / doctor see estimated totals. const std = @import("std"); const compat = @import("../lib/compat.zig"); const SpinLock = @import("profile-lock.zig").SpinLock; +const profile_trace = @import("profile-trace.zig"); // Profile-table size; shared default with lock-profile / mvcc-profile. // `clear profile --profile-max=N` injects the override into the @@ -17,8 +24,21 @@ pub const MAX_SITES: usize = if (@hasDecl(@import("root"), "CLEAR_PROFILE_MAX_TA else 1024; +// Maximum frames captured per stack. 16 matches the default depth +// `perf record -g` walks on x86. 128 bytes per Site, vs the 8 we used +// to cost. With MAX_SITES=1024 the table grows from ~40 KB to ~172 KB +// — comfortably small relative to the Large (60 KB) profile-build +// fiber stacks. The trace buffer at the call site is `[16]usize` == +// 128 B on the recorder's stack; the FP walk reads but doesn't push +// new frames. Shared with lock/mvcc via profile_trace.MAX_FRAMES. +pub const MAX_FRAMES: u8 = profile_trace.MAX_FRAMES; + const Site = struct { - addr: usize = 0, + // The captured stack, leaf-first. addrs[0] is closest to the alloc + // site. addr_count == 0 means the slot is empty (a real captured + // trace always has at least the leaf frame). + addrs: [MAX_FRAMES]usize = [_]usize{0} ** MAX_FRAMES, + addr_count: u8 = 0, alloc_count: u64 = 0, alloc_bytes: u64 = 0, free_count: u64 = 0, @@ -27,6 +47,7 @@ const Site = struct { var sites: [MAX_SITES]Site = [_]Site{.{}} ** MAX_SITES; var total_allocs: u64 = 0; +var total_bytes: u64 = 0; var mu: SpinLock = .{}; // Counts findSlot() calls that hit the saturated table. Surfaced @@ -34,23 +55,67 @@ var mu: SpinLock = .{}; // user to bump --profile-max=N. See mvcc-profile.zig for rationale. var dropped_samples: u64 = 0; -fn findSlot(addr: usize) ?*Site { - const hash = addr *% 0x9E3779B97F4A7C15; // fibonacci hash +// Sample stride. Read once from CLEAR_PROFILE_SAMPLE on the first +// recordAlloc call (always under `mu`, so no atomics needed). 0 is +// the uninitialized sentinel; 1 means no sampling. +var sample_n: u64 = 0; +var sample_counter: u64 = 0; + +fn ensureSampleN() void { + if (sample_n != 0) return; + if (std.c.getenv("CLEAR_PROFILE_SAMPLE")) |env| { + const slc = std.mem.span(env); + const n = std.fmt.parseInt(u64, slc, 10) catch 1; + sample_n = if (n == 0) 1 else n; + } else { + sample_n = 1; + } +} + +// Hash a captured stack trace. fnv-1a-style mixing of the addresses +// keeps the cost ~constant in the trace length and gives reasonable +// dispersion for our open-addressed table. +fn hashTrace(addrs: []const usize) usize { + var h: usize = 0xcbf29ce484222325; + for (addrs) |a| { + h ^= a; + h *%= 0x9E3779B97F4A7C15; + } + return h; +} + +fn tracesEqual(slot: *const Site, addrs: []const usize) bool { + if (slot.addr_count != addrs.len) return false; + for (addrs, 0..) |a, i| { + if (slot.addrs[i] != a) return false; + } + return true; +} + +fn findSlot(addrs: []const usize) ?*Site { + if (addrs.len == 0) return null; + const hash = hashTrace(addrs); var idx = hash & (MAX_SITES - 1); var probes: usize = 0; while (probes < MAX_SITES) : (probes += 1) { - if (sites[idx].addr == addr) return &sites[idx]; - if (sites[idx].addr == 0) { - sites[idx].addr = addr; - return &sites[idx]; + const slot = &sites[idx]; + if (slot.addr_count == 0) { + // Empty: claim it. Copy up to MAX_FRAMES; a real captured + // trace has been clamped already, but guard anyway. + const n: u8 = @intCast(@min(addrs.len, MAX_FRAMES)); + for (0..n) |i| slot.addrs[i] = addrs[i]; + slot.addr_count = n; + return slot; } + if (tracesEqual(slot, addrs)) return slot; idx = (idx + 1) & (MAX_SITES - 1); } dropped_samples += 1; - return null; // table saturated -- bump CLEAR_PROFILE_MAX_TABLE_ENTRIES + return null; } -var total_bytes: u64 = 0; +// Stack capture is delegated to profile_trace.captureFromHere so the +// FP-unwind logic stays in one place across all three profile recorders. pub fn totalAllocs() u64 { return total_allocs; @@ -61,27 +126,52 @@ pub fn totalBytes() u64 { } pub fn recordAlloc(ret_addr: usize, size: usize) void { + var trace_buf: [MAX_FRAMES]usize = undefined; + const n = profile_trace.captureFromHere(ret_addr, &trace_buf); + mu.lock(); defer mu.unlock(); - if (findSlot(ret_addr)) |site| { - site.alloc_count += 1; - site.alloc_bytes += size; - } + + ensureSampleN(); total_allocs += 1; total_bytes += size; + + // Drop all but every sample_n-th sample. Counter is incremented + // unconditionally so even sampled-out allocs contribute to + // total_allocs / total_bytes (the header reflects the full run). + sample_counter +%= 1; + if (sample_n > 1 and (sample_counter % sample_n) != 0) return; + + if (n == 0) return; // FP walk failed -- keep totals, drop site + + if (findSlot(trace_buf[0..n])) |site| { + // Scale by sample_n so per-site totals approximate the real + // distribution at the cost of one multiply per recorded sample. + site.alloc_count += sample_n; + site.alloc_bytes += size * sample_n; + } } pub fn recordFree(ret_addr: usize, size: usize) void { + var trace_buf: [MAX_FRAMES]usize = undefined; + const n = profile_trace.captureFromHere(ret_addr, &trace_buf); + mu.lock(); defer mu.unlock(); - if (findSlot(ret_addr)) |site| { - site.free_count += 1; - site.free_bytes += size; + + ensureSampleN(); + sample_counter +%= 1; + if (sample_n > 1 and (sample_counter % sample_n) != 0) return; + + if (n == 0) return; + + if (findSlot(trace_buf[0..n])) |site| { + site.free_count += sample_n; + site.free_bytes += size * sample_n; } } pub fn dump() void { - // Read output path from env var const path_ptr = std.c.getenv("CLEAR_ALLOC_PROFILE") orelse return; const fd = compat.createFileTruncate(path_ptr) catch return; @@ -90,34 +180,50 @@ pub fn dump() void { mu.lock(); defer mu.unlock(); - var buf: [256]u8 = undefined; + var buf: [4096]u8 = undefined; - compat.writeAllFd(fd, "# alloc-profile v1\n") catch return; + compat.writeAllFd(fd, "# alloc-profile v2 (multi-frame, comma-separated leaf-first)\n") catch return; const hdr = std.fmt.bufPrint(&buf, "# total_allocs: {d}\n", .{total_allocs}) catch return; compat.writeAllFd(fd, hdr) catch return; + if (sample_n > 1) { + const sw = std.fmt.bufPrint(&buf, "# sample_n: {d}\n", .{sample_n}) catch return; + compat.writeAllFd(fd, sw) catch return; + } if (dropped_samples > 0) { const warn = std.fmt.bufPrint(&buf, "# WARNING: {d} samples dropped (cap={d}; rebuild with `clear profile --profile-max=N`)\n", .{ dropped_samples, MAX_SITES }) catch return; compat.writeAllFd(fd, warn) catch return; } - compat.writeAllFd(fd, "# addr alloc_count alloc_bytes free_count free_bytes live_bytes\n") catch return; + compat.writeAllFd(fd, "# addrs alloc_count alloc_bytes free_count free_bytes live_bytes\n") catch return; for (&sites) |*site| { - if (site.addr != 0 and site.alloc_count > 0) { - const live = if (site.alloc_bytes >= site.free_bytes) - site.alloc_bytes - site.free_bytes - else - 0; - const line = std.fmt.bufPrint(&buf, "0x{x} {d} {d} {d} {d} {d}\n", .{ - site.addr, - site.alloc_count, - site.alloc_bytes, - site.free_count, - site.free_bytes, - live, - }) catch continue; - compat.writeAllFd(fd, line) catch return; + if (site.addr_count == 0 or site.alloc_count == 0) continue; + const live = if (site.alloc_bytes >= site.free_bytes) + site.alloc_bytes - site.free_bytes + else + 0; + + // Build the comma-separated addrs field into the line buffer + // by appending with `bufPrint` and walking a write cursor. + // With 16 frames at "0x" + 16 hex + ",", worst case is + // 19*16 = 304 chars. 4 KB buffer covers that plus the + // count/bytes tail with margin. + var pos: usize = 0; + var i: u8 = 0; + while (i < site.addr_count) : (i += 1) { + const sep = if (i == 0) "" else ","; + const slc = std.fmt.bufPrint(buf[pos..], "{s}0x{x}", .{ sep, site.addrs[i] }) catch break; + pos += slc.len; } + const tail = std.fmt.bufPrint(buf[pos..], " {d} {d} {d} {d} {d}\n", .{ + site.alloc_count, + site.alloc_bytes, + site.free_count, + site.free_bytes, + live, + }) catch continue; + pos += tail.len; + compat.writeAllFd(fd, buf[0..pos]) catch return; } } diff --git a/zig/runtime/lock-profile.zig b/zig/runtime/lock-profile.zig index 10d0230dd..6aeef73ce 100644 --- a/zig/runtime/lock-profile.zig +++ b/zig/runtime/lock-profile.zig @@ -3,10 +3,14 @@ // Zero overhead when CLEAR_PROFILE == false; every call site in // parking-lot.zig is wrapped with `if (rt_profile.CLEAR_PROFILE)`. // -// Per-lock stats are keyed by mutex address (open-addressed hash like -// alloc-profile). Each lock gets its own entry in the hash; profile -// data is stored by value in the table so stats survive the mutex's -// lifetime — the registered entry is never revoked. +// Default keying: lock pointer (one row per mutex). With `clear profile +// --sync-callstacks`, each (lock, caller-trace) pair becomes its own row, +// so the dump shows which call site contended for which lock. The flag +// is off by default because the FP walk costs ~100-500ns per record; +// uncontended mutex acquire is ~10-20ns, so the trace can dominate. +// `--sample=N` composes (record every Nth event, scale by N at record +// time). The CLI auto-defaults --sample to 100 when --sync-callstacks +// is set unless the user passes their own value. // // Counters are plain u64 (not atomic): under concurrent recordAcquire // the counts can be slightly low under tearing. Acceptable for profile @@ -17,6 +21,7 @@ const std = @import("std"); const compat = @import("../lib/compat.zig"); const SpinLock = @import("profile-lock.zig").SpinLock; +const profile_trace = @import("profile-trace.zig"); // Profile-table size; shared default with alloc-profile / mvcc-profile. // `clear profile --profile-max=N` injects the override into the @@ -46,6 +51,12 @@ pub const LockStats = struct { read_total_wait_ns: u64 = 0, read_max_wait_ns: u64 = 0, read_contended_acquires: u64 = 0, + + // Caller stack captured at first record into this slot. Empty + // (caller_trace_count == 0) when --sync-callstacks is off, in + // which case the slot's identity is just `addr`. Leaf-first. + caller_trace: [profile_trace.MAX_FRAMES]usize = [_]usize{0} ** profile_trace.MAX_FRAMES, + caller_trace_count: u8 = 0, }; var stats: [MAX_LOCKS]LockStats = [_]LockStats{.{}} ** MAX_LOCKS; @@ -56,57 +67,133 @@ var mu: SpinLock = .{}; // user to bump --profile-max=N. See mvcc-profile.zig for rationale. var dropped_samples: u64 = 0; +// Sample stride snapshot. Read once on first record call (under `mu`, +// no atomic needed) and reused thereafter. 0 = uninitialized sentinel. +var sample_n: u64 = 0; +var sample_counter: u64 = 0; + +fn ensureSampleN() void { + if (sample_n != 0) return; + sample_n = profile_trace.sampleStride(); +} + pub inline fn now() u64 { return compat.nanoTimestamp(); } -fn findSlot(addr: usize) ?*LockStats { - // Fibonacci-style multiplicative hash — same recipe as alloc-profile. - const hash = addr *% 0x9E3779B97F4A7C15; +// Mix `addr` (lock pointer) with the optional caller trace into one +// hash. fnv-1a-style; same recipe as alloc-profile's hashTrace. +fn hashKey(addr: usize, trace: []const usize) usize { + var h: usize = addr *% 0x9E3779B97F4A7C15; + for (trace) |a| { + h ^= a; + h *%= 0x9E3779B97F4A7C15; + } + return h; +} + +fn keyMatches(slot: *const LockStats, addr: usize, trace: []const usize) bool { + if (slot.addr != addr) return false; + if (slot.caller_trace_count != trace.len) return false; + for (trace, 0..) |a, i| { + if (slot.caller_trace[i] != a) return false; + } + return true; +} + +fn findSlot(addr: usize, trace: []const usize) ?*LockStats { + const hash = hashKey(addr, trace); var idx: usize = @intCast(hash & (MAX_LOCKS - 1)); var probes: usize = 0; while (probes < MAX_LOCKS) : (probes += 1) { - if (stats[idx].addr == addr) return &stats[idx]; - if (stats[idx].addr == 0) { - stats[idx].addr = addr; - return &stats[idx]; + const slot = &stats[idx]; + if (slot.addr == 0 and slot.caller_trace_count == 0) { + // Empty slot: claim it. + slot.addr = addr; + const n: u8 = @intCast(@min(trace.len, profile_trace.MAX_FRAMES)); + for (0..n) |i| slot.caller_trace[i] = trace[i]; + slot.caller_trace_count = n; + return slot; } + if (keyMatches(slot, addr, trace)) return slot; idx = (idx + 1) & (MAX_LOCKS - 1); } dropped_samples += 1; - return null; // table saturated — bump CLEAR_PROFILE_MAX_TABLE_ENTRIES + return null; +} + +// Captures the caller trace if --sync-callstacks is on; otherwise +// returns an empty slice. The trace buffer is supplied by the caller +// so it lives on the recorder's stack frame. +fn captureCallerTrace( + ret_addr: usize, + buf: *[profile_trace.MAX_FRAMES]usize, +) []const usize { + if (!profile_trace.syncCallstacksEnabled()) return &.{}; + const n = profile_trace.captureFromHere(ret_addr, buf); + return buf[0..n]; +} + +// Sampling gate. `sample_counter` is incremented on every call so +// even sampled-out events count for the cadence; the next captured +// sample's values are scaled by `sample_n` so per-row totals match +// estimated reality. Returns true if this call should be recorded. +fn shouldSample() bool { + sample_counter +%= 1; + return sample_n <= 1 or (sample_counter % sample_n) == 0; } -pub inline fn recordAcquire(addr: usize, wait_ns: u64, contended: bool) void { +pub noinline fn recordAcquire(addr: usize, wait_ns: u64, contended: bool) void { + var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined; + const trace = captureCallerTrace(@returnAddress(), &trace_buf); + mu.lock(); defer mu.unlock(); - if (findSlot(addr)) |s| { - s.acquires += 1; - s.total_wait_ns += wait_ns; + + ensureSampleN(); + if (!shouldSample()) return; + + if (findSlot(addr, trace)) |s| { + s.acquires += sample_n; + s.total_wait_ns += wait_ns * sample_n; if (wait_ns > s.max_wait_ns) s.max_wait_ns = wait_ns; - if (contended) s.contended_acquires += 1; + if (contended) s.contended_acquires += sample_n; } } /// Read-side acquire on a ParkingRwLock. Same shape as recordAcquire /// but stored in the read counters so doctor can compute read/write /// split (read-heavy → recommend @shared:versioned). -pub inline fn recordReadAcquire(addr: usize, wait_ns: u64, contended: bool) void { +pub noinline fn recordReadAcquire(addr: usize, wait_ns: u64, contended: bool) void { + var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined; + const trace = captureCallerTrace(@returnAddress(), &trace_buf); + mu.lock(); defer mu.unlock(); - if (findSlot(addr)) |s| { - s.read_acquires += 1; - s.read_total_wait_ns += wait_ns; + + ensureSampleN(); + if (!shouldSample()) return; + + if (findSlot(addr, trace)) |s| { + s.read_acquires += sample_n; + s.read_total_wait_ns += wait_ns * sample_n; if (wait_ns > s.read_max_wait_ns) s.read_max_wait_ns = wait_ns; - if (contended) s.read_contended_acquires += 1; + if (contended) s.read_contended_acquires += sample_n; } } -pub inline fn recordRelease(addr: usize, hold_ns: u64) void { +pub noinline fn recordRelease(addr: usize, hold_ns: u64) void { + var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined; + const trace = captureCallerTrace(@returnAddress(), &trace_buf); + mu.lock(); defer mu.unlock(); - if (findSlot(addr)) |s| { - s.total_hold_ns += hold_ns; + + ensureSampleN(); + if (!shouldSample()) return; + + if (findSlot(addr, trace)) |s| { + s.total_hold_ns += hold_ns * sample_n; if (hold_ns > s.max_hold_ns) s.max_hold_ns = hold_ns; } } @@ -119,8 +206,12 @@ pub fn dumpToEnvFile() void { mu.lock(); defer mu.unlock(); - var buf: [512]u8 = undefined; - _ = compat.writeAllFd(fd, "# lock-profile v2\n") catch return; + var buf: [4096]u8 = undefined; + _ = compat.writeAllFd(fd, "# lock-profile v3\n") catch return; + if (sample_n > 1) { + const sw = std.fmt.bufPrint(&buf, "# sample_n: {d}\n", .{sample_n}) catch return; + _ = compat.writeAllFd(fd, sw) catch return; + } if (dropped_samples > 0) { const warn = std.fmt.bufPrint(&buf, "# WARNING: {d} samples dropped (cap={d}; rebuild with `clear profile --profile-max=N`)\n", @@ -129,14 +220,16 @@ pub fn dumpToEnvFile() void { } _ = compat.writeAllFd(fd, "# addr\tacquires\tcontended\ttotal_wait_ns\tmax_wait_ns\ttotal_hold_ns\tmax_hold_ns" ++ - "\tread_acquires\tread_contended\tread_total_wait_ns\tread_max_wait_ns\n") + "\tread_acquires\tread_contended\tread_total_wait_ns\tread_max_wait_ns\tcaller_trace\n") catch return; for (&stats) |*s| { - if (s.addr == 0) continue; + if (s.addr == 0 and s.caller_trace_count == 0) continue; if (s.acquires == 0 and s.read_acquires == 0) continue; - const line = std.fmt.bufPrint(&buf, - "0x{x}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\n", + + // Existing 11 columns first. + const head = std.fmt.bufPrint(&buf, + "0x{x}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t", .{ s.addr, s.acquires, s.contended_acquires, s.total_wait_ns, s.max_wait_ns, @@ -145,6 +238,23 @@ pub fn dumpToEnvFile() void { s.read_total_wait_ns, s.read_max_wait_ns, }) catch continue; - _ = compat.writeAllFd(fd, line) catch return; + _ = compat.writeAllFd(fd, head) catch return; + + // Caller trace (12th column). `-` when empty so column count + // is constant; comma-separated leaf-first when populated. + if (s.caller_trace_count == 0) { + _ = compat.writeAllFd(fd, "-\n") catch return; + } else { + var pos: usize = 0; + var i: u8 = 0; + while (i < s.caller_trace_count) : (i += 1) { + const sep = if (i == 0) "" else ","; + const slc = std.fmt.bufPrint(buf[pos..], "{s}0x{x}", .{ sep, s.caller_trace[i] }) catch break; + pos += slc.len; + } + const tail = std.fmt.bufPrint(buf[pos..], "\n", .{}) catch continue; + pos += tail.len; + _ = compat.writeAllFd(fd, buf[0..pos]) catch return; + } } } diff --git a/zig/runtime/mvcc-profile.zig b/zig/runtime/mvcc-profile.zig index b222c80c2..f7c3a79a8 100644 --- a/zig/runtime/mvcc-profile.zig +++ b/zig/runtime/mvcc-profile.zig @@ -1,10 +1,13 @@ // mvcc-profile.zig — `Versioned(T)` cell telemetry: reads, commits, // retries, COW byte volume. // -// Zero overhead when CLEAR_PROFILE == false. Per-cell stats are keyed -// by cell address (open-addressed hash like alloc-profile and -// lock-profile). Counters are plain u64; the same relaxed-tearing -// rationale as the sibling profile modules applies. +// Zero overhead when CLEAR_PROFILE == false. Default keying: cell pointer +// (one row per Versioned cell). With `clear profile --sync-callstacks`, +// each (cell, caller-trace) pair becomes its own row, so the dump shows +// which call site reads/commits which cell. The flag is off by default +// because the FP walk costs ~100-500ns per record; an MVCC commit fast +// path is ~20-50ns, so the trace can dominate. `--sample=N` composes +// (record every Nth event, scale by N at record time). // // Why MVCC needs its own profile (not lock-profile): MVCC has no // lock. The interesting numbers are different: @@ -23,6 +26,7 @@ const std = @import("std"); const compat = @import("../lib/compat.zig"); const SpinLock = @import("profile-lock.zig").SpinLock; +const profile_trace = @import("profile-trace.zig"); // Profile-table size, shared across alloc-profile / lock-profile / // mvcc-profile via a single root-level override knob. Default 1024 @@ -56,6 +60,12 @@ pub const CellStats = struct { // here: 0 for fast-path-success, N for // contended commits. update_failures: u64 = 0, // returned UpdateRetriesExhausted (gave up) + + // Caller stack captured at first record into this slot. Empty + // (caller_trace_count == 0) when --sync-callstacks is off, in + // which case the slot's identity is just `addr`. Leaf-first. + caller_trace: [profile_trace.MAX_FRAMES]usize = [_]usize{0} ** profile_trace.MAX_FRAMES, + caller_trace_count: u8 = 0, }; var stats: [MAX_CELLS]CellStats = [_]CellStats{.{}} ** MAX_CELLS; @@ -67,28 +77,80 @@ var mu: SpinLock = .{}; // Plain u64; same relaxed-tearing rationale as the per-cell counters. var dropped_samples: u64 = 0; -fn findSlot(addr: usize, struct_size: u32) ?*CellStats { - const hash = addr *% 0x9E3779B97F4A7C15; +// Sample stride snapshot. Read once on first record call (under `mu`, +// no atomic needed) and reused thereafter. 0 = uninitialized sentinel. +var sample_n: u64 = 0; +var sample_counter: u64 = 0; + +fn ensureSampleN() void { + if (sample_n != 0) return; + sample_n = profile_trace.sampleStride(); +} + +fn hashKey(addr: usize, trace: []const usize) usize { + var h: usize = addr *% 0x9E3779B97F4A7C15; + for (trace) |a| { + h ^= a; + h *%= 0x9E3779B97F4A7C15; + } + return h; +} + +fn keyMatches(slot: *const CellStats, addr: usize, trace: []const usize) bool { + if (slot.addr != addr) return false; + if (slot.caller_trace_count != trace.len) return false; + for (trace, 0..) |a, i| { + if (slot.caller_trace[i] != a) return false; + } + return true; +} + +fn findSlot(addr: usize, struct_size: u32, trace: []const usize) ?*CellStats { + const hash = hashKey(addr, trace); var idx: usize = @intCast(hash & (MAX_CELLS - 1)); var probes: usize = 0; while (probes < MAX_CELLS) : (probes += 1) { - if (stats[idx].addr == addr) return &stats[idx]; - if (stats[idx].addr == 0) { - stats[idx].addr = addr; - stats[idx].struct_size = struct_size; - return &stats[idx]; + const slot = &stats[idx]; + if (slot.addr == 0 and slot.caller_trace_count == 0) { + slot.addr = addr; + slot.struct_size = struct_size; + const n: u8 = @intCast(@min(trace.len, profile_trace.MAX_FRAMES)); + for (0..n) |i| slot.caller_trace[i] = trace[i]; + slot.caller_trace_count = n; + return slot; } + if (keyMatches(slot, addr, trace)) return slot; idx = (idx + 1) & (MAX_CELLS - 1); } dropped_samples += 1; - return null; // table saturated -- bump CLEAR_PROFILE_MAX_TABLE_ENTRIES + return null; } -pub inline fn recordRead(addr: usize, struct_size: u32) void { +fn captureCallerTrace( + ret_addr: usize, + buf: *[profile_trace.MAX_FRAMES]usize, +) []const usize { + if (!profile_trace.syncCallstacksEnabled()) return &.{}; + const n = profile_trace.captureFromHere(ret_addr, buf); + return buf[0..n]; +} + +fn shouldSample() bool { + sample_counter +%= 1; + return sample_n <= 1 or (sample_counter % sample_n) == 0; +} + +pub noinline fn recordRead(addr: usize, struct_size: u32) void { + var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined; + const trace = captureCallerTrace(@returnAddress(), &trace_buf); + mu.lock(); defer mu.unlock(); - if (findSlot(addr, struct_size)) |s| { - s.reads += 1; + ensureSampleN(); + if (!shouldSample()) return; + + if (findSlot(addr, struct_size, trace)) |s| { + s.reads += sample_n; } } @@ -96,15 +158,21 @@ pub inline fn recordRead(addr: usize, struct_size: u32) void { /// CAS failures inside the update before the eventual success /// (0 for fast-path commits). `committed` distinguishes a winning /// commit from a bailed-out UpdateRetriesExhausted. -pub inline fn recordUpdate(addr: usize, struct_size: u32, retries: u64, committed: bool) void { +pub noinline fn recordUpdate(addr: usize, struct_size: u32, retries: u64, committed: bool) void { + var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined; + const trace = captureCallerTrace(@returnAddress(), &trace_buf); + mu.lock(); defer mu.unlock(); - if (findSlot(addr, struct_size)) |s| { - s.retries += retries; + ensureSampleN(); + if (!shouldSample()) return; + + if (findSlot(addr, struct_size, trace)) |s| { + s.retries += retries * sample_n; if (committed) { - s.commits += 1; + s.commits += sample_n; } else { - s.update_failures += 1; + s.update_failures += sample_n; } } } @@ -116,11 +184,17 @@ pub inline fn recordUpdate(addr: usize, struct_size: u32, retries: u64, committe /// "upgrade @shared:versioned -> @indirect:atomic" suggestion /// (multi-cell commits forbid the upgrade because AtomicPtr has /// no multi-pointer CAS). -pub inline fn recordMultiCommit(addr: usize, struct_size: u32) void { +pub noinline fn recordMultiCommit(addr: usize, struct_size: u32) void { + var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined; + const trace = captureCallerTrace(@returnAddress(), &trace_buf); + mu.lock(); defer mu.unlock(); - if (findSlot(addr, struct_size)) |s| { - s.multi_commits += 1; + ensureSampleN(); + if (!shouldSample()) return; + + if (findSlot(addr, struct_size, trace)) |s| { + s.multi_commits += sample_n; } } @@ -132,8 +206,12 @@ pub fn dumpToEnvFile() void { mu.lock(); defer mu.unlock(); - var buf: [256]u8 = undefined; - _ = compat.writeAllFd(fd, "# mvcc-profile v1\n") catch return; + var buf: [4096]u8 = undefined; + _ = compat.writeAllFd(fd, "# mvcc-profile v2\n") catch return; + if (sample_n > 1) { + const sw = std.fmt.bufPrint(&buf, "# sample_n: {d}\n", .{sample_n}) catch return; + _ = compat.writeAllFd(fd, sw) catch return; + } if (dropped_samples > 0) { const warn = std.fmt.bufPrint(&buf, "# WARNING: {d} samples dropped (cap={d}; rebuild with `clear profile --profile-max=N`)\n", @@ -141,16 +219,32 @@ pub fn dumpToEnvFile() void { _ = compat.writeAllFd(fd, warn) catch return; } _ = compat.writeAllFd(fd, - "# addr\tstruct_size\treads\tcommits\tretries\tupdate_failures\tmulti_commits\n") + "# addr\tstruct_size\treads\tcommits\tretries\tupdate_failures\tmulti_commits\tcaller_trace\n") catch return; for (&stats) |*s| { - if (s.addr == 0) continue; + if (s.addr == 0 and s.caller_trace_count == 0) continue; if (s.reads == 0 and s.commits == 0 and s.update_failures == 0 and s.multi_commits == 0) continue; - const line = std.fmt.bufPrint(&buf, - "0x{x}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\n", + + const head = std.fmt.bufPrint(&buf, + "0x{x}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t", .{ s.addr, s.struct_size, s.reads, s.commits, s.retries, s.update_failures, s.multi_commits }) catch continue; - _ = compat.writeAllFd(fd, line) catch return; + _ = compat.writeAllFd(fd, head) catch return; + + if (s.caller_trace_count == 0) { + _ = compat.writeAllFd(fd, "-\n") catch return; + } else { + var pos: usize = 0; + var i: u8 = 0; + while (i < s.caller_trace_count) : (i += 1) { + const sep = if (i == 0) "" else ","; + const slc = std.fmt.bufPrint(buf[pos..], "{s}0x{x}", .{ sep, s.caller_trace[i] }) catch break; + pos += slc.len; + } + const tail = std.fmt.bufPrint(buf[pos..], "\n", .{}) catch continue; + pos += tail.len; + _ = compat.writeAllFd(fd, buf[0..pos]) catch return; + } } } diff --git a/zig/runtime/profile-trace.zig b/zig/runtime/profile-trace.zig new file mode 100644 index 000000000..726c0a59a --- /dev/null +++ b/zig/runtime/profile-trace.zig @@ -0,0 +1,64 @@ +// profile-trace.zig -- Shared stack-trace + sampling helpers for the +// alloc / lock / mvcc profile recorders. +// +// Each recorder has its own state (table, lock, sample counter), but +// the trace-capture call shape is identical. We centralize: +// * MAX_FRAMES (one number to tune) +// * captureFromHere() — wraps std.debug.captureCurrentStackTrace +// * syncCallstacksEnabled() — reads CLEAR_PROFILE_SYNC_CALLSTACKS +// once on first call. Used by lock + mvcc to gate stack capture +// behind `clear profile --sync-callstacks` (off by default; the +// flag costs ~100-500ns per record and scales the wait-cheap +// fast paths poorly). +// * sampleStride() — reads CLEAR_PROFILE_SAMPLE; alloc-profile keeps +// its own copy because it computed sample_n before this module +// was extracted. + +const std = @import("std"); + +pub const MAX_FRAMES: u8 = 16; + +// Capture a stack trace starting at `first_addr` (typically the +// caller's `@returnAddress()`). Returns the number of frames written +// into `addrs`. The buffer lives on the caller's stack -- 128 B at +// MAX_FRAMES=16 -- and the FP walk reads but does not push frames, +// so the cost is one `[16]usize` slot on the calling stack plus the +// walk itself. +pub fn captureFromHere(first_addr: usize, addrs: *[MAX_FRAMES]usize) u8 { + const trace = std.debug.captureCurrentStackTrace( + .{ .first_address = first_addr }, + addrs, + ); + return @intCast(@min(trace.return_addresses.len, MAX_FRAMES)); +} + +var sync_callstacks_state: enum { uninit, off, on } = .uninit; + +pub fn syncCallstacksEnabled() bool { + if (sync_callstacks_state != .uninit) return sync_callstacks_state == .on; + if (std.c.getenv("CLEAR_PROFILE_SYNC_CALLSTACKS")) |env| { + const slc = std.mem.span(env); + if (slc.len > 0 and slc[0] != '0') { + sync_callstacks_state = .on; + return true; + } + } + sync_callstacks_state = .off; + return false; +} + +var sample_n: u64 = 0; + +// Read CLEAR_PROFILE_SAMPLE once and cache. Returns >=1; 0 in the env +// is treated as 1 (no sampling). +pub fn sampleStride() u64 { + if (sample_n != 0) return sample_n; + if (std.c.getenv("CLEAR_PROFILE_SAMPLE")) |env| { + const slc = std.mem.span(env); + const n = std.fmt.parseInt(u64, slc, 10) catch 1; + sample_n = if (n == 0) 1 else n; + } else { + sample_n = 1; + } + return sample_n; +}