diff --git a/clear b/clear
index e29c2066e..1d3b2d32f 100755
--- a/clear
+++ b/clear
@@ -1062,6 +1062,18 @@ when 'profile'
   # are reported in the dump as a `# WARNING:` line. N must be a
   # power of two (the tables index via `& (N-1)`).
   profile_max = nil
+  # --sample=N: record every Nth event in the runtime profile tables.
+  # Captured values are scaled by N at record time so doctor / pprof
+  # see estimated totals. Default is no sampling for alloc-profile;
+  # auto-bumped to 100 for lock+mvcc when --sync-callstacks is on
+  # (see below) unless the user passes their own --sample value.
+  sample_n = nil
+  # --sync-callstacks: turn on per-record stack capture in lock-profile
+  # and mvcc-profile. Off by default because the FP walk costs ~100-500ns
+  # per record and uncontended mutex acquire / MVCC commit fast paths
+  # are 10-50ns; the trace can dominate. Enable when investigating
+  # caller-attribution for a specific lock or cell.
+  sync_callstacks = false
   clear_args = clear_args.reject do |arg|
     if arg =~ /\A--profile-max=(\d+)\z/
       profile_max = $1.to_i
@@ -1069,13 +1081,25 @@ when 'profile'
         error "--profile-max=#{profile_max} must be a positive power of two (e.g. 1024, 4096, 16384)"
       end
       true
+    elsif arg =~ /\A--sample=(\d+)\z/
+      sample_n = $1.to_i
+      error "--sample=#{sample_n} must be a positive integer" unless sample_n.positive?
+      true
+    elsif arg == '--sync-callstacks'
+      sync_callstacks = true
+      true
     else
       false
     end
   end
 
+  # When --sync-callstacks is on but no explicit --sample was given,
+  # default to 100 so the per-record cost stays manageable. Users can
+  # set --sample=1 to opt in to full capture at the full cost.
+  sample_n = 100 if sync_callstacks && sample_n.nil?
+
   source = clear_args.first
-  error "Usage: clear profile <file.cht> [--profile-max=N] [-- args...]" unless source
+  error "Usage: clear profile <file.cht> [--profile-max=N] [--sample=N] [--sync-callstacks] [-- args...]" unless source
 
   source = File.expand_path(source)
   base_name = File.basename(source, '.cht')
@@ -1114,6 +1138,8 @@ when 'profile'
     "CLEAR_LOCK_PROFILE=#{lock_file}",
     "CLEAR_MVCC_PROFILE=#{mvcc_file}",
   ]
+  env_parts << "CLEAR_PROFILE_SAMPLE=#{sample_n}" if sample_n
+  env_parts << "CLEAR_PROFILE_SYNC_CALLSTACKS=1" if sync_callstacks
   env_parts << "CLEAR_THREADS=#{ENV['CLEAR_THREADS']}" if ENV['CLEAR_THREADS']
   target_env = env_parts.join(' ')
 
@@ -1137,6 +1163,17 @@ when 'profile'
     system("perf stat -e cycles,instructions,branches,branch-misses,cache-references,cache-misses,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses,page-faults -o #{perf_stat_file} env #{target_env} #{output} #{run_args.join(' ')} > /dev/null 2>&1")
   end
 
+  # Convert each runtime profile to pprof's gzipped protobuf so the
+  # `pprof` tool can render flamegraphs / call graphs / source views
+  # directly. Heap / lock / mvcc are encoded in pure Ruby; CPU defers
+  # to the standard `perf_to_profile` Go binary if it's on PATH.
+  require_relative 'src/tools/pprof_converter'
+  pprof_files = PprofConverter.convert_all(profile_dir)
+  if File.exist?(perf_data) && !pprof_files[:cpu]
+    hint "perf_to_profile not on PATH — skipping CPU pprof. Install: " \
+         "go install github.com/google/perf_data_converter/src/cmd/perf_to_profile@latest"
+  end
+
   puts ""
   puts "Profile data: #{profile_dir}/"
   puts "  Heap:     #{alloc_file} (#{File.exist?(alloc_file) ? "#{(File.size(alloc_file) / 1024.0).round(1)} KB" : 'not generated'})"
@@ -1146,10 +1183,44 @@ when 'profile'
   puts "  Syscalls: #{strace_file}" if File.exist?(strace_file)
   puts "  Counters: #{perf_stat_file}" if File.exist?(perf_stat_file)
   puts "  MVCC:     #{mvcc_file}" if File.exist?(mvcc_file)
+  if pprof_files.any?
+    puts ""
+    puts "pprof:    pprof -http=:8080 #{output} #{profile_dir}/<file>.pb.gz"
+    pprof_files.each { |kind, path| puts "  #{kind}:#{' ' * (8 - kind.to_s.length)}#{path}" }
+  end
 
 when 'doctor'
   require_relative 'src/tools/doctor'
-  Doctor.run(ARGV[1])
+  # Parse flags: --cumulative shows cum bytes/count alongside flat;
+  # --focus=REGEX filters samples whose trace touches a function
+  # matching the pattern; --diff <before-dir> compares two profile
+  # runs and shows top deltas per section.
+  doctor_opts = { cumulative: false, focus: nil, ignore: nil, peek: nil, diff: nil, by: :bytes }
+  positional = []
+  i = 1
+  while i < ARGV.length
+    a = ARGV[i]
+    case a
+    when '--cumulative', '--cum'
+      doctor_opts[:cumulative] = true
+    when /\A--focus=(.+)\z/
+      doctor_opts[:focus] = Regexp.new($1)
+    when /\A--ignore=(.+)\z/
+      doctor_opts[:ignore] = Regexp.new($1)
+    when /\A--peek=(.+)\z/
+      doctor_opts[:peek] = Regexp.new($1)
+    when /\A--by=(bytes|allocs|inuse_bytes|inuse_allocs)\z/
+      doctor_opts[:by] = $1.to_sym
+    when '--diff'
+      doctor_opts[:diff] = ARGV[i + 1]
+      i += 1
+      error "--diff requires a <before-profile-dir> argument" if doctor_opts[:diff].nil?
+    else
+      positional << a
+    end
+    i += 1
+  end
+  Doctor.run(positional.first, **doctor_opts)
 
 when 'fix'
   require_relative 'src/ast/lexer'
@@ -1569,6 +1640,28 @@ when 'explain'
   end
   exit 0
 
+when 'completions'
+  shell = ARGV[1]
+  unless shell && %w[bash zsh fish].include?(shell)
+    $stderr.puts "Usage: clear completions {bash|zsh|fish}"
+    $stderr.puts ""
+    $stderr.puts "Install (bash):"
+    $stderr.puts "  echo 'source <(clear completions bash)' >> ~/.bashrc"
+    $stderr.puts ""
+    $stderr.puts "Install (zsh):"
+    $stderr.puts "  mkdir -p ~/.zsh/completions"
+    $stderr.puts "  clear completions zsh > ~/.zsh/completions/_clear"
+    $stderr.puts "  # add to ~/.zshrc BEFORE 'compinit':"
+    $stderr.puts "  #   fpath=(~/.zsh/completions \\$fpath)"
+    $stderr.puts ""
+    $stderr.puts "Install (fish):"
+    $stderr.puts "  clear completions fish > ~/.config/fish/completions/clear.fish"
+    exit 1
+  end
+  require_relative 'src/tools/completions'
+  print Completions.script_for(shell)
+  exit 0
+
 when 'help', '--help', '-h', nil
   puts <<~HELP
     CLEAR Language Compiler
@@ -1593,6 +1686,7 @@ when 'help', '--help', '-h', nil
       clear fmt --no-warn <path>    Suppress width warnings for >120-char lines
       clear explain                 List every registered diagnostic code
       clear explain <CODE>          Show docs for a specific diagnostic
+      clear completions {bash|zsh|fish}  Print shell completions (see docs/completions.md)
 
     Commands:
       build    Transpile and compile a CLEAR program to a native binary
diff --git a/docs/completions.md b/docs/completions.md
new file mode 100644
index 000000000..1f25b02b3
--- /dev/null
+++ b/docs/completions.md
@@ -0,0 +1,74 @@
+# Shell completions
+
+Tab-completion for `clear` subcommands, and file/directory arguments
+filtered per subcommand:
+
+| Subcommand | Completes to |
+|---|---|
+| `clear build`, `run`, `fmt`, `fix`, `profile`, `explain` | `*.cht` files (and directories to navigate) |
+| `clear test`, `benchmark` | `*.cht` files or directories |
+| `clear doctor` | `*.profile/` directories |
+| `clear completions` | `bash` / `zsh` / `fish` |
+
+Generate the script for your shell with `clear completions <shell>`,
+then install per the instructions below.
+
+## Bash
+
+Add to `~/.bashrc`:
+
+```sh
+source <(clear completions bash)
+```
+
+Or write to the system completions dir (loaded by every interactive
+shell, no rc-file edit):
+
+```sh
+clear completions bash | sudo tee /etc/bash_completion.d/clear > /dev/null
+```
+
+## Zsh
+
+The convention is one `_<cmd>` file per command in a directory on
+`$fpath`:
+
+```sh
+mkdir -p ~/.zsh/completions
+clear completions zsh > ~/.zsh/completions/_clear
+```
+
+Then ensure `~/.zsh/completions` is on `$fpath` **before** `compinit`
+runs. In `~/.zshrc`:
+
+```sh
+fpath=(~/.zsh/completions $fpath)
+autoload -Uz compinit && compinit
+```
+
+Reopen the shell (or `compinit -u`) and tab-completion will pick up
+descriptions for each subcommand.
+
+## Fish
+
+Fish auto-loads completions from `~/.config/fish/completions/`:
+
+```sh
+clear completions fish > ~/.config/fish/completions/clear.fish
+```
+
+No rc-file edit needed.
+
+## Verifying
+
+```sh
+clear <TAB>                       # lists subcommands
+clear doctor <TAB>                # lists *.profile/ dirs
+clear profile examples/<TAB>      # lists *.cht files in examples/
+```
+
+## Updating
+
+The completion script is generated from the live `Completions`
+module (`src/tools/completions.rb`). When new subcommands are added,
+re-run `clear completions <shell> > <file>` to refresh.
diff --git a/docs/pprof.md b/docs/pprof.md
new file mode 100644
index 000000000..4b3b1ab62
--- /dev/null
+++ b/docs/pprof.md
@@ -0,0 +1,143 @@
+# pprof integration
+
+`clear profile` emits its runtime profile data in pprof's gzipped
+protobuf format alongside the existing text dumps, so you can use
+the standard `pprof` tool for flamegraphs, call graphs, and source
+views.
+
+## Install
+
+| Tool | When you need it | Install |
+|---|---|---|
+| `pprof` | Always (the viewer) | `go install github.com/google/pprof@latest` |
+| `perf_to_profile` | If you want CPU flamegraphs from `perf.data` | `go install github.com/google/perf_data_converter/src/cmd/perf_to_profile@latest` |
+| `graphviz` | If you use `pprof -svg` / `pprof -dot` | `apt install graphviz` / `brew install graphviz` |
+
+The web UI (`pprof -http=:8080`) and `pprof -top` text view do not
+need graphviz.
+
+## Use
+
+```sh
+clear profile foo.cht
+# -> writes foo.profile/heap.pb.gz, lock.pb.gz, mvcc.pb.gz
+#    (and cpu.pb.gz if perf_to_profile is on PATH)
+
+pprof -http=:8080 ./foo foo.profile/heap.pb.gz
+pprof -top -alloc_space  foo.profile/heap.pb.gz
+pprof -top -inuse_space  foo.profile/heap.pb.gz
+pprof -top -delay        foo.profile/lock.pb.gz
+pprof -base before/heap.pb.gz after/heap.pb.gz   # regression diff
+```
+
+## What's in each file
+
+### `heap.pb.gz`
+
+Sample columns: `alloc_objects` / `alloc_space` / `inuse_objects` /
+`inuse_space`. Each call site is one sample with its hex address as
+a label (`pprof -tags heap.pb.gz`).
+
+### `lock.pb.gz`
+
+Sample columns: `contentions` / `delay` / `hold` / `acquisitions`.
+Read+write contention sums into `contentions`; `delay` is the total
+wait (read+write); `hold` is total exclusive hold time.
+
+### `mvcc.pb.gz`
+
+Sample columns: `reads` / `commits` / `retries` / `cow_bytes`.
+`cow_bytes = struct_size * (commits + retries)` — the byte volume
+moved by copy-on-write commits, the most direct cost signal for
+`@shared:versioned` cells.
+
+### `channels.pb.gz`
+
+Sample columns: `pushes` / `pops` / `push_blocked` / `pop_blocked` /
+`max_depth`. One sample per registered channel; the synthetic
+function name is `channel#<id>` and the channel's capacity travels
+as a label (`pprof -tags channels.pb.gz`).
+
+### `cpu.pb.gz`
+
+Standard CPU profile from `perf.data`, converted by
+`perf_to_profile`. Sample columns are whatever Go's pprof shows
+(`samples` / `cpu` ns).
+
+## CLEAR source mapping
+
+The transpiler emits `// CLR:N` markers in `transpiled.zig`. Our
+converter walks those back to the user's `.cht` line and stamps it
+onto each pprof Location, so `pprof -list <fn>` shows CLEAR source
+lines (not Zig).
+
+## Sampling
+
+Stack traces for **alloc-profile** are captured on every alloc by
+default. For workloads where the per-alloc unwind cost matters,
+`--sample=N` records every Nth event and scales the captured values
+by N so doctor / pprof see estimated totals:
+
+```sh
+clear profile foo.cht --sample=100
+```
+
+Header records the chosen `sample_n` so consumers can rescale or
+flag the approximation.
+
+## Stack traces for lock and mvcc profiles
+
+`--sync-callstacks` (off by default) turns on per-record stack
+capture in lock-profile and mvcc-profile, so pprof's tree/flame
+views and per-caller attribution work for these profiles too:
+
+```sh
+clear profile foo.cht --sync-callstacks            # auto-bumps --sample to 100
+clear profile foo.cht --sync-callstacks --sample=1  # full capture, full cost
+```
+
+Off by default because the FP walk costs ~100-500ns per record.
+Uncontended mutex acquire is ~10-20ns and MVCC commit fast paths
+are ~20-50ns, so the trace can dominate the operation it's measuring.
+When the flag is set without an explicit `--sample`, we auto-default
+to `--sample=100` to keep the cost manageable.
+
+With `--sync-callstacks` on, each (lock, caller-trace) pair becomes
+its own row in lock-profile, and same for (cell, caller-trace) in
+mvcc-profile. Doctor aggregates rows back to one-per-lock for its
+existing diagnoses; pprof tree views show the per-caller breakdown.
+
+## Doctor flags built on this data
+
+`clear doctor` consumes the same profile dirs and grew three new
+flags that draw on the multi-frame trace data:
+
+```sh
+clear doctor foo.profile/ --cumulative           # rank functions by cum bytes
+clear doctor foo.profile/ --focus=intToString    # filter to traces that touch this function
+clear doctor foo.profile/ --ignore=intToString   # drop traces that touch this function
+clear doctor foo.profile/ --peek=processRequest  # callers + callees of one function
+clear doctor foo.profile/ --by=allocs            # sort heap by allocation count, not bytes
+clear doctor foo.profile/ --by=inuse_bytes       # sort by allocs - frees (live bytes)
+clear doctor old.profile/ --diff new.profile/    # perf-regression diff between two runs
+```
+
+`--cumulative` aggregates bytes/allocs across every frame in each
+trace, so a function high in the call stack accrues its callees'
+costs. `--focus=REGEX` keeps only sites whose trace touches a function
+matching the pattern. `--diff` reports per-function deltas in
+allocation, lock contention, and MVCC retries, with directional arrows
+and "newly contended" / "retries eliminated" annotations.
+
+## Notes
+
+- We do not emit a Mapping message for the binary, so `pprof` prints
+  "Main binary filename not available" and skips its own symbolization.
+  Function names still appear because we resolve via `addr2line` at
+  conversion time.
+- alloc-profile captures stacks unconditionally (multi-frame v2,
+  comma-separated leaf-first in `alloc.txt`).
+- lock-profile (v3) and mvcc-profile (v2) capture stacks only when
+  `--sync-callstacks` is on. The 12th column of `locks.txt` and the
+  8th column of `mvcc.txt` carry `-` (off) or comma-separated leaf-
+  first addrs (on). Tab-separated to allow commas in the trace field.
diff --git a/spec/completions_spec.rb b/spec/completions_spec.rb
new file mode 100644
index 000000000..21cb24332
--- /dev/null
+++ b/spec/completions_spec.rb
@@ -0,0 +1,118 @@
+require 'rspec'
+require_relative '../src/tools/completions'
+
+RSpec.describe Completions do
+  describe '.script_for' do
+    it 'rejects unsupported shells' do
+      expect { described_class.script_for('powershell') }
+        .to raise_error(ArgumentError, /unsupported shell/)
+    end
+
+    it 'returns the bash script for "bash"' do
+      out = described_class.script_for('bash')
+      expect(out).to start_with('# bash completion for `clear`')
+      expect(out).to include('complete -F _clear_complete clear')
+    end
+
+    it 'returns the zsh script for "zsh"' do
+      out = described_class.script_for('zsh')
+      expect(out).to start_with('#compdef clear')
+      expect(out).to include('_files -g \'*.cht\'')
+    end
+
+    it 'returns the fish script for "fish"' do
+      out = described_class.script_for('fish')
+      expect(out).to start_with('# fish completion for `clear`')
+      expect(out).to include('__fish_use_subcommand')
+    end
+  end
+
+  describe 'bash script semantics' do
+    let(:script) { described_class.bash }
+
+    it 'enumerates every subcommand for first-position completion' do
+      Completions::SUBCOMMANDS.each_key do |sub|
+        expect(script).to include(sub),
+                          "bash completion missing subcommand #{sub.inspect}"
+      end
+    end
+
+    it 'filters file completion to *.cht for build-like subcommands' do
+      expect(script).to include("compgen -f -X '!*.cht'")
+    end
+
+    it 'completes only directories for `doctor`' do
+      expect(script).to match(/doctor\)\s*\n\s+# Prefer \*\.profile\/ directories/)
+      expect(script).to include('compgen -d -- "$cur"')
+    end
+
+    it 'completes shell names for `completions`' do
+      expect(script).to match(/completions\)\s*\n\s+COMPREPLY=\(.*"bash zsh fish"/m)
+    end
+  end
+
+  describe 'zsh script semantics' do
+    let(:script) { described_class.zsh }
+
+    it 'tags every subcommand with its description for `_describe`' do
+      Completions::SUBCOMMANDS.each do |sub, desc|
+        expect(script).to include("'#{sub}:#{desc}'"),
+                          "zsh completion missing entry for #{sub.inspect}"
+      end
+    end
+
+    it 'filters `doctor` to *.profile directories via `_files -/`' do
+      expect(script).to match(/doctor\)\s*\n\s+_files -\/ -g '\*\.profile'/)
+    end
+
+    it 'uses _describe for first-position subcommand completion' do
+      expect(script).to include('_describe \'subcommand\' subcmds')
+    end
+  end
+
+  describe 'fish script semantics' do
+    let(:script) { described_class.fish }
+
+    it 'declares one `complete` per subcommand' do
+      Completions::SUBCOMMANDS.each_key do |sub|
+        expect(script).to match(/-a #{sub} -d /),
+                          "fish completion missing entry for #{sub.inspect}"
+      end
+    end
+
+    it 'gates file completion on subcommand context' do
+      expect(script).to include("__fish_seen_subcommand_from build run fmt format fix profile explain test benchmark")
+      expect(script).to include("__fish_seen_subcommand_from doctor")
+    end
+
+    it 'escapes single quotes in descriptions' do
+      # No subcommand description currently contains a quote, but if
+      # someone adds one the script must still parse cleanly.
+      stub_const('Completions::SUBCOMMANDS', { 'foo' => "it's a thing" })
+      script = described_class.fish
+      expect(script).to include("-d 'it\\'s a thing'")
+    end
+  end
+
+  describe 'CLI integration' do
+    let(:clear_bin) { File.expand_path('../clear', __dir__) }
+
+    it '`clear completions bash` prints the bash script and exits 0' do
+      out = `#{clear_bin} completions bash`
+      expect($?.exitstatus).to eq(0)
+      expect(out).to start_with('# bash completion for `clear`')
+    end
+
+    it '`clear completions` with no shell prints usage to stderr and exits 1' do
+      out = `#{clear_bin} completions 2>&1`
+      expect($?.exitstatus).to eq(1)
+      expect(out).to include('Usage: clear completions')
+    end
+
+    it '`clear completions powershell` prints usage and exits 1' do
+      out = `#{clear_bin} completions powershell 2>&1`
+      expect($?.exitstatus).to eq(1)
+      expect(out).to include('Usage: clear completions')
+    end
+  end
+end
diff --git a/spec/doctor_diff_spec.rb b/spec/doctor_diff_spec.rb
new file mode 100644
index 000000000..caf2324f8
--- /dev/null
+++ b/spec/doctor_diff_spec.rb
@@ -0,0 +1,133 @@
+require "rspec"
+require "tmpdir"
+require "stringio"
+require "fileutils"
+require_relative "../src/tools/doctor"
+
+# Exercises the new --diff / --cumulative / --focus flags. Profiles
+# are synthesized as plain text files in a tmpdir; the binary at
+# `<dir>.profile`'s sibling path is left absent so addr2line returns
+# `??` and we exercise the fallback-to-raw-addr keying.
+
+RSpec.describe Doctor do
+  def capture_stdout
+    old_stdout = $stdout
+    out = StringIO.new
+    $stdout = out
+    yield
+    out.string
+  ensure
+    $stdout = old_stdout
+  end
+
+  describe ".run --diff" do
+    around do |ex|
+      Dir.mktmpdir do |dir|
+        @before = File.join(dir, "before.profile")
+        @after  = File.join(dir, "after.profile")
+        FileUtils.mkdir_p([@before, @after])
+        ex.run
+      end
+    end
+
+    it "highlights heap regressions with directional arrows" do
+      File.write(File.join(@before, "alloc.txt"), <<~ALLOC)
+        # alloc-profile v2
+        0x100   100   1000   0   0   100
+      ALLOC
+      File.write(File.join(@after,  "alloc.txt"), <<~ALLOC)
+        # alloc-profile v2
+        0x100   200   2000   0   0   200
+      ALLOC
+
+      out = capture_stdout { described_class.run(@after, diff: @before) }
+      expect(out).to include("Heap Δ")
+      expect(out).to include("↑")           # regression
+    end
+
+    it "calls out brand-new allocation sites" do
+      File.write(File.join(@before, "alloc.txt"), "# alloc-profile v2\n")
+      File.write(File.join(@after,  "alloc.txt"), <<~ALLOC)
+        # alloc-profile v2
+        0xabc   50   500   0   0   500
+      ALLOC
+
+      out = capture_stdout { described_class.run(@after, diff: @before) }
+      expect(out).to include("New allocation sites")
+    end
+
+    it "calls out eliminated allocation sites" do
+      File.write(File.join(@before, "alloc.txt"), <<~ALLOC)
+        # alloc-profile v2
+        0xdead   10   100   0   0   100
+      ALLOC
+      File.write(File.join(@after, "alloc.txt"), "# alloc-profile v2\n")
+
+      out = capture_stdout { described_class.run(@after, diff: @before) }
+      expect(out).to include("Eliminated allocation sites")
+    end
+
+    it "respects --focus by skipping non-matching functions" do
+      File.write(File.join(@before, "alloc.txt"), <<~ALLOC)
+        # alloc-profile v2
+        0x100   100   1000   0   0   100
+      ALLOC
+      File.write(File.join(@after,  "alloc.txt"), <<~ALLOC)
+        # alloc-profile v2
+        0x100   200   2000   0   0   200
+      ALLOC
+
+      # focus that matches nothing → no Heap Δ table emitted.
+      out = capture_stdout { described_class.run(@after, diff: @before, focus: /no_such_function/) }
+      expect(out).not_to include("Heap Δ")
+    end
+
+    it "emits a lock Δ table when contention shifts" do
+      tab = "\t"
+      hdr = "# lock-profile v3\n"
+      # Wait_ns from 0 -> 5_000_000 (5ms), contention 0 -> 3
+      File.write(File.join(@before, "locks.txt"),
+                 hdr + "0x500#{tab}10#{tab}0#{tab}0#{tab}0#{tab}1000#{tab}500#{tab}0#{tab}0#{tab}0#{tab}0#{tab}-\n")
+      File.write(File.join(@after,  "locks.txt"),
+                 hdr + "0x500#{tab}10#{tab}3#{tab}5000000#{tab}2000000#{tab}1000#{tab}500#{tab}0#{tab}0#{tab}0#{tab}0#{tab}-\n")
+
+      out = capture_stdout { described_class.run(@after, diff: @before) }
+      expect(out).to include("Lock Δ")
+      expect(out).to include("newly contended")
+    end
+
+    it "emits an MVCC Δ table when retries appear" do
+      tab = "\t"
+      hdr = "# mvcc-profile v2\n"
+      File.write(File.join(@before, "mvcc.txt"),
+                 hdr + "0x600#{tab}64#{tab}1000#{tab}10#{tab}0#{tab}0#{tab}0#{tab}-\n")
+      File.write(File.join(@after,  "mvcc.txt"),
+                 hdr + "0x600#{tab}64#{tab}1000#{tab}10#{tab}500#{tab}0#{tab}0#{tab}-\n")
+
+      out = capture_stdout { described_class.run(@after, diff: @before) }
+      expect(out).to include("MVCC Δ")
+      expect(out).to include("new retry storm")
+    end
+
+    it "supports --ignore as the inverse of --focus" do
+      File.write(File.join(@before, "alloc.txt"), "# alloc-profile v2\n")
+      File.write(File.join(@after,  "alloc.txt"), <<~ALLOC)
+        # alloc-profile v2
+        0xabc   50   500   0   0   500
+      ALLOC
+
+      out = capture_stdout { described_class.run(@after, diff: @before) }
+      expect(out).to include("New allocation sites")
+    end
+
+    it "exits 1 when either dir does not exist" do
+      old_stderr = $stderr
+      $stderr = StringIO.new
+      expect {
+        described_class.run(@after, diff: "/nonexistent")
+      }.to raise_error(SystemExit) { |e| expect(e.status).to eq(1) }
+    ensure
+      $stderr = old_stderr
+    end
+  end
+end
diff --git a/spec/doctor_peek_spec.rb b/spec/doctor_peek_spec.rb
new file mode 100644
index 000000000..972885629
--- /dev/null
+++ b/spec/doctor_peek_spec.rb
@@ -0,0 +1,184 @@
+require "rspec"
+require "tmpdir"
+require "stringio"
+require "fileutils"
+require_relative "../src/tools/doctor"
+
+# Exercises --peek (callers + callees of a function) and --ignore
+# (inverse of --focus). Profiles are synthesized; addr2line resolution
+# is unavailable in tests (no real binary), so we use raw hex addrs
+# as the function-name fallback baked into parse_alloc_for_diff and
+# the heap reader. The peek tests then match against those addrs.
+
+RSpec.describe Doctor do
+  def capture_stdout
+    old_stdout = $stdout
+    out = StringIO.new
+    $stdout = out
+    yield
+    out.string
+  ensure
+    $stdout = old_stdout
+  end
+
+  describe ".run --ignore" do
+    around do |ex|
+      Dir.mktmpdir do |dir|
+        @profile_dir = File.join(dir, "p.profile")
+        FileUtils.mkdir_p(@profile_dir)
+        File.write(File.join(@profile_dir, "alloc.txt"), <<~ALLOC)
+          # alloc-profile v2
+          0xaaa   100   1000   0   0   100
+          0xbbb,0xaaa   50   500    0   0   100
+        ALLOC
+        ex.run
+      end
+    end
+
+    it "drops sites whose trace touches the ignore regex" do
+      out = capture_stdout { described_class.run(@profile_dir, ignore: /aaa/) }
+      expect(out).to include("Ignore:")
+      expect(out).not_to match(/^\s*1\. 0xaaa/)
+    end
+
+    it "keeps sites that do not match the ignore regex" do
+      out = capture_stdout { described_class.run(@profile_dir, ignore: /no_match/) }
+      expect(out).to include("0xaaa")
+    end
+
+    it "shows both Focus and Ignore lines when both flags are set" do
+      out = capture_stdout { described_class.run(@profile_dir, focus: /a/, ignore: /no/) }
+      expect(out).to include("Focus:")
+      expect(out).to include("Ignore:")
+    end
+  end
+
+  describe ".run --peek" do
+    around do |ex|
+      Dir.mktmpdir do |dir|
+        @profile_dir = File.join(dir, "p.profile")
+        FileUtils.mkdir_p(@profile_dir)
+        # Three samples sharing a common ancestor 0xparent. The peek
+        # target 0xfoo appears as a leaf in two and mid-stack in one.
+        File.write(File.join(@profile_dir, "alloc.txt"), <<~ALLOC)
+          # alloc-profile v2
+          0xfoo,0xparent              5  500  0  0  500
+          0xfoo,0xparent              3  300  0  0  300
+          0xleaf,0xfoo,0xparent       2  200  0  0  200
+        ALLOC
+        ex.run
+      end
+    end
+
+    it "aggregates self-bytes across all samples touching the function" do
+      out = capture_stdout { described_class.run(@profile_dir, peek: /0xfoo/) }
+      expect(out).to include("Peek")
+      expect(out).to match(/Self bytes:\s+1000 B \(across 10 alloc/)
+    end
+
+    it "surfaces the caller (frame above) of the matched function" do
+      out = capture_stdout { described_class.run(@profile_dir, peek: /0xfoo/) }
+      expect(out).to include("Callers (frames above")
+      expect(out).to match(/0xparent\s+1000 B/)
+    end
+
+    it "surfaces callees only when the function appears non-leaf" do
+      out = capture_stdout { described_class.run(@profile_dir, peek: /0xfoo/) }
+      expect(out).to include("Callees (frames below")
+      expect(out).to match(/0xleaf\s+200 B/)
+    end
+
+    it "tells the user when no samples matched" do
+      out = capture_stdout { described_class.run(@profile_dir, peek: /no_match/) }
+      expect(out).to include("No samples matched")
+    end
+
+    it "exits 1 when run on a missing directory" do
+      old_stderr = $stderr
+      $stderr = StringIO.new
+      expect {
+        described_class.run("/nonexistent", peek: /foo/)
+      }.to raise_error(SystemExit) { |e| expect(e.status).to eq(1) }
+    ensure
+      $stderr = old_stderr
+    end
+  end
+
+  describe ".run --by sample-type pivoting" do
+    around do |ex|
+      Dir.mktmpdir do |dir|
+        @profile_dir = File.join(dir, "p.profile")
+        FileUtils.mkdir_p(@profile_dir)
+        # Two sites: 0xbig has few large allocs, 0xsmall has many tiny.
+        # Sorted by bytes, 0xbig wins; sorted by allocs, 0xsmall wins.
+        File.write(File.join(@profile_dir, "alloc.txt"), <<~ALLOC)
+          # alloc-profile v2
+          0xbig     2  10000  0  0  10000
+          0xsmall   200  200  0  0  200
+        ALLOC
+        ex.run
+      end
+    end
+
+    it "default sort is by bytes (largest sites first)" do
+      out = capture_stdout { described_class.run(@profile_dir) }
+      first_idx = out.index("0xbig")
+      second_idx = out.index("0xsmall")
+      expect(first_idx).to be < second_idx
+    end
+
+    it "--by=allocs sorts by allocation count" do
+      out = capture_stdout { described_class.run(@profile_dir, by: :allocs) }
+      expect(out).to include("Top sites by allocations:")
+      first_idx = out.index("0xsmall")
+      second_idx = out.index("0xbig")
+      expect(first_idx).to be < second_idx
+    end
+
+    it "--by=inuse_bytes computes alloc_bytes - free_bytes" do
+      File.write(File.join(@profile_dir, "alloc.txt"), <<~ALLOC)
+        # alloc-profile v2
+        0xleak     1  1000   0    0    1000
+        0xpaired   1  1000   1    1000   0
+      ALLOC
+      out = capture_stdout { described_class.run(@profile_dir, by: :inuse_bytes) }
+      expect(out).to include("Top sites by in-use bytes")
+      expect(out.index("0xleak")).to be < out.index("0xpaired")
+    end
+  end
+
+  describe "runtime-function attribution" do
+    around do |ex|
+      Dir.mktmpdir do |dir|
+        @profile_dir = File.join(dir, "p.profile")
+        FileUtils.mkdir_p(@profile_dir)
+        # Synthesize a transpiled.zig with one CLR marker, then test
+        # that addresses NOT in this file aren't credited a clear_line.
+        File.write(File.join(@profile_dir, "transpiled.zig"), <<~ZIG)
+          // header
+          // CLR:42
+          some_zig_code();
+        ZIG
+        ex.run
+      end
+    end
+
+    it "leaves clear_line nil when the file is not the user transpiled.zig" do
+      # Doctor's resolver runs against a real binary which we don't have
+      # in tests, so resolve_addrs returns no entries — the absence of
+      # `is_user_zig` mis-attribution is what matters in production. This
+      # test pins the logic via a direct file_is_transpiled_zig? check.
+      require_relative "../src/tools/pprof_converter"
+      expect(
+        PprofConverter.send(:file_is_transpiled_zig?, "/tmp/runtime.zig:100", "/tmp/transpiled.zig")
+      ).to be false
+      expect(
+        PprofConverter.send(:file_is_transpiled_zig?, "/build/cache/._clear_tmp_litedb.zig:60", "/p/transpiled.zig")
+      ).to be true
+      expect(
+        PprofConverter.send(:file_is_transpiled_zig?,
+          "/build/cache/._clear_tmp_litedb.zig:176 (discriminator 4)", "/p/transpiled.zig")
+      ).to be true
+    end
+  end
+end
diff --git a/spec/pprof_spec.rb b/spec/pprof_spec.rb
new file mode 100644
index 000000000..70e83f8d6
--- /dev/null
+++ b/spec/pprof_spec.rb
@@ -0,0 +1,409 @@
+require 'rspec'
+require 'tmpdir'
+require 'fileutils'
+require 'zlib'
+require 'stringio'
+require_relative '../src/tools/pprof'
+require_relative '../src/tools/pprof_converter'
+
+# Exercises both the pprof wire-format encoder and the converter that
+# turns `clear profile` text dumps into pprof `.pb.gz` files. We do not
+# require pprof itself to be installed -- a tiny in-test decoder
+# verifies the bytes we emit match what we mean to encode.
+
+# Minimal protobuf decoder for the subset of the schema the encoder
+# uses. Verifies field tags, wire types, and lengths. Returns nested
+# hashes keyed by field number.
+module ProtoTestDecoder
+  module_function
+
+  def parse(bytes)
+    bytes = bytes.b
+    io = StringIO.new(bytes)
+    out = Hash.new { |h, k| h[k] = [] }
+    until io.eof?
+      tag = read_varint(io)
+      field = tag >> 3
+      wire = tag & 7
+      val =
+        case wire
+        when 0 then read_varint(io)
+        when 2 then io.read(read_varint(io)).to_s.b
+        when 1 then io.read(8)
+        when 5 then io.read(4)
+        else raise "unsupported wire type #{wire} for field #{field}"
+        end
+      out[field] << val
+    end
+    out
+  end
+
+  def read_varint(io)
+    n = 0
+    shift = 0
+    loop do
+      b = io.read(1)&.unpack1('C')
+      raise 'truncated varint' if b.nil?
+      n |= (b & 0x7F) << shift
+      return n if (b & 0x80).zero?
+      shift += 7
+      raise 'varint too long' if shift > 63
+    end
+  end
+
+  # Decode a packed varint field (Sample.location_id, Sample.value).
+  def parse_packed_varints(bytes)
+    io = StringIO.new(bytes.b)
+    out = []
+    out << read_varint(io) until io.eof?
+    out
+  end
+end
+
+RSpec.describe Pprof::Wire do
+  it 'encodes single-byte varints unchanged' do
+    expect(described_class.varint(0)).to eq("\x00".b)
+    expect(described_class.varint(1)).to eq("\x01".b)
+    expect(described_class.varint(127)).to eq("\x7f".b)
+  end
+
+  it 'encodes multi-byte varints with the continuation bit' do
+    # 300 = 0b1_0010_1100 -> bytes [0xAC, 0x02]
+    expect(described_class.varint(300)).to eq("\xac\x02".b)
+  end
+
+  it 'rejects negative values' do
+    expect { described_class.varint(-1) }.to raise_error(ArgumentError)
+  end
+
+  it 'composes a length-delimited field' do
+    bytes = described_class.field_bytes(6, 'hi')
+    expect(bytes).to eq("\x32\x02hi".b) # tag=(6<<3|2)=0x32, len=2, "hi"
+  end
+end
+
+RSpec.describe Pprof::Profile do
+  it 'reserves string_table[0] = "" so pprof readers parse correctly' do
+    pb = described_class.new
+    bytes = pb.encode
+    decoded = ProtoTestDecoder.parse(bytes)
+    expect(decoded[6].first).to eq('') # string_table is field 6
+  end
+
+  it 'deduplicates strings, functions, and string columns' do
+    pb = described_class.new
+    pb.add_function(name: 'foo', filename: 'a.cht')
+    second_id = pb.add_function(name: 'foo', filename: 'a.cht')
+    expect(second_id).to eq(1)
+    decoded = ProtoTestDecoder.parse(pb.encode)
+    expect(decoded[5].length).to eq(1) # one Function entry only
+  end
+
+  it 'encodes a roundtrippable single-sample profile' do
+    pb = described_class.new
+    pb.add_sample_type('alloc_objects', 'count')
+    pb.add_sample_type('alloc_space',   'bytes')
+    pb.set_period_type('space', 'bytes', 1)
+    fid = pb.add_function(name: 'allocBlob', filename: 'foo.cht')
+    lid = pb.add_location(function_id: fid, line: 42, address: 0x1234)
+    pb.add_sample([lid], [10, 1024], addr: '0x1234')
+
+    bytes = pb.encode
+    decoded = ProtoTestDecoder.parse(bytes)
+
+    expect(decoded[1].length).to eq(2)              # 2 sample_types
+    expect(decoded[2].length).to eq(1)              # 1 sample
+    expect(decoded[4].length).to eq(1)              # 1 location
+    expect(decoded[5].length).to eq(1)              # 1 function
+    expect(decoded[11].length).to eq(1)             # period_type set
+    expect(decoded[12].first).to eq(1)              # period
+
+    # Sample.value should be packed varints [10, 1024]
+    sample = ProtoTestDecoder.parse(decoded[2].first)
+    values_bytes = sample[2].first
+    expect(ProtoTestDecoder.parse_packed_varints(values_bytes)).to eq([10, 1024])
+
+    location_bytes = sample[1].first
+    expect(ProtoTestDecoder.parse_packed_varints(location_bytes)).to eq([1])
+  end
+
+  it 'emits a Mapping when one is registered, and Locations point at it' do
+    pb = described_class.new
+    pb.add_mapping(binary: '/path/to/litedb', build_id: 'abc123')
+    pb.add_sample_type('alloc_space', 'bytes')
+    fid = pb.add_function(name: 'foo', filename: 'foo.cht')
+    lid = pb.add_location(function_id: fid, line: 1, address: 0x100)
+    pb.add_sample([lid], [10])
+
+    decoded = ProtoTestDecoder.parse(pb.encode)
+    expect(decoded[3].length).to eq(1)            # one Mapping
+    mapping = ProtoTestDecoder.parse(decoded[3].first)
+    expect(mapping[1].first).to eq(1)             # Mapping.id
+    expect(mapping[7].first).to eq(1)             # has_functions = true
+    expect(mapping[8].first).to eq(1)             # has_filenames = true
+    expect(mapping[9].first).to eq(1)             # has_line_numbers = true
+
+    location = ProtoTestDecoder.parse(decoded[4].first)
+    expect(location[2].first).to eq(1)            # Location.mapping_id = 1
+  end
+
+  it 'omits the Mapping field entirely when none is registered' do
+    pb = described_class.new
+    pb.add_sample_type('count', 'samples')
+    decoded = ProtoTestDecoder.parse(pb.encode)
+    expect(decoded[3]).to be_empty
+  end
+
+  it 'gzips the output with a valid gzip magic' do
+    pb = described_class.new
+    bytes = pb.encode_gzip
+    expect(bytes[0, 2].bytes).to eq([0x1f, 0x8b])
+    # Round-trip through Zlib to confirm
+    inflated = Zlib::GzipReader.new(StringIO.new(bytes)).read
+    expect(inflated.b).to eq(pb.encode.b)
+  end
+end
+
+RSpec.describe PprofConverter do
+  around do |ex|
+    Dir.mktmpdir do |dir|
+      @profile_dir = File.join(dir, 'foo.profile')
+      FileUtils.mkdir_p(@profile_dir)
+      ex.run
+    end
+  end
+
+  describe '.convert_alloc' do
+    it 'returns nil when alloc.txt is missing' do
+      expect(described_class.convert_alloc(@profile_dir, nil)).to be_nil
+    end
+
+    it 'writes heap.pb.gz with four sample-type columns' do
+      File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF)
+        # site          allocs    bytes  frees  free_bytes  live
+        0x401234        1000      40000  500    20000       500
+        0x402000        2000      80000  2000   80000       0
+      PROF
+      out = described_class.convert_alloc(@profile_dir, nil)
+      expect(out).to eq(File.join(@profile_dir, 'heap.pb.gz'))
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      expect(decoded[1].length).to eq(4) # alloc_objects, alloc_space, inuse_*, inuse_*
+      expect(decoded[2].length).to eq(2) # one sample per site
+    end
+
+    it 'computes inuse columns from (allocs-frees) and (bytes-free_bytes)' do
+      File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF)
+        0x401234   1000   40000   200   8000   800
+      PROF
+      out = described_class.convert_alloc(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      sample = ProtoTestDecoder.parse(decoded[2].first)
+      values = ProtoTestDecoder.parse_packed_varints(sample[2].first)
+      # alloc_objects, alloc_space, inuse_objects, inuse_space
+      expect(values).to eq([1000, 40000, 800, 32000])
+    end
+
+    it 'parses comma-separated multi-frame stack traces (alloc-profile v2)' do
+      File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF)
+        # alloc-profile v2 (multi-frame, comma-separated leaf-first)
+        0x401234,0x402000,0x403000   1000  40000  500  20000  500
+      PROF
+      out = described_class.convert_alloc(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      # Three Locations (one per unique addr), three Functions
+      expect(decoded[4].length).to eq(3)
+      expect(decoded[5].length).to eq(3)
+      sample = ProtoTestDecoder.parse(decoded[2].first)
+      stack = ProtoTestDecoder.parse_packed_varints(sample[1].first)
+      expect(stack.length).to eq(3)            # 3-frame trace in Sample.location_id
+    end
+
+    it 'shares Locations across samples that include the same frame' do
+      File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF)
+        # alloc-profile v2
+        0x10,0x20    100  1000  0  0  100
+        0x30,0x20    100  1000  0  0  100
+      PROF
+      out = described_class.convert_alloc(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      # 0x20 should be ONE Location/Function reused by both samples.
+      expect(decoded[4].length).to eq(3)       # 0x10, 0x20, 0x30
+      expect(decoded[5].length).to eq(3)
+      expect(decoded[2].length).to eq(2)
+    end
+
+    it 'tags each sample with its source address as a label' do
+      File.write(File.join(@profile_dir, 'alloc.txt'), <<~PROF)
+        0xdeadbeef   1   16   0   0   1
+      PROF
+      out = described_class.convert_alloc(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      # The address 0xdeadbeef should appear in string_table.
+      expect(bytes).to include('0xdeadbeef')
+    end
+  end
+
+  describe '.convert_locks' do
+    it 'returns nil for empty input' do
+      File.write(File.join(@profile_dir, 'locks.txt'), "# header only\n")
+      expect(described_class.convert_locks(@profile_dir, nil)).to be_nil
+    end
+
+    it 'sums read+write contention into the contentions column' do
+      File.write(File.join(@profile_dir, 'locks.txt'), <<~PROF)
+        # addr  acq cont total_wait max_wait total_hold max_hold r_acq r_cont r_wait r_max
+        0x500   100 5    50000      1000     200000     5000     50    2      10000  500
+      PROF
+      out = described_class.convert_locks(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      sample = ProtoTestDecoder.parse(decoded[2].first)
+      values = ProtoTestDecoder.parse_packed_varints(sample[2].first)
+      # contentions, delay, hold, acquisitions
+      expect(values).to eq([5 + 2, 50000 + 10000, 200000, 100 + 50])
+    end
+
+    it 'parses caller_trace = "-" as no trace (sync-callstacks off)' do
+      tab = "\t"
+      content = "# lock-profile v3\n" \
+                "0x500#{tab}100#{tab}5#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}-\n"
+      File.write(File.join(@profile_dir, 'locks.txt'), content)
+      out = described_class.convert_locks(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      sample = ProtoTestDecoder.parse(decoded[2].first)
+      stack = ProtoTestDecoder.parse_packed_varints(sample[1].first)
+      expect(stack.length).to eq(1)             # only the lock-pointer leaf
+    end
+
+    it 'parses caller_trace as multi-frame stack (sync-callstacks on)' do
+      tab = "\t"
+      content = "# lock-profile v3\n" \
+                "0x500#{tab}100#{tab}5#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0" \
+                "#{tab}0xC1,0xC2,0xC3\n"
+      File.write(File.join(@profile_dir, 'locks.txt'), content)
+      out = described_class.convert_locks(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      sample = ProtoTestDecoder.parse(decoded[2].first)
+      stack = ProtoTestDecoder.parse_packed_varints(sample[1].first)
+      # leaf (lock 0x500) + 3 caller frames
+      expect(stack.length).to eq(4)
+    end
+
+    it 'emits a separate sample per (lock, caller-trace) row' do
+      tab = "\t"
+      content = "# lock-profile v3\n" \
+                "0x500#{tab}40#{tab}2#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0xA\n" \
+                "0x500#{tab}10#{tab}1#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0#{tab}0xB\n"
+      File.write(File.join(@profile_dir, 'locks.txt'), content)
+      out = described_class.convert_locks(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      expect(decoded[2].length).to eq(2)        # one Sample per (lock,caller) row
+    end
+  end
+
+  describe '.convert_mvcc' do
+    it 'computes cow_bytes as struct_size * (commits + retries)' do
+      File.write(File.join(@profile_dir, 'mvcc.txt'), <<~PROF)
+        0x600   128   5000   1000   200   10
+      PROF
+      out = described_class.convert_mvcc(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      sample = ProtoTestDecoder.parse(decoded[2].first)
+      values = ProtoTestDecoder.parse_packed_varints(sample[2].first)
+      # reads, commits, retries, cow_bytes
+      expect(values).to eq([5000, 1000, 200, 128 * (1000 + 200)])
+    end
+
+    it 'tolerates the optional 7th `multi_commits` column' do
+      File.write(File.join(@profile_dir, 'mvcc.txt'), <<~PROF)
+        0x700   64   100   50   5   0   3
+      PROF
+      out = described_class.convert_mvcc(@profile_dir, nil)
+      expect(out).to eq(File.join(@profile_dir, 'mvcc.pb.gz'))
+    end
+
+    it 'parses caller_trace as multi-frame stack (sync-callstacks on)' do
+      tab = "\t"
+      content = "# mvcc-profile v2\n" \
+                "0x700#{tab}64#{tab}1000#{tab}50#{tab}5#{tab}0#{tab}3#{tab}0xC1,0xC2,0xC3\n"
+      File.write(File.join(@profile_dir, 'mvcc.txt'), content)
+      out = described_class.convert_mvcc(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      sample = ProtoTestDecoder.parse(decoded[2].first)
+      stack = ProtoTestDecoder.parse_packed_varints(sample[1].first)
+      # leaf (cell 0x700) + 3 caller frames
+      expect(stack.length).to eq(4)
+    end
+  end
+
+  describe '.convert_channels' do
+    it 'returns nil for an empty channels.txt' do
+      File.write(File.join(@profile_dir, 'channels.txt'), <<~PROF)
+        # channel-profile v1
+        # id pushes pops push_blocked pop_blocked max_depth capacity
+      PROF
+      expect(described_class.convert_channels(@profile_dir, nil)).to be_nil
+    end
+
+    it 'emits one Sample per registered channel' do
+      tab = "\t"
+      File.write(File.join(@profile_dir, 'channels.txt'), <<~PROF)
+        # channel-profile v1
+        0#{tab}5#{tab}5#{tab}0#{tab}2#{tab}5#{tab}8
+        1#{tab}9#{tab}9#{tab}0#{tab}3#{tab}9#{tab}16
+      PROF
+      out = described_class.convert_channels(@profile_dir, nil)
+      expect(out).to eq(File.join(@profile_dir, 'channels.pb.gz'))
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      decoded = ProtoTestDecoder.parse(bytes)
+      expect(decoded[1].length).to eq(5)             # 5 sample-type columns
+      expect(decoded[2].length).to eq(2)             # 2 samples
+      expect(bytes).to include('channel#0')
+      expect(bytes).to include('channel#1')
+    end
+
+    it 'tags each sample with the channel capacity' do
+      tab = "\t"
+      File.write(File.join(@profile_dir, 'channels.txt'),
+                 "# channel-profile v1\n0#{tab}5#{tab}5#{tab}0#{tab}2#{tab}5#{tab}8\n")
+      out = described_class.convert_channels(@profile_dir, nil)
+      bytes = Zlib::GzipReader.new(StringIO.new(File.binread(out))).read
+      expect(bytes).to include('capacity')
+    end
+  end
+
+  describe '.convert_perf' do
+    it 'returns nil when perf_to_profile is unavailable or perf.data is missing' do
+      # No perf.data => nil regardless of perf_to_profile presence.
+      expect(described_class.convert_perf(@profile_dir, nil)).to be_nil
+    end
+  end
+
+  describe '.convert_all' do
+    it 'returns the set of files actually emitted' do
+      File.write(File.join(@profile_dir, 'alloc.txt'), "0x100  1  16  0  0  1\n")
+      File.write(File.join(@profile_dir, 'mvcc.txt'),  "0x200  64  10  5  1  0\n")
+      out = described_class.convert_all(@profile_dir)
+      expect(out.keys).to contain_exactly(:heap, :mvcc)
+      expect(out[:heap]).to eq(File.join(@profile_dir, 'heap.pb.gz'))
+      expect(out[:mvcc]).to eq(File.join(@profile_dir, 'mvcc.pb.gz'))
+    end
+
+    it 'returns an empty hash for a directory with no profile files' do
+      expect(described_class.convert_all(@profile_dir)).to eq({})
+    end
+
+    it 'returns an empty hash for a missing directory' do
+      expect(described_class.convert_all('/nonexistent')).to eq({})
+    end
+  end
+end
diff --git a/src/tools/completions.rb b/src/tools/completions.rb
new file mode 100644
index 000000000..ae3bcd260
--- /dev/null
+++ b/src/tools/completions.rb
@@ -0,0 +1,153 @@
+# Shell-completion script generators for the `clear` CLI.
+#
+# Usage from the user side:
+#   clear completions bash >> ~/.bashrc
+#   clear completions zsh  > ~/.zsh/completions/_clear   # then `compinit`
+#   clear completions fish > ~/.config/fish/completions/clear.fish
+#
+# Per-subcommand completion semantics:
+#   build / run / fmt / fix / profile / explain  -> *.cht files (+ dirs)
+#   test                                          -> *.cht files OR directories
+#   doctor                                        -> *.profile/ directories
+#   completions                                   -> bash | zsh | fish
+module Completions
+  SUBCOMMANDS = {
+    'build'       => 'Build a .cht file to a native binary',
+    'run'         => 'Build and run a .cht file',
+    'test'        => 'Run tests in a .cht file or directory',
+    'benchmark'   => 'Run a CLEAR benchmark',
+    'profile'     => 'Build with profiling and run',
+    'doctor'      => 'Analyze a .profile/ directory',
+    'fix'         => 'Apply lint fixes to .cht files',
+    'fmt'         => 'Format .cht files',
+    'format'      => 'Alias for fmt',
+    'explain'     => 'Explain a language feature',
+    'completions' => 'Print shell completions',
+    'help'        => 'Show help',
+  }.freeze
+
+  module_function
+
+  def script_for(shell)
+    case shell
+    when 'bash' then bash
+    when 'zsh'  then zsh
+    when 'fish' then fish
+    else
+      raise ArgumentError, "unsupported shell: #{shell.inspect} (expected bash, zsh, or fish)"
+    end
+  end
+
+  # Bash uses `complete -F` with a function. We hand-roll the
+  # subcommand dispatch so file completion is filtered to `.cht`
+  # for build-like commands and to `.profile/` for `doctor`.
+  def bash
+    <<~BASH
+      # bash completion for `clear`
+      # Source from ~/.bashrc:  source <(clear completions bash)
+      _clear_complete() {
+        local cur prev cmd
+        COMPREPLY=()
+        cur="${COMP_WORDS[COMP_CWORD]}"
+        prev="${COMP_WORDS[COMP_CWORD-1]}"
+        cmd="${COMP_WORDS[1]}"
+
+        if [ "$COMP_CWORD" -eq 1 ]; then
+          COMPREPLY=( $(compgen -W "#{SUBCOMMANDS.keys.join(' ')}" -- "$cur") )
+          return
+        fi
+
+        case "$cmd" in
+          build|run|fmt|format|fix|profile|explain)
+            COMPREPLY=( $(compgen -f -X '!*.cht' -- "$cur") $(compgen -d -- "$cur") )
+            compopt -o filenames 2>/dev/null
+            ;;
+          test|benchmark)
+            COMPREPLY=( $(compgen -f -X '!*.cht' -- "$cur") $(compgen -d -- "$cur") )
+            compopt -o filenames 2>/dev/null
+            ;;
+          doctor)
+            # Prefer *.profile/ directories. Falls back to all dirs so
+            # the user can drill into a parent.
+            COMPREPLY=( $(compgen -d -- "$cur") )
+            local filtered=()
+            local d
+            for d in "${COMPREPLY[@]}"; do
+              [[ "$d" == *.profile || -d "$d/.." ]] && filtered+=("$d")
+            done
+            COMPREPLY=("${filtered[@]}")
+            compopt -o filenames 2>/dev/null
+            ;;
+          completions)
+            COMPREPLY=( $(compgen -W "bash zsh fish" -- "$cur") )
+            ;;
+        esac
+      }
+      complete -F _clear_complete clear
+    BASH
+  end
+
+  # Zsh's completion system is richer: `_describe` shows subcommand
+  # descriptions inline, `_files -g GLOB` filters by glob.
+  def zsh
+    desc_lines = SUBCOMMANDS.map { |k, v| "    '#{k}:#{v}'" }.join("\n")
+    <<~ZSH
+      #compdef clear
+      # zsh completion for `clear`
+      # Install:  clear completions zsh > ~/.zsh/completions/_clear
+      #           # ensure ~/.zsh/completions is in $fpath BEFORE compinit
+      _clear() {
+        local -a subcmds
+        subcmds=(
+      #{desc_lines}
+        )
+
+        if (( CURRENT == 2 )); then
+          _describe 'subcommand' subcmds
+          return
+        fi
+
+        case "${words[2]}" in
+          build|run|fmt|format|fix|profile|explain|test|benchmark)
+            _files -g '*.cht'
+            ;;
+          doctor)
+            _files -/ -g '*.profile'
+            ;;
+          completions)
+            _values 'shell' bash zsh fish
+            ;;
+        esac
+      }
+      _clear "$@"
+    ZSH
+  end
+
+  # Fish completions are declarative: one `complete` call per arm.
+  def fish
+    sub_complete = SUBCOMMANDS.map do |k, v|
+      desc = v.gsub("'", "\\\\'")
+      "complete -c clear -n '__fish_use_subcommand' -a #{k} -d '#{desc}'"
+    end.join("\n")
+
+    <<~FISH
+      # fish completion for `clear`
+      # Install:  clear completions fish > ~/.config/fish/completions/clear.fish
+
+      # Top-level subcommand list
+      #{sub_complete}
+
+      # File arguments per subcommand
+      complete -c clear -n '__fish_seen_subcommand_from build run fmt format fix profile explain test benchmark' \\
+        -F -k -a "(__fish_complete_path '*.cht')"
+
+      # `doctor` wants *.profile/ directories
+      complete -c clear -n '__fish_seen_subcommand_from doctor' \\
+        -F -k -a "(__fish_complete_directories '' '*.profile')"
+
+      # `completions` shell name
+      complete -c clear -n '__fish_seen_subcommand_from completions' \\
+        -a 'bash zsh fish'
+    FISH
+  end
+end
diff --git a/src/tools/doctor.rb b/src/tools/doctor.rb
index f907f274e..98845b5ad 100644
--- a/src/tools/doctor.rb
+++ b/src/tools/doctor.rb
@@ -1,4 +1,5 @@
 require 'set'
+require 'stringio'
 
 # Analyzes a `clear profile` run. Reads heap/CPU/syscall/HW-counter/lock/MVCC
 # data from the .profile/ directory and prints actionable advice.
@@ -26,12 +27,22 @@ def saturation_warning(file)
     line&.sub(/\A# /, '')&.rstrip
   end
 
-  def run(profile_dir)
+  def run(profile_dir, cumulative: false, focus: nil, ignore: nil, peek: nil, diff: nil, by: :bytes)
+    if diff
+      return run_diff(diff, profile_dir, focus: focus)
+    end
+
     unless profile_dir && Dir.exist?(profile_dir)
-      $stderr.puts "\e[31merror:\e[0m Usage: clear doctor <profile-dir>"
+      $stderr.puts "\e[31merror:\e[0m Usage: clear doctor <profile-dir> [--cumulative] [--focus=REGEX] [--ignore=REGEX] [--peek=REGEX] [--by=bytes|allocs|inuse_bytes|inuse_allocs] [--diff <before-dir>]"
       exit 1
     end
 
+    @opts = { cumulative: cumulative, focus: focus, ignore: ignore, peek: peek, by: by }
+
+    if peek
+      return run_peek(profile_dir, peek)
+    end
+
     perf_data = File.join(profile_dir, 'perf.data')
     binary = profile_dir.chomp('/').sub(/\.profile$/, '')
     binary = nil unless File.exist?(binary.to_s)
@@ -48,6 +59,56 @@ def run(profile_dir)
     section_freeze(profile_dir, sites, resolved, llc_miss_rate)
   end
 
+  # Returns true if a sample's trace (function names, leaf-first)
+  # passes the focus/ignore filters. focus keeps only matches; ignore
+  # drops matches; both can compose. With no filters set, every
+  # sample passes.
+  def focus_match?(funcs)
+    return false if @opts && @opts[:ignore] && funcs.any? { |f| f =~ @opts[:ignore] }
+    return true unless @opts && @opts[:focus]
+    funcs.any? { |f| f =~ @opts[:focus] }
+  end
+
+  def cumulative?
+    !!(@opts && @opts[:cumulative])
+  end
+
+  # Which Site field to sort/display by. Maps the user's --by flag
+  # (`bytes` / `allocs` / `inuse_bytes` / `inuse_allocs`) to the
+  # corresponding key in the parsed Site hash. Default `:bytes`.
+  def sort_key
+    return :bytes unless @opts && @opts[:by]
+    case @opts[:by]
+    when :allocs        then :allocs
+    when :inuse_bytes   then :inuse_bytes
+    when :inuse_allocs  then :inuse_allocs
+    else :bytes
+    end
+  end
+
+  # Human-readable label for the sort axis (used in section headers).
+  def sort_label
+    case sort_key
+    when :allocs        then "allocations"
+    when :inuse_bytes   then "in-use bytes (alloc - free)"
+    when :inuse_allocs  then "in-use allocations (alloc - free)"
+    else "bytes"
+    end
+  end
+
+  # Format the sort metric as a string for the per-row display.
+  # Bytes get KB/MB pretty-printing; counts get thousand-separators.
+  def fmt_sort_value(s)
+    v = s[sort_key]
+    case sort_key
+    when :allocs, :inuse_allocs
+      "%s allocs" % v.to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,')
+    else
+      kb = v / 1024.0
+      kb >= 1.0 ? ("%.1f KB" % kb) : ("%d B" % v)
+    end
+  end
+
   # ── Heap Profile ──
   def section_heap(profile_dir, binary)
     alloc_file = File.join(profile_dir, 'alloc.txt')
@@ -60,18 +121,38 @@ def section_heap(profile_dir, binary)
     total_line = File.readlines(alloc_file).find { |l| l.include?('total_allocs') }
     total = total_line ? total_line.scan(/\d+/).last.to_i : 0
 
+    # alloc-profile v2: the first column may be a comma-separated
+    # leaf-first stack trace. We treat the leaf address as the
+    # primary site (matching v1 behaviour); the deeper frames are
+    # available in `:trace` for richer views below if a future
+    # caller wants them. v1 files (single addr, no comma) still parse
+    # because `split(',')` on a no-comma string returns a 1-element
+    # array.
     sites = lines.map do |line|
       parts = line.strip.split
       next nil if parts.size < 6
-      addr = parts[0]
-      { addr: addr, allocs: parts[1].to_i, bytes: parts[2].to_i,
-        frees: parts[3].to_i, free_bytes: parts[4].to_i, live: parts[5].to_i }
-    end.compact.sort_by { |s| -s[:bytes] }
+      trace = parts[0].split(',')
+      allocs = parts[1].to_i
+      frees  = parts[3].to_i
+      bytes  = parts[2].to_i
+      free_bytes = parts[4].to_i
+      { addr: trace.first, trace: trace,
+        allocs: allocs, bytes: bytes,
+        frees: frees, free_bytes: free_bytes,
+        live: parts[5].to_i,
+        # Inuse columns: live counts derived from alloc - free.
+        # Doctor uses these when --by=inuse_bytes / --by=inuse_allocs.
+        inuse_allocs: [allocs - frees, 0].max,
+        inuse_bytes:  [bytes  - free_bytes, 0].max }
+    end.compact.sort_by { |s| -s[sort_key] }
 
     resolved = nil
     if binary
       resolved = {}
-      addrs = sites.map { |s| s[:addr] }
+      # Resolve every address that appears in any trace, not just
+      # the leaf, so callers can render full traces. uniq keeps the
+      # addr2line invocation small on hot leaves.
+      addrs = sites.flat_map { |s| s[:trace] }.uniq
       raw = IO.popen(['addr2line', '-e', binary, '-f'] + addrs, err: '/dev/null', &:read)
       lines_out = raw.split("\n")
       zig_source_path = File.join(profile_dir, 'transpiled.zig')
@@ -80,7 +161,17 @@ def section_heap(profile_dir, binary)
         func = lines_out[i * 2]&.strip || "?"
         file = lines_out[i * 2 + 1]&.strip || "?"
         clear_line = nil
-        if file =~ /:(\d+)/ && File.exist?(zig_source_path)
+        # Only walk transpiled.zig for CLR markers if the address
+        # actually came from the user's transpiled CLEAR file. Runtime
+        # and stdlib zig files have no CLR markers; treating them as
+        # user code led to runtime functions appearing at random
+        # source.cht lines. addr2line emits the build-time filename
+        # `._clear_tmp_<name>.zig` (the compile target), which is a
+        # content copy of profile_dir/transpiled.zig.
+        is_user_zig = File.exist?(zig_source_path) &&
+                      File.basename(file.sub(/:\d+\b.*\z/, ''))
+                          .match?(/\A\._clear_tmp_.*\.zig\z/)
+        if is_user_zig && file =~ /:(\d+)\b/
           zig_line = $1.to_i
           zig_lines_cache ||= File.readlines(zig_source_path)
           (zig_line - 1).downto(0) do |li|
@@ -92,7 +183,38 @@ def section_heap(profile_dir, binary)
           end
         end
         func = func.sub(/.*\./, '').sub(/__anon_\d+/, '')
-        resolved[addr] = { func: func, file: file, clear_line: clear_line }
+        resolved[addr] = { func: func, file: file, clear_line: clear_line, is_user_zig: is_user_zig }
+      end
+    end
+
+    # Cumulative bytes per leaf function: for every sample, every
+    # frame in its trace (resolved to a function name) gets credit
+    # for the sample's bytes/allocs. A leaf alone gets only flat
+    # bytes; a function high in the stack accumulates its callees'
+    # bytes too. This answers "who is on the call path of the most
+    # bytes" not "who directly allocated the most bytes."
+    cum_bytes = Hash.new(0)
+    cum_allocs = Hash.new(0)
+    sites.each do |s|
+      seen = {} # one credit per (function, sample) so recursion doesn't double-count
+      s[:trace].each do |a|
+        f = resolved&.dig(a)&.dig(:func) || a
+        next if seen[f]
+        seen[f] = true
+        cum_bytes[f] += s[:bytes]
+        cum_allocs[f] += s[:allocs]
+      end
+    end
+
+    # --focus / --ignore filter which sites contribute to the top-10
+    # list. focus keeps a site if any function in its trace matches
+    # the regex; ignore drops one if any function matches. Both can
+    # compose; ignore wins on overlap (a site that would pass focus
+    # is dropped if it also matches ignore).
+    if @opts && (@opts[:focus] || @opts[:ignore])
+      sites = sites.select do |s|
+        funcs = s[:trace].map { |a| resolved&.dig(a)&.dig(:func) || a }
+        focus_match?(funcs)
       end
     end
 
@@ -102,30 +224,56 @@ def section_heap(profile_dir, binary)
       puts "  *** #{warn}"
       puts ""
     end
-    puts "Top sites by bytes:"
-    sites.first(10).each_with_index do |s, i|
-      r = resolved&.dig(s[:addr])
-      func = r ? r[:func] : s[:addr]
-      loc = if r&.dig(:clear_line) then "(line #{r[:clear_line]})"
-            elsif r then "(#{r[:file].split('/').last})"
-            else "" end
-      avg = s[:allocs] > 0 ? s[:bytes] / s[:allocs] : 0
-      kb = s[:bytes] / 1024.0
-      leak = if s[:frees] == 0 && s[:allocs] > 0 && avg >= 16
-               "  (heap rc)"
-             elsif s[:frees] == 0 && s[:allocs] > 0
-               "  (arena)"
-             elsif s[:frees] > 0 && s[:frees] < s[:allocs]
-               "  ** LEAK ** (#{s[:allocs] - s[:frees]} unfreed)"
-             else
-               ""
-             end
-      if kb >= 1.0
-        puts "  #{i + 1}. %-25s %6.1f KB  (%s allocs, %d bytes avg)%s" % [
-          "#{func} #{loc}", kb, s[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,'), avg, leak]
-      else
-        puts "  #{i + 1}. %-25s %6d B   (%s allocs, %d bytes avg)%s" % [
-          "#{func} #{loc}", s[:bytes], s[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,'), avg, leak]
+    if @opts && @opts[:focus]
+      puts "Focus: /#{@opts[:focus].source}/"
+    end
+    if @opts && @opts[:ignore]
+      puts "Ignore: /#{@opts[:ignore].source}/"
+    end
+    puts "" if @opts && (@opts[:focus] || @opts[:ignore])
+    label = cumulative? ? "Top functions by cumulative #{sort_label}:" : "Top sites by #{sort_label}:"
+    puts label
+
+    if cumulative?
+      ranked = cum_bytes.sort_by { |_f, b| -b }.first(10)
+      ranked.each_with_index do |(func, bytes), i|
+        kb = bytes / 1024.0
+        ac = cum_allocs[func]
+        if kb >= 1.0
+          puts "  #{i + 1}. %-25s %6.1f KB cum  (on %s alloc paths)" %
+               [func, kb, ac.to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,')]
+        else
+          puts "  #{i + 1}. %-25s %6d B  cum  (on %s alloc paths)" %
+               [func, bytes, ac.to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,')]
+        end
+      end
+      puts ""
+      puts "  (\"cum\" = sum of bytes whose call path passes through this function)"
+    else
+      sites.first(10).each_with_index do |s, i|
+        r = resolved&.dig(s[:addr])
+        func = r ? r[:func] : s[:addr]
+        loc = if r&.dig(:clear_line) then "(line #{r[:clear_line]})"
+              elsif r then "(#{r[:file].split('/').last})"
+              else "" end
+        avg = s[:allocs] > 0 ? s[:bytes] / s[:allocs] : 0
+        kb = s[:bytes] / 1024.0
+        leak = if s[:frees] == 0 && s[:allocs] > 0 && avg >= 16
+                 "  (heap rc)"
+               elsif s[:frees] == 0 && s[:allocs] > 0
+                 "  (arena)"
+               elsif s[:frees] > 0 && s[:frees] < s[:allocs]
+                 "  ** LEAK ** (#{s[:allocs] - s[:frees]} unfreed)"
+               else
+                 ""
+               end
+        if kb >= 1.0
+          puts "  #{i + 1}. %-25s %6.1f KB  (%s allocs, %d bytes avg)%s" % [
+            "#{func} #{loc}", kb, s[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,'), avg, leak]
+        else
+          puts "  #{i + 1}. %-25s %6d B   (%s allocs, %d bytes avg)%s" % [
+            "#{func} #{loc}", s[:bytes], s[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,'), avg, leak]
+        end
       end
     end
     arena_count   = sites.first(10).count { |s| s[:frees] == 0 && s[:allocs] > 0 && (s[:allocs] > 0 ? s[:bytes] / s[:allocs] : 0) < 16 }
@@ -495,20 +643,53 @@ def section_locks(profile_dir)
     return unless File.exist?(lock_prof)
 
     rows = File.readlines(lock_prof).reject { |l| l.start_with?('#') || l.strip.empty? }
-    locks = rows.map do |l|
-      f = l.split
-      # Columns: addr acquires contended total_wait max_wait total_hold max_hold
-      #          read_acquires read_contended read_total_wait read_max_wait
+    # Columns: addr acquires contended total_wait max_wait total_hold max_hold
+    #          read_acquires read_contended read_total_wait read_max_wait
+    #          [caller_trace]   <-- v3, optional. `-` for empty; comma-
+    #                                separated leaf-first when present.
+    # With --sync-callstacks on, multiple rows can share the same `addr`
+    # (one per (lock, caller-trace) pair). We aggregate by addr to
+    # preserve the "one row per mutex" view; the trace data is kept on
+    # the row for future caller-aware drilldowns.
+    rows_parsed = rows.map do |l|
+      f = l.split("\t").map(&:strip)
+      # tab-split for v3 (caller_trace can contain `,`). Older v1/v2
+      # files use whitespace-only separators; fall back to whitespace
+      # split when the tab-split produced too few fields.
+      f = l.strip.split if f.size < 11
       next nil if f.size < 11
+      trace_field = f[11]
+      trace = (trace_field.nil? || trace_field == '-') ? [] : trace_field.split(',')
       {
         addr: f[0], acquires: f[1].to_i, contended: f[2].to_i,
         total_wait_ns: f[3].to_i, max_wait_ns: f[4].to_i,
         total_hold_ns: f[5].to_i, max_hold_ns: f[6].to_i,
         read_acquires: f[7].to_i, read_contended: f[8].to_i,
         read_total_wait_ns: f[9].to_i, read_max_wait_ns: f[10].to_i,
+        trace: trace,
       }
-    end.compact.reject { |c| c[:acquires] == 0 && c[:read_acquires] == 0 }
-        .sort_by { |c| -c[:total_hold_ns] }
+    end.compact
+
+    # Aggregate (lock, trace) rows back to a single row per lock so
+    # the existing "Top locks" view keeps its meaning. Counters sum;
+    # max_* fields take the larger across rows.
+    locks = rows_parsed.group_by { |r| r[:addr] }.map do |addr, group|
+      {
+        addr: addr,
+        acquires: group.sum { |r| r[:acquires] },
+        contended: group.sum { |r| r[:contended] },
+        total_wait_ns: group.sum { |r| r[:total_wait_ns] },
+        max_wait_ns: group.map { |r| r[:max_wait_ns] }.max,
+        total_hold_ns: group.sum { |r| r[:total_hold_ns] },
+        max_hold_ns: group.map { |r| r[:max_hold_ns] }.max,
+        read_acquires: group.sum { |r| r[:read_acquires] },
+        read_contended: group.sum { |r| r[:read_contended] },
+        read_total_wait_ns: group.sum { |r| r[:read_total_wait_ns] },
+        read_max_wait_ns: group.map { |r| r[:read_max_wait_ns] }.max,
+        traces: group.map { |r| r[:trace] }.reject(&:empty?),
+      }
+    end.reject { |c| c[:acquires] == 0 && c[:read_acquires] == 0 }
+       .sort_by { |c| -c[:total_hold_ns] }
 
     return if locks.empty?
 
@@ -731,19 +912,41 @@ def section_mvcc(profile_dir)
     return unless File.exist?(mvcc_prof)
 
     rows = File.readlines(mvcc_prof).reject { |l| l.start_with?('#') || l.strip.empty? }
-    cells = rows.map do |l|
-      f = l.split
+    # Columns (mvcc-profile v2): addr struct_size reads commits retries
+    #   update_failures multi_commits [caller_trace]
+    # caller_trace is `-` when --sync-callstacks is off, comma-separated
+    # leaf-first when on. Pre-v2 files don't have the column; we treat
+    # missing as empty.
+    rows_parsed = rows.map do |l|
+      f = l.split("\t").map(&:strip)
+      f = l.strip.split if f.size < 6
       next nil if f.size < 6
+      trace_field = f[7]
+      trace = (trace_field.nil? || trace_field == '-') ? [] : trace_field.split(',')
       {
         addr: f[0], struct_size: f[1].to_i, reads: f[2].to_i,
         commits: f[3].to_i, retries: f[4].to_i,
         update_failures: f[5].to_i,
-        # AtomicPtr M3.16: optional 7th column for multi-cell commits.
-        # Older mvcc.txt files (pre-M3.16) won't have it; treat as 0.
         multi_commits: (f[6] || 0).to_i,
+        trace: trace,
       }
-    end.compact.reject { |c| c[:reads] == 0 && c[:commits] == 0 }
-        .sort_by { |c| -(c[:reads] + c[:commits]) }
+    end.compact
+
+    # Aggregate (cell, trace) rows back to one row per cell so the
+    # existing diagnoses (COW thrash, MVCC misuse) keep their meaning.
+    cells = rows_parsed.group_by { |r| r[:addr] }.map do |addr, group|
+      {
+        addr: addr,
+        struct_size: group.first[:struct_size],
+        reads: group.sum { |r| r[:reads] },
+        commits: group.sum { |r| r[:commits] },
+        retries: group.sum { |r| r[:retries] },
+        update_failures: group.sum { |r| r[:update_failures] },
+        multi_commits: group.sum { |r| r[:multi_commits] },
+        traces: group.map { |r| r[:trace] }.reject(&:empty?),
+      }
+    end.reject { |c| c[:reads] == 0 && c[:commits] == 0 }
+       .sort_by { |c| -(c[:reads] + c[:commits]) }
 
     return if cells.empty?
 
@@ -1098,4 +1301,381 @@ def section_freeze(profile_dir, sites, resolved, llc_miss_rate)
     puts "  CLEAR:       frozen = FREEZE expr"
     puts ""
   end
+
+  # ── --peek: callers and callees of one function ──
+  # For sites whose leaf is `regex`, lists the unique callers (frames
+  # immediately above the leaf in each trace) and aggregates bytes
+  # per caller. For samples where `regex` matches a non-leaf frame,
+  # also lists what's directly below — the callees this function
+  # was on the path to. Mirrors `pprof -peek`'s shape.
+  def run_peek(profile_dir, regex)
+    unless profile_dir && Dir.exist?(profile_dir)
+      $stderr.puts "\e[31merror:\e[0m --peek requires a profile directory"
+      exit 1
+    end
+    binary = profile_dir.chomp('/').sub(/\.profile$/, '')
+    binary = nil unless File.exist?(binary.to_s)
+
+    sites, resolved = section_heap_silent(profile_dir, binary)
+    return if sites.nil? || sites.empty?
+
+    callers  = Hash.new { |h, k| h[k] = { bytes: 0, allocs: 0 } } # caller -> stats
+    callees  = Hash.new { |h, k| h[k] = { bytes: 0, allocs: 0 } } # callee -> stats
+    self_total = { bytes: 0, allocs: 0 }
+
+    sites.each do |s|
+      funcs = s[:trace].map { |a| resolved&.dig(a)&.dig(:func) || a }
+      idx = funcs.index { |f| f =~ regex }
+      next unless idx
+
+      self_total[:bytes]  += s[:bytes]
+      self_total[:allocs] += s[:allocs]
+
+      # Caller: the frame above (deeper in the stack) — leaf-first
+      # ordering means the caller is the next index. Stops at trace
+      # root.
+      if (caller_func = funcs[idx + 1])
+        callers[caller_func][:bytes]  += s[:bytes]
+        callers[caller_func][:allocs] += s[:allocs]
+      else
+        callers['<root>'][:bytes]  += s[:bytes]
+        callers['<root>'][:allocs] += s[:allocs]
+      end
+
+      # Callee: only if the matched function is non-leaf (idx > 0).
+      # The callee is the previous index (closer to leaf).
+      if idx > 0 && (callee_func = funcs[idx - 1])
+        callees[callee_func][:bytes]  += s[:bytes]
+        callees[callee_func][:allocs] += s[:allocs]
+      end
+    end
+
+    if self_total[:bytes].zero?
+      puts "No samples matched /#{regex.source}/."
+      return
+    end
+
+    puts "=== Peek /#{regex.source}/ ==="
+    puts ""
+    puts "  Self bytes:   %s (across %s allocations)" %
+         [bytes_pretty(self_total[:bytes]), self_total[:allocs].to_s.gsub(/(\d)(?=(\d{3})+$)/, '\\1,')]
+    puts ""
+
+    puts "  Callers (frames above /#{regex.source}/):"
+    callers.sort_by { |_f, v| -v[:bytes] }.first(10).each do |func, v|
+      puts "    %-30s  %12s  (%d allocs)" % [func, bytes_pretty(v[:bytes]), v[:allocs]]
+    end
+    puts ""
+
+    if callees.any?
+      puts "  Callees (frames below /#{regex.source}/, when matched non-leaf):"
+      callees.sort_by { |_f, v| -v[:bytes] }.first(10).each do |func, v|
+        puts "    %-30s  %12s  (%d allocs)" % [func, bytes_pretty(v[:bytes]), v[:allocs]]
+      end
+      puts ""
+    end
+  end
+
+  # Same parsing as section_heap but without the printout — used by
+  # run_peek so we can build caller/callee tables from the parsed sites.
+  def section_heap_silent(profile_dir, binary)
+    out = StringIO.new
+    real, $stdout = $stdout, out
+    sites, resolved = section_heap(profile_dir, binary)
+    [sites, resolved]
+  ensure
+    $stdout = real
+  end
+
+  # ── --diff: compare two profile runs ──
+  # Loads both profile dirs, aggregates by leaf function, and reports
+  # the largest deltas in alloc_space, lock contention, and MVCC
+  # commits. Doctor's per-site advice is layered on top: a site that
+  # newly trips "MVCC fit" gets called out, etc. Computes deltas
+  # ourselves rather than shelling to `pprof -base` so doctor stays
+  # self-contained and can attach commentary.
+  def run_diff(before_dir, after_dir, focus: nil)
+    before_dir = before_dir.to_s.chomp('/')
+    after_dir  = after_dir.to_s.chomp('/')
+    unless Dir.exist?(before_dir) && Dir.exist?(after_dir)
+      $stderr.puts "\e[31merror:\e[0m diff requires two existing profile directories"
+      $stderr.puts "  before: #{before_dir} (#{Dir.exist?(before_dir) ? 'ok' : 'MISSING'})"
+      $stderr.puts "  after:  #{after_dir}  (#{Dir.exist?(after_dir)  ? 'ok' : 'MISSING'})"
+      exit 1
+    end
+
+    @opts = { cumulative: false, focus: focus }
+    puts "=== Diff (before → after) ==="
+    puts "  before: #{before_dir}"
+    puts "  after:  #{after_dir}"
+    puts "  focus:  /#{focus.source}/" if focus
+    puts ""
+
+    diff_heap(before_dir, after_dir, focus)
+    diff_locks(before_dir, after_dir, focus)
+    diff_mvcc(before_dir, after_dir, focus)
+  end
+
+  # Parse alloc.txt and aggregate by leaf-function name across all
+  # rows that resolve to that function. Returns
+  # { func_name => { allocs:, bytes:, frees:, free_bytes: } }.
+  # `binary` is the path to use for addr2line resolution; pass the
+  # after-profile's binary so before/after addresses resolve through
+  # the same symbol table (ASLR / rebuild can shift addresses but the
+  # function name should stay stable for the same source).
+  def parse_alloc_for_diff(profile_dir, binary)
+    alloc_file = File.join(profile_dir, 'alloc.txt')
+    return {} unless File.exist?(alloc_file)
+
+    rows = File.readlines(alloc_file).reject { |l| l.start_with?('#') || l.strip.empty? }
+    leaves = []
+    parsed = rows.map do |l|
+      f = l.strip.split
+      next nil if f.size < 6
+      leaf = f[0].split(',').first
+      leaves << leaf
+      [leaf, { allocs: f[1].to_i, bytes: f[2].to_i,
+               frees: f[3].to_i, free_bytes: f[4].to_i }]
+    end.compact
+
+    # Resolve leaf addrs to function names if a binary is available.
+    # Otherwise fall back to keying by raw address — distinct leaves
+    # still diff against each other; the table just shows hex addrs
+    # in place of function names.
+    addr_to_func = {}
+    if binary && File.exist?(binary)
+      raw = IO.popen(['addr2line', '-e', binary, '-f'] + leaves.uniq, err: '/dev/null', &:read)
+      out_lines = raw.split("\n")
+      leaves.uniq.each_with_index do |a, i|
+        func = (out_lines[i * 2] || '?').strip.sub(/.*\./, '').sub(/__anon_\d+/, '')
+        # addr2line returns "??" for addresses not in this binary;
+        # fall back to the raw addr.
+        func = a if func == '??' || func.empty?
+        addr_to_func[a] = func
+      end
+    end
+
+    by_func = Hash.new { |h, k| h[k] = { allocs: 0, bytes: 0, frees: 0, free_bytes: 0 } }
+    parsed.each do |leaf, vals|
+      f = addr_to_func[leaf] || leaf
+      by_func[f][:allocs]      += vals[:allocs]
+      by_func[f][:bytes]       += vals[:bytes]
+      by_func[f][:frees]       += vals[:frees]
+      by_func[f][:free_bytes]  += vals[:free_bytes]
+    end
+    by_func
+  end
+
+  def diff_heap(before_dir, after_dir, focus)
+    # Use the after-dir's binary for both lookups — that's the user's
+    # current build. Falls back to the before-dir's binary if the
+    # after-dir doesn't have one.
+    binary = locate_diff_binary(after_dir, before_dir)
+    before = parse_alloc_for_diff(before_dir, binary)
+    after  = parse_alloc_for_diff(after_dir, binary)
+    return if before.empty? && after.empty?
+
+    funcs = (before.keys + after.keys).uniq
+    funcs.select! { |f| f =~ focus } if focus
+
+    deltas = funcs.map do |f|
+      b = before[f] || { bytes: 0, allocs: 0 }
+      a = after[f]  || { bytes: 0, allocs: 0 }
+      {
+        func: f,
+        before_bytes: b[:bytes], after_bytes: a[:bytes],
+        delta_bytes: a[:bytes] - b[:bytes],
+        before_allocs: b[:allocs], after_allocs: a[:allocs],
+        delta_allocs: a[:allocs] - b[:allocs],
+      }
+    end.reject { |d| d[:delta_bytes].zero? && d[:delta_allocs].zero? }
+       .sort_by { |d| -d[:delta_bytes].abs }
+
+    return if deltas.empty?
+
+    puts "=== Heap Δ (top by |Δ alloc_space|) ==="
+    puts ""
+    printf "  %-30s  %12s  %12s  %12s\n", "function", "before", "after", "Δ"
+    deltas.first(10).each do |d|
+      arrow = d[:delta_bytes].positive? ? "↑" : "↓"
+      puts "  %-30s  %12s  %12s  %s %12s" % [
+        d[:func],
+        bytes_pretty(d[:before_bytes]),
+        bytes_pretty(d[:after_bytes]),
+        arrow,
+        bytes_pretty(d[:delta_bytes].abs),
+      ]
+    end
+    new_funcs = deltas.select { |d| d[:before_bytes].zero? && d[:after_bytes].positive? }
+    if new_funcs.any?
+      puts ""
+      puts "  New allocation sites (cold → hot):"
+      new_funcs.first(5).each { |d| puts "    + #{d[:func]} (#{bytes_pretty(d[:after_bytes])})" }
+    end
+    gone_funcs = deltas.select { |d| d[:after_bytes].zero? && d[:before_bytes].positive? }
+    if gone_funcs.any?
+      puts ""
+      puts "  Eliminated allocation sites:"
+      gone_funcs.first(5).each { |d| puts "    - #{d[:func]} (was #{bytes_pretty(d[:before_bytes])})" }
+    end
+    puts ""
+  end
+
+  def parse_locks_for_diff(profile_dir)
+    lock_prof = File.join(profile_dir, 'locks.txt')
+    return {} unless File.exist?(lock_prof)
+    rows = File.readlines(lock_prof).reject { |l| l.start_with?('#') || l.strip.empty? }
+    by_addr = Hash.new { |h, k| h[k] = { acquires: 0, contended: 0,
+                                          total_wait_ns: 0, total_hold_ns: 0,
+                                          read_acquires: 0, read_contended: 0,
+                                          read_total_wait_ns: 0 } }
+    rows.each do |l|
+      f = l.split("\t").map(&:strip)
+      f = l.strip.split if f.size < 11
+      next if f.size < 11
+      a = by_addr[f[0]]
+      a[:acquires]            += f[1].to_i
+      a[:contended]           += f[2].to_i
+      a[:total_wait_ns]       += f[3].to_i
+      a[:total_hold_ns]       += f[5].to_i
+      a[:read_acquires]       += f[7].to_i
+      a[:read_contended]      += f[8].to_i
+      a[:read_total_wait_ns]  += f[9].to_i
+    end
+    by_addr
+  end
+
+  def diff_locks(before_dir, after_dir, _focus)
+    before = parse_locks_for_diff(before_dir)
+    after  = parse_locks_for_diff(after_dir)
+    return if before.empty? && after.empty?
+
+    addrs = (before.keys + after.keys).uniq
+    deltas = addrs.map do |addr|
+      b = before[addr] || { contended: 0, read_contended: 0, total_wait_ns: 0, read_total_wait_ns: 0 }
+      a = after[addr]  || { contended: 0, read_contended: 0, total_wait_ns: 0, read_total_wait_ns: 0 }
+      bcont = b[:contended] + b[:read_contended]
+      acont = a[:contended] + a[:read_contended]
+      bwait = b[:total_wait_ns] + b[:read_total_wait_ns]
+      await = a[:total_wait_ns] + a[:read_total_wait_ns]
+      {
+        addr: addr,
+        before_cont: bcont, after_cont: acont, delta_cont: acont - bcont,
+        before_wait: bwait, after_wait: await, delta_wait: await - bwait,
+      }
+    end.reject { |d| d[:delta_cont].zero? && d[:delta_wait].zero? }
+       .sort_by { |d| -d[:delta_wait].abs }
+
+    return if deltas.empty?
+
+    puts "=== Lock Δ (top by |Δ wait_ns|) ==="
+    puts ""
+    printf "  %-14s  %12s  %12s  %12s  %s\n", "lock", "Δ contend", "Δ wait_ns", "before/after", "diagnosis"
+    deltas.first(10).each do |d|
+      diag = if d[:before_wait].zero? && d[:after_wait].positive?
+               "newly contended"
+             elsif d[:after_wait].zero? && d[:before_wait].positive?
+               "contention eliminated"
+             elsif d[:delta_wait].positive?
+               "regression"
+             else
+               "improved"
+             end
+      arrow = d[:delta_wait].positive? ? "↑" : "↓"
+      puts "  %-14s  %s %5d  %s %10d  %d/%d  %s" % [
+        d[:addr],
+        d[:delta_cont].positive? ? "↑" : "↓",
+        d[:delta_cont].abs,
+        arrow,
+        d[:delta_wait].abs,
+        d[:before_wait], d[:after_wait], diag,
+      ]
+    end
+    puts ""
+  end
+
+  def parse_mvcc_for_diff(profile_dir)
+    mvcc_prof = File.join(profile_dir, 'mvcc.txt')
+    return {} unless File.exist?(mvcc_prof)
+    rows = File.readlines(mvcc_prof).reject { |l| l.start_with?('#') || l.strip.empty? }
+    by_addr = Hash.new { |h, k| h[k] = { reads: 0, commits: 0, retries: 0 } }
+    rows.each do |l|
+      f = l.split("\t").map(&:strip)
+      f = l.strip.split if f.size < 6
+      next if f.size < 6
+      a = by_addr[f[0]]
+      a[:reads]   += f[2].to_i
+      a[:commits] += f[3].to_i
+      a[:retries] += f[4].to_i
+    end
+    by_addr
+  end
+
+  def diff_mvcc(before_dir, after_dir, _focus)
+    before = parse_mvcc_for_diff(before_dir)
+    after  = parse_mvcc_for_diff(after_dir)
+    return if before.empty? && after.empty?
+
+    addrs = (before.keys + after.keys).uniq
+    deltas = addrs.map do |addr|
+      b = before[addr] || { reads: 0, commits: 0, retries: 0 }
+      a = after[addr]  || { reads: 0, commits: 0, retries: 0 }
+      {
+        addr: addr,
+        delta_commits: a[:commits] - b[:commits],
+        delta_retries: a[:retries] - b[:retries],
+        before_retries: b[:retries], after_retries: a[:retries],
+      }
+    end.reject { |d| d[:delta_commits].zero? && d[:delta_retries].zero? }
+       .sort_by { |d| -d[:delta_retries].abs }
+
+    return if deltas.empty?
+
+    puts "=== MVCC Δ (top by |Δ retries|) ==="
+    puts ""
+    printf "  %-14s  %12s  %12s  %s\n", "cell", "Δ commits", "Δ retries", "diagnosis"
+    deltas.first(10).each do |d|
+      diag = if d[:before_retries].zero? && d[:after_retries].positive?
+               "new retry storm"
+             elsif d[:after_retries].zero? && d[:before_retries].positive?
+               "retries eliminated"
+             elsif d[:delta_retries].positive?
+               "more contention"
+             else
+               "less contention"
+             end
+      arrow_c = d[:delta_commits].positive? ? "↑" : "↓"
+      arrow_r = d[:delta_retries].positive? ? "↑" : "↓"
+      puts "  %-14s  %s %10d  %s %10d  %s" % [
+        d[:addr],
+        arrow_c, d[:delta_commits].abs,
+        arrow_r, d[:delta_retries].abs,
+        diag,
+      ]
+    end
+    puts ""
+  end
+
+  # Find the binary to use for addr2line resolution in --diff mode.
+  # Prefers the after-dir's binary (the user's current build); falls
+  # back to the before-dir's binary if the after-dir is missing one
+  # (e.g. binary was deleted after profiling).
+  def locate_diff_binary(after_dir, before_dir)
+    [after_dir, before_dir].each do |dir|
+      bin = dir.to_s.chomp('/').sub(/\.profile$/, '')
+      return bin if File.exist?(bin)
+    end
+    nil
+  end
+
+  def bytes_pretty(n)
+    return "-" if n.zero?
+    if n.abs >= 1024 * 1024
+      "%.1f MB" % (n / (1024.0 * 1024.0))
+    elsif n.abs >= 1024
+      "%.1f KB" % (n / 1024.0)
+    else
+      "#{n} B"
+    end
+  end
 end
diff --git a/src/tools/pprof.rb b/src/tools/pprof.rb
new file mode 100644
index 000000000..d17d583e4
--- /dev/null
+++ b/src/tools/pprof.rb
@@ -0,0 +1,278 @@
+require 'zlib'
+require 'stringio'
+
+# Pure-Ruby pprof v3 profile encoder.
+#
+# Builds a `Profile` message in the schema defined by
+# https://github.com/google/pprof/blob/main/proto/profile.proto and
+# emits a gzipped protobuf (`.pb.gz`) suitable for the `pprof` tool
+# (`pprof -http=:8080 binary file.pb.gz`).
+#
+# We hand-roll the wire format (~80 lines) instead of pulling the
+# `google-protobuf` C-extension gem so that `clear profile`'s pprof
+# emission stays a stdlib-only feature.
+#
+# Wire-format reference:
+#   tag = (field_number << 3) | wire_type
+#   wire_type 0 = varint
+#   wire_type 2 = length-delimited (length, then bytes)
+module Pprof
+  module Wire
+    module_function
+
+    def varint(n)
+      raise ArgumentError, "negative varint: #{n}" if n.negative?
+      out = String.new(encoding: 'ASCII-8BIT')
+      loop do
+        if n < 0x80
+          out << n.chr
+          return out
+        end
+        out << ((n & 0x7F) | 0x80).chr
+        n >>= 7
+      end
+    end
+
+    def field_varint(field, n)
+      varint((field << 3) | 0) + varint(n)
+    end
+
+    def field_bytes(field, bytes)
+      varint((field << 3) | 2) + varint(bytes.bytesize) + bytes.b
+    end
+
+    def field_string(field, s)
+      field_bytes(field, s.to_s)
+    end
+  end
+
+  # Builder for one Profile message. Strings, functions, and locations
+  # are deduplicated as added; samples are accumulated and serialized
+  # at `encode`.
+  class Profile
+    def initialize
+      @strings    = { "" => 0 }      # string_table[0] is required to be empty
+      @functions  = {}               # key: [name, system_name, filename] -> Function
+      @locations  = []
+      @samples    = []
+      @sample_types = []
+      @mappings   = []               # one entry per binary
+      @period_type  = nil
+      @period       = 0
+      @time_nanos     = (Time.now.to_f * 1e9).to_i
+      @duration_nanos = 0
+      @default_sample_type_idx = nil
+      @next_func_id = 1
+      @next_loc_id  = 1
+      @next_mapping_id = 1
+      @primary_mapping_id = 0
+    end
+
+    attr_writer :time_nanos, :duration_nanos, :period
+
+    def intern(s)
+      @strings[s.to_s] ||= @strings.size
+    end
+
+    # Declare a sample value column. `type` is the metric (e.g.
+    # `"alloc_space"`); `unit` is the unit (`"bytes"`, `"count"`,
+    # `"nanoseconds"`). The column index matches the order of calls.
+    def add_sample_type(type, unit)
+      @sample_types << [intern(type), intern(unit)]
+    end
+
+    # `period` and `period_type` describe the sampling rate (e.g.
+    # 1 sample per N events). Optional; pprof shows it in the header.
+    def set_period_type(type, unit, period)
+      @period_type = [intern(type), intern(unit)]
+      @period = period
+    end
+
+    # Pick which sample-type column the pprof UI selects by default
+    # (`pprof -inuse_space file.pb.gz`). Optional.
+    def default_sample_type=(type)
+      @default_sample_type_idx = intern(type)
+    end
+
+    # Register the binary this profile is about. pprof uses the Mapping
+    # to display the binary name in the header (silences "Main binary
+    # filename not available") and to set has_functions/has_filenames/
+    # has_line_numbers flags so its UI knows symbolization is already
+    # done. memory_start/limit are 0 because we do not require pprof
+    # to load the binary itself — addr2line resolved everything at
+    # convert time. Returns the mapping id; the first call also sets
+    # the primary mapping that all Locations attach to by default.
+    def add_mapping(binary:, build_id: '')
+      mapping = {
+        id: @next_mapping_id,
+        memory_start: 0,
+        memory_limit: 0,
+        file_offset: 0,
+        filename_idx: intern(binary),
+        build_id_idx: intern(build_id),
+        has_functions: true,
+        has_filenames: true,
+        has_line_numbers: true,
+      }
+      @mappings << mapping
+      @next_mapping_id += 1
+      @primary_mapping_id = mapping[:id] if @primary_mapping_id.zero?
+      mapping[:id]
+    end
+
+    # Returns the function id (1-based). Identical (name, system_name,
+    # filename) tuples are deduplicated.
+    def add_function(name:, filename: "", system_name: nil, start_line: 0)
+      sys = system_name || name
+      key = [name, sys, filename]
+      f = @functions[key]
+      return f[:id] if f
+      f = {
+        id: @next_func_id,
+        name_idx: intern(name),
+        system_name_idx: intern(sys),
+        filename_idx: intern(filename),
+        start_line: start_line,
+      }
+      @functions[key] = f
+      @next_func_id += 1
+      f[:id]
+    end
+
+    # Returns the location id (1-based). One Line entry per call.
+    # Defaults to the primary Mapping (the binary this profile is for)
+    # so symbol metadata propagates without per-Location plumbing.
+    def add_location(function_id:, line: 0, address: 0)
+      id = @next_loc_id
+      @next_loc_id += 1
+      @locations << {
+        id: id,
+        address: address,
+        function_id: function_id,
+        line_no: line,
+        mapping_id: @primary_mapping_id,
+      }
+      id
+    end
+
+    # `location_ids` is a stack trace, leaf-first. `values` matches the
+    # order/length of `add_sample_type` calls. `labels` is a hash of
+    # string key -> string-or-int (str labels are interned; int labels
+    # render as numeric).
+    def add_sample(location_ids, values, labels = {})
+      @samples << { location_ids: location_ids, values: values, labels: labels }
+    end
+
+    def encode
+      buf = String.new(encoding: 'ASCII-8BIT')
+
+      # field 1: sample_type (repeated ValueType)
+      @sample_types.each do |type_idx, unit_idx|
+        sub = Wire.field_varint(1, type_idx) + Wire.field_varint(2, unit_idx)
+        buf << Wire.field_bytes(1, sub)
+      end
+
+      # field 2: sample (repeated Sample)
+      @samples.each { |s| buf << Wire.field_bytes(2, encode_sample(s)) }
+
+      # field 3: mapping (repeated Mapping)
+      @mappings.each { |m| buf << Wire.field_bytes(3, encode_mapping(m)) }
+
+      # field 4: location (repeated Location)
+      @locations.each { |loc| buf << Wire.field_bytes(4, encode_location(loc)) }
+
+      # field 5: function (repeated Function)
+      @functions.each_value { |f| buf << Wire.field_bytes(5, encode_function(f)) }
+
+      # field 6: string_table (repeated string)
+      @strings.keys.each { |s| buf << Wire.field_string(6, s) }
+
+      # field 9, 10: time_nanos, duration_nanos
+      buf << Wire.field_varint(9, @time_nanos) if @time_nanos.positive?
+      buf << Wire.field_varint(10, @duration_nanos) if @duration_nanos.positive?
+
+      # field 11: period_type (ValueType), 12: period
+      if @period_type
+        sub = Wire.field_varint(1, @period_type[0]) + Wire.field_varint(2, @period_type[1])
+        buf << Wire.field_bytes(11, sub)
+      end
+      buf << Wire.field_varint(12, @period) if @period.positive?
+
+      # field 14: default_sample_type
+      buf << Wire.field_varint(14, @default_sample_type_idx) if @default_sample_type_idx
+
+      buf
+    end
+
+    def encode_gzip
+      io = StringIO.new(String.new(encoding: 'ASCII-8BIT'))
+      io.set_encoding('ASCII-8BIT')
+      Zlib::GzipWriter.wrap(io) { |gz| gz.write(encode) }
+      io.string
+    end
+
+    def write_gzip(path)
+      File.binwrite(path, encode_gzip)
+    end
+
+    private
+
+    def encode_sample(s)
+      buf = String.new(encoding: 'ASCII-8BIT')
+      # field 1: location_id (repeated uint64) — packed for compactness
+      if s[:location_ids].any?
+        packed = s[:location_ids].map { |id| Wire.varint(id) }.join
+        buf << Wire.field_bytes(1, packed)
+      end
+      # field 2: value (repeated int64) — packed
+      if s[:values].any?
+        packed = s[:values].map { |v| Wire.varint(v.to_i.negative? ? v.to_i + (1 << 64) : v.to_i) }.join
+        buf << Wire.field_bytes(2, packed)
+      end
+      # field 3: label (repeated Label)
+      s[:labels].each do |k, v|
+        lab = Wire.field_varint(1, intern(k.to_s))
+        case v
+        when Integer
+          lab += Wire.field_varint(3, v)
+        else
+          lab += Wire.field_varint(2, intern(v.to_s))
+        end
+        buf << Wire.field_bytes(3, lab)
+      end
+      buf
+    end
+
+    def encode_location(loc)
+      sub = Wire.field_varint(1, loc[:id])
+      sub += Wire.field_varint(2, loc[:mapping_id]) if loc[:mapping_id].to_i.positive?
+      sub += Wire.field_varint(3, loc[:address]) if loc[:address].to_i.positive?
+      line_buf = Wire.field_varint(1, loc[:function_id]) +
+                 Wire.field_varint(2, loc[:line_no].to_i)
+      sub += Wire.field_bytes(4, line_buf)
+      sub
+    end
+
+    def encode_mapping(m)
+      buf = Wire.field_varint(1, m[:id])
+      buf += Wire.field_varint(2, m[:memory_start]) if m[:memory_start].positive?
+      buf += Wire.field_varint(3, m[:memory_limit]) if m[:memory_limit].positive?
+      buf += Wire.field_varint(4, m[:file_offset]) if m[:file_offset].positive?
+      buf += Wire.field_varint(5, m[:filename_idx])
+      buf += Wire.field_varint(6, m[:build_id_idx]) if m[:build_id_idx].positive?
+      buf += Wire.field_varint(7, 1) if m[:has_functions]
+      buf += Wire.field_varint(8, 1) if m[:has_filenames]
+      buf += Wire.field_varint(9, 1) if m[:has_line_numbers]
+      buf += Wire.field_varint(10, 1) if m[:has_inline_frames]
+      buf
+    end
+
+    def encode_function(f)
+      Wire.field_varint(1, f[:id]) +
+        Wire.field_varint(2, f[:name_idx]) +
+        Wire.field_varint(3, f[:system_name_idx]) +
+        Wire.field_varint(4, f[:filename_idx]) +
+        Wire.field_varint(5, f[:start_line])
+    end
+  end
+end
diff --git a/src/tools/pprof_converter.rb b/src/tools/pprof_converter.rb
new file mode 100644
index 000000000..5e0fa372d
--- /dev/null
+++ b/src/tools/pprof_converter.rb
@@ -0,0 +1,418 @@
+require_relative 'pprof'
+
+# Converts the text-format profile data emitted by `clear profile`
+# into pprof's gzipped protobuf format. Each runtime profile becomes
+# one `.pb.gz` file in the same `.profile/` dir, viewable via:
+#
+#   pprof -http=:8080 ./<binary> <profile_dir>/heap.pb.gz
+#   pprof -top -alloc_space <profile_dir>/heap.pb.gz
+#   pprof -base before/heap.pb.gz after/heap.pb.gz
+#
+# The converters share an `addr2line` resolver so call-site addresses
+# show up in pprof as proper function names + CLEAR source lines (via
+# the `// CLR:N` markers the transpiler emits into transpiled.zig).
+module PprofConverter
+  module_function
+
+  # Run all available converters for a `.profile/` directory. Returns
+  # a hash of {name => path} for files actually written. Missing input
+  # files are silently skipped.
+  def convert_all(profile_dir)
+    return {} unless profile_dir && Dir.exist?(profile_dir)
+
+    binary = profile_dir.chomp('/').sub(/\.profile$/, '')
+    binary = nil unless File.exist?(binary.to_s)
+
+    out = {}
+    if (path = convert_alloc(profile_dir, binary))
+      out[:heap] = path
+    end
+    if (path = convert_locks(profile_dir, binary))
+      out[:lock] = path
+    end
+    if (path = convert_mvcc(profile_dir, binary))
+      out[:mvcc] = path
+    end
+    if (path = convert_channels(profile_dir, binary))
+      out[:channels] = path
+    end
+    if (path = convert_perf(profile_dir, binary))
+      out[:cpu] = path
+    end
+    out
+  end
+
+  # ── channels.txt → channels.pb.gz ─────────────────────────────────
+  # Channel telemetry has no call site — channels are identified by a
+  # registry id, not an address. We emit one Sample per registered
+  # channel, with synthetic Function names like `channel#7` so pprof's
+  # views show the per-channel breakdown with stable identifiers.
+  # Sample columns: pushes / pops / push_blocked / pop_blocked /
+  # max_depth (capacity travels as a label).
+  def convert_channels(profile_dir, binary)
+    src = File.join(profile_dir, 'channels.txt')
+    return nil unless File.exist?(src)
+
+    rows = parse_tabbed_columns(src, 7).map do |f|
+      f = f.first(7)  # ignore any future trailing columns
+      {
+        id: f[0].to_i,
+        pushes: f[1].to_i, pops: f[2].to_i,
+        push_blocked: f[3].to_i, pop_blocked: f[4].to_i,
+        max_depth: f[5].to_i, capacity: f[6].to_i,
+      }
+    end.reject { |c| c[:pushes].zero? && c[:pops].zero? }
+    return nil if rows.empty?
+
+    pb = Pprof::Profile.new
+    pb.add_mapping(binary: binary.to_s) if binary
+    pb.add_sample_type('pushes',       'count')
+    pb.add_sample_type('pops',         'count')
+    pb.add_sample_type('push_blocked', 'count')
+    pb.add_sample_type('pop_blocked',  'count')
+    pb.add_sample_type('max_depth',    'count')
+    pb.set_period_type('operations', 'count', 1)
+    pb.default_sample_type = 'pushes'
+
+    rows.each do |r|
+      name = "channel##{r[:id]}"
+      fid = pb.add_function(name: name, filename: '', system_name: name)
+      lid = pb.add_location(function_id: fid, line: 0, address: r[:id])
+      pb.add_sample(
+        [lid],
+        [r[:pushes], r[:pops], r[:push_blocked], r[:pop_blocked], r[:max_depth]],
+        capacity: r[:capacity], id: r[:id]
+      )
+    end
+
+    out = File.join(profile_dir, 'channels.pb.gz')
+    pb.write_gzip(out)
+    out
+  end
+
+  # ── alloc.txt → heap.pb.gz ────────────────────────────────────────
+  # Emits four sample-type columns matching Go's heap profile so the
+  # standard pprof flags (-alloc_space / -alloc_objects /
+  # -inuse_space / -inuse_objects) all work out of the box.
+  #
+  #   alloc_objects (count)  = total allocations at this site
+  #   alloc_space   (bytes)  = total bytes allocated at this site
+  #   inuse_objects (count)  = currently-live objects (allocs - frees)
+  #   inuse_space   (bytes)  = currently-live bytes  (bytes - free_bytes)
+  def convert_alloc(profile_dir, binary)
+    src = File.join(profile_dir, 'alloc.txt')
+    return nil unless File.exist?(src)
+
+    # Each site's first column is now a comma-separated leaf-first
+    # stack trace (alloc-profile v2). v1 single-addr files still parse
+    # because `split(',')` on a no-comma string returns a one-element
+    # array.
+    sites = parse_columns(src, 6).map do |f|
+      addrs = f[0].split(',')
+      {
+        addrs: addrs,
+        allocs: f[1].to_i,
+        bytes: f[2].to_i,
+        frees: f[3].to_i,
+        free_bytes: f[4].to_i,
+        live: f[5].to_i,
+      }
+    end
+    return nil if sites.empty?
+
+    all_addrs = sites.flat_map { |s| s[:addrs] }.uniq
+    resolved = resolve_addrs(all_addrs, binary, profile_dir)
+
+    pb = Pprof::Profile.new
+    pb.add_mapping(binary: binary.to_s) if binary
+    pb.add_sample_type('alloc_objects', 'count')
+    pb.add_sample_type('alloc_space',   'bytes')
+    pb.add_sample_type('inuse_objects', 'count')
+    pb.add_sample_type('inuse_space',   'bytes')
+    pb.set_period_type('space', 'bytes', 1)
+    pb.default_sample_type = 'alloc_space'
+
+    # One Location per unique address, reused across samples whose
+    # traces share that frame. Saves a lot of bytes on hot leaves
+    # like `entryWrapper` that appear in every trace.
+    location_ids = build_location_index(pb, all_addrs, resolved, profile_dir)
+
+    sites.each do |s|
+      stack = s[:addrs].map { |a| location_ids[a] }
+      pb.add_sample(
+        stack,
+        [
+          s[:allocs],
+          s[:bytes],
+          [s[:allocs] - s[:frees], 0].max,
+          [s[:bytes] - s[:free_bytes], 0].max,
+        ],
+        addr: s[:addrs].first
+      )
+    end
+
+    out = File.join(profile_dir, 'heap.pb.gz')
+    pb.write_gzip(out)
+    out
+  end
+
+  # Pre-build one Location per unique address so multi-frame samples
+  # can share frames. Returns { hex_addr => location_id }.
+  def build_location_index(pb, addrs, resolved, profile_dir)
+    addrs.each_with_object({}) do |addr, idx|
+      r = resolved[addr] || {}
+      func = r[:func] || addr
+      file = r[:file] || ''
+      # User-zig frames get clear_line into source.cht; runtime/stdlib
+      # frames get the Zig line itself, with their own filename so
+      # pprof -list resolves to the right file.
+      line = r[:is_user_zig] ? (r[:clear_line] || 0) : extract_zig_line(file)
+      filename = clear_source_path(profile_dir, file, is_user_zig: r[:is_user_zig])
+      fid = pb.add_function(name: func, filename: filename, system_name: r[:func] || func)
+      idx[addr] = pb.add_location(function_id: fid, line: line, address: parse_addr(addr))
+    end
+  end
+
+  # addr2line emits `path:line` for each address. Pull the trailing
+  # line number; returns 0 if absent.
+  def extract_zig_line(addr2line_file)
+    return 0 unless addr2line_file
+    addr2line_file =~ /:(\d+)\b/ ? Regexp.last_match(1).to_i : 0
+  end
+
+  # ── locks.txt → lock.pb.gz ────────────────────────────────────────
+  # Mirrors Go's mutex profile shape (contention count + delay ns).
+  # We add hold-time columns since the runtime tracks both, and the
+  # split read/write columns so the pprof user can drill into either.
+  def convert_locks(profile_dir, binary)
+    src = File.join(profile_dir, 'locks.txt')
+    return nil unless File.exist?(src)
+
+    # lock-profile v3 adds an optional 12th column for the caller
+    # trace ('-' = empty; comma-separated leaf-first when present).
+    # We tab-split because the trace field can contain commas; older
+    # whitespace-only files still parse via the fallback split.
+    locks = parse_tabbed_columns(src, 11).map do |f|
+      trace_field = f[11]
+      caller_trace = (trace_field.nil? || trace_field == '-') ? [] : trace_field.split(',')
+      {
+        addr: f[0], acquires: f[1].to_i, contended: f[2].to_i,
+        total_wait_ns: f[3].to_i, max_wait_ns: f[4].to_i,
+        total_hold_ns: f[5].to_i, max_hold_ns: f[6].to_i,
+        read_acquires: f[7].to_i, read_contended: f[8].to_i,
+        read_total_wait_ns: f[9].to_i, read_max_wait_ns: f[10].to_i,
+        caller_trace: caller_trace,
+      }
+    end.reject { |l| l[:acquires].zero? && l[:read_acquires].zero? }
+    return nil if locks.empty?
+
+    # Each row already carries its own (lock, caller-trace) identity;
+    # we keep them as-is so pprof renders one Sample per row. The
+    # leaf of every trace is the lock pointer, then the caller frames.
+    all_addrs = locks.flat_map { |l| [l[:addr]] + l[:caller_trace] }.uniq
+    resolved = resolve_addrs(all_addrs, binary, profile_dir)
+
+    pb = Pprof::Profile.new
+    pb.add_mapping(binary: binary.to_s) if binary
+    pb.add_sample_type('contentions',  'count')
+    pb.add_sample_type('delay',        'nanoseconds')
+    pb.add_sample_type('hold',         'nanoseconds')
+    pb.add_sample_type('acquisitions', 'count')
+    pb.set_period_type('contentions', 'count', 1)
+    pb.default_sample_type = 'delay'
+
+    location_ids = build_location_index(pb, all_addrs, resolved, profile_dir)
+
+    locks.each do |l|
+      stack = [location_ids[l[:addr]]] +
+              l[:caller_trace].map { |a| location_ids[a] }.compact
+      pb.add_sample(
+        stack,
+        [
+          l[:contended] + l[:read_contended],
+          l[:total_wait_ns] + l[:read_total_wait_ns],
+          l[:total_hold_ns],
+          l[:acquires] + l[:read_acquires],
+        ],
+        kind: l[:read_acquires] > l[:acquires] ? 'read' : 'write',
+        addr: l[:addr]
+      )
+    end
+
+    out = File.join(profile_dir, 'lock.pb.gz')
+    pb.write_gzip(out)
+    out
+  end
+
+  # ── mvcc.txt → mvcc.pb.gz ─────────────────────────────────────────
+  # MVCC cells track read/commit/retry counts and per-cell struct size.
+  # Reported columns let the pprof user spot COW-thrash (high commits
+  # x large struct), retry storms, and read-heavy cells worth keeping.
+  def convert_mvcc(profile_dir, binary)
+    src = File.join(profile_dir, 'mvcc.txt')
+    return nil unless File.exist?(src)
+
+    # mvcc-profile v2 adds an optional 8th column for the caller
+    # trace; same shape as lock-profile v3.
+    cells = parse_tabbed_columns(src, 6).map do |f|
+      trace_field = f[7]
+      caller_trace = (trace_field.nil? || trace_field == '-') ? [] : trace_field.split(',')
+      {
+        addr: f[0], struct_size: f[1].to_i, reads: f[2].to_i,
+        commits: f[3].to_i, retries: f[4].to_i,
+        update_failures: f[5].to_i,
+        multi_commits: (f[6] || 0).to_i,
+        caller_trace: caller_trace,
+      }
+    end.reject { |c| c[:reads].zero? && c[:commits].zero? }
+    return nil if cells.empty?
+
+    all_addrs = cells.flat_map { |c| [c[:addr]] + c[:caller_trace] }.uniq
+    resolved = resolve_addrs(all_addrs, binary, profile_dir)
+
+    pb = Pprof::Profile.new
+    pb.add_mapping(binary: binary.to_s) if binary
+    pb.add_sample_type('reads',     'count')
+    pb.add_sample_type('commits',   'count')
+    pb.add_sample_type('retries',   'count')
+    pb.add_sample_type('cow_bytes', 'bytes')
+    pb.set_period_type('operations', 'count', 1)
+    pb.default_sample_type = 'commits'
+
+    location_ids = build_location_index(pb, all_addrs, resolved, profile_dir)
+
+    cells.each do |c|
+      stack = [location_ids[c[:addr]]] +
+              c[:caller_trace].map { |a| location_ids[a] }.compact
+      cow_bytes = c[:struct_size] * (c[:commits] + c[:retries])
+      pb.add_sample(
+        stack,
+        [c[:reads], c[:commits], c[:retries], cow_bytes],
+        struct_size: c[:struct_size],
+        addr: c[:addr]
+      )
+    end
+
+    out = File.join(profile_dir, 'mvcc.pb.gz')
+    pb.write_gzip(out)
+    out
+  end
+
+  # ── perf.data → cpu.pb.gz ─────────────────────────────────────────
+  # Defers to Google's `perf_to_profile` (the canonical converter,
+  # `go install github.com/google/perf_data_converter/src/cmd/perf_to_profile`).
+  # If the tool is not on PATH we leave perf.data in place and return
+  # nil; the caller surfaces a one-line install hint.
+  def convert_perf(profile_dir, _binary)
+    src = File.join(profile_dir, 'perf.data')
+    return nil unless File.exist?(src)
+    return nil unless system('which perf_to_profile > /dev/null 2>&1')
+
+    out = File.join(profile_dir, 'cpu.pb.gz')
+    ok = system('perf_to_profile', '-i', src, '-o', out, %i[out err] => '/dev/null')
+    return nil unless ok && File.exist?(out)
+    out
+  end
+
+  # ── helpers ────────────────────────────────────────────────────────
+
+  # Read whitespace-separated columns from a profile text file,
+  # skipping `#` comment lines and blank lines. Filters rows that are
+  # shorter than `min_cols` (header lines, partial dumps).
+  def parse_columns(path, min_cols)
+    File.readlines(path).each_with_object([]) do |line, acc|
+      next if line.start_with?('#') || line.strip.empty?
+      f = line.strip.split
+      acc << f if f.size >= min_cols
+    end
+  end
+
+  # lock-profile and mvcc-profile use TAB separators so the optional
+  # caller_trace field can carry commas without a column-count
+  # ambiguity. Older single-tab-or-whitespace files still parse via
+  # the fallback. Filters rows shorter than `min_cols`.
+  def parse_tabbed_columns(path, min_cols)
+    File.readlines(path).each_with_object([]) do |line, acc|
+      next if line.start_with?('#') || line.strip.empty?
+      f = line.split("\t").map(&:strip)
+      f = line.strip.split if f.size < min_cols
+      acc << f if f.size >= min_cols
+    end
+  end
+
+  # Resolve a batch of hex addresses to {func, file, clear_line}.
+  # Mirrors `Doctor.section_heap`'s resolver: addr2line gives us the
+  # Zig source line; we walk back to the nearest `// CLR:N` marker
+  # the transpiler emits to recover the user-facing CLEAR source line.
+  #
+  # Only assign `clear_line` when the resolved Zig file is the user's
+  # transpiled.zig — otherwise the address was in runtime/stdlib Zig
+  # code where CLR markers don't exist, and we'd misattribute by
+  # walking back through transpiled.zig anyway. (Repro: `pprof -list
+  # entryWrapper` showed it pointing at random source.cht lines.)
+  def resolve_addrs(addrs, binary, profile_dir)
+    return {} if addrs.empty? || binary.nil?
+    raw = IO.popen(['addr2line', '-e', binary, '-f'] + addrs, err: '/dev/null', &:read)
+    lines_out = raw.split("\n")
+    zig_path = File.join(profile_dir, 'transpiled.zig')
+    zig_lines = File.exist?(zig_path) ? File.readlines(zig_path) : nil
+
+    out = {}
+    addrs.each_with_index do |addr, i|
+      func = lines_out[i * 2]&.strip || '?'
+      file = lines_out[i * 2 + 1]&.strip || '?'
+      func = func.sub(/.*\./, '').sub(/__anon_\d+/, '')
+      clear_line = nil
+      is_user_zig = zig_lines && file_is_transpiled_zig?(file, zig_path)
+      if is_user_zig && file =~ /:(\d+)\b/
+        zig_line = $1.to_i
+        (zig_line - 1).downto(0) do |li|
+          break if li.negative? || li >= zig_lines.size
+          if zig_lines[li] =~ %r{// CLR:(\d+)}
+            clear_line = $1.to_i
+            break
+          end
+        end
+      end
+      out[addr] = { func: func, file: file, clear_line: clear_line, is_user_zig: !!is_user_zig }
+    end
+    out
+  end
+
+  # True if addr2line's `<file>:<line>` came from the user's
+  # transpiled CLEAR file (where CLR markers exist), not from
+  # runtime/stdlib. The build target's actual compile-time filename
+  # is `._clear_tmp_<name>.zig`, copied to `<profile_dir>/
+  # transpiled.zig` for inspection. addr2line emits the build-time
+  # path, so we match the basename pattern that signals user code.
+  # addr2line may append "(discriminator N)" or other metadata after
+  # the line number; strip that before extracting the basename.
+  def file_is_transpiled_zig?(addr2line_file, _zig_path)
+    return false unless addr2line_file
+    bare = addr2line_file.sub(/:\d+\b.*\z/, '')
+    File.basename(bare).match?(/\A\._clear_tmp_.*\.zig\z/)
+  end
+
+  def parse_addr(s)
+    s.start_with?('0x') ? s.to_i(16) : s.to_i
+  end
+
+  # Pick the source file pprof's `list` view should show for this
+  # function. User-code (transpiled.zig with CLR markers) -> the
+  # original .cht source. Runtime/stdlib code -> the addr2line file
+  # path (which is a real .zig file pprof can read). Without this
+  # split, runtime functions like `entryWrapper` rendered against
+  # arbitrary .cht line numbers, badly misleading the user.
+  def clear_source_path(profile_dir, addr2line_file, is_user_zig: nil)
+    if is_user_zig.nil?
+      cht = File.join(profile_dir, 'source.cht')
+      return cht if File.exist?(cht)
+      return addr2line_file.to_s.sub(/:\d+\z/, '')
+    end
+    if is_user_zig
+      cht = File.join(profile_dir, 'source.cht')
+      return cht if File.exist?(cht)
+    end
+    addr2line_file.to_s.sub(/:\d+\z/, '')
+  end
+end
diff --git a/zig/runtime/alloc-profile.zig b/zig/runtime/alloc-profile.zig
index 42bca5c72..51109ffc2 100644
--- a/zig/runtime/alloc-profile.zig
+++ b/zig/runtime/alloc-profile.zig
@@ -1,12 +1,19 @@
 // alloc-profile.zig -- Allocation site tracker for `clear profile`.
 //
-// Records allocation counts and bytes per call site using @returnAddress().
-// Zero overhead when CLEAR_PROFILE is not set (comptime branch elimination).
-// Fixed-size open-addressed hash table -- no allocations inside the profiler.
+// Records allocation stack traces via std.debug.captureStackTrace, hashed
+// across the full trace so distinct call paths to the same helper resolve
+// as separate sites. Zero overhead when profiling is not built in
+// (comptime branch elimination at the call sites). Fixed-size open-
+// addressed hash table -- no allocations inside the profiler.
+//
+// Sampling: `CLEAR_PROFILE_SAMPLE=N` (set by `clear profile --sample=N`)
+// records every Nth call. Default N=1 (no sampling). Sampled values are
+// scaled by N at record time so pprof / doctor see estimated totals.
 
 const std = @import("std");
 const compat = @import("../lib/compat.zig");
 const SpinLock = @import("profile-lock.zig").SpinLock;
+const profile_trace = @import("profile-trace.zig");
 
 // Profile-table size; shared default with lock-profile / mvcc-profile.
 // `clear profile --profile-max=N` injects the override into the
@@ -17,8 +24,21 @@ pub const MAX_SITES: usize = if (@hasDecl(@import("root"), "CLEAR_PROFILE_MAX_TA
 else
     1024;
 
+// Maximum frames captured per stack. 16 matches the default depth
+// `perf record -g` walks on x86. 128 bytes per Site, vs the 8 we used
+// to cost. With MAX_SITES=1024 the table grows from ~40 KB to ~172 KB
+// — comfortably small relative to the Large (60 KB) profile-build
+// fiber stacks. The trace buffer at the call site is `[16]usize` ==
+// 128 B on the recorder's stack; the FP walk reads but doesn't push
+// new frames. Shared with lock/mvcc via profile_trace.MAX_FRAMES.
+pub const MAX_FRAMES: u8 = profile_trace.MAX_FRAMES;
+
 const Site = struct {
-    addr: usize = 0,
+    // The captured stack, leaf-first. addrs[0] is closest to the alloc
+    // site. addr_count == 0 means the slot is empty (a real captured
+    // trace always has at least the leaf frame).
+    addrs: [MAX_FRAMES]usize = [_]usize{0} ** MAX_FRAMES,
+    addr_count: u8 = 0,
     alloc_count: u64 = 0,
     alloc_bytes: u64 = 0,
     free_count: u64 = 0,
@@ -27,6 +47,7 @@ const Site = struct {
 
 var sites: [MAX_SITES]Site = [_]Site{.{}} ** MAX_SITES;
 var total_allocs: u64 = 0;
+var total_bytes: u64 = 0;
 var mu: SpinLock = .{};
 
 // Counts findSlot() calls that hit the saturated table. Surfaced
@@ -34,23 +55,67 @@ var mu: SpinLock = .{};
 // user to bump --profile-max=N. See mvcc-profile.zig for rationale.
 var dropped_samples: u64 = 0;
 
-fn findSlot(addr: usize) ?*Site {
-    const hash = addr *% 0x9E3779B97F4A7C15; // fibonacci hash
+// Sample stride. Read once from CLEAR_PROFILE_SAMPLE on the first
+// recordAlloc call (always under `mu`, so no atomics needed). 0 is
+// the uninitialized sentinel; 1 means no sampling.
+var sample_n: u64 = 0;
+var sample_counter: u64 = 0;
+
+fn ensureSampleN() void {
+    if (sample_n != 0) return;
+    if (std.c.getenv("CLEAR_PROFILE_SAMPLE")) |env| {
+        const slc = std.mem.span(env);
+        const n = std.fmt.parseInt(u64, slc, 10) catch 1;
+        sample_n = if (n == 0) 1 else n;
+    } else {
+        sample_n = 1;
+    }
+}
+
+// Hash a captured stack trace. fnv-1a-style mixing of the addresses
+// keeps the cost ~constant in the trace length and gives reasonable
+// dispersion for our open-addressed table.
+fn hashTrace(addrs: []const usize) usize {
+    var h: usize = 0xcbf29ce484222325;
+    for (addrs) |a| {
+        h ^= a;
+        h *%= 0x9E3779B97F4A7C15;
+    }
+    return h;
+}
+
+fn tracesEqual(slot: *const Site, addrs: []const usize) bool {
+    if (slot.addr_count != addrs.len) return false;
+    for (addrs, 0..) |a, i| {
+        if (slot.addrs[i] != a) return false;
+    }
+    return true;
+}
+
+fn findSlot(addrs: []const usize) ?*Site {
+    if (addrs.len == 0) return null;
+    const hash = hashTrace(addrs);
     var idx = hash & (MAX_SITES - 1);
     var probes: usize = 0;
     while (probes < MAX_SITES) : (probes += 1) {
-        if (sites[idx].addr == addr) return &sites[idx];
-        if (sites[idx].addr == 0) {
-            sites[idx].addr = addr;
-            return &sites[idx];
+        const slot = &sites[idx];
+        if (slot.addr_count == 0) {
+            // Empty: claim it. Copy up to MAX_FRAMES; a real captured
+            // trace has been clamped already, but guard anyway.
+            const n: u8 = @intCast(@min(addrs.len, MAX_FRAMES));
+            for (0..n) |i| slot.addrs[i] = addrs[i];
+            slot.addr_count = n;
+            return slot;
         }
+        if (tracesEqual(slot, addrs)) return slot;
         idx = (idx + 1) & (MAX_SITES - 1);
     }
     dropped_samples += 1;
-    return null; // table saturated -- bump CLEAR_PROFILE_MAX_TABLE_ENTRIES
+    return null;
 }
 
-var total_bytes: u64 = 0;
+// Stack capture is delegated to profile_trace.captureFromHere so the
+// FP-unwind logic stays in one place across all three profile recorders.
 
 pub fn totalAllocs() u64 {
     return total_allocs;
@@ -61,27 +126,52 @@ pub fn totalBytes() u64 {
 }
 
 pub fn recordAlloc(ret_addr: usize, size: usize) void {
+    var trace_buf: [MAX_FRAMES]usize = undefined;
+    const n = profile_trace.captureFromHere(ret_addr, &trace_buf);
+
     mu.lock();
     defer mu.unlock();
-    if (findSlot(ret_addr)) |site| {
-        site.alloc_count += 1;
-        site.alloc_bytes += size;
-    }
+
+    ensureSampleN();
     total_allocs += 1;
     total_bytes += size;
+
+    // Drop all but every sample_n-th sample. Counter is incremented
+    // unconditionally so even sampled-out allocs contribute to
+    // total_allocs / total_bytes (the header reflects the full run).
+    sample_counter +%= 1;
+    if (sample_n > 1 and (sample_counter % sample_n) != 0) return;
+
+    if (n == 0) return; // FP walk failed -- keep totals, drop site
+
+    if (findSlot(trace_buf[0..n])) |site| {
+        // Scale by sample_n so per-site totals approximate the real
+        // distribution at the cost of one multiply per recorded sample.
+        site.alloc_count += sample_n;
+        site.alloc_bytes += size * sample_n;
+    }
 }
 
 pub fn recordFree(ret_addr: usize, size: usize) void {
+    var trace_buf: [MAX_FRAMES]usize = undefined;
+    const n = profile_trace.captureFromHere(ret_addr, &trace_buf);
+
     mu.lock();
     defer mu.unlock();
-    if (findSlot(ret_addr)) |site| {
-        site.free_count += 1;
-        site.free_bytes += size;
+
+    ensureSampleN();
+    sample_counter +%= 1;
+    if (sample_n > 1 and (sample_counter % sample_n) != 0) return;
+
+    if (n == 0) return;
+
+    if (findSlot(trace_buf[0..n])) |site| {
+        site.free_count += sample_n;
+        site.free_bytes += size * sample_n;
     }
 }
 
 pub fn dump() void {
-    // Read output path from env var
     const path_ptr = std.c.getenv("CLEAR_ALLOC_PROFILE") orelse return;
 
     const fd = compat.createFileTruncate(path_ptr) catch return;
@@ -90,34 +180,50 @@ pub fn dump() void {
     mu.lock();
     defer mu.unlock();
 
-    var buf: [256]u8 = undefined;
+    var buf: [4096]u8 = undefined;
 
-    compat.writeAllFd(fd, "# alloc-profile v1\n") catch return;
+    compat.writeAllFd(fd, "# alloc-profile v2 (multi-frame, comma-separated leaf-first)\n") catch return;
     const hdr = std.fmt.bufPrint(&buf, "# total_allocs: {d}\n", .{total_allocs}) catch return;
     compat.writeAllFd(fd, hdr) catch return;
+    if (sample_n > 1) {
+        const sw = std.fmt.bufPrint(&buf, "# sample_n: {d}\n", .{sample_n}) catch return;
+        compat.writeAllFd(fd, sw) catch return;
+    }
     if (dropped_samples > 0) {
         const warn = std.fmt.bufPrint(&buf,
             "# WARNING: {d} samples dropped (cap={d}; rebuild with `clear profile --profile-max=N`)\n",
             .{ dropped_samples, MAX_SITES }) catch return;
         compat.writeAllFd(fd, warn) catch return;
     }
-    compat.writeAllFd(fd, "# addr alloc_count alloc_bytes free_count free_bytes live_bytes\n") catch return;
+    compat.writeAllFd(fd, "# addrs alloc_count alloc_bytes free_count free_bytes live_bytes\n") catch return;
 
     for (&sites) |*site| {
-        if (site.addr != 0 and site.alloc_count > 0) {
-            const live = if (site.alloc_bytes >= site.free_bytes)
-                site.alloc_bytes - site.free_bytes
-            else
-                0;
-            const line = std.fmt.bufPrint(&buf, "0x{x} {d} {d} {d} {d} {d}\n", .{
-                site.addr,
-                site.alloc_count,
-                site.alloc_bytes,
-                site.free_count,
-                site.free_bytes,
-                live,
-            }) catch continue;
-            compat.writeAllFd(fd, line) catch return;
+        if (site.addr_count == 0 or site.alloc_count == 0) continue;
+        const live = if (site.alloc_bytes >= site.free_bytes)
+            site.alloc_bytes - site.free_bytes
+        else
+            0;
+
+        // Build the comma-separated addrs field into the line buffer
+        // by appending with `bufPrint` and walking a write cursor.
+        // With 16 frames at "0x" + 16 hex + ",", worst case is
+        // 19*16 = 304 chars. 4 KB buffer covers that plus the
+        // count/bytes tail with margin.
+        var pos: usize = 0;
+        var i: u8 = 0;
+        while (i < site.addr_count) : (i += 1) {
+            const sep = if (i == 0) "" else ",";
+            const slc = std.fmt.bufPrint(buf[pos..], "{s}0x{x}", .{ sep, site.addrs[i] }) catch break;
+            pos += slc.len;
         }
+        const tail = std.fmt.bufPrint(buf[pos..], " {d} {d} {d} {d} {d}\n", .{
+            site.alloc_count,
+            site.alloc_bytes,
+            site.free_count,
+            site.free_bytes,
+            live,
+        }) catch continue;
+        pos += tail.len;
+        compat.writeAllFd(fd, buf[0..pos]) catch return;
     }
 }
diff --git a/zig/runtime/lock-profile.zig b/zig/runtime/lock-profile.zig
index 10d0230dd..6aeef73ce 100644
--- a/zig/runtime/lock-profile.zig
+++ b/zig/runtime/lock-profile.zig
@@ -3,10 +3,14 @@
 // Zero overhead when CLEAR_PROFILE == false; every call site in
 // parking-lot.zig is wrapped with `if (rt_profile.CLEAR_PROFILE)`.
 //
-// Per-lock stats are keyed by mutex address (open-addressed hash like
-// alloc-profile). Each lock gets its own entry in the hash; profile
-// data is stored by value in the table so stats survive the mutex's
-// lifetime — the registered entry is never revoked.
+// Default keying: lock pointer (one row per mutex). With `clear profile
+// --sync-callstacks`, each (lock, caller-trace) pair becomes its own row,
+// so the dump shows which call site contended for which lock. The flag
+// is off by default because the FP walk costs ~100-500ns per record;
+// uncontended mutex acquire is ~10-20ns, so the trace can dominate.
+// `--sample=N` composes (record every Nth event, scale by N at record
+// time). The CLI auto-defaults --sample to 100 when --sync-callstacks
+// is set unless the user passes their own value.
 //
 // Counters are plain u64 (not atomic): under concurrent recordAcquire
 // the counts can be slightly low under tearing. Acceptable for profile
@@ -17,6 +21,7 @@
 const std = @import("std");
 const compat = @import("../lib/compat.zig");
 const SpinLock = @import("profile-lock.zig").SpinLock;
+const profile_trace = @import("profile-trace.zig");
 
 // Profile-table size; shared default with alloc-profile / mvcc-profile.
 // `clear profile --profile-max=N` injects the override into the
@@ -46,6 +51,12 @@ pub const LockStats = struct {
     read_total_wait_ns: u64 = 0,
     read_max_wait_ns: u64 = 0,
     read_contended_acquires: u64 = 0,
+
+    // Caller stack captured at first record into this slot. Empty
+    // (caller_trace_count == 0) when --sync-callstacks is off, in
+    // which case the slot's identity is just `addr`. Leaf-first.
+    caller_trace: [profile_trace.MAX_FRAMES]usize = [_]usize{0} ** profile_trace.MAX_FRAMES,
+    caller_trace_count: u8 = 0,
 };
 
 var stats: [MAX_LOCKS]LockStats = [_]LockStats{.{}} ** MAX_LOCKS;
@@ -56,57 +67,133 @@ var mu: SpinLock = .{};
 // user to bump --profile-max=N. See mvcc-profile.zig for rationale.
 var dropped_samples: u64 = 0;
 
+// Sample stride snapshot. Read once on first record call (under `mu`,
+// no atomic needed) and reused thereafter. 0 = uninitialized sentinel.
+var sample_n: u64 = 0;
+var sample_counter: u64 = 0;
+
+fn ensureSampleN() void {
+    if (sample_n != 0) return;
+    sample_n = profile_trace.sampleStride();
+}
+
 pub inline fn now() u64 {
     return compat.nanoTimestamp();
 }
 
-fn findSlot(addr: usize) ?*LockStats {
-    // Fibonacci-style multiplicative hash — same recipe as alloc-profile.
-    const hash = addr *% 0x9E3779B97F4A7C15;
+// Mix `addr` (lock pointer) with the optional caller trace into one
+// hash. fnv-1a-style; same recipe as alloc-profile's hashTrace.
+fn hashKey(addr: usize, trace: []const usize) usize {
+    var h: usize = addr *% 0x9E3779B97F4A7C15;
+    for (trace) |a| {
+        h ^= a;
+        h *%= 0x9E3779B97F4A7C15;
+    }
+    return h;
+}
+
+fn keyMatches(slot: *const LockStats, addr: usize, trace: []const usize) bool {
+    if (slot.addr != addr) return false;
+    if (slot.caller_trace_count != trace.len) return false;
+    for (trace, 0..) |a, i| {
+        if (slot.caller_trace[i] != a) return false;
+    }
+    return true;
+}
+
+fn findSlot(addr: usize, trace: []const usize) ?*LockStats {
+    const hash = hashKey(addr, trace);
     var idx: usize = @intCast(hash & (MAX_LOCKS - 1));
     var probes: usize = 0;
     while (probes < MAX_LOCKS) : (probes += 1) {
-        if (stats[idx].addr == addr) return &stats[idx];
-        if (stats[idx].addr == 0) {
-            stats[idx].addr = addr;
-            return &stats[idx];
+        const slot = &stats[idx];
+        if (slot.addr == 0 and slot.caller_trace_count == 0) {
+            // Empty slot: claim it.
+            slot.addr = addr;
+            const n: u8 = @intCast(@min(trace.len, profile_trace.MAX_FRAMES));
+            for (0..n) |i| slot.caller_trace[i] = trace[i];
+            slot.caller_trace_count = n;
+            return slot;
         }
+        if (keyMatches(slot, addr, trace)) return slot;
         idx = (idx + 1) & (MAX_LOCKS - 1);
     }
     dropped_samples += 1;
-    return null; // table saturated — bump CLEAR_PROFILE_MAX_TABLE_ENTRIES
+    return null;
+}
+
+// Captures the caller trace if --sync-callstacks is on; otherwise
+// returns an empty slice. The trace buffer is supplied by the caller
+// so it lives on the recorder's stack frame.
+fn captureCallerTrace(
+    ret_addr: usize,
+    buf: *[profile_trace.MAX_FRAMES]usize,
+) []const usize {
+    if (!profile_trace.syncCallstacksEnabled()) return &.{};
+    const n = profile_trace.captureFromHere(ret_addr, buf);
+    return buf[0..n];
+}
+
+// Sampling gate. `sample_counter` is incremented on every call so
+// even sampled-out events count for the cadence; the next captured
+// sample's values are scaled by `sample_n` so per-row totals match
+// estimated reality. Returns true if this call should be recorded.
+fn shouldSample() bool {
+    sample_counter +%= 1;
+    return sample_n <= 1 or (sample_counter % sample_n) == 0;
 }
 
-pub inline fn recordAcquire(addr: usize, wait_ns: u64, contended: bool) void {
+pub noinline fn recordAcquire(addr: usize, wait_ns: u64, contended: bool) void {
+    var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined;
+    const trace = captureCallerTrace(@returnAddress(), &trace_buf);
+
     mu.lock();
     defer mu.unlock();
-    if (findSlot(addr)) |s| {
-        s.acquires += 1;
-        s.total_wait_ns += wait_ns;
+
+    ensureSampleN();
+    if (!shouldSample()) return;
+
+    if (findSlot(addr, trace)) |s| {
+        s.acquires += sample_n;
+        s.total_wait_ns += wait_ns * sample_n;
         if (wait_ns > s.max_wait_ns) s.max_wait_ns = wait_ns;
-        if (contended) s.contended_acquires += 1;
+        if (contended) s.contended_acquires += sample_n;
     }
 }
 
 /// Read-side acquire on a ParkingRwLock. Same shape as recordAcquire
 /// but stored in the read counters so doctor can compute read/write
 /// split (read-heavy → recommend @shared:versioned).
-pub inline fn recordReadAcquire(addr: usize, wait_ns: u64, contended: bool) void {
+pub noinline fn recordReadAcquire(addr: usize, wait_ns: u64, contended: bool) void {
+    var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined;
+    const trace = captureCallerTrace(@returnAddress(), &trace_buf);
+
     mu.lock();
     defer mu.unlock();
-    if (findSlot(addr)) |s| {
-        s.read_acquires += 1;
-        s.read_total_wait_ns += wait_ns;
+
+    ensureSampleN();
+    if (!shouldSample()) return;
+
+    if (findSlot(addr, trace)) |s| {
+        s.read_acquires += sample_n;
+        s.read_total_wait_ns += wait_ns * sample_n;
         if (wait_ns > s.read_max_wait_ns) s.read_max_wait_ns = wait_ns;
-        if (contended) s.read_contended_acquires += 1;
+        if (contended) s.read_contended_acquires += sample_n;
     }
 }
 
-pub inline fn recordRelease(addr: usize, hold_ns: u64) void {
+pub noinline fn recordRelease(addr: usize, hold_ns: u64) void {
+    var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined;
+    const trace = captureCallerTrace(@returnAddress(), &trace_buf);
+
     mu.lock();
     defer mu.unlock();
-    if (findSlot(addr)) |s| {
-        s.total_hold_ns += hold_ns;
+
+    ensureSampleN();
+    if (!shouldSample()) return;
+
+    if (findSlot(addr, trace)) |s| {
+        s.total_hold_ns += hold_ns * sample_n;
         if (hold_ns > s.max_hold_ns) s.max_hold_ns = hold_ns;
     }
 }
@@ -119,8 +206,12 @@ pub fn dumpToEnvFile() void {
     mu.lock();
     defer mu.unlock();
 
-    var buf: [512]u8 = undefined;
-    _ = compat.writeAllFd(fd, "# lock-profile v2\n") catch return;
+    var buf: [4096]u8 = undefined;
+    _ = compat.writeAllFd(fd, "# lock-profile v3\n") catch return;
+    if (sample_n > 1) {
+        const sw = std.fmt.bufPrint(&buf, "# sample_n: {d}\n", .{sample_n}) catch return;
+        _ = compat.writeAllFd(fd, sw) catch return;
+    }
     if (dropped_samples > 0) {
         const warn = std.fmt.bufPrint(&buf,
             "# WARNING: {d} samples dropped (cap={d}; rebuild with `clear profile --profile-max=N`)\n",
@@ -129,14 +220,16 @@ pub fn dumpToEnvFile() void {
     }
     _ = compat.writeAllFd(fd,
         "# addr\tacquires\tcontended\ttotal_wait_ns\tmax_wait_ns\ttotal_hold_ns\tmax_hold_ns" ++
-        "\tread_acquires\tread_contended\tread_total_wait_ns\tread_max_wait_ns\n")
+        "\tread_acquires\tread_contended\tread_total_wait_ns\tread_max_wait_ns\tcaller_trace\n")
     catch return;
 
     for (&stats) |*s| {
-        if (s.addr == 0) continue;
+        if (s.addr == 0 and s.caller_trace_count == 0) continue;
         if (s.acquires == 0 and s.read_acquires == 0) continue;
-        const line = std.fmt.bufPrint(&buf,
-            "0x{x}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\n",
+
+        // Existing 11 columns first.
+        const head = std.fmt.bufPrint(&buf,
+            "0x{x}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t",
             .{
                 s.addr, s.acquires, s.contended_acquires,
                 s.total_wait_ns, s.max_wait_ns,
@@ -145,6 +238,23 @@ pub fn dumpToEnvFile() void {
                 s.read_total_wait_ns, s.read_max_wait_ns,
             })
         catch continue;
-        _ = compat.writeAllFd(fd, line) catch return;
+        _ = compat.writeAllFd(fd, head) catch return;
+
+        // Caller trace (12th column). `-` when empty so column count
+        // is constant; comma-separated leaf-first when populated.
+        if (s.caller_trace_count == 0) {
+            _ = compat.writeAllFd(fd, "-\n") catch return;
+        } else {
+            var pos: usize = 0;
+            var i: u8 = 0;
+            while (i < s.caller_trace_count) : (i += 1) {
+                const sep = if (i == 0) "" else ",";
+                const slc = std.fmt.bufPrint(buf[pos..], "{s}0x{x}", .{ sep, s.caller_trace[i] }) catch break;
+                pos += slc.len;
+            }
+            const tail = std.fmt.bufPrint(buf[pos..], "\n", .{}) catch continue;
+            pos += tail.len;
+            _ = compat.writeAllFd(fd, buf[0..pos]) catch return;
+        }
     }
 }
diff --git a/zig/runtime/mvcc-profile.zig b/zig/runtime/mvcc-profile.zig
index b222c80c2..f7c3a79a8 100644
--- a/zig/runtime/mvcc-profile.zig
+++ b/zig/runtime/mvcc-profile.zig
@@ -1,10 +1,13 @@
 // mvcc-profile.zig — `Versioned(T)` cell telemetry: reads, commits,
 // retries, COW byte volume.
 //
-// Zero overhead when CLEAR_PROFILE == false. Per-cell stats are keyed
-// by cell address (open-addressed hash like alloc-profile and
-// lock-profile). Counters are plain u64; the same relaxed-tearing
-// rationale as the sibling profile modules applies.
+// Zero overhead when CLEAR_PROFILE == false. Default keying: cell pointer
+// (one row per Versioned cell). With `clear profile --sync-callstacks`,
+// each (cell, caller-trace) pair becomes its own row, so the dump shows
+// which call site reads/commits which cell. The flag is off by default
+// because the FP walk costs ~100-500ns per record; an MVCC commit fast
+// path is ~20-50ns, so the trace can dominate. `--sample=N` composes
+// (record every Nth event, scale by N at record time).
 //
 // Why MVCC needs its own profile (not lock-profile): MVCC has no
 // lock. The interesting numbers are different:
@@ -23,6 +26,7 @@
 const std = @import("std");
 const compat = @import("../lib/compat.zig");
 const SpinLock = @import("profile-lock.zig").SpinLock;
+const profile_trace = @import("profile-trace.zig");
 
 // Profile-table size, shared across alloc-profile / lock-profile /
 // mvcc-profile via a single root-level override knob. Default 1024
@@ -56,6 +60,12 @@ pub const CellStats = struct {
                                 // here: 0 for fast-path-success, N for
                                 // contended commits.
     update_failures: u64 = 0,   // returned UpdateRetriesExhausted (gave up)
+
+    // Caller stack captured at first record into this slot. Empty
+    // (caller_trace_count == 0) when --sync-callstacks is off, in
+    // which case the slot's identity is just `addr`. Leaf-first.
+    caller_trace: [profile_trace.MAX_FRAMES]usize = [_]usize{0} ** profile_trace.MAX_FRAMES,
+    caller_trace_count: u8 = 0,
 };
 
 var stats: [MAX_CELLS]CellStats = [_]CellStats{.{}} ** MAX_CELLS;
@@ -67,28 +77,80 @@ var mu: SpinLock = .{};
 // Plain u64; same relaxed-tearing rationale as the per-cell counters.
 var dropped_samples: u64 = 0;
 
-fn findSlot(addr: usize, struct_size: u32) ?*CellStats {
-    const hash = addr *% 0x9E3779B97F4A7C15;
+// Sample stride snapshot. Read once on first record call (under `mu`,
+// no atomic needed) and reused thereafter. 0 = uninitialized sentinel.
+var sample_n: u64 = 0;
+var sample_counter: u64 = 0;
+
+fn ensureSampleN() void {
+    if (sample_n != 0) return;
+    sample_n = profile_trace.sampleStride();
+}
+
+fn hashKey(addr: usize, trace: []const usize) usize {
+    var h: usize = addr *% 0x9E3779B97F4A7C15;
+    for (trace) |a| {
+        h ^= a;
+        h *%= 0x9E3779B97F4A7C15;
+    }
+    return h;
+}
+
+fn keyMatches(slot: *const CellStats, addr: usize, trace: []const usize) bool {
+    if (slot.addr != addr) return false;
+    if (slot.caller_trace_count != trace.len) return false;
+    for (trace, 0..) |a, i| {
+        if (slot.caller_trace[i] != a) return false;
+    }
+    return true;
+}
+
+fn findSlot(addr: usize, struct_size: u32, trace: []const usize) ?*CellStats {
+    const hash = hashKey(addr, trace);
     var idx: usize = @intCast(hash & (MAX_CELLS - 1));
     var probes: usize = 0;
     while (probes < MAX_CELLS) : (probes += 1) {
-        if (stats[idx].addr == addr) return &stats[idx];
-        if (stats[idx].addr == 0) {
-            stats[idx].addr = addr;
-            stats[idx].struct_size = struct_size;
-            return &stats[idx];
+        const slot = &stats[idx];
+        if (slot.addr == 0 and slot.caller_trace_count == 0) {
+            slot.addr = addr;
+            slot.struct_size = struct_size;
+            const n: u8 = @intCast(@min(trace.len, profile_trace.MAX_FRAMES));
+            for (0..n) |i| slot.caller_trace[i] = trace[i];
+            slot.caller_trace_count = n;
+            return slot;
         }
+        if (keyMatches(slot, addr, trace)) return slot;
         idx = (idx + 1) & (MAX_CELLS - 1);
     }
     dropped_samples += 1;
-    return null; // table saturated -- bump CLEAR_PROFILE_MAX_TABLE_ENTRIES
+    return null;
 }
 
-pub inline fn recordRead(addr: usize, struct_size: u32) void {
+fn captureCallerTrace(
+    ret_addr: usize,
+    buf: *[profile_trace.MAX_FRAMES]usize,
+) []const usize {
+    if (!profile_trace.syncCallstacksEnabled()) return &.{};
+    const n = profile_trace.captureFromHere(ret_addr, buf);
+    return buf[0..n];
+}
+
+fn shouldSample() bool {
+    sample_counter +%= 1;
+    return sample_n <= 1 or (sample_counter % sample_n) == 0;
+}
+
+pub noinline fn recordRead(addr: usize, struct_size: u32) void {
+    var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined;
+    const trace = captureCallerTrace(@returnAddress(), &trace_buf);
+
     mu.lock();
     defer mu.unlock();
-    if (findSlot(addr, struct_size)) |s| {
-        s.reads += 1;
+    ensureSampleN();
+    if (!shouldSample()) return;
+
+    if (findSlot(addr, struct_size, trace)) |s| {
+        s.reads += sample_n;
     }
 }
 
@@ -96,15 +158,21 @@ pub inline fn recordRead(addr: usize, struct_size: u32) void {
 /// CAS failures inside the update before the eventual success
 /// (0 for fast-path commits). `committed` distinguishes a winning
 /// commit from a bailed-out UpdateRetriesExhausted.
-pub inline fn recordUpdate(addr: usize, struct_size: u32, retries: u64, committed: bool) void {
+pub noinline fn recordUpdate(addr: usize, struct_size: u32, retries: u64, committed: bool) void {
+    var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined;
+    const trace = captureCallerTrace(@returnAddress(), &trace_buf);
+
     mu.lock();
     defer mu.unlock();
-    if (findSlot(addr, struct_size)) |s| {
-        s.retries += retries;
+    ensureSampleN();
+    if (!shouldSample()) return;
+
+    if (findSlot(addr, struct_size, trace)) |s| {
+        s.retries += retries * sample_n;
         if (committed) {
-            s.commits += 1;
+            s.commits += sample_n;
         } else {
-            s.update_failures += 1;
+            s.update_failures += sample_n;
         }
     }
 }
@@ -116,11 +184,17 @@ pub inline fn recordUpdate(addr: usize, struct_size: u32, retries: u64, committe
 /// "upgrade @shared:versioned -> @indirect:atomic" suggestion
 /// (multi-cell commits forbid the upgrade because AtomicPtr has
 /// no multi-pointer CAS).
-pub inline fn recordMultiCommit(addr: usize, struct_size: u32) void {
+pub noinline fn recordMultiCommit(addr: usize, struct_size: u32) void {
+    var trace_buf: [profile_trace.MAX_FRAMES]usize = undefined;
+    const trace = captureCallerTrace(@returnAddress(), &trace_buf);
+
     mu.lock();
     defer mu.unlock();
-    if (findSlot(addr, struct_size)) |s| {
-        s.multi_commits += 1;
+    ensureSampleN();
+    if (!shouldSample()) return;
+
+    if (findSlot(addr, struct_size, trace)) |s| {
+        s.multi_commits += sample_n;
     }
 }
 
@@ -132,8 +206,12 @@ pub fn dumpToEnvFile() void {
     mu.lock();
     defer mu.unlock();
 
-    var buf: [256]u8 = undefined;
-    _ = compat.writeAllFd(fd, "# mvcc-profile v1\n") catch return;
+    var buf: [4096]u8 = undefined;
+    _ = compat.writeAllFd(fd, "# mvcc-profile v2\n") catch return;
+    if (sample_n > 1) {
+        const sw = std.fmt.bufPrint(&buf, "# sample_n: {d}\n", .{sample_n}) catch return;
+        _ = compat.writeAllFd(fd, sw) catch return;
+    }
     if (dropped_samples > 0) {
         const warn = std.fmt.bufPrint(&buf,
             "# WARNING: {d} samples dropped (cap={d}; rebuild with `clear profile --profile-max=N`)\n",
@@ -141,16 +219,32 @@ pub fn dumpToEnvFile() void {
         _ = compat.writeAllFd(fd, warn) catch return;
     }
     _ = compat.writeAllFd(fd,
-        "# addr\tstruct_size\treads\tcommits\tretries\tupdate_failures\tmulti_commits\n")
+        "# addr\tstruct_size\treads\tcommits\tretries\tupdate_failures\tmulti_commits\tcaller_trace\n")
     catch return;
 
     for (&stats) |*s| {
-        if (s.addr == 0) continue;
+        if (s.addr == 0 and s.caller_trace_count == 0) continue;
         if (s.reads == 0 and s.commits == 0 and s.update_failures == 0 and s.multi_commits == 0) continue;
-        const line = std.fmt.bufPrint(&buf,
-            "0x{x}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\n",
+
+        const head = std.fmt.bufPrint(&buf,
+            "0x{x}\t{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t",
             .{ s.addr, s.struct_size, s.reads, s.commits, s.retries, s.update_failures, s.multi_commits })
         catch continue;
-        _ = compat.writeAllFd(fd, line) catch return;
+        _ = compat.writeAllFd(fd, head) catch return;
+
+        if (s.caller_trace_count == 0) {
+            _ = compat.writeAllFd(fd, "-\n") catch return;
+        } else {
+            var pos: usize = 0;
+            var i: u8 = 0;
+            while (i < s.caller_trace_count) : (i += 1) {
+                const sep = if (i == 0) "" else ",";
+                const slc = std.fmt.bufPrint(buf[pos..], "{s}0x{x}", .{ sep, s.caller_trace[i] }) catch break;
+                pos += slc.len;
+            }
+            const tail = std.fmt.bufPrint(buf[pos..], "\n", .{}) catch continue;
+            pos += tail.len;
+            _ = compat.writeAllFd(fd, buf[0..pos]) catch return;
+        }
     }
 }
diff --git a/zig/runtime/profile-trace.zig b/zig/runtime/profile-trace.zig
new file mode 100644
index 000000000..726c0a59a
--- /dev/null
+++ b/zig/runtime/profile-trace.zig
@@ -0,0 +1,64 @@
+// profile-trace.zig -- Shared stack-trace + sampling helpers for the
+// alloc / lock / mvcc profile recorders.
+//
+// Each recorder has its own state (table, lock, sample counter), but
+// the trace-capture call shape is identical. We centralize:
+//   * MAX_FRAMES (one number to tune)
+//   * captureFromHere() — wraps std.debug.captureCurrentStackTrace
+//   * syncCallstacksEnabled() — reads CLEAR_PROFILE_SYNC_CALLSTACKS
+//     once on first call. Used by lock + mvcc to gate stack capture
+//     behind `clear profile --sync-callstacks` (off by default; the
+//     flag costs ~100-500ns per record and scales the wait-cheap
+//     fast paths poorly).
+//   * sampleStride() — reads CLEAR_PROFILE_SAMPLE; alloc-profile keeps
+//     its own copy because it computed sample_n before this module
+//     was extracted.
+
+const std = @import("std");
+
+pub const MAX_FRAMES: u8 = 16;
+
+// Capture a stack trace starting at `first_addr` (typically the
+// caller's `@returnAddress()`). Returns the number of frames written
+// into `addrs`. The buffer lives on the caller's stack -- 128 B at
+// MAX_FRAMES=16 -- and the FP walk reads but does not push frames,
+// so the cost is one `[16]usize` slot on the calling stack plus the
+// walk itself.
+pub fn captureFromHere(first_addr: usize, addrs: *[MAX_FRAMES]usize) u8 {
+    const trace = std.debug.captureCurrentStackTrace(
+        .{ .first_address = first_addr },
+        addrs,
+    );
+    return @intCast(@min(trace.return_addresses.len, MAX_FRAMES));
+}
+
+var sync_callstacks_state: enum { uninit, off, on } = .uninit;
+
+pub fn syncCallstacksEnabled() bool {
+    if (sync_callstacks_state != .uninit) return sync_callstacks_state == .on;
+    if (std.c.getenv("CLEAR_PROFILE_SYNC_CALLSTACKS")) |env| {
+        const slc = std.mem.span(env);
+        if (slc.len > 0 and slc[0] != '0') {
+            sync_callstacks_state = .on;
+            return true;
+        }
+    }
+    sync_callstacks_state = .off;
+    return false;
+}
+
+var sample_n: u64 = 0;
+
+// Read CLEAR_PROFILE_SAMPLE once and cache. Returns >=1; 0 in the env
+// is treated as 1 (no sampling).
+pub fn sampleStride() u64 {
+    if (sample_n != 0) return sample_n;
+    if (std.c.getenv("CLEAR_PROFILE_SAMPLE")) |env| {
+        const slc = std.mem.span(env);
+        const n = std.fmt.parseInt(u64, slc, 10) catch 1;
+        sample_n = if (n == 0) 1 else n;
+    } else {
+        sample_n = 1;
+    }
+    return sample_n;
+}