cuzzo · cuzzo · May 9, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/clear b/clear
@@ -1062,20 +1062,44 @@ when 'profile'
   # are reported in the dump as a `# WARNING:` line. N must be a
   # power of two (the tables index via `& (N-1)`).
   profile_max = nil
+  # --sample=N: record every Nth event in the runtime profile tables.
+  # Captured values are scaled by N at record time so doctor / pprof
+  # see estimated totals. Default is no sampling for alloc-profile;
+  # auto-bumped to 100 for lock+mvcc when --sync-callstacks is on
+  # (see below) unless the user passes their own --sample value.
+  sample_n = nil
+  # --sync-callstacks: turn on per-record stack capture in lock-profile
+  # and mvcc-profile. Off by default because the FP walk costs ~100-500ns
+  # per record and uncontended mutex acquire / MVCC commit fast paths
+  # are 10-50ns; the trace can dominate. Enable when investigating
+  # caller-attribution for a specific lock or cell.
+  sync_callstacks = false
   clear_args = clear_args.reject do |arg|
     if arg =~ /\A--profile-max=(\d+)\z/
       profile_max = $1.to_i
       unless profile_max > 0 && (profile_max & (profile_max - 1)) == 0
         error "--profile-max=#{profile_max} must be a positive power of two (e.g. 1024, 4096, 16384)"
       end
       true
+    elsif arg =~ /\A--sample=(\d+)\z/
+      sample_n = $1.to_i
+      error "--sample=#{sample_n} must be a positive integer" unless sample_n.positive?
+      true
+    elsif arg == '--sync-callstacks'
+      sync_callstacks = true
+      true
     else
       false
     end
   end
 
+  # When --sync-callstacks is on but no explicit --sample was given,
+  # default to 100 so the per-record cost stays manageable. Users can
+  # set --sample=1 to opt in to full capture at the full cost.
+  sample_n = 100 if sync_callstacks && sample_n.nil?
+
   source = clear_args.first
-  error "Usage: clear profile <file.cht> [--profile-max=N] [-- args...]" unless source
+  error "Usage: clear profile <file.cht> [--profile-max=N] [--sample=N] [--sync-callstacks] [-- args...]" unless source
 
   source = File.expand_path(source)
   base_name = File.basename(source, '.cht')
@@ -1114,6 +1138,8 @@ when 'profile'
     "CLEAR_LOCK_PROFILE=#{lock_file}",
     "CLEAR_MVCC_PROFILE=#{mvcc_file}",
   ]
+  env_parts << "CLEAR_PROFILE_SAMPLE=#{sample_n}" if sample_n
+  env_parts << "CLEAR_PROFILE_SYNC_CALLSTACKS=1" if sync_callstacks
   env_parts << "CLEAR_THREADS=#{ENV['CLEAR_THREADS']}" if ENV['CLEAR_THREADS']
   target_env = env_parts.join(' ')
 
@@ -1137,6 +1163,17 @@ when 'profile'
     system("perf stat -e cycles,instructions,branches,branch-misses,cache-references,cache-misses,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses,page-faults -o #{perf_stat_file} env #{target_env} #{output} #{run_args.join(' ')} > /dev/null 2>&1")
   end
 
+  # Convert each runtime profile to pprof's gzipped protobuf so the
+  # `pprof` tool can render flamegraphs / call graphs / source views
+  # directly. Heap / lock / mvcc are encoded in pure Ruby; CPU defers
+  # to the standard `perf_to_profile` Go binary if it's on PATH.
+  require_relative 'src/tools/pprof_converter'
+  pprof_files = PprofConverter.convert_all(profile_dir)
+  if File.exist?(perf_data) && !pprof_files[:cpu]
+    hint "perf_to_profile not on PATH — skipping CPU pprof. Install: " \
+         "go install github.com/google/perf_data_converter/src/cmd/perf_to_profile@latest"
+  end
+
   puts ""
   puts "Profile data: #{profile_dir}/"
   puts "  Heap:     #{alloc_file} (#{File.exist?(alloc_file) ? "#{(File.size(alloc_file) / 1024.0).round(1)} KB" : 'not generated'})"
@@ -1146,10 +1183,44 @@ when 'profile'
   puts "  Syscalls: #{strace_file}" if File.exist?(strace_file)
   puts "  Counters: #{perf_stat_file}" if File.exist?(perf_stat_file)
   puts "  MVCC:     #{mvcc_file}" if File.exist?(mvcc_file)
+  if pprof_files.any?
+    puts ""
+    puts "pprof:    pprof -http=:8080 #{output} #{profile_dir}/<file>.pb.gz"
+    pprof_files.each { |kind, path| puts "  #{kind}:#{' ' * (8 - kind.to_s.length)}#{path}" }
+  end
 
 when 'doctor'
   require_relative 'src/tools/doctor'
-  Doctor.run(ARGV[1])
+  # Parse flags: --cumulative shows cum bytes/count alongside flat;
+  # --focus=REGEX filters samples whose trace touches a function
+  # matching the pattern; --diff <before-dir> compares two profile
+  # runs and shows top deltas per section.
+  doctor_opts = { cumulative: false, focus: nil, ignore: nil, peek: nil, diff: nil, by: :bytes }
+  positional = []
+  i = 1
+  while i < ARGV.length
+    a = ARGV[i]
+    case a
+    when '--cumulative', '--cum'
+      doctor_opts[:cumulative] = true
+    when /\A--focus=(.+)\z/
+      doctor_opts[:focus] = Regexp.new($1)
+    when /\A--ignore=(.+)\z/
+      doctor_opts[:ignore] = Regexp.new($1)
+    when /\A--peek=(.+)\z/
+      doctor_opts[:peek] = Regexp.new($1)
+    when /\A--by=(bytes|allocs|inuse_bytes|inuse_allocs)\z/
+      doctor_opts[:by] = $1.to_sym
+    when '--diff'
+      doctor_opts[:diff] = ARGV[i + 1]
+      i += 1
+      error "--diff requires a <before-profile-dir> argument" if doctor_opts[:diff].nil?
+    else
+      positional << a
+    end
+    i += 1
+  end
+  Doctor.run(positional.first, **doctor_opts)
 
 when 'fix'
   require_relative 'src/ast/lexer'
@@ -1569,6 +1640,28 @@ when 'explain'
   end
   exit 0
 
+when 'completions'
+  shell = ARGV[1]
+  unless shell && %w[bash zsh fish].include?(shell)
+    $stderr.puts "Usage: clear completions {bash|zsh|fish}"
+    $stderr.puts ""
+    $stderr.puts "Install (bash):"
+    $stderr.puts "  echo 'source <(clear completions bash)' >> ~/.bashrc"
+    $stderr.puts ""
+    $stderr.puts "Install (zsh):"
+    $stderr.puts "  mkdir -p ~/.zsh/completions"
+    $stderr.puts "  clear completions zsh > ~/.zsh/completions/_clear"
+    $stderr.puts "  # add to ~/.zshrc BEFORE 'compinit':"
+    $stderr.puts "  #   fpath=(~/.zsh/completions \\$fpath)"
+    $stderr.puts ""
+    $stderr.puts "Install (fish):"
+    $stderr.puts "  clear completions fish > ~/.config/fish/completions/clear.fish"
+    exit 1
+  end
+  require_relative 'src/tools/completions'
+  print Completions.script_for(shell)
+  exit 0
+
 when 'help', '--help', '-h', nil
   puts <<~HELP
     CLEAR Language Compiler
@@ -1593,6 +1686,7 @@ when 'help', '--help', '-h', nil
       clear fmt --no-warn <path>    Suppress width warnings for >120-char lines
       clear explain                 List every registered diagnostic code
       clear explain <CODE>          Show docs for a specific diagnostic
+      clear completions {bash|zsh|fish}  Print shell completions (see docs/completions.md)
 
     Commands:
       build    Transpile and compile a CLEAR program to a native binary

diff --git a/docs/completions.md b/docs/completions.md
@@ -0,0 +1,74 @@
+# Shell completions
+
+Tab-completion for `clear` subcommands, and file/directory arguments
+filtered per subcommand:
+
+| Subcommand | Completes to |
+|---|---|
+| `clear build`, `run`, `fmt`, `fix`, `profile`, `explain` | `*.cht` files (and directories to navigate) |
+| `clear test`, `benchmark` | `*.cht` files or directories |
+| `clear doctor` | `*.profile/` directories |
+| `clear completions` | `bash` / `zsh` / `fish` |
+
+Generate the script for your shell with `clear completions <shell>`,
+then install per the instructions below.
+
+## Bash
+
+Add to `~/.bashrc`:
+
+```sh
+source <(clear completions bash)
+```
+
+Or write to the system completions dir (loaded by every interactive
+shell, no rc-file edit):
+
+```sh
+clear completions bash | sudo tee /etc/bash_completion.d/clear > /dev/null
+```
+
+## Zsh
+
+The convention is one `_<cmd>` file per command in a directory on
+`$fpath`:
+
+```sh
+mkdir -p ~/.zsh/completions
+clear completions zsh > ~/.zsh/completions/_clear
+```
+
+Then ensure `~/.zsh/completions` is on `$fpath` **before** `compinit`
+runs. In `~/.zshrc`:
+
+```sh
+fpath=(~/.zsh/completions $fpath)
+autoload -Uz compinit && compinit
+```
+
+Reopen the shell (or `compinit -u`) and tab-completion will pick up
+descriptions for each subcommand.
+
+## Fish
+
+Fish auto-loads completions from `~/.config/fish/completions/`:
+
+```sh
+clear completions fish > ~/.config/fish/completions/clear.fish
+```
+
+No rc-file edit needed.
+
+## Verifying
+
+```sh
+clear <TAB>                       # lists subcommands
+clear doctor <TAB>                # lists *.profile/ dirs
+clear profile examples/<TAB>      # lists *.cht files in examples/
+```
+
+## Updating
+
+The completion script is generated from the live `Completions`
+module (`src/tools/completions.rb`). When new subcommands are added,
+re-run `clear completions <shell> > <file>` to refresh.
diff --git a/docs/pprof.md b/docs/pprof.md
@@ -0,0 +1,143 @@
+# pprof integration
+
+`clear profile` emits its runtime profile data in pprof's gzipped
+protobuf format alongside the existing text dumps, so you can use
+the standard `pprof` tool for flamegraphs, call graphs, and source
+views.
+
+## Install
+
+| Tool | When you need it | Install |
+|---|---|---|
+| `pprof` | Always (the viewer) | `go install github.com/google/pprof@latest` |
+| `perf_to_profile` | If you want CPU flamegraphs from `perf.data` | `go install github.com/google/perf_data_converter/src/cmd/perf_to_profile@latest` |
+| `graphviz` | If you use `pprof -svg` / `pprof -dot` | `apt install graphviz` / `brew install graphviz` |
+
+The web UI (`pprof -http=:8080`) and `pprof -top` text view do not
+need graphviz.
+
+## Use
+
+```sh
+clear profile foo.cht
+# -> writes foo.profile/heap.pb.gz, lock.pb.gz, mvcc.pb.gz
+#    (and cpu.pb.gz if perf_to_profile is on PATH)
+
+pprof -http=:8080 ./foo foo.profile/heap.pb.gz
+pprof -top -alloc_space  foo.profile/heap.pb.gz
+pprof -top -inuse_space  foo.profile/heap.pb.gz
+pprof -top -delay        foo.profile/lock.pb.gz
+pprof -base before/heap.pb.gz after/heap.pb.gz   # regression diff
+```
+
+## What's in each file
+
+### `heap.pb.gz`
+
+Sample columns: `alloc_objects` / `alloc_space` / `inuse_objects` /
+`inuse_space`. Each call site is one sample with its hex address as
+a label (`pprof -tags heap.pb.gz`).
+
+### `lock.pb.gz`
+
+Sample columns: `contentions` / `delay` / `hold` / `acquisitions`.
+Read+write contention sums into `contentions`; `delay` is the total
+wait (read+write); `hold` is total exclusive hold time.
+
+### `mvcc.pb.gz`
+
+Sample columns: `reads` / `commits` / `retries` / `cow_bytes`.
+`cow_bytes = struct_size * (commits + retries)` — the byte volume
+moved by copy-on-write commits, the most direct cost signal for
+`@shared:versioned` cells.
+
+### `channels.pb.gz`
+
+Sample columns: `pushes` / `pops` / `push_blocked` / `pop_blocked` /
+`max_depth`. One sample per registered channel; the synthetic
+function name is `channel#<id>` and the channel's capacity travels
+as a label (`pprof -tags channels.pb.gz`).
+
+### `cpu.pb.gz`
+
+Standard CPU profile from `perf.data`, converted by
+`perf_to_profile`. Sample columns are whatever Go's pprof shows
+(`samples` / `cpu` ns).
+
+## CLEAR source mapping
+
+The transpiler emits `// CLR:N` markers in `transpiled.zig`. Our
+converter walks those back to the user's `.cht` line and stamps it
+onto each pprof Location, so `pprof -list <fn>` shows CLEAR source
+lines (not Zig).
+
+## Sampling
+
+Stack traces for **alloc-profile** are captured on every alloc by
+default. For workloads where the per-alloc unwind cost matters,
+`--sample=N` records every Nth event and scales the captured values
+by N so doctor / pprof see estimated totals:
+
+```sh
+clear profile foo.cht --sample=100
+```
+
+Header records the chosen `sample_n` so consumers can rescale or
+flag the approximation.
+
+## Stack traces for lock and mvcc profiles
+
+`--sync-callstacks` (off by default) turns on per-record stack
+capture in lock-profile and mvcc-profile, so pprof's tree/flame
+views and per-caller attribution work for these profiles too:
+
+```sh
+clear profile foo.cht --sync-callstacks            # auto-bumps --sample to 100
+clear profile foo.cht --sync-callstacks --sample=1  # full capture, full cost
+```
+
+Off by default because the FP walk costs ~100-500ns per record.
+Uncontended mutex acquire is ~10-20ns and MVCC commit fast paths
+are ~20-50ns, so the trace can dominate the operation it's measuring.
+When the flag is set without an explicit `--sample`, we auto-default
+to `--sample=100` to keep the cost manageable.
+
+With `--sync-callstacks` on, each (lock, caller-trace) pair becomes
+its own row in lock-profile, and same for (cell, caller-trace) in
+mvcc-profile. Doctor aggregates rows back to one-per-lock for its
+existing diagnoses; pprof tree views show the per-caller breakdown.
+
+## Doctor flags built on this data
+
+`clear doctor` consumes the same profile dirs and grew several new
+flags that draw on the multi-frame trace data:
+
+```sh
+clear doctor foo.profile/ --cumulative           # rank functions by cum bytes
+clear doctor foo.profile/ --focus=intToString    # filter to traces that touch this function
+clear doctor foo.profile/ --ignore=intToString   # drop traces that touch this function
+clear doctor foo.profile/ --peek=processRequest  # callers + callees of one function
+clear doctor foo.profile/ --by=allocs            # sort heap by allocation count, not bytes
+clear doctor foo.profile/ --by=inuse_bytes       # sort by allocs - frees (live bytes)
+clear doctor new.profile/ --diff old.profile/    # perf-regression diff; --diff takes the "before" run
+```
+
+`--cumulative` aggregates bytes/allocs across every frame in each
+trace, so a function high in the call stack accrues its callees'
+costs. `--focus=REGEX` keeps only sites whose trace touches a function
+matching the pattern. `--diff` reports per-function deltas in
+allocation, lock contention, and MVCC retries, with directional arrows
+and "newly contended" / "retries eliminated" annotations.
+
+## Notes
+
+- We do not emit a Mapping message for the binary, so `pprof` prints
+  "Main binary filename not available" and skips its own symbolization.
+  Function names still appear because we resolve via `addr2line` at
+  conversion time.
+- alloc-profile captures stacks unconditionally (multi-frame v2,
+  comma-separated leaf-first in `alloc.txt`).
+- lock-profile (v3) and mvcc-profile (v2) capture stacks only when
+  `--sync-callstacks` is on. The 12th column of `locks.txt` and the
+  8th column of `mvcc.txt` carry `-` (off) or comma-separated leaf-
+  first addrs (on). Tab-separated to allow commas in the trace field.