diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index de40bfb8c..c22bae6a3 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -46,7 +46,7 @@ jobs:
   bench-base:
     name: Bench base
     needs: merge-base
-    runs-on: [matterlabs-ci-runner-highmem]
+    runs-on: [matterlabs-ci-runner-high-performance]
     permissions:
       contents: read
     steps:
@@ -58,27 +58,81 @@ jobs:
       - uses: ./.github/actions/setup-toolchain
       - name: Compile for RISC-V
         working-directory: ./zksync_os
+        # `for-tests-benchmarking-pectra` was introduced on the PR side of
+        # this workflow; the merge-base may predate it. Fall back to the
+        # non-pectra type so bench-base can still produce baseline numbers
+        # for `test_precompiles` even if the merge-base lacks pectra
+        # support (test_pectra_precompiles / test_kzg_regression are then
+        # exercised only on the head side — best-effort coverage).
         run: |
-          ./dump_bin.sh --type for-tests-benchmarking
+          if grep -q "for-tests-benchmarking-pectra" dump_bin.sh; then
+            ./dump_bin.sh --type for-tests-benchmarking-pectra
+          else
+            ./dump_bin.sh --type for-tests-benchmarking
+          fi
           ./dump_bin.sh --type evm-replay-benchmarking
       - name: Run benchmarks
         shell: bash
+        # Profile/feature/test-filter selection adapts to the checked-out tree:
+        #   - `bench-fast` profile: introduced on the PR; merge-base may lack
+        #     it, in which case we fall back to `--release`.
+        #   - `precompiles/pectra` feature + extra test functions:
+        #     introduced together with `for-tests-benchmarking-pectra`; if the
+        #     proving binary type is unavailable, the proving binary can't run
+        #     BLS/BLAKE2F/KZG vectors, so we drop those tests + the feature.
         run: |
+          if grep -q "bench-fast" Cargo.toml; then
+            PROFILE="--profile bench-fast"
+          else
+            PROFILE="--release"
+          fi
+          if grep -q "for-tests-benchmarking-pectra" zksync_os/dump_bin.sh; then
+            PRECOMPILES_FEATURES="rig/no_print,precompiles/cycle_marker,precompiles/pectra,rig/unlimited_native"
+            PRECOMPILES_TESTS="test_precompiles test_pectra_precompiles test_kzg_regression"
+          else
+            PRECOMPILES_FEATURES="rig/no_print,precompiles/cycle_marker,rig/unlimited_native"
+            PRECOMPILES_TESTS="test_precompiles"
+          fi
           for dir in tests/instances/eth_runner/blocks/*; do
             blk=$(basename "$dir")
+            # Pass 1: default DA scheme (BlobsAndPubdataKeccak256) — full
+            # instrumentation (opcodes + precompiles + cycle markers).
             OPCODE_SAMPLES_DIR=$(pwd)/opcode_samples/base_${blk} \
             OPCODE_CYCLE_SAMPLES_DIR=$(pwd)/opcode_cycles/base_${blk} \
             MARKER_PATH=$(pwd)/base_block_${blk}.bench \
-            cargo run --manifest-path tests/instances/eth_runner/Cargo.toml --release --features rig/no_print,rig/cycle_marker,rig/unlimited_native -- single-run --block-dir "$dir" --opcode-stats > base_block_${blk}.out
+            PRECOMPILE_STATS_PATH=$(pwd)/base_block_${blk}_precompile_stats.csv \
+            PRECOMPILE_SAMPLES_DIR=$(pwd)/precompile_samples/base_${blk} \
+            LABEL_CYCLE_SAMPLES_DIR=$(pwd)/precompile_cycles/base_${blk} \
+            cargo run --manifest-path tests/instances/eth_runner/Cargo.toml $PROFILE --features rig/no_print,rig/cycle_marker,rig/unlimited_native -- single-run --block-dir "$dir" --opcode-stats > base_block_${blk}.out
+            # Pass 2: BlobsZKsyncOS DA scheme. Only the post-tx-op stage
+            # differs (the tx-loop work is identical to pass 1), so we capture
+            # ONLY the cycle markers here — no opcode/precompile dumps.
+            # Skip on merge-bases that lack BENCH_DA_SCHEME plumbing.
+            if grep -q "BENCH_DA_SCHEME" tests/instances/eth_runner/src/single_run.rs; then
+              BENCH_DA_SCHEME=blobs_zksync_os \
+              MARKER_PATH=$(pwd)/base_block_${blk}_blobs.bench \
+              cargo run --manifest-path tests/instances/eth_runner/Cargo.toml $PROFILE --features rig/no_print,rig/cycle_marker,rig/unlimited_native -- single-run --block-dir "$dir" > base_block_${blk}_blobs.out
+            fi
           done
-          MARKER_PATH=$(pwd)/base_precompiles.bench cargo test --release --features rig/no_print,precompiles/cycle_marker,rig/unlimited_native -p precompiles -- test_precompiles
+          # Test name substring filters (Rust's harness matches if ANY substring matches):
+          #   test_precompiles       — 114 core precompile vectors (TESTS)
+          #   test_pectra_precompiles — 6 BLAKE2F + BLS12-381 vectors (PECTRA_TESTS, gated by `precompiles/pectra` feature)
+          #   test_kzg_regression    — 1 KZG / point_evaluation vector (KZG_TESTS)
+          # `test_p256` (781 P256 vectors) is `#[ignore = "Too long for CI"]`; would
+          # need `--include-ignored` to run, which significantly lengthens CI. Tracked
+          # as a follow-up coverage gap.
+          MARKER_PATH=$(pwd)/base_precompiles.bench PRECOMPILE_STATS_PATH=$(pwd)/base_precompile_stats.csv PRECOMPILE_SAMPLES_DIR=$(pwd)/base_precompile_samples LABEL_CYCLE_SAMPLES_DIR=$(pwd)/base_precompile_cycles cargo test $PROFILE --features $PRECOMPILES_FEATURES -p precompiles -- --test-threads=1 $PRECOMPILES_TESTS
       - uses: actions/upload-artifact@v4
         with:
           name: bench-base-results
           path: |
             base_block_*.bench
+            base_block_*_blobs.bench
             base_block_*.out
+            base_block_*_blobs.out
+            base_block_*_precompile_stats.csv
             base_precompiles.bench
+            base_precompile_stats.csv
             opcode_samples/
             opcode_cycles/
 
@@ -86,7 +140,7 @@ jobs:
   bench-head:
     name: Bench head
     needs: merge-base
-    runs-on: [matterlabs-ci-runner-highmem]
+    runs-on: [matterlabs-ci-runner-high-performance]
     permissions:
       contents: read
     steps:
@@ -99,31 +153,75 @@ jobs:
       - uses: ./.github/actions/setup-toolchain
       - name: Compile for RISC-V
         working-directory: ./zksync_os
+        # Same detection as in bench-base: use the
+        # `for-tests-benchmarking-pectra` binary type when the checked-out
+        # `dump_bin.sh` supports it, otherwise fall back to the non-pectra
+        # type. The head normally has pectra support, so the fallback is
+        # defensive; it keeps this step symmetric with bench-base, where
+        # the merge-base may predate the new type.
         run: |
-          ./dump_bin.sh --type for-tests-benchmarking
+          if grep -q "for-tests-benchmarking-pectra" dump_bin.sh; then
+            ./dump_bin.sh --type for-tests-benchmarking-pectra
+          else
+            ./dump_bin.sh --type for-tests-benchmarking
+          fi
           ./dump_bin.sh --type evm-replay-benchmarking
       - name: Run benchmarks
         shell: bash
+        # See bench-base for the rationale on profile/feature/test-filter
+        # detection. Mirrored here so the same fallback applies if/when the
+        # head also lacks one of these (defensive — usually head has them).
         run: |
+          if grep -q "bench-fast" Cargo.toml; then
+            PROFILE="--profile bench-fast"
+          else
+            PROFILE="--release"
+          fi
+          if grep -q "for-tests-benchmarking-pectra" zksync_os/dump_bin.sh; then
+            PRECOMPILES_FEATURES="rig/no_print,precompiles/cycle_marker,precompiles/pectra,rig/unlimited_native"
+            PRECOMPILES_TESTS="test_precompiles test_pectra_precompiles test_kzg_regression"
+          else
+            PRECOMPILES_FEATURES="rig/no_print,precompiles/cycle_marker,rig/unlimited_native"
+            PRECOMPILES_TESTS="test_precompiles"
+          fi
           for dir in tests/instances/eth_runner/blocks/*; do
             blk=$(basename "$dir")
+            # Pass 1: default DA scheme (BlobsAndPubdataKeccak256).
             OPCODE_SAMPLES_DIR=$(pwd)/opcode_samples/head_${blk} \
             OPCODE_CYCLE_SAMPLES_DIR=$(pwd)/opcode_cycles/head_${blk} \
             OPCODE_STATS_PATH=$(pwd)/head_block_${blk}_opcode_stats.csv \
             MARKER_PATH=$(pwd)/head_block_${blk}.bench \
-            cargo run --manifest-path tests/instances/eth_runner/Cargo.toml --release --features rig/no_print,rig/cycle_marker,rig/unlimited_native -- single-run --block-dir "$dir" --opcode-stats > head_block_${blk}.out
+            PRECOMPILE_STATS_PATH=$(pwd)/head_block_${blk}_precompile_stats.csv \
+            PRECOMPILE_SAMPLES_DIR=$(pwd)/precompile_samples/head_${blk} \
+            LABEL_CYCLE_SAMPLES_DIR=$(pwd)/precompile_cycles/head_${blk} \
+            cargo run --manifest-path tests/instances/eth_runner/Cargo.toml $PROFILE --features rig/no_print,rig/cycle_marker,rig/unlimited_native -- single-run --block-dir "$dir" --opcode-stats > head_block_${blk}.out
+            # Pass 2: BlobsZKsyncOS — only cycle markers captured.
+            if grep -q "BENCH_DA_SCHEME" tests/instances/eth_runner/src/single_run.rs; then
+              BENCH_DA_SCHEME=blobs_zksync_os \
+              MARKER_PATH=$(pwd)/head_block_${blk}_blobs.bench \
+              cargo run --manifest-path tests/instances/eth_runner/Cargo.toml $PROFILE --features rig/no_print,rig/cycle_marker,rig/unlimited_native -- single-run --block-dir "$dir" > head_block_${blk}_blobs.out
+            fi
           done
-          MARKER_PATH=$(pwd)/head_precompiles.bench cargo test --release --features rig/no_print,precompiles/cycle_marker,rig/unlimited_native -p precompiles -- test_precompiles
+          # See bench-base step for filter-substring rationale.
+          MARKER_PATH=$(pwd)/head_precompiles.bench PRECOMPILE_STATS_PATH=$(pwd)/head_precompile_stats.csv PRECOMPILE_SAMPLES_DIR=$(pwd)/head_precompile_samples LABEL_CYCLE_SAMPLES_DIR=$(pwd)/head_precompile_cycles cargo test $PROFILE --features $PRECOMPILES_FEATURES -p precompiles -- --test-threads=1 $PRECOMPILES_TESTS
       - uses: actions/upload-artifact@v4
         with:
           name: bench-head-results
           path: |
             head_block_*.bench
+            head_block_*_blobs.bench
             head_block_*.out
+            head_block_*_blobs.out
             head_block_*_opcode_stats.csv
+            head_block_*_precompile_stats.csv
             head_precompiles.bench
+            head_precompile_stats.csv
+            head_precompile_samples/
+            head_precompile_cycles/
             opcode_samples/
             opcode_cycles/
+            precompile_samples/
+            precompile_cycles/
 
   # Compare base and head results, post comment.
   compare:
@@ -158,33 +256,103 @@ jobs:
           # Move all files to the workspace root so scripts find them by name.
           cp base-results/base_block_*.bench base-results/base_block_*.out base-results/base_precompiles.bench . 2>/dev/null || true
           cp head-results/head_block_*.bench head-results/head_block_*.out head-results/head_precompiles.bench . 2>/dev/null || true
+          cp base-results/base_precompile_stats.csv . 2>/dev/null || true
+          cp head-results/head_precompile_stats.csv . 2>/dev/null || true
+          cp base-results/base_block_*_precompile_stats.csv . 2>/dev/null || true
+          cp head-results/head_block_*_precompile_stats.csv . 2>/dev/null || true
           cp head-results/head_block_*_opcode_stats.csv . 2>/dev/null || true
+          # Precompile per-execution samples + cycles are head-only (base
+          # currently has no per-block tracer wiring); the comparison join
+          # consumes only the head-side data.
+          cp -r head-results/head_precompile_samples . 2>/dev/null || true
+          cp -r head-results/head_precompile_cycles . 2>/dev/null || true
           # Merge opcode_samples and opcode_cycles directories
           mkdir -p opcode_samples opcode_cycles
           cp -r base-results/opcode_samples/* opcode_samples/ 2>/dev/null || true
           cp -r head-results/opcode_samples/* opcode_samples/ 2>/dev/null || true
           cp -r base-results/opcode_cycles/* opcode_cycles/ 2>/dev/null || true
           cp -r head-results/opcode_cycles/* opcode_cycles/ 2>/dev/null || true
+          # Per-block precompile samples and cycles (head only).
+          mkdir -p precompile_samples precompile_cycles
+          cp -r head-results/precompile_samples/* precompile_samples/ 2>/dev/null || true
+          cp -r head-results/precompile_cycles/* precompile_cycles/ 2>/dev/null || true
 
       - name: Generate comparison
         shell: bash
         id: comparison
         run: |
           mkdir -p bench_results
-          pairs=""
+          # Build three separate pair lists so the resulting PR comment has a
+          # clear top-level structure:
+          #   - Headline:    `process_block` per (block, DA scheme) — the rows
+          #                  reviewers should always see.
+          #   - Sub-phases:  `system_init`, `run_tx_loop`, `da_commitment`,
+          #                  `state_commitment_update`, `blob_versioned_hash`.
+          #                  Useful when a regression sits inside one of
+          #                  these stages; otherwise they are visual noise,
+          #                  so the section is collapsed under <details>.
+          #   - Precompiles bench: the synthetic test-crate workload, which
+          #                  expands to ~30 labels — collapsed under <details>.
+          headline_pairs=""
+          subphase_pairs=""
+          # The default DA scheme run (BlobsAndPubdataKeccak256) gets all four
+          # sub-phases. The BlobsZKsyncOS pass only differs in the post-tx-op
+          # stage (the tx loop is identical), so we surface only the rows that
+          # actually change: `da_commitment`, `state_commitment_update`, and `blob_versioned_hash`.
+          subphase_symbols_keccak="system_init run_tx_loop da_commitment state_commitment_update"
+          subphase_symbols_blobs="da_commitment state_commitment_update blob_versioned_hash"
+          add_pair() {
+            # $1: name of the list variable to extend; $2: entry to append.
+            local list_var="$1" entry="$2"
+            if [ -n "${!list_var}" ]; then
+              entry="${!list_var},${entry}"
+            fi
+            printf -v "$list_var" '%s' "$entry"
+          }
           for dir in tests/instances/eth_runner/blocks/*; do
             blk=$(basename "$dir")
             python3 bench_scripts/parse_opcodes.py base_block_${blk}.out bench_results/base_block_${blk}.csv bench_results/base_block_${blk}.png
             python3 bench_scripts/parse_opcodes.py head_block_${blk}.out bench_results/head_block_${blk}.csv bench_results/head_block_${blk}.png
-            if [ -z "$pairs" ]; then
-              pairs="(\"block_${blk}\", \"base_block_${blk}.bench\", \"head_block_${blk}.bench\", \"process_block\")"
-            else
-              pairs="${pairs},(\"block_${blk}\", \"base_block_${blk}.bench\", \"head_block_${blk}.bench\", \"process_block\")"
+            add_pair headline_pairs "(\"block_${blk} (keccak DA)\", \"base_block_${blk}.bench\", \"head_block_${blk}.bench\", \"process_block\")"
+            for sym in $subphase_symbols_keccak; do
+              add_pair subphase_pairs "(\"block_${blk} (keccak DA)\", \"base_block_${blk}.bench\", \"head_block_${blk}.bench\", \"${sym}\")"
+            done
+            # When the merge-base predates `BENCH_DA_SCHEME` plumbing the
+            # bench-base job emits no blobs `.bench` file. Fall back to
+            # comparing the head's blobs file against itself so the absolute
+            # values are still visible in the PR comment. Deltas then read 0%
+            # until a merge-base that contains the plumbing is available.
+            if [ -f "head_block_${blk}_blobs.bench" ]; then
+              if [ -f "base_block_${blk}_blobs.bench" ]; then
+                base_blob="base_block_${blk}_blobs.bench"
+              else
+                base_blob="head_block_${blk}_blobs.bench"
+              fi
+              add_pair headline_pairs "(\"block_${blk} (blobs DA)\", \"${base_blob}\", \"head_block_${blk}_blobs.bench\", \"process_block\")"
+              for sym in $subphase_symbols_blobs; do
+                add_pair subphase_pairs "(\"block_${blk} (blobs DA)\", \"${base_blob}\", \"head_block_${blk}_blobs.bench\", \"${sym}\")"
+              done
             fi
           done
-          pairs="${pairs},(\"precompiles\", \"base_precompiles.bench\", \"head_precompiles.bench\")"
-          # Save comparison to file (for artifact + comment workflow fallback)
-          python3 bench_scripts/compare_bench.py "[${pairs}]" > bench_results/comparison.md
+          precompiles_pair="(\"precompiles\", \"base_precompiles.bench\", \"head_precompiles.bench\")"
+          # Headline section: process_block per (block, DA scheme).
+          echo "## Block-level effective cycles" > bench_results/comparison.md
+          echo "" >> bench_results/comparison.md
+          python3 bench_scripts/compare_bench.py --no-title "[${headline_pairs}]" >> bench_results/comparison.md
+          # Block sub-phases collapsed under <details>.
+          echo "" >> bench_results/comparison.md
+          echo "<details><summary>Block-level sub-phases</summary>" >> bench_results/comparison.md
+          echo "" >> bench_results/comparison.md
+          python3 bench_scripts/compare_bench.py --no-title --sort-by-symbol "[${subphase_pairs}]" >> bench_results/comparison.md
+          echo "" >> bench_results/comparison.md
+          echo "</details>" >> bench_results/comparison.md
+          # Synthetic precompiles test-crate bench collapsed under <details>.
+          echo "" >> bench_results/comparison.md
+          echo "<details><summary>Precompiles test-crate bench (synthetic workload, all labels)</summary>" >> bench_results/comparison.md
+          echo "" >> bench_results/comparison.md
+          python3 bench_scripts/compare_bench.py --no-title "[${precompiles_pair}]" >> bench_results/comparison.md
+          echo "" >> bench_results/comparison.md
+          echo "</details>" >> bench_results/comparison.md
           # Collect file lists for aggregated per-opcode comparison
           stats_args=""
           cycle_args=""
@@ -199,15 +367,74 @@ jobs:
             stats_sample_args="$stats_sample_args $(pwd)/opcode_samples/base_${blk} $(pwd)/opcode_samples/head_${blk}"
             cycle_sample_args="$cycle_sample_args $(pwd)/opcode_samples/base_${blk} $(pwd)/opcode_cycles/base_${blk} $(pwd)/opcode_samples/head_${blk} $(pwd)/opcode_cycles/head_${blk}"
           done
+          # Per-opcode changes — both scripts emit nothing if nothing moved.
+          echo "" >> bench_results/comparison.md
+          echo "## Per-opcode" >> bench_results/comparison.md
           # Aggregated per-opcode EVM stats (gas/native/ratios)
-          python3 bench_scripts/compare_opcode_stats.py $stats_args \
+          if ! python3 bench_scripts/compare_opcode_stats.py $stats_args \
             --sample-dirs $stats_sample_args \
-            >> bench_results/comparison.md 2>/dev/null || true
+            >> bench_results/comparison.md; then
+            echo "" >> bench_results/comparison.md
+            echo "_Per-opcode gas/native diff generation failed; see CI logs._" >> bench_results/comparison.md
+          fi
           # Aggregated per-opcode RISC-V cycles (and cycles/gas ratios)
-          python3 bench_scripts/compare_opcode_cycles.py $cycle_args \
+          if ! python3 bench_scripts/compare_opcode_cycles.py $cycle_args \
             --gas-stats $gas_args \
             --sample-dirs $cycle_sample_args \
-            >> bench_results/comparison.md 2>/dev/null || true
+            >> bench_results/comparison.md; then
+            echo "" >> bench_results/comparison.md
+            echo "_Per-opcode cycles diff generation failed; see CI logs._" >> bench_results/comparison.md
+          fi
+          # Per-execution precompile join inputs: aggregate the per-block
+          # tracer sample dirs and cycle sample dirs into positional pairs
+          # for `join_precompile_samples.py`. `--opcode-samples-dir`
+          # provides gas/native for synthetic precompile entries (currently
+          # `keccak` sourced from `SHA3.samples`).
+          join_pairs="head_precompile_samples head_precompile_cycles"
+          join_bench_args="--bench-file /dev/null"
+          join_opcode_args="--opcode-samples-dir /dev/null"
+          if [ -f head_precompiles.bench ]; then
+            join_bench_args="--bench-file head_precompiles.bench"
+          fi
+          # Slot 0 is the test-crate pair: both option lists are seeded with a
+          # `/dev/null` placeholder (as in the per-block loop) to stay aligned.
+          # NOTE(review): confirm `opcode_samples` is the dir the script expects.
+          if [ -d head_precompile_samples ]; then
+            join_opcode_args="--opcode-samples-dir opcode_samples"
+          fi
+          for dir in tests/instances/eth_runner/blocks/*; do
+            blk=$(basename "$dir")
+            if [ -d "precompile_samples/head_${blk}" ] && [ -d "precompile_cycles/head_${blk}" ]; then
+              join_pairs="$join_pairs precompile_samples/head_${blk} precompile_cycles/head_${blk}"
+              # Each --bench-file / --opcode-samples-dir is matched positionally
+              # to its (tracer_dir, cycles_dir) pair.
+              if [ -f "head_block_${blk}.bench" ]; then
+                join_bench_args="$join_bench_args --bench-file head_block_${blk}.bench"
+              else
+                join_bench_args="$join_bench_args --bench-file /dev/null"
+              fi
+              if [ -d "opcode_samples/head_${blk}" ]; then
+                join_opcode_args="$join_opcode_args --opcode-samples-dir opcode_samples/head_${blk}"
+              else
+                join_opcode_args="$join_opcode_args --opcode-samples-dir /dev/null"
+              fi
+            fi
+          done
+          # Per-execution precompile cycles/gas + native/gas (joined),
+          # aggregated across the test crate + all block benchmarks.
+          echo "" >> bench_results/comparison.md
+          echo "## Per-precompile" >> bench_results/comparison.md
+          echo "" >> bench_results/comparison.md
+          echo "<details><summary>Per-precompile per-execution ratios (head)</summary>" >> bench_results/comparison.md
+          echo "" >> bench_results/comparison.md
+          echo '```' >> bench_results/comparison.md
+          if ! python3 bench_scripts/join_precompile_samples.py $join_pairs $join_bench_args $join_opcode_args --summary \
+            >> bench_results/comparison.md; then
+            echo "(per-execution ratios generation failed; see CI logs)" >> bench_results/comparison.md
+          fi
+          echo '```' >> bench_results/comparison.md
+          echo "" >> bench_results/comparison.md
+          echo "</details>" >> bench_results/comparison.md
           # Also write to step output for direct comment
           EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64)
           echo "result<<$EOF" >> $GITHUB_OUTPUT
@@ -217,17 +444,20 @@ jobs:
       - name: Collect per-opcode artifacts
         shell: bash
         run: |
+          # These produce supplementary CSV artifacts (not PR-comment
+          # content). On failure we let the job continue (`|| true`) but
+          # send stderr to the log so failures are visible during debug.
           for dir in tests/instances/eth_runner/blocks/*; do
             blk=$(basename "$dir")
             # Combined gas+native+cycles stats CSV
             python3 bench_scripts/join_opcode_stats.py \
               head_block_${blk}.out head_block_${blk}.bench \
-              --csv bench_results/head_block_${blk}_joined_stats.csv 2>/dev/null || true
+              --csv bench_results/head_block_${blk}_joined_stats.csv || true
             # Per-execution joined CSVs
             if [ -d "opcode_samples/head_${blk}" ] && [ -d "opcode_cycles/head_${blk}" ]; then
               python3 bench_scripts/join_samples.py \
                 opcode_samples/head_${blk} opcode_cycles/head_${blk} \
-                --out-dir bench_results/per_opcode/block_${blk} 2>/dev/null || true
+                --out-dir bench_results/per_opcode/block_${blk} || true
             fi
           done
           # Generate visualization charts
diff --git a/Cargo.toml b/Cargo.toml
index 6b163d8a9..a6b87f7f7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -115,6 +115,28 @@ lto = true
 codegen-units = 1
 debug = true
 
+# Fast-compile variant used by the bench CI for in-workspace bench
+# targets — specifically `cargo test -p precompiles --profile bench-fast`.
+# `tests/instances/eth_runner/` is excluded from this workspace (see
+# `workspace.exclude` above), so its `cargo run --manifest-path …` uses
+# the duplicate `[profile.bench-fast]` defined in
+# `tests/instances/eth_runner/Cargo.toml`; keep the two in sync.
+# Runtime perf of these driver binaries doesn't affect measurements
+# (cycle counts come from the RISC-V simulator), so disabling fat LTO
+# and parallelizing codegen cuts "Run benchmarks" compile time by a
+# large factor. The RISC-V proving binary is yet another workspace at
+# `zksync_os/Cargo.toml` and uses its own `[profile.release]` — also
+# unaffected.
+# NOTE: the literal string `bench-fast` is used as a `grep -q` fallback
+# target by `.github/workflows/bench.yml` — if this profile name is
+# changed, update the workflow too.
+[profile.bench-fast]
+inherits = "release"
+opt-level = 3
+lto = false
+codegen-units = 16
+debug = false
+
 [patch.crates-io]
 #zksync_os_evm_errors = { path = "../zksync-os-interface/crates/evm-errors" }
 #zksync_os_interface = { path = "../zksync-os-interface/crates/interface" }
diff --git a/basic_bootloader/src/bootloader/block_flow/ethereum/post_tx_op_proving.rs b/basic_bootloader/src/bootloader/block_flow/ethereum/post_tx_op_proving.rs
index ea0397607..2985faf34 100644
--- a/basic_bootloader/src/bootloader/block_flow/ethereum/post_tx_op_proving.rs
+++ b/basic_bootloader/src/bootloader/block_flow/ethereum/post_tx_op_proving.rs
@@ -119,9 +119,9 @@ where
             &initial_state_commitment
         );
 
-        // // 3. Verify/apply reads and writes
+        // 3. Verify/apply reads and writes — state-tree merkle commit.
         let mut updated_state_commitment = initial_state_commitment;
-        cycle_marker::wrap!("verify_and_apply_batch", {
+        cycle_marker::wrap!("state_commitment_update", {
             io.update_commitment(
                 Some(&mut updated_state_commitment),
                 &mut logger,
diff --git a/basic_bootloader/src/bootloader/block_flow/ethereum/post_tx_op_sequencing.rs b/basic_bootloader/src/bootloader/block_flow/ethereum/post_tx_op_sequencing.rs
index 9a2341f33..3c77410ef 100644
--- a/basic_bootloader/src/bootloader/block_flow/ethereum/post_tx_op_sequencing.rs
+++ b/basic_bootloader/src/bootloader/block_flow/ethereum/post_tx_op_sequencing.rs
@@ -121,8 +121,8 @@ where
         // Events
         result_keeper.events(io.events_iterator());
 
-        // // 3. Verify/apply reads and writes
-        cycle_marker::wrap!("verify_and_apply_batch", {
+        // 3. Verify/apply reads and writes
+        cycle_marker::wrap!("state_commitment_update", {
             io.update_commitment(None, &mut logger, result_keeper);
         });
 
diff --git a/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_proving_multiblock_batch.rs b/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_proving_multiblock_batch.rs
index ef685ffa6..f3893cf59 100644
--- a/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_proving_multiblock_batch.rs
+++ b/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_proving_multiblock_batch.rs
@@ -96,17 +96,20 @@ where
                 da_commitment_scheme
             );
         }
-        write_pubdata(
-            batch_data
-                .da_commitment_generator
-                .as_mut()
-                .unwrap()
-                .as_mut(),
-            result_keeper,
-            block_hash,
-            metadata.block_timestamp(),
-            &mut io,
-        );
+        // See `post_tx_op_proving_singleblock_batch.rs` for the rationale.
+        cycle_marker::wrap!("da_commitment", {
+            write_pubdata(
+                batch_data
+                    .da_commitment_generator
+                    .as_mut()
+                    .unwrap()
+                    .as_mut(),
+                result_keeper,
+                block_hash,
+                metadata.block_timestamp(),
+                &mut io,
+            );
+        });
 
         io.logs_storage
             .apply_to_array_vec(&mut batch_data.logs_storage);
@@ -142,8 +145,8 @@ where
             last_block_timestamp,
         };
 
-        // 3. Verify/apply reads and writes
-        cycle_marker::wrap!("verify_and_apply_batch", {
+        // 3. Verify/apply reads and writes — state-tree merkle commit.
+        cycle_marker::wrap!("state_commitment_update", {
             IOTeardown::<_>::update_commitment(
                 &mut io,
                 Some(&mut state_commitment),
diff --git a/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_proving_singleblock_batch.rs b/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_proving_singleblock_batch.rs
index e3728b873..94d81e6b1 100644
--- a/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_proving_singleblock_batch.rs
+++ b/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_proving_singleblock_batch.rs
@@ -88,13 +88,22 @@ where
         let mut da_commitment_generator =
             da_commitment_generator_from_scheme(io.da_commitment_scheme.unwrap(), A::default())
                 .unwrap();
-        write_pubdata(
-            da_commitment_generator.as_mut(),
-            result_keeper,
-            block_hash,
-            metadata.block_timestamp(),
-            &mut io,
-        );
+        // For keccak DA (`BlobsAndPubdataKeccak256`), `write_pubdata` streams
+        // bytes through `Keccak256CommitmentGenerator`, which absorbs them
+        // into the keccak state — this is where the bulk of keccak
+        // delegations fire on the DA-commit path. For blob DA
+        // (`BlobsZKsyncOS`) the same call just appends to a buffer (no
+        // hashing yet); the actual blob KZG work happens in `.finalize()`
+        // below and is already captured by the `blob_versioned_hash` marker.
+        cycle_marker::wrap!("da_commitment", {
+            write_pubdata(
+                da_commitment_generator.as_mut(),
+                result_keeper,
+                block_hash,
+                metadata.block_timestamp(),
+                &mut io,
+            );
+        });
 
         let (multichain_root, settlement_layer_chain_id) = read_batch_context_inputs(&mut io);
 
@@ -152,8 +161,10 @@ where
             chain_state_commitment_before
         );
 
-        // update state commitment
-        cycle_marker::wrap!("verify_and_apply_batch", {
+        // update state commitment — this is the state-tree merkle commit
+        // (Blake-heavy). Distinct from `da_commitment` (keccak/blob over
+        // pubdata) and `blob_versioned_hash` (KZG per blob).
+        cycle_marker::wrap!("state_commitment_update", {
             IOTeardown::<_>::update_commitment(
                 &mut io,
                 Some(&mut state_commitment),
diff --git a/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_sequencing.rs b/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_sequencing.rs
index 087c06118..b0a1ff82b 100644
--- a/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_sequencing.rs
+++ b/basic_bootloader/src/bootloader/block_flow/zk/post_tx_op/post_tx_op_sequencing.rs
@@ -73,15 +73,20 @@ where
         result_keeper.logs(io.logs_storage.messages_ref_iter());
         result_keeper.events(io.events_storage.events_ref_iter());
 
-        write_pubdata(
-            &mut NopCommitmentGenerator,
-            result_keeper,
-            block_hash,
-            metadata.block_timestamp(),
-            &mut io,
-        );
+        // Sequencing-mode post-op uses NopCommitmentGenerator (no DA work),
+        // but we still mark `da_commitment` for parity with the proving
+        // paths so the bench label set is consistent across STFs.
+        cycle_marker::wrap!("da_commitment", {
+            write_pubdata(
+                &mut NopCommitmentGenerator,
+                result_keeper,
+                block_hash,
+                metadata.block_timestamp(),
+                &mut io,
+            );
+        });
 
-        cycle_marker::wrap!("verify_and_apply_batch", {
+        cycle_marker::wrap!("state_commitment_update", {
             io.update_commitment(None, &mut logger, result_keeper);
         });
         Ok(())
diff --git a/basic_system/src/system_functions/bls12_381/pairing.rs b/basic_system/src/system_functions/bls12_381/pairing.rs
index 30b45f70f..d843deca6 100644
--- a/basic_system/src/system_functions/bls12_381/pairing.rs
+++ b/basic_system/src/system_functions/bls12_381/pairing.rs
@@ -1,5 +1,6 @@
 use super::*;
 use alloc::vec::Vec;
+use crypto::ark_ec::AffineRepr;
 use crypto::{ark_ec::pairing::Pairing, bls12_381::curves::Bls12_381};
 use zk_ee::{
     out_of_return_memory,
@@ -78,25 +79,159 @@ fn bls12_381_pairing_as_system_function_inner<
                 .try_into()
                 .unwrap(),
         )?;
+        // e(O, Q) = e(P, O) = 1 in the target field, so degenerate pairs do not
+        // affect the multi-pairing product. Skip them after subgroup validation
+        // to save the per-pair Miller-loop precomputation that dominates the
+        // cost on Pectra degenerate inputs.
+        if g1.is_zero() || g2.is_zero() {
+            continue;
+        }
         g1_points.push(g1);
         g2_points.push(g2);
     }
 
-    let pairing_result = <Bls12_381 as Pairing>::multi_pairing(g1_points, g2_points);
     output
         .try_extend([0u8; 31])
         .map_err(|_| out_of_return_memory!())?;
 
     use crypto::ark_ff::Field;
-    if pairing_result.0 == <Bls12_381 as Pairing>::TargetField::ONE {
-        output
-            .try_extend([1u8])
-            .map_err(|_| out_of_return_memory!())?;
+    let success = if g1_points.is_empty() {
+        // Empty product equals the identity in the target field.
+        true
     } else {
-        output
-            .try_extend([0u8])
-            .map_err(|_| out_of_return_memory!())?;
-    }
+        let pairing_result = <Bls12_381 as Pairing>::multi_pairing(g1_points, g2_points);
+        pairing_result.0 == <Bls12_381 as Pairing>::TargetField::ONE
+    };
+
+    output
+        .try_extend([success as u8])
+        .map_err(|_| out_of_return_memory!())?;
 
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use core::ops::Neg;
+    use crypto::bls12_381::eip2537::{serialize_g1_bytes, serialize_g2_bytes};
+    use zk_ee::reference_implementations::{BaseResources, DecreasingNative};
+    use zk_ee::system::Resource;
+
+    fn encode_g1(point: G1Affine) -> [u8; G1_SERIALIZATION_LEN] {
+        let mut buf = [0u8; G1_SERIALIZATION_LEN];
+        serialize_g1_bytes(point, &mut buf);
+        buf
+    }
+
+    fn encode_g2(point: G2Affine) -> [u8; G2_SERIALIZATION_LEN] {
+        let mut buf = [0u8; G2_SERIALIZATION_LEN];
+        serialize_g2_bytes(point, &mut buf);
+        buf
+    }
+
+    fn encode_pair(g1: G1Affine, g2: G2Affine) -> [u8; BLS12_381_PAIR_LEN] {
+        let mut buf = [0u8; BLS12_381_PAIR_LEN];
+        buf[..G1_SERIALIZATION_LEN].copy_from_slice(&encode_g1(g1));
+        buf[G1_SERIALIZATION_LEN..].copy_from_slice(&encode_g2(g2));
+        buf
+    }
+
+    fn run(input: &[u8]) -> Vec<u8> {
+        // FORMAL_INFINITE resources: presumably unmetered, so only the output bytes matter.
+        let mut res = <BaseResources<DecreasingNative> as Resource>::FORMAL_INFINITE;
+        let mut out: Vec<u8> = Vec::new();
+        Bls12381PairingCheckPrecompile::execute(input, &mut out, &mut res, std::alloc::Global)
+            .expect("precompile should succeed on well-formed input");
+        out
+    }
+
+    fn expect_check(input: &[u8], expected_true: bool) {
+        // Expected output: 31 zero bytes followed by the 0/1 verdict byte.
+        let mut want = [0u8; 32];
+        want[31] = u8::from(expected_true);
+        assert_eq!(run(input).as_slice(), &want[..]);
+    }
+
+    #[test]
+    fn single_pair_both_infinity_returns_true() {
+        let input = [0u8; BLS12_381_PAIR_LEN];
+        expect_check(&input, true);
+    }
+
+    #[test]
+    fn single_pair_g1_infinity_returns_true() {
+        let mut input = [0u8; BLS12_381_PAIR_LEN];
+        input[G1_SERIALIZATION_LEN..].copy_from_slice(&encode_g2(G2Affine::generator()));
+        expect_check(&input, true);
+    }
+
+    #[test]
+    fn single_pair_g2_infinity_returns_true() {
+        let mut input = [0u8; BLS12_381_PAIR_LEN];
+        input[..G1_SERIALIZATION_LEN].copy_from_slice(&encode_g1(G1Affine::generator()));
+        expect_check(&input, true);
+    }
+
+    #[test]
+    fn many_infinity_pairs_return_true() {
+        let input = vec![0u8; 7 * BLS12_381_PAIR_LEN];
+        expect_check(&input, true);
+    }
+
+    #[test]
+    fn nontrivial_pair_returns_false_and_infinity_does_not_mask_it() {
+        // e(G1, G2) is the BLS12-381 generator pairing, which is not 1.
+        let nontrivial = encode_pair(G1Affine::generator(), G2Affine::generator());
+        expect_check(&nontrivial, false);
+
+        // Appending degenerate pairs must not flip the result to true.
+        let mut with_inf = nontrivial.to_vec();
+        with_inf.extend_from_slice(&[0u8; BLS12_381_PAIR_LEN]);
+        expect_check(&with_inf, false);
+
+        let mut prefixed = vec![0u8; BLS12_381_PAIR_LEN];
+        prefixed.extend_from_slice(&nontrivial);
+        expect_check(&prefixed, false);
+    }
+
+    #[test]
+    fn balanced_pair_returns_true_with_or_without_infinity_padding() {
+        // e(G1, G2) * e(-G1, G2) = e(G1, G2) * e(G1, G2)^{-1} = 1
+        let g1 = G1Affine::generator();
+        let g2 = G2Affine::generator();
+        let balanced_a = encode_pair(g1, g2);
+        let balanced_b = encode_pair(g1.neg(), g2);
+
+        let mut balanced = balanced_a.to_vec();
+        balanced.extend_from_slice(&balanced_b);
+        expect_check(&balanced, true);
+
+        // Interleaving degenerate pairs must keep the result true.
+        let mut interleaved = vec![0u8; BLS12_381_PAIR_LEN];
+        interleaved.extend_from_slice(&balanced_a);
+        interleaved.extend_from_slice(&[0u8; BLS12_381_PAIR_LEN]);
+        interleaved.extend_from_slice(&balanced_b);
+        interleaved.extend_from_slice(&[0u8; BLS12_381_PAIR_LEN]);
+        expect_check(&interleaved, true);
+    }
+
+    #[test]
+    fn malformed_nonzero_encoding_is_still_rejected() {
+        // A G1 input where the y-coordinate is forced to zero with a non-zero x
+        // is not on the curve and must not be accepted as the point at infinity.
+        // This guards against any future refactor that filters before parsing.
+        let mut input = [0u8; BLS12_381_PAIR_LEN];
+        // x = 1: the G1 slot is x || y (EIP-2537), so x's last big-endian byte ends the first half.
+        input[G1_SERIALIZATION_LEN / 2 - 1] = 1;
+        // y stays zero. G2 stays at infinity (irrelevant once G1 parse fails).
+        let allocator = std::alloc::Global;
+        let mut resource = <BaseResources<DecreasingNative> as Resource>::FORMAL_INFINITE;
+        let mut dst: Vec<u8> = Vec::new();
+        let err =
+            Bls12381PairingCheckPrecompile::execute(&input, &mut dst, &mut resource, allocator)
+                .expect_err("invalid G1 encoding must be rejected");
+        // Sanity: we got an error rather than silently treating it as infinity.
+        let _ = err;
+    }
+}
diff --git a/bench_scripts/bench.sh b/bench_scripts/bench.sh
index 498e825b0..4bee4e55a 100755
--- a/bench_scripts/bench.sh
+++ b/bench_scripts/bench.sh
@@ -45,11 +45,16 @@ run_block() {
     local block_samples_dir="$output_dir/opcode_samples/block_${blk}"
     local block_cycles_dir="$output_dir/opcode_cycles/block_${blk}"
     local block_stats_path="$output_dir/opcode_stats/block_${blk}.csv"
+    local precompile_stats_path="$output_dir/block_${blk}_precompile_stats.csv"
+    local precompile_samples_dir="$output_dir/precompile_samples/block_${blk}"
+    local precompile_cycles_dir="$output_dir/precompile_cycles/block_${blk}"
 
     rm -rf "$block_samples_dir" "$block_cycles_dir"
-    rm -f "$block_stats_path"
+    rm -rf "$precompile_samples_dir" "$precompile_cycles_dir"
+    rm -f "$block_stats_path" "$precompile_stats_path"
 
     mkdir -p "$output_dir/opcode_samples" "$output_dir/opcode_cycles" "$output_dir/opcode_stats"
+    mkdir -p "$output_dir/precompile_samples" "$output_dir/precompile_cycles"
 
     echo "==> Benchmarking block $blk..."
     ZKSYNC_RISC_V_RUN=true \
@@ -57,6 +62,9 @@ run_block() {
     OPCODE_CYCLE_SAMPLES_DIR="$block_cycles_dir" \
     OPCODE_STATS_PATH="$block_stats_path" \
     MARKER_PATH="$output_dir/block_${blk}.bench" \
+    PRECOMPILE_STATS_PATH="$precompile_stats_path" \
+    PRECOMPILE_SAMPLES_DIR="$precompile_samples_dir" \
+    LABEL_CYCLE_SAMPLES_DIR="$precompile_cycles_dir" \
     cargo run --manifest-path "$ETH_RUNNER_MANIFEST" \
         --release -j 3 \
         --features "$FEATURES" \
@@ -67,9 +75,21 @@ run_block() {
 run_precompiles() {
     local output_dir="$1"
 
+    # Use a dedicated sub-namespace so we don't clobber per-block dirs that
+    # `run_all_blocks` already wrote under $output_dir/precompile_{samples,cycles}/block_*.
+    local samples_dir="$output_dir/precompile_samples/test_precompiles"
+    local cycles_dir="$output_dir/precompile_cycles/test_precompiles"
+
+    # Clean only our subdir so per-block artifacts survive.
+    rm -rf "$samples_dir" "$cycles_dir"
+    mkdir -p "$samples_dir" "$cycles_dir"
+
     echo "==> Benchmarking precompiles..."
     ZKSYNC_RISC_V_RUN=true \
     MARKER_PATH="$output_dir/precompiles.bench" \
+    PRECOMPILE_STATS_PATH="$output_dir/precompile_stats.csv" \
+    PRECOMPILE_SAMPLES_DIR="$samples_dir" \
+    LABEL_CYCLE_SAMPLES_DIR="$cycles_dir" \
     cargo test --manifest-path "$PRECOMPILE_MANIFEST" \
         --release -j 3 \
         --features "$PRECOMPILE_FEATURES" \
@@ -77,6 +97,53 @@ run_precompiles() {
         > "$output_dir/precompiles.out" 2>&1
 }
 
+join_precompile_samples_run() {
+    local output_dir="$1"  # run directory holding precompile_{samples,cycles}/ and the *.bench files
+
+    local pairs=()       # flat positional list: samples_dir cycles_dir [samples_dir cycles_dir ...]
+    local bench_args=()  # one `--bench-file <path>` per pair, appended in the same order as `pairs`
+
+    # Test-crate cycle bench (test_precompiles). Lives in its own subdir so
+    # it doesn't collide with the per-block subdirs that share the parent.
+    local tc_samples="$output_dir/precompile_samples/test_precompiles"
+    local tc_cycles="$output_dir/precompile_cycles/test_precompiles"
+    if [ -d "$tc_samples" ] && [ -d "$tc_cycles" ]; then  # only sources with both dirs are joined
+        pairs+=("$tc_samples" "$tc_cycles")
+        if [ -f "$output_dir/precompiles.bench" ]; then
+            bench_args+=(--bench-file "$output_dir/precompiles.bench")
+        else
+            bench_args+=(--bench-file /dev/null)  # placeholder; presumably keeps bench files aligned with pairs
+        fi
+    fi
+
+    # Per-block eth_runner bench (real workloads): one source per directory under $BLOCKS_DIR.
+    for dir in "$BLOCKS_DIR"/*/; do
+        local blk
+        blk="$(basename "$dir")"
+        local p_samples="$output_dir/precompile_samples/block_${blk}"
+        local p_cycles="$output_dir/precompile_cycles/block_${blk}"
+        local p_bench="$output_dir/block_${blk}.bench"
+        if [ -d "$p_samples" ] && [ -d "$p_cycles" ]; then  # skip blocks missing either dir
+            pairs+=("$p_samples" "$p_cycles")
+            if [ -f "$p_bench" ]; then
+                bench_args+=(--bench-file "$p_bench")
+            else
+                bench_args+=(--bench-file /dev/null)  # same placeholder as for the test-crate source
+            fi
+        fi
+    done
+
+    if [ ${#pairs[@]} -ge 2 ]; then  # at least one complete (samples, cycles) pair was found
+        echo "==> Joining precompile per-execution samples (${#pairs[@]} dirs across $((${#pairs[@]} / 2)) sources)..."
+        python3 "$REPO_ROOT/bench_scripts/join_precompile_samples.py" \
+            "${pairs[@]}" \
+            "${bench_args[@]}" \
+            --out-dir "$output_dir/precompile_joined" \
+            --summary \
+            > "$output_dir/precompile_joined_summary.txt" 2>&1 || true  # best-effort: a join failure is ignored
+    fi
+}
+
 run_all_blocks() {
     local output_dir="$1"
     for dir in "$BLOCKS_DIR"/*/; do
@@ -89,6 +156,7 @@ do_baseline() {
     build_riscv_binary
     run_all_blocks "$BASELINE_DIR"
     run_precompiles "$BASELINE_DIR"
+    join_precompile_samples_run "$BASELINE_DIR"
     echo "==> Baseline saved to $BASELINE_DIR"
 }
 
@@ -97,6 +165,7 @@ do_run() {
     build_riscv_binary
     run_all_blocks "$CURRENT_DIR"
     run_precompiles "$CURRENT_DIR"
+    join_precompile_samples_run "$CURRENT_DIR"
     echo "==> Results saved to $CURRENT_DIR"
 }
 
@@ -207,6 +276,28 @@ do_compare() {
             "${cycle_args[@]}" --gas-stats "${gas_args[@]}" --sample-dirs "${cycle_sample_args[@]}" \
             2>/dev/null || true
     fi
+    # Aggregate per-precompile stats across test-crate + all block benchmarks.
+    local precompile_stats_args=()
+    if [ -f "$BASELINE_DIR/precompile_stats.csv" ] && [ -f "$CURRENT_DIR/precompile_stats.csv" ]; then
+        precompile_stats_args+=(
+            "$BASELINE_DIR/precompile_stats.csv"
+            "$CURRENT_DIR/precompile_stats.csv"
+        )
+    fi
+    for dir in "$BLOCKS_DIR"/*/; do
+        local blk
+        blk="$(basename "$dir")"
+        local base_csv="$BASELINE_DIR/block_${blk}_precompile_stats.csv"
+        local head_csv="$CURRENT_DIR/block_${blk}_precompile_stats.csv"
+        if [ -f "$base_csv" ] && [ -f "$head_csv" ]; then
+            precompile_stats_args+=("$base_csv" "$head_csv")
+        fi
+    done
+    if [ ${#precompile_stats_args[@]} -ge 2 ]; then
+        python3 "$REPO_ROOT/bench_scripts/compare_precompile_stats.py" \
+            "${precompile_stats_args[@]}" \
+            2>/dev/null || true
+    fi
 }
 
 do_flamegraph() {
diff --git a/bench_scripts/benchlib.py b/bench_scripts/benchlib.py
new file mode 100644
index 000000000..dfc533b2f
--- /dev/null
+++ b/bench_scripts/benchlib.py
@@ -0,0 +1,154 @@
+"""Shared helpers for bench_scripts/*.
+
+Concentrates the formatting, percentile, and sample-loading utilities that
+multiple scripts had been re-implementing slightly differently. Keep this
+module dependency-free (stdlib only) so any script in `bench_scripts/` can
+import it without adding a wheel.
+
+Effective-cycle constants live alongside the helpers and MUST stay in
+lockstep with `cycle_marker/src/lib.rs::print_cycle_markers` (see
+`compare_bench.py` for the unknown-delegation handling rationale).
+"""
+
+import os
+
+
+# Delegation IDs — must match cycle_marker/src/lib.rs::print_cycle_markers
+BLAKE_DELEGATION_ID = 1991
+BIGINT_DELEGATION_ID = 1994
+KECCAK_DELEGATION_ID = 1995
+
+# Effective-cycle weights — must match cycle_marker's BLAKE_DELEGATION_COEFF,
+# BIGINT_DELEGATION_COEFF, KECCAK_DELEGATION_COEFF. If these drift, the
+# Python-side reports will diverge from Rust-side `block_effective` and
+# from the per-execution `.effective.cycles` dumps.
+BLAKE_DELEGATION_COEFF = 16
+BIGINT_DELEGATION_COEFF = 4
+KECCAK_DELEGATION_COEFF = 4
+
+
+def median_int(values):
+    """Return the integer median of `values`; an empty iterable yields 0."""
+    ordered = sorted(values)
+    count = len(ordered)
+    if count == 0:
+        return 0
+    upper = ordered[count // 2]
+    # Even count: floor-average the two middle samples so the result stays an int.
+    return upper if count % 2 else (ordered[count // 2 - 1] + upper) // 2
+
+
+def median_float(values):
+    """Return the median of `values`, averaging the middle pair with true division; None when empty."""
+    ordered = sorted(values)
+    if len(ordered) == 0:
+        return None
+    half, is_odd = divmod(len(ordered), 2)
+    if is_odd:
+        return ordered[half]
+    return (ordered[half - 1] + ordered[half]) / 2
+
+
+def percentile(sorted_vals, p):
+    """Nearest-rank percentile (1-indexed). `sorted_vals` must already be sorted.
+
+    `p` may be an int or a float (e.g. 99.9). Returns 0 for an empty input.
+    """
+    if not sorted_vals:
+        return 0
+    rank = max(1, int(-(-len(sorted_vals) * p // 100)))  # ceil(n * p / 100), coerced to an int index
+    return sorted_vals[min(rank, len(sorted_vals)) - 1]
+
+
+def pct(old, new):
+    """Percent change `(new - old) / old * 100`.
+
+    A zero `old` cannot be divided by: yields 0.0 when `new` is also 0, else `inf`.
+    """
+    if old != 0:
+        return (new - old) / old * 100
+    return float("inf") if new != 0 else 0.0
+
+
+def fmt_pct(val):
+    """Render a percent delta as ` (+1.2%)` / ` (-3.4%)`; `inf` becomes ` (new)`.
+
+    Returns "" for None and for magnitudes below 0.005.
+    """
+    if val is None:
+        return ""
+    is_new = val == float("inf")
+    return " (new)" if is_new else ("" if abs(val) < 0.005 else f" ({val:+.1f}%)")
+
+
+def fmt_val_pct(base, head):
+    """Render `head` followed by its percent change relative to `base` (as formatted by `fmt_pct`)."""
+    return "{}{}".format(head, fmt_pct(pct(base, head)))
+
+
+def fmt_ratio_pct(base, head):
+    """Like `fmt_val_pct` but for float ratios, shown with two decimals; "—" if either side is None."""
+    if head is None or base is None:
+        return "—"
+    return "{:.2f}{}".format(head, fmt_pct(pct(base, head)))
+
+
+def ratio(num, den):
+    """Divide `num` by `den`; any `den` that is not strictly positive yields 0.0 instead."""
+    return 0.0 if not den > 0 else num / den
+
+
+def safe_listdir(path):
+    """Return `os.listdir(path)`, or `[]` if that call raises OSError."""
+    try:
+        return os.listdir(path)
+    except OSError:  # e.g. missing path, not a directory, /dev/null
+        return []
+
+
+def load_int_samples(path):
+    """Read `path` and return its non-blank lines parsed as ints, in file order.
+
+    Used for `.cycles` / `.effective.cycles` files; a non-integer line raises ValueError.
+    """
+    with open(path) as handle:
+        # Blank and whitespace-only lines are skipped before parsing.
+        stripped = (raw.strip() for raw in handle)
+        return [int(text) for text in stripped if text]
+
+
+def load_gas_native_samples(path):
+    """Read `gas,native` rows from `path` as a list of `(int, int)` tuples.
+
+    Used for `.samples` files emitted by tracers. Blank lines are skipped and
+    any fields after the second are ignored.
+    """
+    pairs = []
+    with open(path) as handle:
+        for fields in (raw.strip().split(",") for raw in handle if raw.strip()):
+            pairs.append((int(fields[0]), int(fields[1])))
+    return pairs
+
+
+def list_label_files(samples_dir, raw_suffix=".cycles", effective_suffix=".effective.cycles"):
+    """Classify the files of a samples dir by label.
+
+    Returns `(raw_names, effective_names, opcode_to_file)`:
+    - `raw_names`: labels having a `raw_suffix` file that is not an `effective_suffix` file.
+    - `effective_names`: labels having an `effective_suffix` file.
+    - `opcode_to_file`: label → filename, the effective file winning when both exist.
+
+    An unreadable or missing `samples_dir` yields three empty containers.
+    """
+    raw_files = {}
+    effective_files = {}
+    for name in set(safe_listdir(samples_dir)):
+        if name.endswith(effective_suffix):
+            effective_files[name[: -len(effective_suffix)]] = name
+        elif name.endswith(raw_suffix):
+            raw_files[name[: -len(raw_suffix)]] = name
+    opcode_to_file = dict(effective_files)
+    # Raw files only fill in labels that have no effective file.
+    for label, name in raw_files.items():
+        opcode_to_file.setdefault(label, name)
+    return set(raw_files), set(effective_files), opcode_to_file
diff --git a/bench_scripts/compare_bench.py b/bench_scripts/compare_bench.py
index 8fc52e93e..3df9f6fb6 100644
--- a/bench_scripts/compare_bench.py
+++ b/bench_scripts/compare_bench.py
@@ -1,10 +1,28 @@
+import os
 import sys
 import re
 import ast
 
-U256BIGINTOPS_RATIO = 4
-BLAKE2ROUNDEXTENDED_RATIO = 16
-KECCAK_RATIO = 4  # TODO(EVM-1242): calibrate with actual proving benchmarks
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from benchlib import (  # noqa: E402
+    BIGINT_DELEGATION_COEFF,
+    BIGINT_DELEGATION_ID,
+    BLAKE_DELEGATION_COEFF,
+    BLAKE_DELEGATION_ID,
+    KECCAK_DELEGATION_COEFF,
+    KECCAK_DELEGATION_ID,
+    pct as pct_change,  # historical name
+)
+
+# Unknown-delegation policy: when this parser computes effective from the
+# raw `.bench` text it adds +1 per occurrence (coefficient = 1) for any
+# delegation ID outside the weighted set (BLAKE/BIGINT/KECCAK). This is
+# compare_bench's deliberate choice — it keeps unfamiliar delegation IDs
+# visible in headline cycles without requiring script updates. The
+# Rust-side `effective_of` helper does NOT apply this fallback because it
+# operates on per-execution sample dumps restricted to the three weighted
+# IDs; that's why headline `block_effective` can differ slightly from
+# compare_bench.py's `Eff` column.
 
 def parse_cycle_markers(text):
     results = {}
@@ -17,14 +35,40 @@ def parse_cycle_markers(text):
             m = re.match(r"(\w+): net cycles: (\d+), net delegations: (\{.*\})", line.strip())
             if m:
                 name = m.group(1)
+                # Compatibility aliases for marker-name transitions. These
+                # let bench-base on older merge-bases match bench-head's
+                # current names so the PR-comment rows pair up correctly.
+                # TODO: drop each alias one PR cycle after the merge-base
+                # SHA on `draft-0.4.0` reliably contains the new name.
+                #
+                # Alias 1: `<base>_execution_environment` → `<base>`.
+                #   Introduced by feat(bench): mark user-EVM-EE keccak/ecrecover.
+                #   The outer cycles INCLUDE the inner ones, so collapsing
+                #   onto the base name + max-fold picks the larger value —
+                #   avoiding double rows in the PR comment.
+                # Alias 2: `verify_and_apply_batch` → `state_commitment_update`.
+                #   Introduced when the marker was renamed (the old name was
+                #   misleading — it always wrapped only the state-tree commit).
+                if name.endswith("_execution_environment"):
+                    name = name[: -len("_execution_environment")]
+                elif name == "verify_and_apply_batch":
+                    name = "state_commitment_update"
                 raw = int(m.group(2))
                 delegs = ast.literal_eval(m.group(3))
 
-                blake = delegs.get(1991, 0)
-                bigint = delegs.get(1994, 0)
-                keccak = delegs.get(1995, 0)
-                weighted = blake * BLAKE2ROUNDEXTENDED_RATIO + bigint * U256BIGINTOPS_RATIO + keccak * KECCAK_RATIO
-                weighted += sum(v for k, v in delegs.items() if k not in (1991, 1994, 1995))
+                blake = delegs.get(BLAKE_DELEGATION_ID, 0)
+                bigint = delegs.get(BIGINT_DELEGATION_ID, 0)
+                keccak = delegs.get(KECCAK_DELEGATION_ID, 0)
+                weighted = (
+                    blake * BLAKE_DELEGATION_COEFF
+                    + bigint * BIGINT_DELEGATION_COEFF
+                    + keccak * KECCAK_DELEGATION_COEFF
+                )
+                weighted += sum(
+                    v
+                    for k, v in delegs.items()
+                    if k not in (BLAKE_DELEGATION_ID, BIGINT_DELEGATION_ID, KECCAK_DELEGATION_ID)
+                )
 
                 eff = raw + weighted
                 prev = results.get(name)
@@ -38,18 +82,24 @@ def parse_cycle_markers(text):
                     }
     return results
 
-def pct_change(old, new):
-    if old == 0:
-        return float('inf') if new > 0 else 0.0
-    return (new - old) / old * 100
-
 def main():
-    if len(sys.argv) != 2:
-        print("Usage: python compare_bench.py '[...]'")
+    # `--no-title` lets the caller (e.g. bench.yml) provide its own section
+    # heading; useful when the same script is invoked multiple times to
+    # render different sub-tables under separate headings/spoilers.
+    # `--sort-by-symbol` groups rows by Symbol (then benchmark name) so
+    # all rows for the same marker line up — easier to scan when the
+    # table has many (benchmark × symbol) combinations like the
+    # block-level sub-phases view.
+    cli_flags = {"--no-title", "--sort-by-symbol"}
+    args = [a for a in sys.argv[1:] if a not in cli_flags]
+    emit_title = "--no-title" not in sys.argv[1:]
+    sort_by_symbol = "--sort-by-symbol" in sys.argv[1:]
+    if len(args) != 1:
+        print("Usage: python compare_bench.py [--no-title] [--sort-by-symbol] '[...]'")
         sys.exit(1)
 
     try:
-        benchmarks = ast.literal_eval(sys.argv[1])
+        benchmarks = ast.literal_eval(args[0])
     except Exception as e:
         print(f"Invalid input format: {e}")
         sys.exit(1)
@@ -84,6 +134,12 @@ def main():
             b = base.get(sym, {})
             h = head.get(sym, {})
 
+            # Skip symbols absent on both sides (e.g. an explicitly-requested
+            # block-level sub-phase that doesn't exist in this run's bench
+            # file would otherwise produce a noisy all-zero row).
+            if not b and not h:
+                continue
+
             b_raw = b.get('raw', 0)
             h_raw = h.get('raw', 0)
             b_blake = b.get('blake', 0)
@@ -104,8 +160,19 @@ def main():
                 b_eff, h_eff, pct_change(b_eff, h_eff)
             ))
 
+    # Skip emitting anything when there are no rows so callers wrapping the
+    # output in `<details>` don't produce an empty section.
+    if not rows:
+        return
+
+    if sort_by_symbol:
+        # row[0] = benchmark name, row[1] = symbol. Stable sort on
+        # (symbol, name) groups all rows of the same marker together.
+        rows.sort(key=lambda r: (r[1], r[0]))
+
     # Markdown table
-    print("### Benchmark report\n")
+    if emit_title:
+        print("### Benchmark report\n")
     print("| Benchmark | Symbol | Base Eff | Head Eff (%) | Base Raw | Head Raw (%) | Base Blake | Head Blake (%) | Base Bigint | Head Bigint (%) | Base Keccak | Head Keccak (%) |")
     print("|-----------|--------|-----------|----------------|-----------|----------------|-------------|------------------|---------------|--------------------|--------------|--------------------|")
 
diff --git a/bench_scripts/compare_opcode_cycles.py b/bench_scripts/compare_opcode_cycles.py
index f01d4a865..08b3c706a 100644
--- a/bench_scripts/compare_opcode_cycles.py
+++ b/bench_scripts/compare_opcode_cycles.py
@@ -23,27 +23,13 @@
 import re
 import sys
 
-
-def median_int(values):
-    """Return the true median of integer samples."""
-    if not values:
-        return 0
-    sorted_vals = sorted(values)
-    mid = len(sorted_vals) // 2
-    if len(sorted_vals) % 2 == 0:
-        return (sorted_vals[mid - 1] + sorted_vals[mid]) // 2
-    return sorted_vals[mid]
-
-
-def median_float(values):
-    """Return the true median of float samples."""
-    if not values:
-        return None
-    sorted_vals = sorted(values)
-    mid = len(sorted_vals) // 2
-    if len(sorted_vals) % 2 == 0:
-        return (sorted_vals[mid - 1] + sorted_vals[mid]) / 2
-    return sorted_vals[mid]
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from benchlib import (  # noqa: E402
+    fmt_pct,
+    median_float,
+    median_int,
+    pct,
+)
 
 
 def parse_cycle_stats(filename):
@@ -133,17 +119,33 @@ def load_tracer_samples(samples_dir):
 
 
 def load_cycle_samples(samples_dir):
-    """Load per-opcode cycle samples from a directory."""
+    """Load per-opcode cycle samples from a directory.
+
+    Prefers `<OPCODE>.effective.cycles` (raw + Blake/BigInt/Keccak delegation
+    weights, matching the block_effective formula) over `<OPCODE>.cycles`
+    (raw only). Effective samples reflect true prover cost for opcodes whose
+    handlers delegate (SHA3, SLOAD/SSTORE, BALANCE/EXTCODE*, CALL family,
+    CREATE/CREATE2); raw samples undercount them. Falls back to raw per
+    opcode when the effective variant is absent.
+    """
     stats = {}
     try:
-        entries = os.listdir(samples_dir)
+        entries = set(os.listdir(samples_dir))
     except OSError:
         return stats
 
+    # Group by opcode: prefer .effective.cycles, fall back to .cycles.
+    opcode_to_file = {}
     for name in entries:
-        if not name.endswith(".cycles"):
-            continue
-        opcode = name[:-len(".cycles")]
+        if name.endswith(".effective.cycles"):
+            opcode = name[: -len(".effective.cycles")]
+            opcode_to_file[opcode] = name
+    for name in entries:
+        if name.endswith(".cycles") and not name.endswith(".effective.cycles"):
+            opcode = name[: -len(".cycles")]
+            opcode_to_file.setdefault(opcode, name)
+
+    for opcode, name in opcode_to_file.items():
         rows = []
         with open(os.path.join(samples_dir, name)) as f:
             for line in f:
@@ -247,20 +249,13 @@ def overlay_sampled_stats(base_stats, sampled_stats):
     return merged
 
 
-def pct(old, new):
-    if old == 0:
-        return 0.0 if new == 0 else float("inf")
-    return (new - old) / old * 100
-
-
-def fmt_pct(val):
-    if abs(val) < 0.005:
-        return ""
-    return f" ({val:+.1f}%)"
-
-
 def ratio(num, den):
-    """Return num/den, or None if den is zero."""
+    """Return num/den, or None if den is zero.
+
+    NOTE: this differs from `benchlib.ratio` (which returns 0.0 for zero
+    denominator) — kept local so that downstream `fmt_ratio_pct` can
+    branch on None to render "n/a".
+    """
     return num / den if den > 0 else None
 
 
@@ -377,20 +372,27 @@ def format_table(rows, has_gas, label=""):
     lines.append("")
 
     if has_gas:
+        # When `--sample-dirs` is supplied (the CI path), `overlay_sampled_stats`
+        # replaces the .bench-aggregate cycle values with per-execution
+        # sampled values via `load_cycle_samples`, which prefers
+        # `<OPCODE>.effective.cycles` (raw + Blake/BigInt/Keccak delegation
+        # weights). Both cycles and cyc/gas columns are therefore effective.
+        # Without `--sample-dirs` (local fallback), the cycles columns come
+        # directly from the .bench aggregate which is raw.
         lines.append(
-            "| Opcode | Count | Med Cycles (%) | Total Cycles (%) "
-            "| Med Cyc/Gas (%) | Worst Cyc/Gas (%) |"
+            "| Opcode | Count | Med Cycles eff (%) | Total Cycles eff (%) "
+            "| Med Cyc/Gas eff (%) | Worst Cyc/Gas eff (%) |"
         )
         lines.append(
-            "|--------|-------|----------------|------------------"
-            "|-----------------|-------------------|"
+            "|--------|-------|--------------------|----------------------"
+            "|---------------------|-----------------------|"
         )
     else:
         lines.append(
-            "| Opcode | Count | Med Cycles (%) | Total Cycles (%) |"
+            "| Opcode | Count | Med Cycles eff (%) | Total Cycles eff (%) |"
         )
         lines.append(
-            "|--------|-------|----------------|------------------|"
+            "|--------|-------|--------------------|----------------------|"
         )
 
     # Sort by head total cycles descending (biggest cost first)
diff --git a/bench_scripts/compare_opcode_stats.py b/bench_scripts/compare_opcode_stats.py
index 42ed03273..492c91c2d 100644
--- a/bench_scripts/compare_opcode_stats.py
+++ b/bench_scripts/compare_opcode_stats.py
@@ -11,18 +11,10 @@
 
 import os
 import re
 import sys
 
-
-def median_int(values):
-    """Return the true median of integer samples."""
-    if not values:
-        return 0
-    sorted_vals = sorted(values)
-    mid = len(sorted_vals) // 2
-    if len(sorted_vals) % 2 == 0:
-        return (sorted_vals[mid - 1] + sorted_vals[mid]) // 2
-    return sorted_vals[mid]
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from benchlib import fmt_pct, median_int, pct  # noqa: E402
 
 
 def parse_opcode_stats(filename):
@@ -197,18 +189,6 @@ def overlay_sampled_stats(base_stats, sampled_stats):
     return merged
 
 
-def pct(old, new):
-    if old == 0:
-        return 0.0 if new == 0 else float("inf")
-    return (new - old) / old * 100
-
-
-def fmt_pct(val):
-    if abs(val) < 0.005:
-        return ""
-    return f" ({val:+.1f}%)"
-
-
 def compare(base_stats, head_stats):
     """Return list of rows for opcodes with changed avg_gas or avg_native."""
     all_opcodes = sorted(set(base_stats) | set(head_stats))
diff --git a/bench_scripts/compare_precompile_stats.py b/bench_scripts/compare_precompile_stats.py
new file mode 100755
index 000000000..d60205173
--- /dev/null
+++ b/bench_scripts/compare_precompile_stats.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+"""Compare base vs head precompile stats CSVs.
+
+Accepts one or more base/head CSV pairs as positional args and aggregates
+stats across all of them (matching `compare_opcode_stats.py`'s shape).
+
+Usage:
+    python compare_precompile_stats.py <base.csv> <head.csv> [label]
+    python compare_precompile_stats.py <b1.csv> <h1.csv> <b2.csv> <h2.csv> ... [label]
+
+CSV columns produced by PrecompileStatsTracer::write_csv:
+    name,address,count,avg_gas,median_gas,min_gas,max_gas,
+    avg_native,median_native,min_native,max_native,native_per_gas
+
+Aggregation across sources (per precompile):
+- `count`: sum
+- `avg_gas` / `avg_native`: count-weighted mean (re-derived from totals)
+- `med_gas` / `med_native`: count-weighted mean of per-source medians
+- `min_gas` / `min_native`: min across sources
+- `max_gas` / `max_native`: max across sources
+
+Exits 0 with no output if nothing changed or head CSVs are empty/absent; an empty base prints a head-only table.
+"""
+
+import csv
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from benchlib import fmt_pct, pct  # noqa: E402
+
+
+def parse_csv(path):
+    """Parse a stats CSV into {name: stats}; missing file -> {}, bad rows skipped."""
+    stats = {}
+    try:
+        with open(path) as f:
+            for row in csv.DictReader(f):
+                try:
+                    stats[row["name"]] = {
+                        "address": row["address"],
+                        "count": int(row["count"]),
+                        "avg_gas": float(row["avg_gas"]),
+                        "med_gas": int(row["median_gas"]),
+                        "min_gas": int(row["min_gas"]),
+                        "max_gas": int(row["max_gas"]),
+                        "avg_native": float(row["avg_native"]),
+                        "med_native": int(row["median_native"]),
+                        "min_native": int(row["min_native"]),
+                        "max_native": int(row["max_native"]),
+                    }
+                # TypeError: DictReader pads a truncated row's cells with None.
+                except (ValueError, KeyError, TypeError):
+                    continue
+    except FileNotFoundError:
+        pass
+    return stats
+
+
+def aggregate(sources):
+    """Merge per-source stat dicts into one stats dict per precompile.
+
+    Counts add up, min/max keep the extremes seen in any source, and the
+    avg/median fields become count-weighted means (medians are rounded to
+    an int). Entries whose count is not positive contribute nothing.
+
+    `sources`: iterable of dicts shaped like the output of `parse_csv`.
+    Returns a dict keyed by precompile name.
+    """
+    combined = {}
+    for stats in sources:
+        for name, s in stats.items():
+            cnt = s["count"]
+            if cnt <= 0:
+                continue
+            c = combined.get(name)
+            if c is None:
+                # First sighting: seed min/max from this source. The `_wt_*`
+                # keys hold running weighted sums, dropped before returning.
+                c = combined[name] = {
+                    "address": s["address"],
+                    "count": 0,
+                    "_wt_avg_gas": 0.0,
+                    "_wt_avg_native": 0.0,
+                    "_wt_med_gas": 0.0,
+                    "_wt_med_native": 0.0,
+                    "min_gas": s["min_gas"],
+                    "max_gas": s["max_gas"],
+                    "min_native": s["min_native"],
+                    "max_native": s["max_native"],
+                }
+            c["count"] += cnt
+            for field in ("avg_gas", "avg_native", "med_gas", "med_native"):
+                c[f"_wt_{field}"] += s[field] * cnt
+            for unit in ("gas", "native"):
+                c[f"min_{unit}"] = min(c[f"min_{unit}"], s[f"min_{unit}"])
+                c[f"max_{unit}"] = max(c[f"max_{unit}"], s[f"max_{unit}"])
+
+    for c in combined.values():
+        total = c["count"]
+        # Swap each running sum for its finished field, in a fixed order.
+        for field in ("avg_gas", "avg_native", "med_gas", "med_native"):
+            weighted_sum = c.pop(f"_wt_{field}")
+            if total > 0:
+                mean = weighted_sum / total
+                c[field] = mean if field.startswith("avg") else round(mean)
+            else:
+                c[field] = 0.0 if field.startswith("avg") else 0
+    return combined
+
+
+def compare(base, head):
+    """Return one row per precompile whose max gas or max native differs.
+
+    `base` / `head`: aggregated stats keyed by precompile name. A precompile
+    present on one side only is compared against zeros. Rows come out in
+    name order; each carries `b_*` (base) and `h_*` (head) values for count,
+    max/median gas and max/median native, plus the name and address (head's
+    address when available, else base's).
+    """
+    # Per-side fields copied into every row as `b_<field>` / `h_<field>`.
+    fields = ("count", "max_gas", "med_gas", "max_native", "med_native")
+    rows = []
+    for name in sorted(set(base) | set(head)):
+        b = base.get(name, {})
+        h = head.get(name, {})
+        # Medians may move on their own; only a worst-case (max) change
+        # qualifies a precompile for the report.
+        worst_case_same = all(
+            b.get(key, 0) == h.get(key, 0) for key in ("max_gas", "max_native")
+        )
+        if worst_case_same:
+            continue
+        # Fields missing on either side default to 0.
+        row = {"name": name, "address": h.get("address", b.get("address", ""))}
+        for field in fields:
+            row[f"b_{field}"] = b.get(field, 0)
+            row[f"h_{field}"] = h.get(field, 0)
+        rows.append(row)
+    return rows
+
+
+def format_table(rows, label=""):
+    """Render the rows from `compare` as a Markdown table ("" if no rows).
+
+    Sorts `rows` in place by head max native, largest first. Each numeric
+    cell is the head value followed by the `fmt_pct` suffix for its change
+    against base. `label`, when non-empty, is appended to the heading.
+    """
+    if not rows:
+        return ""
+
+    def cell(r, field):
+        # Head value plus the formatted base -> head delta.
+        return f"{r['h_' + field]}" + fmt_pct(pct(r["b_" + field], r["h_" + field]))
+
+    title = "#### Precompile gas/native worst-case"
+    if label:
+        title += f" ({label})"
+    lines = [
+        title,
+        "",
+        "| Precompile | Address | Count | Max Gas | Med Gas | Max Native | Med Native |",
+        "|------------|---------|-------|---------|---------|------------|------------|",
+    ]
+    rows.sort(key=lambda r: r["h_max_native"], reverse=True)
+    for r in rows:
+        # The count delta is only shown when the counts actually differ.
+        count_s = cell(r, "count") if r["b_count"] != r["h_count"] else f"{r['h_count']}"
+        lines.append(
+            f"| `{r['name']}` | `{r['address']}` | {count_s} | "
+            f"{cell(r, 'max_gas')} | {cell(r, 'med_gas')} | "
+            f"{cell(r, 'max_native')} | {cell(r, 'med_native')} |"
+        )
+    return "\n".join(lines)
+
+
+def main():
+    """CLI entry point: aggregate base/head CSV pairs and print a Markdown comparison."""
+    args = sys.argv[1:]
+    if len(args) < 2:
+        print(
+            "Usage: compare_precompile_stats.py <base.csv> <head.csv> [label]\n"
+            "       compare_precompile_stats.py <b1.csv> <h1.csv> "
+            "<b2.csv> <h2.csv> ... [label]",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    label = ""
+    # Backward compat: an odd argument count means the trailing one is a label.
+    if len(args) % 2 == 1:
+        label = args.pop()
+
+    if len(args) < 2 or len(args) % 2 != 0:
+        print("Error: need even number of files (base/head pairs)", file=sys.stderr)
+        sys.exit(1)
+
+    base_paths = [args[j] for j in range(0, len(args), 2)]
+    head_paths = [args[j] for j in range(1, len(args), 2)]
+    base = aggregate([parse_csv(p) for p in base_paths])
+    head = aggregate([parse_csv(p) for p in head_paths])
+
+    if not head:
+        # Head CSVs missing or unparsable (e.g. partial artifact). Without
+        # head numbers there's nothing to report, so exit quietly.
+        sys.exit(0)
+    if not base:
+        # No base stats (typical on the PR introducing the precompile bench,
+        # whose merge-base lacks the tracer): print a head-only table instead.
+        print(format_head_only_table(head, label))
+        sys.exit(0)
+    rows = compare(base, head)
+    if not rows:
+        sys.exit(0)
+    print(format_table(rows, label))
+
+
+def format_head_only_table(head, label=""):
+    """Render head stats as a collapsed Markdown table ("" when `head` is empty)."""
+    if not head:
+        return ""
+    summary = "Precompile gas/native worst-case (head only — base lacks instrumentation)"
+    if label:
+        summary += f" ({label})"
+    # Largest max native first; entries without the key sort as 0.
+    ordered = sorted(head.items(), key=lambda kv: kv[1].get("max_native", 0), reverse=True)
+    body = [
+        f"| `{name}` | `{h.get('address', '')}` | {h['count']} | "
+        f"{h['max_gas']} | {h['med_gas']} | {h['max_native']} | {h['med_native']} |"
+        for name, h in ordered
+    ]
+    header = [
+        f"<details><summary>{summary}</summary>",
+        "",
+        "| Precompile | Address | Count | Max Gas | Med Gas | Max Native | Med Native |",
+        "|------------|---------|-------|---------|---------|------------|------------|",
+    ]
+    return "\n".join(header + body + ["", "</details>"])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench_scripts/cycles_per_native_report.py b/bench_scripts/cycles_per_native_report.py
new file mode 100644
index 000000000..f6a048e61
--- /dev/null
+++ b/bench_scripts/cycles_per_native_report.py
@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+"""Per-execution cycles/native ratios for opcodes and precompiles.
+
+Reads:
+  - per-opcode tracer samples (`<dir>/<OPCODE>.samples`, `gas,native` per line)
+  - per-opcode cycle samples (`<dir>/<OPCODE>.effective.cycles` preferred,
+    else `<OPCODE>.cycles`)
+  - per-precompile tracer samples (`<dir>/<precompile>.samples`)
+  - per-label cycle samples (`<dir>/<label>.effective.cycles` preferred,
+    else `<label>.cycles`) — `label → precompile` mapping via
+    `bench_scripts.join_precompile_samples.CYCLE_LABEL_TO_PRECOMPILE`.
+
+Computes `cycles / native` per execution. Reports median, p95, and max
+ratio per opcode / precompile. Output as Markdown to stdout (or `--out`
+file). Skips entries with `native == 0` on a given execution to avoid
+divide-by-zero.
+
+Usage:
+    python cycles_per_native_report.py \\
+        --opcode-samples-dir <DIR> --opcode-cycles-dir <DIR> \\
+        --precompile-samples-dir <DIR> --precompile-cycles-dir <DIR> \\
+        [--out report.md]
+"""
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from benchlib import (  # noqa: E402
+    list_label_files,
+    load_gas_native_samples,
+    load_int_samples,
+    percentile,
+    ratio,
+)
+from join_precompile_samples import (  # noqa: E402
+    CYCLE_LABEL_TO_PRECOMPILE,
+    SYNTHETIC_OPCODE_SOURCES,
+)
+
+
+def collect_per_label_ratios(sources, label_to_sample_name):
+    """For each label in `label_to_sample_name`:
+
+    - across every `(samples_dir, cycles_dir)` pair in `sources`:
+      - load `(gas, native)` pairs from `samples_dir/<sample_name>.samples`
+      - load cycles from `cycles_dir/<label>.effective.cycles` preferred,
+        else `cycles_dir/<label>.cycles`
+      - pair `native[i]` with `cycles[i]` for `i in range(min(len(...), len(...)))`
+    - concatenate per-execution `cycles / native` ratios across all sources
+    - return the merged sorted ratio list (with `native == 0` excluded)
+
+    `sources`: iterable of (samples_dir, cycles_dir) tuples. Missing dirs
+    or files are silently skipped per-source.
+    `label_to_sample_name`: dict mapping cycle-marker label → tracer sample
+    filename root (without `.samples`). For opcodes it's identity
+    (OPCODE → OPCODE); for precompiles it's `CYCLE_LABEL_TO_PRECOMPILE`.
+
+    Returns dict label → (ratios_sorted, kind, count_skipped_zero_native, total_paired_executions).
+    `kind` is "effective" only when every contributing source supplied an
+    `.effective.cycles` file for the label; a single source that fell back
+    to `.cycles` downgrades it to "raw". The field is informational — all
+    ratios are pooled together either way.
+    """
+    out = {}
+    for samples_dir, cycles_dir in sources:
+        if not samples_dir or not cycles_dir:
+            continue
+        _, effective_files, _ = list_label_files(cycles_dir)
+        for label, sample_name in label_to_sample_name.items():
+            samples_path = os.path.join(samples_dir, f"{sample_name}.samples")
+            if not os.path.isfile(samples_path):
+                continue
+            if label in effective_files:
+                cycles_path = os.path.join(cycles_dir, f"{label}.effective.cycles")
+                kind = "effective"
+            else:
+                cycles_path = os.path.join(cycles_dir, f"{label}.cycles")
+                if not os.path.isfile(cycles_path):
+                    continue
+                kind = "raw"
+            tracer_samples = load_gas_native_samples(samples_path)
+            cycle_samples = load_int_samples(cycles_path)
+            n = min(len(tracer_samples), len(cycle_samples))
+            if n == 0:
+                continue
+            entry_ratios, entry_kind, entry_skipped, entry_n = out.get(
+                label, ([], kind, 0, 0)
+            )
+            for i in range(n):
+                _gas, native = tracer_samples[i]
+                cycles = cycle_samples[i]
+                if native > 0:
+                    entry_ratios.append(ratio(cycles, native))
+                else:
+                    entry_skipped += 1
+            entry_n += n
+            # The first contributing source seeds `entry_kind`; any later
+            # "raw" source downgrades it so the report flags mixed sourcing.
+            if kind == "raw":
+                entry_kind = "raw"
+            out[label] = (entry_ratios, entry_kind, entry_skipped, entry_n)
+    # Sort accumulated ratios per label.
+    for label, (rs, k, s, n) in list(out.items()):
+        rs.sort()
+        out[label] = (rs, k, s, n)
+    # Strip labels that ended up with no usable ratios.
+    return {label: data for label, data in out.items() if data[0]}
+
+
+def opcode_label_map(samples_dirs):
+    """Build the identity label map (OPCODE → OPCODE) for every `*.samples` file.
+
+    Names are unioned over all of `samples_dirs`; falsy entries and
+    directories that cannot be listed are skipped.
+    """
+    suffix = ".samples"
+    found = set()
+    for directory in samples_dirs:
+        if not directory:
+            continue
+        try:
+            entries = os.listdir(directory)
+        except OSError:
+            continue
+        found.update(e[: -len(suffix)] for e in entries if e.endswith(suffix))
+    return {name: name for name in sorted(found)}
+
+
+def precompile_label_map():
+    """Return a fresh copy of the cycle-marker label → sample-file-root map.
+
+    The mapping is `CYCLE_LABEL_TO_PRECOMPILE` as-is. `main` resolves the
+    synthetic entries (currently `keccak` ← `SHA3` opcode samples) on its
+    own and reports them in a separate section.
+    """
+    return {label: name for label, name in CYCLE_LABEL_TO_PRECOMPILE.items()}
+
+
+def format_section(title, label_to_data, sort_key="max"):
+    """Build the Markdown lines for one report section.
+
+    `label_to_data` is the mapping produced by `collect_per_label_ratios`.
+    Rows are ordered by max ratio, descending, unless `sort_key` is "p95"
+    or "median". Returns a list of lines, not a joined string.
+    """
+    heading = [f"### {title}", ""]
+    if not label_to_data:
+        return heading + ["_No samples available._", ""]
+
+    def rank(item):
+        # `collect_per_label_ratios` sorts each ratio list ascending, so the
+        # last element is the maximum.
+        ratios, _kind, _skipped, _n = item[1]
+        if sort_key in ("median", "p95"):
+            return percentile(ratios, 50 if sort_key == "median" else 95)
+        return ratios[-1]
+
+    table = [
+        "| Name | Count | Med cyc/native | p95 cyc/native | Max cyc/native | Cycle source |",
+        "|---|---:|---:|---:|---:|---|",
+    ]
+    ordered = sorted(label_to_data.items(), key=rank, reverse=True)
+    for label, (ratios, kind, skipped, _n) in ordered:
+        med, p95 = percentile(ratios, 50), percentile(ratios, 95)
+        worst = ratios[-1]
+        count_cell = str(len(ratios))
+        if skipped:
+            count_cell += f" (+{skipped} skipped native=0)"
+        table.append(
+            f"| `{label}` | {count_cell} | {med:.2f} | {p95:.2f} | {worst:.2f} | {kind} |"
+        )
+    return heading + table + [""]
+
+
+def main():
+    """CLI entry point: parse the --*-dir flags and emit the Markdown ratio report."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--opcode-samples-dir",
+        action="append",
+        default=[],
+        help="Per-opcode tracer samples dir. Repeat to aggregate across multiple sources (e.g. one per block).",
+    )
+    parser.add_argument(
+        "--opcode-cycles-dir",
+        action="append",
+        default=[],
+        help="Per-opcode cycle samples dir. Repeat in the same order as --opcode-samples-dir to pair up sources.",
+    )
+    parser.add_argument(
+        "--precompile-samples-dir",
+        action="append",
+        default=[],
+        help="Per-precompile tracer samples dir. Repeat to aggregate.",
+    )
+    parser.add_argument(
+        "--precompile-cycles-dir",
+        action="append",
+        default=[],
+        help="Per-label cycle samples dir. Repeat in the same order as --precompile-samples-dir.",
+    )
+    parser.add_argument("--out", help="Write Markdown report to this file (default stdout)")
+    parser.add_argument(
+        "--sort-by",
+        choices=("max", "p95", "median"),
+        default="max",
+        help="Sort each table by this percentile, descending (default: max)",
+    )
+    parser.add_argument(
+        "--source-label",
+        default="",
+        help="Free-form text describing the source data (block, scheme, etc.); printed at the top of the report",
+    )
+    args = parser.parse_args()
+
+    if not any(
+        [
+            args.opcode_samples_dir,
+            args.opcode_cycles_dir,
+            args.precompile_samples_dir,
+            args.precompile_cycles_dir,
+        ]
+    ):
+        parser.error("At least one --*-dir pair must be specified.")
+
+    if len(args.opcode_samples_dir) != len(args.opcode_cycles_dir):
+        parser.error(
+            "--opcode-samples-dir and --opcode-cycles-dir must be specified the same number of times"
+        )
+    if len(args.precompile_samples_dir) != len(args.precompile_cycles_dir):
+        parser.error(
+            "--precompile-samples-dir and --precompile-cycles-dir must be specified the same number of times"
+        )
+
+    opcode_sources = list(zip(args.opcode_samples_dir, args.opcode_cycles_dir))
+    precompile_sources = list(
+        zip(args.precompile_samples_dir, args.precompile_cycles_dir)
+    )
+
+    sections = ["# Cycles / native ratios", ""]
+    if args.source_label:
+        sections.append(f"_Source: {args.source_label}._")
+        sections.append("")
+    sections.append(
+        "Ratios computed per execution from paired `<sample>.samples` "
+        "(gas,native) and `<label>.effective.cycles` (preferred) / "
+        "`<label>.cycles`. Median, p95, max across all executions, "
+        "pooled across every input source. Executions with `native == 0` "
+        "are excluded."
+    )
+    sections.append("")
+
+    # Opcodes.
+    opcode_data = collect_per_label_ratios(
+        opcode_sources,
+        opcode_label_map(args.opcode_samples_dir),
+    )
+    sections += format_section("Per-opcode", opcode_data, sort_key=args.sort_by)
+
+    # Precompiles (real precompile addresses).
+    pre_data = collect_per_label_ratios(
+        precompile_sources,
+        precompile_label_map(),
+    )
+    sections += format_section("Per-precompile", pre_data, sort_key=args.sort_by)
+
+    # Synthetic precompile entries (e.g. `keccak`: SHA3 opcode samples paired
+    # with the label-level cycle dump from the precompile side). Surfaced
+    # separately so the source is obvious. Opcode-samples and precompile-cycles
+    # dirs are paired positionally; the section is skipped if counts differ.
+    if (
+        args.opcode_samples_dir
+        and args.precompile_cycles_dir
+        and len(args.opcode_samples_dir) == len(args.precompile_cycles_dir)
+    ):
+        synthetic_map = {}
+        for label, prec_name in CYCLE_LABEL_TO_PRECOMPILE.items():
+            opcode_name = SYNTHETIC_OPCODE_SOURCES.get(prec_name)
+            if opcode_name:
+                synthetic_map[label] = opcode_name
+        synth_sources = list(zip(args.opcode_samples_dir, args.precompile_cycles_dir))
+        synth_data = collect_per_label_ratios(synth_sources, synthetic_map)
+        if synth_data:
+            sections += format_section(
+                "Per-precompile (synthetic — gas/native from opcode tracer)",
+                synth_data,
+                sort_key=args.sort_by,
+            )
+
+    out_text = "\n".join(sections) + "\n"
+    if args.out:
+        with open(args.out, "w") as f:
+            f.write(out_text)
+        print(f"Wrote {args.out}")
+    else:
+        sys.stdout.write(out_text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench_scripts/join_opcode_stats.py b/bench_scripts/join_opcode_stats.py
index e77a68abc..bdf9782d1 100644
--- a/bench_scripts/join_opcode_stats.py
+++ b/bench_scripts/join_opcode_stats.py
@@ -11,10 +11,14 @@
     python join_opcode_stats.py <block.out> <block.bench> [--csv output.csv]
 """
 
+import os
 import sys
 import re
 import argparse
 
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from benchlib import ratio  # noqa: E402
+
 
 def parse_tracer_stats(filename):
     """Parse '=== EVM Opcode Stats:' from .out file."""
@@ -81,10 +85,6 @@ def parse_cycle_stats(filename):
     return stats
 
 
-def ratio(num, den):
-    return num / den if den > 0 else 0.0
-
-
 def main():
     parser = argparse.ArgumentParser(description="Join opcode tracer and cycle stats")
     parser.add_argument("out_file", help=".out file with tracer stats")
diff --git a/bench_scripts/join_precompile_samples.py b/bench_scripts/join_precompile_samples.py
new file mode 100755
index 000000000..6a447c160
--- /dev/null
+++ b/bench_scripts/join_precompile_samples.py
@@ -0,0 +1,399 @@
+#!/usr/bin/env python3
+"""Join per-execution precompile samples (gas/native from tracer, cycles from RISC-V).
+
+Reads:
+  - <tracer_dir>/<precompile>.samples: "gas,native" per line (execution order)
+  - <cycles_dir>/<cycle_label>.effective.cycles (preferred) or
+    <cycles_dir>/<cycle_label>.cycles (fallback): one cycle count per line in
+    execution order.
+
+The `.effective.cycles` variant includes delegation cost (Blake/BigInt/Keccak)
+using the same coefficients as the block-wide `block_effective` formula in
+`cycle_marker`. Without it, `cycles/gas` reflects raw RISC-V cycles only and
+undercounts delegation-heavy precompiles (ecrecover, modexp, bn254). The
+script prefers the effective variant and falls back to raw with a stderr note
+when only raw is available.
+
+Cycle-marker labels differ from the user-facing precompile names emitted by
+`PrecompileStatsTracer::dump_samples`, so we apply a fixed mapping.
+Unmapped `.cycles` files (e.g. `process_block.cycles`) are ignored silently.
+Tracer `.samples` files with no matching cycle data are reported to stderr.
+
+Outputs per-precompile CSV with (gas, native, cycles, cycles/gas, native/gas)
+per execution, and a summary table with p50/p95/p99/max ratios.
+
+Usage:
+    python join_precompile_samples.py <tracer_dir> <cycles_dir> \
+        [--out-dir <output_dir>] [--summary]
+"""
+
+import os
+import sys
+import argparse
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from benchlib import (  # noqa: E402
+    load_gas_native_samples as load_tracer_samples,
+    load_int_samples as load_cycle_samples,
+    percentile,
+    ratio,
+    safe_listdir as _safe_listdir,
+)
+
+
+# Maps cycle_marker labels (used inside the RISC-V binary, written by
+# `cycle_marker::wrap_with_resources!("<label>", ...)`) to the user-facing
+# precompile names emitted by PrecompileStatsTracer::dump_samples /
+# precompile_name(addr) in forward_system/src/system/tracers/precompile_stats.rs.
+#
+# The `_execution_environment` variants fire only on user-EVM-EE-triggered
+# invocations (SHA3 opcode for keccak; EVM call-frame dispatch for ecrecover),
+# so their samples are 1:1 with the tracer/opcode gas samples and need no
+# positional intrinsic filter. Entry order matters: `collect_source` keeps the
+# first label found per precompile, so these must precede the generic labels.
+CYCLE_LABEL_TO_PRECOMPILE = {
+    "ecrecover_execution_environment": "ecrecover",
+    "keccak_execution_environment": "keccak",
+    "ecrecover": "ecrecover",
+    "sha256": "sha256",
+    "ripemd": "ripemd160",
+    "id": "identity",
+    "modexp": "modexp",
+    "bn254_ecadd": "ecadd",
+    "bn254_ecmul": "ecmul",
+    "bn254_pairing": "ecpairing",
+    "blake2f": "blake2f",
+    "point_evaluation": "point_eval",
+    "p256_verify": "p256_verify",
+    "bls12_381_g1_add": "bls12_g1add",
+    "bls12_381_g1_msm": "bls12_g1msm",
+    "bls12_381_g2_add": "bls12_g2add",
+    "bls12_381_g2_msm": "bls12_g2msm",
+    "bls12_381_pairing": "bls12_pairing_check",
+    "bls12_381_map_fp_to_g1": "bls12_map_fp_to_g1",
+    "bls12_381_map_fp2_to_g2": "bls12_map_fp2_to_g2",
+}
+
+# Per-precompile gas-source override, keyed by precompile name. By default the
+# tracer's `<precompile>.samples` file is used; for the synthetic `keccak`
+# entry the gas/native pair comes from the SHA3 opcode's per-execution dump
+# (`<opcode_dir>/SHA3.samples`) since there is no keccak precompile address.
+SYNTHETIC_OPCODE_SOURCES = {
+    "keccak": "SHA3",
+}
+
+
+def process_precompile(name, tracer_samples, cycle_samples, out_dir):
+    """Pair tracer and cycle samples by position and summarize the ratios.
+
+    Only the first `min(len(tracer_samples), len(cycle_samples))` executions
+    are used; a length mismatch is reported on stderr. When `out_dir` is
+    truthy, `<out_dir>/<name>.csv` is written with one row per execution.
+    Returns a dict of percentile stats, or None when nothing can be paired
+    or no execution has `gas > 0`.
+    """
+    tracer_len, cycle_len = len(tracer_samples), len(cycle_samples)
+    n = min(tracer_len, cycle_len)
+    if n == 0:
+        return None
+    if tracer_len != cycle_len:
+        print(
+            f"  WARNING: {name} count mismatch: tracer={tracer_len} "
+            f"cycles={cycle_len}, using first {n}",
+            file=sys.stderr,
+        )
+
+    # zip() stops at the shorter input, i.e. after the first n executions.
+    rows = [(t[0], t[1], c) for t, c in zip(tracer_samples, cycle_samples)]
+
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+        with open(os.path.join(out_dir, f"{name}.csv"), "w") as f:
+            f.write("gas,native,cycles,cycles_per_gas,native_per_gas\n")
+            f.writelines(
+                f"{g},{nat},{c},{ratio(c, g):.2f},{ratio(nat, g):.2f}\n"
+                for g, nat, c in rows
+            )
+
+    # Percentiles only cover executions that charged gas.
+    charged = [row for row in rows if row[0] > 0]
+    if not charged:
+        return None
+    cpg = sorted(ratio(c, g) for g, _, c in charged)
+    npg = sorted(ratio(nat, g) for g, nat, _ in charged)
+
+    summary = {"name": name, "count": n}
+    for key, values in (("cpg", cpg), ("npg", npg)):
+        for prefix, rank in (("med", 50), ("p95", 95), ("p99", 99)):
+            summary[f"{prefix}_{key}"] = percentile(values, rank)
+        summary[f"max_{key}"] = values[-1]
+    return summary
+
+
+def filter_intrinsic_ecrecover_cycles(bench_path):
+    """Pick the `ecrecover` marker indices that are precompile-target calls.
+
+    Scans the cycle-marker bench file top to bottom, numbering every line
+    that starts with `ecrecover:` from 0. The first such marker after each
+    `process_transaction:` line is taken to be that transaction's intrinsic
+    signature check and is dropped; every other marker (including any seen
+    before the first transaction) is kept as a precompile-target call.
+
+    This positional heuristic assumes every `process_transaction` performs
+    exactly one ecrecover for signature verification before user code runs.
+    The assumption is documented as NOT holding for:
+      - L1->L2 priority ops (no signature, no intrinsic ecrecover)
+      - EIP-7702 set-code calls, whose authority recovery follows a
+        different ecrecover invocation pattern
+      - eth_call simulation paths that skip sig verification
+    Should such workloads reach the bench fixtures, emit a distinct
+    cycle-marker label (e.g. `ecrecover_intrinsic`) at the sig-verification
+    call site instead of relying on this filter.
+
+    Returns the list of indices to KEEP, or None when `bench_path` does not
+    exist.
+    """
+    try:
+        bench = open(bench_path)
+    except FileNotFoundError:
+        return None
+
+    keep = []
+    seen = 0  # number of `ecrecover:` markers read so far
+    # True from a `process_transaction:` line until that transaction's
+    # first `ecrecover:` marker has been consumed.
+    awaiting_sig_check = False
+    with bench:
+        for line in bench:
+            marker = line.strip()
+            if marker.startswith("process_transaction:"):
+                awaiting_sig_check = True
+            elif marker.startswith("ecrecover:"):
+                if awaiting_sig_check:
+                    awaiting_sig_check = False  # intrinsic, skip it
+                else:
+                    keep.append(seen)
+                seen += 1
+    return keep
+
+
+def collect_source(
+    tracer_dir,
+    cycles_dir,
+    bench_file=None,
+    used_kinds=None,
+    opcode_samples_dir=None,
+):
+    """Read one source (tracer_dir, cycles_dir, optional bench_file).
+
+    Returns a dict mapping precompile_name -> (tracer_samples, cycle_samples).
+    `cycle_samples` is filtered for intrinsic ecrecovers when bench_file is set
+    AND the `_execution_environment` cycle marker is absent (legacy fallback;
+    new binaries emit a distinct marker that obviates the positional filter).
+    Sources with missing dirs are skipped silently (returns {}).
+
+    Prefers `<label>.effective.cycles` (raw cycles + Blake/BigInt/Keccak
+    weights) over `<label>.cycles` (raw only); falls back per-label if the
+    effective variant is absent. `used_kinds`, when supplied, is populated
+    with the set of kinds ("effective", "raw") that were actually consumed.
+
+    For synthetic-precompile entries (see `SYNTHETIC_OPCODE_SOURCES`, e.g.
+    `keccak` ← `SHA3.samples`), the tracer gas/native source is the named
+    opcode sample file under `opcode_samples_dir` rather than a precompile
+    tracer sample. Multiple cycle labels pointing to the same precompile name
+    (e.g. `ecrecover_execution_environment` + `ecrecover`) are deduplicated:
+    the first one with both tracer + cycle data wins, in iteration order of
+    `CYCLE_LABEL_TO_PRECOMPILE` (so `_execution_environment` variants take
+    precedence).
+    """
+    # Group cycle files by label, preferring `.effective.cycles` (raw +
+    # delegation weights) over `.cycles` (raw only) per label.
+    raw_files = set()
+    effective_files = set()
+    for f in _safe_listdir(cycles_dir):
+        if f.endswith(".effective.cycles"):
+            effective_files.add(f[: -len(".effective.cycles")])
+        elif f.endswith(".cycles"):
+            raw_files.add(f[: -len(".cycles")])
+    cycles_labels = raw_files | effective_files
+    tracer_files = {
+        f[: -len(".samples")]
+        for f in _safe_listdir(tracer_dir)
+        if f.endswith(".samples")
+    }
+    opcode_files = {
+        f[: -len(".samples")]
+        for f in _safe_listdir(opcode_samples_dir or "")
+        if f.endswith(".samples")
+    }
+    # Only invoke the positional filter if no execution-environment marker is
+    # present (legacy binaries built before the dedicated marker landed).
+    ecrecover_keep = (
+        filter_intrinsic_ecrecover_cycles(bench_file)
+        if bench_file and "ecrecover_execution_environment" not in cycles_labels
+        else None
+    )
+
+    out = {}
+    for cycle_label, precompile_name in CYCLE_LABEL_TO_PRECOMPILE.items():
+        if cycle_label not in cycles_labels:
+            continue
+        if precompile_name in out:
+            # An earlier (higher-priority) label already supplied this precompile.
+            continue
+        synthetic_opcode = SYNTHETIC_OPCODE_SOURCES.get(precompile_name)
+        if synthetic_opcode is not None:
+            if synthetic_opcode not in opcode_files:
+                continue
+            tracer_path = os.path.join(opcode_samples_dir, f"{synthetic_opcode}.samples")
+        else:
+            if precompile_name not in tracer_files:
+                continue
+            tracer_path = os.path.join(tracer_dir, f"{precompile_name}.samples")
+        if cycle_label in effective_files:
+            cycles_path = os.path.join(cycles_dir, f"{cycle_label}.effective.cycles")
+            kind = "effective"
+        else:
+            cycles_path = os.path.join(cycles_dir, f"{cycle_label}.cycles")
+            kind = "raw"
+        if used_kinds is not None:
+            used_kinds.add(kind)
+        tracer_samples = load_tracer_samples(tracer_path)
+        cycle_samples = load_cycle_samples(cycles_path)
+        if cycle_label == "ecrecover" and ecrecover_keep is not None:
+            cycle_samples = [
+                cycle_samples[i] for i in ecrecover_keep if i < len(cycle_samples)
+            ]
+        out[precompile_name] = (tracer_samples, cycle_samples)
+    return out
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Join per-execution precompile samples across one or more sources"
+    )
+    parser.add_argument(
+        "pairs",
+        nargs="+",
+        help="Alternating tracer_dir cycles_dir paths; one pair per data source.",
+    )
+    parser.add_argument("--out-dir", help="Write per-execution CSVs to this directory")
+    parser.add_argument("--summary", action="store_true", help="Print summary table")
+    parser.add_argument(
+        "--bench-file",
+        action="append",
+        default=[],
+        help="Cycle-marker .bench file. When the `ecrecover_execution_environment`"
+        " marker is absent (legacy binaries), used to apply a positional filter"
+        " that drops the per-tx intrinsic sig-verification ecrecover. Ignored"
+        " when the marker is present. May be repeated; matched positionally to"
+        " the (tracer_dir, cycles_dir) pairs.",
+    )
+    parser.add_argument(
+        "--opcode-samples-dir",
+        action="append",
+        default=[],
+        help="Directory containing per-opcode `<OPCODE>.samples` files (gas,native)."
+        " Used to source gas/native for synthetic precompile entries that have no"
+        " precompile address (currently `keccak` ← `SHA3.samples`). May be repeated;"
+        " matched positionally to the (tracer_dir, cycles_dir) pairs.",
+    )
+    args = parser.parse_args()
+
+    if len(args.pairs) % 2 != 0:
+        print(
+            "Error: need an even number of positional args (tracer_dir cycles_dir pairs)",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    sources = []
+    used_kinds = set()
+    for i in range(0, len(args.pairs), 2):
+        tracer_dir = args.pairs[i]
+        cycles_dir = args.pairs[i + 1]
+        bench_file = (
+            args.bench_file[i // 2] if (i // 2) < len(args.bench_file) else None
+        )
+        opcode_samples_dir = (
+            args.opcode_samples_dir[i // 2]
+            if (i // 2) < len(args.opcode_samples_dir)
+            else None
+        )
+        sources.append(
+            collect_source(
+                tracer_dir,
+                cycles_dir,
+                bench_file,
+                used_kinds=used_kinds,
+                opcode_samples_dir=opcode_samples_dir,
+            )
+        )
+
+    # Concatenate samples + cycles per precompile across all sources.
+    aggregated = {}
+    for src in sources:
+        for name, (tracer_samples, cycle_samples) in src.items():
+            agg = aggregated.setdefault(name, ([], []))
+            agg[0].extend(tracer_samples)
+            agg[1].extend(cycle_samples)
+
+    summaries = []
+    seen_precompiles = set(aggregated.keys())
+    for name, (tracer_samples, cycle_samples) in aggregated.items():
+        s = process_precompile(name, tracer_samples, cycle_samples, args.out_dir)
+        if s:
+            summaries.append(s)
+
+    # Report tracer files that had no cycle counterpart across all sources.
+    all_tracer_files = set()
+    for i in range(0, len(args.pairs), 2):
+        for f in _safe_listdir(args.pairs[i]):
+            if f.endswith(".samples"):
+                all_tracer_files.add(f[: -len(".samples")])
+    missing_cycles = sorted(all_tracer_files - seen_precompiles)
+    if missing_cycles:
+        print(
+            f"Note: no cycle data for {len(missing_cycles)} precompile(s): "
+            f"{', '.join(missing_cycles)}",
+            file=sys.stderr,
+        )
+
+    if not args.summary and not args.out_dir:
+        args.summary = True
+
+    if args.summary and summaries:
+        if used_kinds == {"effective"}:
+            cycles_label = "cycles = effective (raw + Blake×16 + BigInt×4 + Keccak×4)"
+        elif used_kinds == {"raw"}:
+            cycles_label = "cycles = raw RISC-V (delegations NOT included)"
+        elif used_kinds == {"raw", "effective"}:
+            cycles_label = "cycles = mixed (some labels lack effective dump; see stderr)"
+            print(
+                "Warning: cycles dump kind mixed across labels — re-run with a"
+                " consistent cycle_marker build to avoid skewed comparisons.",
+                file=sys.stderr,
+            )
+        else:
+            cycles_label = "cycles = (no samples consumed)"
+        summaries.sort(key=lambda s: s["max_cpg"], reverse=True)
+        print(cycles_label)
+        print(
+            f"{'precompile':<22} {'count':>8}"
+            f" {'med c/g':>10} {'p95 c/g':>10} {'p99 c/g':>10} {'max c/g':>10}"
+            f" {'med n/g':>10} {'p95 n/g':>10} {'p99 n/g':>10} {'max n/g':>10}"
+        )
+        print("-" * 120)
+        for s in summaries:
+            print(
+                f"{s['name']:<22} {s['count']:>8}"
+                f" {s['med_cpg']:>10.1f} {s['p95_cpg']:>10.1f} {s['p99_cpg']:>10.1f} {s['max_cpg']:>10.1f}"
+                f" {s['med_npg']:>10.1f} {s['p95_npg']:>10.1f} {s['p99_npg']:>10.1f} {s['max_npg']:>10.1f}"
+            )
+
+    if args.out_dir:
+        print(f"\nPer-execution CSVs written to {args.out_dir}/")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench_scripts/join_samples.py b/bench_scripts/join_samples.py
index 914521b83..dc0d5acaf 100644
--- a/bench_scripts/join_samples.py
+++ b/bench_scripts/join_samples.py
@@ -2,7 +2,19 @@
 
 Reads:
   - <tracer_dir>/<OPCODE>.samples: "gas,native" per line (execution order)
-  - <cycles_dir>/<OPCODE>.cycles: "cycles" per line (execution order)
+  - <cycles_dir>/<OPCODE>.effective.cycles (preferred) or
+    <cycles_dir>/<OPCODE>.cycles (fallback): one cycle count per line in
+    execution order.
+
+The `.effective.cycles` variant includes delegation cost (Blake/BigInt/Keccak)
+using the same coefficients as the block-wide `block_effective` formula in
+`cycle_marker`. Without it, `cycles/gas` reflects raw RISC-V cycles only and
+undercounts opcodes whose handlers delegate (SHA3 → keccak; SLOAD/SSTORE,
+BALANCE/EXTCODE*, SELFBALANCE → Blake via account/storage tree;
+CALL/DELEGATECALL/STATICCALL/CALLCODE and CREATE/CREATE2 → keccak +
+Blake + any inner precompile delegations). The script prefers the
+effective variant and falls back to raw per opcode; the summary header
+names the kind used, and a stderr warning is printed when kinds are mixed.
 
 Since both runs are deterministic, line K in both files corresponds to
 the Kth execution of that opcode.
@@ -18,33 +30,13 @@
 import sys
 import argparse
 
-
-def load_tracer_samples(path):
-    """Load gas,native pairs from .samples file."""
-    samples = []
-    with open(path) as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            parts = line.split(",")
-            samples.append((int(parts[0]), int(parts[1])))
-    return samples
-
-
-def load_cycle_samples(path):
-    """Load cycle values from .cycles file."""
-    samples = []
-    with open(path) as f:
-        for line in f:
-            line = line.strip()
-            if line:
-                samples.append(int(line))
-    return samples
-
-
-def ratio(num, den):
-    return num / den if den > 0 else 0.0
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from benchlib import (  # noqa: E402
+    load_gas_native_samples as load_tracer_samples,
+    load_int_samples as load_cycle_samples,
+    percentile as _benchlib_percentile,
+    ratio,
+)
 
 
 def process_opcode(name, tracer_samples, cycle_samples, out_dir):
@@ -84,10 +76,7 @@ def process_opcode(name, tracer_samples, cycle_samples, out_dir):
     cycles_per_gas_values.sort()
     native_per_gas_values.sort()
 
-    def percentile(sorted_vals, p):
-        # Nearest-rank method: rank = ceil(p/100 * N), 1-indexed
-        rank = max(1, -(-len(sorted_vals) * p // 100))  # ceiling division
-        return sorted_vals[min(rank, len(sorted_vals)) - 1]
+    percentile = _benchlib_percentile
 
     return {
         "name": name,
@@ -111,9 +100,18 @@ def main():
     parser.add_argument("--summary", action="store_true", help="Print summary table")
     args = parser.parse_args()
 
-    # Find opcodes present in both directories
+    # Find opcodes present in both directories. Cycle files come in two
+    # flavors: `<OPCODE>.effective.cycles` (preferred, includes delegations)
+    # and `<OPCODE>.cycles` (raw). Strip both suffixes when computing the
+    # opcode name set.
     tracer_opcodes = {f.replace(".samples", "") for f in os.listdir(args.tracer_dir) if f.endswith(".samples")}
-    cycle_opcodes = {f.replace(".cycles", "") for f in os.listdir(args.cycles_dir) if f.endswith(".cycles")}
+    cycle_files = set(os.listdir(args.cycles_dir))
+    cycle_opcodes = set()
+    for f in cycle_files:
+        if f.endswith(".effective.cycles"):
+            cycle_opcodes.add(f[: -len(".effective.cycles")])
+        elif f.endswith(".cycles"):
+            cycle_opcodes.add(f[: -len(".cycles")])
     common = sorted(tracer_opcodes & cycle_opcodes)
 
     if not common:
@@ -121,9 +119,16 @@ def main():
         sys.exit(1)
 
     summaries = []
+    used_kinds = set()
     for name in common:
         tracer_path = os.path.join(args.tracer_dir, f"{name}.samples")
-        cycles_path = os.path.join(args.cycles_dir, f"{name}.cycles")
+        effective_file = f"{name}.effective.cycles"
+        if effective_file in cycle_files:
+            cycles_path = os.path.join(args.cycles_dir, effective_file)
+            used_kinds.add("effective")
+        else:
+            cycles_path = os.path.join(args.cycles_dir, f"{name}.cycles")
+            used_kinds.add("raw")
 
         tracer_samples = load_tracer_samples(tracer_path)
         cycle_samples = load_cycle_samples(cycles_path)
@@ -136,8 +141,22 @@ def main():
         args.summary = True
 
     if args.summary and summaries:
+        if used_kinds == {"effective"}:
+            cycles_label = "cycles = effective (raw + Blake×16 + BigInt×4 + Keccak×4)"
+        elif used_kinds == {"raw"}:
+            cycles_label = "cycles = raw RISC-V (delegations NOT included)"
+        elif used_kinds == {"raw", "effective"}:
+            cycles_label = "cycles = mixed (some opcodes lack effective dump; see stderr)"
+            print(
+                "Warning: cycles dump kind mixed across opcodes — re-run with a"
+                " consistent cycle_marker build to avoid skewed comparisons.",
+                file=sys.stderr,
+            )
+        else:
+            cycles_label = "cycles = (no samples consumed)"
         # Sort by worst-case cycles/gas
         summaries.sort(key=lambda s: s["max_cpg"], reverse=True)
+        print(cycles_label)
         print(f"{'opcode':<16} {'count':>8}"
               f" {'med c/g':>8} {'p95 c/g':>8} {'p99 c/g':>8} {'max c/g':>8}"
               f" {'med n/g':>8} {'p95 n/g':>8} {'p99 n/g':>8} {'max n/g':>8}")
diff --git a/cycle_marker/src/lib.rs b/cycle_marker/src/lib.rs
index f4d2c29c2..d9037b29b 100644
--- a/cycle_marker/src/lib.rs
+++ b/cycle_marker/src/lib.rs
@@ -353,14 +353,46 @@ pub fn print_cycle_markers(cm: CycleMarker) -> CycleMarkerResults {
     let mut marker_map: HashMap<(&'static str, u64), (Mark, Mark)> = HashMap::new();
     let mut start_counts: HashMap<(&'static str, u64), Mark> = HashMap::new();
 
+    // Effective-cycle weighting for a single marker delta. Used both for
+    // per-execution opcode samples and per-execution label samples so that
+    // cycles/gas analysis reflects true prover cost rather than raw RISC-V
+    // cycles only.
+    let effective_of = |diff: &Mark| -> u64 {
+        diff.cycles
+            + BLAKE_DELEGATION_COEFF
+                * diff
+                    .delegations
+                    .get(&BLAKE_DELEGATION_ID)
+                    .cloned()
+                    .unwrap_or_default()
+            + BIGINT_DELEGATION_COEFF
+                * diff
+                    .delegations
+                    .get(&BIGINT_DELEGATION_ID)
+                    .cloned()
+                    .unwrap_or_default()
+            + KECCAK_DELEGATION_COEFF
+                * diff
+                    .delegations
+                    .get(&KECCAK_DELEGATION_ID)
+                    .cloned()
+                    .unwrap_or_default()
+    };
+
     // Opcode-level markers: aggregate by label name.
     // OpcodeStart/OpcodeEnd pairs are always adjacent (leaf-level, no nesting).
+    // `samples` stores raw RISC-V cycles per execution; `samples_effective`
+    // stores raw + delegation cost. Aggregate counters (total/min/max/median)
+    // run over raw to preserve the long-standing `.bench` table semantics —
+    // the effective values are only consumed via the per-execution dump in
+    // `OPCODE_CYCLE_SAMPLES_DIR`.
     struct OpcodeAcc {
         total: u64,
         count: u64,
         min: u64,
         max: u64,
         samples: Vec<u64>,
+        samples_effective: Vec<u64>,
     }
     impl OpcodeAcc {
         fn new() -> Self {
@@ -370,14 +402,16 @@ pub fn print_cycle_markers(cm: CycleMarker) -> CycleMarkerResults {
                 min: u64::MAX,
                 max: 0,
                 samples: Vec::new(),
+                samples_effective: Vec::new(),
             }
         }
-        fn record(&mut self, cycles: u64) {
+        fn record(&mut self, cycles: u64, effective: u64) {
             self.total += cycles;
             self.count += 1;
             self.min = self.min.min(cycles);
             self.max = self.max.max(cycles);
             self.samples.push(cycles);
+            self.samples_effective.push(effective);
         }
         fn median(&mut self) -> u64 {
             if self.samples.is_empty() {
@@ -408,10 +442,11 @@ pub fn print_cycle_markers(cm: CycleMarker) -> CycleMarkerResults {
             Label::OpcodeEnd(name) => {
                 if let Some(start_mark) = pending_opcode_start.take() {
                     let diff = mark.diff(&start_mark);
+                    let effective = effective_of(&diff);
                     opcode_aggregated
                         .entry(name)
                         .or_insert_with(OpcodeAcc::new)
-                        .record(diff.cycles);
+                        .record(diff.cycles, effective);
                 }
             }
             Label::Start(name) => {
@@ -441,6 +476,25 @@ pub fn print_cycle_markers(cm: CycleMarker) -> CycleMarkerResults {
         .collect();
     markers.sort_by_key(|(_, (start, _))| start.cycles);
 
+    // Collect per-label samples for the LABEL_CYCLE_SAMPLES_DIR dump.
+    // Each (label, (start, end)) in `markers` corresponds to one execution
+    // of a Start/End label pair; cycles = end - start. Effective-cycle
+    // value is computed via `effective_of` defined above.
+    let mut label_cycle_samples: HashMap<&'static str, Vec<u64>> = HashMap::new();
+    let mut label_effective_samples: HashMap<&'static str, Vec<u64>> = HashMap::new();
+    for (label, (start, end)) in &markers {
+        let diff = end.diff(start);
+        let effective = effective_of(&diff);
+        label_cycle_samples
+            .entry(*label)
+            .or_default()
+            .push(diff.cycles);
+        label_effective_samples
+            .entry(*label)
+            .or_default()
+            .push(effective);
+    }
+
     let mut block_effective: Option<u64> = None;
 
     for (label, (start, end)) in markers {
@@ -478,19 +532,75 @@ pub fn print_cycle_markers(cm: CycleMarker) -> CycleMarkerResults {
         cm.delegation_counter
     ));
 
-    // Dump per-execution cycle samples if requested via env var
+    // Dump per-execution cycle samples if requested via env var. Writes
+    // two files per opcode (same layout as the label dump below):
+    //   `<dir>/<OPCODE>.cycles`           — raw RISC-V cycles per execution
+    //   `<dir>/<OPCODE>.effective.cycles` — raw + delegation weights
+    // Caller contract: clean the dir before the first invocation (e.g. with
+    // `rm -rf`) so that stale files from a previous run are not mixed in.
+    // This block APPENDS to existing files so that multi-block test runs
+    // accumulate all samples rather than keeping only the last block's data.
     if let Ok(dir) = std::env::var("OPCODE_CYCLE_SAMPLES_DIR") {
         let dir = std::path::Path::new(&dir);
         std::fs::create_dir_all(dir).expect("Failed to create cycle samples dir");
+        let append = |path: std::path::PathBuf, samples: &[u64]| {
+            let mut f = std::fs::OpenOptions::new()
+                .create(true)
+                .append(true)
+                .open(path)
+                .expect("Failed to open cycle samples file for append");
+            use std::io::Write;
+            for &c in samples {
+                writeln!(f, "{}", c).expect("Failed to write cycle sample");
+            }
+        };
         for (name, acc) in &opcode_aggregated {
             if acc.samples.is_empty() {
                 continue;
             }
-            let path = dir.join(format!("{}.cycles", name));
-            let mut f = std::fs::File::create(path).expect("Failed to create cycle samples file");
+            append(dir.join(format!("{}.cycles", name)), &acc.samples);
+            append(
+                dir.join(format!("{}.effective.cycles", name)),
+                &acc.samples_effective,
+            );
+        }
+    }
+
+    // Dump per-execution non-opcode label samples if requested via env var.
+    // Writes two files per label:
+    //   `<dir>/<label>.cycles`           — raw RISC-V cycles per execution
+    //   `<dir>/<label>.effective.cycles` — effective cycles per execution
+    //                                       (raw + Blake/BigInt/Keccak weights,
+    //                                       same formula as block_effective)
+    // One value per line, execution (start-cycle) order. Mirrors
+    // OPCODE_CYCLE_SAMPLES_DIR's format so the same join scripts can consume
+    // it. Effective values let cycles/gas analysis reflect prover cost
+    // including delegation work (which raw RISC-V cycles do not capture).
+    // Caller contract: clean the dir before the first invocation so that stale
+    // files from a previous run are not mixed in. This block APPENDS so that
+    // multi-block test runs accumulate all samples rather than keeping only the
+    // last block's data.
+    if let Ok(dir) = std::env::var("LABEL_CYCLE_SAMPLES_DIR") {
+        let dir = std::path::Path::new(&dir);
+        std::fs::create_dir_all(dir).expect("Failed to create label cycle samples dir");
+        let append = |path: std::path::PathBuf, samples: &[u64]| {
+            let mut f = std::fs::OpenOptions::new()
+                .create(true)
+                .append(true)
+                .open(path)
+                .expect("Failed to open label cycle samples file for append");
             use std::io::Write;
-            for &c in &acc.samples {
-                writeln!(f, "{}", c).expect("Failed to write cycle sample");
+            for &c in samples {
+                writeln!(f, "{}", c).expect("Failed to write label cycle sample");
+            }
+        };
+        for (name, samples) in &label_cycle_samples {
+            if samples.is_empty() {
+                continue;
+            }
+            append(dir.join(format!("{}.cycles", name)), samples);
+            if let Some(eff) = label_effective_samples.get(name) {
+                append(dir.join(format!("{}.effective.cycles", name)), eff);
             }
         }
     }
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
index 251fef994..35a7ee1ab 100644
--- a/docs/benchmarking.md
+++ b/docs/benchmarking.md
@@ -1,253 +1,132 @@
-# Benchmarking Guide
+# Benchmarking Reference
 
-## Overview
+## Metric
 
-ZKsync OS performance is measured in **effective RISC-V cycles**, not wall-clock time. The proving cost is directly proportional to cycle count. A reduction in effective cycles = cheaper proving.
+Proving cost is proportional to **effective RISC-V cycles**:
 
 ```
 effective_cycles = raw_risc_v_cycles
-                 + 16 × blake_delegations
-                 + 4  × bigint_delegations
-                 + 4  × keccak_delegations
-```
-
-The repository currently uses two closely related metrics:
-- `cycle_marker::print_cycle_markers()` and `zksync_os_runner::Runner::run()` (returning `block_effective`) use the formula above.
-- `bench_scripts/compare_bench.py` derives its `Eff` column from `.bench` files using the same Blake/BigInt/Keccak weights, and also adds `+1` for every other delegation type recorded in the marker output.
-
-## Quick Start
-
-Use `bench_scripts/bench.sh` to run benchmarks. All subcommands that run benchmarks automatically rebuild the RISC-V binary first.
-
-**Save a baseline** (do this once on the base branch):
-```bash
-bench_scripts/bench.sh baseline
-```
-
-**Quick check after making changes** (runs 1 block, compares against baseline):
-```bash
-bench_scripts/bench.sh quick
-```
-
-**Full benchmark run** (all blocks + precompiles):
-```bash
-bench_scripts/bench.sh run
-```
-
-**Compare full results against baseline:**
-```bash
-bench_scripts/bench.sh compare
-```
-
-**Generate a flamegraph** (identifies where RISC-V cycles are spent — use to find optimization targets). Produces both an SVG and a text summary (`.txt`) with self-cost and call stacks, suitable for automated analysis:
-```bash
-bench_scripts/bench.sh flamegraph              # default: bench_results/flamegraph.svg + .txt
-bench_scripts/bench.sh flamegraph output.svg   # custom path (text summary at output.txt)
-```
-
-Results are saved to `bench_results/` (gitignored). Negative % in effective cycles = improvement.
-
-## Prerequisites
-
-One-time setup (in addition to the standard Rust toolchain):
-
-```bash
-cargo install cargo-binutils --locked
-cargo install cargo-airbender --git https://github.com/matter-labs/airbender-platform --locked
-pip3 install matplotlib   # only needed for opcode frequency charts
-```
-
-## Interpreting Results
-
-The comparison output is a markdown table with columns:
-- **Base/Head Eff** — effective cycles (primary metric). Negative % = improvement.
-- **Base/Head Raw** — raw RISC-V cycles excluding delegations.
-- **Base/Head Blake** — number of Blake2 delegation calls.
-- **Base/Head Bigint** — number of BigInt delegation calls.
-- **Base/Head Keccak** — number of Keccak delegation calls.
-
-`Base/Head Eff` is the comparison-script metric described above. Focus on that column when comparing two `.bench` files, and keep in mind it is slightly broader than the simulator-returned block effective value.
-
-## How It Works
-
-### Cycle Marker Framework
-
-The `cycle_marker` crate provides macros to instrument code:
-
-```rust
-cycle_marker::wrap!("my_label", { /* measured code */ });
-// or
-cycle_marker::start!("my_label");
-// ... code ...
-cycle_marker::end!("my_label");
-```
-
-On RISC-V, these write to CSR `0x7ff`, signaling the simulator to record cycle counts. On the host (forward mode), labels are collected in thread-local storage for later pairing with simulator data.
-
-The block-wide marker is `"process_block"` — this is what produces the overall effective cycle count.
-
-### Feature Flags
-
-| Feature | Scope | Effect |
-|---------|-------|--------|
-| `cycle_marker` | Multiple crates | Activates cycle measurement markers |
-| `unlimited_native` | `basic_bootloader`, `forward_system` | Disables native resource limits so benchmarks don't hit gas ceilings |
-| `benchmarking` | `proof_running_system`, `zksync_os` | Convenience: enables both `cycle_marker` + `unlimited_native` |
-| `rig/no_print` | Test rig | Suppresses verbose execution logs |
-
-### Benchmark Data Flow
-
-1. Build RISC-V binary with `benchmarking` feature enabled
-2. Run block replay through RISC-V simulator (`zksync_os_runner`)
-3. Simulator records cycle counts at each CSR marker
-4. `cycle_marker::print_cycle_markers()` computes effective cycles for the block-wide marker using Blake/BigInt delegation weights
-5. Results written to file at `MARKER_PATH` (default: `markers.bench`)
-6. Python scripts parse and compare `.bench` files, adding `+1` for other delegation types in the comparison report
-
-### Output File Format
-
-The `.bench` files produced by `cycle_marker` contain sections like:
-
-```
-=== Cycle markers:
-process_block: net cycles: 12345678, net delegations: {1991: 100, 1994: 200}
-some_inner_label: net cycles: 456789, net delegations: {1991: 50}
-Total delegations: {1991: 100, 1994: 200}
-==================
-```
-
-Delegation IDs: `1991` = Blake2, `1994` = BigInt, `1995` = Keccak.
-
-## Manual Commands
-
-The `bench_scripts/bench.sh` script wraps these commands. Use them directly only if you need finer control.
-
-### Build the RISC-V Benchmarking Binary
-
-```bash
-cd zksync_os && ./dump_bin.sh --type evm-replay-benchmarking
-```
-
-### Run a Single Block
-
-```bash
-ZKSYNC_RISC_V_RUN=true \
-MARKER_PATH=$(pwd)/result.bench \
-cargo run --manifest-path tests/instances/eth_runner/Cargo.toml \
-  --release -j 3 \
-  --features rig/no_print,rig/cycle_marker,rig/unlimited_native \
-  -- single-run --block-dir tests/instances/eth_runner/blocks/19299001 \
-  --opcode-stats \
-  > result.out
-```
-
-Omit `--opcode-stats` when only block-level cycle benchmarks are needed — it adds per-opcode tracing overhead.
-
-Available blocks: `19299001`, `22244135`, `23292836` (in `tests/instances/eth_runner/blocks/`).
-
-### Run Precompile Benchmarks
-
-```bash
-ZKSYNC_RISC_V_RUN=true \
-MARKER_PATH=$(pwd)/precompiles.bench \
-cargo test --release -j 3 \
-  --features rig/no_print,precompiles/cycle_marker,rig/unlimited_native \
-  -p precompiles -- test_precompiles
-```
-
-### Compare Results
-
-```bash
-python3 bench_scripts/compare_bench.py \
-  '[("block_19299001", "base.bench", "head.bench", "process_block")]'
-```
-
-### Generate Flamegraph
-
-```bash
-ZKSYNC_RISC_V_RUN=true \
-cargo run --manifest-path tests/instances/eth_runner/Cargo.toml \
-  --release -j 3 \
-  --features rig/no_print,rig/cycle_marker,rig/unlimited_native \
-  -- single-run --block-dir tests/instances/eth_runner/blocks/19299001 \
-  --flamegraph block_19299001.svg
-
-# Convert SVG to text summary (self-cost + call stacks)
-python3 bench_scripts/parse_flamegraph.py block_19299001.svg block_19299001.txt
-```
-
-### Parse Opcode Statistics
-
-```bash
-python3 bench_scripts/parse_opcodes.py result.out opcodes.csv opcodes.png
-```
-
-### Per-Opcode Benchmarking
-
-The benchmark flow collects per-opcode gas, native resource, and RISC-V cycle stats. The forward-mode run uses `EvmOpcodeStatsTracer` (enabled via `--opcode-stats`) to record gas/native per opcode execution (with min/max/median). The RISC-V run records per-opcode cycles via `cycle_marker` opcode markers.
-
-**Quick run with all data:**
-```bash
-OPCODE_SAMPLES_DIR=$(pwd)/samples \
-OPCODE_CYCLE_SAMPLES_DIR=$(pwd)/cycle_samples \
-OPCODE_STATS_PATH=$(pwd)/opcode_stats.csv \
-bash bench_scripts/bench.sh quick
-```
-
-The `.out` file contains the per-opcode stats table (gas/native with min/max/median). The `.bench` file contains per-opcode cycle stats. Setting the env vars also dumps per-execution sample files for detailed analysis.
-
-**Join per-execution samples to get actual cycles/gas ratios:**
-```bash
-python3 bench_scripts/join_samples.py samples/ cycle_samples/ --summary --out-dir joined/
-```
-
-Produces per-execution `(gas, native, cycles, cycles/gas, native/gas)` CSVs per opcode and a summary table with p50/p95/p99/max ratios.
-
-**Visualize:**
-```bash
-python3 bench_scripts/visualize_opcode_stats.py joined/ --out-dir charts/
-```
-
-Produces: total cycle consumption bar chart, sorted cycles/gas ratio curves per opcode, and per-opcode detail plots with percentile annotations.
-
-**Compare per-opcode stats between base and head:**
-```bash
-python3 bench_scripts/compare_opcode_stats.py base_block.out head_block.out "label"
-```
-
-Outputs a compact diff table only when gas/native stats change. Used by CI to add opcode stats diffs to PR comments.
-
-## CI Integration
-
-The CI workflow (`.github/workflows/bench.yml`) runs the full comparison automatically on every PR. It:
-1. Checks out the merge-base of the PR
-2. Builds RISC-V binaries and runs all block + precompile benchmarks
-3. Checks out the PR branch and repeats
-4. Posts a comparison table as a PR comment (cycle benchmarks + per-opcode stats diff if changed)
-
-## Important Notes
-
-- Always rebuild the RISC-V binary (`dump_bin.sh`) after code changes — the binary is a static artifact. The `bench_scripts/bench.sh` script does this automatically.
-- `unlimited_native` must be enabled for benchmarking to prevent transactions from hitting native gas limits mid-block.
-- The `for-tests` binary type is for functional tests; `evm-replay-benchmarking` is for performance measurement.
-- Results are deterministic for the same binary + input. No need for multiple runs or statistical averaging.
-
-## Key Files
+                 + 16 × blake_delegations    (id 1991)
+                 + 4  × bigint_delegations   (id 1994)
+                 + 4  × keccak_delegations   (id 1995)
+```
+
+`cycle_marker::print_cycle_markers()` computes this for the `process_block`
+label. `bench_scripts/compare_bench.py`'s `Eff` column uses the same
+weights and additionally adds `+1` per delegation of any other id. The
+formula is the source of truth — when in doubt re-read
+`cycle_marker/src/lib.rs::print_cycle_markers`.
+
+Results are deterministic for the same RISC-V binary + input — no
+averaging needed. Always rebuild the binary after touching any code that
+ends up in `zksync_os` (`zksync_os/dump_bin.sh --type <type>`);
+`bench_scripts/bench.sh` does this automatically.
+
+## Running
+
+`bench_scripts/bench.sh` wraps the full pipeline. Subcommands:
+`baseline`, `quick`, `run`, `compare`, `flamegraph`. Read the script for
+exact invocations. `cargo test --features rig/no_print` is preferred over
+the underlying `cargo test` invocations when running tests directly.
+
+For local proof-mode simulation set `ZKSYNC_RISC_V_RUN=true`; CI sets it
+automatically.
+
+## Data pipeline (env-var opt-ins)
+
+The benchmark produces several artifacts, gated by env vars on the
+forward-mode run (block bench or `precompiles` test crate):
+
+| Env var | Producer | File layout |
+|---|---|---|
+| `MARKER_PATH` | `cycle_marker::print_cycle_markers` | `<path>.bench` — text format: `<label>: net cycles: <n>, net delegations: {id: count}` per marker, plus a global `Total delegations`. |
+| `OPCODE_STATS_PATH` | `EvmOpcodeStatsTracer` | CSV: per-opcode gas + native stats with min/median/avg/max. |
+| `OPCODE_SAMPLES_DIR` | `EvmOpcodeStatsTracer::dump_samples` | One `<OPCODE>.samples` file per opcode, `gas,native` per line in execution order. |
+| `OPCODE_CYCLE_SAMPLES_DIR` | `cycle_marker` | One `<OPCODE>.cycles` file per opcode, raw RISC-V cycles per line in execution order. |
+| `PRECOMPILE_STATS_PATH` | `PrecompileStatsTracer` | CSV: `name, address, count, avg_gas, median_gas, min_gas, max_gas, avg_native, median_native, min_native, max_native, native_per_gas`. |
+| `PRECOMPILE_SAMPLES_DIR` | `PrecompileStatsTracer::dump_samples` | One `<precompile>.samples` file per precompile, `gas,native` per line. |
+| `LABEL_CYCLE_SAMPLES_DIR` | `cycle_marker` | Per non-opcode label: `<label>.cycles` (raw) **and** `<label>.effective.cycles` (raw + delegation weights). |
+
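+Sample/cycle dump dirs use **append** mode (exception: `PrecompileStatsTracer::dump_samples` truncates the
+files it writes but leaves other files in place) — clean the dir between runs (`rm -rf`) to avoid mixing.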
+
+### Effective vs raw cycles
+
+Per-execution `<label>.cycles` files store raw RISC-V cycles only and
+**undercount delegation-heavy work**. `<label>.effective.cycles` (same
+formula as the `process_block` metric) is the correct input for
+cycles/gas analysis of any label whose handler delegates (precompiles
+like `ecrecover`/`modexp`/`bn254`, the `keccak` system function call,
+account/storage-touching paths).
+
+Opcode samples in `OPCODE_CYCLE_SAMPLES_DIR` currently hold raw cycles only;
+opcodes whose handlers delegate (`SHA3`, `SLOAD`/`SSTORE`,
+`BALANCE`/`EXTCODE*`, `CALL` family, `CREATE`/`CREATE2`) are similarly
+undercounted in `join_samples.py` output.
+
+### Ecrecover intrinsic filter
+
+Every L2 transaction invokes `ecrecover` internally for signature
+verification. `bench_scripts/join_precompile_samples.py --bench-file`
+strips the first `ecrecover` cycle marker per `process_transaction`
+boundary and keeps only subsequent (precompile-target) ecrecovers.
+
+This positional heuristic assumes that every tx has **exactly one** intrinsic
+ecrecover before any user code. It holds for the current mainnet block
+fixtures; does **not** hold for L1→L2 priority ops, EIP-7702 set-code
+authority recovery, or `eth_call`. Replace with a dedicated marker label
+(`ecrecover_intrinsic`) before adding fixtures that violate the
+assumption.
+
+## Comparison scripts
+
+- `compare_bench.py` — base/head `.bench` diff; produces the headline
+  effective-cycles table.
+- `compare_opcode_stats.py` — diff per-opcode gas/native stats.
+- `compare_opcode_cycles.py` — diff per-opcode RISC-V cycles + cycles/gas
+  ratios.
+- `compare_precompile_stats.py` — diff per-precompile gas/native stats;
+  emits a head-only spoiler when base lacks instrumentation.
+- `join_samples.py` — per-opcode per-execution join (gas,native,cycles).
+- `join_precompile_samples.py` — per-precompile per-execution join;
+  prefers `<label>.effective.cycles` and falls back to raw with a stderr
+  note + summary header indicating which kind was used.
+- `cycles_per_native_report.py` — local-only ad-hoc tool. Given one or
+  more `(samples_dir, cycles_dir)` pairs from prior bench runs,
+  computes per-execution `cycles / native` ratios per opcode and per
+  precompile and writes a Markdown report (median / p95 / max). Useful
+  for spotting opcodes or precompiles whose native budget is out of
+  step with their cycle cost. Not wired into the CI comment.
+
+## CI
+
+`.github/workflows/bench.yml` runs the full pipeline on each PR:
+checkout merge-base → bench-base, checkout head → bench-head, then
+the `compare` step composes a comparison comment from the
+`compare_*` and `join_*` script outputs. Script failures are surfaced
+via explicit `_… failed; see CI logs._` markers in the PR comment
+rather than silently dropping tables.
+
+## Key files
 
 | Path | Description |
 |------|-------------|
-| `bench_scripts/bench.sh` | Convenience script for running benchmarks |
-| `cycle_marker/src/lib.rs` | Cycle marker macros and effective cycle calculation |
-| `zksync_os/dump_bin.sh` | RISC-V binary build script with type selection |
-| `zksync_os_runner/src/lib.rs` | RISC-V simulator runner, returns effective cycles |
-| `tests/instances/eth_runner/` | Real Ethereum block replay binary |
-| `tests/instances/eth_runner/blocks/` | Benchmark block fixtures |
-| `tests/instances/precompiles/` | Precompile benchmark tests |
-| `bench_scripts/compare_bench.py` | Compares base vs head `.bench` files |
-| `bench_scripts/parse_flamegraph.py` | Converts flamegraph SVG to text summary with self-cost and call stacks |
-| `bench_scripts/parse_opcodes.py` | Parses opcode frequency from simulator output |
-| `bench_scripts/compare_opcode_stats.py` | Compares per-opcode gas/native stats between base and head |
-| `bench_scripts/join_samples.py` | Joins per-execution tracer + cycle samples, computes ratios |
-| `bench_scripts/visualize_opcode_stats.py` | Generates charts from joined per-execution data |
-| `forward_system/src/system/tracers/evm_opcode_stats.rs` | Per-opcode gas/native stats tracer |
-| `.github/workflows/bench.yml` | CI benchmarking pipeline |
+| `cycle_marker/src/lib.rs` | Cycle marker macros, effective-cycle formula, per-execution dumps |
+| `zksync_os/dump_bin.sh` | RISC-V binary build script; `--type` selects feature combo |
+| `zksync_os_runner/src/lib.rs` | RISC-V simulator runner |
+| `bench_scripts/bench.sh` | Convenience wrapper for end-to-end runs |
+| `bench_scripts/compare_bench.py` | base/head `.bench` cycles diff |
+| `bench_scripts/compare_opcode_stats.py` | base/head opcode gas/native diff |
+| `bench_scripts/compare_opcode_cycles.py` | base/head opcode cycles + cycles/gas diff |
+| `bench_scripts/compare_precompile_stats.py` | base/head precompile gas/native diff |
+| `bench_scripts/cycles_per_native_report.py` | local-only `cycles/native` per-opcode + per-precompile ratio report (median / p95 / max) |
+| `bench_scripts/join_samples.py` | Per-opcode per-execution join |
+| `bench_scripts/join_precompile_samples.py` | Per-precompile per-execution join (effective-preferring) |
+| `bench_scripts/parse_flamegraph.py` | Flamegraph SVG → text summary |
+| `bench_scripts/visualize_opcode_stats.py` | Charts from joined per-execution data |
+| `forward_system/src/system/tracers/evm_opcode_stats.rs` | Per-opcode gas/native tracer |
+| `forward_system/src/system/tracers/precompile_stats.rs` | Per-precompile gas/native tracer |
+| `forward_system/src/system/tracers/pair.rs` | Combinator for running two tracers together |
+| `tests/instances/eth_runner/` | Block replay binary; consumes blocks from `blocks/` |
+| `tests/instances/precompiles/` | Precompile benchmark test crate |
+| `.github/workflows/bench.yml` | CI pipeline |
diff --git a/evm_interpreter/src/instructions/system.rs b/evm_interpreter/src/instructions/system.rs
index c6158591d..e7a630999 100644
--- a/evm_interpreter/src/instructions/system.rs
+++ b/evm_interpreter/src/instructions/system.rs
@@ -16,46 +16,76 @@ impl<S: EthereumLikeTypes> Interpreter<'_, S> {
     ]);
 
     pub fn sha3(&mut self, system: &mut System<S>) -> InstructionResult {
-        let (memory_offset, len) = self.stack.pop_2()?;
-
-        let len = Self::cast_to_usize(&len, EvmError::InvalidOperandOOG.into())?;
-        self.gas.spend_gas_and_native(0, KECCAK256_NATIVE_COST)?;
-
-        let hash = if len == 0 {
-            self.gas.spend_gas(gas_constants::SHA3)?;
-            Self::EMPTY_SLICE_SHA3
-        } else {
-            let memory_offset =
-                Self::cast_to_usize(&memory_offset, EvmError::InvalidOperandOOG.into())?;
-
-            self.resize_heap(memory_offset, len)?;
-
-            let allocator = system.get_allocator();
-            let input = &self.heap[memory_offset..(memory_offset + len)];
-
-            let mut dst = U256Builder::default();
-            S::SystemFunctions::keccak256(&input, &mut dst, self.gas.resources_mut(), allocator)
-                .map_err(SystemError::from)?;
-
-            let hash_ruint = dst.build();
-
-            if Self::PRINT_OPCODES {
-                use core::fmt::Write;
-                use zk_ee::logger_log;
-                use zk_ee::system::logger::Logger;
-                let mut logger = system.get_logger();
-                let input = &self.heap()[memory_offset..(memory_offset + len)];
-                let input_iter = input.iter().copied();
-                logger_log!(logger, " input: ",);
-                let _ = logger.log_data(input_iter);
-                logger_log!(logger, " -> 0x{hash_ruint:0x}");
-            }
+        // Wrap the whole dispatch — including the early stack/length/base-cost
+        // checks that may short-circuit via `?` — so the marker fires once per
+        // SHA3 opcode dispatch, matching `EvmOpcodeStatsTracer`'s per-dispatch
+        // sample count. Positional pairing in `cycles_per_native_report.py` /
+        // `join_precompile_samples.py` relies on this 1:1 correspondence. The
+        // inner `"keccak"` marker (from `Keccak256Impl::execute`) still fires
+        // for the system-function call, so bootloader/intrinsic keccak
+        // invocations remain attributed to `"keccak"` alone.
+        //
+        // `wrap!` (markers only) is used rather than `wrap_with_resources!`:
+        // SHA3 gas/native are already captured by `EvmOpcodeStatsTracer`, so
+        // the per-call resource diff would be redundant.
+        cycle_marker::wrap!("keccak_execution_environment", {
+            let (memory_offset, len) = self.stack.pop_2()?;
+            let len = Self::cast_to_usize(&len, EvmError::InvalidOperandOOG.into())?;
+            self.gas.spend_gas_and_native(0, KECCAK256_NATIVE_COST)?;
+
+            // Eagerly cast `memory_offset` to an owned `usize` so the
+            // `&memory_offset` borrow on `self.stack` ends here and does not collide
+            // with `self.stack.push(&hash)` below; `None` marks the `len == 0` case.
+            let memory_offset_usize: Option<usize> = if len > 0 {
+                Some(Self::cast_to_usize(
+                    &memory_offset,
+                    EvmError::InvalidOperandOOG.into(),
+                )?)
+            } else {
+                None
+            };
+
+            let hash = match memory_offset_usize {
+                None => {
+                    self.gas.spend_gas(gas_constants::SHA3)?;
+                    Self::EMPTY_SLICE_SHA3
+                }
+                Some(memory_offset) => {
+                    self.resize_heap(memory_offset, len)?;
+
+                    let allocator = system.get_allocator();
+                    let input = &self.heap[memory_offset..(memory_offset + len)];
+
+                    let mut dst = U256Builder::default();
+                    S::SystemFunctions::keccak256(
+                        &input,
+                        &mut dst,
+                        self.gas.resources_mut(),
+                        allocator,
+                    )
+                    .map_err(SystemError::from)?;
+
+                    let hash_ruint = dst.build();
+
+                    if Self::PRINT_OPCODES {
+                        use core::fmt::Write;
+                        use zk_ee::logger_log;
+                        use zk_ee::system::logger::Logger;
+                        let mut logger = system.get_logger();
+                        let input = &self.heap()[memory_offset..(memory_offset + len)];
+                        let input_iter = input.iter().copied();
+                        logger_log!(logger, " input: ",);
+                        let _ = logger.log_data(input_iter);
+                        logger_log!(logger, " -> 0x{hash_ruint:0x}");
+                    }
 
-            // Convert ruint::aliases::U256 to u256::U256
-            U256::from(hash_ruint)
-        };
+                    // Convert ruint::aliases::U256 to u256::U256
+                    U256::from(hash_ruint)
+                }
+            };
 
-        self.stack.push(&hash)
+            self.stack.push(&hash)
+        })
     }
 
     pub fn address(&mut self) -> InstructionResult {
diff --git a/forward_system/Cargo.toml b/forward_system/Cargo.toml
index d278e9ac1..60f97490f 100644
--- a/forward_system/Cargo.toml
+++ b/forward_system/Cargo.toml
@@ -66,3 +66,6 @@ evm_tester_pectra = ["evm_tester", "pectra"]
 # Features used for legacy evm_tester. Only defined in forward system,
 # as the legacy evm_tester does not perform a proof run.
 evm_tester_legacy = ["basic_bootloader/resources_for_tester", "basic_bootloader/disable_system_contracts", "zk_ee/prevrandao", "basic_bootloader/eip-7702", "basic_bootloader/burn_base_fee", "system_hooks/mock-unsupported-precompiles", "unlimited_native"]
+
+[dev-dependencies]
+tempfile = "3"
diff --git a/forward_system/src/system/tracers/mod.rs b/forward_system/src/system/tracers/mod.rs
index e005f6bae..496ae44f8 100644
--- a/forward_system/src/system/tracers/mod.rs
+++ b/forward_system/src/system/tracers/mod.rs
@@ -1,3 +1,5 @@
 pub mod call_tracer;
 pub mod evm_opcode_stats;
 pub mod evm_opcodes_logger;
+pub mod pair;
+pub mod precompile_stats;
diff --git a/forward_system/src/system/tracers/pair.rs b/forward_system/src/system/tracers/pair.rs
new file mode 100644
index 000000000..b135d384b
--- /dev/null
+++ b/forward_system/src/system/tracers/pair.rs
@@ -0,0 +1,204 @@
+//! Composite tracer that forwards every hook to two inner tracers.
+//!
+//! Use this to install multiple stats tracers on the same execution path
+//! (e.g. EvmOpcodeStatsTracer + PrecompileStatsTracer in eth_runner).
+
+use zk_ee::{
+    execution_environment_type::ExecutionEnvironmentType,
+    system::{
+        evm::{EvmError, EvmFrameInterface},
+        tracer::{evm_tracer::EvmTracer, Tracer},
+        CallResult, EthereumLikeTypes, ExecutionEnvironmentLaunchParams, SystemTypes,
+    },
+    types_config::SystemIOTypesConfig,
+};
+
+/// Holds two tracers; every hook is forwarded to `a` first, then to `b`.
+pub struct Pair<A, B> {
+    pub a: A,
+    pub b: B,
+}
+impl<A, B> Pair<A, B> {
+    pub fn new(a: A, b: B) -> Pair<A, B> {
+        Pair { a, b }
+    }
+}
+
+/// Fan-out `Tracer`: every hook is forwarded to `a` first and then to `b`.
+impl<S: EthereumLikeTypes, A: Tracer<S>, B: Tracer<S>> Tracer<S> for Pair<A, B> {
+    /// Returns the pair itself, which implements `EvmTracer<S>` as well.
+    #[inline(always)]
+    fn evm_tracer(&mut self) -> &mut impl EvmTracer<S> {
+        self
+    }
+
+    fn on_new_execution_frame(&mut self, params: &ExecutionEnvironmentLaunchParams<S>) {
+        let Pair { a, b } = self;
+        a.on_new_execution_frame(params);
+        b.on_new_execution_frame(params);
+    }
+
+    fn after_execution_frame_completed(&mut self, result: Option<(&S::Resources, &CallResult<S>)>) {
+        let Pair { a, b } = self;
+        a.after_execution_frame_completed(result);
+        b.after_execution_frame_completed(result);
+    }
+
+    fn begin_tx(&mut self, calldata: &[u8]) {
+        let Pair { a, b } = self;
+        a.begin_tx(calldata);
+        b.begin_tx(calldata);
+    }
+
+    fn finish_tx(&mut self) {
+        let Pair { a, b } = self;
+        a.finish_tx();
+        b.finish_tx();
+    }
+
+    fn on_storage_read(
+        &mut self,
+        ee_type: ExecutionEnvironmentType,
+        is_transient: bool,
+        address: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::Address,
+        key: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::StorageKey,
+        value: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::StorageValue,
+    ) {
+        let Pair { a, b } = self;
+        a.on_storage_read(ee_type, is_transient, address, key, value);
+        b.on_storage_read(ee_type, is_transient, address, key, value);
+    }
+
+    fn on_storage_write(
+        &mut self,
+        ee_type: ExecutionEnvironmentType,
+        is_transient: bool,
+        address: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::Address,
+        key: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::StorageKey,
+        value: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::StorageValue,
+    ) {
+        let Pair { a, b } = self;
+        a.on_storage_write(ee_type, is_transient, address, key, value);
+        b.on_storage_write(ee_type, is_transient, address, key, value);
+    }
+
+    fn on_bytecode_change(
+        &mut self,
+        ee_type: ExecutionEnvironmentType,
+        address: <S::IOTypes as SystemIOTypesConfig>::Address,
+        new_raw_bytecode: Option<&[u8]>,
+        new_internal_bytecode_hash: <S::IOTypes as SystemIOTypesConfig>::BytecodeHashValue,
+        new_observable_bytecode_length: u32,
+    ) {
+        let Pair { a, b } = self;
+        a.on_bytecode_change(
+            ee_type,
+            address,
+            new_raw_bytecode,
+            new_internal_bytecode_hash,
+            new_observable_bytecode_length,
+        );
+        b.on_bytecode_change(
+            ee_type,
+            address,
+            new_raw_bytecode,
+            new_internal_bytecode_hash,
+            new_observable_bytecode_length,
+        );
+    }
+
+    fn on_event(
+        &mut self,
+        ee_type: ExecutionEnvironmentType,
+        address: &<<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::Address,
+        topics: &[<<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::EventKey],
+        data: &[u8],
+    ) {
+        let Pair { a, b } = self;
+        a.on_event(ee_type, address, topics, data);
+        b.on_event(ee_type, address, topics, data);
+    }
+}
+
+/// EVM-level hooks fan out through each child's `evm_tracer()`: first `a`,
+/// then `b`.
+///
+/// `evm_tracer()` is typed as returning `&mut impl EvmTracer<S>` (see the
+/// `Tracer` impl for `Pair`), so whatever a child hands back can receive
+/// these hooks, whether that is the child itself or an inner tracer it
+/// delegates to. `evm_tracer()` is called once per child on every hook, so a
+/// child doing non-trivial work there repeats that work per event.
+impl<S: EthereumLikeTypes, A: Tracer<S>, B: Tracer<S>> EvmTracer<S> for Pair<A, B> {
+    /// Forwards the pre-step hook for `opcode` to both children.
+    #[inline(always)]
+    fn before_evm_interpreter_execution_step(
+        &mut self,
+        opcode: u8,
+        frame_state: &impl EvmFrameInterface<S>,
+    ) {
+        let first = self.a.evm_tracer();
+        first.before_evm_interpreter_execution_step(opcode, frame_state);
+        let second = self.b.evm_tracer();
+        second.before_evm_interpreter_execution_step(opcode, frame_state);
+    }
+
+    /// Forwards the post-step hook for `opcode` to both children.
+    #[inline(always)]
+    fn after_evm_interpreter_execution_step(
+        &mut self,
+        opcode: u8,
+        frame_state: &impl EvmFrameInterface<S>,
+    ) {
+        let first = self.a.evm_tracer();
+        first.after_evm_interpreter_execution_step(opcode, frame_state);
+        let second = self.b.evm_tracer();
+        second.after_evm_interpreter_execution_step(opcode, frame_state);
+    }
+
+    /// Forwards an opcode error to both children.
+    #[inline(always)]
+    fn on_opcode_error(&mut self, error: &EvmError, frame_state: &impl EvmFrameInterface<S>) {
+        let first = self.a.evm_tracer();
+        first.on_opcode_error(error, frame_state);
+        let second = self.b.evm_tracer();
+        second.on_opcode_error(error, frame_state);
+    }
+
+    /// Forwards a call error to both children.
+    #[inline(always)]
+    fn on_call_error(&mut self, error: &EvmError) {
+        let first = self.a.evm_tracer();
+        first.on_call_error(error);
+        let second = self.b.evm_tracer();
+        second.on_call_error(error);
+    }
+
+    /// Forwards a selfdestruct notification to both children.
+    #[inline(always)]
+    fn on_selfdestruct(
+        &mut self,
+        beneficiary: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::Address,
+        token_value: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::NominalTokenValue,
+        frame_state: &impl EvmFrameInterface<S>,
+    ) {
+        let first = self.a.evm_tracer();
+        first.on_selfdestruct(beneficiary, token_value, frame_state);
+        let second = self.b.evm_tracer();
+        second.on_selfdestruct(beneficiary, token_value, frame_state);
+    }
+
+    /// Forwards a create / create2 request to both children.
+    #[inline(always)]
+    fn on_create_request(&mut self, is_create2: bool) {
+        let first = self.a.evm_tracer();
+        first.on_create_request(is_create2);
+        let second = self.b.evm_tracer();
+        second.on_create_request(is_create2);
+    }
+}
+
+// `Pair` is generic over `(A, B: Tracer<S>)`. A meaningful unit test would
+// require a mock `Tracer<S>` implementation backed by a concrete `SystemTypes`,
+// which is non-trivial to construct here. The actual integration test is via
+// `eth_runner` block bench. Structural / compilation correctness is implicit
+// in the `forward_system` build.
diff --git a/forward_system/src/system/tracers/precompile_stats.rs b/forward_system/src/system/tracers/precompile_stats.rs
new file mode 100644
index 000000000..71d499e53
--- /dev/null
+++ b/forward_system/src/system/tracers/precompile_stats.rs
@@ -0,0 +1,542 @@
+//! Per-precompile gas / native resource stats tracer.
+//!
+//! Mirrors `EvmOpcodeStatsTracer` but keyed on EVM precompile addresses
+//! (e.g. 0x01 = ECRECOVER, 0x05 = MODEXP, 0x0100 = P256 verify, …).
+//! Source of truth for the address list:
+//! `evm_interpreter::precompile_addresses::PRECOMPILE_ADDRESSES_LOWS`.
+
+use std::collections::BTreeMap;
+use std::io::Write;
+use std::marker::PhantomData;
+use std::path::Path;
+
+use evm_interpreter::precompile_addresses::PRECOMPILE_ADDRESSES_LOWS;
+use evm_interpreter::ERGS_PER_GAS;
+use zk_ee::{
+    execution_environment_type::ExecutionEnvironmentType,
+    system::{
+        evm::{EvmError, EvmFrameInterface},
+        tracer::{evm_tracer::EvmTracer, Tracer},
+        CallResult, Computational, EthereumLikeTypes, ExecutionEnvironmentLaunchParams, Resources,
+        SystemTypes,
+    },
+    types_config::SystemIOTypesConfig,
+};
+
+/// Running totals plus per-execution gas/native samples for one precompile.
+#[derive(Clone, Default)]
+pub struct PrecompileStats {
+    pub count: u64,
+    pub total_gas: u64,
+    pub total_native: u64,
+    pub gas_samples: Vec<u64>,
+    pub native_samples: Vec<u64>,
+}
+
+impl PrecompileStats {
+    /// Median of `samples` (mean of the two middle values, widened to u128); 0 when empty.
+    fn median(samples: &[u64]) -> u64 {
+        let mut ordered = samples.to_vec();
+        ordered.sort_unstable();
+        match ordered.len() {
+            0 => 0,
+            n if n.is_multiple_of(2) => {
+                let (lo, hi) = (ordered[n / 2 - 1] as u128, ordered[n / 2] as u128);
+                ((lo + hi) / 2) as u64
+            }
+            n => ordered[n / 2],
+        }
+    }
+
+    pub fn gas_median(&self) -> u64 {
+        Self::median(self.gas_samples.as_slice())
+    }
+
+    pub fn native_median(&self) -> u64 {
+        Self::median(self.native_samples.as_slice())
+    }
+
+    pub fn gas_min(&self) -> u64 {
+        self.gas_samples.iter().min().copied().unwrap_or(0)
+    }
+
+    pub fn gas_max(&self) -> u64 {
+        self.gas_samples.iter().max().copied().unwrap_or(0)
+    }
+
+    pub fn native_min(&self) -> u64 {
+        self.native_samples.iter().min().copied().unwrap_or(0)
+    }
+
+    pub fn native_max(&self) -> u64 {
+        self.native_samples.iter().max().copied().unwrap_or(0)
+    }
+
+    pub fn record(&mut self, gas: u64, native: u64) {
+        self.count += 1;
+        self.total_gas += gas;
+        self.total_native += native;
+        self.gas_samples.push(gas);
+        self.native_samples.push(native);
+    }
+
+    /// Writes one "gas,native" line per recorded execution, in execution order (line K = Kth call).
+    pub fn dump_samples(&self, writer: &mut impl Write) -> std::io::Result<()> {
+        self.gas_samples
+            .iter()
+            .zip(&self.native_samples)
+            .try_for_each(|(gas, native)| writeln!(writer, "{},{}", gas, native))
+    }
+}
+
+/// Human-readable name for a precompile's 16-bit low address.
+/// The table is meant to cover every entry of `PRECOMPILE_ADDRESSES_LOWS`
+/// irrespective of feature gates. An address missing from the table trips a
+/// debug assertion and yields "unknown" in release builds.
+pub fn precompile_name(low: u16) -> &'static str {
+    static NAMES: [(u16, &str); 18] = [
+        (0x0001, "ecrecover"),
+        (0x0002, "sha256"),
+        (0x0003, "ripemd160"),
+        (0x0004, "identity"),
+        (0x0005, "modexp"),
+        (0x0006, "ecadd"),
+        (0x0007, "ecmul"),
+        (0x0008, "ecpairing"),
+        (0x0009, "blake2f"),
+        (0x000a, "point_eval"),
+        (0x000b, "bls12_g1add"),
+        (0x000c, "bls12_g1msm"),
+        (0x000d, "bls12_g2add"),
+        (0x000e, "bls12_g2msm"),
+        (0x000f, "bls12_pairing_check"),
+        (0x0010, "bls12_map_fp_to_g1"),
+        (0x0011, "bls12_map_fp2_to_g2"),
+        (0x0100, "p256_verify"),
+    ];
+    let hit = NAMES.iter().find(|&&(address, _)| address == low);
+    debug_assert!(hit.is_some(), "precompile_name: unexpected low-address {low:#06x}");
+    hit.map_or("unknown", |&(_, name)| name)
+}
+
+/// Writes `stats` to `path` as CSV, creating or truncating the file.
+///
+/// Emits a header row followed by one row per map entry in ascending address
+/// order; entries with `count == 0` are skipped. Any I/O error is returned.
+pub fn write_stats_csv(stats: &BTreeMap<u16, PrecompileStats>, path: &Path) -> std::io::Result<()> {
+    let mut f = std::fs::File::create(path)?;
+    writeln!(
+        f,
+        "name,address,count,avg_gas,median_gas,min_gas,max_gas,\
+         avg_native,median_native,min_native,max_native,native_per_gas"
+    )?;
+    for (&addr, s) in stats {
+        let count = s.count;
+        if count == 0 {
+            continue;
+        }
+        let name = precompile_name(addr);
+        let avg_gas = s.total_gas as f64 / count as f64;
+        let med_gas = s.gas_median();
+        let min_gas = s.gas_min();
+        let max_gas = s.gas_max();
+        let avg_native = s.total_native as f64 / count as f64;
+        let med_native = s.native_median();
+        let min_native = s.native_min();
+        let max_native = s.native_max();
+        // Guard the ratio against division by zero when no gas was consumed.
+        let ratio = match s.total_gas {
+            0 => 0.0,
+            total_gas => s.total_native as f64 / total_gas as f64,
+        };
+        // The named placeholders below capture the locals of the same name.
+        writeln!(
+            f,
+            "{name},0x{addr:04x},{count},{avg_gas:.2},{med_gas},{min_gas},{max_gas},\
+             {avg_native:.2},{med_native},{min_native},{max_native},{ratio:.4}"
+        )?;
+    }
+    Ok(())
+}
+
+/// Classifies a callee address as an EVM precompile.
+/// Returns the 16-bit low halfword when `addr_bytes` is 20 bytes long, its
+/// first 18 bytes are zero, and the halfword is in `PRECOMPILE_ADDRESSES_LOWS`.
+fn precompile_id_from_address(addr_bytes: &[u8]) -> Option<u16> {
+    if addr_bytes.len() != 20 {
+        return None;
+    }
+    // Only the last two bytes may be non-zero.
+    let (prefix, tail) = addr_bytes.split_at(18);
+    if prefix.iter().any(|&byte| byte != 0) {
+        return None;
+    }
+    let low = u16::from_be_bytes([tail[0], tail[1]]);
+    // Membership in the shared address list decides whether this is a precompile.
+    PRECOMPILE_ADDRESSES_LOWS.contains(&low).then_some(low)
+}
+
+struct PendingFrame { // entry snapshot of a precompile call frame that has not completed yet
+    precompile_id: u16, // 16-bit low halfword of the callee address
+    ergs_in: u64, // despite the name, stored in gas units: entry ergs divided by `ERGS_PER_GAS`
+    native_in: u64, // native resource available at frame entry
+}
+
+/// Tracer that records gas/native usage per precompile call frame.
+pub struct PrecompileStatsTracer<S: SystemTypes> {
+    pub stats: BTreeMap<u16, PrecompileStats>,
+    pending: Option<PendingFrame>, // entry snapshot of the precompile frame in flight, if any
+    /// `fn() -> S` is always `Send + Sync` regardless of `S`, so the tracer
+    /// can be held in a `static OnceLock<Mutex<...>>` even when `S` itself
+    /// contains non-`Sync` types (e.g. `Box<dyn OracleQueryProcessor>` in
+    /// `ForwardRunningSystem`).
+    _marker: PhantomData<fn() -> S>,
+}
+
+impl<S: SystemTypes> Default for PrecompileStatsTracer<S> {
+    fn default() -> Self {
+        Self {
+            stats: BTreeMap::new(),
+            pending: None,
+            _marker: PhantomData,
+        }
+    }
+}
+
+impl<S: SystemTypes> PrecompileStatsTracer<S> {
+    /// Writes the aggregated stats to `path` as CSV (delegates to `write_stats_csv`).
+    pub fn write_csv(&self, path: &Path) -> std::io::Result<()> {
+        write_stats_csv(&self.stats, path)
+    }
+
+    /// Dump per-execution samples into `dir`, creating the directory if needed.
+    /// Writes `<dir>/<name>.samples` ("gas,native" per line, line K = Kth execution)
+    /// for each precompile with samples, using the names from `precompile_name`.
+    /// Each file is created or truncated; other files already in `dir` are left as-is.
+    pub fn dump_samples(&self, dir: &Path) -> std::io::Result<()> {
+        std::fs::create_dir_all(dir)?;
+        for (&addr, s) in &self.stats {
+            if s.gas_samples.is_empty() {
+                continue;
+            }
+            let name = precompile_name(addr);
+            let file = std::fs::File::create(dir.join(format!("{name}.samples")))?;
+            // Buffer the per-line writes; flush explicitly so write errors are not lost on drop.
+            let mut out = std::io::BufWriter::new(file);
+            s.dump_samples(&mut out)?;
+            out.flush()?;
+        }
+        Ok(())
+    }
+
+    /// Print a human-readable stats table to stdout, skipping entries with `count == 0`.
+    /// Omits the `native_per_gas` ratio column emitted by `write_csv` — that's CSV-only.
+    pub fn print_stats(&self) {
+        println!("=== EVM Precompile Stats:");
+        println!(
+            "{:<22} {:>8} {:>12} {:>12} {:>12} {:>12} {:>14} {:>14} {:>14} {:>14}",
+            "precompile",
+            "count",
+            "avg_gas",
+            "med_gas",
+            "min_gas",
+            "max_gas",
+            "avg_native",
+            "med_native",
+            "min_native",
+            "max_native",
+        );
+        for (&addr, s) in self.stats.iter().filter(|(_, s)| s.count != 0) {
+            let avg_gas = s.total_gas as f64 / s.count as f64;
+            let avg_native = s.total_native as f64 / s.count as f64;
+            println!(
+                "{:<22} {:>8} {:>12.1} {:>12} {:>12} {:>12} {:>14.1} {:>14} {:>14} {:>14}",
+                precompile_name(addr),
+                s.count,
+                avg_gas,
+                s.gas_median(),
+                s.gas_min(),
+                s.gas_max(),
+                avg_native,
+                s.native_median(),
+                s.native_min(),
+                s.native_max(),
+            );
+        }
+        println!("==================");
+    }
+}
+
+impl<S: EthereumLikeTypes> Tracer<S> for PrecompileStatsTracer<S> {
+    fn on_new_execution_frame(&mut self, request: &ExecutionEnvironmentLaunchParams<S>) {
+        // Only one in-flight precompile frame is tracked. That is enough as
+        // long as precompile frames are leaves, i.e. no further execution
+        // frame is opened through this hook before the precompile's frame
+        // completes. The assertion below flags a violation in debug builds;
+        // in release builds the slot would simply be overwritten and the
+        // outer frame's sample lost. Switch to a stack of pending frames if
+        // nested dispatch ever becomes possible.
+        debug_assert!(
+            self.pending.is_none(),
+            "PrecompileStatsTracer: a new execution frame opened while a precompile pending frame was still recorded — \
+             nested precompile dispatch would clobber the outer frame's stats."
+        );
+        let call = &request.external_call;
+        let callee: [u8; 20] = call.callee.to_be_bytes::<{ ruint::aliases::B160::BYTES }>();
+        self.pending = match precompile_id_from_address(&callee) {
+            Some(precompile_id) => {
+                let resources = &call.available_resources;
+                Some(PendingFrame {
+                    precompile_id,
+                    ergs_in: resources.ergs().0 / ERGS_PER_GAS,
+                    native_in: resources.native().as_u64(),
+                })
+            }
+            None => None,
+        };
+    }
+
+    fn after_execution_frame_completed(&mut self, result: Option<(&S::Resources, &CallResult<S>)>) {
+        // Always clear the slot first, so a frame that reports no result
+        // cannot leak its entry snapshot into a later frame.
+        let entry = self.pending.take();
+        // Nothing to record unless a precompile frame was pending and a result exists.
+        let (Some(entry), Some((remaining, _))) = (entry, result) else {
+            return;
+        };
+        // `.0` reads the inner value of what `ergs()` returns; dividing by
+        // `ERGS_PER_GAS` matches the unit stored in `entry.ergs_in`.
+        let gas_left = remaining.ergs().0 / ERGS_PER_GAS;
+        let native_left = remaining.native().as_u64();
+        // Saturate: a post-call balance above the entry balance records 0.
+        let gas_used = entry.ergs_in.saturating_sub(gas_left);
+        let native_used = entry.native_in.saturating_sub(native_left);
+        let per_precompile = self.stats.entry(entry.precompile_id).or_default();
+        per_precompile.record(gas_used, native_used);
+    }
+
+    #[inline(always)]
+    fn begin_tx(&mut self, _calldata: &[u8]) {}
+
+    #[inline(always)]
+    fn finish_tx(&mut self) {}
+
+    #[inline(always)]
+    fn on_storage_read(
+        &mut self,
+        _ee_type: ExecutionEnvironmentType,
+        _is_transient: bool,
+        _address: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::Address,
+        _key: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::StorageKey,
+        _value: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::StorageValue,
+    ) {
+    }
+
+    #[inline(always)]
+    fn on_storage_write(
+        &mut self,
+        _ee_type: ExecutionEnvironmentType,
+        _is_transient: bool,
+        _address: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::Address,
+        _key: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::StorageKey,
+        _value: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::StorageValue,
+    ) {
+    }
+
+    #[inline(always)]
+    fn on_bytecode_change(
+        &mut self,
+        _ee_type: ExecutionEnvironmentType,
+        _address: <S::IOTypes as SystemIOTypesConfig>::Address,
+        _new_bytecode: Option<&[u8]>,
+        _new_bytecode_hash: <S::IOTypes as SystemIOTypesConfig>::BytecodeHashValue,
+        _new_observable_bytecode_length: u32,
+    ) {
+    }
+
+    #[inline(always)]
+    fn on_event(
+        &mut self,
+        _ee_type: ExecutionEnvironmentType,
+        _address: &<<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::Address,
+        _topics: &[<<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::EventKey],
+        _data: &[u8],
+    ) {
+    }
+
+    #[inline(always)]
+    fn evm_tracer(&mut self) -> &mut impl EvmTracer<S> {
+        self
+    }
+}
+
+// No-op `EvmTracer` impl: every hook body is empty; it exists so `evm_tracer()` can return `self`.
+impl<S: EthereumLikeTypes> EvmTracer<S> for PrecompileStatsTracer<S> {
+    #[inline(always)]
+    fn before_evm_interpreter_execution_step(
+        &mut self,
+        _opcode: u8,
+        _frame_state: &impl EvmFrameInterface<S>,
+    ) {
+    }
+
+    #[inline(always)]
+    fn after_evm_interpreter_execution_step(
+        &mut self,
+        _opcode: u8,
+        _frame_state: &impl EvmFrameInterface<S>,
+    ) {
+    }
+
+    #[inline(always)]
+    fn on_opcode_error(&mut self, _error: &EvmError, _frame_state: &impl EvmFrameInterface<S>) {}
+
+    #[inline(always)]
+    fn on_call_error(&mut self, _error: &EvmError) {}
+
+    #[inline(always)]
+    fn on_selfdestruct(
+        &mut self,
+        _beneficiary: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::Address,
+        _token_value: <<S as SystemTypes>::IOTypes as SystemIOTypesConfig>::NominalTokenValue,
+        _frame_state: &impl EvmFrameInterface<S>,
+    ) {
+    }
+
+    #[inline(always)]
+    fn on_create_request(&mut self, _is_create2: bool) {}
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn median_of_empty_is_zero() {
+        let s = PrecompileStats::default();
+        assert_eq!(s.gas_median(), 0);
+        assert_eq!(s.native_median(), 0);
+    }
+
+    #[test]
+    fn median_odd_count() {
+        let mut s = PrecompileStats::default();
+        for g in [10u64, 30, 20] {
+            s.record(g, g * 2);
+        }
+        assert_eq!(s.gas_median(), 20); // gas samples sorted: 10, 20, 30
+        assert_eq!(s.native_median(), 40); // native samples are 2 * gas
+    }
+
+    #[test]
+    fn median_even_count_averages_middle_two() {
+        let mut s = PrecompileStats::default();
+        for g in [10u64, 20, 30, 40] {
+            s.record(g, g);
+        }
+        assert_eq!(s.gas_median(), 25); // (20 + 30) / 2
+        assert_eq!(s.native_median(), 25);
+    }
+
+    #[test]
+    fn min_max_track_extremes() {
+        let mut s = PrecompileStats::default();
+        for g in [10u64, 50, 30, 70, 20] {
+            s.record(g, g);
+        }
+        assert_eq!(s.gas_min(), 10);
+        assert_eq!(s.gas_max(), 70);
+        assert_eq!(s.native_min(), 10);
+        assert_eq!(s.native_max(), 70);
+    }
+
+    #[test]
+    fn record_accumulates_totals_and_samples() {
+        let mut s = PrecompileStats::default();
+        s.record(10, 100);
+        s.record(20, 200);
+        assert_eq!(s.count, 2);
+        assert_eq!(s.total_gas, 30);
+        assert_eq!(s.total_native, 300);
+        assert_eq!(s.gas_samples, vec![10, 20]);
+        assert_eq!(s.native_samples, vec![100, 200]);
+    }
+
+    #[test]
+    fn write_csv_round_trip() {
+        use std::io::Read;
+
+        let mut map: BTreeMap<u16, PrecompileStats> = BTreeMap::new();
+        let mut s = PrecompileStats::default();
+        s.record(3000, 41912);
+        s.record(3000, 41912);
+        map.insert(0x0001, s);
+
+        let mut s = PrecompileStats::default();
+        s.record(1250, 6_100_000);
+        s.record(84210, 9_800_000);
+        map.insert(0x0005, s);
+
+        let dir = tempfile::tempdir().expect("tempdir");
+        let path = dir.path().join("p.csv");
+        write_stats_csv(&map, &path).expect("write");
+
+        let mut content = String::new();
+        std::fs::File::open(&path)
+            .unwrap()
+            .read_to_string(&mut content)
+            .unwrap();
+        let lines: Vec<&str> = content.lines().collect();
+        assert_eq!(
+            lines[0],
+            "name,address,count,avg_gas,median_gas,min_gas,max_gas,\
+             avg_native,median_native,min_native,max_native,native_per_gas"
+        );
+        assert!(lines.iter().any(|l| l.starts_with("ecrecover,0x0001,2,")));
+        assert!(lines.iter().any(|l| l.starts_with("modexp,0x0005,2,")));
+
+        // Exact-line check on the deterministic ecrecover row: two identical
+        // (3000, 41912) samples → avg_gas=3000.00, med/min/max=3000,
+        // avg_native=41912.00, med/min/max=41912, ratio=41912/3000≈13.9707.
+        let expected_ecrecover =
+            "ecrecover,0x0001,2,3000.00,3000,3000,3000,41912.00,41912,41912,41912,13.9707";
+        assert!(
+            lines.contains(&expected_ecrecover),
+            "missing exact ecrecover row; got lines: {:?}",
+            lines
+        );
+    }
+
+    #[test]
+    fn precompile_id_from_address_filters_non_precompile() {
+        // Fixed address with non-zero first and last bytes; not a precompile.
+        let mut addr = [0u8; 20];
+        addr[0] = 0xab;
+        addr[19] = 0xcd;
+        assert_eq!(precompile_id_from_address(&addr), None);
+    }
+
+    #[test]
+    fn precompile_id_from_address_accepts_ecrecover() {
+        // 0x...0001 — ECRECOVER.
+        let mut addr = [0u8; 20];
+        addr[19] = 0x01;
+        assert_eq!(precompile_id_from_address(&addr), Some(0x0001));
+    }
+
+    #[test]
+    fn precompile_id_from_address_rejects_wrong_length() {
+        let short = [0u8; 10];
+        assert_eq!(precompile_id_from_address(&short), None);
+        let long = [0u8; 32];
+        assert_eq!(precompile_id_from_address(&long), None);
+    }
+
+    #[test]
+    fn precompile_id_from_address_rejects_unknown_low() {
+        // Last byte 0x42 is not in PRECOMPILE_ADDRESSES_LOWS.
+        let mut addr = [0u8; 20];
+        addr[19] = 0x42;
+        assert_eq!(precompile_id_from_address(&addr), None);
+    }
+}
diff --git a/system_hooks/src/lib.rs b/system_hooks/src/lib.rs
index 0e9d88649..a43944882 100644
--- a/system_hooks/src/lib.rs
+++ b/system_hooks/src/lib.rs
@@ -125,6 +125,40 @@ where
     }
 }
 
+/// Ecrecover invocation for calls dispatched from an execution-environment
+/// (EE) frame. It runs the regular ecrecover system function inside an outer
+/// `"ecrecover_execution_environment"` cycle marker so per-execution cycles
+/// can be joined with the `ecrecover.samples` of `PrecompileStatsTracer`,
+/// which only sees EE precompile dispatch frames. The bootloader's intrinsic
+/// signature recovery is expected to bypass this wrapper, and the inner
+/// `"ecrecover"` marker (from `EcRecoverImpl::execute`) is expected to keep
+/// firing for older consumers — NOTE(review): neither is visible here.
+struct EcRecoverEEInvocation<S: SystemTypes>(PhantomData<S>);
+
+impl<S: EthereumLikeTypes> SystemFunctionInvocation<S, Secp256k1ECRecoverErrors>
+    for EcRecoverEEInvocation<S>
+where
+    S::IO: IOSubsystemExt,
+{
+    /// Forwards every argument to `Secp256k1ECRecover::execute` inside the
+    /// `"ecrecover_execution_environment"` marker; the macro expression is
+    /// this function's return value.
+    fn invoke<D: TryExtend<u8> + ?Sized, A: core::alloc::Allocator + Clone>(
+        oracle: &mut <S::IO as IOSubsystemExt>::IOOracle,
+        logger: &mut S::Logger,
+        input: &[u8],
+        output: &mut D,
+        resources: &mut S::Resources,
+        allocator: A,
+    ) -> Result<(), SubsystemError<Secp256k1ECRecoverErrors>> {
+        cycle_marker::wrap_with_resources!("ecrecover_execution_environment", resources, {
+            <S::SystemFunctionsExt as SystemFunctionsExt<_>>::Secp256k1ECRecover::execute(
+                input, output, resources, oracle, logger, allocator,
+            )
+        })
+    }
+}
+
 ///
 /// Adds EVM precompiles hooks.
 ///
@@ -134,12 +168,16 @@ pub fn add_precompiles<S: EthereumLikeTypes, A: Allocator + Clone>(
 where
     S::IO: IOSubsystemExt,
 {
-    add_precompile_ext::<
-        _,
-        _,
-        <S::SystemFunctionsExt as SystemFunctionsExt<_>>::Secp256k1ECRecover,
-        Secp256k1ECRecoverErrors,
-    >(hooks, ECRECOVER_HOOK_ADDRESS_LOW)?;
+    // EE-frame ecrecover dispatch uses a dedicated invocation wrapper that
+    // emits the `"ecrecover_execution_environment"` cycle marker around the
+    // underlying system function call, so per-execution stats can be joined
+    // without the positional bootloader/intrinsic filter heuristic. Routed
+    // through `install_precompile_hook` so the address sanity check stays
+    // in lockstep with `add_precompile` / `add_precompile_ext`.
+    install_precompile_hook::<S, A, EcRecoverEEInvocation<S>, Secp256k1ECRecoverErrors>(
+        hooks,
+        ECRECOVER_HOOK_ADDRESS_LOW,
+    )?;
     add_precompile::<_, _, <S::SystemFunctions as SystemFunctions<_>>::Sha256, Sha256Errors>(
         hooks,
         SHA256_HOOK_ADDRESS_LOW,
@@ -351,6 +389,36 @@ where
     )
 }
 
+/// Registers a precompile call hook whose invocation type `I` is chosen by
+/// the caller.
+///
+/// The `PRECOMPILE_ADDRESSES_LOWS` membership check lives here so every hook
+/// wired through this helper is validated the same way. Callers in this
+/// file: `add_precompile_ext` (generic `SystemFunctionsExt` dispatch) and the
+/// ecrecover registration in `add_precompiles` (custom invocation type that
+/// emits the `ecrecover_execution_environment` cycle marker).
+fn install_precompile_hook<S, A, I, E>(
+    hooks: &mut HooksStorage<S, A>,
+    address_low: u16,
+) -> Result<(), InternalError>
+where
+    S: EthereumLikeTypes,
+    S::IO: IOSubsystemExt,
+    A: Allocator + Clone,
+    I: SystemFunctionInvocation<S, E>,
+    E: Subsystem,
+{
+    // Only addresses from the known precompile list may receive a hook.
+    if PRECOMPILE_ADDRESSES_LOWS.contains(&address_low) {
+        let hook = SystemCallHook::new(pure_system_function_hook_impl::<I, E, S>);
+        hooks.add_call_hook(address_low, hook)
+    } else {
+        Err(internal_error!(
+            "Attempted to add a precompile that is not in the precompile addresses list"
+        ))
+    }
+}
+
 fn add_precompile_ext<
     S: EthereumLikeTypes,
     A: Allocator + Clone,
@@ -363,12 +431,7 @@ fn add_precompile_ext<
 where
     S::IO: IOSubsystemExt,
 {
-    hooks.add_call_hook(
-        address_low,
-        SystemCallHook::new(
-            pure_system_function_hook_impl::<SystemFunctionInvocationExt<S, E, P>, E, S>,
-        ),
-    )
+    install_precompile_hook::<S, A, SystemFunctionInvocationExt<S, E, P>, E>(hooks, address_low)
 }
 
 ///
diff --git a/tests/instances/eth_runner/Cargo.toml b/tests/instances/eth_runner/Cargo.toml
index da5058b1d..896b5583b 100644
--- a/tests/instances/eth_runner/Cargo.toml
+++ b/tests/instances/eth_runner/Cargo.toml
@@ -35,3 +35,18 @@ proving = []
 gpu = ["airbender-host/gpu-prover"]
 default = []
 
+# Fast-compile variant used by the bench CI for the host-side eth_runner
+# binary. Its runtime perf doesn't affect cycle measurements (cycles come
+# from the RISC-V simulator), so disable LTO and parallelize codegen for
+# much faster CI builds.
+#
+# Mirrors `[profile.bench-fast]` in the root workspace's `Cargo.toml`
+# (`eth_runner` is excluded from that workspace; the root profile only
+# covers the in-workspace `-p precompiles` build path). Keep the two in sync.
+[profile.bench-fast]
+inherits = "release"
+opt-level = 3
+lto = false
+codegen-units = 16
+debug = false
+
diff --git a/tests/instances/eth_runner/src/single_run.rs b/tests/instances/eth_runner/src/single_run.rs
index 359b5c0d3..0e57f791e 100644
--- a/tests/instances/eth_runner/src/single_run.rs
+++ b/tests/instances/eth_runner/src/single_run.rs
@@ -8,6 +8,8 @@ use crate::receipts::{BlockReceipts, TransactionReceipt};
 use rig::chain::BlockExtraStats;
 use rig::forward_system::system::system_types::ForwardRunningSystem;
 use rig::forward_system::system::tracers::evm_opcode_stats::EvmOpcodeStatsTracer;
+use rig::forward_system::system::tracers::pair::Pair;
+use rig::forward_system::system::tracers::precompile_stats::PrecompileStatsTracer;
 use rig::log::info;
 use rig::*;
 use std::fs::{self, File};
@@ -56,41 +58,89 @@ fn run<const RANDOMIZED: bool>(
         ..Default::default()
     };
 
-    let (output, stats) = if opcode_stats {
-        let mut tracer = EvmOpcodeStatsTracer::<ForwardRunningSystem>::default();
-        let result = run_with_tracer(
-            &mut chain,
-            transactions,
-            block_context,
-            run_config,
-            &mut tracer,
-        );
-
+    let dump_opcode_tracer = |tracer: &EvmOpcodeStatsTracer<ForwardRunningSystem>| {
         tracer.print_stats();
-
         if let Ok(path) = std::env::var("OPCODE_STATS_PATH") {
             tracer
                 .write_csv(std::path::Path::new(&path))
                 .expect("Failed to write opcode stats CSV");
             info!("Opcode stats written to {path}");
         }
-
         if let Ok(dir) = std::env::var("OPCODE_SAMPLES_DIR") {
             tracer
                 .dump_samples(std::path::Path::new(&dir))
                 .expect("Failed to dump opcode samples");
             info!("Opcode samples dumped to {dir}");
         }
+    };
 
-        result
-    } else {
-        run_with_tracer(
+    let dump_precompile_tracer = |tracer: &PrecompileStatsTracer<ForwardRunningSystem>| {
+        tracer.print_stats();
+        if let Ok(path) = std::env::var("PRECOMPILE_STATS_PATH") {
+            tracer
+                .write_csv(std::path::Path::new(&path))
+                .expect("Failed to write precompile stats CSV");
+            info!("Precompile stats written to {path}");
+        }
+        if let Ok(dir) = std::env::var("PRECOMPILE_SAMPLES_DIR") {
+            tracer
+                .dump_samples(std::path::Path::new(&dir))
+                .expect("Failed to dump precompile samples");
+            info!("Precompile samples dumped to {dir}");
+        }
+    };
+
+    let precompile_stats_enabled = std::env::var("PRECOMPILE_STATS_PATH").is_ok()
+        || std::env::var("PRECOMPILE_SAMPLES_DIR").is_ok();
+
+    let (output, stats) = match (opcode_stats, precompile_stats_enabled) {
+        (true, true) => {
+            let mut composite = Pair::new(
+                EvmOpcodeStatsTracer::<ForwardRunningSystem>::default(),
+                PrecompileStatsTracer::<ForwardRunningSystem>::default(),
+            );
+            let result = run_with_tracer(
+                &mut chain,
+                transactions,
+                block_context,
+                run_config,
+                &mut composite,
+            );
+            dump_opcode_tracer(&composite.a);
+            dump_precompile_tracer(&composite.b);
+            result
+        }
+        (true, false) => {
+            let mut tracer = EvmOpcodeStatsTracer::<ForwardRunningSystem>::default();
+            let result = run_with_tracer(
+                &mut chain,
+                transactions,
+                block_context,
+                run_config,
+                &mut tracer,
+            );
+            dump_opcode_tracer(&tracer);
+            result
+        }
+        (false, true) => {
+            let mut p_tracer = PrecompileStatsTracer::<ForwardRunningSystem>::default();
+            let result = run_with_tracer(
+                &mut chain,
+                transactions,
+                block_context,
+                run_config,
+                &mut p_tracer,
+            );
+            dump_precompile_tracer(&p_tracer);
+            result
+        }
+        (false, false) => run_with_tracer(
             &mut chain,
             transactions,
             block_context,
             run_config,
             &mut NopTracer::default(),
-        )
+        ),
     };
 
     let _ratio = compute_ratio(stats);
@@ -107,11 +157,33 @@ fn run_with_tracer<const RANDOMIZED: bool>(
     run_config: rig::chain::RunConfig,
     tracer: &mut impl Tracer<ForwardRunningSystem>,
 ) -> (BlockOutput, BlockExtraStats) {
+    // Allow benchmarking to opt into a non-default DA commitment scheme. The
+    // bench currently runs two passes per block: the default `BlobsAndPubdataKeccak256`
+    // (which emits a placeholder zero-hash blob plus a keccak commitment over
+    // pubdata) and `BlobsZKsyncOS` (which actually exercises the
+    // `BlobCommitmentGenerator` and the `blob_versioned_hash` cycle marker).
+    // Falls back to the rig default when unset.
+    //
+    // NOTE: the literal string `BENCH_DA_SCHEME` is used as a `grep -q`
+    // fallback target by `.github/workflows/bench.yml` — if this env var
+    // name is renamed, update the workflow too.
+    let da_commitment_scheme = std::env::var("BENCH_DA_SCHEME").ok().and_then(|s| {
+        use zk_ee::common_structs::da_commitment_scheme::DACommitmentScheme;
+        match s.as_str() {
+            "keccak" | "blobs_and_pubdata_keccak256" => {
+                Some(DACommitmentScheme::BlobsAndPubdataKeccak256)
+            }
+            "blobs_zksync_os" | "blobs" => Some(DACommitmentScheme::BlobsZKsyncOS),
+            "empty_no_da" => Some(DACommitmentScheme::EmptyNoDA),
+            other => panic!("Unknown BENCH_DA_SCHEME: {other}"),
+        }
+    });
+
     let (output, stats, _, _) = chain
         .run_block_with_extra_stats(
             transactions,
             Some(block_context),
-            None,
+            da_commitment_scheme,
             Some(run_config),
             tracer,
             &mut NopTxValidator::default(),
diff --git a/tests/instances/precompiles/Cargo.toml b/tests/instances/precompiles/Cargo.toml
index 36d368128..d67a30464 100644
--- a/tests/instances/precompiles/Cargo.toml
+++ b/tests/instances/precompiles/Cargo.toml
@@ -14,6 +14,7 @@ rig = { path = "../../rig", features = ["for_tests"] }
 zksync_os_tests_common = { path = "../../common" }
 hex = { workspace = true }
 cycle_marker = { path = "../../../cycle_marker" }
+forward_system = { path = "../../../forward_system" }
 
 [features]
 cycle_marker = ["rig/cycle_marker"]
diff --git a/tests/instances/precompiles/src/lib.rs b/tests/instances/precompiles/src/lib.rs
index 2c7171897..0c943d87c 100644
--- a/tests/instances/precompiles/src/lib.rs
+++ b/tests/instances/precompiles/src/lib.rs
@@ -1,7 +1,12 @@
 #![cfg(test)]
 
+use std::sync::{Mutex, OnceLock};
+
+use forward_system::system::system_types::ForwardRunningSystem;
+use forward_system::system::tracers::precompile_stats::PrecompileStatsTracer;
 use rig::alloy::consensus::TxLegacy;
 use rig::utils::{calldata_for_forwarder, FORWARDER_BYTECODE};
+use rig::zk_ee::system::validator::NopTxValidator;
 use rig::zksync_os_interface::types::BlockOutput;
 use rig::zksync_os_interface::types::ExecutionResult::Revert;
 use rig::BlockContext;
@@ -30,6 +35,26 @@ macro_rules! assert_matches {
     };
 }
 
+/// Process-wide stats tracer (forward pass only), lazily created on first
+/// use and consulted only when `PRECOMPILE_STATS_PATH` or
+/// `PRECOMPILE_SAMPLES_DIR` is set. The `Mutex` serializes concurrent tests.
+/// After each traced block the CSV / `.samples` outputs are rewritten from
+/// the cumulative in-memory state, so the last write reflects every call.
+static PRECOMPILE_STATS: OnceLock<Mutex<PrecompileStatsTracer<ForwardRunningSystem>>> =
+    OnceLock::new();
+
+fn precompile_stats_path() -> Option<std::path::PathBuf> {
+    std::env::var("PRECOMPILE_STATS_PATH").ok().map(std::path::PathBuf::from)
+}
+
+fn precompile_samples_dir() -> Option<std::path::PathBuf> {
+    std::env::var("PRECOMPILE_SAMPLES_DIR").ok().map(std::path::PathBuf::from)
+}
+
+fn precompile_stats_tracer() -> &'static Mutex<PrecompileStatsTracer<ForwardRunningSystem>> {
+    PRECOMPILE_STATS.get_or_init(|| Mutex::new(Default::default()))
+}
+
 /// Performs two calls:
 /// 1. Calls the precompile with given input and gas limit.
 /// 2. Calls the forwarder contract to call the precompile with the same input and gas limit.
@@ -92,7 +117,35 @@ fn run_precompile_inner(
         tester = tester.without_revm_consistency_check();
     }
 
-    tester.execute_block(vec![direct_tx, forwarded_tx])
+    let txs = vec![direct_tx, forwarded_tx];
+
+    let stats_path = precompile_stats_path();
+    let samples_dir = precompile_samples_dir();
+    if stats_path.is_some() || samples_dir.is_some() {
+        let mutex = precompile_stats_tracer();
+        let mut guard = mutex.lock().expect("stats mutex poisoned");
+        // The mutex is held across the full forward+proof run inside
+        // `execute_block_with_tracing`. The stats tracer only sees the forward
+        // pass; the proof-mode (RISC-V) pass uses its own nop tracer. When the
+        // CI cycle bench runs with `ZKSYNC_RISC_V_RUN=true` this serializes
+        // the proof run across precompile invocations — still correct, just
+        // sequential.
+        let out = tester.execute_block_with_tracing(txs, &mut *guard, &mut NopTxValidator);
+        // Cumulative writes: each rewrite contains the in-memory state so far.
+        if let Some(path) = stats_path.as_ref() {
+            if let Err(e) = guard.write_csv(path) {
+                eprintln!("warning: failed to write {}: {e}", path.display());
+            }
+        }
+        if let Some(dir) = samples_dir.as_ref() {
+            if let Err(e) = guard.dump_samples(dir) {
+                eprintln!("warning: failed to dump samples to {}: {e}", dir.display());
+            }
+        }
+        out
+    } else {
+        tester.execute_block(txs)
+    }
 }
 
 fn run_precompile(precompile_id: &str, gas: Option<u64>, input: &[u8]) -> BlockOutput {
diff --git a/tests/rig/src/chain.rs b/tests/rig/src/chain.rs
index a7ac70438..0d898c06a 100644
--- a/tests/rig/src/chain.rs
+++ b/tests/rig/src/chain.rs
@@ -838,6 +838,40 @@ impl<const RANDOMIZED_TREE: bool> Chain<RANDOMIZED_TREE> {
         // forward run
         let mut result_keeper = ForwardRunningResultKeeper::new(NoopTxCallback);
 
+        // The regular forward run uses ZKHeaderStructurePostTxOpSequencing,
+        // which substitutes NopCommitmentGenerator for the DA commit work.
+        // Under DA schemes that fire cycle markers only in proving mode
+        // (notably BlobsZKsyncOS → blob_versioned_hash), the sequencing run's
+        // LABELS would have fewer entries than the RISC-V proving run's
+        // markers, tripping the count assertion in print_cycle_markers.
+        // Snapshot LABELS before the sequencing run and revert after, so the
+        // sequencing labels are discarded; only the prover_input forward run
+        // (which uses proving STF) contributes labels for RISC-V matching.
+        // LABELS is a `thread_local!`, so the snapshot is local to whichever
+        // thread runs `run_inner`; the snapshot covers only the sequencing
+        // forward call below.
+        //
+        // The guard pattern ensures the revert runs even when
+        // `run_forward_no_panic` returns `Err` (the `?` propagation would
+        // otherwise leave stale sequencing labels in LABELS and the next
+        // run on this thread would trip the marker-count assertion).
+        #[cfg(feature = "cycle_marker")]
+        struct SeqLabelsGuard {
+            snap: Option<cycle_marker::Snapshot>,
+        }
+        #[cfg(feature = "cycle_marker")]
+        impl Drop for SeqLabelsGuard {
+            fn drop(&mut self) {
+                if let Some(snap) = self.snap.take() {
+                    cycle_marker::revert(snap);
+                }
+            }
+        }
+        #[cfg(feature = "cycle_marker")]
+        let _seq_labels_guard = SeqLabelsGuard {
+            snap: Some(cycle_marker::snapshot()),
+        };
+
         // we use proving config here for benchmarking,
         // although sequencer can have extra optimizations
         run_forward_no_panic::<BasicBootloaderProvingExecutionConfig>(
@@ -847,6 +881,11 @@ impl<const RANDOMIZED_TREE: bool> Chain<RANDOMIZED_TREE> {
             validator,
         )?;
 
+        // Explicit drop: sequencing labels are discarded HERE so the
+        // prover_input forward run below appends to a clean LABELS.
+        #[cfg(feature = "cycle_marker")]
+        drop(_seq_labels_guard);
+
         let block_output: BlockOutput = result_keeper.into();
 
         let (prover_input_forward, pubdata) = if do_prover_input_run {
@@ -866,17 +905,17 @@ impl<const RANDOMIZED_TREE: bool> Chain<RANDOMIZED_TREE> {
             let mut tracer = NopTracer::default();
             let mut validator = NopTxValidator;
             let prover_input_forward = {
-                // Avoid capturing markers from the second run, as it would duplicate them.
-                #[cfg(feature = "cycle_marker")]
-                let snapshot = cycle_marker::snapshot();
+                // This run uses ProverInputSystem (proving STF) — its labels
+                // are exactly what RISC-V will fire markers for, so we keep
+                // them in LABELS for the post-RISC-V count match. The earlier
+                // sequencing forward run's labels were already reverted, so
+                // LABELS now contains only this run's proving-mode labels.
                 let result = run_prover_input_no_panic::<BasicBootloaderProvingExecutionConfig>(
                     copy_source,
                     &mut result_keeper_prover_input,
                     &mut tracer,
                     &mut validator,
                 );
-                #[cfg(feature = "cycle_marker")]
-                cycle_marker::revert(snapshot);
                 result?
             };
 
diff --git a/zksync_os/Cargo.toml b/zksync_os/Cargo.toml
index af3841f88..3a308ad64 100644
--- a/zksync_os/Cargo.toml
+++ b/zksync_os/Cargo.toml
@@ -56,6 +56,7 @@ eth_stf = ["proof_running_system/eth_stf"]
 evm_tester = ["proof_running_system/evm_tester"]
 multiblock-batch = ["proof_running_system/multiblock-batch"]
 state-diffs-pi = ["proof_running_system/state-diffs-pi"]
+pectra = ["proof_running_system/pectra"]
 
 
 # [patch."https://github.com/matter-labs/zksync-airbender"]
diff --git a/zksync_os/dump_bin.sh b/zksync_os/dump_bin.sh
index 137361fb8..b72e10272 100755
--- a/zksync_os/dump_bin.sh
+++ b/zksync_os/dump_bin.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 set -e
 
-USAGE="Usage: $0 --type {singleblock-batch|singleblock-batch-logging-enabled|debug-in-simulator|evm-replay|evm-replay-benchmarking|multiblock-batch|multiblock-batch-logging-enabled|evm-tester|for-tests|for-tests-benchmarking|for-tests-logging-enabled|eth-stf} [--reproducible]"
+USAGE="Usage: $0 --type {singleblock-batch|singleblock-batch-logging-enabled|debug-in-simulator|evm-replay|evm-replay-benchmarking|multiblock-batch|multiblock-batch-logging-enabled|evm-tester|for-tests|for-tests-benchmarking|for-tests-benchmarking-pectra|for-tests-logging-enabled|eth-stf} [--reproducible]"
 TYPE=""
 REPRODUCIBLE=""
 
@@ -54,6 +54,17 @@ case "$TYPE" in
     FEATURES="$FEATURES,for_tests,benchmarking"
     APP_NAME="for_tests"
     ;;
+  for-tests-benchmarking-pectra)
+    # Adds `pectra` on top of `for-tests-benchmarking` so the proving binary
+    # supports BLS12-381 + BLAKE2F + P256. Required by the precompiles bench
+    # CI when it exercises `test_pectra_precompiles` (BLS12-381 + BLAKE2F)
+    # and `test_kzg_regression` (point_evaluation) in proof mode.
+    # NOTE: the literal string `for-tests-benchmarking-pectra` is used as
+    # a `grep -q` fallback target by `.github/workflows/bench.yml` — if
+    # this case label is renamed, update the workflow too.
+    FEATURES="$FEATURES,for_tests,benchmarking,pectra"
+    APP_NAME="for_tests"
+    ;;
   for-tests-logging-enabled)
     FEATURES="$FEATURES,for_tests,print_debug_info"
     APP_NAME="for_tests"