From 4146bcbcc64a5a04462a0dc64d998a476f5fc9e3 Mon Sep 17 00:00:00 2001
From: Ray Andrew <rs@rs.ht>
Date: Mon, 18 May 2026 00:50:54 -0500
Subject: [PATCH] feat(perf): performance improvements for parallel reading,
 indexing, and aggregation

Indexer
- Streaming parse-and-emit worker pipeline with bounded memory usage
- Concurrent SST artifact ingestion with staging support
- Gzip member slicing for parallel indexing
- Lazy decoding for compressed value counts
- Bypassed DOM wrapper for indexer hot path (simdjson On Demand API)
- Decoupled write workers from parse workers
- --rebuild-summaries flag and optimized root summary rebuild

Aggregator / MPI
- Task-based DAG execution for aggregator pipeline
- Shared staging for multi-node artifact relocation
- Per-node thread scaling to avoid oversubscription
- Unified distributed aggregation tracking, removed manifest consolidation
- Deterministic aggregation and intra-file parallelism

Trace reader / query
- Compiled predicate evaluation for AND-of-EQ queries
- Uniform-match shortcut for AND-of-EQ queries
- Line-range support for work items and checkpoint processing
- Optimized chunk pruning and checkpoint handling

Replay
- Pipelined replay with coroutines and channels
- JsonParser-based trace processing
- Optimized string handling and I/O buffering

Organize / writer / dft
- Parallel slice creation and merging in organize visitor
- Inline indexer in organize
- Gzip member tracking in writer
- Coroutine-based event dispatcher with extracted parse logic
- Batch flushing in organize visitor

Arrow / call_tree
- Optimized arrow conversion
- Arrow IPC support and improved save/load in call_tree

Build / infrastructure
- zlib-ng option, system simdjson fallback
- cgroup v1/v2 memory limit detection
- Auto-computed per-file memory estimates and batch sizes
- CI: perf branch trigger, formatting

Docs
- Rewritten indexer and trace reader API references
---
 .envrc                                        |    1 -
 .github/workflows/ci.yml                      |  173 +-
 .gitignore                                    |    1 +
 CMakeLists.txt                                |   15 +
 CMakePresets.json                             |    3 +-
 .../dftracer_utils_config.dbg.h.in            |    7 +
 .../dftracer_utils_config.h.in                |    7 +
 cmake/modules/Dependencies.cmake              |  464 ++-
 cmake/modules/InstallHelpers.cmake            |  243 +-
 cmake/modules/PrecompiledHeader.cmake         |   15 +-
 docs/scripts/generate_api_index.py            |   56 +-
 docs/source/api/indexer.rst                   |   72 +-
 docs/source/api/reader.rst                    |   26 +-
 docs/source/api/runtime.rst                   |    8 +-
 docs/source/api/trace_reader.rst              |   90 +-
 docs/source/api/utilities.rst                 |   23 +-
 docs/source/call-tree.rst                     |   98 +-
 docs/source/cli.rst                           |  162 +
 docs/source/conf.py                           | 1255 ++++--
 docs/source/cpp_api/arrow.rst                 |   45 +-
 docs/source/cpp_api/coro.rst                  |   10 +
 docs/source/cpp_api/dft_aggregators.rst       |   32 +-
 docs/source/cpp_api/indexer.rst               |   62 +-
 docs/source/cpp_api/io.rst                    |   33 +
 docs/source/cpp_api/pipeline/executors.rst    |    3 -
 docs/source/cpp_api/reader.rst                |   29 +-
 docs/source/cpp_api/rocksdb.rst               |    7 +-
 docs/source/developers.rst                    |   32 +-
 docs/source/installation.rst                  |   61 +-
 docs/source/pipeline.rst                      |   85 +
 docs/source/quickstart.rst                    |    4 +
 docs/source/server.rst                        |   11 +
 docs/source/utilities/common.rst              |  112 +-
 docs/source/utilities/composites.rst          |   45 +-
 docs/source/utilities/compression.rst         |   27 +
 docs/source/utilities/fileio.rst              |   62 +
 docs/source/utilities/indexer.rst             |   85 +-
 docs/source/utilities/reader.rst              |   34 +-
 docs/source/utilities/replay.rst              |   12 +
 examples/call_tree_example1.cpp               |   55 +-
 examples/call_tree_example2.cpp               |   52 +-
 examples/call_tree_example3.cpp               |   34 +-
 flake.lock                                    |   27 -
 flake.nix                                     |   54 -
 include/dftracer/utils/call_tree/call_tree.h  |   65 +-
 .../dftracer/utils/call_tree/call_tree_mpi.h  |   37 +-
 .../utils/call_tree/internal/call_tree.h      |    6 +
 .../utils/call_tree/internal/factory.h        |   13 +-
 .../dftracer/utils/call_tree/internal/node.h  |   72 +-
 .../utils/call_tree/internal/trace_reader.h   |   74 +-
 .../utils/call_tree/json_serializer.h         |    5 +-
 .../dftracer/utils/call_tree/mpi/build_task.h |   33 -
 .../dftracer/utils/call_tree/mpi/builder.h    |  187 +-
 .../utils/call_tree/mpi/file_header.h         |   43 -
 .../utils/call_tree/mpi/filtered_reader.h     |   64 -
 .../utils/call_tree/mpi/pid_index_info.h      |   37 -
 .../utils/call_tree/mpi/serializable.h        |   73 +-
 .../utils/call_tree/mpi/serialization.h       |   31 -
 .../dftracer/utils/core/common/buffer_pool.h  |   30 +-
 .../dftracer/utils/core/common/constants.h    |    3 -
 .../utils/core/common/memory_budget.h         |   32 +
 .../dftracer/utils/core/common/object_pool.h  |   92 +-
 .../utils/core/common/string_intern.h         |  265 +-
 .../core/common/transparent_string_hash.h     |   33 +-
 include/dftracer/utils/core/coro/channel.h    |  111 +
 .../dftracer/utils/core/pipeline/executor.h   |   33 +-
 .../utils/core/pipeline/pipeline_config.h     |   11 +-
 include/dftracer/utils/core/rocksdb/async.h   |  130 -
 .../utils/core/rocksdb/column_families.h      |   65 +
 .../dftracer/utils/core/rocksdb/database.h    |   37 +-
 .../dftracer/utils/core/rocksdb/db_manager.h  |    3 +-
 include/dftracer/utils/core/runtime.h         |   66 +
 .../dftracer/utils/core/tasks/coro_scope.h    |   44 +-
 .../utils/core/utilities/streaming_utility.h  |   21 +
 .../dftracer/utils/core/utilities/utility.h   |   30 +
 include/dftracer/utils/server/trace_index.h   |    9 -
 .../utils/utilities/common/arrow/arrow.h      |    4 +
 .../utilities/common/arrow/arrow_export.h     |    1 +
 .../utilities/common/arrow/column_builder.h   |   74 +-
 .../utils/utilities/common/arrow/ipc_reader.h |   99 +
 .../utils/utilities/common/arrow/ipc_writer.h |  103 +-
 .../utilities/common/arrow/parallel_reader.h  |   95 +
 .../utilities/common/arrow/partition_router.h |   94 +
 .../utilities/common/arrow/partition_writer.h |   76 +
 .../utils/utilities/common/json/json.h        |   10 +-
 .../utilities/common/json/json_doc_guard.h    |   65 +-
 .../utils/utilities/common/json/json_value.h  |  182 +-
 .../utils/utilities/common/json/parser.h      |  241 ++
 .../utils/utilities/common/query/ast.h        |    5 +
 .../utils/utilities/common/query/evaluator.h  |    6 +-
 .../utils/utilities/common/query/query.h      |   11 +-
 .../common/serialization/binary_codec.h       |  210 +
 .../common/statistics/log2_histogram.h        |    7 -
 .../common/statistics/timestamp_histogram.h   |   59 +
 .../aggregators/aggregation_augmentation.h    |   33 +
 .../dft/aggregators/aggregation_config.h      |   46 +
 .../dft/aggregators/aggregation_logic.h       |   28 +
 .../aggregators/aggregation_merge_operator.h  |   26 +
 .../dft/aggregators/aggregation_metrics.h     |   23 +-
 .../dft/aggregators/aggregation_output.h      |   11 +-
 .../aggregators/aggregation_serialization.h   |  356 ++
 .../dft/aggregators/aggregation_visitor.h     |  130 +
 .../aggregators/aggregator_summary_utility.h  |    2 +-
 .../dft/aggregators/aggregator_types.h        |  164 +
 .../dft/aggregators/aggregator_utility.h      |   34 +-
 .../composites/dft/aggregators/aggregators.h  |    2 +-
 .../association_resolver_utility.h            |   11 +-
 .../dft/aggregators/association_tracker.h     |   13 +-
 .../aggregators/chunk_aggregator_utility.h    |   23 +-
 .../dft/aggregators/event_aggregator.h        |  132 +
 .../aggregators/event_aggregator_utility.h    |   36 -
 .../perfetto_trace_writer_utility.h           |   54 +-
 .../dft/aggregators/system_metrics.h          |  206 +
 .../system_metrics_merge_operator.h           |   26 +
 .../system_metrics_serialization.h            |   37 +
 .../utils/utilities/composites/dft/args_map.h |  216 ++
 .../dft/comparator/comparison_config.h        |   10 +-
 .../dft/comparator/comparison_result.h        |    2 +-
 .../dft/comparator/comparison_utility.h       |    6 +-
 .../composites/dft/dft_event_dispatcher.h     |  326 ++
 .../composites/dft/dft_event_visitor.h        |   70 +
 .../utils/utilities/composites/dft/event.h    |  207 +-
 .../composites/dft/indexing/bloom_filter.h    |   28 +-
 .../dft/indexing/chunk_dimension_stats.h      |   84 +-
 .../dft/indexing/chunk_indexer_utility.h      |    7 +-
 .../dft/indexing/chunk_pruner_utility.h       |   32 +
 .../dft/indexing/chunk_statistics.h           |   32 +-
 .../dft/indexing/index_resolver_utility.h     |   86 +
 .../dft/indexing/resolve_and_build.h          |   43 +
 .../utilities/composites/dft/internal/utils.h |   10 +-
 .../utilities/composites/dft/parse_inflated.h |  108 +
 .../dft/reorganize/group_writer_task.h        |   69 +
 .../dft/reorganize/manifest_extractor.h       |   38 +
 .../dft/reorganize/organize_visitor.h         |  106 +
 .../dft/reorganize/reconstructor_utility.h    |   54 +
 .../dft/reorganize/reorganization_planner.h   |    4 +-
 .../dft/statistics/detailed_statistics.h      |    7 +-
 .../shared_index_statistics_reader.h          |  156 +
 .../statistics_aggregator_utility.h           |    9 +
 .../dft/views/view_reader_utility.h           |    5 +-
 .../composites/dft/visitors/bloom_visitor.h   |  148 +
 .../dft/visitors/hash_table_visitor.h         |   57 +
 .../dft/visitors/manifest_visitor.h           |   59 +
 .../utils/utilities/fileio/chunk_writer.h     |   10 +
 .../utils/utilities/fileio/parallel/layout.h  |   59 +
 .../utils/utilities/fileio/parallel/merge.h   |   20 +
 .../fileio/parallel/parallel_writer.h         |   92 +
 .../filesystem/directory_scanner_utility.h    |   48 +-
 .../pattern_directory_scanner_utility.h       |   32 +-
 .../utils/utilities/filesystem/types.h        |   17 +-
 .../utilities/hash/fnv1a_hasher_utility.h     |   54 +-
 .../utils/utilities/indexer/file_partition.h  |   53 +
 .../utilities/indexer/index_batch_sink.h      |  155 +
 .../utilities/indexer/index_builder_utility.h |  134 +-
 .../utils/utilities/indexer/index_database.h  |  325 +-
 .../index_database_sst_writer_context.h       |  303 ++
 .../indexer/index_database_writer_context.h   |  184 +
 .../indexer/index_file_entry_capability.h     |   39 +
 .../utils/utilities/indexer/index_types.h     |   81 +
 .../utils/utilities/indexer/index_visitor.h   |   47 +-
 .../indexer/internal/index_encoding.h         |  133 +
 .../indexer/internal/payload_codec.h          |  140 +
 .../indexer/internal/statistics_codec.h       |   26 +
 .../utilities/indexer/provenance_database.h   |    4 +-
 .../indexer/visitors/bloom_visitor.h          |   66 -
 .../indexer/visitors/manifest_visitor.h       |   37 -
 .../utilities/reader/internal/stream_config.h |    8 +
 .../utils/utilities/reader/trace_reader.h     |   53 +-
 .../dftracer/utils/utilities/replay/replay.h  |   95 +-
 .../dftracer/utils/utilities/replay/trace.h   |   19 +-
 pyproject.toml                                |   14 +
 python/dftracer/utils/__init__.py             |   18 +-
 python/dftracer/utils/arrow.py                |  319 +-
 python/dftracer/utils/dask.py                 | 1143 +++++-
 python/dftracer/utils/dftracer_utils_ext.pyi  |  779 +++-
 python/dftracer/utils/indexer.py              |  371 ++
 python/dftracer/utils/runtime.py              |   12 +-
 python/dftracer/utils/trace_reader.py         |  413 ++
 src/CMakeLists.txt                            |  151 +-
 src/dftracer/utils/binaries/common_cli.h      |  329 ++
 .../utils/binaries/dftracer_aggregator.cpp    |  959 ++---
 .../binaries/dftracer_aggregator_mpi.cpp      | 1199 ++++++
 .../utils/binaries/dftracer_call_tree.cpp     |  829 ++--
 .../utils/binaries/dftracer_call_tree_mpi.cpp |  208 +
 .../utils/binaries/dftracer_comparator.cpp    |  886 +++--
 .../utils/binaries/dftracer_event_count.cpp   |  381 +-
 .../binaries/dftracer_gen_fake_trace.cpp      |  465 ++-
 .../utils/binaries/dftracer_index.cpp         |  385 +-
 src/dftracer/utils/binaries/dftracer_info.cpp | 1032 +++--
 .../utils/binaries/dftracer_merge.cpp         |  235 +-
 .../utils/binaries/dftracer_organize.cpp      | 1281 +++++--
 .../utils/binaries/dftracer_pgzip.cpp         |  209 +-
 .../utils/binaries/dftracer_reconstruct.cpp   |  415 +-
 .../utils/binaries/dftracer_replay.cpp        | 1014 ++---
 .../utils/binaries/dftracer_server.cpp        |  133 +-
 .../utils/binaries/dftracer_split.cpp         |  320 +-
 .../utils/binaries/dftracer_stats.cpp         | 1821 +++++----
 src/dftracer/utils/binaries/dftracer_view.cpp |  307 +-
 src/dftracer/utils/core/common/inflater.h     |   31 +-
 .../utils/core/common/memory_budget.cpp       |  206 +
 .../utils/core/io/io_backend_factory.cpp      |   35 +-
 src/dftracer/utils/core/pipeline/executor.cpp |   72 +-
 src/dftracer/utils/core/pipeline/pipeline.cpp |    2 -
 src/dftracer/utils/core/rocksdb/async.cpp     |   32 -
 src/dftracer/utils/core/rocksdb/database.cpp  |  122 +-
 .../utils/core/rocksdb/db_manager.cpp         |   13 +-
 .../utils/core/rocksdb/filesystem.cpp         |    6 +-
 src/dftracer/utils/core/runtime.cpp           |   12 +
 src/dftracer/utils/core/utils/timer.cpp       |   39 +
 src/dftracer/utils/core/utils/timer.h         |   35 +-
 src/dftracer/utils/python/arrow_helpers.cpp   |   26 +
 src/dftracer/utils/python/arrow_helpers.h     |    6 +
 .../utils/python/arrow_parallel_reader.cpp    |  212 +
 .../utils/python/arrow_parallel_reader.h      |   16 +
 .../utils/python/arrow_stream_capsule.cpp     |  323 ++
 .../utils/python/arrow_stream_capsule.h       |   25 +
 src/dftracer/utils/python/batch_byte_size.h   |   55 +
 src/dftracer/utils/python/batch_indexer.cpp   | 2554 +++++++++++++
 src/dftracer/utils/python/batch_indexer.h     |   38 +
 .../utils/python/dftracer_utils_ext.cpp       |   28 +-
 src/dftracer/utils/python/index_database.cpp  |  363 ++
 src/dftracer/utils/python/index_database.h    |   23 +
 src/dftracer/utils/python/indexer.cpp         |  337 +-
 src/dftracer/utils/python/indexer.h           |    7 +-
 src/dftracer/utils/python/json.cpp            |  947 ++---
 src/dftracer/utils/python/json.h              |   30 +-
 .../utils/python/memoryview_batch.cpp         |  114 +
 src/dftracer/utils/python/memoryview_batch.h  |   54 +
 src/dftracer/utils/python/runtime.cpp         |   32 +-
 .../utils/python/schema_reconcile.cpp         |  351 ++
 src/dftracer/utils/python/schema_reconcile.h  |   49 +
 .../utils/python/sst_distribution.cpp         | 1182 ++++++
 src/dftracer/utils/python/sst_distribution.h  |   18 +
 .../utils/python/streaming_iterator.cpp       |  168 +
 .../utils/python/streaming_iterator.h         |  166 +
 src/dftracer/utils/python/trace_reader.cpp    | 3405 ++++++++++++++---
 src/dftracer/utils/python/trace_reader.h      |    1 -
 .../utils/python/trace_reader_iterator.cpp    |  261 +-
 .../utils/python/trace_reader_iterator.h      |   80 +-
 .../utils/python/utilities/aggregator.cpp     |  676 +++-
 .../utils/python/utilities/comparator.cpp     |  212 +-
 .../utilities/reorganization_planner.cpp      |   23 +-
 src/dftracer/utils/server/cursor.cpp          |   10 +-
 src/dftracer/utils/server/trace_api.cpp       |  247 +-
 src/dftracer/utils/server/trace_index.cpp     |  150 +-
 src/dftracer/utils/server/viz_api.cpp         |  495 +--
 .../utils/utilities/call_tree/call_tree.cpp   |  422 +-
 .../call_tree/call_tree_internal.cpp          |  570 +--
 .../utilities/call_tree/call_tree_mpi.cpp     | 1525 ++------
 .../call_tree/call_tree_save_arrow.cpp        |  391 ++
 .../call_tree/call_tree_save_binary.cpp       |  429 +++
 .../utilities/call_tree/json_serializer.cpp   |  193 +-
 .../utilities/common/arrow/column_builder.cpp |  382 +-
 .../utilities/common/arrow/ipc_reader.cpp     |  355 ++
 .../utilities/common/arrow/ipc_writer.cpp     |  712 +++-
 .../common/arrow/parallel_reader.cpp          |  111 +
 .../common/arrow/partition_router.cpp         |  623 +++
 .../common/arrow/partition_writer.cpp         |  207 +
 .../utilities/common/json/json_value.cpp      |   38 +-
 .../utils/utilities/common/json/parser.cpp    |   73 +
 .../utils/utilities/common/query/ast.cpp      |   30 +
 .../utils/utilities/common/query/query.cpp    |    4 +-
 .../common/statistics/log2_histogram.cpp      |   69 +-
 .../common/statistics/timestamp_histogram.cpp |  173 +
 .../aggregators/aggregation_augmentation.cpp  |  281 ++
 .../dft/aggregators/aggregation_logic.cpp     |  212 +
 .../aggregation_merge_operator.cpp            |   54 +
 .../dft/aggregators/aggregation_metrics.cpp   |  172 +-
 .../aggregators/aggregation_serialization.cpp |  453 +++
 .../dft/aggregators/aggregation_visitor.cpp   |  461 +++
 .../aggregator_summary_utility.cpp            |   12 +-
 .../dft/aggregators/aggregator_utility.cpp    |  696 +++-
 .../association_resolver_utility.cpp          |    2 +-
 .../dft/aggregators/association_tracker.cpp   |  125 +-
 .../aggregators/chunk_aggregator_utility.cpp  |  241 +-
 .../dft/aggregators/event_aggregator.cpp      |  468 +++
 .../aggregators/event_aggregator_utility.cpp  |   56 -
 .../perfetto_trace_writer_utility.cpp         |  971 +++--
 .../system_metrics_merge_operator.cpp         |   54 +
 .../system_metrics_serialization.cpp          |  126 +
 .../dft/comparator/comparison_config.cpp      |  186 +-
 .../dft/comparator/comparison_result.cpp      |   12 +-
 .../dft/comparator/tree_table_formatter.cpp   |  209 +-
 .../dft/event_collector_utility.cpp           |   34 +-
 .../dft/event_id_extractor_utility.cpp        |   39 +-
 .../composites/dft/indexing/bloom_filter.cpp  |  110 +-
 .../dft/indexing/chunk_dimension_stats.cpp    |   86 +-
 .../dft/indexing/chunk_indexer_utility.cpp    |  350 +-
 .../dft/indexing/chunk_pruner_utility.cpp     |  299 +-
 .../dft/indexing/chunk_statistics.cpp         |  241 +-
 .../dft/indexing/index_resolver_utility.cpp   |  324 ++
 .../dft/indexing/resolve_and_build.cpp        |  214 ++
 .../composites/dft/internal/utils.cpp         |   13 +-
 .../dft/reorganize/group_writer_task.cpp      |  852 +++++
 .../dft/reorganize/manifest_extractor.cpp     |  176 +
 .../dft/reorganize/organize_visitor.cpp       |  145 +
 .../dft/reorganize/provenance_tracker.cpp     |   74 +-
 .../dft/reorganize/reconstructor_utility.cpp  |  410 ++
 .../dft/reorganize/reorganization_planner.cpp |  175 +-
 .../chunk_detail_scanner_utility.cpp          |   71 +-
 .../dft/statistics/detailed_statistics.cpp    |  122 +-
 .../shared_index_statistics_reader.cpp        |    5 +
 .../statistics_aggregator_utility.cpp         |  184 +-
 .../statistics/statistics_query_utility.cpp   |   69 +-
 .../dft/statistics/trace_statistics.cpp       |   96 +-
 .../composites/dft/views/view_definition.cpp  |  109 +-
 .../dft/views/view_reader_utility.cpp         |   92 +-
 .../composites/dft/visitors/bloom_visitor.cpp |  652 ++++
 .../dft/visitors/hash_table_visitor.cpp       |   96 +
 .../dft/visitors/manifest_visitor.cpp         |  128 +
 .../streaming_file_merger_utility.cpp         |   13 +-
 .../utils/utilities/fileio/chunk_writer.cpp   |   64 +-
 .../utilities/fileio/parallel/layout.cpp      |  148 +
 .../utils/utilities/fileio/parallel/merge.cpp |   83 +
 .../fileio/parallel/padded_striped_writer.cpp |  328 ++
 .../fileio/parallel/sharded_writer.cpp        |  135 +
 .../fileio/parallel/striped_writer.cpp        |  147 +
 .../indexer/index_builder_utility.cpp         |  789 +++-
 .../utilities/indexer/index_database.cpp      | 2340 ++++++-----
 .../index_database_sst_writer_context.cpp     |  399 ++
 .../indexer/index_database_writer_context.cpp | 1279 +++++++
 .../indexer/internal/common/gzip_inflater.h   |   61 +-
 .../internal/common/gzip_member_scanner.h     |  107 +
 .../indexer/internal/gzip/gzip_indexer.cpp    |  553 ++-
 .../indexer/internal/gzip/gzip_indexer.h      |   44 +
 .../utilities/indexer/internal/helpers.cpp    |    1 -
 .../indexer/internal/index_batch_writer.h     |  120 +
 .../indexer/internal/index_encoding.cpp       |  309 ++
 .../indexer/internal/tar/tar_indexer.cpp      |  101 +-
 .../indexer/internal/transaction_scope.h      |   39 +-
 .../utilities/indexer/provenance_database.cpp |  111 +-
 .../indexer/visitors/bloom_visitor.cpp        |  240 --
 .../indexer/visitors/manifest_visitor.cpp     |   73 -
 .../utilities/reader/internal/gzip_reader.cpp |    9 +
 .../utilities/reader/internal/inflater.h      |    2 +-
 .../internal/streams/gzip_line_byte_stream.h  |   43 +-
 .../reader/internal/streams/line_stream.h     |  254 +-
 .../utils/utilities/reader/trace_reader.cpp   | 1472 ++++++-
 .../utils/utilities/replay/replay.cpp         |  612 ++-
 tests/CMakeLists.txt                          |   39 +-
 .../binaries/test_dftracer_aggregator_mpi.cpp |  391 ++
 tests/binaries/test_dftracer_call_tree.cpp    |  124 +
 .../binaries/test_dftracer_call_tree_mpi.cpp  |  281 ++
 tests/binaries/test_dftracer_comparator.cpp   |  130 +-
 .../binaries/test_dftracer_gen_fake_trace.cpp |    4 +-
 tests/binaries/test_dftracer_organize.cpp     |    7 +-
 tests/binaries/test_dftracer_server.cpp       |    2 +-
 tests/pipeline/test_coro_scope.cpp            |   12 +-
 tests/python/common.py                        |  106 +-
 tests/python/test_aggregator.py               |  295 +-
 tests/python/test_dask.py                     |   88 +-
 tests/python/test_distributed_manifest.py     |  204 +
 tests/python/test_indexer.py                  |  826 ++--
 tests/python/test_reorganization_planner.py   |   21 +-
 tests/python/test_statistics_aggregator.py    |   36 +-
 tests/python/test_statistics_query.py         |   39 +-
 tests/python/test_trace_reader.py             |  101 +-
 tests/python/test_trace_reader_arrow.py       |  262 +-
 tests/python/test_trace_reader_directory.py   |  296 ++
 tests/python/test_trace_reader_write_arrow.py |  490 +++
 tests/replay/test_replay_fidelity.cpp         |  271 ++
 tests/utilities/CMakeLists.txt                |   44 +
 .../call_tree/test_call_tree_internal.cpp     |  171 +-
 .../arrow/test_arrow_column_builder.cpp       |    1 +
 .../common/arrow/test_arrow_ipc_reader.cpp    |  528 +++
 .../common/arrow/test_arrow_ipc_writer.cpp    |  320 +-
 .../utilities/common/query/test_evaluator.cpp |   23 +-
 tests/utilities/common/query/test_query.cpp   |   77 +-
 .../statistics/test_timestamp_histogram.cpp   |  240 ++
 .../test_aggregation_augmentation.cpp         |  128 +
 .../aggregators/test_aggregation_metrics.cpp  |  119 +-
 .../test_aggregation_serialization.cpp        |  205 +
 .../aggregators/test_aggregator_utility.cpp   |   96 +-
 .../test_event_aggregator_utility.cpp         |    6 +-
 .../dft/aggregators/test_system_metrics.cpp   |  309 ++
 .../test_system_metrics_merge_operator.cpp    |  183 +
 .../dft/comparator/test_comparison_result.cpp |   18 +-
 .../dft/indexing/test_bloom_query.cpp         |   51 +-
 .../dft/indexing/test_chunk_pruner.cpp        |   38 +-
 .../indexing/test_manifest_index_builder.cpp  |    8 +-
 .../dft/indexing/test_manifest_queries.cpp    |   83 +-
 .../test_reconstruct_integration.cpp          |   37 +-
 .../test_reconstruction_planner.cpp           |   24 +-
 .../test_reorganization_planner.cpp           |   41 +-
 .../test_reorganize_integration.cpp           |   37 +-
 .../statistics/test_detailed_statistics.cpp   |   90 +-
 .../statistics/test_statistics_aggregator.cpp |   82 +-
 .../dft/statistics/test_statistics_query.cpp  |   18 +-
 .../dft/statistics/test_trace_statistics.cpp  |   57 +-
 .../composites/dft/test_index_builder.cpp     |   11 +-
 .../dft/views/test_view_builder.cpp           |   74 +-
 .../utilities/composites/test_file_merger.cpp |   12 +-
 .../fileio/parallel/test_layout_sizing.cpp    |   93 +
 .../parallel/test_padded_striped_writer.cpp   |  219 ++
 .../fileio/parallel/test_sharded_writer.cpp   |  109 +
 .../fileio/parallel/test_striped_writer.cpp   |  109 +
 .../utilities/indexer/test_index_builder.cpp  |  148 +-
 .../utilities/indexer/test_index_database.cpp |  281 +-
 .../indexer/test_provenance_database.cpp      |   41 +-
 .../indexer/test_rocksdb_storage.cpp          |   40 +-
 .../indexer/test_sst_ingest_spike.cpp         |  469 +++
 tests/utilities/reader/test_trace_reader.cpp  |  279 +-
 402 files changed, 58918 insertions(+), 17518 deletions(-)
 delete mode 100644 .envrc
 delete mode 100644 flake.lock
 delete mode 100644 flake.nix
 delete mode 100644 include/dftracer/utils/call_tree/mpi/build_task.h
 delete mode 100644 include/dftracer/utils/call_tree/mpi/file_header.h
 delete mode 100644 include/dftracer/utils/call_tree/mpi/filtered_reader.h
 delete mode 100644 include/dftracer/utils/call_tree/mpi/pid_index_info.h
 delete mode 100644 include/dftracer/utils/call_tree/mpi/serialization.h
 create mode 100644 include/dftracer/utils/core/common/memory_budget.h
 delete mode 100644 include/dftracer/utils/core/rocksdb/async.h
 create mode 100644 include/dftracer/utils/core/rocksdb/column_families.h
 create mode 100644 include/dftracer/utils/utilities/common/arrow/ipc_reader.h
 create mode 100644 include/dftracer/utils/utilities/common/arrow/parallel_reader.h
 create mode 100644 include/dftracer/utils/utilities/common/arrow/partition_router.h
 create mode 100644 include/dftracer/utils/utilities/common/arrow/partition_writer.h
 create mode 100644 include/dftracer/utils/utilities/common/json/parser.h
 create mode 100644 include/dftracer/utils/utilities/common/serialization/binary_codec.h
 create mode 100644 include/dftracer/utils/utilities/common/statistics/timestamp_histogram.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h
 delete mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/args_map.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/dft_event_visitor.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/parse_inflated.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h
 create mode 100644 include/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h
 create mode 100644 include/dftracer/utils/utilities/fileio/parallel/layout.h
 create mode 100644 include/dftracer/utils/utilities/fileio/parallel/merge.h
 create mode 100644 include/dftracer/utils/utilities/fileio/parallel/parallel_writer.h
 create mode 100644 include/dftracer/utils/utilities/indexer/file_partition.h
 create mode 100644 include/dftracer/utils/utilities/indexer/index_batch_sink.h
 create mode 100644 include/dftracer/utils/utilities/indexer/index_database_sst_writer_context.h
 create mode 100644 include/dftracer/utils/utilities/indexer/index_database_writer_context.h
 create mode 100644 include/dftracer/utils/utilities/indexer/index_file_entry_capability.h
 create mode 100644 include/dftracer/utils/utilities/indexer/index_types.h
 create mode 100644 include/dftracer/utils/utilities/indexer/internal/index_encoding.h
 create mode 100644 include/dftracer/utils/utilities/indexer/internal/payload_codec.h
 create mode 100644 include/dftracer/utils/utilities/indexer/internal/statistics_codec.h
 delete mode 100644 include/dftracer/utils/utilities/indexer/visitors/bloom_visitor.h
 delete mode 100644 include/dftracer/utils/utilities/indexer/visitors/manifest_visitor.h
 create mode 100644 python/dftracer/utils/indexer.py
 create mode 100644 python/dftracer/utils/trace_reader.py
 create mode 100644 src/dftracer/utils/binaries/common_cli.h
 create mode 100644 src/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp
 create mode 100644 src/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp
 create mode 100644 src/dftracer/utils/core/common/memory_budget.cpp
 delete mode 100644 src/dftracer/utils/core/rocksdb/async.cpp
 create mode 100644 src/dftracer/utils/python/arrow_parallel_reader.cpp
 create mode 100644 src/dftracer/utils/python/arrow_parallel_reader.h
 create mode 100644 src/dftracer/utils/python/arrow_stream_capsule.cpp
 create mode 100644 src/dftracer/utils/python/arrow_stream_capsule.h
 create mode 100644 src/dftracer/utils/python/batch_byte_size.h
 create mode 100644 src/dftracer/utils/python/batch_indexer.cpp
 create mode 100644 src/dftracer/utils/python/batch_indexer.h
 create mode 100644 src/dftracer/utils/python/index_database.cpp
 create mode 100644 src/dftracer/utils/python/index_database.h
 create mode 100644 src/dftracer/utils/python/memoryview_batch.cpp
 create mode 100644 src/dftracer/utils/python/memoryview_batch.h
 create mode 100644 src/dftracer/utils/python/schema_reconcile.cpp
 create mode 100644 src/dftracer/utils/python/schema_reconcile.h
 create mode 100644 src/dftracer/utils/python/sst_distribution.cpp
 create mode 100644 src/dftracer/utils/python/sst_distribution.h
 create mode 100644 src/dftracer/utils/python/streaming_iterator.cpp
 create mode 100644 src/dftracer/utils/python/streaming_iterator.h
 create mode 100644 src/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp
 create mode 100644 src/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp
 create mode 100644 src/dftracer/utils/utilities/common/arrow/ipc_reader.cpp
 create mode 100644 src/dftracer/utils/utilities/common/arrow/parallel_reader.cpp
 create mode 100644 src/dftracer/utils/utilities/common/arrow/partition_router.cpp
 create mode 100644 src/dftracer/utils/utilities/common/arrow/partition_writer.cpp
 create mode 100644 src/dftracer/utils/utilities/common/json/parser.cpp
 create mode 100644 src/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp
 delete mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp
 create mode 100644 src/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp
 create mode 100644 src/dftracer/utils/utilities/fileio/parallel/layout.cpp
 create mode 100644 src/dftracer/utils/utilities/fileio/parallel/merge.cpp
 create mode 100644 src/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp
 create mode 100644 src/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp
 create mode 100644 src/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp
 create mode 100644 src/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp
 create mode 100644 src/dftracer/utils/utilities/indexer/index_database_writer_context.cpp
 create mode 100644 src/dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h
 create mode 100644 src/dftracer/utils/utilities/indexer/internal/index_batch_writer.h
 create mode 100644 src/dftracer/utils/utilities/indexer/internal/index_encoding.cpp
 delete mode 100644 src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp
 delete mode 100644 src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp
 create mode 100644 tests/binaries/test_dftracer_aggregator_mpi.cpp
 create mode 100644 tests/binaries/test_dftracer_call_tree.cpp
 create mode 100644 tests/binaries/test_dftracer_call_tree_mpi.cpp
 create mode 100644 tests/python/test_distributed_manifest.py
 create mode 100644 tests/python/test_trace_reader_directory.py
 create mode 100644 tests/python/test_trace_reader_write_arrow.py
 create mode 100644 tests/replay/test_replay_fidelity.cpp
 create mode 100644 tests/utilities/common/arrow/test_arrow_ipc_reader.cpp
 create mode 100644 tests/utilities/common/statistics/test_timestamp_histogram.cpp
 create mode 100644 tests/utilities/composites/dft/aggregators/test_aggregation_augmentation.cpp
 create mode 100644 tests/utilities/composites/dft/aggregators/test_aggregation_serialization.cpp
 create mode 100644 tests/utilities/composites/dft/aggregators/test_system_metrics.cpp
 create mode 100644 tests/utilities/composites/dft/aggregators/test_system_metrics_merge_operator.cpp
 create mode 100644 tests/utilities/fileio/parallel/test_layout_sizing.cpp
 create mode 100644 tests/utilities/fileio/parallel/test_padded_striped_writer.cpp
 create mode 100644 tests/utilities/fileio/parallel/test_sharded_writer.cpp
 create mode 100644 tests/utilities/fileio/parallel/test_striped_writer.cpp
 create mode 100644 tests/utilities/indexer/test_sst_ingest_spike.cpp

diff --git a/.envrc b/.envrc
deleted file mode 100644
index 3550a30f..00000000
--- a/.envrc
+++ /dev/null
@@ -1 +0,0 @@
-use flake
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4ef4b04e..00ed292a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,9 +2,10 @@ name: CI
 
 on:
   push:
-    branches: [ main, develop, initialize, 'feat/**', 'fix/**', 'chore/**' ]
+    branches:
+      [main, develop, initialize, "feat/**", "fix/**", "chore/**", "perf/**"]
   pull_request:
-    branches: [ main, develop ]
+    branches: [main, develop]
   workflow_dispatch:
 
 jobs:
@@ -13,22 +14,22 @@ jobs:
     outputs:
       code: ${{ steps.filter.outputs.code }}
     steps:
-    - uses: actions/checkout@v6
-    - uses: dorny/paths-filter@v3.0.2
-      id: filter
-      with:
-        filters: |
-          code:
-            - 'include/**'
-            - 'src/**'
-            - 'tests/**'
-            - 'python/**'
-            - 'cmake/**'
-            - 'CMakeLists.txt'
-            - 'CMakePresets.json'
-            - 'pyproject.toml'
-            - 'Makefile'
-            - '.github/workflows/ci.yml'
+      - uses: actions/checkout@v6
+      - uses: dorny/paths-filter@v3.0.2
+        id: filter
+        with:
+          filters: |
+            code:
+              - 'include/**'
+              - 'src/**'
+              - 'tests/**'
+              - 'python/**'
+              - 'cmake/**'
+              - 'CMakeLists.txt'
+              - 'CMakePresets.json'
+              - 'pyproject.toml'
+              - 'Makefile'
+              - '.github/workflows/ci.yml'
 
   test:
     needs: changes
@@ -42,82 +43,82 @@ jobs:
       matrix:
         os: [ubuntu-22.04, ubuntu-24.04, ubuntu-latest, macos-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
-    
+
     steps:
-    - uses: actions/checkout@v6
-    
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v6.1.0
-      with:
-        python-version: ${{ matrix.python-version }}
+      - uses: actions/checkout@v6
 
-    - name: Cache ccache
-      uses: actions/cache@v5
-      with:
-        path: ~/.ccache
-        key: ccache-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('CMakeLists.txt', 'pyproject.toml', '.github/workflows/ci.yml') }}
-        restore-keys: |
-          ccache-${{ runner.os }}-${{ matrix.python-version }}-
-          ccache-${{ runner.os }}-
-    
-    - name: Install dependencies (Ubuntu)
-      if: runner.os == 'Linux'
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y build-essential cmake ccache lcov zlib1g-dev libsqlite3-dev pkg-config ninja-build
-    
-    - name: Install dependencies (macOS)
-      if: runner.os == 'macOS'
-      run: |
-        brew update
-        for f in cmake ccache lcov zlib sqlite pkg-config ninja; do
-          if brew list --versions "$f" >/dev/null; then
-            echo "$f already installed"
-          else
-            brew install "$f"
-          fi
-        done
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v6.1.0
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Cache ccache
+        uses: actions/cache@v5
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('CMakeLists.txt', 'pyproject.toml', '.github/workflows/ci.yml') }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.python-version }}-
+            ccache-${{ runner.os }}-
+
+      - name: Install dependencies (Ubuntu)
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential cmake ccache lcov zlib1g-dev libsqlite3-dev pkg-config ninja-build
 
-    - name: Run coverage
-      if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12'
-      run: |
-        make coverage
+      - name: Install dependencies (macOS)
+        if: runner.os == 'macOS'
+        run: |
+          brew update
+          for f in cmake ccache lcov zlib sqlite pkg-config ninja; do
+            if brew list --versions "$f" >/dev/null; then
+              echo "$f already installed"
+            else
+              brew install "$f"
+            fi
+          done
 
-    - name: Run test (Unix)
-      if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')"
-      run: |
-        make test
-    
-    - name: Run Python tests (with venv)
-      if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')"
-      run: |
-        if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.python-version }}" = "3.12" ]; then
-          make test-py RUN_TY=1
-        else
-          make test-py
-        fi
+      - name: Run coverage
+        if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12'
+        run: |
+          make coverage
+
+      - name: Run test (Unix)
+        if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')"
+        run: |
+          make test
+
+      - name: Run Python tests (with venv)
+        if: "!((matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12')"
+        run: |
+          if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.python-version }}" = "3.12" ]; then
+            make test-py RUN_TY=1
+          else
+            make test-py
+          fi
 
-    - name: Upload coverage reports to Coveralls
-      if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12'
-      uses: coverallsapp/github-action@v2.3.6
-      continue-on-error: true
-      with:
-        file: coverage/coverage_filtered.info
-        format: lcov
-        flag-name: ${{ matrix.os }}
-        parallel: true
-      env:
-        COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
+      - name: Upload coverage reports to Coveralls
+        if: (matrix.os == 'ubuntu-22.04' || matrix.os == 'macos-latest') && matrix.python-version == '3.12'
+        uses: coverallsapp/github-action@v2.3.6
+        continue-on-error: true
+        with:
+          file: coverage/coverage_filtered.info
+          format: lcov
+          flag-name: ${{ matrix.os }}
+          parallel: true
+        env:
+          COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
 
   coverage-finish:
     needs: test
     if: always()
     runs-on: ubuntu-latest
     steps:
-    - name: Coveralls finished
-      uses: coverallsapp/github-action@v2.3.6
-      continue-on-error: true
-      with:
-        parallel-finished: true
-      env:
-        COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
+      - name: Coveralls finished
+        uses: coverallsapp/github-action@v2.3.6
+        continue-on-error: true
+        with:
+          parallel-finished: true
+        env:
+          COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 3cb352f7..e4ec0d8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -94,3 +94,4 @@ PLANS.md
 docs/plans
 
 profiling-results*/
+dfanalyzer/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 238c0f41..9985f689 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,8 +65,11 @@ option(DFTRACER_UTILS_ENABLE_ASAN "Enable AddressSanitizer" OFF)
 option(DFTRACER_UTILS_ENABLE_UBSAN "Enable UndefinedBehaviorSanitizer" OFF)
 option(DFTRACER_UTILS_ENABLE_TSAN "Enable ThreadSanitizer" OFF)
 option(DFTRACER_UTILS_ENABLE_MPI "Enable MPI support for call tree" OFF)
+option(DFTRACER_USE_ZLIB_NG "Use zlib-ng (compat) instead of madler/zlib; falls back to madler on failure" ON)
 option(DFTRACER_UTILS_ENABLE_ARROW "Enable Arrow C Data Interface via nanoarrow" ON)
 option(DFTRACER_UTILS_ENABLE_ARROW_IPC "Enable Arrow IPC file read/write via nanoarrow" ON)
+option(DFTRACER_UTILS_ENABLE_ZSTD "Enable ZSTD compression for RocksDB" ON)
+option(DFTRACER_UTILS_ENABLE_LZ4 "Enable LZ4 compression for RocksDB" OFF)
 
 if(DFTRACER_UTILS_TESTS)
   message(STATUS "Building tests")
@@ -136,6 +139,18 @@ else()
   message(STATUS "kqueue support: disabled (sys/event.h not found)")
 endif()
 
+# lustreapi: optional for stripe_count / stripe_size queries. When absent, the
+# parallel writer treats Lustre like an opaque POSIX filesystem.
+check_include_file("lustre/lustreapi.h" HAVE_LUSTRE_LUSTREAPI_H)
+find_library(LUSTREAPI_LIBRARY NAMES lustreapi)
+if(HAVE_LUSTRE_LUSTREAPI_H AND LUSTREAPI_LIBRARY)
+  set(DFTRACER_UTILS_HAVE_LUSTREAPI ON)
+  message(STATUS "lustreapi support: enabled (${LUSTREAPI_LIBRARY})")
+else()
+  set(DFTRACER_UTILS_HAVE_LUSTREAPI OFF)
+  message(STATUS "lustreapi support: disabled (header or library not found)")
+endif()
+
 # Set C++ standard
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
diff --git a/CMakePresets.json b/CMakePresets.json
index dfb7deab..e3c0079a 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -77,7 +77,8 @@
       "inherits": "dev",
       "cacheVariables": {
         "DFTRACER_UTILS_TESTS": "ON",
-        "DFTRACER_UTILS_DEBUG": "ON"
+        "DFTRACER_UTILS_DEBUG": "ON",
+        "DFTRACER_UTILS_BUILD_PYTHON": "ON"
       }
     },
     {
diff --git a/cmake/configure_files/dftracer_utils_config.dbg.h.in b/cmake/configure_files/dftracer_utils_config.dbg.h.in
index 114d35cb..bd8a58f9 100644
--- a/cmake/configure_files/dftracer_utils_config.dbg.h.in
+++ b/cmake/configure_files/dftracer_utils_config.dbg.h.in
@@ -31,6 +31,13 @@
 /* Macro flags */
 #cmakedefine DFTRACER_UTILS_HAS_STD_FILESYSTEM 1
 
+/* Feature flags */
+#cmakedefine DFTRACER_UTILS_ENABLE_ARROW 1
+#cmakedefine DFTRACER_UTILS_ENABLE_ARROW_IPC 1
+#cmakedefine DFTRACER_UTILS_ENABLE_LZ4 1
+#cmakedefine DFTRACER_UTILS_ENABLE_ZSTD 1
+#cmakedefine DFTRACER_UTILS_HAVE_LUSTREAPI 1
+
 #define DFTRACER_UTILS_LOGGER_CPP_LOGGER 1
 #define DFTRACER_UTILS_LOGGER_LEVEL_TRACE 1
 #define DFTRACER_UTILS_LOGGER_LEVEL_DEBUG 1
diff --git a/cmake/configure_files/dftracer_utils_config.h.in b/cmake/configure_files/dftracer_utils_config.h.in
index 7c4482cc..486f59e7 100644
--- a/cmake/configure_files/dftracer_utils_config.h.in
+++ b/cmake/configure_files/dftracer_utils_config.h.in
@@ -31,6 +31,13 @@
 /* Macro flags */
 #cmakedefine DFTRACER_UTILS_HAS_STD_FILESYSTEM 1
 
+/* Feature flags */
+#cmakedefine DFTRACER_UTILS_ENABLE_ARROW 1
+#cmakedefine DFTRACER_UTILS_ENABLE_ARROW_IPC 1
+#cmakedefine DFTRACER_UTILS_ENABLE_LZ4 1
+#cmakedefine DFTRACER_UTILS_ENABLE_ZSTD 1
+#cmakedefine DFTRACER_UTILS_HAVE_LUSTREAPI 1
+
 #define DFTRACER_UTILS_LOGGER_CPP_LOGGER 1
 #define DFTRACER_UTILS_LOGGER_LEVEL_TRACE 0
 #define DFTRACER_UTILS_LOGGER_LEVEL_DEBUG 0
diff --git a/cmake/modules/Dependencies.cmake b/cmake/modules/Dependencies.cmake
index 2c5ccbfc..454b2ae4 100644
--- a/cmake/modules/Dependencies.cmake
+++ b/cmake/modules/Dependencies.cmake
@@ -273,6 +273,47 @@ function(need_nonstd_span)
   endif()
 endfunction()
 
+# Make the unordered_dense package available by fetching
+# martinus/unordered_dense 4.4.0 through CPM. Nothing is fetched when
+# unordered_dense_ADDED is already truthy in the visible scope.
+function(need_unordered_dense)
+  if(unordered_dense_ADDED)
+    return()
+  endif()
+
+  cpmaddpackage(
+    NAME unordered_dense
+    GITHUB_REPOSITORY martinus/unordered_dense
+    VERSION 4.4.0
+    OPTIONS "UNORDERED_DENSE_INSTALL ON"
+    FORCE YES)
+endfunction()
+
+# Expose the unordered_dense headers to TARGET_NAME. Only the include
+# directories are propagated (PUBLIC); the target itself is not linked.
+function(link_unordered_dense TARGET_NAME)
+  if(NOT TARGET_NAME)
+    message(FATAL_ERROR "link_unordered_dense: TARGET_NAME is required")
+  endif()
+
+  if(NOT TARGET ${TARGET_NAME})
+    message(FATAL_ERROR
+              "link_unordered_dense: Target '${TARGET_NAME}' does not exist")
+  endif()
+
+  # Name the CMake target that is actually tested so the error is searchable.
+  if(NOT TARGET unordered_dense::unordered_dense)
+    message(FATAL_ERROR
+              "link_unordered_dense: target unordered_dense::unordered_dense not found! Call need_unordered_dense() first.")
+  endif()
+
+  get_target_property(UD_INC unordered_dense::unordered_dense
+                      INTERFACE_INCLUDE_DIRECTORIES)
+  target_include_directories(${TARGET_NAME} PUBLIC
+    "$<BUILD_INTERFACE:${UD_INC}>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
+endfunction()
+
 function(need_tl_expected)
   # tl::expected is only needed when C++23 std::expected is unavailable
   if(CMAKE_CXX_STANDARD GREATER_EQUAL 23)
@@ -353,80 +394,94 @@ function(link_tl_expected TARGET_NAME)
 endfunction()
 
 # ==============================================================================
-# JSON and Serialization Dependencies
+# simdjson - SIMD-accelerated JSON parser (On-Demand API for zero-copy)
 # ==============================================================================
 
-function(need_yyjson)
-  if(NOT yyjson_ADDED)
+function(need_simdjson)
+  if(NOT simdjson_ADDED)
     cpmaddpackage(
       NAME
-      yyjson
+      simdjson
       GITHUB_REPOSITORY
-      ibireme/yyjson
+      simdjson/simdjson
       VERSION
-      0.12.0
+      4.6.1
       GIT_TAG
-      0.12.0
-      FORCE
-      YES
+      v4.6.1
       DOWNLOAD_ONLY
       YES)
   endif()
 
-  set(YYJSON_SOVERSION 0)
-  set(YYJSON_TARGETS)
+  if(simdjson_ADDED AND NOT TARGET simdjson)
+    message(STATUS "Building simdjson library (v4.6.1)")
 
-  if(DFTRACER_UTILS_BUILD_STATIC)
-    add_library(yyjson_static STATIC ${yyjson_SOURCE_DIR}/src/yyjson.h
-                                     ${yyjson_SOURCE_DIR}/src/yyjson.c)
-    target_include_directories(
-      yyjson_static PUBLIC $<BUILD_INTERFACE:${yyjson_SOURCE_DIR}/src>)
-    set_target_properties(
-      yyjson_static
-      PROPERTIES VERSION ${PROJECT_VERSION}
-                 SOVERSION ${YYJSON_SOVERSION}
-                 ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-    add_library(yyjson::yyjson_static ALIAS yyjson_static)
-    list(APPEND YYJSON_TARGETS yyjson_static)
-    message(STATUS "Added yyjson static library")
-  endif()
+    # simdjson is a single-header + single-source library
+    set(SIMDJSON_SOURCES
+      ${simdjson_SOURCE_DIR}/singleheader/simdjson.h
+      ${simdjson_SOURCE_DIR}/singleheader/simdjson.cpp)
 
-  if(DFTRACER_UTILS_BUILD_SHARED)
-    add_library(yyjson_shared SHARED ${yyjson_SOURCE_DIR}/src/yyjson.h
-                                     ${yyjson_SOURCE_DIR}/src/yyjson.c)
-    target_include_directories(
-      yyjson_shared PUBLIC $<BUILD_INTERFACE:${yyjson_SOURCE_DIR}/src>)
-    set_target_properties(
-      yyjson_shared
-      PROPERTIES VERSION ${PROJECT_VERSION}
-                 SOVERSION ${YYJSON_SOVERSION}
-                 OUTPUT_NAME yyjson
-                 LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
-                 ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-    add_library(yyjson::yyjson ALIAS yyjson_shared)
-    list(APPEND YYJSON_TARGETS yyjson_shared)
-    message(STATUS "Added yyjson shared library")
-  elseif(DFTRACER_UTILS_BUILD_STATIC)
-    # If only static is built, make it the default alias
-    add_library(yyjson::yyjson ALIAS yyjson_static)
-  endif()
+    set(SIMDJSON_TARGETS)
 
-  install(FILES ${yyjson_SOURCE_DIR}/src/yyjson.h
-          DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-  if(YYJSON_TARGETS)
-    install(
-      TARGETS ${YYJSON_TARGETS}
-      EXPORT yyjsonTargets
-      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-      LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    if(DFTRACER_UTILS_BUILD_STATIC)
+      add_library(simdjson_static STATIC ${SIMDJSON_SOURCES})
+      target_include_directories(
+        simdjson_static SYSTEM PUBLIC
+        $<BUILD_INTERFACE:${simdjson_SOURCE_DIR}/singleheader>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+      target_compile_features(simdjson_static PUBLIC cxx_std_17)
+      # Suppress warnings from simdjson (third-party code)
+      target_compile_options(simdjson_static PRIVATE -w)
+      set_target_properties(
+        simdjson_static
+        PROPERTIES
+          OUTPUT_NAME simdjson
+          ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+          POSITION_INDEPENDENT_CODE ON)
+      add_library(simdjson::simdjson_static ALIAS simdjson_static)
+      list(APPEND SIMDJSON_TARGETS simdjson_static)
+      message(STATUS "Added simdjson static library")
+    endif()
 
-    # Install the export set so other projects can find yyjson
-    install(
-      EXPORT yyjsonTargets
-      FILE yyjsonTargets.cmake
-      NAMESPACE yyjson::
-      DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/yyjson)
+    if(DFTRACER_UTILS_BUILD_SHARED)
+      add_library(simdjson_shared SHARED ${SIMDJSON_SOURCES})
+      target_include_directories(
+        simdjson_shared SYSTEM PUBLIC
+        $<BUILD_INTERFACE:${simdjson_SOURCE_DIR}/singleheader>
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+      target_compile_features(simdjson_shared PUBLIC cxx_std_17)
+      # Suppress warnings from simdjson (third-party code)
+      target_compile_options(simdjson_shared PRIVATE -w)
+      set_target_properties(
+        simdjson_shared
+        PROPERTIES
+          OUTPUT_NAME simdjson
+          LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+          ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+      add_library(simdjson::simdjson ALIAS simdjson_shared)
+      list(APPEND SIMDJSON_TARGETS simdjson_shared)
+      message(STATUS "Added simdjson shared library")
+    elseif(DFTRACER_UTILS_BUILD_STATIC)
+      add_library(simdjson::simdjson ALIAS simdjson_static)
+    endif()
+
+    # Install header
+    install(FILES ${simdjson_SOURCE_DIR}/singleheader/simdjson.h
+            DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+    if(SIMDJSON_TARGETS)
+      install(
+        TARGETS ${SIMDJSON_TARGETS}
+        EXPORT simdjsonTargets
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+      install(
+        EXPORT simdjsonTargets
+        FILE simdjsonTargets.cmake
+        NAMESPACE simdjson::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/simdjson)
+    endif()
   endif()
 endfunction()
 
@@ -497,13 +552,13 @@ function(need_rocksdb)
         "ROCKSDB_BUILD_SHARED ${DFTRACER_UTILS_BUILD_SHARED}"
         "WITH_TESTS OFF"
         "WITH_TOOLS OFF"
-        "WITH_CORE_TOOLS OFF"
+        "WITH_CORE_TOOLS ON"
         "WITH_BENCHMARK_TOOLS OFF"
         "WITH_GFLAGS OFF"
         "WITH_SNAPPY OFF"
-        "WITH_LZ4 ON"
+        "WITH_LZ4 ${DFTRACER_UTILS_ENABLE_LZ4}"
         "WITH_ZLIB ON"
-        "WITH_ZSTD OFF"
+        "WITH_ZSTD ${DFTRACER_UTILS_ENABLE_ZSTD}"
         "WITH_BZ2 OFF"
         "USE_RTTI ON"
         "FAIL_ON_WARNINGS OFF"
@@ -587,6 +642,23 @@ function(need_rocksdb)
           "${CMAKE_INSTALL_RPATH}"
           PARENT_SCOPE)
 
+      # Stage rocksdb's ldb (and sst_dump) into bin/ and reuse the standard
+      # $ORIGIN/../lib rpath helper so they find librocksdb.so without
+      # LD_LIBRARY_PATH. Install alongside our own binaries and ship a
+      # venv wrapper when building a Python wheel.
+      foreach(tool ldb sst_dump)
+        if(TARGET ${tool})
+          set_target_properties(
+            ${tool} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+                               "${CMAKE_BINARY_DIR}/bin")
+          target_add_rpath(${tool})
+          install(TARGETS ${tool} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+          if(SKBUILD)
+            create_python_wrapper(${tool})
+          endif()
+        endif()
+      endforeach()
+
       set(RocksDB_FOUND
           TRUE
           PARENT_SCOPE)
@@ -840,7 +912,130 @@ function(need_lz4)
   endif()
 endfunction()
 
+function(_try_zlib_ng OUT_VAR)
+  set(${OUT_VAR}
+      FALSE
+      PARENT_SCOPE)
+
+  cpmaddpackage(
+    NAME
+    zlib-ng
+    GITHUB_REPOSITORY
+    zlib-ng/zlib-ng
+    VERSION
+    2.3.3
+    GIT_TAG
+    2.3.3
+    OPTIONS
+    "ZLIB_COMPAT ON"
+    "ZLIB_ENABLE_TESTS OFF"
+    "ZLIBNG_ENABLE_TESTS OFF"
+    "WITH_GTEST OFF"
+    "WITH_OPTIM ON"
+    "WITH_NEW_STRATEGIES ON"
+    "WITH_NATIVE_INSTRUCTIONS OFF"
+    "INSTALL_UTILS OFF"
+    "SKIP_INSTALL_ALL ON")
+
+  if(NOT zlib-ng_ADDED)
+    message(WARNING "zlib-ng CPM add failed; will fall back to madler/zlib")
+    return()
+  endif()
+
+  # zlib-ng compat mode: real targets are `zlib-ng` (shared) and
+  # `zlib-ng-static` (static); `zlib`/`zlibstatic` are ALIAS-only and cannot
+  # have properties or further aliases set on them.
+  set(ZLIB_NG_TARGETS)
+  if(DFTRACER_UTILS_BUILD_SHARED AND TARGET zlib-ng)
+    get_target_property(_zng_type zlib-ng TYPE)
+    if(_zng_type STREQUAL "SHARED_LIBRARY")
+      set_target_properties(
+        zlib-ng PROPERTIES OUTPUT_NAME dftracer_zlib LIBRARY_OUTPUT_DIRECTORY
+                                                     ${CMAKE_BINARY_DIR}/lib)
+      target_include_directories(
+        zlib-ng PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+      add_library(dftracer_zlib_shared ALIAS zlib-ng)
+      add_library(dftracer::zlib ALIAS zlib-ng)
+      list(APPEND ZLIB_NG_TARGETS zlib-ng)
+      message(STATUS "Using zlib-ng (compat, shared) as dftracer_zlib")
+    endif()
+  endif()
+
+  if(DFTRACER_UTILS_BUILD_STATIC AND TARGET zlib-ng-static)
+    set_target_properties(
+      zlib-ng-static PROPERTIES OUTPUT_NAME dftracer_zlib
+                                ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+    target_include_directories(
+      zlib-ng-static PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+    add_library(dftracer_zlib_static ALIAS zlib-ng-static)
+    add_library(dftracer::zlibstatic ALIAS zlib-ng-static)
+    if(NOT TARGET dftracer::zlib)
+      add_library(dftracer::zlib ALIAS zlib-ng-static)
+    endif()
+    list(APPEND ZLIB_NG_TARGETS zlib-ng-static)
+    message(STATUS "Using zlib-ng (compat, static) as dftracer_zlib")
+  endif()
+
+  if(NOT ZLIB_NG_TARGETS)
+    message(WARNING "zlib-ng targets not found after CPM add; falling back")
+    return()
+  endif()
+
+  install(
+    TARGETS ${ZLIB_NG_TARGETS}
+    EXPORT ZlibTargets
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+  install(
+    EXPORT ZlibTargets
+    FILE ZlibTargets.cmake
+    NAMESPACE dftracer::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/zlib)
+
+  # Compat headers: zlib-ng generates zlib.h/zconf.h in its binary dir when
+  # ZLIB_COMPAT=ON. Fall back to source dir if generated copy is absent.
+  foreach(hdr zlib.h zconf.h)
+    if(EXISTS "${zlib-ng_BINARY_DIR}/${hdr}")
+      install(FILES "${zlib-ng_BINARY_DIR}/${hdr}"
+              DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+    elseif(EXISTS "${zlib-ng_SOURCE_DIR}/${hdr}")
+      install(FILES "${zlib-ng_SOURCE_DIR}/${hdr}"
+              DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+    endif()
+  endforeach()
+
+  set(ZLIB_SOURCE_DIR
+      ${zlib-ng_SOURCE_DIR}
+      PARENT_SCOPE)
+  set(ZLIB_BINARY_DIR
+      ${zlib-ng_BINARY_DIR}
+      PARENT_SCOPE)
+  set(${OUT_VAR}
+      TRUE
+      PARENT_SCOPE)
+endfunction()
+
 function(need_zlib)
+  if(DFTRACER_USE_ZLIB_NG)
+    _try_zlib_ng(_ZLIB_NG_OK)
+    if(_ZLIB_NG_OK)
+      set(ZLIB_CPM
+          TRUE
+          PARENT_SCOPE)
+      set(ZLIB_SOURCE_DIR
+          ${ZLIB_SOURCE_DIR}
+          PARENT_SCOPE)
+      set(ZLIB_BINARY_DIR
+          ${ZLIB_BINARY_DIR}
+          PARENT_SCOPE)
+      set(ZLIB_FOUND
+          FALSE
+          PARENT_SCOPE)
+      return()
+    endif()
+  endif()
+
   find_package(ZLIB 1.2 QUIET)
 
   if(ZLIB_FOUND)
@@ -1132,50 +1327,119 @@ function(link_zlib TARGET_NAME LIBRARY_TYPE)
   endif()
 endfunction()
 
+# Locate zstd, preferring an installed copy (CONFIG package first, then a bare
+# header/library probe) and otherwise building v1.5.7 through CPM. On success
+# zstd_FOUND and zstd_CPM are set in the caller's scope.
+function(need_zstd)
+  find_package(zstd QUIET CONFIG)
+  if(NOT zstd_FOUND)
+    find_path(zstd_INCLUDE_DIRS NAMES zstd.h)
+    find_library(zstd_LIBRARIES NAMES zstd)
+    if(zstd_INCLUDE_DIRS AND zstd_LIBRARIES)
+      set(zstd_FOUND TRUE)
+    endif()
+  endif()
+
+  if(zstd_FOUND)
+    message(STATUS "Found system zstd")
+    if(NOT TARGET zstd::libzstd_shared AND NOT TARGET zstd::libzstd_static)
+      if(DEFINED zstd_LIBRARIES)
+        add_library(zstd::libzstd_shared UNKNOWN IMPORTED)
+        set_target_properties(
+          zstd::libzstd_shared
+          PROPERTIES IMPORTED_LOCATION "${zstd_LIBRARIES}"
+                     INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIRS}")
+      endif()
+    endif()
+    set(zstd_FOUND TRUE PARENT_SCOPE)
+    set(zstd_CPM FALSE PARENT_SCOPE)
+  else()
+    if(NOT zstd_ADDED)
+      cpmaddpackage(
+        NAME
+        zstd
+        GITHUB_REPOSITORY
+        facebook/zstd
+        VERSION
+        1.5.7
+        GIT_TAG
+        v1.5.7
+        SOURCE_SUBDIR
+        build/cmake
+        OPTIONS
+        "ZSTD_BUILD_PROGRAMS OFF"
+        "ZSTD_BUILD_TESTS OFF"
+        "ZSTD_BUILD_SHARED ${DFTRACER_UTILS_BUILD_SHARED}"
+        "ZSTD_BUILD_STATIC ON")
+    endif()
+
+    if(zstd_ADDED)
+      message(STATUS "Built zstd with CPM")
+      # The in-tree zstd build defines un-namespaced libzstd_* targets; alias
+      # them to the zstd:: names that consumers probe with if(TARGET ...).
+      foreach(_zstd_kind static shared)
+        if(TARGET libzstd_${_zstd_kind} AND NOT TARGET zstd::libzstd_${_zstd_kind})
+          add_library(zstd::libzstd_${_zstd_kind} ALIAS libzstd_${_zstd_kind})
+        endif()
+      endforeach()
+      set(zstd_FOUND TRUE PARENT_SCOPE)
+      set(zstd_CPM TRUE PARENT_SCOPE)
+      set(zstd_FOUND TRUE CACHE BOOL "zstd availability" FORCE)
+    endif()
+  endif()
+endfunction()
+
 # ==============================================================================
 # Hashing and Cryptography Dependencies
 # ==============================================================================
 
-function(link_yyjson TARGET_NAME LIBRARY_TYPE)
+function(link_simdjson TARGET_NAME LIBRARY_TYPE)
   # Validate parameters
   if(NOT TARGET_NAME)
-    message(FATAL_ERROR "link_yyjson: TARGET_NAME is required")
+    message(FATAL_ERROR "link_simdjson: TARGET_NAME is required")
   endif()
 
   if(NOT LIBRARY_TYPE MATCHES "^(STATIC|SHARED)$")
     message(
-      FATAL_ERROR "link_yyjson: LIBRARY_TYPE must be either STATIC or SHARED")
+      FATAL_ERROR "link_simdjson: LIBRARY_TYPE must be either STATIC or SHARED")
   endif()
 
   if(NOT TARGET ${TARGET_NAME})
-    message(FATAL_ERROR "link_yyjson: Target '${TARGET_NAME}' does not exist")
+    message(FATAL_ERROR "link_simdjson: Target '${TARGET_NAME}' does not exist")
   endif()
 
-  # Link appropriate yyjson variant Use PUBLIC linkage since yyjson headers may
-  # be included in public headers
+  # Link appropriate simdjson variant
   if(LIBRARY_TYPE STREQUAL "STATIC")
-    # For static libraries, prefer static yyjson if available
-    if(TARGET yyjson_static)
-      target_link_libraries(${TARGET_NAME} PUBLIC yyjson::yyjson_static)
-      message(STATUS "Linked ${TARGET_NAME} to yyjson_static")
-    elseif(TARGET yyjson_shared)
-      target_link_libraries(${TARGET_NAME} PUBLIC yyjson::yyjson)
-      message(STATUS "Linked ${TARGET_NAME} to yyjson (shared)")
+    # For static libraries, prefer static simdjson if available
+    if(TARGET simdjson_static)
+      target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson_static)
+      message(STATUS "Linked ${TARGET_NAME} to simdjson_static")
+    elseif(TARGET simdjson_shared)
+      target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson)
+      message(STATUS "Linked ${TARGET_NAME} to simdjson (shared)")
+    elseif(TARGET simdjson::simdjson)
+      # System / find_package() simdjson (e.g. Homebrew on macOS).
+      target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson)
+      message(STATUS "Linked ${TARGET_NAME} to system simdjson::simdjson")
     else()
       message(
-        FATAL_ERROR "link_yyjson: No yyjson found! Call need_yyjson() first.")
+        FATAL_ERROR "link_simdjson: No simdjson found! Call need_simdjson() first.")
     endif()
   else() # SHARED
-    # For shared libraries, prefer shared yyjson if available
-    if(TARGET yyjson_shared)
-      target_link_libraries(${TARGET_NAME} PUBLIC yyjson::yyjson)
-      message(STATUS "Linked ${TARGET_NAME} to yyjson (shared)")
-    elseif(TARGET yyjson_static)
-      target_link_libraries(${TARGET_NAME} PUBLIC yyjson::yyjson_static)
-      message(STATUS "Linked ${TARGET_NAME} to yyjson_static")
+    # For shared libraries, prefer shared simdjson if available
+    if(TARGET simdjson_shared)
+      target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson)
+      message(STATUS "Linked ${TARGET_NAME} to simdjson (shared)")
+    elseif(TARGET simdjson_static)
+      target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson_static)
+      message(STATUS "Linked ${TARGET_NAME} to simdjson_static")
+    elseif(TARGET simdjson::simdjson)
+      # System / find_package() simdjson (e.g. Homebrew on macOS).
+      target_link_libraries(${TARGET_NAME} PUBLIC simdjson::simdjson)
+      message(STATUS "Linked ${TARGET_NAME} to system simdjson::simdjson")
     else()
       message(
-        FATAL_ERROR "link_yyjson: No yyjson found! Call need_yyjson() first.")
+        FATAL_ERROR "link_simdjson: No simdjson found! Call need_simdjson() first.")
     endif()
   endif()
 endfunction()
@@ -1350,12 +1614,16 @@ function(need_nanoarrow)
           nanoarrow_static
           PUBLIC $<BUILD_INTERFACE:${NANOARROW_FLATCC_INCLUDE}>
                  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-      endif()
-      target_compile_definitions(nanoarrow_static
-                                 PUBLIC DFTRACER_UTILS_ENABLE_ARROW)
-      if(DFTRACER_UTILS_ENABLE_ARROW_IPC)
-        target_compile_definitions(nanoarrow_static
-                                   PUBLIC DFTRACER_UTILS_ENABLE_ARROW_IPC)
+        # Enable zstd compression for Arrow IPC
+        if(DFTRACER_UTILS_ENABLE_ZSTD)
+          target_compile_definitions(nanoarrow_static
+                                     PRIVATE NANOARROW_IPC_WITH_ZSTD)
+          if(TARGET zstd::libzstd_static)
+            target_link_libraries(nanoarrow_static PRIVATE zstd::libzstd_static)
+          elseif(TARGET zstd::libzstd_shared)
+            target_link_libraries(nanoarrow_static PRIVATE zstd::libzstd_shared)
+          endif()
+        endif()
       endif()
       set_target_properties(
         nanoarrow_static
@@ -1380,12 +1648,16 @@ function(need_nanoarrow)
           nanoarrow_shared
           PUBLIC $<BUILD_INTERFACE:${NANOARROW_FLATCC_INCLUDE}>
                  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-      endif()
-      target_compile_definitions(nanoarrow_shared
-                                 PUBLIC DFTRACER_UTILS_ENABLE_ARROW)
-      if(DFTRACER_UTILS_ENABLE_ARROW_IPC)
-        target_compile_definitions(nanoarrow_shared
-                                   PUBLIC DFTRACER_UTILS_ENABLE_ARROW_IPC)
+        # Enable zstd compression for Arrow IPC
+        if(DFTRACER_UTILS_ENABLE_ZSTD)
+          target_compile_definitions(nanoarrow_shared
+                                     PRIVATE NANOARROW_IPC_WITH_ZSTD)
+          if(TARGET zstd::libzstd_shared)
+            target_link_libraries(nanoarrow_shared PRIVATE zstd::libzstd_shared)
+          elseif(TARGET zstd::libzstd_static)
+            target_link_libraries(nanoarrow_shared PRIVATE zstd::libzstd_static)
+          endif()
+        endif()
       endif()
       set_target_properties(
         nanoarrow_shared
diff --git a/cmake/modules/InstallHelpers.cmake b/cmake/modules/InstallHelpers.cmake
index 4c776c93..c7f0f442 100644
--- a/cmake/modules/InstallHelpers.cmake
+++ b/cmake/modules/InstallHelpers.cmake
@@ -196,71 +196,87 @@ else()
     endif()
 endif()
 
-# YYJSON dependency
-find_library(YYJSON_LIBRARY_BUNDLED
-    NAMES yyjson libyyjson
+# GHC_FILESYSTEM dependency (header-only)
+find_path(GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED
+    NAMES ghc/filesystem.hpp
+    PATHS \${_IMPORT_PREFIX}/include
+    NO_DEFAULT_PATH
+)
+
+if(GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED AND NOT TARGET ghc_filesystem)
+    add_library(ghc_filesystem INTERFACE IMPORTED)
+    set_target_properties(ghc_filesystem PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES \"\${GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED}\"
+    )
+else()
+    # Try to find system ghc_filesystem
+    find_dependency(ghc_filesystem QUIET)
+endif()
+
+# UNORDERED_DENSE dependency (header-only)
+find_path(UNORDERED_DENSE_INCLUDE_DIR_BUNDLED
+    NAMES ankerl/unordered_dense.h
+    PATHS \${_IMPORT_PREFIX}/include
+    NO_DEFAULT_PATH
+)
+
+if(UNORDERED_DENSE_INCLUDE_DIR_BUNDLED AND NOT TARGET unordered_dense::unordered_dense)
+    add_library(unordered_dense::unordered_dense INTERFACE IMPORTED)
+    set_target_properties(unordered_dense::unordered_dense PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES \"\${UNORDERED_DENSE_INCLUDE_DIR_BUNDLED}\"
+    )
+else()
+    find_dependency(unordered_dense QUIET)
+endif()
+
+# SIMDJSON dependency
+find_library(SIMDJSON_LIBRARY_BUNDLED
+    NAMES simdjson libsimdjson
     PATHS \${_IMPORT_PREFIX}/lib
     NO_DEFAULT_PATH
 )
 
-if(YYJSON_LIBRARY_BUNDLED)
-    # Found yyjson that was built with this package
-    find_path(YYJSON_INCLUDE_DIR_BUNDLED
-        NAMES yyjson.h
+if(SIMDJSON_LIBRARY_BUNDLED)
+    # Found simdjson that was built with this package
+    find_path(SIMDJSON_INCLUDE_DIR_BUNDLED
+        NAMES simdjson.h
         PATHS \${_IMPORT_PREFIX}/include
         NO_DEFAULT_PATH
     )
 
-    if(YYJSON_INCLUDE_DIR_BUNDLED)
+    if(SIMDJSON_INCLUDE_DIR_BUNDLED)
         # Create shared target if not exists
-        if(NOT TARGET yyjson::yyjson)
-            add_library(yyjson::yyjson UNKNOWN IMPORTED)
-            set_target_properties(yyjson::yyjson PROPERTIES
-                IMPORTED_LOCATION \"\${YYJSON_LIBRARY_BUNDLED}\"
-                INTERFACE_INCLUDE_DIRECTORIES \"\${YYJSON_INCLUDE_DIR_BUNDLED}\"
+        if(NOT TARGET simdjson::simdjson)
+            add_library(simdjson::simdjson UNKNOWN IMPORTED)
+            set_target_properties(simdjson::simdjson PROPERTIES
+                IMPORTED_LOCATION \"\${SIMDJSON_LIBRARY_BUNDLED}\"
+                INTERFACE_INCLUDE_DIRECTORIES \"\${SIMDJSON_INCLUDE_DIR_BUNDLED}\"
             )
         endif()
 
         # Also look for static version
-        find_library(YYJSON_STATIC_LIBRARY_BUNDLED
-            NAMES yyjson_static libyyjson_static
+        find_library(SIMDJSON_STATIC_LIBRARY_BUNDLED
+            NAMES simdjson_static libsimdjson_static
             PATHS \${_IMPORT_PREFIX}/lib
             NO_DEFAULT_PATH
         )
 
-        if(YYJSON_STATIC_LIBRARY_BUNDLED AND NOT TARGET yyjson::yyjson_static)
-            add_library(yyjson::yyjson_static UNKNOWN IMPORTED)
-            set_target_properties(yyjson::yyjson_static PROPERTIES
-                IMPORTED_LOCATION \"\${YYJSON_STATIC_LIBRARY_BUNDLED}\"
-                INTERFACE_INCLUDE_DIRECTORIES \"\${YYJSON_INCLUDE_DIR_BUNDLED}\"
+        if(SIMDJSON_STATIC_LIBRARY_BUNDLED AND NOT TARGET simdjson::simdjson_static)
+            add_library(simdjson::simdjson_static UNKNOWN IMPORTED)
+            set_target_properties(simdjson::simdjson_static PROPERTIES
+                IMPORTED_LOCATION \"\${SIMDJSON_STATIC_LIBRARY_BUNDLED}\"
+                INTERFACE_INCLUDE_DIRECTORIES \"\${SIMDJSON_INCLUDE_DIR_BUNDLED}\"
             )
         endif()
     endif()
 else()
-    # Try to find system yyjson (require minimum version 0.10.0)
-    find_dependency(yyjson 0.10.0 QUIET)
-    if(NOT yyjson_FOUND)
-        message(WARNING \"yyjson not found or version too old. Minimum version 0.10.0 is required.\")
+    # Try to find system simdjson (require minimum version 3.0.0)
+    find_dependency(simdjson 3.0.0 QUIET)
+    if(NOT simdjson_FOUND)
+        message(WARNING \"simdjson not found or version too old. Minimum version 3.0.0 is required.\")
     endif()
 endif()
 
-# GHC_FILESYSTEM dependency (header-only)
-find_path(GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED
-    NAMES ghc/filesystem.hpp
-    PATHS \${_IMPORT_PREFIX}/include
-    NO_DEFAULT_PATH
-)
-
-if(GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED AND NOT TARGET ghc_filesystem)
-    add_library(ghc_filesystem INTERFACE IMPORTED)
-    set_target_properties(ghc_filesystem PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES \"\${GHC_FILESYSTEM_INCLUDE_DIR_BUNDLED}\"
-    )
-else()
-    # Try to find system ghc_filesystem
-    find_dependency(ghc_filesystem QUIET)
-endif()
-
 # CPP-LOGGER dependency
 find_library(CPP_LOGGER_LIBRARY_BUNDLED
     NAMES cpp-logger libcpp-logger
@@ -288,6 +304,151 @@ else()
     find_dependency(cpp-logger QUIET)
 endif()
 
+# LZ4 dependency (used by RocksDB)
+find_library(LZ4_LIBRARY_BUNDLED
+    NAMES lz4 liblz4
+    PATHS \${_IMPORT_PREFIX}/lib
+    NO_DEFAULT_PATH
+)
+
+if(LZ4_LIBRARY_BUNDLED)
+    find_path(LZ4_INCLUDE_DIR_BUNDLED
+        NAMES lz4.h
+        PATHS \${_IMPORT_PREFIX}/include
+        NO_DEFAULT_PATH
+    )
+
+    if(LZ4_INCLUDE_DIR_BUNDLED AND NOT TARGET lz4::lz4)
+        add_library(lz4::lz4 UNKNOWN IMPORTED)
+        set_target_properties(lz4::lz4 PROPERTIES
+            IMPORTED_LOCATION \"\${LZ4_LIBRARY_BUNDLED}\"
+            INTERFACE_INCLUDE_DIRECTORIES \"\${LZ4_INCLUDE_DIR_BUNDLED}\"
+        )
+    endif()
+else()
+    find_dependency(lz4 QUIET)
+endif()
+
+# ZSTD dependency (compression)
+find_library(ZSTD_LIBRARY_BUNDLED
+    NAMES zstd libzstd
+    PATHS \${_IMPORT_PREFIX}/lib
+    NO_DEFAULT_PATH
+)
+
+if(ZSTD_LIBRARY_BUNDLED)
+    find_path(ZSTD_INCLUDE_DIR_BUNDLED
+        NAMES zstd.h
+        PATHS \${_IMPORT_PREFIX}/include
+        NO_DEFAULT_PATH
+    )
+
+    if(ZSTD_INCLUDE_DIR_BUNDLED AND NOT TARGET zstd::libzstd_shared)
+        add_library(zstd::libzstd_shared UNKNOWN IMPORTED)
+        set_target_properties(zstd::libzstd_shared PROPERTIES
+            IMPORTED_LOCATION \"\${ZSTD_LIBRARY_BUNDLED}\"
+            INTERFACE_INCLUDE_DIRECTORIES \"\${ZSTD_INCLUDE_DIR_BUNDLED}\"
+        )
+    endif()
+
+    # Also look for static version
+    find_library(ZSTD_STATIC_LIBRARY_BUNDLED
+        NAMES zstd_static libzstd_static
+        PATHS \${_IMPORT_PREFIX}/lib
+        NO_DEFAULT_PATH
+    )
+
+    if(ZSTD_STATIC_LIBRARY_BUNDLED AND NOT TARGET zstd::libzstd_static)
+        add_library(zstd::libzstd_static UNKNOWN IMPORTED)
+        set_target_properties(zstd::libzstd_static PROPERTIES
+            IMPORTED_LOCATION \"\${ZSTD_STATIC_LIBRARY_BUNDLED}\"
+            INTERFACE_INCLUDE_DIRECTORIES \"\${ZSTD_INCLUDE_DIR_BUNDLED}\"
+        )
+    endif()
+else()
+    find_dependency(zstd QUIET)
+endif()
+
+# ROCKSDB dependency (database for indexing)
+find_library(ROCKSDB_LIBRARY_BUNDLED
+    NAMES rocksdb librocksdb
+    PATHS \${_IMPORT_PREFIX}/lib
+    NO_DEFAULT_PATH
+)
+
+if(ROCKSDB_LIBRARY_BUNDLED)
+    find_path(ROCKSDB_INCLUDE_DIR_BUNDLED
+        NAMES rocksdb/db.h
+        PATHS \${_IMPORT_PREFIX}/include
+        NO_DEFAULT_PATH
+    )
+
+    if(ROCKSDB_INCLUDE_DIR_BUNDLED AND NOT TARGET RocksDB::rocksdb)
+        add_library(RocksDB::rocksdb UNKNOWN IMPORTED)
+        set_target_properties(RocksDB::rocksdb PROPERTIES
+            IMPORTED_LOCATION \"\${ROCKSDB_LIBRARY_BUNDLED}\"
+            INTERFACE_INCLUDE_DIRECTORIES \"\${ROCKSDB_INCLUDE_DIR_BUNDLED}\"
+        )
+    endif()
+
+    # Also look for the static archive -- NOTE(review): registered below as RocksDB::rocksdb-shared, confirm the target name
+    find_library(ROCKSDB_STATIC_LIBRARY_BUNDLED
+        NAMES rocksdb_static librocksdb_static rocksdb
+        PATHS \${_IMPORT_PREFIX}/lib
+        NO_DEFAULT_PATH
+    )
+
+    if(ROCKSDB_STATIC_LIBRARY_BUNDLED AND NOT TARGET RocksDB::rocksdb-shared)
+        add_library(RocksDB::rocksdb-shared UNKNOWN IMPORTED)
+        set_target_properties(RocksDB::rocksdb-shared PROPERTIES
+            IMPORTED_LOCATION \"\${ROCKSDB_STATIC_LIBRARY_BUNDLED}\"
+            INTERFACE_INCLUDE_DIRECTORIES \"\${ROCKSDB_INCLUDE_DIR_BUNDLED}\"
+        )
+    endif()
+else()
+    find_dependency(RocksDB QUIET)
+endif()
+
+# NANOARROW dependency (Arrow support)
+find_library(NANOARROW_LIBRARY_BUNDLED
+    NAMES nanoarrow libnanoarrow
+    PATHS \${_IMPORT_PREFIX}/lib
+    NO_DEFAULT_PATH
+)
+
+if(NANOARROW_LIBRARY_BUNDLED)
+    find_path(NANOARROW_INCLUDE_DIR_BUNDLED
+        NAMES nanoarrow/nanoarrow.h
+        PATHS \${_IMPORT_PREFIX}/include
+        NO_DEFAULT_PATH
+    )
+
+    if(NANOARROW_INCLUDE_DIR_BUNDLED AND NOT TARGET nanoarrow::nanoarrow)
+        add_library(nanoarrow::nanoarrow UNKNOWN IMPORTED)
+        set_target_properties(nanoarrow::nanoarrow PROPERTIES
+            IMPORTED_LOCATION \"\${NANOARROW_LIBRARY_BUNDLED}\"
+            INTERFACE_INCLUDE_DIRECTORIES \"\${NANOARROW_INCLUDE_DIR_BUNDLED}\"
+        )
+    endif()
+
+    # Also look for static version
+    find_library(NANOARROW_STATIC_LIBRARY_BUNDLED
+        NAMES nanoarrow_static libnanoarrow_static
+        PATHS \${_IMPORT_PREFIX}/lib
+        NO_DEFAULT_PATH
+    )
+
+    if(NANOARROW_STATIC_LIBRARY_BUNDLED AND NOT TARGET nanoarrow::nanoarrow_static)
+        add_library(nanoarrow::nanoarrow_static UNKNOWN IMPORTED)
+        set_target_properties(nanoarrow::nanoarrow_static PROPERTIES
+            IMPORTED_LOCATION \"\${NANOARROW_STATIC_LIBRARY_BUNDLED}\"
+            INTERFACE_INCLUDE_DIRECTORIES \"\${NANOARROW_INCLUDE_DIR_BUNDLED}\"
+        )
+    endif()
+else()
+    find_dependency(nanoarrow QUIET)
+endif()
+
 # Include the targets file
 include(\"\${CMAKE_CURRENT_LIST_DIR}/${PKG_TARGET}Targets.cmake\")
 
diff --git a/cmake/modules/PrecompiledHeader.cmake b/cmake/modules/PrecompiledHeader.cmake
index 07e90f8e..b668db5b 100644
--- a/cmake/modules/PrecompiledHeader.cmake
+++ b/cmake/modules/PrecompiledHeader.cmake
@@ -81,9 +81,20 @@ function(detect_common_headers)
   set(FILTERED_SOURCES "")
   foreach(SOURCE_FILE ${ALL_SOURCES})
     # Exclude Python binding files (only built when DFTRACER_UTILS_BUILD_PYTHON is ON)
-    if(NOT SOURCE_FILE MATCHES "/python/")
-      list(APPEND FILTERED_SOURCES "${SOURCE_FILE}")
+    if(SOURCE_FILE MATCHES "/python/")
+      continue()
     endif()
+    # Exclude MPI-guarded sources when MPI is off. They still live on
+    # disk and include <mpi.h>, which would otherwise land in the PCH
+    # (MIN_COUNT=2 is easy to hit) and break every non-MPI target
+    # because no MPI include path is attached.
+    if(NOT DFTRACER_UTILS_ENABLE_MPI)
+      if(SOURCE_FILE MATCHES "/mpi/"
+         OR SOURCE_FILE MATCHES "_mpi\\.(cpp|cc|cxx|h|hpp)$")
+        continue()
+      endif()
+    endif()
+    list(APPEND FILTERED_SOURCES "${SOURCE_FILE}")
   endforeach()
 
   set(ALL_SOURCES "${FILTERED_SOURCES}")
diff --git a/docs/scripts/generate_api_index.py b/docs/scripts/generate_api_index.py
index 08404bf0..291e2cc9 100644
--- a/docs/scripts/generate_api_index.py
+++ b/docs/scripts/generate_api_index.py
@@ -551,12 +551,31 @@ def _generate_dir_index(
         rel = child[len(dir_path) :].lstrip("/") if dir_path else child
         entries.append(f"{rel}/index")
 
-    # Leaf modules in this directory
+    # Leaf modules in this directory. A leaf that collides with a subdir of the
+    # same name is skipped here: its page is written as "<name>/_namespace" and
+    # listed in the subdir's own toctree (see resolved_filename in generate()).
+    child_names = {c.rsplit("/", 1)[-1] for c in child_dirs}
     leaves = sorted(dir_leaves.get(dir_path, []), key=lambda m: m.filename)
     for mod in leaves:
         rel = mod.filename[len(dir_path) :].lstrip("/") if dir_path else mod.filename
+        if rel in child_names:
+            continue
         entries.append(rel)
 
+    # Also include the namespace overview page when this dir's name was a
+    # colliding leaf in the parent (file written as "<this>/_namespace.rst").
+    if dir_path:
+        leaf_name = dir_path.rsplit("/", 1)[-1] if "/" in dir_path else dir_path
+        parent_dir = dir_path.rsplit("/", 1)[0] if "/" in dir_path else ""
+        parent_leaves = dir_leaves.get(parent_dir, [])
+        for mod in parent_leaves:
+            parent_rel = (
+                mod.filename[len(parent_dir) :].lstrip("/") if parent_dir else mod.filename
+            )
+            if parent_rel == leaf_name:
+                entries.insert(0, "_namespace")
+                break
+
     if entries:
         lines.append(".. toctree::")
         lines.append("   :maxdepth: 1")
@@ -578,11 +597,25 @@ def _generate_dir_index(
         lines.append("     - Items")
         lines.append("     - Namespace")
 
+        collisions = {
+            m.filename
+            for m in all_modules
+            if any(
+                other.filename.startswith(m.filename + "/")
+                for other in all_modules
+                if other is not m
+            )
+        }
         total = 0
         for mod in all_modules:
             count = len(mod.items)
             total += count
-            lines.append(f"   * - :doc:`{mod.filename}`")
+            doc_path = (
+                f"{mod.filename}/_namespace"
+                if mod.filename in collisions
+                else mod.filename
+            )
+            lines.append(f"   * - :doc:`{doc_path}`")
             lines.append(f"     - {count}")
             lines.append(f"     - ``{mod.full_ns}``")
 
@@ -610,12 +643,27 @@ def generate(xml_dir: Path, output_dir: Path) -> None:
 
     modules = discover_modules(items)
 
+    # Detect leaf modules whose filename collides with a sibling subdir:
+    # e.g. "utilities/composites.rst" + directory "utilities/composites/".
+    # Re-route those leaves into "<filename>/_namespace.rst" so the namespace
+    # page lives under the subdir's toctree and Sphinx does not orphan it.
+    dir_paths = {mod.filename.rsplit("/", 1)[0] for mod in modules if "/" in mod.filename}
+    dir_paths |= {
+        "/".join(mod.filename.split("/")[: i + 1])
+        for mod in modules
+        for i in range(len(mod.filename.split("/")) - 1)
+    }
+    collisions = {mod.filename for mod in modules if mod.filename in dir_paths}
+
+    def resolved_filename(mod: "Module") -> str:
+        return f"{mod.filename}/_namespace" if mod.filename in collisions else mod.filename
+
     # Generate per-module pages
     output_dir.mkdir(parents=True, exist_ok=True)
-    expected_paths = {output_dir / f"{mod.filename}.rst" for mod in modules}
+    expected_paths = {output_dir / f"{resolved_filename(mod)}.rst" for mod in modules}
     for mod in modules:
         rst = generate_module_rst(mod, repo_root, repo_url, source_ref)
-        out_path = output_dir / f"{mod.filename}.rst"
+        out_path = output_dir / f"{resolved_filename(mod)}.rst"
         out_path.parent.mkdir(parents=True, exist_ok=True)
         out_path.write_text(rst)
 
diff --git a/docs/source/api/indexer.rst b/docs/source/api/indexer.rst
index 6be94a3e..3222cf37 100644
--- a/docs/source/api/indexer.rst
+++ b/docs/source/api/indexer.rst
@@ -1,13 +1,26 @@
 Indexer Module
 ==============
 
-The indexer module provides functionality for indexing and searching gzip trace
-files using a root-local ``.dftindex`` store.
+The indexer module indexes DFTracer trace files (``.pfw`` / ``.pfw.gz``)
+into a ``.dftindex`` store backed by RocksDB. The top-level
+:class:`~dftracer.utils.Indexer` follows a ``resolve`` / ``build`` pattern
+over a directory or an explicit file list and exposes the higher index tiers
+(checkpoints, bloom filters, manifests, aggregation).
+:class:`~dftracer.utils.CheckpointIndexer` is the lower-level single-file
+interface used for checkpoint-level operations.
 
 Indexer Class
 -------------
 
-.. autoclass:: dftracer.utils.Indexer(gz_path: str, index_path: str | None = None, checkpoint_size: int = 1048576, force_rebuild: bool = False, build_bloom: bool = False, build_manifest: bool = False, index_threshold: int = 8388608, runtime: Runtime | None = None)
+.. autoclass:: dftracer.utils.Indexer(directory: str = '', files: list[str] | None = None, index_dir: str = '', require_checkpoint: bool = True, require_bloom: bool = True, require_manifest: bool = True, require_aggregation: bool = False, time_interval_ms: float = 5000.0, group_keys: list[str] | None = None, custom_metric_fields: list[str] | None = None, compute_percentiles: bool = False, checkpoint_size: int = 33554432, parallelism: int = 0, force_rebuild: bool = False, runtime: Runtime | None = None)
+   :members: resolve, build, ensure_indexed, get_checkpoint_indexer, get_hash_table, query_file_pids, query_all_file_pids, query_file_info, iter_aggregation, iter_arrow_dfanalyzer, iter_arrow_dfanalyzer_all
+   :undoc-members:
+   :show-inheritance:
+
+CheckpointIndexer Class
+-----------------------
+
+.. autoclass:: dftracer.utils.CheckpointIndexer(gz_path: str, index_path: str | None = None, checkpoint_size: int = 1048576, force_rebuild: bool = False, build_bloom: bool = False, build_manifest: bool = False, runtime: Runtime | None = None)
    :members:
    :undoc-members:
    :show-inheritance:
@@ -20,3 +33,56 @@ IndexerCheckpoint Class
    :members:
    :undoc-members:
    :show-inheritance:
+
+Distributed Index (SST-based)
+-----------------------------
+
+The distributed-index path lets the coordinator pre-register files, hand out
+``file_id`` ranges to workers, and bulk-ingest worker-produced SST artifacts
+back into the unified ``.dftindex`` store.
+
+IndexDatabase
+~~~~~~~~~~~~~
+
+.. autoclass:: dftracer.utils.dftracer_utils_ext.IndexDatabase(index_path: str)
+   :members: init_schema, register_files, reserve_file_id_range, bulk_ingest, rebuild_root_summaries, write_agg_global_config, write_agg_file_markers, write_aggregation_tracker
+   :undoc-members:
+
+SstArtifactRegistry
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: dftracer.utils.dftracer_utils_ext.SstArtifactRegistry
+   :members: append
+   :undoc-members:
+
+Module-level Functions
+----------------------
+
+.. autofunction:: dftracer.utils.dftracer_utils_ext.scan_files
+
+.. autofunction:: dftracer.utils.dftracer_utils_ext.scan_aggregation_manifest
+
+.. autofunction:: dftracer.utils.dftracer_utils_ext.build_sst_batch
+
+.. autofunction:: dftracer.utils.dftracer_utils_ext.plan_lpt_partition
+
+.. autofunction:: dftracer.utils.dftracer_utils_ext.enumerate_gzip_members
+
+.. autofunction:: dftracer.utils.dftracer_utils_ext.plan_work_units
+
+.. autofunction:: dftracer.utils.dftracer_utils_ext.move_artifacts
+
+.. autofunction:: dftracer.utils.dftracer_utils_ext.enable_aggregation_deterministic_ids
+
+Dask Helpers
+------------
+
+The ``dftracer.utils.dask`` module provides Dask-distributed drivers built on
+the SST-based primitives above:
+
+.. autofunction:: dftracer.utils.dask.distributed_index
+
+.. autofunction:: dftracer.utils.dask.distributed_aggregate
+
+Dask is an optional dependency -- this module is only importable when
+``dask.distributed`` is installed.
diff --git a/docs/source/api/reader.rst b/docs/source/api/reader.rst
index 743e0d2c..f3160a3c 100644
--- a/docs/source/api/reader.rst
+++ b/docs/source/api/reader.rst
@@ -1,13 +1,27 @@
 JSON Module
 ===========
 
-The ``JSON`` class provides lazy JSON parsing backed by yyjson.
+The ``JsonDictValue`` class is a zero-copy wrapper over a parsed DFTracer JSON
+event. It is the element type yielded by
+:meth:`~dftracer.utils.TraceReader.iter_json` and
+:meth:`~dftracer.utils.TraceReader.read_json`. The underlying bytes are owned
+by the C++ reader buffer; call :meth:`JsonDictValue.to_dict` to materialize a
+regular Python dict for storage beyond the iterator's lifetime.
 
-JSON Class
-----------
+JsonDictValue Class
+-------------------
 
-.. autoclass:: dftracer.utils.JSON(json_str: str)
-   :members:
+.. autoclass:: dftracer.utils.JsonDictValue
+   :members: keys, values, items, get, to_dict
    :undoc-members:
    :show-inheritance:
-   :special-members: __getitem__, __contains__, __str__, __repr__
+   :special-members: __getitem__, __contains__, __len__
+
+.. code-block:: python
+
+   reader = TraceReader("trace.pfw.gz")
+   for event in reader.iter_json():
+       name = event["name"]                  # __getitem__
+       if "args" in event:                   # __contains__
+           ret = event["args"].get("ret")    # nested dict access
+       owned = event.to_dict()               # materialize to plain dict
diff --git a/docs/source/api/runtime.rst b/docs/source/api/runtime.rst
index 8d357dad..b6b9da88 100644
--- a/docs/source/api/runtime.rst
+++ b/docs/source/api/runtime.rst
@@ -8,12 +8,16 @@ Pipeline/DAG overhead.
 Runtime Class
 -------------
 
-.. autoclass:: dftracer.utils.Runtime
+.. autoclass:: dftracer.utils.Runtime(threads: int = 0, io_threads: int = 0)
    :members:
    :undoc-members:
    :show-inheritance:
    :special-members: __enter__, __exit__
 
+The ``threads`` argument sizes the compute pool; ``io_threads`` sizes a
+separate pool dedicated to blocking I/O tasks. Both default to ``0``,
+which lets the runtime auto-size based on the host.
+
 TaskHandle Class
 ----------------
 
@@ -40,7 +44,7 @@ without a return value.
 
    import dftracer.utils as dft
 
-   rt = dft.Runtime(threads=8, python_threads=4)
+   rt = dft.Runtime(threads=8, io_threads=4)
 
    # Submit a Python callable
    h = rt.submit(lambda x, y: x + y, 3, 4, name="add")
diff --git a/docs/source/api/trace_reader.rst b/docs/source/api/trace_reader.rst
index ee9fc936..f63d904c 100644
--- a/docs/source/api/trace_reader.rst
+++ b/docs/source/api/trace_reader.rst
@@ -8,39 +8,48 @@ RocksDB store exists.
 TraceReader Class
 -----------------
 
-.. autoclass:: dftracer.utils.TraceReader(file_path: str, index_dir: str = '', checkpoint_size: int = 33554432, auto_build_index: bool = False, index_threshold: int = 8388608, runtime: Runtime | None = None)
+.. autoclass:: dftracer.utils.TraceReader(path: str, index_dir: str = '', checkpoint_size: int = 33554432, auto_build_index: bool = False, runtime: Runtime | None = None)
    :members:
    :undoc-members:
    :show-inheritance:
    :special-members: __enter__, __exit__
 
+The ``path`` argument may be either a single trace file (``.pfw`` / ``.pfw.gz``)
+or a directory. When a directory is given, all ``iter_*`` / ``read_*`` methods
+discover ``.pfw`` and ``.pfw.gz`` files recursively and process them in parallel
+on the Runtime thread pool.
+
 Streaming Iterators
 -------------------
 
-``iter_lines()``, ``iter_raw()``, and ``iter_lines_json()`` return Python
+``iter_lines()``, ``iter_raw()``, and ``iter_json()`` return Python
 iterators backed by a bounded producer-consumer queue. The C++ coroutine runs
 on the Runtime's thread pool and pushes items; Python's ``__next__`` pops.
 
+``iter_lines()`` and ``iter_raw()`` yield ``memoryview`` objects (zero-copy
+views over the C++ buffer); ``read_lines()`` / ``read_raw()`` return lists of
+them. Wrap with ``bytes(mv)`` if you need an owned copy.
+
 .. code-block:: python
 
    reader = TraceReader("trace.pfw.gz")
 
-   # Stream decoded lines
+   # Stream decoded lines (memoryview)
    for line in reader.iter_lines():
-       process(line)  # str
+       process(bytes(line))
 
    # Stream raw byte chunks (one line per chunk)
    for chunk in reader.iter_raw(multi_line=False):
-       process(chunk)  # bytes
+       process(chunk)  # memoryview
 
-   # Stream parsed JSON objects
-   for obj in reader.iter_lines_json():
-       print(obj["name"], obj["dur"])  # lazy JSON access
+   # Stream parsed JSON events (zero-copy JsonDictValue wrappers)
+   for obj in reader.iter_json():
+       print(obj["name"], obj["dur"])
 
    # Materialize to list
-   lines = reader.read_lines()        # list[str]
-   chunks = reader.read_raw()         # list[bytes]
-   objects = reader.read_lines_json()  # list[JSON]
+   lines = reader.read_lines()        # list[memoryview]
+   chunks = reader.read_raw()         # list[memoryview]
+   objects = reader.read_json()       # list[JsonDictValue]
 
 Arrow Output
 ------------
@@ -48,7 +57,9 @@ Arrow Output
 ``iter_arrow()`` and ``read_arrow()`` parse JSON events into columnar Arrow
 record batches using dynamic schema discovery. Each JSON key becomes a column;
 types are inferred from values (int64, uint64, double, string, bool). Nested
-objects/arrays are serialized as JSON strings.
+objects/arrays are serialized as JSON strings by default; pass
+``flatten_objects=True`` to expand ``args`` into top-level columns, or
+``normalize=True`` to coerce mixed-type columns into a canonical form.
 
 The returned objects implement the Arrow PyCapsule protocol
 (``__arrow_c_array__``) for zero-copy interchange with pyarrow, polars, and
@@ -63,13 +74,44 @@ DuckDB.
        pa_batch = pyarrow.record_batch(batch)
        df = pa_batch.to_pandas()
 
+   # Single C-side stream drain via Arrow C Data Interface
+   stream = reader.iter_arrow_stream(batch_size=10000)
+   rbr = pyarrow.RecordBatchReader.from_stream(stream)
+   for batch in rbr:
+       ...
+
    # Materialize all events as ArrowTable
    table = reader.read_arrow()
    df = table.to_pandas()    # requires pyarrow
    df = table.to_polars()    # requires polars
 
-   # With range parameters
-   table = reader.read_arrow(start_line=100, end_line=200)
+   # With range parameters and object flattening
+   table = reader.read_arrow(start_line=100, end_line=200, flatten_objects=True)
+
+Writing Arrow IPC Files
+-----------------------
+
+``write_arrow()`` writes trace data to Arrow IPC files with optional
+view-based partitioning. For finer control, ``get_view_chunks()`` returns
+the candidate chunks after bloom-filter pruning, and ``write_view_chunk`` /
+``write_view_chunks`` write individual or batched chunks (the batched variant
+runs all chunks concurrently on the Runtime).
+
+.. code-block:: python
+
+   reader = TraceReader("trace.pfw.gz")
+
+   # Partition by predefined views
+   result = reader.write_arrow(
+       "out/",
+       views=["io", "compute"],
+       chunk_size_mb=32,
+       compression="zstd",
+   )
+
+   # Custom view + explicit chunk plan
+   info = reader.get_view_chunks({"name": "posix", "query": 'cat == "POSIX"'})
+   reader.write_view_chunks(info["chunks"], "out/", view="io")
 
 File Metadata
 -------------
@@ -90,14 +132,14 @@ reading the full file (when a ``.dftindex`` RocksDB index store exists):
        for i in range(num_workers):
            start = i * chunk_size
            end = min((i + 1) * chunk_size, max_bytes)
-           process(reader.read_lines_json(start_byte=start, end_byte=end))
+           process(reader.read_json(start_byte=start, end_byte=end))
 
 Query Filtering
 ---------------
 
 All line-based reading methods (``read_lines``, ``iter_lines``,
-``iter_lines_json``, ``read_lines_json``, ``iter_arrow``, ``read_arrow``)
-accept an optional ``query`` parameter for event filtering:
+``iter_json``, ``read_json``, ``iter_arrow``, ``iter_arrow_stream``,
+``read_arrow``) accept an optional ``query`` parameter for event filtering:
 
 .. code-block:: python
 
@@ -186,13 +228,25 @@ All reading methods accept these keyword arguments:
 - ``buffer_size`` -- internal buffer size in bytes (default 4 MB)
 - ``query`` -- query DSL string for event filtering (default None)
 
+Streaming methods (``iter_lines``, ``iter_raw``, ``iter_json``,
+``iter_arrow``, ``iter_arrow_stream``) additionally accept:
+
+- ``memory_budget`` -- soft cap on in-flight bytes queued from the
+  C++ producer (0 = default)
+
 ``iter_raw`` and ``read_raw`` additionally accept:
 
 - ``line_aligned`` -- if True, chunks are aligned to line boundaries (default True)
 - ``multi_line`` -- if True, chunks may contain multiple lines (default True)
 
-``iter_arrow`` additionally accepts:
+``iter_json`` and ``read_json`` additionally accept:
+
+- ``batch_size`` -- events per parse batch (default 1024)
+
+``iter_arrow``, ``iter_arrow_stream``, and ``read_arrow`` additionally accept:
 
 - ``batch_size`` -- maximum rows per Arrow batch (default 10000)
+- ``flatten_objects`` -- expand object fields into top-level columns (default False)
+- ``normalize`` -- coerce mixed-type columns into a canonical form (default False)
 
 Out-of-range values are clamped to the actual file bounds (no errors thrown).
diff --git a/docs/source/api/utilities.rst b/docs/source/api/utilities.rst
index f2d2fbb6..ab6399b2 100644
--- a/docs/source/api/utilities.rst
+++ b/docs/source/api/utilities.rst
@@ -138,14 +138,17 @@ StatisticsQueryUtility
 ~~~~~~~~~~~~~~~~~~~~~~
 
 Query pre-computed statistics from an indexed trace file.
-When bloom/chunk statistics are not available (e.g. file was below
-``index_threshold``), the utility falls back to streaming the file
-sequentially and computing statistics on-the-fly.
+When bloom/chunk statistics are not available, the utility falls back to
+streaming the file sequentially and computing statistics on-the-fly.
 
 .. autoclass:: dftracer.utils.dftracer_utils_ext.StatisticsQueryUtility(runtime: Runtime | None = None)
    :members: process
    :undoc-members:
 
+``process(file_path, query_type="summary", top_n=10, index_dir="")`` returns
+a dict; ``query_type`` accepts ``"summary"``, ``"top_n_names"``, and other
+pre-computed statistics views.
+
 .. code-block:: python
 
    sq = StatisticsQueryUtility()
@@ -160,10 +163,9 @@ StatisticsAggregatorUtility
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Aggregate statistics from a trace file. Uses pre-computed chunk
-statistics from the ``.idx`` sidecar when available. When chunk
-statistics are absent (e.g. file was below ``index_threshold``),
-falls back to streaming the ``.pfw.gz`` line-by-line and computing
-statistics on-the-fly.
+statistics from the ``.dftindex`` store when available. When chunk
+statistics are absent, falls back to streaming the ``.pfw.gz``
+line-by-line and computing statistics on-the-fly.
 
 .. autoclass:: dftracer.utils.dftracer_utils_ext.StatisticsAggregatorUtility(runtime: Runtime | None = None)
    :members: process
@@ -197,10 +199,9 @@ ReorganizationPlannerUtility
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Plan semantic reorganization of trace files. When manifest data is
-available in the ``.idx`` sidecar, produces per-checkpoint extraction
-tasks. When manifest tables are absent (e.g. file was below
-``index_threshold``), falls back to streaming the file line-by-line
-and emitting one whole-file extraction task per query group.
+available in the ``.dftindex`` store, produces per-checkpoint extraction
+tasks. When manifest tables are absent, falls back to streaming the file
+line-by-line and emitting one whole-file extraction task per query group.
 
 .. autoclass:: dftracer.utils.dftracer_utils_ext.ReorganizationPlannerUtility(runtime: Runtime | None = None)
    :members: process
diff --git a/docs/source/call-tree.rst b/docs/source/call-tree.rst
index f27d649f..eeb311f7 100644
--- a/docs/source/call-tree.rst
+++ b/docs/source/call-tree.rst
@@ -136,57 +136,113 @@ CallTree
 Serialization
 -------------
 
+Serialization moved to coroutine-based ``save_binary`` / ``save_arrow``
+free functions in ``dftracer/utils/call_tree/mpi/serializable.h``. The
+legacy ``CallTree::save_to_file`` / ``save_to_json`` / ``load_from_file``
+methods have been removed; the API now exposes
+``CallTree::internal_tree()`` for direct access to the underlying
+``internal::CallTree`` consumed by the save/load coroutines.
+
 **Save to binary format:**
 
+The custom binary format uses a shared string dictionary (names, categories,
+arg keys, and string values share storage) and preserves typed args
+(``int`` / ``uint`` / ``double`` / ``bool``) instead of flattening them to
+strings.
+
 .. code-block:: cpp
 
-   // Save to default path (based on input directory)
-   tree.save_to_file();
+   #include <dftracer/utils/call_tree/call_tree.h>
+   #include <dftracer/utils/call_tree/mpi/serializable.h>
+   #include <dftracer/utils/core/pipeline/pipeline.h>
+   #include <dftracer/utils/core/tasks/task.h>
 
-   // Save to custom path
-   tree.save_to_file("output.calltree");
+   using namespace dftracer::utils;
+   using namespace dftracer::utils::call_tree;
 
-**Save to JSON (Chrome Tracing / Perfetto):**
+   CallTree tree;
+   tree.load_from_directory("/path/to/traces", "*.pfw.gz");
+   tree.generate();
 
-.. code-block:: cpp
+   auto task = make_task(
+       [&tree](CoroScope& scope) -> coro::CoroTask<void> {
+           co_await save_binary(&scope, tree.internal_tree(),
+                                "output.calltree");
+           co_return;
+       },
+       "save_binary");
+
+   Pipeline pipeline(PipelineConfig().with_name("calltree-save"));
+   pipeline.set_source({task});
+   pipeline.execute();
 
-   // Compatible with chrome://tracing and Perfetto UI
-   tree.save_to_json("output.pfw");
+**Save to Arrow IPC (.arrow):**
 
-**Save to text file:**
+Columnar Arrow IPC with buffer-level zstd compression and
+dictionary-encoded ``name`` / ``category`` columns. Readable directly by
+``pyarrow``, ``polars``, ``nanoarrow``, and DuckDB. Requires the build to
+be configured with ``DFTRACER_UTILS_ENABLE_ARROW_IPC=ON``.
 
 .. code-block:: cpp
 
-   tree.print_depth_first_to_file("output.txt", 5);  // Max depth 5
+   auto task = make_task(
+       [&tree](CoroScope& scope) -> coro::CoroTask<void> {
+           co_await save_arrow(&scope, tree.internal_tree(),
+                               "output.arrow");
+           co_return;
+       },
+       "save_arrow");
+
+**Load a previously saved tree:**
 
-**Load from previously saved file:**
+Both loaders are coroutines that return a fresh ``internal::CallTree``:
 
 .. code-block:: cpp
 
-   CallTree loaded_tree;
-   loaded_tree.load_from_file("output.calltree");
+   auto task = make_task([](CoroScope& scope) -> coro::CoroTask<void> {
+       auto loaded = co_await load_binary(&scope, "output.calltree");
+       // or: auto loaded = co_await load_arrow(&scope, "output.arrow");
+       printf("Loaded tree: %zu nodes\n", loaded->num_nodes());
+       co_return;
+   }, "load");
 
-   auto stats = loaded_tree.get_statistics();
-   printf("Loaded tree: %zu nodes, %zu levels\n",
-          stats.total_nodes, stats.num_levels);
+**Save to text file (still available on the high-level API):**
+
+.. code-block:: cpp
+
+   tree.print_depth_first_to_file("output.txt", 5);  // Max depth 5
 
 Output Formats
 --------------
 
-Binary Format
-~~~~~~~~~~~~~
+Binary Format (``.calltree``)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Compact custom format with a global string dictionary and typed args; best
+for round-tripping trees between dftracer-utils runs (for example, between
+a coordinator and downstream MPI ranks). Backed by the
+``CALLTREE_BINARY_VERSION = 2`` header.
+
+Arrow IPC (``.arrow``)
+~~~~~~~~~~~~~~~~~~~~~~
 
-Efficient binary serialization preserving all call tree information including node hierarchy, timing, function names, categories, and arguments.
+Columnar Arrow IPC file with zstd buffer compression and
+dictionary-encoded ``name`` / ``category`` columns. Best for analysis
+pipelines that already speak Arrow (pyarrow, polars, DuckDB, nanoarrow).
 
 JSON Format (Chrome Tracing)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Follows the Chrome Tracing format specification, viewable in ``chrome://tracing`` or `Perfetto UI <https://ui.perfetto.dev/>`_. Shows timeline of function calls with nested relationships and duration.
+The ``dftracer_call_tree`` CLI emits Chrome Tracing JSON (gzipped with
+``--gzip``) suitable for ``chrome://tracing`` and
+`Perfetto UI <https://ui.perfetto.dev/>`_. Programmatic JSON export is no
+longer exposed on the ``CallTree`` C++ API.
 
 Text Format
 ~~~~~~~~~~~
 
-Human-readable text with indentation showing hierarchical structure, function names, categories, and timing at each level.
+Human-readable text with indentation showing hierarchical structure,
+function names, categories, and timing at each level.
 
 Performance Considerations
 --------------------------
diff --git a/docs/source/cli.rst b/docs/source/cli.rst
index 8dd090a5..e4b74464 100644
--- a/docs/source/cli.rst
+++ b/docs/source/cli.rst
@@ -3,6 +3,57 @@ Command-Line Tools
 
 DFTracer Utils provides several command-line utilities for working with DFTracer trace files and compressed archives.
 
+.. _cli-shared-flags:
+
+Shared CLI Flags
+----------------
+
+Most tools wire in a common set of argument schemas defined in
+``src/dftracer/utils/binaries/common_cli.h``. The flags below have identical
+semantics across every binary that exposes the relevant schema and are not
+repeated in each tool's section.
+
+**Pipeline (``PipelineArgs``)**
+
+- ``--executor-threads <count>`` - Number of worker threads for parallel
+  processing (default: number of CPU cores)
+- ``--io-threads <count>`` - Number of I/O threads (default: number of CPU
+  cores)
+- ``--time-profiling`` - Print stage timing breakdown to stderr
+
+**Indexing (``IndexingArgs``)**
+
+- ``--index-dir <path>`` - Directory for ``.dftindex`` stores
+- ``--checkpoint-size <bytes>`` - Checkpoint size for gzip indexing in bytes
+  (default: 33554432 B / 32 MiB)
+- ``-f, --force`` - Force index recreation
+
+**Query (``QueryArgs``)**
+
+- ``--query <query>`` - Query DSL filter
+  (e.g., ``'cat == "POSIX" and dur > 1000'``)
+
+**Watchdog (``WatchdogArgs``)**
+
+- ``--disable-watchdog`` - Disable watchdog for hang detection
+- ``--watchdog-global-timeout <s>`` - Watchdog global timeout for pipeline
+  execution in seconds (0 = no timeout, default: 0)
+- ``--watchdog-task-timeout <s>`` - Watchdog default task timeout in seconds
+  (0 = no timeout, default: 0)
+- ``--watchdog-interval <s>`` - Watchdog check interval in seconds
+  (default: 1)
+- ``--watchdog-warning-threshold <s>`` - Watchdog long-running task warning
+  threshold in seconds (default: 300)
+- ``--watchdog-idle-timeout <s>`` - Watchdog idle timeout in seconds
+  (0 = use default, default: 300)
+- ``--watchdog-deadlock-timeout <s>`` - Watchdog deadlock timeout in seconds
+  (0 = use default, default: 600)
+
+**Inputs (``DirectoryArgs`` / ``FilesArgs``)**
+
+- ``-d, --directory <path>`` - Directory containing trace files
+- ``--files <files...>`` - Trace files (``.pfw``, ``.pfw.gz``)
+
 dftracer_reader
 ---------------
 
@@ -357,6 +408,12 @@ dftracer_index
 - ``--false-positive-rate <rate>`` - Bloom filter false positive rate (default: 0.01)
 - ``--read-batch-size <MB>`` - Batch read size in MB for stream processing (default: 4)
 - ``--manifest`` - Also build manifest tables in .idx (per-checkpoint event line routing)
+- ``--rebuild-summaries`` - Rebuild ``ROOT_*`` aggregated summaries after ingest.
+  Off by default; ``ROOT_*`` CFs are only consumed by summary tools such as
+  ``dftracer_info``. Bloom-filter chunk-skipping queries do not require them.
+
+This binary also accepts the :ref:`shared CLI flags <cli-shared-flags>`
+(Pipeline, Watchdog, Indexing).
 
 **Example:**
 
@@ -771,3 +828,108 @@ dftracer_comparator
             }
         ]
     }
+
+dftracer_aggregator_mpi
+-----------------------
+
+**Description:** MPI driver for the distributed-SST aggregator. Each rank
+produces per-rank aggregation SSTs; rank 0 bulk-ingests and the ranks jointly
+write the final gzip JSON output. Requires the build to be configured with
+``DFTRACER_UTILS_ENABLE_MPI=ON``.
+
+The pipeline is structured as a five-task DAG executed inside the standard
+``Pipeline`` runtime:
+
+``scan -> phase_a -> phase_b -> phase_c -> merge``
+
+- **scan** - Cooperative gzip-member pre-scan, ``Allgatherv`` of the member
+  map, and deterministic Longest-Processing-Time (LPT) assignment of work
+  units to ranks.
+- **phase_a** - Each rank runs the distributed-SST indexer + aggregation
+  visitor on its slice and writes SSTs (and ``tracker.bin``) to its rank
+  staging directory. SSTs are optionally moved to a shared-FS staging root
+  for the coordinator.
+- **phase_b** - Rank 0 gathers the artifact lists (``Gatherv``) and runs a
+  single ``IndexDatabase::bulk_ingest`` plus tracker merge.
+- **phase_c** - Each rank writes a shard-prefixed Perfetto gzip JSON slice
+  using ``PerfettoTraceWriterUtility``.
+- **merge** - Parallel ``pwrite`` on Lustre-striped output or serial
+  concatenation otherwise.
+
+**Usage:**
+
+.. code-block:: bash
+
+    mpirun -n <N> dftracer_aggregator_mpi [OPTIONS]
+
+**Options:**
+
+- ``-d, --directory <path>`` - Input directory containing .pfw or .pfw.gz
+  files (default: ``.``)
+- ``-o, --output <path>`` - Output gzip JSON path. ``.gz`` is appended if
+  missing (default: ``aggregated_output.json.gz``)
+- ``-t, --time-interval <ms>`` - Time interval in milliseconds for bucketing
+  (default: 5000)
+- ``--staging-dir <path>`` - Per-rank SST staging root. Defaults to
+  ``<index_dir>/_staging``; each rank writes to ``<staging_dir>/rank_<R>``.
+- ``--shared-staging <path>`` - Shared-FS staging root. When set and
+  different from ``--staging-dir``, each rank moves its SSTs and
+  ``tracker.bin`` from the (node-local) staging dir to
+  ``<shared-staging>/rank_<R>`` before the coordinator ingest. Required for
+  multi-node runs where ``--staging-dir`` points at node-local NVMe.
+- ``--keep-staging`` - Keep per-rank SST staging dirs after a successful
+  ingest
+
+This binary also accepts the :ref:`shared CLI flags <cli-shared-flags>`
+(Pipeline and Indexing schemas). Per-rank ``--executor-threads`` /
+``--io-threads`` are automatically scaled down by the detected
+processes-per-node count so co-located ranks do not oversubscribe cores.
+
+**Example:**
+
+.. code-block:: bash
+
+    # 16 ranks on one node, node-local staging
+    mpirun -n 16 dftracer_aggregator_mpi -d ./traces -o agg.json.gz
+
+    # Multi-node run with shared staging on Lustre
+    mpirun -n 64 dftracer_aggregator_mpi -d /lustre/traces \
+        --staging-dir /local/nvme/_staging \
+        --shared-staging /lustre/scratch/_staging \
+        -o /lustre/out/agg.json.gz
+
+dftracer_call_tree_mpi
+----------------------
+
+**Description:** MPI driver for parallel call-tree construction. Each rank
+owns a slice of PIDs and emits a Chrome Tracing JSON shard; rank 0 then
+merges the shards. Wraps the ``MPICallTreeBuilder`` engine
+(``discover_pids -> build -> hierarchy -> write -> merge`` coroutine phases).
+Requires the build to be configured with ``DFTRACER_UTILS_ENABLE_MPI=ON``.
+
+**Usage:**
+
+.. code-block:: bash
+
+    mpirun -n <N> dftracer_call_tree_mpi [OPTIONS] <input>
+
+**Options:**
+
+- ``input`` - Input directory containing trace files [required]
+- ``-o, --output <path>`` - Output JSON path (default: ``call_tree.pfw``)
+- ``--staging-dir <path>`` - Shared-FS staging root for per-rank shards
+  (default: ``<output>.shards/``)
+- ``--gzip`` - gzip the merged output (``.gz`` appended if needed)
+- ``-v, --verbose`` - Verbose progress logging
+- ``--keep-staging`` - Keep per-rank shard files after merge
+
+This binary also accepts the :ref:`shared CLI flags <cli-shared-flags>`
+(Pipeline); per-rank thread counts are scaled down by the detected
+processes-per-node count.
+
+**Example:**
+
+.. code-block:: bash
+
+    # 32 ranks across nodes; gzip merged output
+    mpirun -n 32 dftracer_call_tree_mpi ./traces -o call_tree.pfw --gzip
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e46e120a..687f4718 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -62,77 +62,116 @@ def _install_rtd_extension_stub() -> None:
         return
 
     ext = types.ModuleType(ext_name)
-    JSONPrimitive = str | int | float | bool | None
 
     class _BaseNative:
         """RTD stub for native extension classes."""
         pass
 
-    class TaskHandle(_BaseNative):
-        """Handle to a submitted task.
+    class _ArrowBatchCapsule(_BaseNative):
+        """Internal Arrow batch wrapper implementing __arrow_c_array__ protocol."""
 
-        Returned by asynchronous utility calls and by
-        :class:`dftracer.utils.Runtime`. The handle can be waited on,
-        queried for completion, or used to fetch the task result.
-        """
+        @property
+        def num_rows(self) -> int:
+            return 0
 
-        name = ""
-        task_id = 0
+        @property
+        def num_columns(self) -> int:
+            return 0
 
-        def get(self) -> object | None:
-            """Block until the task completes and return its result."""
-            return None
+        def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
+            return (None, None)
 
-        def wait(self) -> None:
-            """Block until the task completes."""
+    class _ArrowBatchStream(_BaseNative):
+        """Zero-iteration Arrow stream backed by the C++ coroutine channel.
+
+        Implements the Arrow C Data Interface stream protocol. Pass directly
+        to ``pyarrow.RecordBatchReader.from_stream()`` or ``pyarrow.table()``.
+        Single-use: consuming ``__arrow_c_stream__`` once exhausts the object.
+        """
+
+        def __arrow_c_stream__(self, requested_schema: object = None) -> object:
             return None
 
-        def done(self) -> bool:
-            """Return ``True`` when the task has completed."""
-            return True
+    class JsonDictValue(_BaseNative):
+        """Zero-copy wrapper over a parsed DFTracer JSON event.
+
+        Supports dict-like access: ``event['name']``, ``event['args']['ret']``.
+        Call ``.to_dict()`` to materialize a regular Python dict.
+        """
+
+        def __getitem__(self, key: str) -> object:
+            raise KeyError(key)
+
+        def __len__(self) -> int:
+            return 0
+
+        def __contains__(self, key: str) -> bool:
+            return False
+
+        def keys(self) -> list[str]:
+            return []
+
+        def values(self) -> list[object]:
+            return []
+
+        def items(self) -> list[tuple[str, object]]:
+            return []
+
+        def get(self, key: str, default: object = None) -> object:
+            return default
+
+        def to_dict(self) -> dict[str, object]:
+            return {}
+
+    class IndexerCheckpoint(_BaseNative):
+        """Information about a checkpoint in the index."""
+
+        checkpoint_idx = 0
+        uc_offset = 0
+        uc_size = 0
+        c_offset = 0
+        c_size = 0
+        bits = 0
+        num_lines = 0
 
     class Runtime(_BaseNative):
-        """Lightweight task runtime for native and Python work.
+        """Lightweight coroutine runtime wrapping Executor + Watchdog.
 
-        The runtime owns the executor threads used by coroutine-backed
-        readers, indexers, and utilities. The higher-level Python
-        wrapper in :mod:`dftracer.utils.runtime` builds on this native
-        object to support Python callables and richer task tracking.
+        Note: For the user-facing API, use ``dftracer.utils.Runtime`` (the Python
+        wrapper), which adds submit(), Python callable support, and error handling.
         """
 
-        threads = 0
-
-        def __init__(self, threads: int = 0) -> None:
-            """Create a runtime with an optional worker-thread count."""
-            super().__init__(threads)
-            self.threads = threads
+        def __init__(self, threads: int = 0, io_threads: int = 0) -> None:
+            self._threads = threads
+            self._io_threads = io_threads
 
         def shutdown(self) -> None:
-            """Stop the runtime and release worker resources."""
             return None
 
         def wait_all(self) -> None:
-            """Block until all submitted native work completes."""
             return None
 
         def get_progress(self) -> dict[str, object]:
-            """Return runtime progress metadata."""
             return {}
 
         def is_responsive(self) -> bool:
-            """Return whether the watchdog still considers the runtime healthy."""
             return True
 
         def set_timeout(self, global_ms: int = 0) -> None:
-            """Set a global watchdog timeout in milliseconds."""
             return None
 
         def set_default_task_timeout(self, ms: int = 0) -> None:
-            """Set the default per-task timeout in milliseconds."""
             return None
 
+        @property
+        def threads(self) -> int:
+            return self._threads
+
+        @property
+        def io_threads(self) -> int:
+            return self._io_threads
+
         def __enter__(self) -> "Runtime":
-            """Enter the runtime context manager."""
             return self
 
         def __exit__(
@@ -141,31 +180,192 @@ def __exit__(
             exc_val: BaseException | None,
             exc_tb: TracebackType | None,
         ) -> None:
-            """Exit the runtime context manager."""
             return None
 
-    class IndexerCheckpoint(_BaseNative):
-        """Information about a single checkpoint in a ``.dftindex`` store.
+    class Indexer(_BaseNative):
+        """Indexer with resolve/build pattern for tiered indexing."""
 
-        Checkpoints map compressed and uncompressed offsets and carry
-        per-chunk metadata used for seeking and chunk-level pruning.
-        """
+        def __init__(
+            self,
+            directory: str = "",
+            files: list[str] | None = None,
+            index_dir: str = "",
+            require_checkpoint: bool = True,
+            require_bloom: bool = True,
+            require_manifest: bool = True,
+            require_aggregation: bool = False,
+            time_interval_ms: float = 5000.0,
+            group_keys: list[str] | None = None,
+            custom_metric_fields: list[str] | None = None,
+            compute_percentiles: bool = False,
+            checkpoint_size: int = 32 * 1024 * 1024,
+            parallelism: int = 0,
+            force_rebuild: bool = False,
+            runtime: Runtime | None = None,
+        ) -> None:
+            """Create an indexer for trace files.
+
+            At least one of 'directory' or 'files' must be provided.
+
+            Args:
+                directory: Path to the directory containing trace files.
+                files: List of specific file paths to index.
+                index_dir: Directory for ``.dftindex`` stores. If empty, uses
+                    directory-local paths.
+                require_checkpoint: If True, build checkpoint index (tier 1).
+                require_bloom: If True, build bloom filter data (tier 2).
+                require_manifest: If True, build manifest data (tier 2).
+                require_aggregation: If True, build aggregation data (tier 3).
+                time_interval_ms: Time interval for aggregation in milliseconds.
+                group_keys: Keys to group by for aggregation.
+                custom_metric_fields: Custom metric fields for aggregation.
+                compute_percentiles: If True, compute percentiles during aggregation.
+                parallelism: Number of parallel indexers. 0 = auto.
+                force_rebuild: If True, rebuild indices even if they exist.
+                runtime: Runtime instance for thread pool control.
+            """
+            return None
 
-        checkpoint_idx = 0
-        uc_offset = 0
-        uc_size = 0
-        c_offset = 0
-        c_size = 0
-        bits = 0
-        num_lines = 0
+        def resolve(self) -> dict[str, object]:
+            """Resolve which files need indexing.
 
-    class Indexer(_BaseNative):
-        """Build and query a root-local ``.dftindex`` RocksDB store.
+            Returns:
+                Dictionary with 'ready' and 'needs_work' file lists.
+            """
+            return {}
 
-        The indexer extracts checkpoints and optional bloom/manifest
-        data for a compressed DFTracer trace. Readers and higher-level
-        utilities use this store for chunk pruning and random access.
-        """
+        def build(self) -> dict[str, object]:
+            """Build indices for files that need work.
+
+            Returns:
+                Dictionary with build status and statistics.
+            """
+            return {}
+
+        def ensure_indexed(self) -> dict[str, object]:
+            """Ensure all files are indexed by calling resolve then build if needed.
+
+            Returns:
+                Dictionary with 'ready' and 'needs_work' file lists after indexing.
+            """
+            return {}
+
+        def get_checkpoint_indexer(self, file_path: str) -> "CheckpointIndexer":
+            """Get a checkpoint indexer for a specific file.
+
+            Args:
+                file_path: Path to the trace file (.pfw/.pfw.gz).
+
+            Returns:
+                CheckpointIndexer instance for checkpoint-level operations.
+            """
+            return CheckpointIndexer(file_path)
+
+        def get_hash_table(self, hash_type: str) -> dict[str, str]:
+            """Get hash table mapping hash values to original strings.
+
+            Args:
+                hash_type: Type of hash table ('file', 'host', or 'string').
+
+            Returns:
+                Dict mapping hash strings to original values.
+
+            Raises:
+                ValueError: If hash_type is not valid.
+            """
+            return {}
+
+        def query_file_pids(self, file_id: int) -> set:
+            """Query PIDs observed in a specific file.
+
+            Args:
+                file_id: File identifier (0-based index).
+
+            Returns:
+                Set of PIDs (int) observed in the file.
+            """
+            return set()
+
+        def query_all_file_pids(self) -> dict[int, set]:
+            """Query all file-to-PIDs mappings.
+
+            Returns:
+                Dict mapping file_id to set of PIDs observed in that file.
+            """
+            return {}
+
+        def query_file_info(self) -> tuple[dict[int, str], dict[int, set]]:
+            """Query file ID to path mapping and per-file PIDs in one call.
+
+            Returns:
+                Tuple of (file_id_to_path, file_pids).
+            """
+            return ({}, {})
+
+        def iter_aggregation(
+            self,
+            type: str = "events",
+            batch_size: int = 10000,
+        ) -> Iterator[object]:
+            """Iterate over aggregation data as Arrow batches.
+
+            Args:
+                type: 'events', 'profiles', or 'system'
+                batch_size: Number of entries per batch (default 10000)
+
+            Returns:
+                Iterator over Arrow batch capsules.
+            """
+            return iter(())
+
+        def iter_arrow_dfanalyzer(
+            self,
+            type: str = "events",
+            batch_size: int = 10000,
+            time_granularity: float = 1.0,
+            time_resolution: float = 1e6,
+            query: str | None = None,
+        ) -> Iterator[object]:
+            """Iterate over aggregation data as dfanalyzer-compatible Arrow batches.
+
+            Args:
+                type: 'events', 'profiles', or 'system'
+                batch_size: Number of entries per batch (default 10000)
+                time_granularity: Bucket width in seconds (default 1.0)
+                time_resolution: Microseconds per output time unit (default 1e6)
+                query: Optional query filter (e.g., "pid == 1234 or pid == 5678")
+
+            Returns:
+                Iterator over Arrow batch capsules with dfanalyzer schema.
+            """
+            return iter(())
+
+        def iter_arrow_dfanalyzer_all(
+            self,
+            batch_size: int = 10000,
+            time_granularity: float = 1.0,
+            time_resolution: float = 1e6,
+            query: str | None = None,
+            group_by: list[str] | None = None,
+        ) -> dict[str, list[object]]:
+            """Iterate over all aggregation types in a single scan.
+
+            Args:
+                batch_size: Number of entries per batch (default 10000)
+                time_granularity: Bucket width in seconds (default 1.0)
+                time_resolution: Microseconds per output time unit (default 1e6)
+                query: Optional query filter (e.g., "pid == 1234 or pid == 5678")
+                group_by: Optional list of columns to group by for coarse in-scan
+                    aggregation. When provided, output schema is reduced to the
+                    requested group columns plus aggregated metrics.
+
+            Returns:
+                Dict with 'events', 'profiles', 'system' keys containing Arrow batches.
+            """
+            return {"events": [], "profiles": [], "system": []}
+
+    class CheckpointIndexer(_BaseNative):
+        """Checkpoint indexer for single-file checkpoint-level operations."""
 
         def __init__(
             self,
@@ -175,50 +375,90 @@ def __init__(
             force_rebuild: bool = False,
             build_bloom: bool = False,
             build_manifest: bool = False,
-            index_threshold: int = 8388608,
             runtime: Runtime | None = None,
         ) -> None:
-            """Create an indexer for a compressed DFTracer trace."""
-            self.gz_path = gz_path
-            self.index_path = index_path or ""
-            self.checkpoint_size = checkpoint_size
-            self.has_bloom = build_bloom
-            self.has_manifest = build_manifest
+            """Create a checkpoint indexer for a gzip file.
+
+            Args:
+                gz_path: Path to the gzip trace file.
+                index_path: Path to the ``.dftindex`` store. If None, uses the
+                    root-local ``.dftindex`` next to ``gz_path``.
+                checkpoint_size: Checkpoint size in bytes for index building.
+                force_rebuild: If True, rebuild the index even if it exists.
+                build_bloom: If True, build bloom filter data in the index.
+                build_manifest: If True, build manifest data in the index.
+                runtime: Runtime instance for thread pool control.
+                    If None, uses the default global Runtime.
+            """
+            self._gz_path = gz_path
+            self._index_path = index_path or ""
+            self._checkpoint_size = checkpoint_size
+            self._has_bloom = build_bloom
+            self._has_manifest = build_manifest
 
         def build(self) -> None:
-            """Build the index store for the configured trace file."""
+            """Build the index."""
             return None
 
         def need_rebuild(self) -> bool:
-            """Return whether the index is missing or stale."""
+            """Check if index needs rebuilding."""
             return False
 
         def exists(self) -> bool:
-            """Return whether the index store already exists."""
+            """Check if the `.dftindex` store exists."""
             return False
 
         def get_max_bytes(self) -> int:
-            """Return the maximum decompressed byte position in the trace."""
+            """Get maximum byte position."""
             return 0
 
         def get_num_lines(self) -> int:
-            """Return the number of lines recorded in the index."""
+            """Get number of lines."""
             return 0
 
-        def get_checkpoints(self) -> list["IndexerCheckpoint"]:
-            """Return all checkpoints stored for the trace."""
+        def get_checkpoints(self) -> list[IndexerCheckpoint]:
+            """Get all checkpoints."""
             return []
 
-        def find_checkpoint(self, target_offset: int) -> "IndexerCheckpoint | None":
-            """Return the checkpoint closest to a decompressed offset."""
+        def find_checkpoint(self, target_offset: int) -> IndexerCheckpoint | None:
+            """Find checkpoint for target offset."""
             return None
 
         def close(self) -> None:
-            """Release this Python wrapper's native indexer handle."""
+            """Release this Python wrapper's native indexer handle.
+
+            This does not force-close the shared RocksDB instance for the same
+            ``.dftindex`` path.
+            """
             return None
 
-        def __enter__(self) -> "Indexer":
-            """Enter the indexer context manager."""
+        @property
+        def gz_path(self) -> str:
+            """Get gzip path."""
+            return self._gz_path
+
+        @property
+        def index_path(self) -> str:
+            """Get the `.dftindex` path."""
+            return self._index_path
+
+        @property
+        def checkpoint_size(self) -> int:
+            """Get checkpoint size."""
+            return self._checkpoint_size
+
+        @property
+        def has_bloom(self) -> bool:
+            """Whether bloom filter data exists in the `.dftindex` store."""
+            return self._has_bloom
+
+        @property
+        def has_manifest(self) -> bool:
+            """Whether manifest data exists in the `.dftindex` store."""
+            return self._has_manifest
+
+        def __enter__(self) -> "CheckpointIndexer":
+            """Enter the runtime context for the with statement."""
             return self
 
         def __exit__(
@@ -227,97 +467,70 @@ def __exit__(
             exc_val: BaseException | None,
             exc_tb: TracebackType | None,
         ) -> None:
-            """Exit the indexer context manager."""
-            return None
-
-    class JSON(_BaseNative):
-        """Lazy JSON wrapper backed by yyjson.
+            """Release this Python wrapper on context exit.
 
-        Nested objects are exposed as additional :class:`JSON` wrappers
-        so callers can inspect large trace records without eagerly
-        converting the entire payload to Python dictionaries.
-        """
-
-        def __init__(self, json_str: str) -> None:
-            """Create a lazy JSON wrapper from a JSON string."""
-            self._json_str = json_str
-
-        def get(
-            self,
-            key: str,
-            default: "JSON | JSONPrimitive" = None,
-        ) -> "JSON | JSONPrimitive":
-            """Look up a key and return ``default`` when it is absent."""
-            return default
-
-        def keys(self) -> list[str]:
-            """Return the keys in the current JSON object."""
-            return []
-
-        def values(self) -> list["JSON | JSONPrimitive"]:
-            """Return the values in the current JSON object."""
-            return []
+            This does not force-close the shared RocksDB instance for the same
+            ``.dftindex`` path.
+            """
+            return None
 
-        def items(self) -> list[tuple[str, "JSON | JSONPrimitive"]]:
-            """Return key-value pairs in the current JSON object."""
-            return []
+    class TaskHandle(_BaseNative):
+        """Handle to a submitted C++ coroutine task."""
 
-        def unwrap(self) -> dict[str, object] | list[object] | JSONPrimitive:
-            """Convert the lazy wrapper into native Python data."""
-            return {}
+        def get(self) -> object:
+            """Block until task completes and return result. Raises on error."""
+            return None
 
-        def copy(self) -> "JSON":
-            """Return a shallow copy of the current lazy JSON wrapper."""
-            return self
+        def wait(self) -> None:
+            """Block until task completes. Raises on error."""
+            return None
 
-        def __contains__(self, key: str) -> bool:
-            """Return ``True`` when a key exists in the object."""
-            return False
+        def done(self) -> bool:
+            """Return True if task has completed."""
+            return True
 
-        def __getitem__(self, key: str) -> "JSON | JSONPrimitive":
-            """Return a field value or nested :class:`JSON` wrapper."""
-            raise KeyError(key)
+        @property
+        def name(self) -> str:
+            """Task name."""
+            return ""
 
-        def __len__(self) -> int:
-            """Return the number of items in the current JSON object."""
+        @property
+        def task_id(self) -> int:
+            """Task identifier."""
             return 0
 
-        def __bool__(self) -> bool:
-            """Return whether the current JSON value is non-empty."""
-            return False
-
-        def __str__(self) -> str:
-            """Return a JSON-like string representation."""
-            return "{}"
-
-        def __repr__(self) -> str:
-            """Return a developer-facing representation."""
-            return "JSON('{}')"
-
     class TraceReader(_BaseNative):
-        """Read DFTracer traces with optional index-assisted pruning.
-
-        ``TraceReader`` chooses between sequential and indexed access
-        based on the file format and the presence of a shared
-        root-local ``.dftindex`` store. It exposes line, raw-byte,
-        JSON, and Arrow-based views over the same trace data.
-        """
+        """Smart trace file reader that auto-selects sequential vs indexed reading."""
 
         def __init__(
             self,
-            file_path: str,
+            path: str,
             index_dir: str = "",
             checkpoint_size: int = 33554432,
             auto_build_index: bool = False,
-            index_threshold: int = 8388608,
-            runtime: Runtime | None = None,
+            runtime: Runtime | object | None = None,
         ) -> None:
-            """Create a trace reader for plain or compressed DFTracer files."""
-            self.file_path = file_path
-            self.index_dir = index_dir
-            self.checkpoint_size = checkpoint_size
-            self.auto_build_index = auto_build_index
-            self.index_threshold = index_threshold
+            """Create a TraceReader.
+
+            Args:
+                path: Path to a trace file (.pfw/.pfw.gz) or a directory.
+                    When a directory is given, all iter/read methods discover
+                    .pfw and .pfw.gz files recursively and process them in
+                    parallel on the Runtime thread pool.
+                index_dir: Directory to search for ``.dftindex`` stores.
+                    Empty string (default) searches next to the trace file.
+                checkpoint_size: Checkpoint interval in bytes for index
+                    building (default 32 MB).
+                auto_build_index: If True, automatically build an index
+                    when none exists.
+                runtime: Runtime instance for thread pool control.
+                    If None, uses the default global Runtime.
+
+            Raises:
+                RuntimeError: If *path* does not exist or cannot be opened.
+            """
+            self._path = path
+            self._index_dir = index_dir
 
         def read_lines(
             self,
@@ -327,10 +540,12 @@ def read_lines(
             end_byte: int = 0,
             buffer_size: int = 4194304,
             query: str | None = None,
-        ) -> list[str]:
-            """Materialize decoded lines into a Python list.
+        ) -> list[memoryview]:
+            """Read lines from the trace file and return as a list.
 
-            Supports optional line/byte ranges and query-based filtering.
+            Lines are 1-indexed. Pass ``start_line=0, end_line=0`` (the
+            defaults) to read all lines. Out-of-range values are clamped
+            to the actual file bounds.
             """
             return []
 
@@ -342,67 +557,84 @@ def iter_lines(
             end_byte: int = 0,
             buffer_size: int = 4194304,
             query: str | None = None,
-        ) -> Iterator[str]:
-            """Stream decoded lines from the trace.
+            memory_budget: int = 0,
+        ) -> Iterator[memoryview]:
+            """Return a streaming iterator over decoded lines.
 
-            The returned iterator yields one UTF-8 decoded line at a time.
+            The C++ coroutine runs on the Runtime thread pool and pushes
+            lines into a bounded queue; Python ``__next__`` pops from it.
             """
             return iter(())
 
-        def iter_raw(
+        def iter_json(
             self,
             start_line: int = 0,
             end_line: int = 0,
             start_byte: int = 0,
             end_byte: int = 0,
-            line_aligned: bool = True,
-            multi_line: bool = True,
             buffer_size: int = 4194304,
             query: str | None = None,
-        ) -> Iterator[bytes]:
-            """Stream raw byte chunks from the trace.
+            batch_size: int = 1024,
+            memory_budget: int = 0,
+        ) -> Iterator["JsonDictValue"]:
+            """Return a streaming iterator over parsed JSON events.
 
-            Byte-range reads can be aligned to line boundaries and can
-            optionally return multi-line chunks.
+            Each event is parsed once in C++ and yielded as a zero-copy
+            :class:`JsonDictValue` wrapper. No double-parsing overhead.
             """
             return iter(())
 
-        def read_raw(
+        def read_json(
             self,
             start_line: int = 0,
             end_line: int = 0,
             start_byte: int = 0,
             end_byte: int = 0,
-            line_aligned: bool = True,
-            multi_line: bool = True,
             buffer_size: int = 4194304,
             query: str | None = None,
-        ) -> list[bytes]:
-            """Materialize raw byte chunks into a Python list."""
+            batch_size: int = 1024,
+        ) -> list["JsonDictValue"]:
+            """Read all events as parsed :class:`JsonDictValue` wrappers (list).
+
+            Equivalent to ``list(iter_json(...))``.
+            """
             return []
 
-        def iter_lines_json(
+        def iter_raw(
             self,
             start_line: int = 0,
             end_line: int = 0,
             start_byte: int = 0,
             end_byte: int = 0,
+            line_aligned: bool = True,
+            multi_line: bool = True,
             buffer_size: int = 4194304,
             query: str | None = None,
-        ) -> Iterator["JSON"]:
-            """Stream lazy :class:`JSON` objects for trace events."""
+            memory_budget: int = 0,
+        ) -> Iterator[memoryview]:
+            """Return a streaming iterator over raw byte chunks.
+
+            When ``query`` is set and an index exists, chunk-level pruning
+            skips non-matching chunks. No per-event filtering is applied.
+            """
             return iter(())
 
-        def read_lines_json(
+        def read_raw(
             self,
             start_line: int = 0,
             end_line: int = 0,
             start_byte: int = 0,
             end_byte: int = 0,
+            line_aligned: bool = True,
+            multi_line: bool = True,
             buffer_size: int = 4194304,
             query: str | None = None,
-        ) -> list["JSON"]:
-            """Materialize trace events as lazy :class:`JSON` objects."""
+        ) -> list[memoryview]:
+            """Read raw byte chunks and return as a list.
+
+            When ``query`` is set and an index exists, chunk-level pruning
+            skips non-matching chunks. No per-event filtering is applied.
+            """
             return []
 
         def iter_arrow(
@@ -414,10 +646,45 @@ def iter_arrow(
             end_byte: int = 0,
             buffer_size: int = 4194304,
             query: str | None = None,
-        ) -> Iterator["ArrowBatch"]:
-            """Stream Arrow batches parsed from trace events."""
+            flatten_objects: bool = False,
+            normalize: bool = False,
+            memory_budget: int = 0,
+        ) -> Iterator["_ArrowBatchCapsule"]:
+            """Return iterator over Arrow record batches.
+
+            Each batch is an ``_ArrowBatchCapsule`` implementing the Arrow
+            PyCapsule protocol (``__arrow_c_array__``).  Wrap with
+            :class:`~dftracer.utils.arrow.ArrowBatch` for convenience
+            methods, or pass directly to ``pyarrow.record_batch()``.
+            """
             return iter(())
 
+        def iter_arrow_stream(
+            self,
+            batch_size: int = 10000,
+            start_line: int = 0,
+            end_line: int = 0,
+            start_byte: int = 0,
+            end_byte: int = 0,
+            buffer_size: int = 4194304,
+            query: str | None = None,
+            flatten_objects: bool = False,
+            normalize: bool = False,
+            memory_budget: int = 0,
+        ) -> "_ArrowBatchStream":
+            """Return an Arrow C Data Interface stream over record batches.
+
+            PyArrow can drain the producer channel in a single C-side call::
+
+                rbr = pa.RecordBatchReader.from_stream(reader.iter_arrow_stream())
+                for batch in rbr:
+                    ...
+
+            Equivalent data to :meth:`iter_arrow`, but without per-batch
+            Python <-> C transitions.
+            """
+            return _ArrowBatchStream()
+
         def read_arrow(
             self,
             batch_size: int = 10000,
@@ -427,20 +694,144 @@ def read_arrow(
             end_byte: int = 0,
             buffer_size: int = 4194304,
             query: str | None = None,
-        ) -> "ArrowTable | None":
-            """Materialize Arrow batches as a single table-like result."""
+            flatten_objects: bool = False,
+            normalize: bool = False,
+        ) -> object:
+            """Read all events as an ArrowTable.
+
+            Equivalent to collecting all batches from :meth:`iter_arrow`
+            into an :class:`~dftracer.utils.arrow.ArrowTable`.
+            """
             return None
 
         def get_max_bytes(self) -> int:
-            """Return indexed decompressed size when available."""
+            """Get the maximum byte position in the decompressed trace.
+
+            Returns the decompressed size for indexed files, file size for
+            plain text files, or 0 for compressed files without an index.
+            """
             return 0
 
         def get_num_lines(self) -> int:
-            """Return indexed line count when available."""
+            """Get the total number of lines in the trace.
+
+            Returns the line count for indexed files, or 0 for files
+            without an index (use :attr:`num_lines` property for fallback
+            counting).
+            """
             return 0
 
+        @property
+        def path(self) -> str:
+            """Path to the trace file or directory."""
+            return self._path
+
+        @property
+        def index_dir(self) -> str:
+            """Directory searched for ``.dftindex`` stores."""
+            return self._index_dir
+
+        @property
+        def has_index(self) -> bool:
+            """True if a checkpoint index was found at construction time."""
+            return False
+
+        @property
+        def num_lines(self) -> int:
+            """Total line count (reads all lines to compute if needed)."""
+            return 0
+
+        def write_arrow(
+            self,
+            path: str,
+            views: list[str | dict[str, object]] | None = None,
+            chunk_size_mb: int = 32,
+            compression: str = "zstd",
+            batch_size: int = 10000,
+        ) -> dict[str, object]:
+            """Write trace data to Arrow IPC files with optional view-based partitioning.
+
+            Args:
+                path: Output directory for Arrow IPC files.
+                views: List of view definitions. Each can be:
+                    - A string: predefined view name ('io', 'compute', 'dlio')
+                    - A dict with 'name' and optional 'query', 'include_metadata'
+                    If None, writes all events to 'all' partition.
+                chunk_size_mb: Maximum uncompressed size per file in MB.
+                compression: 'zstd' or 'none'.
+                batch_size: Events per Arrow batch.
+
+            Returns:
+                Dict with partitions, total_rows, total_bytes, chunks_scanned, chunks_skipped.
+            """
+            return {}
+
+        def get_view_chunks(
+            self,
+            view: str | dict[str, object] | None = None,
+        ) -> dict[str, object]:
+            """Get candidate chunks for a view after bloom filter pruning.
+
+            Args:
+                view: View definition (string or dict with 'name' and optional 'query').
+
+            Returns:
+                Dict with chunks list, total_checkpoints, skipped_checkpoints, file_may_match.
+            """
+            return {}
+
+        def write_view_chunk(
+            self,
+            output_file: str,
+            checkpoint_idx: int,
+            start_byte: int,
+            end_byte: int,
+            view: str | dict[str, object] | None = None,
+            compression: str = "zstd",
+            batch_size: int = 10000,
+        ) -> dict[str, object]:
+            """Write a single chunk to an Arrow IPC file.
+
+            Args:
+                output_file: Path to output Arrow IPC file.
+                checkpoint_idx: Checkpoint index.
+                start_byte: Start byte offset.
+                end_byte: End byte offset.
+                view: View definition.
+                compression: 'zstd' or 'none'.
+                batch_size: Events per batch.
+
+            Returns:
+                Dict with output_file, events_matched, rows_written, bytes_written.
+            """
+            return {}
+
+        def write_view_chunks(
+            self,
+            chunks: list[dict[str, object]],
+            output_dir: str,
+            view: str | dict[str, object] | None = None,
+            compression: str = "zstd",
+            batch_size: int = 10000,
+        ) -> dict[str, object]:
+            """Write multiple chunks to Arrow IPC files in parallel.
+
+            All chunks are processed concurrently on the Runtime thread pool.
+
+            Args:
+                chunks: List of dicts with checkpoint_idx, start_byte, end_byte.
+                output_dir: Directory for output Arrow IPC files.
+                view: View definition.
+                compression: 'zstd' or 'none'.
+                batch_size: Events per batch.
+
+            Returns:
+                Dict with results list, total_rows, total_events_matched.
+            """
+            return {}
+
         def __enter__(self) -> "TraceReader":
-            """Enter the trace-reader context manager."""
+            """Enter the runtime context for the with statement."""
             return self
 
         def __exit__(
@@ -449,49 +840,165 @@ def __exit__(
             exc_val: BaseException | None,
             exc_tb: TracebackType | None,
         ) -> None:
-            """Exit the trace-reader context manager."""
+            """Exit the runtime context for the with statement."""
             return None
 
-    class AggregatorUtility(_BaseNative):
-        """Aggregate trace events into Arrow-ready time buckets."""
+    class StatisticsQueryUtility(_BaseNative):
+        def __init__(self, runtime: Runtime | None = None) -> None:
+            self.runtime = runtime
+
+        def process(
+            self,
+            file_path: str,
+            query_type: str = "summary",
+            top_n: int = 10,
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+        def __call__(
+            self,
+            file_path: str,
+            query_type: str = "summary",
+            top_n: int = 10,
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+    class StatisticsAggregatorUtility(_BaseNative):
+        def __init__(self, runtime: Runtime | None = None) -> None:
+            self.runtime = runtime
+
+        def process(
+            self,
+            file_path: str,
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+        def __call__(
+            self,
+            file_path: str,
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+    class MetadataCollectorUtility(_BaseNative):
+        def __init__(self, runtime: Runtime | None = None) -> None:
+            self.runtime = runtime
+
+        def process(
+            self,
+            file_path: str,
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+        def __call__(
+            self,
+            file_path: str,
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+    class ReorganizationPlannerUtility(_BaseNative):
+        def __init__(self, runtime: Runtime | None = None) -> None:
+            self.runtime = runtime
+
+        def process(
+            self,
+            source_files: list[str],
+            groups: list[dict[str, str]] | None = None,
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+        def __call__(
+            self,
+            source_files: list[str],
+            groups: list[dict[str, str]] | None = None,
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
 
+    class ReconstructionPlannerUtility(_BaseNative):
         def __init__(self, runtime: Runtime | None = None) -> None:
-            """Create an aggregation utility bound to an optional runtime."""
             self.runtime = runtime
 
         def process(
             self,
-            source_dir: str,
-            output_path: str = "",
+            reorganized_files: list[str],
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+        def __call__(
+            self,
+            reorganized_files: list[str],
+            index_dir: str = "",
+        ) -> dict[str, object]:
+            return {}
+
+    class AggregatorUtility(_BaseNative):
+        def __init__(self, runtime: Runtime | None = None) -> None:
+            self.runtime = runtime
+
+        def process(
+            self,
+            directory: str,
             time_interval_ms: float = 5000.0,
-            query: str = "",
+            group_keys: list[str] | None = None,
+            categories: list[str] | None = None,
+            names: list[str] | None = None,
+            index_dir: str = "",
+            checkpoint_size: int = 33554432,
+            force_rebuild: bool = False,
+            chunk_size_mb: int = 64,
+            batch_size_mb: int = 4,
+            event_batch_size: int = 10000,
+            custom_metric_fields: list[str] | None = None,
+            compute_percentiles: bool = False,
+        ) -> object:
+            return None
+
+        def __call__(
+            self,
+            directory: str,
+            time_interval_ms: float = 5000.0,
+            group_keys: list[str] | None = None,
+            categories: list[str] | None = None,
+            names: list[str] | None = None,
             index_dir: str = "",
+            checkpoint_size: int = 33554432,
             force_rebuild: bool = False,
+            chunk_size_mb: int = 64,
+            batch_size_mb: int = 4,
+            event_batch_size: int = 10000,
             custom_metric_fields: list[str] | None = None,
             compute_percentiles: bool = False,
-        ) -> "ArrowTable | None":
-            """Aggregate trace events into a materialized Arrow-style result."""
+        ) -> object:
             return None
 
         def iter_arrow(
             self,
-            source_dir: str,
-            output_path: str = "",
+            directory: str,
             time_interval_ms: float = 5000.0,
-            query: str = "",
+            group_keys: list[str] | None = None,
+            categories: list[str] | None = None,
+            names: list[str] | None = None,
             index_dir: str = "",
+            checkpoint_size: int = 33554432,
             force_rebuild: bool = False,
+            chunk_size_mb: int = 64,
+            batch_size_mb: int = 4,
+            event_batch_size: int = 10000,
             custom_metric_fields: list[str] | None = None,
             compute_percentiles: bool = False,
-        ) -> Iterator["ArrowBatch"]:
-            """Stream Arrow batches for aggregated trace metrics."""
+        ) -> Iterator[object]:
             return iter(())
 
     class ComparatorUtility(_BaseNative):
-        """Compare baseline and variant traces."""
-
         def __init__(self, runtime: Runtime | None = None) -> None:
-            """Create a comparator utility bound to an optional runtime."""
             self.runtime = runtime
 
         def compare(
@@ -499,12 +1006,31 @@ def compare(
             baseline: str,
             variant: str,
             query: str = "",
+            group_by: str = "",
+            format: str = "table",
             time_interval_ms: float = 5000.0,
             threshold: float = 0.0,
+            executor_threads: int = 0,
             index_dir: str = "",
             force_rebuild: bool = False,
-        ) -> "ArrowTable | None":
-            """Return comparison results as Arrow-compatible output."""
+            config: str = "",
+        ) -> object:
+            return None
+
+        def __call__(
+            self,
+            baseline: str,
+            variant: str,
+            query: str = "",
+            group_by: str = "",
+            format: str = "table",
+            time_interval_ms: float = 5000.0,
+            threshold: float = 0.0,
+            executor_threads: int = 0,
+            index_dir: str = "",
+            force_rebuild: bool = False,
+            config: str = "",
+        ) -> object:
             return None
 
         def compare_json(
@@ -512,12 +1038,15 @@ def compare_json(
             baseline: str,
             variant: str,
             query: str = "",
+            group_by: str = "",
+            format: str = "table",
             time_interval_ms: float = 5000.0,
             threshold: float = 0.0,
+            executor_threads: int = 0,
             index_dir: str = "",
             force_rebuild: bool = False,
+            config: str = "",
         ) -> str:
-            """Return comparison results as JSON."""
             return "{}"
 
         def compare_table(
@@ -525,115 +1054,96 @@ def compare_table(
             baseline: str,
             variant: str,
             query: str = "",
+            group_by: str = "",
+            format: str = "table",
             time_interval_ms: float = 5000.0,
             threshold: float = 0.0,
+            executor_threads: int = 0,
             index_dir: str = "",
             force_rebuild: bool = False,
+            config: str = "",
         ) -> str:
-            """Return comparison results as a formatted text table."""
             return ""
 
-    class StatisticsQueryUtility(_BaseNative):
-        """Query summary or top-N statistics from a trace."""
+    class IndexDatabase(_BaseNative):
+        """Handle to a .dftindex RocksDB store.
 
-        def __init__(self, runtime: Runtime | None = None) -> None:
-            """Create a statistics-query utility bound to an optional runtime."""
-            self.runtime = runtime
+        Used by the distributed indexer coordinator to pre-register files,
+        reserve file_id ranges, bulk-ingest worker-produced SSTs, and rebuild
+        root summaries.
+        """
 
-        def process(
-            self,
-            file_path: str,
-            query_type: str = "summary",
-            top_n: int = 10,
-            index_dir: str = "",
-            auto_build_index: bool = False,
-            index_threshold: int = 8388608,
-        ) -> dict[str, object]:
-            """Return scalar statistics derived from the trace."""
-            return {}
+        def __init__(self, index_path: str) -> None:
+            self._index_path = index_path
 
-    class StatisticsAggregatorUtility(_BaseNative):
-        """Aggregate core statistics from a trace into a Python dictionary."""
+        def init_schema(self) -> None:
+            return None
 
-        def __init__(self, runtime: Runtime | None = None) -> None:
-            """Create a statistics-aggregator utility bound to an optional runtime."""
-            self.runtime = runtime
+        def register_files(self, paths: list[str], build_manifest: bool = False) -> list[int]:
+            """Register each path in the DEFAULT-CF file registry and return
+            the assigned file_ids (parallel to `paths`). Idempotent for files
+            with matching hash."""
+            return []
 
-        def process(
-            self,
-            file_path: str,
-            index_dir: str = "",
-            auto_build_index: bool = False,
-            index_threshold: int = 8388608,
-        ) -> dict[str, object]:
-            """Return aggregate trace statistics."""
-            return {}
+        def reserve_file_id_range(self, count: int) -> int:
+            """Atomically reserve `count` contiguous file_ids; return first."""
+            return 0
 
-    class MetadataCollectorUtility(_BaseNative):
-        """Collect file metadata and index-aware trace metadata."""
+        def bulk_ingest(
+            self,
+            registry: "SstArtifactRegistry",
+            skip_cfs: object = None,
+        ) -> None:
+            """Ingest all SSTs collected in the registry.
+
+            skip_cfs is an optional iterable of CF names whose SSTs are left
+            outside the unified DB. Distributed builds pass
+            {"aggregation", "system_metrics"} to keep per-worker AGG/SYS SSTs
+            addressable via `agg_manifest.json` for parallel reads at analyze
+            time. See `dftracer.utils.dask.consolidate_index` to fold them
+            back into the unified DB later.
+            """
+            return None
 
-        def __init__(self, runtime: Runtime | None = None) -> None:
-            """Create a metadata collector bound to an optional runtime."""
-            self.runtime = runtime
+        def rebuild_root_summaries(self) -> None:
+            """Recompute ROOT_* summary column families from per-file CFs."""
+            return None
 
-        def process(
-            self,
-            file_path: str,
-            index_dir: str = "",
-            checkpoint_size: int = 33554432,
-            force_rebuild: bool = False,
-            index_threshold: int = 8388608,
-        ) -> dict[str, object]:
-            """Return metadata for a DFTracer trace file."""
-            return {}
+        def write_agg_global_config(self, time_interval_us: int, config_hash: int = 0) -> None:
+            """Write the aggregation global-config marker into the AGGREGATION CF.
 
-    class ReorganizationPlannerUtility(_BaseNative):
-        """Build a semantic reorganization plan for trace files."""
+            Required for `Indexer.iter_arrow_dfanalyzer_all` on distributed
+            builds (which never materialise the key via worker SSTs) and
+            post-consolidate indices.
+            """
+            return None
 
-        def __init__(self, runtime: Runtime | None = None) -> None:
-            """Create a reorganization planner bound to an optional runtime."""
-            self.runtime = runtime
+        def write_agg_file_markers(self, file_ids: object) -> None:
+            """Write per-file aggregation completion markers into the AGGREGATION CF.
 
-        def process(
-            self,
-            source_files: list[str],
-            groups: list[dict[str, object]],
-            index_dir: str = "",
-            checkpoint_size: int = 33554432,
-            force_rebuild: bool = False,
-            index_threshold: int = 8388608,
-        ) -> dict[str, object]:
-            """Return a reorganization plan for the requested groups."""
-            return {}
+            Each marker is ``\\xFF\\xFF + file_id_be32``. The index resolver uses
+            their presence to decide whether each file has aggregated data; if
+            missing, ``ensure_indexed()`` concludes the aggregation tier is
+            incomplete and re-runs the entire build. Distributed_index must
+            call this after ``bulk_ingest`` so subsequent ``read_trace`` calls
+            do not redundantly re-aggregate.
+            """
+            return None
 
-    class ReconstructionPlannerUtility(_BaseNative):
-        """Build a reconstruction plan from reorganized traces."""
+        def write_aggregation_tracker(self, blobs: list[bytes]) -> None:
+            """Merge serialized AssociationTracker blobs and write the result
+            to the AGGREGATION CF under the ``__tracker__`` key."""
+            return None
 
-        def __init__(self, runtime: Runtime | None = None) -> None:
-            """Create a reconstruction planner bound to an optional runtime."""
-            self.runtime = runtime
+    class SstArtifactRegistry(_BaseNative):
+        """Thread-safe collector for SST artifact paths produced by workers."""
 
-        def process(
-            self,
-            reorganized_files: list[str],
-            provenance_dir: str = "",
-        ) -> dict[str, object]:
-            """Return a reconstruction plan for reorganized trace files."""
-            return {}
+        def __init__(self) -> None:
+            pass
 
-    ext.Indexer = Indexer
-    ext.IndexerCheckpoint = IndexerCheckpoint
-    ext.JSON = JSON
-    ext.Runtime = Runtime
-    ext.TaskHandle = TaskHandle
-    ext.TraceReader = TraceReader
-    ext.AggregatorUtility = AggregatorUtility
-    ext.ComparatorUtility = ComparatorUtility
-    ext.MetadataCollectorUtility = MetadataCollectorUtility
-    ext.ReconstructionPlannerUtility = ReconstructionPlannerUtility
-    ext.ReorganizationPlannerUtility = ReorganizationPlannerUtility
-    ext.StatisticsAggregatorUtility = StatisticsAggregatorUtility
-    ext.StatisticsQueryUtility = StatisticsQueryUtility
+        def append(self, artifacts_dict: dict[str, str | None]) -> None:
+            """Add a per-batch Artifacts dict as returned by `build_sst_batch`."""
+            return None
 
     def get_default_runtime() -> Runtime:
         """Return the process-wide default runtime."""
@@ -643,41 +1153,176 @@ def set_default_runtime(runtime: Runtime | None = None) -> None:
         """Replace or clear the process-wide default runtime."""
         return None
 
-    ext.get_default_runtime = get_default_runtime
-    ext.set_default_runtime = set_default_runtime
-    for name in [
-        "AggregatorUtility",
-        "ComparatorUtility",
-        "Indexer",
-        "IndexerCheckpoint",
-        "JSON",
-        "MetadataCollectorUtility",
-        "ReconstructionPlannerUtility",
-        "ReorganizationPlannerUtility",
-        "Runtime",
-        "StatisticsAggregatorUtility",
-        "StatisticsQueryUtility",
-        "TaskHandle",
-        "TraceReader",
-    ]:
-        getattr(ext, name).__module__ = ext_name
-    ext.__all__ = [
+    def read_arrow_files_parallel(
+        paths: list[str],
+        runtime: Runtime | None = None,
+    ) -> dict[str, object]:
+        """Read multiple Arrow IPC files in parallel using the Runtime.
+
+        Args:
+            paths: List of file paths to read.
+            runtime: Optional Runtime object. Uses default if not provided.
+
+        Returns:
+            dict with:
+                - file_results: List of per-file results, each with:
+                    - path: File path
+                    - success: True if read succeeded
+                    - error: Error message if failed, else None
+                    - total_rows: Number of rows in file
+                    - batches: List of ArrowBatch objects
+                - total_rows: Total rows across all files
+                - total_batches: Total batches across all files
+                - files_read: Number of files read successfully
+                - files_failed: Number of files that failed
+        """
+        return {}
+
+    def build_sst_batch(
+        files: list[str],
+        file_ids: list[int],
+        staging_dir: str,
+        batch_id: str,
+        index_dir: str = "",
+        checkpoint_size: int = 33554432,
+        build_manifest: bool = False,
+        force_rebuild: bool = False,
+        bloom_dimensions: list[str] | None = None,
+        parallelism: int = 0,
+        flush_every_files: int = 0,
+        runtime: Runtime | object | None = None,
+        aggregation_config: object = None,
+        file_slices: object = None,
+    ) -> tuple[list[dict[str, str | None]], bytes]:
+        """Run the indexer pipeline with an SST sink. Returns
+        `(artifact_dicts, tracker_blob)`. `tracker_blob` is the serialized
+        merged AssociationTracker for the batch (empty bytes when
+        `aggregation_config` is None). `file_slices` enables intra-file
+        parallelism; entries are `None` (whole file) or
+        `(member_begin, member_end, checkpoint_idx_base,
+        skip_file_scoped_writes, members)`."""
+        return ([], b"")
+
+    def plan_lpt_partition(
+        entries: list[tuple[str, int]], num_workers: int
+    ) -> list[list[tuple[str, int]]]:
+        """Greedy LPT bin-packing of (path, size) tuples into num_workers
+        buckets, minimising the maximum per-worker total size."""
+        return []
+
+    def scan_files(
+        directory: str,
+        patterns: list[str] | None = None,
+        recursive: bool = False,
+        runtime: Runtime | object | None = None,
+    ) -> list[tuple[str, int]]:
+        """Parallel directory scan returning (path, size) tuples for regular
+        files matching the patterns."""
+        return []
+
+    def enable_aggregation_deterministic_ids() -> None:
+        """Flip the global aggregation StringIntern into deterministic-id mode
+        so the same string maps to the same 32-bit id in every worker process."""
+        return None
+
+    def move_artifacts(
+        artifacts: dict[str, str | None], dest_dir: str
+    ) -> dict[str, str | None]:
+        """Move every populated SST in `artifacts` into `dest_dir` via the
+        C++ rename/copy helper, returning a fresh dict with the new paths."""
+        return {}
+
+    def enumerate_gzip_members(
+        files: list[str],
+        runtime: Runtime | object | None = None,
+    ) -> list[list[tuple[int, int]]]:
+        """Cooperative async scan of gzip member offsets. Returns lists of
+        `(c_offset, c_size)` parallel to `files`; empty for non-gzip files."""
+        return []
+
+    def plan_work_units(
+        member_map: list[list[tuple[int, int]]],
+        num_workers: int,
+        target_c_size: int = 0,
+    ) -> list[list[tuple[int, int, int, int]]]:
+        """Deterministic LPT assignment of intra-file gzip-member slices across
+        workers. Returns per-worker lists of
+        `(file_idx, member_begin, member_end, c_size)`."""
+        return []
+
+    def scan_aggregation_manifest(
+        agg_ssts: list[str],
+        sys_ssts: list[str],
+        scratch_dir: str,
+        meta_index_path: str,
+        batch_size: int = 10000,
+        time_granularity: float = 1.0,
+        time_resolution: float = 1e6,
+        query: str | None = None,
+        group_by: list[str] | None = None,
+        shard_begin: int = 0,
+        shard_end: int = 4096,
+        runtime: Runtime | object | None = None,
+        file_hashes: dict[str, str] | None = None,
+        host_hashes: dict[str, str] | None = None,
+    ) -> dict[str, list["_ArrowBatchCapsule"]]:
+        """Scan a worker's slice of the distributed aggregation manifest.
+
+        Ingests `agg_ssts` + `sys_ssts` into a scratch IndexDatabase at
+        `scratch_dir` (caller owns the directory lifecycle) and runs the
+        dfanalyzer aggregation scan over `[shard_begin, shard_end)`.
+        `meta_index_path` is the unified .dftindex used to resolve file /
+        host hashes.
+
+        Returns the same dict shape as `Indexer.iter_arrow_dfanalyzer_all`:
+        `{"events": [...], "profiles": [...], "system": [...]}`.
+        """
+        return {"events": [], "profiles": [], "system": []}
+
+    _class_symbols = [
+        "_ArrowBatchCapsule",
+        "_ArrowBatchStream",
         "AggregatorUtility",
+        "CheckpointIndexer",
         "ComparatorUtility",
+        "IndexDatabase",
         "Indexer",
         "IndexerCheckpoint",
-        "JSON",
+        "JsonDictValue",
         "MetadataCollectorUtility",
         "ReconstructionPlannerUtility",
         "ReorganizationPlannerUtility",
         "Runtime",
+        "SstArtifactRegistry",
         "StatisticsAggregatorUtility",
         "StatisticsQueryUtility",
         "TaskHandle",
         "TraceReader",
+    ]
+    _function_symbols = [
+        "build_sst_batch",
+        "enable_aggregation_deterministic_ids",
+        "enumerate_gzip_members",
         "get_default_runtime",
+        "move_artifacts",
+        "plan_lpt_partition",
+        "plan_work_units",
+        "read_arrow_files_parallel",
+        "scan_aggregation_manifest",
+        "scan_files",
         "set_default_runtime",
     ]
+
+    _local = locals()
+    for _name in _class_symbols + _function_symbols:
+        setattr(ext, _name, _local[_name])
+
+    for _name in _class_symbols:
+        getattr(ext, _name).__module__ = ext_name
+    for _name in _function_symbols:
+        getattr(ext, _name).__module__ = ext_name
+
+    ext.__all__ = sorted(_class_symbols + _function_symbols)
     sys.modules[ext_name] = ext
 
 
diff --git a/docs/source/cpp_api/arrow.rst b/docs/source/cpp_api/arrow.rst
index 5e9d946c..f09ae7e6 100644
--- a/docs/source/cpp_api/arrow.rst
+++ b/docs/source/cpp_api/arrow.rst
@@ -25,11 +25,20 @@ Guarded by ``DFTRACER_UTILS_ENABLE_ARROW`` (ON by default).
 
        subgraph Write["File Output"]
            IPC["IpcWriter"]
+           PW["PartitionWriter"]
+           PR["PartitionRouter"]
+       end
+
+       subgraph Read["File Input"]
+           IRD["IpcReader"]
        end
 
        RBB -->|"finish()"| AER
        AER -->|"write_batch()"| IPC
+       AER -->|"route()"| PR
+       PR -->|"per-partition"| PW
        AER -->|"PyCapsule"| Python["Python ArrowBatch"]
+       IRD -->|"read_batch()"| AER
 
 RecordBatchBuilder
 ------------------
@@ -42,6 +51,12 @@ Type-safe columnar builder with two modes:
   ``end_row()`` backfills nulls for missing columns. Best for
   ``TraceReader.iter_arrow()`` with arbitrary JSON.
 
+Once the first row has been finalized, the schema is **locked**: subsequent
+rows may only append values into the already-discovered columns, and
+attempts to add new columns after the lock are rejected. This makes
+batches produced by the dynamic path safe to concatenate across a
+``TraceReader::read_arrow()`` stream without re-keying.
+
 String columns store ``string_view`` into source data for zero-copy during
 build; bulk copy only at ``finish()``. Caller must keep source data alive
 until ``finish()`` returns.
@@ -57,10 +72,38 @@ IpcWriter
 ---------
 
 Streaming Arrow IPC file writer. Writes ``.arrows`` files that can be
-read by pyarrow, polars, DuckDB, and any Arrow-compatible tool.
+read by pyarrow, polars, DuckDB, and any Arrow-compatible tool. Supports
+buffer-level compression: when built with ``DFTRACER_UTILS_ENABLE_ZSTD``,
+``IpcCompression::ZSTD`` is available and used by default for new files,
+producing pyarrow-compatible compressed IPC streams.
+
+Guarded by ``DFTRACER_UTILS_ENABLE_ARROW_IPC``.
+
+IpcReader
+---------
+
+Streaming Arrow IPC file reader. Mirrors ``IpcWriter`` and yields one
+``ArrowExportResult`` per record batch in the file. Supports buffer-level
+ZSTD decompression compatible with pyarrow / polars / DuckDB outputs.
 
 Guarded by ``DFTRACER_UTILS_ENABLE_ARROW_IPC``.
 
+PartitionWriter
+---------------
+
+Single-partition Arrow IPC sink with ``PartitionWriteStats`` tracking
+(bytes, rows, batches). Used as the per-partition output of
+``PartitionRouter`` and individually as a thin wrapper around
+``IpcWriter`` when only one output stream is needed.
+
+PartitionRouter
+---------------
+
+Multi-partition Arrow router. Takes an inbound ``ArrowExportResult`` plus
+a ``PartitionConfig`` (partition key columns, output template, target
+batch rows) and dispatches rows into one ``PartitionWriter`` per
+partition value. Aggregates ``RouterWriteStats`` across all partitions.
+
 Usage Example
 -------------
 
diff --git a/docs/source/cpp_api/coro.rst b/docs/source/cpp_api/coro.rst
index 8156f59a..ae405bdd 100644
--- a/docs/source/cpp_api/coro.rst
+++ b/docs/source/cpp_api/coro.rst
@@ -11,6 +11,16 @@ C++20 coroutine primitives for asynchronous task execution. All classes are in t
 
 For usage examples and task scheduling, see :doc:`/pipeline` and :doc:`pipeline/tasks`.
 
+.. note::
+
+   GCC 12 may corrupt large coroutine frames at ``-O2`` and above, especially
+   when frames contain references, ``string_view``, or captured lambdas. The
+   project mitigates this by heap-allocating per-task state in a
+   ``shared_ptr`` (or ``unique_ptr``) and capturing only the smart pointer in
+   coroutine lambdas, instead of capturing complex state by value. New
+   coroutines should follow the same pattern; see ``coroutine-caveats.md`` at
+   the repo root for the full discussion.
+
 .. mermaid::
 
    graph TD
diff --git a/docs/source/cpp_api/dft_aggregators.rst b/docs/source/cpp_api/dft_aggregators.rst
index d2dcc18e..ac1d8cc5 100644
--- a/docs/source/cpp_api/dft_aggregators.rst
+++ b/docs/source/cpp_api/dft_aggregators.rst
@@ -53,7 +53,7 @@ boundary event association, and Perfetto trace output.
        end
 
        subgraph Merge["Merge & Resolve"]
-           EA["EventAggregatorUtility"]
+           EA["EventAggregator"]
            AR["AssociationResolverUtility"]
        end
 
@@ -180,13 +180,33 @@ predicates for early chunk skipping when available.
 
 Tagged ``Parallelizable`` — multiple instances run concurrently across chunks.
 
-EventAggregatorUtility
-~~~~~~~~~~~~~~~~~~~~~~
+EventAggregator
+~~~~~~~~~~~~~~~
+
+Unified event aggregator (formerly ``EventAggregatorUtility`` and the
+internal ``RocksDbAggregator``, now merged into one class). Holds a
+``RocksDatabase`` handle and merges per-chunk aggregation results into a
+unified output, deduplicating file counts and collecting association
+trackers for downstream resolution.
+
+AggregationVisitor
+~~~~~~~~~~~~~~~~~~
+
+``DftEventVisitor`` subclass that accumulates ``AggregationMetrics`` per
+``AggregationKey`` directly from parsed events during a scan, so the
+aggregation pass can share a single parse with bloom and manifest
+visitors via ``DftEventDispatcher``. Defined in
+``dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h``.
 
-Merges per-chunk aggregation results into a unified output.
+DftEventDispatcher
+~~~~~~~~~~~~~~~~~~
 
-Combines metrics from all chunks, deduplicates file counts, and
-collects association trackers for downstream resolution.
+Fan-out adapter that implements the ``IndexVisitor`` interface, parses
+each line once, and dispatches the parsed ``DftEvent`` to a list of
+registered ``DftEventVisitor`` instances (``BloomVisitor``,
+``ManifestVisitor``, ``AggregationVisitor``, ...). This collapses
+multiple visitor passes into a single read of the input. Defined in
+``dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h``.
 
 Association Tracking
 --------------------
diff --git a/docs/source/cpp_api/indexer.rst b/docs/source/cpp_api/indexer.rst
index 407a55f6..035dcccc 100644
--- a/docs/source/cpp_api/indexer.rst
+++ b/docs/source/cpp_api/indexer.rst
@@ -143,17 +143,50 @@ calls visitors in order:
 3. ``on_line(line, checkpoint_idx)`` -- called for every line in the file
 4. ``finalize(db, file_id)`` -- called once after the scan to persist results
 
+Indexer / CheckpointIndexer
+---------------------------
+
+The low-level checkpoint indexer is exposed as ``Indexer`` (formerly named
+``BatchIndexer``); the previous ``Indexer`` class is now ``CheckpointIndexer``
+in the internal namespace. ``SingleFileIndexer`` has been removed; use
+``IndexBuilderUtility`` or ``IndexBatchBuilderUtility`` instead.
+
+IndexBatchBuilderUtility
+------------------------
+
+Batched variant of ``IndexBuilderUtility`` that processes a list of files in
+parallel against a shared ``IndexDatabaseWriterContext``, yielding an
+``IndexBuildBatchResult`` with aggregated metrics. Configured via
+``IndexBuildBatchConfig`` (file list, parallelism, checkpoint size, bloom and
+manifest toggles, shared sink).
+
+IndexBuildBatchConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+Configuration struct for ``IndexBatchBuilderUtility``: file slices, output
+directory, checkpoint size, bloom/manifest flags, and the shared
+``IndexBatchSink`` (typically an ``IndexDatabaseWriterContext``) that
+receives encoded batches from all workers.
+
+IndexDatabaseWriterContext
+--------------------------
+
+Implements ``IndexBatchSink`` and owns a thread-safe writer pipeline into a
+RocksDB-backed ``IndexDatabase``. Workers in ``IndexBatchBuilderUtility``
+submit encoded index batches to this context, which serializes them into
+checkpoint, bloom, manifest, and statistics column families.
+
 BloomVisitor
 ------------
 
-Implements ``IndexVisitor`` to build per-chunk bloom filters and statistics
-during the indexing scan. Each checkpoint chunk gets its own set of bloom
-filters (one per configured dimension) plus per-chunk event counts and
-timestamp/duration distributions.
+Implements ``DftEventVisitor`` to build per-chunk bloom filters and
+statistics during the indexing scan. Each checkpoint chunk gets its own set
+of bloom filters (one per configured dimension) plus per-chunk event counts
+and timestamp/duration distributions.
 
 .. code-block:: cpp
 
-    #include <dftracer/utils/utilities/indexer/visitors/bloom_visitor.h>
+    #include <dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h>
 
     BloomVisitor visitor(bloom_config, {"name", "cat", "pid"});
     visitor.begin(num_checkpoints);
@@ -168,14 +201,15 @@ timestamp/duration distributions.
 ManifestVisitor
 ---------------
 
-Implements ``IndexVisitor`` to build per-checkpoint event routing manifests.
-During the scan, it collects which lines belong to which ``(cat, name)`` event
-pair within each checkpoint. The resulting manifests enable the reorganization
-pipeline to selectively read only the lines needed for a given event group.
+Implements ``DftEventVisitor`` to build per-checkpoint event routing
+manifests. During the scan, it collects which lines belong to which
+``(cat, name)`` event pair within each checkpoint. The resulting manifests
+enable the reorganization pipeline to selectively read only the lines needed
+for a given event group.
 
 .. code-block:: cpp
 
-    #include <dftracer/utils/utilities/indexer/visitors/manifest_visitor.h>
+    #include <dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h>
 
     ManifestVisitor visitor;
     visitor.begin(num_checkpoints);
@@ -185,6 +219,14 @@ pipeline to selectively read only the lines needed for a given event group.
     // Later, query the manifest:
     auto ranges = db.query_event_ranges_for_checkpoint(file_id, checkpoint_idx);
 
+IndexResolverUtility
+--------------------
+
+Resolves a directory or file list into a set of ``FileWorkItem`` entries by
+opening or building per-file indexes and emitting line-range work items
+suitable for parallel scan / aggregation / replay pipelines. Defined in
+``dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h``.
+
 ProvenanceDatabase
 ------------------
 
diff --git a/docs/source/cpp_api/io.rst b/docs/source/cpp_api/io.rst
index 1abd2c0b..feced868 100644
--- a/docs/source/cpp_api/io.rst
+++ b/docs/source/cpp_api/io.rst
@@ -198,6 +198,39 @@ Example
        return 0;
    }
 
+Parallel File Writers
+---------------------
+
+The ``dftracer/utils/utilities/fileio/parallel/`` module provides
+high-throughput multi-stream file writers used by the reorganization and
+aggregation pipelines. The unified ``ParallelWriter`` class implements
+three on-disk layouts (selected via ``FileLayout`` in
+``parallel/layout.h``):
+
+- **Striped** -- one output file split into Lustre-friendly stripes,
+  each fed by an independent producer coroutine.
+- **Padded striped** -- striped layout with per-stripe alignment padding
+  for filesystems that prefer aligned writes.
+- **Sharded** -- one output file per shard, used when downstream
+  consumers want independent shards rather than a single concatenated
+  file.
+
+Sizing is Lustre-aware: ``LayoutInfo`` and ``WriterSizing`` derive stripe
+size and per-stripe buffer counts from the detected ``FilesystemKind``
+(Lustre vs generic POSIX). Internally, writes are coalesced via
+``coro::Channel``-based queues so that producer coroutines can submit
+small line-sized payloads without issuing one ``write()`` syscall each.
+
+.. code-block:: cpp
+
+    #include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+    #include <dftracer/utils/utilities/fileio/parallel/layout.h>
+
+    WriterConfig cfg;
+    cfg.layout = FileLayout::STRIPED;
+    cfg.output_path = "merged.pfw";
+    ParallelWriter writer(cfg);
+
 Sync Fallback Behavior
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/cpp_api/pipeline/executors.rst b/docs/source/cpp_api/pipeline/executors.rst
index 12f66dd0..1d54b70b 100644
--- a/docs/source/cpp_api/pipeline/executors.rst
+++ b/docs/source/cpp_api/pipeline/executors.rst
@@ -101,7 +101,6 @@ timeout thresholds. All fields have sensible defaults.
        std::size_t io_pool_size = 4;
        io::IoBackendType io_backend_type = io::IoBackendType::AUTO;
        unsigned io_batch_threshold = 16;
-       std::size_t db_pool_size = 2;
    };
 
 **Key fields:**
@@ -116,7 +115,6 @@ timeout thresholds. All fields have sensible defaults.
   (``AUTO``, ``IO_URING``, ``THREAD_POOL``).
 - ``io_batch_threshold`` -- Minimum number of I/O operations to batch before
   submitting to the backend.
-- ``db_pool_size`` -- Number of threads in the dedicated database work pool.
 
 **Example -- high-throughput configuration:**
 
@@ -127,7 +125,6 @@ timeout thresholds. All fields have sensible defaults.
        .io_pool_size = 8,
        .io_backend_type = io::IoBackendType::IO_URING,
        .io_batch_threshold = 32,
-       .db_pool_size = 4
    };
 
 Progress Tracking
diff --git a/docs/source/cpp_api/reader.rst b/docs/source/cpp_api/reader.rst
index dee4669f..b51a562c 100644
--- a/docs/source/cpp_api/reader.rst
+++ b/docs/source/cpp_api/reader.rst
@@ -22,7 +22,25 @@ decompression.
 
 The reader also supports query-based event filtering: when a query string is
 provided and an index exists, non-matching chunks are pruned entirely, and
-per-event filtering is applied to the remaining chunks.
+per-event filtering is applied to the remaining chunks. Conjunctions of
+equality predicates (``cat == 'io' AND name == 'read'``) are compiled into
+a vectorized predicate evaluator that runs against the index bloom dimensions
+before any line is decompressed.
+
+``TraceReader`` also accepts a **directory** as ``file_path``: when given a
+directory, it enumerates trace files inside it, opens one indexed reader per
+file, and yields lines / Arrow batches in file order. Batch chunk pruning is
+delegated to ``ChunkPrunerUtility``, which evaluates the compiled query
+against all candidate chunks in one pass and feeds the resulting line-range
+work items back to the per-file readers.
+
+When ``DFTRACER_UTILS_ENABLE_ARROW`` is set, ``TraceReader::read_arrow()``
+exports record batches via the Arrow C Data Interface
+(``ArrowExportResult``), which can be sent directly across the FFI boundary
+to Python / DuckDB / Polars without a copy. The ``ReadConfig::flatten_objects``
+flag expands one level of nested JSON objects (e.g. ``args``) into
+``parent.child`` columns with native Arrow types instead of serializing them
+as JSON strings.
 
 Getting Started
 ---------------
@@ -120,6 +138,13 @@ filtering. All fields have sensible defaults; pass a default-constructed
 - ``multi_line`` -- allow multiple lines per raw chunk (default true)
 - ``buffer_size`` -- internal read buffer size (default 4 MB)
 - ``query`` -- query DSL string for event filtering (empty = no filter)
+- ``chunk_prune_only`` -- when true, the query is used only for chunk-level
+  pruning via the index; per-line filtering is skipped (caller handles it)
+- ``skip_pruning`` -- skip the reader's own chunk pruner pass; the caller's
+  ``start_line``/``end_line`` window is trusted (used by the checkpoint-level
+  work-item dispatcher to avoid re-running ``ChunkPrunerUtility`` per item)
+- ``flatten_objects`` -- expand one level of nested JSON objects into
+  ``parent.child`` columns with native Arrow types in ``read_arrow()``
 
 Helper methods: ``has_line_range()`` and ``has_byte_range()`` test whether
 non-default range bounds have been set.
@@ -135,7 +160,9 @@ indexed) based on whether an index exists and what range the caller requests.
 **Async generators:**
 
 - ``read_lines(config)`` -- yields ``Line`` structs (``content`` + ``line_number``) with optional query filtering and chunk pruning
+- ``read_json(config)`` -- yields ``JsonLine`` records (parsed once with simdjson) for callers that would otherwise re-parse each line
 - ``read_raw(config)`` -- yields ``std::span<const char>`` byte chunks
+- ``read_arrow(config, batch_size)`` -- yields ``ArrowExportResult`` record batches via the Arrow C Data Interface (requires ``DFTRACER_UTILS_ENABLE_ARROW``)
 
 **Metadata queries:**
 
diff --git a/docs/source/cpp_api/rocksdb.rst b/docs/source/cpp_api/rocksdb.rst
index 76a6f77a..35695841 100644
--- a/docs/source/cpp_api/rocksdb.rst
+++ b/docs/source/cpp_api/rocksdb.rst
@@ -8,10 +8,12 @@ RocksDB migration.
 It includes:
 
 - database wrappers and lifecycle management
-- async awaitables for database work on executor-backed threads
+- column-family and merge-operator registration for the ``.dftindex`` schema
 - key encoding helpers for typed prefix/range scans
 - manager utilities for sharing open database handles across readers,
   indexers, and higher-level composites
+- bulk-ingest helpers (``SstFileWriter`` + ``IngestExternalFile``) used by
+  the distributed indexing pipeline
 
 Architecture
 ------------
@@ -23,10 +25,9 @@ Architecture
        Indexers["Indexer / provenance writers"] --> Manager
        Manager --> Database["RocksDatabase"]
        Database --> CFs["Column families"]
-       Database --> Async["DbAwaitable / rocks::run"]
        Database --> Codec["KeyCodec"]
+       Database --> Merge["MergeOperators (AGGREGATION, SYSTEM_METRICS)"]
        CFs --> Store[".dftindex / provenance store"]
-       Async --> Runtime["Executor-backed threads"]
        Codec --> Store
 
 See also:
diff --git a/docs/source/developers.rst b/docs/source/developers.rst
index 52583fb1..79c628be 100644
--- a/docs/source/developers.rst
+++ b/docs/source/developers.rst
@@ -331,27 +331,33 @@ For hot loops, reuse a single ``HasherUtility`` instance with ``reset()``:
 Anti-Patterns to Avoid
 ~~~~~~~~~~~~~~~~~~~~~~
 
-**Storing JsonValue beyond yyjson_doc lifetime**
+**Storing JsonValue / simdjson views beyond the parser's lifetime**
 
-``JsonValue`` is a non-owning view into a ``yyjson_doc``. Never store it across the document's lifetime.
+``JsonValue`` (and the underlying ``simdjson::ondemand::value`` /
+``simdjson::dom::element``) is a non-owning view into the parser's buffer.
+Never keep it beyond the lifetime of the parser or of the input buffer.
 
 .. code-block:: cpp
 
-    // WRONG: doc destroyed, but view stored
+    #include <simdjson.h>
+
+    // WRONG: parser/buffer destroyed, but view stored
     JsonValue stored_value;
     {
-        yyjson_doc* doc = yyjson_read_file("config.json", NULL);
-        stored_value = yyjson_get_obj(doc);
-        yyjson_doc_free(doc);
+        simdjson::ondemand::parser parser;
+        auto padded = simdjson::padded_string::load("config.json");
+        auto doc = parser.iterate(padded);
+        stored_value = doc.find_field("root").value();
     }
-    // stored_value now points to freed memory!
-    
-    // CORRECT: copy data before doc destruction
+    // stored_value now points into freed parser/buffer memory!
+
+    // CORRECT: copy the data out before the parser goes out of scope
     {
-        yyjson_doc* doc = yyjson_read_file("config.json", NULL);
-        auto data = serialize_json_value(yyjson_get_obj(doc));
-        yyjson_doc_free(doc);
-        // data is now safe
+        simdjson::ondemand::parser parser;
+        auto padded = simdjson::padded_string::load("config.json");
+        auto doc = parser.iterate(padded);
+        auto data = serialize_json_value(doc.find_field("root").value());
+        // data owns its copy; safe to use after the parser is destroyed
     }
 
 **Instantiating IOExecutor directly**
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index d3637b6d..609ae883 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -81,6 +81,51 @@ To install to a custom location:
    make
    make install
 
+Build Options
+~~~~~~~~~~~~~
+
+The following CMake options control optional features and dependencies. The
+default value of each option is shown in parentheses:
+
+- ``DFTRACER_UTILS_TESTS`` (default ``OFF``) - Build the test suite.
+- ``DFTRACER_UTILS_COVERAGE`` (default ``OFF``) - Enable coverage reporting.
+- ``DFTRACER_UTILS_DEBUG`` (default ``OFF``) - Enable debug mode with verbose
+  logging.
+- ``DFTRACER_UTILS_BUILD_SHARED`` (default ``ON``) - Build the shared library.
+- ``DFTRACER_UTILS_BUILD_STATIC`` (default ``ON``) - Build the static library.
+- ``DFTRACER_UTILS_BUILD_BINARIES`` (default ``ON``) - Build command-line
+  binaries.
+- ``DFTRACER_UTILS_BUILD_PYTHON`` (default ``OFF``) - Build Python bindings.
+- ``DFTRACER_UTILS_ENABLE_PCH`` (default ``ON``) - Enable precompiled
+  headers.
+- ``DFTRACER_UTILS_ENABLE_ASAN`` / ``_UBSAN`` / ``_TSAN`` (default ``OFF``) -
+  Address / undefined-behavior / thread sanitizers.
+- ``DFTRACER_UTILS_ENABLE_MPI`` (default ``OFF``) - Enable MPI support;
+  required to build ``dftracer_aggregator_mpi`` and
+  ``dftracer_call_tree_mpi``.
+- ``DFTRACER_USE_ZLIB_NG`` (default ``ON``) - Use ``zlib-ng`` (compat ABI)
+  for faster compression and decompression. Falls back to ``madler/zlib``
+  if zlib-ng fetch or build fails.
+- ``DFTRACER_UTILS_ENABLE_ARROW`` (default ``ON``) - Enable the Arrow C Data
+  Interface via nanoarrow (required for Python Arrow output).
+- ``DFTRACER_UTILS_ENABLE_ARROW_IPC`` (default ``ON``) - Enable Arrow IPC
+  file read/write via nanoarrow. Required for
+  ``dftracer_aggregator --format arrow`` output and for the ``save_arrow`` /
+  ``load_arrow`` call-tree serialization paths.
+- ``DFTRACER_UTILS_ENABLE_ZSTD`` (default ``ON``) - Enable ZSTD compression
+  for RocksDB SST blocks and Arrow IPC buffers.
+- ``DFTRACER_UTILS_ENABLE_LZ4`` (default ``OFF``) - Enable LZ4 compression
+  for RocksDB SST blocks.
+
+Example:
+
+.. code-block:: bash
+
+   cmake .. \
+       -DDFTRACER_UTILS_ENABLE_MPI=ON \
+       -DDFTRACER_UTILS_ENABLE_ARROW_IPC=ON \
+       -DDFTRACER_USE_ZLIB_NG=ON
+
 Verifying Installation
 ----------------------
 
@@ -97,23 +142,21 @@ To verify your Python installation:
 C++
 ~~~
 
-To verify your C++ installation, try compiling a simple example:
+To verify your C++ installation, try compiling a simple example that
+opens a trace through the public ``TraceReader`` API:
 
 .. code-block:: cpp
 
-   #include <dftracer/utils/indexer/indexer_factory.h>
+   #include <dftracer/utils/utilities/reader/trace_reader.h>
    #include <iostream>
 
    int main() {
-       // Create an indexer to verify installation
-       auto indexer = dftracer::utils::IndexerFactory::create(
-           "test.pfw.gz",
-           "test.pfw.gz.idx",
-           false  // Don't force rebuild
-       );
+       using dftracer::utils::utilities::reader::TraceReader;
 
+       TraceReader reader("test.pfw.gz");
        std::cout << "Library installed successfully!" << std::endl;
-       std::cout << "Archive format: " << indexer->get_format_name() << std::endl;
+       std::cout << "Has index: " << std::boolalpha
+                 << reader.has_index() << std::endl;
        return 0;
    }
 
diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst
index 43cbc422..05faad8a 100644
--- a/docs/source/pipeline.rst
+++ b/docs/source/pipeline.rst
@@ -1016,6 +1016,91 @@ The project has migrated from the old ``TaskContext``/``TaskScope`` API to the n
 - ``PipelineConfig`` configures the executor (threads, timeouts, watchdog)
 - ``Pipeline::execute()`` blocks until all work completes
 
+Pipelined Replay
+----------------
+
+``dftracer_replay`` was refactored onto the same coroutine + channel model
+documented above. The replay engine now expresses parsing, decoding, and
+execution as three stages connected by bounded channels, eliminating the
+old synchronous pre-load step. Three end-to-end improvements landed
+together:
+
+- ``JsonParser`` is shared between the parse and execute stages; the
+  trace JSON is decoded incrementally instead of being slurped into a
+  ``std::vector<Event>`` before execution starts.
+- Buffer reuse and zero-copy string handling are wired through the I/O
+  read path, removing per-line allocations in the hot loop.
+- Stages communicate via ``Channel<Event>`` instances with backpressure,
+  so a slow execute stage no longer forces the parse stage to materialize
+  the entire trace.
+
+The replay binary is otherwise unchanged from a CLI perspective; see the
+``dftracer_replay`` section in :doc:`cli` for flag documentation.
+
+Memory Budget Control for Streaming Iterators
+---------------------------------------------
+
+The ``MemoryBudget`` helpers in
+``dftracer/utils/core/common/memory_budget.h`` give utilities a single
+place to size streaming channels and per-file batch counts based on
+available system memory:
+
+.. code-block:: cpp
+
+   #include <dftracer/utils/core/common/memory_budget.h>
+
+   using namespace dftracer::utils;
+
+   // 50% of available RAM by default; clamped to >= 64 MiB
+   const std::size_t budget = compute_memory_budget();
+
+   // Or honor a user override (in bytes); 0 falls back to auto-detect
+   const std::size_t budget_user =
+       compute_memory_budget(/*user_override_bytes=*/4ULL << 30);
+
+   // Per-file expansion factor + sample probing yields a per-file peak
+   const std::size_t per_file =
+       estimate_per_file_bytes(file_sizes_in_bytes);
+
+   // Derive channel capacity and per-flush batch size
+   const std::size_t cap =
+       compute_channel_capacity(budget, estimated_batch_bytes, num_workers);
+   const std::size_t batch =
+       compute_file_batch_size(budget, per_file, /*min_files=*/4);
+
+The Python ``TraceReader`` exposes the same control as a ``memory_budget``
+keyword on its streaming iterators (``iter_lines``, ``iter_lines_json``,
+``iter_raw``, ``iter_arrow``). Passing ``0`` keeps the auto-detect default;
+passing a positive integer caps the in-flight bytes across the underlying
+``Channel<T>`` instances.
+
+``flush_every_files`` for Batched Index Writes
+----------------------------------------------
+
+``dftracer_organize`` exposes the underlying batched-index control via
+``--memory-budget-mb``: the binary derives a ``flush_every_files`` value
+from the budget and feeds it to ``IndexBuildBatchConfig``. Each batch of
+``flush_every_files`` files is fully indexed and flushed before the next
+batch begins, capping peak memory regardless of trace count.
+
+When constructing an ``IndexBuildBatchConfig`` directly from C++:
+
+.. code-block:: cpp
+
+   auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+   batch_config->file_paths        = files;
+   batch_config->index_dir         = index_dir;
+   batch_config->checkpoint_size   = checkpoint_size;
+   batch_config->parallelism       = executor_threads;
+   batch_config->flush_every_files = compute_file_batch_size(
+       compute_memory_budget(),
+       estimate_per_file_bytes(file_sizes),
+       /*min_files=*/4);
+
+A ``flush_every_files`` of ``0`` (the default) disables sub-batching and
+processes every file in one shot, which is fastest for small inputs but
+not memory-safe at scale.
+
 API Reference
 -------------
 
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index 11cbee48..99b3e5e5 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -18,6 +18,10 @@ The most common use case is reading trace files:
    # Open a compressed trace file (auto-detects index sidecar)
    reader = TraceReader("trace.pfw.gz")
 
+   # ...or pass a directory; TraceReader scans for .pfw / .pfw.gz files
+   # and streams them transparently as a single logical input.
+   reader = TraceReader("./traces")
+
    # Read all lines
    lines = reader.read_lines()
    for line in lines:
diff --git a/docs/source/server.rst b/docs/source/server.rst
index ccd82385..947df27a 100644
--- a/docs/source/server.rst
+++ b/docs/source/server.rst
@@ -410,6 +410,17 @@ The server uses coroutine-based concurrency to handle multiple simultaneous requ
 
 Event filtering streams through bloom indexes and partial reads, minimizing memory usage. Both ``/api/v1/events`` and ``/api/v1/events/stream`` use chunked transfer encoding with iovec scatter-gather I/O, streaming NDJSON results without buffering the full response in memory.
 
+**Client Receive Timeouts:**
+
+The streaming endpoints (``/api/v1/events``, ``/api/v1/events/stream``,
+``/api/v1/viz/events``) use HTTP/1.1 chunked transfer encoding and can hold
+a connection open while the server is still scanning chunks before any
+bytes are emitted. Clients should set a receive timeout of at least
+**15 seconds** (the timeout used by the bundled integration tests, raised
+from 2 s in earlier builds) to accommodate the worst-case index-warmup
+path; the server itself does not impose a global request timeout
+(``with_global_timeout(0)``).
+
 **Query Optimization:**
 
 - Use narrow time ranges in ``/api/v1/viz/events`` queries
diff --git a/docs/source/utilities/common.rst b/docs/source/utilities/common.rst
index 7c71e953..3b571b29 100644
--- a/docs/source/utilities/common.rst
+++ b/docs/source/utilities/common.rst
@@ -7,26 +7,26 @@ statistics collection, and Arrow data interchange.
 JSON
 ----
 
-Lightweight zero-cost wrapper around `yyjson <https://github.com/ibireme/yyjson>`_ for lazy JSON evaluation.
+JSON parsing uses `simdjson <https://github.com/simdjson/simdjson>`_ exclusively (DOM and On-Demand APIs). ``JsonValue`` is a lightweight wrapper around ``simdjson::dom::element``; ``JsonParser`` exposes the On-Demand API for zero-copy lazy field access.
 
 .. code-block:: cpp
 
    #include <dftracer/utils/utilities/common/json/json.h>
-   #include <dftracer/utils/utilities/common/json/json_doc_guard.h>
+   #include <dftracer/utils/utilities/common/json/json_value.h>
+   #include <dftracer/utils/utilities/common/json/parser.h>
 
 JsonValue
 ~~~~~~~~~
 
-Non-owning view over parsed JSON data with fluent navigation and type-safe accessors.
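+Wrapper over ``simdjson::dom::element`` with fluent navigation and type-safe accessors. Non-owning: only valid while the backing ``simdjson::dom::document`` is alive (for ``parser.parse()`` results, while the ``simdjson::dom::parser`` is alive and until its next ``parse()`` call).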
 
 **Parse and navigate:**
 
 .. code-block:: cpp
 
-   yyjson_doc* doc = yyjson_read(json_str.c_str(), json_str.size(), 0);
-   JsonDocGuard guard(doc);  // RAII cleanup
-
-   JsonValue root(yyjson_doc_get_root(doc));
+   simdjson::dom::parser parser;
+   simdjson::dom::element doc = parser.parse(json_str);
+   JsonValue root(doc);
 
    // Fluent navigation with defaults
    std::string name = root["metadata"]["name"].get<std::string>("unknown");
@@ -61,24 +61,43 @@ Non-owning view over parsed JSON data with fluent navigation and type-safe acces
    if (val.is_array())  { /* ... */ }
    if (val.exists())    { /* not null */ }
 
-.. warning::
-
-   ``JsonValue`` is a non-owning view. It is only valid while the ``yyjson_doc`` is alive. Use ``JsonDocGuard`` for RAII lifetime management.
-
 JsonDocGuard
 ~~~~~~~~~~~~
 
-RAII guard for ``yyjson_doc*`` to prevent leaks on exceptions or early coroutine returns.
+RAII helper that owns a ``simdjson::dom::parser``; ``parse(data, len)`` reuses
+the parser buffer and ``root()`` returns the parsed element. Use it at
+short-lived parse sites; prefer ``StringJsonParserUtility`` when the
+document must outlive a ``co_await`` boundary.
 
 .. code-block:: cpp
 
-   {
-       yyjson_doc* doc = yyjson_read(data, len, 0);
-       JsonDocGuard guard(doc);
-
-       JsonValue root(yyjson_doc_get_root(doc));
+   JsonDocGuard guard;
+   if (guard.parse(data, len)) {
+       JsonValue root(guard.root());
        // ... use root ...
-   }  // guard destructor frees doc
+   }
+
+JsonParser (On-Demand)
+~~~~~~~~~~~~~~~~~~~~~~
+
+On-Demand parser for zero-copy lazy field access. Reuses an internal padded
+buffer across rows; ``string_view`` results are valid until the next
+``parse()`` call. Used by the indexing visitors and ``TraceReader::read_json``.
+
+.. code-block:: cpp
+
+   JsonParser parser;
+
+   for (auto& line : input_lines) {
+       if (!parser.parse(line)) continue;
+       auto name = parser.get_string("name");
+       auto ts   = parser.get_int64("ts");
+
+       parser.for_each_field("args", [](std::string_view k,
+                                        simdjson::ondemand::value v) {
+           // process nested fields
+       });
+   }
 
 StringJsonParserUtility
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -100,6 +119,44 @@ Parses JSON strings with owned document lifetime. Safe for use across ``co_await
 
    parser.reset();  // Cleanup
 
+ArgsMap and ArgsValueProxy
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Owned key/value map used for trace event ``args``. Replaces ``JsonValue`` for
+event-args storage in the DFT composites: keys are interned with the global
+:cpp:class:`dftracer::utils::StringIntern` and values are a typed
+``std::variant`` (string, int64, uint64, double, bool). ``ArgsValueProxy``
+mirrors the ``JsonValue`` accessor surface (``get<T>``, ``get_optional<T>``,
+``is_string()`` ...) so event visitors can be written generically.
+
+.. code-block:: cpp
+
+   #include <dftracer/utils/utilities/composites/dft/args_map.h>
+
+   using dftracer::utils::utilities::composites::dft::ArgsMap;
+
+   ArgsMap args;
+   args.insert("hhash", std::uint64_t{0x1234});
+   args.set_valid(true);
+
+   uint64_t h = args["hhash"].get<uint64_t>(0);
+   args.for_each_member([](std::string_view k, auto v) { /* ... */ });
+
+JsonDictValue (Python)
+~~~~~~~~~~~~~~~~~~~~~~
+
+Python-facing wrapper that exposes a parsed JSON object as a lazy
+``Mapping``. Used by the ``TraceReader`` Python binding to surface parsed
+events without materialising a ``dict`` per row. Defined in
+``src/dftracer/utils/python/json.h``.
+
+.. note::
+
+   :cpp:class:`dftracer::utils::StringIntern` was reimplemented as a
+   lock-free open-chained hash table with a fast-path id table
+   (``FAST_CAPACITY = 1<<20``). Lookups are fully lock-free; only the rare
+   first insert of a string takes the insertion mutex.
+
 Query
 -----
 
@@ -190,6 +247,7 @@ Percentile estimation and histogram utilities for trace analysis.
 
    #include <dftracer/utils/utilities/common/statistics/ddsketch.h>
    #include <dftracer/utils/utilities/common/statistics/log2_histogram.h>
+   #include <dftracer/utils/utilities/common/statistics/timestamp_histogram.h>
 
 DDSketch
 ~~~~~~~~
@@ -274,6 +332,24 @@ Fixed 65-bin logarithmic histogram covering the ``uint64_t`` range. Bin 0 holds
    std::string json = a.to_json();
    Log2Histogram restored = Log2Histogram::from_json(json);
 
+TimestampHistogram
+~~~~~~~~~~~~~~~~~~
+
+Sparse fixed-width (100 ms) histogram over event timestamps. Used by the
+chunk pruner to compute time-range selectivity and to weight sub-bucket
+expansions for adaptive aggregation.
+
+.. code-block:: cpp
+
+   TimestampHistogram th;
+   for (auto ts_us : timestamps) th.add(ts_us);
+
+   std::uint64_t in_window = th.count_in_range(ts_lo, ts_hi);
+   double sel = th.selectivity(ts_lo, ts_hi);
+
+   auto bytes = th.serialize();
+   auto restored = TimestampHistogram::deserialize(bytes.data(), bytes.size());
+
 Arrow
 -----
 
diff --git a/docs/source/utilities/composites.rst b/docs/source/utilities/composites.rst
index 4e1ea430..35e0736a 100644
--- a/docs/source/utilities/composites.rst
+++ b/docs/source/utilities/composites.rst
@@ -502,10 +502,44 @@ Complete example of gathering statistics from a DFTracer trace file:
     std::cout << "Duration p99: " << stats.merged.duration_sketch.quantile(0.99)
               << " us" << std::endl;
 
+DFT Event Pipeline
+------------------
+
+DftEventDispatcher
+~~~~~~~~~~~~~~~~~~
+
+Adapter that turns a list of ``DftEventVisitor`` instances into a single
+``IndexVisitor`` consumable by ``IndexBuilderUtility``. Owns a per-instance
+``JsonParser`` and parses each decompressed line once before fanning out to
+the configured visitors (``BloomVisitor``, ``ManifestVisitor``,
+``AggregationVisitor``, ...). Supports a ``force_serial`` mode for
+deterministic-order replays.
+
+.. code-block:: cpp
+
+    #include <dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h>
+
+    std::vector<std::unique_ptr<DftEventVisitor>> visitors;
+    visitors.push_back(std::make_unique<BloomVisitor>(...));
+    visitors.push_back(std::make_unique<ManifestVisitor>(...));
+    DftEventDispatcher dispatcher(std::move(visitors));
+
+AggregationVisitor
+~~~~~~~~~~~~~~~~~~
+
+Emits per-chunk aggregation + system-metric merge operands into the
+distributed aggregation column families. Pairs with
+``AggregationMergeOperator`` / ``SystemMetricsMergeOperator`` for
+distributed reduction; lives in
+``composites/dft/aggregators/aggregation_visitor.h``.
+
 Reorganization Pipeline
 -----------------------
 
-Parallel event routing for reorganizing traces by query-based groups.
+Parallel event routing for reorganizing traces by query-based groups. The
+``organize`` flow is a streaming pipeline that fans events through visitor
+groups, batches output, and periodically flushes group writers
+(``GroupWriterTask``) to bound peak memory.
 
 ChunkWriter
 ~~~~~~~~~~~
@@ -562,6 +596,15 @@ Tracks source-to-output mapping during reorganization. Records which source
 file and line produced each output event, enabling reconstruction of original
 traces from reorganized files via ``dftracer_reconstruct``.
 
+ReconstructorUtility
+~~~~~~~~~~~~~~~~~~~~
+
+Streaming reconstruction pipeline that inverts the organize pipeline: it
+plans a reconstruction over a ``.pidx`` provenance store, fans out per-source
+read tasks through coroutines and channels, and merges the results back
+into the requested output in their original order. Defined in
+``composites/dft/reorganize/reconstructor_utility.h``.
+
 Comparison
 ----------
 
diff --git a/docs/source/utilities/compression.rst b/docs/source/utilities/compression.rst
index 7d5d5ca4..23ecacf6 100644
--- a/docs/source/utilities/compression.rst
+++ b/docs/source/utilities/compression.rst
@@ -4,6 +4,21 @@ Compression
 Streaming zlib compression and decompression utilities supporting GZIP, ZLIB, and DEFLATE formats.
 All compression operates in streaming mode using zero-copy ``ByteView`` chunks.
 
+.. note::
+
+   The default gzip level used by the writer pipeline (``dftracer_aggregator``,
+   ``dftracer_organize``, parallel writers) is ``1`` (fastest); previous
+   releases used ``Z_DEFAULT_COMPRESSION`` (zlib level 6). Override per call
+   with the ``compression_level`` field on ``ManualStreamingCompressorUtility``.
+
+.. note::
+
+   The build uses zlib-ng (compat ABI) when the ``DFTRACER_USE_ZLIB_NG``
+   CMake option is ``ON`` (the default), falling back to ``madler/zlib`` if
+   zlib-ng cannot be added. The compressor sources are unchanged: the same
+   ``deflate``/``inflate`` symbols are linked against whichever backend was
+   selected at configure time.
+
 .. code-block:: cpp
 
    #include <dftracer/utils/utilities/compression/zlib/streaming_compressor_utility.h>
@@ -59,6 +74,18 @@ Yields compressed chunks as ``ByteView`` references into an internal buffer.
    std::size_t bytes_out = compressor.total_bytes_out();
    double ratio = compressor.compression_ratio();
 
+Buffered Compression
+--------------------
+
+Writer pipelines (parallel writer, perfetto trace writer, organize group
+writers) buffer compressed payloads and flush at a configurable
+``flush_threshold``. The threshold is computed by
+``compute_writer_sizing()`` from the detected filesystem layout
+(``LayoutInfo``): on Lustre/GPFS the threshold is sized to the PFS stripe
+so each compressed flush fits one stripe; on a local FS it is
+``max(default, stripe_size)``. Buffer capacity is always
+``flush_threshold + buffer_headroom``.
+
 StreamingDecompressorUtility
 ----------------------------
 
diff --git a/docs/source/utilities/fileio.rst b/docs/source/utilities/fileio.rst
index 01e0db80..2d673601 100644
--- a/docs/source/utilities/fileio.rst
+++ b/docs/source/utilities/fileio.rst
@@ -248,6 +248,68 @@ Read lines from ``.gz`` files without building an index, using streaming decompr
        process(*line);
    }
 
+Parallel Writers
+----------------
+
+Layout-aware parallel writers for multi-worker output. The ``ParallelWriter``
+interface is implemented by three concrete layouts under
+``fileio/parallel/``:
+
+- **StripedWriter** — single output file, atomic-offset ``pwrite`` per
+  worker. Used on local FS and PFS without padded stripes.
+- **PaddedStripedWriter** — single output file where each worker chunk is
+  padded to a full PFS stripe so per-stripe writes never cross workers.
+  Recommended for Lustre/GPFS when the stripe size is at least
+  ``MIN_PADDED_STRIPE_BYTES`` (1 MiB).
+- **ShardedWriter** — N output files, one per worker, glob-named by
+  ordinal. Used on NFS where atomic-offset ``pwrite`` is not reliable.
+
+.. code-block:: cpp
+
+    #include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+    #include <dftracer/utils/utilities/fileio/parallel/layout.h>
+
+    using namespace dftracer::utils::utilities::fileio::parallel;
+
+    auto info   = detect_layout("/lustre/.../output.pfw.gz");
+    auto sizing = compute_writer_sizing(info, /*baseline_workers=*/64,
+                                        /*default_flush=*/4 << 20,
+                                        /*headroom=*/1 << 20,
+                                        /*padded=*/true);
+
+    WriterConfig cfg{
+        .layout = info.layout,
+        .stripe_size = info.stripe_size,
+        .gzip = true,
+    };
+    auto writer = make_writer(cfg);
+    co_await writer->open("output.pfw.gz", sizing.num_workers,
+                          /*gzip_extension=*/true, scope);
+
+    co_await writer->write_header(header_bytes);
+    co_await writer->write_chunk(worker_id, chunk_bytes);
+    auto member = writer->last_member(worker_id);  // offset+length of the gzip member
+    co_await writer->write_footer(footer_bytes);
+    co_await writer->close();
+
+The writer collects per-chunk ``MemberSpan`` entries (offset + length of
+each independently decompressible gzip member) and exposes them via
+``member_layout()`` after close. ``shard_base_offsets()`` remaps shard-local
+offsets to merged-file offsets for sharded layouts.
+
+Layout detection (``detect_layout``) classifies a path's filesystem as
+Lustre, GPFS, BeeGFS, NFS, or LOCAL and picks ``SHARDED`` on NFS,
+``STRIPED`` elsewhere; ``compute_writer_sizing`` caps worker count at the
+PFS stripe count and sets ``flush_threshold`` to the stripe size for
+padded layouts so each compressed flush coalesces into one stripe.
+
+.. note::
+
+    Compressor generators consumed by the parallel writer are wrapped in
+    smart pointers (``std::unique_ptr<ManualStreamingCompressorUtility>``)
+    so they can be moved across coroutine frames without leaking the
+    underlying zlib stream.
+
 Async vs Synchronous
 --------------------
 
diff --git a/docs/source/utilities/indexer.rst b/docs/source/utilities/indexer.rst
index 6306ecd8..60e8bf42 100644
--- a/docs/source/utilities/indexer.rst
+++ b/docs/source/utilities/indexer.rst
@@ -1,7 +1,7 @@
 Indexer
 =================
 
-Unified indexing and reading infrastructure for compressed trace files. Builds sidecar ``.idx`` files that enable efficient random access, bloom-filter-accelerated queries, and event-level manifest routing — all in a single decompression pass.
+Unified indexing and reading infrastructure for compressed trace files. Builds a sidecar ``.dftindex`` RocksDB store (and optional flat-file SSTs) that enables efficient random access, bloom-filter-accelerated queries, event-level manifest routing, and distributed aggregation, all from a single decompression pass.
 
 .. code-block:: cpp
 
@@ -11,19 +11,25 @@ Unified indexing and reading infrastructure for compressed trace files. Builds s
 Overview
 --------
 
-The indexer builds sidecar ``.idx`` files with an additive SQLite schema:
+The indexer writes column families into a shared ``.dftindex`` RocksDB store
+(or, for distributed builds, a content-addressed SST staging directory that
+is ingested into the store):
 
 - **Checkpoints** — byte offsets and decompression dictionaries for random access
 - **Bloom filters** — per-chunk bloom filters for fast event filtering (optional)
 - **Chunk statistics** — per-chunk event counts, duration distributions (optional)
-- **Manifest** — per-chunk event-to-line routing for reorganization (optional)
+- **Manifest** — per-chunk (cat, name) -> line numbers for sparse query routing (optional)
+- **Aggregation / system metrics** — distributed aggregation CFs populated via
+  ``SstFileWriter::Merge`` operands
 
-A separate ``.pidx`` file stores provenance data for reorganization tracking.
+SST files staged on disk are **content-addressed** (FNV-1a 64-bit fingerprint
+over the SST payload) so identical SSTs produced by different ranks collapse
+to a single ingest, and re-ingesting is idempotent. String IDs in the
+``names`` and ``cats`` CFs are deterministic FNV-1a hashes so the same name
+maps to the same id across processes.
 
-Sidecar files:
-
-- ``.idx`` — Unified content index (checkpoints + bloom filters + chunk statistics + manifest)
-- ``.pidx`` — Provenance index (reorganization tracking)
+A separate ``.pidx`` provenance store tracks source-to-output mapping for
+reorganized files.
 
 IndexBuilder
 ------------
@@ -64,6 +70,54 @@ Single-pass index builder. Decompresses each file once and builds all requested
    // Later: all features present, skips entirely
    co_await builder.process(config2);  // "Skipping already-indexed file"
 
+IndexBatchBuilderUtility
+------------------------
+
+Indexes many files in a single pipelined pass. Parses files in parallel
+(``parallelism`` workers) and routes their parsed artifacts (bloom rows,
+manifest entries, aggregation merge operands, extra-visitor SSTs) to a
+write phase. Supports batched flushing (``flush_every_files``) to bound
+peak memory, distributed SST sinks via ``sink_factory`` / ``sink_commit``,
+preassigned file ids, and per-file gzip-member slicing for cross-rank file
+splitting (the MPI driver pre-scans each ``.pfw.gz`` for member boundaries
+and assigns disjoint ``[member_begin, member_end)`` ranges to ranks).
+
+.. code-block:: cpp
+
+   #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
+
+   IndexBuildBatchConfig cfg;
+   cfg.file_paths = {"a.pfw.gz", "b.pfw.gz", "c.pfw.gz"};
+   cfg.index_dir = "/data/.dftindex";
+   cfg.parallelism = 16;
+   cfg.build_manifest = true;
+   cfg.use_batch_write = true;
+   cfg.rebuild_root_summaries = true;
+   cfg.flush_every_files = 8;
+
+   auto batch = co_await IndexBatchBuilderUtility::process(scope,
+       std::make_shared<IndexBuildBatchConfig>(std::move(cfg)));
+
+IndexDatabaseWriterContext
+--------------------------
+
+Implements ``IndexBatchSink`` over a coordinator-owned RocksDB store: each
+batch's parsed artifacts are buffered, then committed atomically via
+``WriteBatch``. ``IndexDatabaseSstWriterContext`` is the SST-staging
+variant used by the distributed indexer; its outputs are content-addressed
+SST files later ingested into the coordinator store.
+
+IndexResolverUtility
+--------------------
+
+Resolves the index directory for a given trace file, building the index on
+demand when ``auto_build_index`` is set. Lives in
+``composites/dft/indexing/`` because it depends on the DFT visitor set.
+
+.. code-block:: cpp
+
+   #include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
+
 IndexDatabase
 -------------
 
@@ -192,10 +246,17 @@ Interface for processing decompressed lines during index building. Implementatio
        virtual void finalize(IndexDatabase& db, int file_id) = 0;
    };
 
-Built-in visitors:
-
-- **BloomVisitor** — parses JSON events, populates bloom filters and chunk statistics
-- **ManifestVisitor** — tracks (category, name) to line number mappings per checkpoint
+Built-in event visitors live under ``composites/dft/`` (they extend
+``DftEventVisitor`` and are wrapped by ``DftEventDispatcher``, which
+implements ``IndexVisitor``):
+
+- **BloomVisitor** (``composites/dft/visitors/bloom_visitor.h``) - parses
+  JSON events, populates bloom filters and chunk statistics
+- **ManifestVisitor** (``composites/dft/visitors/manifest_visitor.h``) -
+  tracks (category, name) -> line numbers per checkpoint for sparse query
+  acceleration
+- **AggregationVisitor** (``composites/dft/aggregators/aggregation_visitor.h``) -
+  emits per-chunk aggregation and system-metric merge operands
 
 Low-level IndexerFactory
 ------------------------
diff --git a/docs/source/utilities/reader.rst b/docs/source/utilities/reader.rst
index 4bd39946..b8695e37 100644
--- a/docs/source/utilities/reader.rst
+++ b/docs/source/utilities/reader.rst
@@ -15,7 +15,39 @@ Streaming reader for compressed trace files with support for line-based and byte
 Overview
 --------
 
-The reader provides random access into indexed compressed files (``.pfw.gz`` + ``.idx``). It supports multiple stream types for different access patterns and both synchronous and asynchronous reads.
+The reader provides random access into indexed compressed files
+(``.pfw.gz`` + ``.dftindex``). It supports multiple stream types for
+different access patterns and both synchronous and asynchronous reads.
+
+The high-level :cpp:class:`dftracer::utils::utilities::reader::TraceReader`
+exposes:
+
+- **Directory input** (Python binding): when constructed with a directory,
+  all matching ``.pfw.gz`` files share one ``.dftindex`` root and are
+  processed in parallel (each file becomes one or more checkpoint-level
+  work items routed across the runtime thread pool).
+- **JSON streaming** (``read_json``): each line is parsed once with a
+  reused ``simdjson`` ondemand ``JsonParser``; the yielded ``JsonLine``
+  borrows the parser until the next ``next()`` call.
+- **Arrow streaming** (``read_arrow``, Python ``iter_arrow_stream``):
+  yields native ``ArrowExportResult`` record batches sized at
+  ``batch_size`` rows. The Python binding exposes this as an Arrow C
+  Data Interface stream (no Python-side row materialisation).
+- **Query filtering**: an optional ``query`` DSL string is compiled into
+  AND-of-EQ probes when possible. The compiled probes evaluate directly
+  against simdjson fields, with a uniform-match shortcut when every
+  candidate chunk fully matches the predicate (no per-event re-evaluation).
+- **Line-range work items**: the dispatcher splits a file's checkpoints
+  into independent line-range work items that the runtime executes in
+  parallel; ``ReadConfig::skip_pruning`` lets the dispatcher avoid
+  re-running the chunk pruner per work item.
+- **Batch chunk pruning**: a single pruner pass per file feeds all work
+  items, using ``ChunkPrunerUtility`` over bloom filters, chunk
+  statistics, and the manifest CF.
+- **flatten_objects**: when set, top-level JSON object values
+  (e.g. ``args``) are expanded one level into ``parent.child`` columns
+  with native Arrow types; deeper nesting still round-trips as a JSON
+  text column.
 
 ReaderFactory
 -------------
diff --git a/docs/source/utilities/replay.rst b/docs/source/utilities/replay.rst
index 44556984..0302d170 100644
--- a/docs/source/utilities/replay.rst
+++ b/docs/source/utilities/replay.rst
@@ -3,6 +3,18 @@ Replay
 
 The replay utility replays DFTracer trace files by reading recorded events and executing them in a configurable replay mode. It supports plain text and gzipped traces, dry-run analysis, timing-aware replay, and filtered execution for focused testing.
 
+.. note::
+
+   The engine is now pipelined with C++20 coroutines and channels: trace
+   reading, JSON parsing, filtering, and execution run as concurrent stages
+   communicating through bounded channels, so a slow executor no longer
+   blocks the reader. JSON parsing uses the shared
+   :cpp:class:`dftracer::utils::utilities::common::json::JsonParser`
+   (on-demand simdjson) which reuses one padded buffer per stage. String
+   handling and file I/O have been re-tuned with a fixed read buffer and
+   ``string_view`` line slicing; the public ``ReplayEngine`` /
+   ``ReplayConfig`` / ``ReplayResult`` API is unchanged.
+
 .. code-block:: cpp
 
    #include <dftracer/utils/utilities/replay/replay.h>
diff --git a/examples/call_tree_example1.cpp b/examples/call_tree_example1.cpp
index 490be316..b5544c75 100644
--- a/examples/call_tree_example1.cpp
+++ b/examples/call_tree_example1.cpp
@@ -4,9 +4,19 @@
  */
 
 #include <dftracer/utils/call_tree/call_tree.h>
+#include <dftracer/utils/call_tree/internal/call_tree.h>
+#include <dftracer/utils/call_tree/mpi/serializable.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
+
 #include <cstdio>
 
 using namespace dftracer::utils::call_tree;
+using dftracer::utils::CoroScope;
+using dftracer::utils::Pipeline;
+using dftracer::utils::make_task;
+namespace coro = dftracer::utils::coro;
 
 int main(int argc, char* argv[]) {
     printf("=== CallTree API Example 1: Basic Usage ===\n");
@@ -59,30 +69,29 @@ int main(int argc, char* argv[]) {
     }
     printf("\n");
     
-    // Save to file
-    printf("Step 6: Serialize call tree to binary file\n");
-    std::string output_file = tree.get_output_path();
-    printf("  Default output path: %s\n", output_file.c_str());
-    
-    if (tree.save_to_file()) {
-        printf("  Successfully saved!\n");
-    }
-    printf("\n");
-    
-    // Save to JSON format
-    printf("Step 7: Serialize call tree to JSON (Chrome Tracing format)\n");
-    if (tree.save_to_json()) {
-        printf("  Successfully saved to JSON!\n");
+    // Step 6: persist via the coroutine save APIs driven by a Pipeline.
+    printf("Step 6: Save call tree (custom binary + Arrow IPC)\n");
+    const std::string bin_path = "nodes-1_calltree.bin";
+    const std::string arrow_path = "nodes-1_calltree.arrow";
+    bool bin_ok = false, arrow_ok = false;
+    {
+        Pipeline pipeline;
+        auto save = make_task(
+            [&](CoroScope& scope) -> coro::CoroTask<void> {
+                bin_ok = co_await save_binary(&scope, tree.internal_tree(),
+                                              bin_path);
+                arrow_ok = co_await save_arrow(&scope, tree.internal_tree(),
+                                               arrow_path);
+            },
+            "save_call_tree");
+        pipeline.set_source(save);
+        pipeline.set_destination(save);
+        pipeline.execute();
     }
-    printf("\n");
-    
-    // Print tree to text file
-    printf("Step 8: Export call tree to text file\n");
-    std::string text_file = "nodes-1_calltree.txt";
-    if (tree.print_depth_first_to_file(text_file)) {
-        printf("  Exported to: %s\n", text_file.c_str());
-    }
-    
+    printf("  Binary: %s -> %s\n", bin_path.c_str(), bin_ok ? "ok" : "failed");
+    printf("  Arrow:  %s -> %s\n", arrow_path.c_str(),
+           arrow_ok ? "ok" : "failed");
+
     printf("\n=== Example completed successfully ===\n");
     
     return 0;
diff --git a/examples/call_tree_example2.cpp b/examples/call_tree_example2.cpp
index ab3bdc70..0b4fb4d7 100644
--- a/examples/call_tree_example2.cpp
+++ b/examples/call_tree_example2.cpp
@@ -4,10 +4,20 @@
  */
 
 #include <dftracer/utils/call_tree/call_tree.h>
+#include <dftracer/utils/call_tree/internal/call_tree.h>
+#include <dftracer/utils/call_tree/mpi/serializable.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
+
 #include <cstdio>
 #include <map>
 
 using namespace dftracer::utils::call_tree;
+using dftracer::utils::CoroScope;
+using dftracer::utils::Pipeline;
+using dftracer::utils::make_task;
+namespace coro = dftracer::utils::coro;
 
 int main(int argc, char* argv[]) {
     printf("=== CallTree API Example 2: Multi-Node Traces ===\n");
@@ -84,29 +94,29 @@ int main(int argc, char* argv[]) {
     }
     printf("\n");
     
-    // Save outputs
     printf("--- Saving Outputs ---\n");
-    
-    // Set custom output path
-    tree.set_output_path("nodes-4_calltree.bin");
-    
-    if (tree.save_to_file()) {
-        printf("Binary format saved: nodes-4_calltree.bin\n");
+    const std::string bin_path = "nodes-4_calltree.bin";
+    const std::string arrow_path = "nodes-4_calltree.arrow";
+    bool bin_ok = false, arrow_ok = false;
+    {
+        Pipeline pipeline;
+        auto save = make_task(
+            [&](CoroScope& scope) -> coro::CoroTask<void> {
+                bin_ok = co_await save_binary(&scope, tree.internal_tree(),
+                                              bin_path);
+                arrow_ok = co_await save_arrow(&scope, tree.internal_tree(),
+                                               arrow_path);
+            },
+            "save_call_tree");
+        pipeline.set_source(save);
+        pipeline.set_destination(save);
+        pipeline.execute();
     }
-    
-    // Save to JSON (Chrome Tracing format)
-    if (tree.save_to_json("nodes-4_calltree.pfw")) {
-        printf("JSON format saved: nodes-4_calltree.pfw (Chrome Tracing compatible)\n");
-    }
-    
-    if (tree.print_depth_first_to_file("nodes-4_calltree_full.txt", 0)) {
-        printf("Full tree saved: nodes-4_calltree_full.txt\n");
-    }
-    
-    if (tree.print_depth_first_to_file("nodes-4_calltree_summary.txt", 2)) {
-        printf("Summary (2 levels) saved: nodes-4_calltree_summary.txt\n");
-    }
-    
+    printf("Binary format: %s -> %s\n", bin_path.c_str(),
+           bin_ok ? "saved" : "failed");
+    printf("Arrow IPC:     %s -> %s\n", arrow_path.c_str(),
+           arrow_ok ? "saved" : "failed");
+
     printf("\n=== Example completed successfully ===\n");
     
     return 0;
diff --git a/examples/call_tree_example3.cpp b/examples/call_tree_example3.cpp
index c2cfb385..d6fd704a 100644
--- a/examples/call_tree_example3.cpp
+++ b/examples/call_tree_example3.cpp
@@ -4,12 +4,22 @@
  */
 
 #include <dftracer/utils/call_tree/call_tree.h>
+#include <dftracer/utils/call_tree/internal/call_tree.h>
+#include <dftracer/utils/call_tree/mpi/serializable.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
+
 #include <cstdio>
 #include <algorithm>
 #include <numeric>
 #include <map>
 
 using namespace dftracer::utils::call_tree;
+using dftracer::utils::CoroScope;
+using dftracer::utils::Pipeline;
+using dftracer::utils::make_task;
+namespace coro = dftracer::utils::coro;
 
 static void analyze_call_patterns(const std::vector<CallTreeNodeInfo>& nodes) {
     printf("\n--- Call Pattern Analysis ---\n");
@@ -122,12 +132,26 @@ int main(int argc, char* argv[]) {
     // Also print the built-in statistics
     tree.print_statistics();
     
-    // Save analysis results in JSON format for downstream processing
+    // Save analysis results. For Chrome Tracing JSON use the
+    // dftracer_call_tree binary; for fast C++ round-trip use save_binary;
+    // for Arrow tooling use save_arrow. Both run inside a Pipeline.
     printf("\nSaving analysis results...\n");
-    if (tree.save_to_json("analysis_output.pfw")) {
-        printf("✓ JSON output saved to: analysis_output.pfw\n");
-        printf("  This file can be imported into Chrome Tracing, Perfetto,\n");
-        printf("  or analyzed with DFAnalyzer tools.\n");
+    bool arrow_ok = false;
+    {
+        Pipeline pipeline;
+        auto save = make_task(
+            [&](CoroScope& scope) -> coro::CoroTask<void> {
+                arrow_ok = co_await save_arrow(&scope, tree.internal_tree(),
+                                               "analysis_output.arrow");
+            },
+            "save_call_tree");
+        pipeline.set_source(save);
+        pipeline.set_destination(save);
+        pipeline.execute();
+    }
+    if (arrow_ok) {
+        printf("Arrow IPC output saved to: analysis_output.arrow\n");
+        printf("  Readable by pyarrow / polars / dfanalyzer.\n");
     }
     
     printf("\n=== Analysis complete ===\n");
diff --git a/flake.lock b/flake.lock
deleted file mode 100644
index d9bd5088..00000000
--- a/flake.lock
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-  "nodes": {
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1758690382,
-        "narHash": "sha256-NY3kSorgqE5LMm1LqNwGne3ZLMF2/ILgLpFr1fS4X3o=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "e643668fd71b949c53f8626614b21ff71a07379d",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "root": {
-      "inputs": {
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
-}
diff --git a/flake.nix b/flake.nix
deleted file mode 100644
index d0f8d887..00000000
--- a/flake.nix
+++ /dev/null
@@ -1,54 +0,0 @@
-{
-  description = "DFTracer Utilities";
-
-  inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
-
-  outputs = { self, nixpkgs }:
-  let
-    systems = [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" "aarch64-darwin" ];
-    forAllSystems = f:
-      nixpkgs.lib.genAttrs systems (system:
-        f system (import nixpkgs { inherit system; }));
-  in
-  {
-    devShells = forAllSystems (system: pkgs:
-      let
-        gcc = pkgs.gcc14;
-      in {
-        default = pkgs.mkShell {
-          packages = [ gcc ] ++ (with pkgs; [
-            cmake
-            ninja
-            pkg-config
-            pigz
-            lcov
-            openmpi
-            cmake-format
-            doxygen
-            graphviz
-            # valgrind-light
-            # sqlite
-            # zlib
-            # spdlog
-            gnutar
-            python311
-            python312
-            (python310.withPackages (p: [
-              p.cython
-              p.setuptools
-              p.wheel
-              p.venvShellHook
-            ]))
-          ]);
-
-          CC = "gcc";
-          CXX = "g++";
-          shellHook = ''
-            export CC=gcc
-            export CXX=g++
-            unset SOURCE_DATE_EPOCH
-          '';
-        };
-      });
-  };
-}
diff --git a/include/dftracer/utils/call_tree/call_tree.h b/include/dftracer/utils/call_tree/call_tree.h
index 8e88f915..cc79e6b2 100644
--- a/include/dftracer/utils/call_tree/call_tree.h
+++ b/include/dftracer/utils/call_tree/call_tree.h
@@ -62,8 +62,9 @@ struct CallTreeStats {
 
 // Forward declarations
 namespace internal {
+class CallTree;
 class CallTreeImpl;
-}
+}  // namespace internal
 
 /**
  * @brief Simple, clean API for working with call trees from DFTracer traces.
@@ -114,65 +115,11 @@ class CallTree {
      */
     bool generate();
 
-    /**
-     * Print the call tree in depth-first order to stdout
-     * @param max_depth Maximum depth to print (0 = unlimited)
-     */
+    /// Print depth-first tree to stdout. max_depth=0 means unlimited.
     void print_depth_first(int max_depth = 0) const;
 
-    /**
-     * Print the call tree to a file in depth-first order
-     * @param filename Output file path
-     * @param max_depth Maximum depth to print (0 = unlimited)
-     * @return true if successful, false otherwise
-     */
-    bool print_depth_first_to_file(const std::string& filename,
-                                   int max_depth = 0) const;
-
-    /**
-     * Get list of nodes in depth-first traversal order
-     * Returns simple node info structures (no complex internals)
-     * @return Vector of node information structures
-     */
     std::vector<CallTreeNodeInfo> get_nodes_depth_first() const;
 
-    /**
-     * Get the path where serialized tree would be saved
-     * @return Default output path based on input directory
-     */
-    std::string get_output_path() const;
-
-    /**
-     * Set custom output path for serialization
-     * @param path Custom output file path
-     */
-    void set_output_path(const std::string& path);
-
-    /**
-     * Serialize and save call tree to file in binary format
-     * @param filename Output file path (optional, uses get_output_path() if
-     * empty)
-     * @return true if successful, false otherwise
-     */
-    bool save_to_file(const std::string& filename = "") const;
-
-    /**
-     * Serialize and save call tree to file in JSON format (Chrome
-     * Tracing/Perfetto compatible) Follows DFTracer serialization format for
-     * compatibility with existing analysis tools
-     * @param filename Output file path (optional, uses get_output_path() with
-     * .pfw extension if empty)
-     * @return true if successful, false otherwise
-     */
-    bool save_to_json(const std::string& filename = "") const;
-
-    /**
-     * Load call tree from previously saved file
-     * @param filename Input file path
-     * @return true if successful, false otherwise
-     */
-    bool load_from_file(const std::string& filename);
-
     /**
      * Get aggregate statistics about the call tree
      * @return Statistics structure with aggregate information
@@ -236,6 +183,12 @@ class CallTree {
      */
     CallTreeNodeInfo get_node_by_id(std::uint64_t id) const;
 
+    /// Direct access to the underlying internal::CallTree. Use with the
+    /// save_binary / save_arrow coroutines in mpi/serializable.h. Returns a
+    /// reference; callers must keep the CallTree alive while it's in use.
+    internal::CallTree& internal_tree();
+    const internal::CallTree& internal_tree() const;
+
    private:
     std::unique_ptr<internal::CallTreeImpl> impl_;
 };
diff --git a/include/dftracer/utils/call_tree/call_tree_mpi.h b/include/dftracer/utils/call_tree/call_tree_mpi.h
index 8f112a3a..7f2b1d90 100644
--- a/include/dftracer/utils/call_tree/call_tree_mpi.h
+++ b/include/dftracer/utils/call_tree/call_tree_mpi.h
@@ -1,39 +1,16 @@
 #ifndef DFTRACER_UTILS_CALL_TREE_MPI_H
 #define DFTRACER_UTILS_CALL_TREE_MPI_H
 
-/**
- * @file call_tree_mpi.h
- * @brief Umbrella header for MPI-parallel call tree components
- *
- * This header provides convenient access to all MPI-related call tree
- * functionality. Individual components can also be included separately
- * from the mpi/ subdirectory for finer-grained control.
- *
- * Components included:
- * - PIDIndexInfo: PID index information structure
- * - SerializableCallNode/ProcessGraph: Serializable structures for MPI transfer
- * - MPICallTreeConfig/Result: Configuration and result structures
- * - CallGraphFileHeader: File header for call graph serialization
- * - MPICallTreeBuilder: Main builder class for MPI-parallel call graph
- * generation
- * - MPIFilteredTraceReader: Filtered trace reader for specific PIDs
- * - CallTreeBuildTask: Pipeline task for call tree building
- * - serialization utilities: Read/write primitives for MPI serialization
- */
+// MPI call-tree umbrella header. The engine is the coroutine-driven
+// MPICallTreeBuilder; older per-component headers (file_header, build_task,
+// filtered_reader, pid_index_info, serialization) were removed when the
+// build/gather phases moved to ParallelWriter + merge_shards on Chrome Tracing
+// JSON output. mpi/serializable.h still exists; include it directly.
 
-// Include all MPI call tree components
-#include <dftracer/utils/call_tree/mpi/build_task.h>
-#include <dftracer/utils/call_tree/mpi/builder.h>
-#include <dftracer/utils/call_tree/mpi/config.h>
-#include <dftracer/utils/call_tree/mpi/file_header.h>
-#include <dftracer/utils/call_tree/mpi/filtered_reader.h>
-#include <dftracer/utils/call_tree/mpi/pid_index_info.h>
-#include <dftracer/utils/call_tree/mpi/serializable.h>
-#include <dftracer/utils/call_tree/mpi/serialization.h>
-
-// Include specific internal call tree components needed by MPI
 #include <dftracer/utils/call_tree/internal/call_tree.h>
 #include <dftracer/utils/call_tree/internal/process_call_tree.h>
 #include <dftracer/utils/call_tree/internal/process_key.h>
+#include <dftracer/utils/call_tree/mpi/builder.h>
+#include <dftracer/utils/call_tree/mpi/config.h>
 
 #endif  // DFTRACER_UTILS_CALL_TREE_MPI_H
diff --git a/include/dftracer/utils/call_tree/internal/call_tree.h b/include/dftracer/utils/call_tree/internal/call_tree.h
index 35511ab9..791388f4 100644
--- a/include/dftracer/utils/call_tree/internal/call_tree.h
+++ b/include/dftracer/utils/call_tree/internal/call_tree.h
@@ -116,6 +116,12 @@ class CallTree {
      */
     void add_call(const ProcessKey& key, std::shared_ptr<CallTreeNode> call);
 
+    // Moves every ProcessCallTree out of `other` into this tree. When both
+    // sides share a ProcessKey, calls/call_sequence from `other` are appended.
+    // `other` is left empty; intended for joining per-file CallTree fragments
+    // built concurrently into a single merged tree.
+    void merge_from(CallTree&& other);
+
     /**
      * Build parent-child relationships after all traces loaded
      * Called by TraceReader after all data is loaded
diff --git a/include/dftracer/utils/call_tree/internal/factory.h b/include/dftracer/utils/call_tree/internal/factory.h
index 8598b02d..3e3cdf37 100644
--- a/include/dftracer/utils/call_tree/internal/factory.h
+++ b/include/dftracer/utils/call_tree/internal/factory.h
@@ -5,8 +5,7 @@
 
 #include <cstdint>
 #include <memory>
-#include <string>
-#include <unordered_map>
+#include <string_view>
 #include <vector>
 
 namespace dftracer::utils::call_tree {
@@ -46,10 +45,12 @@ class CallTreeFactory {
      * Create a new CallTreeNode from trace event data
      * The factory manages the lifecycle of created nodes
      */
-    std::shared_ptr<CallTreeNode> create_node(
-        std::uint64_t id, const std::string& name, const std::string& category,
-        std::uint64_t start_time, std::uint64_t duration, int level,
-        const std::unordered_map<std::string, std::string>& args = {});
+    std::shared_ptr<CallTreeNode> create_node(std::uint64_t id,
+                                              std::string_view name,
+                                              std::string_view category,
+                                              std::uint64_t start_time,
+                                              std::uint64_t duration, int level,
+                                              ArgsMap args = {});
 
     /**
      * Get total number of nodes created by this factory
diff --git a/include/dftracer/utils/call_tree/internal/node.h b/include/dftracer/utils/call_tree/internal/node.h
index 69b9c800..440d6e09 100644
--- a/include/dftracer/utils/call_tree/internal/node.h
+++ b/include/dftracer/utils/call_tree/internal/node.h
@@ -1,102 +1,68 @@
 #ifndef DFTRACER_UTILS_CALL_TREE_INTERNAL_NODE_H
 #define DFTRACER_UTILS_CALL_TREE_INTERNAL_NODE_H
 
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
+
 #include <cstdint>
 #include <string>
-#include <unordered_map>
+#include <string_view>
 #include <vector>
 
 namespace dftracer::utils::call_tree {
 namespace internal {
 
-/**
- * CallTreeNode - Represents a single function call in the trace
- * Follows initialization pattern:
- * 1. Constructor: Initialize internal variables, pointers to defaults (no
- * allocation)
- * 2. initialize(): Initialize state and perform allocations
- * 3. cleanup(): Deallocate memory and clean up state
- * 4. Destructor: Clear all state and reset variables
- */
+using ArgsMap = dftracer::utils::utilities::composites::dft::ArgsMap;
+
+// name_ and category_ are non-owning views into a process-wide StringIntern
+// pool. ArgsMap interns its own keys.
 class CallTreeNode {
    public:
-    /**
-     * Constructor - initializes internal variables and pointers to defaults
-     * No memory allocation or recursion
-     */
     CallTreeNode();
-
-    /**
-     * Parameterized constructor for setting basic properties
-     */
-    CallTreeNode(std::uint64_t id, const std::string& name,
-                 const std::string& category);
-
-    /**
-     * Destructor - clears all state of variables and resets them
-     */
+    CallTreeNode(std::uint64_t id, std::string_view name,
+                 std::string_view category);
     ~CallTreeNode();
 
-    // Disable copy operations to prevent unintended copies
     CallTreeNode(const CallTreeNode&) = delete;
     CallTreeNode& operator=(const CallTreeNode&) = delete;
 
-    // Enable move operations for efficient transfers
     CallTreeNode(CallTreeNode&& other) noexcept;
     CallTreeNode& operator=(CallTreeNode&& other) noexcept;
 
-    /**
-     * Initialize the state of class private variables and allocations
-     * Called after constructor to set up the node with specific values
-     */
-    void initialize(std::uint64_t id, const std::string& name,
-                    const std::string& category, std::uint64_t start_time,
+    void initialize(std::uint64_t id, std::string_view name,
+                    std::string_view category, std::uint64_t start_time,
                     std::uint64_t duration, int level);
 
-    /**
-     * Cleanup - deallocates memory and cleans up state
-     * Called only at the end, ensures no memory leaks
-     */
     void cleanup();
 
-    // Getters
     std::uint64_t get_id() const { return id_; }
-    const std::string& get_name() const { return name_; }
-    const std::string& get_category() const { return category_; }
+    std::string_view get_name() const { return name_; }
+    std::string_view get_category() const { return category_; }
     std::uint64_t get_start_time() const { return start_time_; }
     std::uint64_t get_duration() const { return duration_; }
     int get_level() const { return level_; }
     std::uint64_t get_parent_id() const { return parent_id_; }
-    const std::unordered_map<std::string, std::string>& get_args() const {
-        return args_;
-    }
+    const ArgsMap& get_args() const { return args_; }
+    ArgsMap& mut_args() { return args_; }
     const std::vector<std::uint64_t>& get_children() const { return children_; }
 
-    // Setters
     void set_parent_id(std::uint64_t parent_id) { parent_id_ = parent_id; }
     void add_child(std::uint64_t child_id) { children_.push_back(child_id); }
-    void add_arg(const std::string& key, const std::string& value) {
-        args_[key] = value;
-    }
-    void set_args(const std::unordered_map<std::string, std::string>& args) {
-        args_ = args;
-    }
+    void set_args(ArgsMap args) { args_ = std::move(args); }
 
    private:
     std::uint64_t id_;
-    std::string name_;
-    std::string category_;
+    std::string_view name_;
+    std::string_view category_;
     std::uint64_t start_time_;
     std::uint64_t duration_;
     int level_;
     std::uint64_t parent_id_;
-    std::unordered_map<std::string, std::string> args_;
+    ArgsMap args_;
     std::vector<std::uint64_t> children_;
     bool initialized_;
     bool cleaned_up_;
 };
 
-// Keep FunctionCall as alias for backward compatibility
 using FunctionCall = CallTreeNode;
 
 }  // namespace internal
diff --git a/include/dftracer/utils/call_tree/internal/trace_reader.h b/include/dftracer/utils/call_tree/internal/trace_reader.h
index 80af58e7..1f858e58 100644
--- a/include/dftracer/utils/call_tree/internal/trace_reader.h
+++ b/include/dftracer/utils/call_tree/internal/trace_reader.h
@@ -1,82 +1,56 @@
 #ifndef DFTRACER_UTILS_CALL_TREE_INTERNAL_TRACE_READER_H
 #define DFTRACER_UTILS_CALL_TREE_INTERNAL_TRACE_READER_H
 
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/common/json/parser.h>
+
+#include <cstddef>
+#include <cstdint>
 #include <functional>
+#include <set>
 #include <string>
 #include <vector>
 
 namespace dftracer::utils::call_tree {
 namespace internal {
 
-// Forward declaration
 class CallTree;
 
-/**
- * Callback function type for processing traces
- * Returns true to continue processing, false to stop
- */
 using TraceCallback = std::function<bool(const std::string& json_line)>;
 
-/**
- * TraceReader - Handles reading and parsing trace files
- * Separates I/O concerns from the CallTree data structure
- * Supports reading from single files, multiple files, or directories
- */
 class TraceReader {
    public:
     TraceReader() = default;
     ~TraceReader() = default;
 
-    /**
-     * Read trace file and populate call graph
-     * @param trace_file Path to trace log file
-     * @param graph CallTree to populate
-     * @return true if successful, false otherwise
-     */
     bool read(const std::string& trace_file, CallTree& graph);
-
-    /**
-     * Read multiple trace files and populate call graph
-     * Each file may contain traces from different nodes/processes
-     * @param trace_files Vector of paths to trace files
-     * @param graph CallTree to populate
-     * @return true if all files read successfully, false otherwise
-     */
     bool read_multiple(const std::vector<std::string>& trace_files,
                        CallTree& graph);
-
-    /**
-     * Read all trace files matching pattern from a directory
-     * @param directory Path to directory containing trace files
-     * @param pattern Glob pattern for trace files (e.g., "*.pfw")
-     * @param graph CallTree to populate
-     * @return true if successful, false otherwise
-     */
     bool read_directory(const std::string& directory,
                         const std::string& pattern, CallTree& graph);
 
-    /**
-     * Process a single JSON trace line
-     * Made public for MPI-based filtered readers
-     * @param line JSON line from trace file
-     * @param graph CallTree to add data to
-     * @return true if successful, false otherwise
-     */
+    bool process_trace_line(
+        dftracer::utils::utilities::common::json::JsonParser& parser,
+        CallTree& graph);
     bool process_trace_line(const std::string& line, CallTree& graph);
+};
 
-   private:
-    /**
-     * Detect file format and use appropriate reader
-     * Returns true if read with Reader API, false if need fallback
-     */
-    bool read_with_reader(const std::string& trace_file, CallTree& graph);
-
-    /**
-     * Fallback to direct file reading for plain text files
-     */
-    bool read_direct(const std::string& trace_file, CallTree& graph);
+struct ReadCounts {
+    std::size_t processed = 0;
+    std::size_t filtered = 0;
 };
 
+// allowed_pids == nullptr disables filtering.
+ReadCounts read_trace_file(
+    const std::string& trace_file, CallTree& graph,
+    const std::set<std::uint32_t>* allowed_pids = nullptr);
+
+// Coroutine entry point. Drives utilities::reader::TraceReader::read_json
+// inline so callers can fan out via CoroScope::spawn over multiple files.
+coro::CoroTask<ReadCounts> read_trace_file_async(
+    std::string trace_file, CallTree* graph,
+    const std::set<std::uint32_t>* allowed_pids = nullptr);
+
 }  // namespace internal
 }  // namespace dftracer::utils::call_tree
 
diff --git a/include/dftracer/utils/call_tree/json_serializer.h b/include/dftracer/utils/call_tree/json_serializer.h
index 74821370..34af60c2 100644
--- a/include/dftracer/utils/call_tree/json_serializer.h
+++ b/include/dftracer/utils/call_tree/json_serializer.h
@@ -7,7 +7,6 @@
 #include <cstdint>
 #include <sstream>
 #include <string>
-#include <unordered_map>
 
 namespace dftracer::utils::call_tree {
 namespace internal {
@@ -88,9 +87,7 @@ class JsonSerializer {
      * @param stream Output string stream
      * @return True if metadata was present, false otherwise
      */
-    bool convert_args_to_json(
-        const std::unordered_map<std::string, std::string>& args,
-        std::stringstream& stream);
+    bool convert_args_to_json(const ArgsMap& args, std::stringstream& stream);
 
     std::string hostname_hash_;
 };
diff --git a/include/dftracer/utils/call_tree/mpi/build_task.h b/include/dftracer/utils/call_tree/mpi/build_task.h
deleted file mode 100644
index a9a44c65..00000000
--- a/include/dftracer/utils/call_tree/mpi/build_task.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef DFTRACER_UTILS_CALL_TREE_MPI_BUILD_TASK_H
-#define DFTRACER_UTILS_CALL_TREE_MPI_BUILD_TASK_H
-
-/**
- * @file build_task.h
- * @brief Pipeline-based call graph builder task
- */
-
-#include <dftracer/utils/call_tree/internal/call_tree.h>
-#include <dftracer/utils/call_tree/internal/process_call_tree.h>
-
-#include <cstdint>
-#include <set>
-#include <string>
-#include <vector>
-
-namespace dftracer::utils::call_tree {
-
-/**
- * Pipeline-based call graph builder task
- * Input: vector of trace files
- * Output: internal::ProcessCallTree for assigned PIDs
- */
-struct CallTreeBuildTask {
-    std::set<std::uint32_t> pids;
-    std::vector<std::string> trace_files;
-
-    internal::ProcessCallTree execute(internal::CallTree& tree);
-};
-
-}  // namespace dftracer::utils::call_tree
-
-#endif  // DFTRACER_UTILS_CALL_TREE_MPI_BUILD_TASK_H
diff --git a/include/dftracer/utils/call_tree/mpi/builder.h b/include/dftracer/utils/call_tree/mpi/builder.h
index 2ea69e1f..791d0936 100644
--- a/include/dftracer/utils/call_tree/mpi/builder.h
+++ b/include/dftracer/utils/call_tree/mpi/builder.h
@@ -1,21 +1,14 @@
 #ifndef DFTRACER_UTILS_CALL_TREE_MPI_BUILDER_H
 #define DFTRACER_UTILS_CALL_TREE_MPI_BUILDER_H
 
-/**
- * @file builder.h
- * @brief MPICallTreeBuilder - Main class for MPI-parallel call graph generation
- */
-
 #include <dftracer/utils/call_tree/internal/call_tree.h>
 #include <dftracer/utils/call_tree/internal/process_call_tree.h>
+#include <dftracer/utils/call_tree/internal/process_key.h>
 #include <dftracer/utils/call_tree/mpi/config.h>
-#include <dftracer/utils/call_tree/mpi/pid_index_info.h>
-#include <dftracer/utils/call_tree/mpi/serializable.h>
-#include <dftracer/utils/core/mpi/mpi_utils.h>
-#include <dftracer/utils/utilities/indexer/internal/indexer.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
 
 #include <cstdint>
-#include <map>
 #include <memory>
 #include <set>
 #include <string>
@@ -23,172 +16,66 @@
 
 namespace dftracer::utils::call_tree {
 
-// Type alias for indexer shared pointer
-using IndexerPtr =
-    std::shared_ptr<dftracer::utils::utilities::indexer::internal::Indexer>;
-
-/**
- * MPICallTreeBuilder - Main class for MPI-parallel call graph generation
- *
- * Usage:
- *   1. Create builder with config
- *   2. Call discover_pids() to find all PIDs in trace files
- *   3. Call build() to generate call graphs in parallel
- *   4. Call gather() to collect all graphs to all ranks (all-to-all)
- *   5. Call save() to write to file
- *
- * Follows initialization pattern:
- * 1. Constructor: Initialize internal variables (no allocation)
- * 2. initialize(): Set up MPI, index files, discover PIDs
- * 3. build(): Generate call graphs using pipeline
- * 4. gather(): All-to-all MPI communication
- * 5. save()/load(): File I/O
- * 6. cleanup(): Deallocate memory
- */
+// MPI-parallel call tree engine. Each method is a coroutine driven by the
+// caller's pipeline; this class owns no Pipeline of its own. Phases follow
+// the dftracer_aggregator_mpi pattern:
+//
+//   discover_pids : cooperative PID pre-scan + allgather + round-robin
+//                   assign so each pid is owned by exactly one rank.
+//   build         : per-file CoroScope fan-out, pid-filtered ingest,
+//                   merge of per-file fragments into a local tree.
+//   hierarchy     : per-process CoroScope fan-out (each PID lives on
+//                   one rank so no cross-rank dependency).
+//   write         : per-rank Chrome Tracing JSON shard via sharded
+//                   ParallelWriter (io_backend driven).
+//   merge         : rank 0 concatenates shards into the final output
+//                   via fileio::parallel::merge_shards.
 class MPICallTreeBuilder {
    public:
-    /**
-     * Constructor - initializes with configuration
-     */
     explicit MPICallTreeBuilder(const MPICallTreeConfig& config);
-
-    /**
-     * Destructor
-     */
     ~MPICallTreeBuilder();
 
-    // Disable copy
     MPICallTreeBuilder(const MPICallTreeBuilder&) = delete;
     MPICallTreeBuilder& operator=(const MPICallTreeBuilder&) = delete;
-
-    // Enable move
     MPICallTreeBuilder(MPICallTreeBuilder&&) noexcept;
     MPICallTreeBuilder& operator=(MPICallTreeBuilder&&) noexcept;
 
-    /**
-     * Initialize MPI and internal structures
-     * Must be called after MPI_Init
-     */
-    void initialize();
-
-    /**
-     * Cleanup and release resources
-     */
-    void cleanup();
-
-    /**
-     * Add trace files to process
-     * @param files Vector of file paths
-     */
     void add_trace_files(const std::vector<std::string>& files);
-
-    /**
-     * Add trace files from directory
-     * @param directory Path to directory
-     * @param pattern File pattern (e.g., "*.pfw.gz")
-     */
     void add_trace_directory(const std::string& directory,
                              const std::string& pattern = "*.pfw.gz");
 
-    /**
-     * Phase 1: Discover all PIDs and build index
-     * Each MPI rank discovers PIDs from the trace files
-     * Results are gathered and PIDs are distributed
-     * @return Map of PID to index info
-     */
-    std::map<std::uint32_t, PIDIndexInfo> discover_pids();
-
-    /**
-     * Phase 2: Build call graphs for assigned PIDs
-     * Uses pipeline for parallel processing within rank
-     * @return Result containing success status and statistics
-     */
-    MPICallGraphResult build();
-
-    /**
-     * Phase 3: All-to-all communication to share graphs
-     * After this, all ranks have identical copies of all call graphs
-     * @return true if successful
-     */
-    bool gather();
-
-    /**
-     * Save the global call graph to file
-     * @param filename Output file path
-     * @return true if successful
-     */
-    bool save(const std::string& filename) const;
+    coro::CoroTask<bool> discover_pids(CoroScope* scope);
+    coro::CoroTask<bool> build(CoroScope* scope);
+    coro::CoroTask<bool> hierarchy(CoroScope* scope);
+    coro::CoroTask<bool> write(CoroScope* scope, std::string output_path,
+                               std::string staging_dir, bool gzip);
+    coro::CoroTask<bool> merge(std::string output_path, std::string staging_dir,
+                               bool gzip, bool keep_staging);
 
-    /**
-     * Load call tree from file (static method)
-     * @param filename Input file path
-     * @return Loaded call tree or nullptr on error
-     */
-    static std::unique_ptr<internal::CallTree> load(
-        const std::string& filename);
+    int rank() const { return rank_; }
+    int world_size() const { return world_size_; }
 
-    /**
-     * Get the generated call tree
-     * @return Reference to the call tree
-     */
-    internal::CallTree& get_call_tree() { return *call_tree_; }
-    const internal::CallTree& get_call_tree() const { return *call_tree_; }
-
-    /**
-     * Get MPI rank (delegates to MPIUtils singleton)
-     */
-    int get_rank() const { return mpi::MPIUtils::instance().get_rank(); }
-
-    /**
-     * Get MPI world size (delegates to MPIUtils singleton)
-     */
-    int get_world_size() const {
-        return mpi::MPIUtils::instance().get_world_size();
-    }
-
-    /**
-     * Get PIDs assigned to this rank
-     */
-    const std::set<std::uint32_t>& get_assigned_pids() const {
+    const std::vector<std::string>& trace_files() const { return trace_files_; }
+    const std::set<std::uint32_t>& all_pids() const { return all_pids_; }
+    const std::set<std::uint32_t>& assigned_pids() const {
         return assigned_pids_;
     }
-
-    /**
-     * Print summary statistics
-     */
-    void print_summary() const;
+    internal::CallTree& local_tree() { return *call_tree_; }
+    const internal::CallTree& local_tree() const { return *call_tree_; }
 
    private:
     MPICallTreeConfig config_;
     std::unique_ptr<internal::CallTree> call_tree_;
 
-    // File tracking
-    std::vector<std::string> trace_files_;
-    std::map<std::string, IndexerPtr> indexers_;
+    int rank_ = 0;
+    int world_size_ = 1;
 
-    // PID management
-    std::map<std::uint32_t, PIDIndexInfo> pid_index_map_;
+    std::vector<std::string> trace_files_;
+    std::set<std::uint32_t> all_pids_;
     std::set<std::uint32_t> assigned_pids_;
-    std::vector<std::uint32_t> all_pids_;
-
-    // State flags
-    bool initialized_ = false;
-    bool pids_discovered_ = false;
-    bool graphs_built_ = false;
-    bool graphs_gathered_ = false;
-
-    // Internal methods
-    void create_indexer(const std::string& trace_file);
-    std::set<std::uint32_t> scan_file_for_pids(const std::string& trace_file);
-    bool read_traces_for_pids(const std::vector<std::string>& files,
-                              const std::set<std::uint32_t>& pids);
-    SerializableProcessGraph convert_to_serializable(
-        const internal::ProcessCallTree& graph) const;
-    void merge_from_serializable(const SerializableProcessGraph& serializable);
+    std::vector<internal::ProcessKey> my_process_keys_;
 
-    // Internal MPI helpers
-    void distribute_pids();
-    bool alltoall_graphs();
+    std::string my_shard_path_;
 };
 
 }  // namespace dftracer::utils::call_tree
diff --git a/include/dftracer/utils/call_tree/mpi/file_header.h b/include/dftracer/utils/call_tree/mpi/file_header.h
deleted file mode 100644
index fbf459f1..00000000
--- a/include/dftracer/utils/call_tree/mpi/file_header.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef DFTRACER_UTILS_CALL_TREE_MPI_FILE_HEADER_H
-#define DFTRACER_UTILS_CALL_TREE_MPI_FILE_HEADER_H
-
-/**
- * @file file_header.h
- * @brief File header structure for persisted call graph files
- */
-
-#include <cstdint>
-#include <cstring>
-
-namespace dftracer::utils::call_tree {
-
-/**
- * File header for persisted call graph
- */
-struct CallGraphFileHeader {
-    static constexpr char MAGIC[8] = {'D', 'F', 'T', 'C', 'G', 'R', 'P', 'H'};
-    static constexpr std::uint32_t VERSION = 1;
-
-    char magic[8];
-    std::uint32_t version;
-    std::uint32_t num_process_graphs;
-    std::uint64_t data_offset;
-    std::uint64_t total_events;
-
-    CallGraphFileHeader()
-        : version(VERSION),
-          num_process_graphs(0),
-          data_offset(0),
-          total_events(0) {
-        std::memcpy(magic, MAGIC, sizeof(MAGIC));
-    }
-
-    bool is_valid() const {
-        return std::memcmp(magic, MAGIC, sizeof(MAGIC)) == 0 &&
-               version == VERSION;
-    }
-};
-
-}  // namespace dftracer::utils::call_tree
-
-#endif  // DFTRACER_UTILS_CALL_TREE_MPI_FILE_HEADER_H
diff --git a/include/dftracer/utils/call_tree/mpi/filtered_reader.h b/include/dftracer/utils/call_tree/mpi/filtered_reader.h
deleted file mode 100644
index 249db978..00000000
--- a/include/dftracer/utils/call_tree/mpi/filtered_reader.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef DFTRACER_UTILS_CALL_TREE_MPI_FILTERED_READER_H
-#define DFTRACER_UTILS_CALL_TREE_MPI_FILTERED_READER_H
-
-/**
- * @file filtered_reader.h
- * @brief Filtered trace reader that only processes events for specific PIDs
- */
-
-#include <dftracer/utils/call_tree/internal/call_tree.h>
-
-#include <cstdint>
-#include <set>
-#include <string>
-#include <vector>
-
-namespace dftracer::utils::call_tree {
-
-/**
- * Filtered trace reader that only processes events for specific PIDs
- * Uses the indexer to efficiently skip to relevant sections
- */
-class MPIFilteredTraceReader {
-   public:
-    explicit MPIFilteredTraceReader(
-        const std::set<std::uint32_t>& allowed_pids);
-
-    /**
-     * Read trace file and populate call graph
-     * Only processes events for allowed PIDs
-     */
-    bool read(const std::string& trace_file, internal::CallTree& graph);
-
-    /**
-     * Read with indexer for efficient access
-     */
-    bool read_with_indexer(const std::string& trace_file,
-                           const std::string& index_file,
-                           internal::CallTree& graph);
-
-    /**
-     * Read multiple files
-     */
-    bool read_multiple(const std::vector<std::string>& trace_files,
-                       internal::CallTree& graph);
-
-    /**
-     * Get count of processed events
-     */
-    std::size_t get_processed_count() const { return processed_count_; }
-
-    /**
-     * Get count of filtered (skipped) events
-     */
-    std::size_t get_filtered_count() const { return filtered_count_; }
-
-   private:
-    std::set<std::uint32_t> allowed_pids_;
-    std::size_t processed_count_ = 0;
-    std::size_t filtered_count_ = 0;
-};
-
-}  // namespace dftracer::utils::call_tree
-
-#endif  // DFTRACER_UTILS_CALL_TREE_MPI_FILTERED_READER_H
diff --git a/include/dftracer/utils/call_tree/mpi/pid_index_info.h b/include/dftracer/utils/call_tree/mpi/pid_index_info.h
deleted file mode 100644
index 0539744c..00000000
--- a/include/dftracer/utils/call_tree/mpi/pid_index_info.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef DFTRACER_UTILS_CALL_TREE_MPI_PID_INDEX_INFO_H
-#define DFTRACER_UTILS_CALL_TREE_MPI_PID_INDEX_INFO_H
-
-/**
- * @file pid_index_info.h
- * @brief Structure to hold PID index information from gzip indexer
- */
-
-#include <cstdint>
-#include <string>
-
-namespace dftracer::utils::call_tree {
-
-/**
- * Structure to hold PID index information from gzip indexer
- * Maps each PID to its starting line in the trace file
- */
-struct PIDIndexInfo {
-    std::uint32_t pid;
-    std::uint64_t start_line;
-    std::uint64_t end_line;
-    std::uint64_t event_count;
-    std::string source_file;
-
-    PIDIndexInfo() : pid(0), start_line(0), end_line(0), event_count(0) {}
-    PIDIndexInfo(std::uint32_t p, std::uint64_t sl, std::uint64_t el,
-                 std::uint64_t ec, const std::string& sf)
-        : pid(p),
-          start_line(sl),
-          end_line(el),
-          event_count(ec),
-          source_file(sf) {}
-};
-
-}  // namespace dftracer::utils::call_tree
-
-#endif  // DFTRACER_UTILS_CALL_TREE_MPI_PID_INDEX_INFO_H
diff --git a/include/dftracer/utils/call_tree/mpi/serializable.h b/include/dftracer/utils/call_tree/mpi/serializable.h
index 8b1ed600..5e0f69a2 100644
--- a/include/dftracer/utils/call_tree/mpi/serializable.h
+++ b/include/dftracer/utils/call_tree/mpi/serializable.h
@@ -1,53 +1,44 @@
 #ifndef DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZABLE_H
 #define DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZABLE_H
 
-/**
- * @file serializable.h
- * @brief Serializable structures for MPI transfer of call graph data
- */
-
-#include <dftracer/utils/call_tree/internal/process_key.h>
+// Two save/load formats for in-memory call trees:
+//
+//   save_binary / load_binary  -- compact custom format with a string
+//     dictionary (name/category/arg keys/string values share storage) and
+//     typed args (preserves int/uint/double/bool vs flattening to strings).
+//     Header is fixed-size; body lays out a global string table followed
+//     by ProcessCallTree records.
+//
+//   save_arrow / load_arrow    -- Arrow IPC (.arrow) with zstd buffer-level
+//     compression. Columnar layout with dictionary-encoded name/category;
+//     readable by pyarrow / polars / nanoarrow. Best for analysis tooling
+//     that already speaks Arrow.
+
+#include <dftracer/utils/call_tree/internal/call_tree.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
 
 #include <cstdint>
+#include <memory>
 #include <string>
-#include <unordered_map>
-#include <vector>
 
 namespace dftracer::utils::call_tree {
 
-/**
- * Serializable call graph node for MPI transfer
- */
-struct SerializableCallNode {
-    std::uint64_t id;
-    std::string name;
-    std::string category;
-    std::uint64_t start_time;
-    std::uint64_t duration;
-    int level;
-    std::uint64_t parent_id;
-    std::vector<std::uint64_t> children;
-    std::unordered_map<std::string, std::string> args;
-
-    // Serialization to bytes
-    std::vector<char> serialize() const;
-    static SerializableCallNode deserialize(const char* data, size_t& offset);
-};
-
-/**
- * Serializable process call graph for MPI transfer
- */
-struct SerializableProcessGraph {
-    internal::ProcessKey key;
-    std::vector<SerializableCallNode> nodes;
-    std::vector<std::uint64_t> root_calls;
-    std::vector<std::uint64_t> call_sequence;
-
-    // Serialization to bytes
-    std::vector<char> serialize() const;
-    static SerializableProcessGraph deserialize(const char* data,
-                                                size_t& offset);
-};
+inline constexpr char CALLTREE_BINARY_MAGIC[8] = {'D', 'F', 'T', 'C',
+                                                  'G', 'R', 'P', '2'};
+inline constexpr std::uint32_t CALLTREE_BINARY_VERSION = 2;
+
+coro::CoroTask<bool> save_binary(CoroScope* scope,
+                                 const internal::CallTree& tree,
+                                 std::string output_path);
+coro::CoroTask<std::unique_ptr<internal::CallTree>> load_binary(
+    CoroScope* scope, std::string input_path);
+
+coro::CoroTask<bool> save_arrow(CoroScope* scope,
+                                const internal::CallTree& tree,
+                                std::string output_path);
+coro::CoroTask<std::unique_ptr<internal::CallTree>> load_arrow(
+    CoroScope* scope, std::string input_path);
 
 }  // namespace dftracer::utils::call_tree
 
diff --git a/include/dftracer/utils/call_tree/mpi/serialization.h b/include/dftracer/utils/call_tree/mpi/serialization.h
deleted file mode 100644
index b766f9d7..00000000
--- a/include/dftracer/utils/call_tree/mpi/serialization.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZATION_H
-#define DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZATION_H
-
-/**
- * @file serialization.h
- * @brief Utility functions for serialization of MPI data
- */
-
-#include <cstdint>
-#include <string>
-#include <vector>
-
-namespace dftracer::utils::call_tree {
-namespace serialization {
-
-// Write primitives
-void write_uint32(std::vector<char>& buffer, std::uint32_t value);
-void write_uint64(std::vector<char>& buffer, std::uint64_t value);
-void write_int(std::vector<char>& buffer, int value);
-void write_string(std::vector<char>& buffer, const std::string& str);
-
-// Read primitives
-std::uint32_t read_uint32(const char* data, size_t& offset);
-std::uint64_t read_uint64(const char* data, size_t& offset);
-int read_int(const char* data, size_t& offset);
-std::string read_string(const char* data, size_t& offset);
-
-}  // namespace serialization
-}  // namespace dftracer::utils::call_tree
-
-#endif  // DFTRACER_UTILS_CALL_TREE_MPI_SERIALIZATION_H
diff --git a/include/dftracer/utils/core/common/buffer_pool.h b/include/dftracer/utils/core/common/buffer_pool.h
index d265f037..591ea3c1 100644
--- a/include/dftracer/utils/core/common/buffer_pool.h
+++ b/include/dftracer/utils/core/common/buffer_pool.h
@@ -1,11 +1,11 @@
 #ifndef DFTRACER_UTILS_CORE_COMMON_BUFFER_POOL_H
 #define DFTRACER_UTILS_CORE_COMMON_BUFFER_POOL_H
 
+#include <concurrentqueue.h>
+
 #include <cstddef>
 #include <memory>
-#include <mutex>
 #include <utility>
-#include <vector>
 
 namespace dftracer::utils {
 
@@ -24,10 +24,6 @@ struct NoOpReset {
 /**
  * @brief Thread-safe typed buffer pool. Zero allocations after warmup.
  *
- * Buffers are never dropped. Released buffers are always kept for reuse.
- * The init factory is only called when the pool is empty (during warmup
- * or under unexpected load).
- *
  * @tparam T Buffer type. Must support move semantics.
  */
 template <typename T>
@@ -38,27 +34,19 @@ class BufferPool {
     virtual void release(T buf) = 0;
 };
 
-/**
- * @brief Concrete buffer pool with typed Init and Reset callables.
- *
- * Init and Reset are stored by value to avoid std::function overhead.
- */
 template <typename T, typename Init, typename Reset = DefaultReset>
 class BufferPoolImpl : public BufferPool<T> {
    public:
     BufferPoolImpl(std::size_t capacity, Init init, Reset reset = Reset{})
-        : init_(std::move(init)), reset_(std::move(reset)) {
-        pool_.reserve(capacity);
+        : queue_(capacity), init_(std::move(init)), reset_(std::move(reset)) {
         for (std::size_t i = 0; i < capacity; ++i) {
-            pool_.push_back(init_());
+            queue_.enqueue(init_());
         }
     }
 
     T acquire() override {
-        std::lock_guard<std::mutex> lock(mu_);
-        if (!pool_.empty()) {
-            T item = std::move(pool_.back());
-            pool_.pop_back();
+        T item;
+        if (queue_.try_dequeue(item)) {
             return item;
         }
         return init_();
@@ -66,13 +54,11 @@ class BufferPoolImpl : public BufferPool<T> {
 
     void release(T buf) override {
         reset_(buf);
-        std::lock_guard<std::mutex> lock(mu_);
-        pool_.push_back(std::move(buf));
+        queue_.enqueue(std::move(buf));
     }
 
    private:
-    std::mutex mu_;
-    std::vector<T> pool_;
+    moodycamel::ConcurrentQueue<T> queue_;
     Init init_;
     Reset reset_;
 };
diff --git a/include/dftracer/utils/core/common/constants.h b/include/dftracer/utils/core/common/constants.h
index 72e27dcf..44233c08 100644
--- a/include/dftracer/utils/core/common/constants.h
+++ b/include/dftracer/utils/core/common/constants.h
@@ -19,8 +19,6 @@ static constexpr std::size_t INFLATE_BUFFER_SIZE = 262144;  // 256KB
 #endif
 static constexpr std::uint64_t DEFAULT_CHECKPOINT_SIZE =
     32 * 1024 * 1024;  // 32MB
-static constexpr std::size_t DEFAULT_INDEX_SIZE_THRESHOLD =
-    1 * 1024 * 1024;   // 1MB
 extern const char* const& SQL_SCHEMA;
 inline const char* EXTENSION = ".dftindex";
 }  // namespace indexer
@@ -41,7 +39,6 @@ static constexpr std::size_t FILE_IO_BUFFER_SIZE =
 #define DFTRACER_UTILS_ZLIB_WINDOW_SIZE 32768
 #define DFTRACER_UTILS_ZLIB_GZIP_WINDOW_BITS 31
 #define DFTRACER_UTILS_DEFAULT_CHECKPOINT_SIZE (32 * 1024 * 1024)
-#define DFTRACER_UTILS_DEFAULT_INDEX_SIZE_THRESHOLD (1 * 1024 * 1024)
 #define DFTRACER_UTILS_DEFAULT_BUFFER_SIZE 65536
 #define DFTRACER_UTILS_SKIP_BUFFER_SIZE 131072
 #define DFTRACER_UTILS_FILE_IO_BUFFER_SIZE 262144
diff --git a/include/dftracer/utils/core/common/memory_budget.h b/include/dftracer/utils/core/common/memory_budget.h
new file mode 100644
index 00000000..beca660a
--- /dev/null
+++ b/include/dftracer/utils/core/common/memory_budget.h
@@ -0,0 +1,32 @@
+#ifndef DFTRACER_UTILS_CORE_COMMON_MEMORY_BUDGET_H
+#define DFTRACER_UTILS_CORE_COMMON_MEMORY_BUDGET_H
+
+#include <cstddef>
+#include <vector>
+
+namespace dftracer::utils {
+
+static constexpr std::size_t DEFAULT_MEMORY_BUDGET_FRACTION_PERCENT = 50;
+static constexpr std::size_t MIN_MEMORY_BUDGET_BYTES = 64 * 1024 * 1024;
+
+static constexpr std::size_t PER_FILE_EXPANSION_FACTOR = 24;
+static constexpr std::size_t MIN_PER_FILE_PEAK_BYTES = 64ULL * 1024 * 1024;
+static constexpr std::size_t MAX_PER_FILE_PEAK_BYTES =
+    16ULL * 1024 * 1024 * 1024;
+static constexpr std::size_t PER_FILE_SAMPLE_LIMIT = 1024;
+
+std::size_t detect_available_memory();
+std::size_t compute_memory_budget(std::size_t user_override_bytes = 0);
+std::size_t compute_channel_capacity(std::size_t memory_budget_bytes,
+                                     std::size_t estimated_batch_bytes,
+                                     std::size_t num_workers);
+std::size_t compute_file_batch_size(std::size_t memory_budget_bytes,
+                                    std::size_t estimated_file_bytes,
+                                    std::size_t min_files = 4);
+
+std::size_t estimate_per_file_bytes(const std::vector<std::size_t>& file_sizes,
+                                    std::size_t user_override_bytes = 0);
+
+}  // namespace dftracer::utils
+
+#endif  // DFTRACER_UTILS_CORE_COMMON_MEMORY_BUDGET_H
diff --git a/include/dftracer/utils/core/common/object_pool.h b/include/dftracer/utils/core/common/object_pool.h
index 8ceefa0e..d0de1157 100644
--- a/include/dftracer/utils/core/common/object_pool.h
+++ b/include/dftracer/utils/core/common/object_pool.h
@@ -7,28 +7,11 @@
 #include <atomic>
 #include <cstddef>
 #include <cstdint>
-#include <mutex>
 #include <new>
-#include <unordered_map>
 
 namespace dftracer::utils {
 
-/**
- * @brief Lock-free LIFO stack (Treiber stack) with ABA-safe tagged pointers.
- *
- * Intrusive: the `next` pointer is stored in the first 8 bytes of the
- * block itself (valid since blocks are at least sizeof(void*) bytes).
- *
- * ABA protection: x86-64 uses 48-bit virtual addresses. A 16-bit
- * generation counter is packed into the upper bits of a 64-bit atomic.
- *
- * Reference: Treiber, R.K. (1986) "Systems Programming: Coping with
- * Parallelism", IBM Technical Report.
- */
 class TreiberStack {
-    // Intrusive next-pointer stored in the first sizeof(void*) bytes of
-    // freed blocks. Accessed via atomic_ref so TSAN can track the
-    // happens-before relationship through the CAS on head_.
     static void store_next(void* block, void* next) noexcept {
         std::atomic_ref<void*>(*reinterpret_cast<void**>(block))
             .store(next, std::memory_order_release);
@@ -81,7 +64,6 @@ class TreiberStack {
 
     static void* unpack_ptr(std::uint64_t packed) noexcept {
         auto raw = static_cast<std::uintptr_t>(packed & PTR_MASK);
-        // Sign-extend bit 47 for canonical x86-64 addresses
         if (raw & (1ULL << 47)) {
             raw |= ~PTR_MASK;
         }
@@ -93,18 +75,6 @@ class TreiberStack {
     }
 };
 
-/**
- * @brief Thread-safe, lock-free object pool with size-bucketed freelists.
- *
- * Uses TreiberStack (LIFO) per size class. After warmup, allocations are
- * zero-malloc: freed blocks are recycled immediately.
- *
- * Usage:
- * @code
- *   void* p = ObjectPool::instance().allocate(256);
- *   ObjectPool::instance().deallocate(p, 256);
- * @endcode
- */
 class ObjectPool {
    public:
     static ObjectPool& instance() {
@@ -113,15 +83,20 @@ class ObjectPool {
     }
 
     void* allocate(std::size_t size) {
-        auto& stack = get_stack(size);
-        void* block = stack.pop();
+        auto* stack = get_stack(size);
+        if (!stack) return ::operator new(size);
+        void* block = stack->pop();
         if (block) return block;
         return ::operator new(size);
     }
 
     void deallocate(void* block, std::size_t size) {
-        auto& stack = get_stack(size);
-        stack.push(block);
+        auto* stack = get_stack(size);
+        if (!stack) {
+            ::operator delete(block);
+            return;
+        }
+        stack->push(block);
     }
 
     ObjectPool(const ObjectPool&) = delete;
@@ -131,16 +106,16 @@ class ObjectPool {
     ObjectPool() = default;
 
     ~ObjectPool() {
-        // Drain all fast buckets
         for (auto& stack : fast_buckets_) {
             while (void* block = stack.pop()) {
                 ::operator delete(block);
             }
         }
-        // Drain all slow buckets
-        for (auto& [_, stack] : slow_buckets_) {
-            while (void* block = stack.pop()) {
-                ::operator delete(block);
+        for (auto& slot : slow_table_) {
+            if (slot.bucket.load(std::memory_order_relaxed) != 0) {
+                while (void* block = slot.stack.pop()) {
+                    ::operator delete(block);
+                }
             }
         }
     }
@@ -149,18 +124,63 @@ class ObjectPool {
     static constexpr std::size_t MAX_FAST_SIZE = 4096;
     static constexpr std::size_t NUM_FAST_BUCKETS = MAX_FAST_SIZE / ALIGNMENT;
 
+    // Size classes above MAX_FAST_SIZE live in a fixed open-addressed table
+    // probed linearly. SLOW_TABLE_SIZE must be a power of two because
+    // SLOW_TABLE_MASK is used to wrap the probe index.
+    static constexpr std::size_t SLOW_TABLE_SIZE = 256;
+    static constexpr std::size_t SLOW_TABLE_MASK = SLOW_TABLE_SIZE - 1;
+
     std::array<TreiberStack, NUM_FAST_BUCKETS> fast_buckets_;
 
-    std::mutex slow_mutex_;
-    std::unordered_map<std::size_t, TreiberStack> slow_buckets_;
+    // One slow-path size class. `bucket == 0` marks the slot as unclaimed;
+    // find_slow_stack() claims it by CAS-ing in a non-zero bucket id.
+    struct SlowSlot {
+        std::atomic<std::size_t> bucket{0};
+        TreiberStack stack;
+    };
+    std::array<SlowSlot, SLOW_TABLE_SIZE> slow_table_;
 
-    TreiberStack& get_stack(std::size_t size) {
+    // Returns the freelist serving `size`, or nullptr when the request is
+    // not pooled (zero-sized request or slow table exhausted).
+    TreiberStack* get_stack(std::size_t size) {
         std::size_t bucket = (size + ALIGNMENT - 1) / ALIGNMENT;
         if (bucket > 0 && bucket <= NUM_FAST_BUCKETS) {
-            return fast_buckets_[bucket - 1];
+            return &fast_buckets_[bucket - 1];
+        }
+        // Bucket 0 is the "unclaimed slot" marker in slow_table_. Letting
+        // it reach find_slow_stack() would match an unclaimed slot without
+        // claiming it, so those blocks could later be handed out to the
+        // larger size class that claims the slot, and ~ObjectPool() would
+        // skip them. Leave such requests unpooled instead.
+        if (bucket == 0) return nullptr;
+        return find_slow_stack(bucket);
+    }
+
+    // Linear-probes slow_table_ for the slot owning `bucket`, claiming the
+    // first unclaimed slot on the probe path if none owns it yet. Returns
+    // nullptr once all SLOW_TABLE_SIZE slots belong to other buckets.
+    TreiberStack* find_slow_stack(std::size_t bucket) {
+        auto h = bucket;
+        for (std::size_t i = 0; i < SLOW_TABLE_SIZE; ++i) {
+            auto idx = (h + i) & SLOW_TABLE_MASK;
+            auto& slot = slow_table_[idx];
+            auto existing = slot.bucket.load(std::memory_order_acquire);
+            if (existing == bucket) return &slot.stack;
+            if (existing == 0) {
+                std::size_t expected = 0;
+                if (slot.bucket.compare_exchange_strong(
+                        expected, bucket, std::memory_order_release,
+                        std::memory_order_acquire)) {
+                    return &slot.stack;
+                }
+                // Lost the race: reuse the slot only if the winner claimed
+                // it for the same bucket, otherwise keep probing.
+                if (slot.bucket.load(std::memory_order_acquire) == bucket) {
+                    return &slot.stack;
+                }
+            }
         }
-        std::lock_guard<std::mutex> lock(slow_mutex_);
-        return slow_buckets_[bucket];
+        return nullptr;
     }
 };
 
diff --git a/include/dftracer/utils/core/common/string_intern.h b/include/dftracer/utils/core/common/string_intern.h
index 5cbfa740..976c2edc 100644
--- a/include/dftracer/utils/core/common/string_intern.h
+++ b/include/dftracer/utils/core/common/string_intern.h
@@ -1,98 +1,241 @@
 #ifndef DFTRACER_UTILS_CORE_COMMON_STRING_INTERN_H
 #define DFTRACER_UTILS_CORE_COMMON_STRING_INTERN_H
 
-#include <dftracer/utils/core/common/transparent_string_hash.h>
-
+#include <atomic>
 #include <cstdint>
+#include <functional>
+#include <memory>
 #include <mutex>
-#include <shared_mutex>
 #include <string>
 #include <string_view>
-#include <unordered_map>
-#include <vector>
 
 namespace dftracer::utils {
 
-/**
- * @brief Thread-safe string interning table.
- *
- * Stores each unique string once and returns an integer ID.
- * Lookups by string_view avoid allocation on cache hit.
- * IDs are stable for the lifetime of the table.
- *
- * Usage:
- * @code
- *   StringIntern intern;
- *   uint32_t id = intern.get_or_insert("POSIX");  // first call: stores string
- *   uint32_t id2 = intern.get_or_insert("POSIX"); // cache hit: no alloc
- *   assert(id == id2);
- *   assert(intern.resolve(id) == "POSIX");
- * @endcode
- */
 class StringIntern {
    public:
-    StringIntern() = default;
+    static constexpr std::size_t FAST_CAPACITY = 1u << 20;
+
+    StringIntern()
+        : buckets_(std::make_unique<std::atomic<Node*>[]>(BUCKET_COUNT)),
+          fast_(std::make_unique<std::atomic<const std::string*>[]>(
+              FAST_CAPACITY)) {
+        for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
+            buckets_[i].store(nullptr, std::memory_order_relaxed);
+        }
+    }
+
+    ~StringIntern() {
+        for (std::size_t i = 0; i < BUCKET_COUNT; ++i) {
+            auto* node = buckets_[i].load(std::memory_order_relaxed);
+            while (node) {
+                auto* next = node->next.load(std::memory_order_relaxed);
+                delete node;
+                node = next;
+            }
+        }
+    }
 
-    // Non-copyable, non-movable (shared_mutex is not movable)
     StringIntern(const StringIntern&) = delete;
     StringIntern& operator=(const StringIntern&) = delete;
     StringIntern(StringIntern&&) = delete;
     StringIntern& operator=(StringIntern&&) = delete;
 
-    /**
-     * @brief Intern a string. Returns its unique ID.
-     * Thread-safe. Uses shared_mutex: concurrent reads, exclusive writes.
-     * Lookups use string_view (no allocation on cache hit).
-     */
     std::uint32_t get_or_insert(std::string_view sv) {
-        // Fast path: read lock, check if already interned
-        {
-            std::shared_lock lock(mutex_);
-            auto it = str_to_id_.find(sv);
-            if (it != str_to_id_.end()) return it->second;
+        const auto h = hash(sv);
+        const auto bucket = h & BUCKET_MASK;
+
+        // Lock-free lookup
+        auto* node = buckets_[bucket].load(std::memory_order_acquire);
+        while (node) {
+            if (node->hash == h && node->str == sv) {
+                return node->id;
+            }
+            node = node->next.load(std::memory_order_acquire);
+        }
+
+        // Rare: new string; take mutex
+        std::lock_guard lock(insert_mutex_);
+
+        // Re-check under lock (another thread may have inserted)
+        node = buckets_[bucket].load(std::memory_order_acquire);
+        while (node) {
+            if (node->hash == h && node->str == sv) {
+                return node->id;
+            }
+            node = node->next.load(std::memory_order_acquire);
+        }
+
+        std::uint32_t id;
+        if (deterministic_ids_.load(std::memory_order_acquire)) {
+            // Deterministic-hash id so the same string maps to the same id
+            // across processes. Keep only the low bits of the hash (mask
+            // with FAST_CAPACITY - 1) so `resolve()` hits the fast path.
+            // Collisions (different strings -> same id) are bucket-chained
+            // on insert but `resolve(id)` returns the first-inserted string.
+            id = static_cast<std::uint32_t>(h & (FAST_CAPACITY - 1));
+        } else {
+            id = static_cast<std::uint32_t>(
+                num_strings_.load(std::memory_order_relaxed));
+        }
+        auto* new_node = new Node{std::string(sv), h, id, {}};
+        new_node->next.store(buckets_[bucket].load(std::memory_order_relaxed),
+                             std::memory_order_relaxed);
+
+        if (id < FAST_CAPACITY) {
+            // Respect "first-inserted wins" when a collision maps two
+            // different strings to the same deterministic id: only set
+            // fast_[id] if the slot is still empty.
+            const std::string* expected = nullptr;
+            fast_[id].compare_exchange_strong(expected, &new_node->str,
+                                              std::memory_order_release,
+                                              std::memory_order_relaxed);
         }
-        // Slow path: write lock, insert new string
-        std::unique_lock lock(mutex_);
-        // Double-check after acquiring write lock
-        auto [it, inserted] = str_to_id_.try_emplace(
-            std::string(sv), static_cast<std::uint32_t>(id_to_str_.size()));
-        if (inserted) {
-            id_to_str_.push_back(it->first);
+
+        // Publish to bucket; all prior stores (node fields, fast_[])
+        // are visible to readers via this release.
+        buckets_[bucket].store(new_node, std::memory_order_release);
+
+        if (!deterministic_ids_.load(std::memory_order_acquire)) {
+            // Sequential id path: advance counter past the id we just
+            // handed out so size() stays monotonic.
+            num_strings_.store(static_cast<std::size_t>(id) + 1,
+                               std::memory_order_release);
+        } else {
+            // Deterministic id path: `size()` is a weak estimate of
+            // distinct strings; bump if this id is the highest seen.
+            std::size_t cur = num_strings_.load(std::memory_order_relaxed);
+            const std::size_t need = static_cast<std::size_t>(id) + 1;
+            while (cur < need && !num_strings_.compare_exchange_weak(
+                                     cur, need, std::memory_order_release,
+                                     std::memory_order_relaxed)) {
+            }
+        }
+
+        return id;
+    }
+
+    /// Insert or look up a string at a specific id (for loading a persisted
+    /// dictionary where ids must be preserved). If the id already holds a
+    /// different string, the existing binding wins (caller error -> ignored).
+    /// Safe to call concurrently with other inserts; must be called before
+    /// any `resolve(id)` at that id.
+    void insert_at_id(std::uint32_t id, std::string_view sv) {
+        const auto h = hash(sv);
+        const auto bucket = h & BUCKET_MASK;
+
+        std::lock_guard lock(insert_mutex_);
+
+        // If the id already has a string in fast_, nothing to do.
+        if (id < FAST_CAPACITY) {
+            if (fast_[id].load(std::memory_order_acquire) != nullptr) {
+                return;
+            }
+        }
+
+        // Also avoid inserting a second node for the same string (would leave
+        // the older node referenced by the bucket chain pointing at a stale
+        // id, confusing get_or_insert which returns the first match).
+        auto* node = buckets_[bucket].load(std::memory_order_acquire);
+        while (node) {
+            if (node->hash == h && node->str == sv) {
+                // String already interned under a different id; point fast_[id]
+                // at it so resolve(id) returns something valid.
+                if (id < FAST_CAPACITY) {
+                    fast_[id].store(&node->str, std::memory_order_release);
+                }
+                if (static_cast<std::size_t>(id) + 1 >
+                    num_strings_.load(std::memory_order_relaxed)) {
+                    num_strings_.store(static_cast<std::size_t>(id) + 1,
+                                       std::memory_order_release);
+                }
+                return;
+            }
+            node = node->next.load(std::memory_order_acquire);
+        }
+
+        auto* new_node = new Node{std::string(sv), h, id, {}};
+        new_node->next.store(buckets_[bucket].load(std::memory_order_relaxed),
+                             std::memory_order_relaxed);
+
+        if (id < FAST_CAPACITY) {
+            fast_[id].store(&new_node->str, std::memory_order_release);
+        }
+
+        buckets_[bucket].store(new_node, std::memory_order_release);
+
+        // Advance num_strings_ past the highest id ever inserted so future
+        // get_or_insert calls don't collide with a loaded id.
+        std::size_t cur = num_strings_.load(std::memory_order_relaxed);
+        const std::size_t need = static_cast<std::size_t>(id) + 1;
+        while (cur < need && !num_strings_.compare_exchange_weak(
+                                 cur, need, std::memory_order_release,
+                                 std::memory_order_relaxed)) {
         }
-        return it->second;
     }
 
-    /**
-     * @brief Resolve an ID back to its string. Thread-safe.
-     */
     std::string_view resolve(std::uint32_t id) const {
-        std::shared_lock lock(mutex_);
-        return id_to_str_[id];
+        if (id >= FAST_CAPACITY) return {};
+        auto* p = fast_[id].load(std::memory_order_acquire);
+        return p ? std::string_view(*p) : std::string_view{};
     }
 
-    /**
-     * @brief Intern a string and return a stable string_view.
-     * Convenience wrapper: inserts if new, then resolves to string_view.
-     */
     std::string_view intern(std::string_view sv) {
         return resolve(get_or_insert(sv));
     }
 
-    /**
-     * @brief Number of unique strings interned.
-     */
     std::size_t size() const {
-        std::shared_lock lock(mutex_);
-        return id_to_str_.size();
+        return num_strings_.load(std::memory_order_acquire);
+    }
+
+    /// Shift the next-to-assign id counter to `base`. Subsequent
+    /// `get_or_insert` calls allocate ids starting at `base`.
+    /// Must be called before any `get_or_insert` on this instance.
+    /// Lock-free: caller ensures no concurrent inserts.
+    void reserve_id_base(std::uint32_t base) noexcept {
+        num_strings_.store(base, std::memory_order_release);
+    }
+
+    /// Enable deterministic-hash id assignment. When set, `get_or_insert`
+    /// derives the id from the string's hash rather than a sequential
+    /// counter, so the id does not depend on insertion order. Intended
+    /// for multi-process workflows (e.g. MPI ranks) where keys that
+    /// include string ids must be identical across ranks so RocksDB merge
+    /// operators can combine operands for the same logical key. Ids agree
+    /// across processes only if `std::hash<std::string_view>` does; the
+    /// C++ standard guarantees that only within a single execution.
+    ///
+    /// Collision handling: the id is `hash(str) & (FAST_CAPACITY - 1)`
+    /// (the low 20 bits), so it always fits the fast table. Strings that
+    /// share an id are stored separately and looked up by equality, but
+    /// `resolve(id)` and `intern` return only the first one inserted.
+    /// With 2^20 ids the birthday bound makes a collision plausible even
+    /// for O(1000)-entry dictionaries, so callers must tolerate it.
+    ///
+    /// Must be called before any `get_or_insert`.
+    void enable_deterministic_ids() noexcept {
+        deterministic_ids_.store(true, std::memory_order_release);
     }
 
    private:
-    mutable std::shared_mutex mutex_;
-    std::unordered_map<std::string, std::uint32_t, TransparentStringHash,
-                       TransparentStringEqual>
-        str_to_id_;
-    // Stores references to the map's owned strings for O(1) resolve.
-    std::vector<std::string_view> id_to_str_;
+    static constexpr std::size_t BUCKET_COUNT = 1u << 12;  // 4096
+    static constexpr std::size_t BUCKET_MASK = BUCKET_COUNT - 1;
+
+    struct Node {
+        const std::string str;
+        const std::size_t hash;
+        const std::uint32_t id;
+        std::atomic<Node*> next;
+    };
+
+    static std::size_t hash(std::string_view sv) {
+        return std::hash<std::string_view>{}(sv);
+    }
+
+    std::unique_ptr<std::atomic<Node*>[]> buckets_;
+    std::unique_ptr<std::atomic<const std::string*>[]> fast_;
+    std::atomic<std::size_t> num_strings_{0};
+    std::atomic<bool> deterministic_ids_{false};
+    std::mutex insert_mutex_;
 };
 
 }  // namespace dftracer::utils
diff --git a/include/dftracer/utils/core/common/transparent_string_hash.h b/include/dftracer/utils/core/common/transparent_string_hash.h
index eef6c77c..d45eab62 100644
--- a/include/dftracer/utils/core/common/transparent_string_hash.h
+++ b/include/dftracer/utils/core/common/transparent_string_hash.h
@@ -1,27 +1,25 @@
 #ifndef DFTRACER_UTILS_CORE_COMMON_TRANSPARENT_STRING_HASH_H
 #define DFTRACER_UTILS_CORE_COMMON_TRANSPARENT_STRING_HASH_H
 
+#include <ankerl/unordered_dense.h>
+
 #include <cstddef>
-#include <functional>
+#include <string>
 #include <string_view>
 
 namespace dftracer::utils {
 
-/**
- * @brief Transparent hash for std::unordered_map<std::string, ...> that
- * accepts std::string_view lookups without constructing std::string.
- *
- * Usage:
- * @code
- *   std::unordered_map<std::string, int, TransparentStringHash,
- *                      TransparentStringEqual> map;
- *   map[some_string_view];  // no std::string construction for lookup
- * @endcode
- */
 struct TransparentStringHash {
     using is_transparent = void;
+    using is_avalanching = void;  // ankerl: hash is already well mixed
     std::size_t operator()(std::string_view sv) const noexcept {
-        return std::hash<std::string_view>{}(sv);
+        return ankerl::unordered_dense::hash<std::string_view>{}(sv);
+    }
+    std::size_t operator()(const std::string& s) const noexcept {
+        return ankerl::unordered_dense::hash<std::string_view>{}(s);
+    }
+    std::size_t operator()(const char* s) const noexcept {  // s: non-null
+        return ankerl::unordered_dense::hash<std::string_view>{}(s);
     }
 };
 
@@ -32,6 +30,15 @@ struct TransparentStringEqual {
     }
 };
 
+template <typename V>
+using StringViewMap =
+    ankerl::unordered_dense::map<std::string, V, TransparentStringHash,
+                                 TransparentStringEqual>;
+
+using StringViewSet =
+    ankerl::unordered_dense::set<std::string, TransparentStringHash,
+                                 TransparentStringEqual>;
+
 }  // namespace dftracer::utils
 
 #endif  // DFTRACER_UTILS_CORE_COMMON_TRANSPARENT_STRING_HASH_H
diff --git a/include/dftracer/utils/core/coro/channel.h b/include/dftracer/utils/core/coro/channel.h
index 8eeb52a6..5f541891 100644
--- a/include/dftracer/utils/core/coro/channel.h
+++ b/include/dftracer/utils/core/coro/channel.h
@@ -23,6 +23,9 @@ namespace dftracer::utils::coro {
 template <typename T>
 class ChannelProducer;
 
+template <typename T>
+class ChannelConsumer;
+
 /**
  * Channel<T> - Producer-consumer queue for streaming data
  *
@@ -850,8 +853,24 @@ class Channel : public std::enable_shared_from_this<Channel<T>> {
         return ChannelProducer<T>(this);
     }
 
+    /**
+     * Create a receive-side handle meant to be captured by a coroutine
+     * lambda. It shares ownership when this channel is shared_ptr-managed
+     * and otherwise just wraps `this`:
+     *   [ch = channel->consumer()](...) -> CoroTask<...> {
+     *       while (auto item = co_await ch.receive()) { ... }
+     *   }
+     */
+    ChannelConsumer<T> consumer() {
+        if (auto owner = this->weak_from_this().lock()) {
+            return ChannelConsumer<T>(std::move(owner));
+        }
+        return ChannelConsumer<T>(this);
+    }
+
    private:
     friend class ChannelProducer<T>;
+    friend class ChannelConsumer<T>;
 
     ProducerGuard adopt_producer() {
         return ProducerGuard(this, typename ProducerGuard::Adopt{});
@@ -977,6 +996,35 @@ class Channel : public std::enable_shared_from_this<Channel<T>> {
 
     ReceiveAwaitable receive() { return ReceiveAwaitable(this); }
 
+    std::optional<T> blocking_receive() {
+        T item;  // T must be default-constructible
+        if (queue_.try_dequeue(item)) {
+            mark_item_consumed();
+            if (!wake_one_send_waiter_after_receive()) {
+                cv_writable_.notify_one();
+            }
+            maybe_notify_terminal();
+            return std::optional<T>(std::move(item));
+        }
+        // Fast path missed: block on cv_readable_ until an item or terminal.
+        std::unique_lock<std::mutex> lock(state_mutex_);
+        while (true) {
+            if (try_receive_locked(item)) {
+                release_slot_if_bounded_locked();
+                lock.unlock();
+                if (!wake_one_send_waiter_after_receive()) {
+                    cv_writable_.notify_one();
+                }
+                maybe_notify_terminal();
+                return std::optional<T>(std::move(item));
+            }
+            if (is_terminal_locked()) {
+                return std::nullopt;
+            }
+            cv_readable_.wait(lock);
+        }
+    }
+
     SendAwaitable send(const T& item) { return SendAwaitable(this, item); }
 
     SendAwaitable send(T&& item) {
@@ -1132,6 +1180,69 @@ class ChannelProducer {
     auto send(T&& item) { return raw_->send(std::move(item)); }
 };
 
+/**
+ * ChannelConsumer<T> - Receive-side handle for Channel<T>
+ *
+ * Holds a raw pointer for operations and optionally a shared_ptr to keep
+ * the channel alive when created from a shared_ptr channel. A handle built
+ * from a raw pointer does not extend the channel's lifetime, and a
+ * moved-from handle holds a null pointer and must not be used.
+ *
+ * @code
+ *     [ch = channel->consumer()](CoroScope& ctx) -> CoroTask<void> {
+ *         while (auto item = co_await ch.receive()) {
+ *             process(*item);
+ *         }
+ *     }
+ * @endcode
+ */
+template <typename T>
+class ChannelConsumer {
+    Channel<T>* raw_{nullptr};
+    std::shared_ptr<Channel<T>> shared_;
+
+   public:
+    explicit ChannelConsumer(Channel<T>* ch) : raw_(ch) {}
+
+    explicit ChannelConsumer(std::shared_ptr<Channel<T>> ch)
+        : raw_(ch.get()), shared_(std::move(ch)) {}
+
+    ~ChannelConsumer() = default;
+
+    ChannelConsumer(ChannelConsumer&& other) noexcept
+        : raw_(other.raw_), shared_(std::move(other.shared_)) {
+        other.raw_ = nullptr;
+    }
+
+    ChannelConsumer& operator=(ChannelConsumer&& other) noexcept {
+        if (this != &other) {
+            raw_ = other.raw_;
+            shared_ = std::move(other.shared_);
+            other.raw_ = nullptr;
+        }
+        return *this;
+    }
+
+    ChannelConsumer(const ChannelConsumer& other)
+        : raw_(other.raw_), shared_(other.shared_) {}
+
+    ChannelConsumer& operator=(const ChannelConsumer& other) {
+        if (this != &other) {
+            raw_ = other.raw_;
+            shared_ = other.shared_;
+        }
+        return *this;
+    }
+
+    auto receive() const { return raw_->receive(); }
+
+    std::optional<T> blocking_receive() const {
+        return raw_->blocking_receive();
+    }
+
+    bool is_closed() const { return raw_->is_closed(); }
+};
+
 /**
  * Helper to create shared_ptr Channel<T>
  */
diff --git a/include/dftracer/utils/core/pipeline/executor.h b/include/dftracer/utils/core/pipeline/executor.h
index f2daf5e2..38eb568e 100644
--- a/include/dftracer/utils/core/pipeline/executor.h
+++ b/include/dftracer/utils/core/pipeline/executor.h
@@ -12,7 +12,6 @@
 #include <any>
 #include <atomic>
 #include <chrono>
-#include <condition_variable>
 #include <cstddef>
 #include <functional>
 #include <memory>
@@ -25,10 +24,6 @@
 #include <unordered_set>
 #include <vector>
 
-namespace dftracer::utils::io {
-class IoThreadPool;
-}  // namespace dftracer::utils::io
-
 namespace dftracer::utils {
 
 class Task;
@@ -36,13 +31,12 @@ class CoroScope;
 class Scheduler;
 
 struct ExecutorConfig {
-    std::size_t num_threads = 0;  // 0 = hardware_concurrency
+    std::size_t num_threads = 0;   // 0 = hardware_concurrency
     std::chrono::seconds idle_timeout{5};
     std::chrono::seconds deadlock_timeout{10};
-    std::size_t io_pool_size = 4;
+    std::size_t io_pool_size = 0;  // 0 = hardware_concurrency
     io::IoBackendType io_backend_type = io::IoBackendType::AUTO;
     unsigned io_batch_threshold = 16;
-    std::size_t db_pool_size = 2;
 };
 
 /**
@@ -154,11 +148,6 @@ class Executor {
     // Aligned to avoid false sharing between adjacent workers.
     struct alignas(DFTRACER_OPTIMAL_ALIGNMENT) WorkerContext {
         std::size_t worker_id;
-        // queue_mutex + cv: used for worker sleep/wake protocol.
-        // Workers sleep on cv; wake_one_worker/wake_all_workers
-        // lock+unlock this mutex before notifying to prevent lost wakeups.
-        mutable std::mutex queue_mutex;
-        std::condition_variable cv;
 
         // Health monitoring for watchdog
         std::atomic<bool> is_idle{false};
@@ -168,7 +157,7 @@ class Executor {
         // Current task info (for debugging/watchdog)
         std::atomic<TaskIndex> current_task_id{-1};
         std::string current_task_name;
-        std::mutex task_name_mutex;  // Protects current_task_name
+        std::mutex task_name_mutex;
 
         // Worker thread
         std::thread thread;
@@ -197,8 +186,7 @@ class Executor {
     alignas(DFTRACER_OPTIMAL_ALIGNMENT)
         std::atomic<std::size_t> total_tasks_submitted_{0};
 
-    std::chrono::steady_clock::time_point last_activity_time_;
-    mutable std::mutex activity_mutex_;
+    std::atomic<std::int64_t> last_activity_ns_;
 
     // Shutdown coordination
     std::atomic<bool> shutdown_requested_{false};
@@ -229,14 +217,10 @@ class Executor {
     // I/O backend (owned by executor, created by factory)
     std::unique_ptr<io::IoBackend> io_backend_;
 
-    // Dedicated thread pool for blocking DB operations.
-    std::unique_ptr<io::IoThreadPool> db_pool_;
-
     // Configuration (stored from ExecutorConfig)
-    std::size_t io_pool_size_ = 4;
+    std::size_t io_pool_size_ = 0;
     io::IoBackendType io_backend_type_ = io::IoBackendType::AUTO;
     unsigned io_batch_threshold_ = 16;
-    std::size_t db_pool_size_ = 2;
 
    public:
     /**
@@ -294,6 +278,8 @@ class Executor {
      */
     std::size_t get_num_threads() const { return num_threads_; }
 
+    std::size_t get_io_pool_size() const { return io_pool_size_; }
+
     /**
      * Check if an I/O backend is available
      */
@@ -305,11 +291,6 @@ class Executor {
     io::IoBackend& io_backend() { return *io_backend_; }
     const io::IoBackend& io_backend() const { return *io_backend_; }
 
-    /**
-     * Get the dedicated DB thread pool (nullptr if not started).
-     */
-    io::IoThreadPool* db_pool() noexcept;
-
     /**
      * Get the executor running on the current worker thread (nullptr
      * if the calling thread is not a worker).  Thread-local.
diff --git a/include/dftracer/utils/core/pipeline/pipeline_config.h b/include/dftracer/utils/core/pipeline/pipeline_config.h
index 5f727620..76ae3708 100644
--- a/include/dftracer/utils/core/pipeline/pipeline_config.h
+++ b/include/dftracer/utils/core/pipeline/pipeline_config.h
@@ -69,11 +69,10 @@ struct PipelineConfig {
         600};     // Executor deadlock timeout (10 minutes)
     std::chrono::microseconds timeslice_duration{
         10'000};  // Coroutine yield timeslice (10ms, 0 = disabled)
-    std::size_t io_thread_count = 4;   // I/O thread pool size
+    std::size_t io_thread_count = 0;   // 0 = hardware_concurrency
     io::IoBackendType io_backend_type =
         io::IoBackendType::AUTO;       // Backend selection
     unsigned io_batch_threshold = 16;  // SQE batch threshold (0 = per-op)
-    std::size_t db_pool_size = 2;      // Blocking DB async thread pool size
 
     /**
      * Set pipeline name
@@ -200,14 +199,6 @@ struct PipelineConfig {
         return *this;
     }
 
-    /**
-     * Set blocking DB async thread pool size (default 2)
-     */
-    PipelineConfig& with_db_pool_size(std::size_t size) {
-        db_pool_size = size;
-        return *this;
-    }
-
     /**
      * Create sequential execution configuration (1 thread)
      */
diff --git a/include/dftracer/utils/core/rocksdb/async.h b/include/dftracer/utils/core/rocksdb/async.h
deleted file mode 100644
index 3ff71a07..00000000
--- a/include/dftracer/utils/core/rocksdb/async.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H
-#define DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H
-
-#include <coroutine>
-#include <exception>
-#include <functional>
-#include <optional>
-#include <utility>
-
-namespace dftracer::utils::io {
-class IoThreadPool;
-}  // namespace dftracer::utils::io
-
-namespace dftracer::utils::rocksdb {
-
-io::IoThreadPool* get_db_pool();
-void db_async_submit(io::IoThreadPool* pool, std::function<void()> fn);
-void db_async_resume_on(void* executor, std::coroutine_handle<> h);
-void* get_current_executor_opaque();
-
-template <typename T>
-class DbAwaitable {
-    io::IoThreadPool* pool_;
-    void* executor_;
-    std::function<T()> fn_;
-    std::optional<T> result_;
-    std::exception_ptr error_;
-    std::coroutine_handle<> handle_;
-
-   public:
-    DbAwaitable(io::IoThreadPool* pool, void* executor, std::function<T()> fn)
-        : pool_(pool), executor_(executor), fn_(std::move(fn)) {}
-
-    bool await_ready() noexcept {
-        if (pool_ == nullptr) {
-            try {
-                auto fn = std::move(fn_);
-                fn_ = {};
-                result_.emplace(fn());
-            } catch (...) {
-                error_ = std::current_exception();
-            }
-            return true;
-        }
-        return false;
-    }
-
-    void await_suspend(std::coroutine_handle<> h) {
-        handle_ = h;
-        auto* self = this;
-        db_async_submit(pool_, [self] {
-            try {
-                auto fn = std::move(self->fn_);
-                self->fn_ = {};
-                self->result_.emplace(fn());
-            } catch (...) {
-                self->error_ = std::current_exception();
-            }
-            db_async_resume_on(self->executor_, self->handle_);
-        });
-    }
-
-    T await_resume() {
-        if (error_ != nullptr) {
-            std::rethrow_exception(error_);
-        }
-        return std::move(*result_);
-    }
-};
-
-template <>
-class DbAwaitable<void> {
-    io::IoThreadPool* pool_;
-    void* executor_;
-    std::function<void()> fn_;
-    std::exception_ptr error_;
-    std::coroutine_handle<> handle_;
-
-   public:
-    DbAwaitable(io::IoThreadPool* pool, void* executor,
-                std::function<void()> fn)
-        : pool_(pool), executor_(executor), fn_(std::move(fn)) {}
-
-    bool await_ready() noexcept {
-        if (pool_ == nullptr) {
-            try {
-                auto fn = std::move(fn_);
-                fn_ = {};
-                fn();
-            } catch (...) {
-                error_ = std::current_exception();
-            }
-            return true;
-        }
-        return false;
-    }
-
-    void await_suspend(std::coroutine_handle<> h) {
-        handle_ = h;
-        auto* self = this;
-        db_async_submit(pool_, [self] {
-            try {
-                auto fn = std::move(self->fn_);
-                self->fn_ = {};
-                fn();
-            } catch (...) {
-                self->error_ = std::current_exception();
-            }
-            db_async_resume_on(self->executor_, self->handle_);
-        });
-    }
-
-    void await_resume() {
-        if (error_ != nullptr) {
-            std::rethrow_exception(error_);
-        }
-    }
-};
-
-template <typename F>
-auto run(F&& fn) -> DbAwaitable<decltype(fn())> {
-    using R = decltype(fn());
-    auto* pool = get_db_pool();
-    auto* executor = get_current_executor_opaque();
-    return DbAwaitable<R>(pool, executor, std::forward<F>(fn));
-}
-
-}  // namespace dftracer::utils::rocksdb
-
-#endif  // DFTRACER_UTILS_CORE_ROCKSDB_ASYNC_H
diff --git a/include/dftracer/utils/core/rocksdb/column_families.h b/include/dftracer/utils/core/rocksdb/column_families.h
new file mode 100644
index 00000000..c0b9c0ab
--- /dev/null
+++ b/include/dftracer/utils/core/rocksdb/column_families.h
@@ -0,0 +1,65 @@
+#ifndef DFTRACER_UTILS_CORE_ROCKSDB_COLUMN_FAMILIES_H
+#define DFTRACER_UTILS_CORE_ROCKSDB_COLUMN_FAMILIES_H
+
+#include <array>
+#include <string_view>
+
+namespace dftracer::utils::rocksdb::cf {
+// Column family names. ALL lists every one: update it when adding a name.
+inline constexpr std::string_view DEFAULT = "default";
+inline constexpr std::string_view CHECKPOINTS = "checkpoints";
+inline constexpr std::string_view METADATA = "metadata";
+inline constexpr std::string_view CHUNK_BLOOM = "chunk_bloom";
+inline constexpr std::string_view FILE_BLOOM = "file_bloom";
+inline constexpr std::string_view CHUNK_STATS = "chunk_stats";
+inline constexpr std::string_view DIMENSIONS = "dimensions";
+inline constexpr std::string_view CHUNK_DIM_STATS = "chunk_dim_stats";
+inline constexpr std::string_view FILE_SCALAR_STATS = "file_scalar_stats";
+inline constexpr std::string_view FILE_CAT_COUNTS = "file_cat_counts";
+inline constexpr std::string_view FILE_NAME_COUNTS = "file_name_counts";
+inline constexpr std::string_view FILE_PID_TID_COUNTS = "file_pid_tid_counts";
+inline constexpr std::string_view ROOT_SCALAR_STATS = "root_scalar_stats";
+inline constexpr std::string_view ROOT_CAT_COUNTS = "root_cat_counts";
+inline constexpr std::string_view ROOT_NAME_COUNTS = "root_name_counts";
+inline constexpr std::string_view ROOT_PID_TID_COUNTS = "root_pid_tid_counts";
+inline constexpr std::string_view NAME_DICTIONARY = "name_dictionary";
+inline constexpr std::string_view NAME_FILE_POSTINGS = "name_file_postings";
+inline constexpr std::string_view NAME_CHUNK_POSTINGS = "name_chunk_postings";
+inline constexpr std::string_view MANIFEST = "manifest";
+inline constexpr std::string_view PROVENANCE = "provenance";
+inline constexpr std::string_view ARCHIVES = "archives";
+inline constexpr std::string_view TAR_FILES = "tar_files";
+inline constexpr std::string_view AGGREGATION = "aggregation";
+inline constexpr std::string_view SYSTEM_METRICS = "system_metrics";
+inline constexpr std::string_view HASH_TABLES = "hash_tables";
+inline constexpr auto ALL =
+    std::to_array<std::string_view>({DEFAULT,
+                                     CHECKPOINTS,
+                                     METADATA,
+                                     CHUNK_BLOOM,
+                                     FILE_BLOOM,
+                                     CHUNK_STATS,
+                                     DIMENSIONS,
+                                     CHUNK_DIM_STATS,
+                                     FILE_SCALAR_STATS,
+                                     FILE_CAT_COUNTS,
+                                     FILE_NAME_COUNTS,
+                                     FILE_PID_TID_COUNTS,
+                                     ROOT_SCALAR_STATS,
+                                     ROOT_CAT_COUNTS,
+                                     ROOT_NAME_COUNTS,
+                                     ROOT_PID_TID_COUNTS,
+                                     NAME_DICTIONARY,
+                                     NAME_FILE_POSTINGS,
+                                     NAME_CHUNK_POSTINGS,
+                                     MANIFEST,
+                                     PROVENANCE,
+                                     ARCHIVES,
+                                     TAR_FILES,
+                                     AGGREGATION,
+                                     SYSTEM_METRICS,
+                                     HASH_TABLES});
+
+}  // namespace dftracer::utils::rocksdb::cf
+
+#endif  // DFTRACER_UTILS_CORE_ROCKSDB_COLUMN_FAMILIES_H
diff --git a/include/dftracer/utils/core/rocksdb/database.h b/include/dftracer/utils/core/rocksdb/database.h
index e4d70216..ed004959 100644
--- a/include/dftracer/utils/core/rocksdb/database.h
+++ b/include/dftracer/utils/core/rocksdb/database.h
@@ -1,12 +1,15 @@
 #ifndef DFTRACER_UTILS_CORE_ROCKSDB_DATABASE_H
 #define DFTRACER_UTILS_CORE_ROCKSDB_DATABASE_H
 
+#include <dftracer/utils/core/rocksdb/column_families.h>
 #include <rocksdb/db.h>
 #include <rocksdb/env.h>
 #include <rocksdb/file_system.h>
+#include <rocksdb/merge_operator.h>
 #include <rocksdb/options.h>
 #include <rocksdb/write_batch.h>
 
+#include <functional>
 #include <memory>
 #include <string>
 #include <string_view>
@@ -43,24 +46,47 @@ class RocksDatabase {
     ::rocksdb::DB* get() const noexcept;
 
     ::rocksdb::Status put(std::string_view key, std::string_view value,
-                          std::string_view column_family = "default");
+                          std::string_view column_family = cf::DEFAULT);
     ::rocksdb::Status get(std::string_view key, std::string* value,
-                          std::string_view column_family = "default") const;
+                          std::string_view column_family = cf::DEFAULT) const;
     ::rocksdb::Status del(std::string_view key,
-                          std::string_view column_family = "default");
+                          std::string_view column_family = cf::DEFAULT);
+    ::rocksdb::Status delete_range(
+        std::string_view begin_key, std::string_view end_key,
+        std::string_view column_family = cf::DEFAULT);
 
     ::rocksdb::Status put(Batch& batch, std::string_view column_family,
                           std::string_view key, std::string_view value);
     ::rocksdb::Status del(Batch& batch, std::string_view column_family,
                           std::string_view key);
 
+    ::rocksdb::Status merge(std::string_view key, std::string_view value,
+                            std::string_view column_family = cf::DEFAULT);
+    ::rocksdb::Status merge(Batch& batch, std::string_view column_family,
+                            std::string_view key, std::string_view value);
+
     Batch begin_batch() const;
     ::rocksdb::Status commit_batch(Batch& batch);
 
     std::unique_ptr<::rocksdb::Iterator> new_iterator(
-        std::string_view column_family = "default") const;
+        std::string_view column_family = cf::DEFAULT) const;
+
+    ::rocksdb::Status compact(std::string_view column_family = cf::DEFAULT);
+
+    /// Bulk-ingest externally built SST files into the named column family.
+    /// Keys across the SSTs must be sorted and non-overlapping unless the
+    /// caller requests `ingest_behind`, which pushes entries to the bottom
+    /// level and silently drops duplicate keys (for content-addressed CFs).
+    ::rocksdb::Status ingest_external_files(
+        std::string_view column_family,
+        const std::vector<std::string>& external_files,
+        bool ingest_behind = false);
+
+    using CfOptionsOverride = std::function<void(
+        const std::string&, ::rocksdb::ColumnFamilyOptions&)>;
+    void set_cf_options_override(CfOptionsOverride override);
 
-    static std::vector<std::string> default_column_families();
+    static const decltype(cf::ALL)& default_column_families();
     static ::rocksdb::Options default_options();
     static ::rocksdb::ColumnFamilyOptions default_column_family_options();
 
@@ -75,6 +101,7 @@ class RocksDatabase {
     ::rocksdb::DB* db_ = nullptr;
     std::unordered_map<std::string, ::rocksdb::ColumnFamilyHandle*>
         column_families_;
+    CfOptionsOverride cf_options_override_;
 };
 
 }  // namespace dftracer::utils::rocksdb
diff --git a/include/dftracer/utils/core/rocksdb/db_manager.h b/include/dftracer/utils/core/rocksdb/db_manager.h
index c45eec95..9b0a6abf 100644
--- a/include/dftracer/utils/core/rocksdb/db_manager.h
+++ b/include/dftracer/utils/core/rocksdb/db_manager.h
@@ -22,7 +22,8 @@ class RocksDBManager {
 
     std::shared_ptr<RocksDatabase> get_or_open(
         const std::string& db_path,
-        RocksDatabase::OpenMode open_mode = RocksDatabase::OpenMode::ReadWrite);
+        RocksDatabase::OpenMode open_mode = RocksDatabase::OpenMode::ReadWrite,
+        RocksDatabase::CfOptionsOverride cf_override = nullptr);
     void reset(const std::string& db_path);
     void shutdown();
 
diff --git a/include/dftracer/utils/core/runtime.h b/include/dftracer/utils/core/runtime.h
index 0e617a93..790a1974 100644
--- a/include/dftracer/utils/core/runtime.h
+++ b/include/dftracer/utils/core/runtime.h
@@ -6,6 +6,8 @@
 #include <dftracer/utils/core/pipeline/executor.h>
 #include <dftracer/utils/core/pipeline/watchdog.h>
 #include <dftracer/utils/core/task_handle.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/utilities/utility_traits.h>
 
 #include <atomic>
 #include <chrono>
@@ -19,6 +21,23 @@
 
 namespace dftracer::utils {
 
+namespace detail {
+// Runs process(input) with `scope` bound as context; always unbinds after.
+template <typename UtilityT, typename InputT>
+coro::CoroTask<void> run_scoped_utility(CoroScope& scope, UtilityT* utility,
+                                        InputT input) {
+    utility->set_context(scope);
+    try {
+        co_await utility->process(std::move(input));
+        utility->clear_context();
+    } catch (...) {
+        utility->clear_context();
+        throw;
+    }
+}
+
+}  // namespace detail
+
 /// Lightweight wrapper around Executor + Watchdog for running coroutines
 /// on a thread pool without Pipeline/Scheduler/DAG overhead.
 /// Intended for Python bindings and other non-DAG consumers.
@@ -41,6 +60,44 @@ class Runtime {
     template <typename T>
     TypedTaskHandle<T> submit(coro::CoroTask<T> task, std::string name = "");
 
+    /// Submit a scoped task (provides CoroScope to the lambda).
+    /// Returns immediately, task runs on executor.
+    ///
+    /// Usage:
+    /// @code
+    /// auto handle = rt->scope(
+    ///     "my_task", [](CoroScope& scope) -> CoroTask<void> {
+    ///         scope.spawn([](CoroScope& s) -> CoroTask<void> { co_return; });
+    ///         co_await scope.join();
+    ///     });
+    /// handle.get();  // wait when needed
+    /// @endcode
+    template <typename Func>
+        requires std::is_invocable_r_v<coro::CoroTask<void>, Func, CoroScope&>
+    TaskHandle scope(std::string name, Func&& func) {
+        return submit(run_coro_scope(executor_.get(), std::forward<Func>(func)),
+                      std::move(name));
+    }
+
+    /// Submit a NeedsContext utility with automatic context injection.
+    /// Only `&utility` is handed to the task: keep it alive until the handle
+    /// completes; its single context slot rules out overlapping runs.
+    /// @code
+    /// AggregatorUtility util;
+    /// rt->scope("aggregator", util, input).get();
+    /// @endcode
+    template <typename UtilityT, typename InputT,
+              typename DecayedUtility = std::remove_reference_t<UtilityT>>
+        requires utilities::has_tag_v<utilities::tags::NeedsContext,
+                                      DecayedUtility>
+    TaskHandle scope(std::string name, UtilityT& utility, InputT input) {
+        return submit(
+            run_coro_scope(executor_.get(),
+                           detail::run_scoped_utility<UtilityT, InputT>,
+                           &utility, std::move(input)),
+            std::move(name));
+    }
+
     /// Wait for all outstanding tasks to complete.
     void wait_all();
 
@@ -52,6 +109,7 @@ class Runtime {
 
     void shutdown();
     std::size_t threads() const;
+    std::size_t io_threads() const;
     Executor* executor() { return executor_.get(); }
     Watchdog* watchdog() { return watchdog_.get(); }
 
@@ -107,6 +165,14 @@ TypedTaskHandle<T> Runtime::submit(coro::CoroTask<T> task, std::string name) {
         vp->set_value();
     };
 
+    // Set the executor on the task's promise so awaitables (e.g. channels)
+    // that capture `get_root_promise()->get_executor()` can schedule
+    // resumption. Without this, awaiters end up with executor=nullptr because
+    // the wrapping `coro::Coro` doesn't extend PromiseBase and the
+    // root-promise chain stops at the user's CoroTask.
+    if (task.handle()) {
+        task.handle().promise().set_executor(executor_.get());
+    }
     auto coro = wrapper(std::move(task), typed_promise, void_promise,
                         executor_.get(), tid);
     TaskIndex id = executor_->enqueue_tracked(std::move(coro), name, tid);
diff --git a/include/dftracer/utils/core/tasks/coro_scope.h b/include/dftracer/utils/core/tasks/coro_scope.h
index ddb095da..adc50b51 100644
--- a/include/dftracer/utils/core/tasks/coro_scope.h
+++ b/include/dftracer/utils/core/tasks/coro_scope.h
@@ -10,6 +10,9 @@
 #include <dftracer/utils/core/coro/spawn_future.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/executor.h>
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
+#include <dftracer/utils/core/utilities/utility.h>
+#include <dftracer/utils/core/utilities/utility_traits.h>
 
 #include <atomic>
 #include <cstddef>
@@ -246,6 +249,32 @@ class CoroScope {
         return coro::SpawnFuture<R>(std::move(state));
     }
 
+    template <typename UtilityT, typename InputT,
+              typename DecayedUtility = std::remove_reference_t<UtilityT>,
+              typename R = typename DecayedUtility::Output,
+              std::enable_if_t<
+                  utilities::detail::has_process_v<DecayedUtility, InputT, R>,
+                  int> = 0>
+    coro::SpawnFuture<R> spawn(UtilityT& utility, InputT input) {
+        return spawn([utility_ptr = &utility, input = std::move(input)](
+                         CoroScope& child_scope) mutable -> coro::CoroTask<R> {
+            if constexpr (utilities::has_tag_v<utilities::tags::NeedsContext,
+                                               DecayedUtility>) {
+                utility_ptr->set_context(child_scope);
+                try {
+                    R result = co_await utility_ptr->process(input);
+                    utility_ptr->clear_context();
+                    co_return result;
+                } catch (...) {
+                    utility_ptr->clear_context();
+                    throw;
+                }
+            } else {
+                co_return co_await utility_ptr->process(input);
+            }
+        });
+    }
+
     // ====================================================================
     // Channel Operations
     // ====================================================================
@@ -395,9 +424,9 @@ class CoroScope {
     void spawn_consumers(std::shared_ptr<coro::Channel<T>> channel,
                          std::size_t count, Func&& consumer_func) {
         for (std::size_t i = 0; i < count; i++) {
-            spawn([channel, func = consumer_func](
+            spawn([ch = channel->consumer(), func = consumer_func](
                       CoroScope& scope) -> coro::CoroTask<void> {
-                while (auto item = co_await channel->receive()) {
+                while (auto item = co_await ch.receive()) {
                     co_await func(scope, std::move(*item));
                 }
                 co_return;
@@ -530,14 +559,15 @@ class CoroScope {
 // Creates a child CoroScope, runs the lambda, and auto-joins.
 // ========================================================================
 
-template <typename Func>
-    requires std::is_invocable_r_v<coro::CoroTask<void>, Func, CoroScope&>
-inline coro::CoroTask<void> run_coro_scope(Executor* executor,
-                                           Func scope_func) {
+template <typename Func, typename... Args>
+    requires std::is_invocable_r_v<coro::CoroTask<void>, Func, CoroScope&,
+                                   Args...>
+inline coro::CoroTask<void> run_coro_scope(Executor* executor, Func scope_func,
+                                           Args... args) {
     CoroScope scope(executor);
     std::exception_ptr error;
     try {
-        co_await scope_func(scope);
+        co_await scope_func(scope, std::move(args)...);
     } catch (...) {
         error = std::current_exception();
     }
diff --git a/include/dftracer/utils/core/utilities/streaming_utility.h b/include/dftracer/utils/core/utilities/streaming_utility.h
index 17f46f07..188a37d8 100644
--- a/include/dftracer/utils/core/utilities/streaming_utility.h
+++ b/include/dftracer/utils/core/utilities/streaming_utility.h
@@ -2,7 +2,9 @@
 #define DFTRACER_UTILS_CORE_UTILITIES_STREAMING_UTILITY_H
 
 #include <dftracer/utils/core/coro/async_generator.h>
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
 #include <dftracer/utils/core/utilities/utility.h>
+#include <dftracer/utils/core/utilities/utility_traits.h>
 
 namespace dftracer::utils::utilities {
 
@@ -46,6 +48,25 @@ class StreamingUtility : public UtilityBase<I, Tags...> {
     static constexpr std::string_view get_name() { return sig_; }
 
     virtual coro::AsyncGenerator<Batch> process(const I& input) = 0;
+
+    /// Bind context for streaming utilities with the NeedsContext tag.
+    /// Unlike Utility::process, which is wrapped by CoroScope::spawn, an
+    /// AsyncGenerator cannot be spawned directly, so bind explicitly. Only a
+    /// pointer to `ctx` is stored: it must outlive use until unbind_context().
+    void bind_context(CoroScope& ctx) {
+        static_assert(
+            has_tag_v<tags::NeedsContext, StreamingUtility<I, Batch, Tags...>>,
+            "bind_context requires NeedsContext tag");
+        this->set_context(ctx);
+    }
+
+    /// Unbind context after streaming completes.
+    void unbind_context() {
+        static_assert(
+            has_tag_v<tags::NeedsContext, StreamingUtility<I, Batch, Tags...>>,
+            "unbind_context requires NeedsContext tag");
+        this->clear_context();
+    }
 };
 
 }  // namespace dftracer::utils::utilities
diff --git a/include/dftracer/utils/core/utilities/utility.h b/include/dftracer/utils/core/utilities/utility.h
index 127f14e0..7c8f332c 100644
--- a/include/dftracer/utils/core/utilities/utility.h
+++ b/include/dftracer/utils/core/utilities/utility.h
@@ -5,10 +5,12 @@
 #include <dftracer/utils/core/common/type_name.h>
 #include <dftracer/utils/core/coro/task.h>
 
+#include <memory>
 #include <stdexcept>
 #include <string_view>
 #include <tuple>
 #include <type_traits>
+#include <utility>
 
 namespace dftracer::utils {
 class CoroScope;
@@ -105,6 +107,8 @@ class UtilityBase {
     static constexpr std::string_view get_name() { return sig_; }
 
    protected:
+    bool has_context() const noexcept { return ctx_ != nullptr; }
+
     /**
      * @brief Access CoroScope (only valid when NeedsContext tag is present).
      */
@@ -124,6 +128,8 @@ class UtilityBase {
 
     void set_context(CoroScope& ctx) { ctx_ = &ctx; }
     void clear_context() { ctx_ = nullptr; }
+
+    friend class ::dftracer::utils::CoroScope;
 };
 
 /**
@@ -157,6 +163,30 @@ class Utility : public UtilityBase<I, Tags...> {
     friend class behaviors::UtilityExecutor<I, O, Tags...>;
 
     virtual coro::CoroTask<O> process(const I& input) = 0;
+
+    // Rvalue overload (braced-init / std::move / prvalue arguments): moves
+    // the input into wrapper-owned storage so the inner virtual receives a
+    // reference that outlives its suspension points. Lvalues still bind to
+    // process(const I&) directly. A derived override of process(const I&)
+    // hides this overload unless re-exposed with a using-declaration.
+    // NOTE(review): `input` must stay alive until this body first runs.
+    coro::CoroTask<O> process(I&& input) {
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 14)
+        // GCC 12/13 miscalculate frame offsets for non-trivial locals in
+        // coroutine frames (coroutine-caveats.md §3). Heap-allocate so only
+        // a trivial unique_ptr slot lives in the wrapper frame, isolating
+        // the input object from frame-layout corruption. Drop this branch
+        // once the GCC 12/13 baseline is retired.
+        auto owned = std::make_unique<I>(std::move(input));
+        co_return co_await this->process(static_cast<const I&>(*owned));
+#else
+        // GCC 14+, Clang 14+, MSVC: frame-local is safe per the language
+        // rules; the local lives in the wrapper coroutine frame and the
+        // inner co_await holds a reference to it across suspension.
+        I local(std::move(input));
+        co_return co_await this->process(static_cast<const I&>(local));
+#endif
+    }
 };
 
 }  // namespace dftracer::utils::utilities
diff --git a/include/dftracer/utils/server/trace_index.h b/include/dftracer/utils/server/trace_index.h
index d131cd00..f89071b3 100644
--- a/include/dftracer/utils/server/trace_index.h
+++ b/include/dftracer/utils/server/trace_index.h
@@ -19,20 +19,11 @@ namespace dftracer::utils::server {
 /// paths and check index availability.
 class TraceIndex {
    public:
-    // Files below this compressed size are streamed directly without
-    // building a `.dftindex` database. At 8 MB compressed
-    // (~160 MB uncompressed with typical 20x JSON compression), a file
-    // has only a handful of 32 MB checkpoints -- the indexing overhead
-    // exceeds the benefit of bloom-filter skip.
-    static constexpr std::size_t INDEX_SIZE_THRESHOLD =
-        constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;
-
     struct FileInfo {
         std::string path;
         std::string index_path;
         bool has_bloom_data = false;
         bool has_checkpoint_index = false;
-        bool is_small = false;
         std::uint64_t min_timestamp_us = 0;
         std::uint64_t max_timestamp_us = 0;
         std::uint64_t compressed_size = 0;
diff --git a/include/dftracer/utils/utilities/common/arrow/arrow.h b/include/dftracer/utils/utilities/common/arrow/arrow.h
index 3350baf8..fb7f15fc 100644
--- a/include/dftracer/utils/utilities/common/arrow/arrow.h
+++ b/include/dftracer/utils/utilities/common/arrow/arrow.h
@@ -5,7 +5,11 @@
 #include <dftracer/utils/utilities/common/arrow/column_builder.h>
 
 #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+#include <dftracer/utils/utilities/common/arrow/ipc_reader.h>
 #include <dftracer/utils/utilities/common/arrow/ipc_writer.h>
+#include <dftracer/utils/utilities/common/arrow/parallel_reader.h>
+#include <dftracer/utils/utilities/common/arrow/partition_router.h>
+#include <dftracer/utils/utilities/common/arrow/partition_writer.h>
 #endif
 
 #endif  // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_ARROW_H
diff --git a/include/dftracer/utils/utilities/common/arrow/arrow_export.h b/include/dftracer/utils/utilities/common/arrow/arrow_export.h
index d9631c4e..e8739984 100644
--- a/include/dftracer/utils/utilities/common/arrow/arrow_export.h
+++ b/include/dftracer/utils/utilities/common/arrow/arrow_export.h
@@ -1,6 +1,7 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_ARROW_EXPORT_H
 #define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_ARROW_EXPORT_H
 
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 
 #include <cstdint>
diff --git a/include/dftracer/utils/utilities/common/arrow/column_builder.h b/include/dftracer/utils/utilities/common/arrow/column_builder.h
index e29d8717..dd54f694 100644
--- a/include/dftracer/utils/utilities/common/arrow/column_builder.h
+++ b/include/dftracer/utils/utilities/common/arrow/column_builder.h
@@ -1,12 +1,15 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_COLUMN_BUILDER_H
 #define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_COLUMN_BUILDER_H
 
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 
+#include <dftracer/utils/core/common/transparent_string_hash.h>
 #include <dftracer/utils/utilities/common/arrow/arrow_export.h>
 
 #include <cstddef>
 #include <cstdint>
+#include <deque>
 #include <initializer_list>
 #include <optional>
 #include <string>
@@ -16,7 +19,7 @@
 
 namespace dftracer::utils::utilities::common::arrow {
 
-enum class ColumnType { INT64, UINT64, DOUBLE, STRING, BOOL };
+enum class ColumnType { INT64, UINT64, DOUBLE, STRING, BOOL, DICT_STRING };
 
 struct ColumnSpec {
     std::string name;
@@ -26,14 +29,21 @@ struct ColumnSpec {
 struct ColumnData {
     std::string name;
     ColumnType type;
-    std::vector<int64_t> int64_values;
-    std::vector<uint64_t> uint64_values;
+    std::vector<std::int64_t> int64_values;
+    std::vector<std::uint64_t> uint64_values;
     std::vector<double> double_values;
-    std::vector<std::string_view> string_values;
-    std::vector<uint8_t> bool_values;
-    std::vector<uint8_t> validity;  // 1 = valid, 0 = null
-    size_t count = 0;
+    std::vector<std::int32_t> string_offsets;
+    std::vector<char> string_data;
+    std::vector<std::uint8_t> bool_values;
+    std::vector<std::uint8_t> validity;
+    std::size_t count = 0;
     bool has_nulls = false;
+
+    // Dictionary encoding support (for DICT_STRING)
+    std::vector<std::int32_t> dict_indices;  // indices into dict_values
+    std::deque<std::string> dict_values;     // unique strings (dictionary)
+    std::unordered_map<std::string_view, std::int32_t>
+        dict_map;                            // string -> index
 };
 
 /**
@@ -44,8 +54,8 @@ struct ColumnData {
  *   Dynamic: add_or_get_column() on first encounter; backfills nulls for
  *            columns not touched in a given row via end_row().
  *
- * String columns store std::string_view — caller must keep source data
- * alive until finish() returns.
+ * String columns copy and own their data — no lifetime requirements on
+ * the source strings passed to append_string().
  *
  * NOT thread-safe. One builder per worker/coroutine.
  */
@@ -62,23 +72,24 @@ class RecordBatchBuilder {
     // Returns existing index if column already exists; type is ignored for
     // existing columns — callers must use find_column() to check type before
     // appending, and fall back to append_null() on mismatch.
-    size_t add_or_get_column(std::string_view name, ColumnType type);
+    std::size_t add_or_get_column(std::string_view name, ColumnType type);
 
     // Returns the index of an existing column, or std::nullopt if not found.
     // Use before appending null values to avoid creating STRING-typed columns
     // that may later receive typed values.
-    std::optional<size_t> find_column(std::string_view name) const;
+    std::optional<std::size_t> find_column(std::string_view name) const;
 
     // Returns the type of column at col_idx.
-    ColumnType column_type(size_t col_idx) const noexcept;
+    ColumnType column_type(std::size_t col_idx) const noexcept;
 
     // Append typed values by column index.
-    void append_int64(size_t col_idx, int64_t value);
-    void append_uint64(size_t col_idx, uint64_t value);
-    void append_double(size_t col_idx, double value);
-    void append_string(size_t col_idx, std::string_view value);
-    void append_bool(size_t col_idx, bool value);
-    void append_null(size_t col_idx);
+    void append_int64(std::size_t col_idx, std::int64_t value);
+    void append_uint64(std::size_t col_idx, std::uint64_t value);
+    void append_double(std::size_t col_idx, double value);
+    void append_string(std::size_t col_idx, std::string_view value);
+    void append_dict_string(std::size_t col_idx, std::string_view value);
+    void append_bool(std::size_t col_idx, bool value);
+    void append_null(std::size_t col_idx);
 
     // End current row. In dynamic mode, backfills nulls for untouched
     // columns. In static mode, validates all columns were appended.
@@ -86,29 +97,40 @@ class RecordBatchBuilder {
     void end_row();
 
     // Pre-allocate internal buffers for num_rows rows.
-    void reserve(size_t num_rows);
+    void reserve(std::size_t num_rows);
 
     // Bulk-convert internal vectors to Arrow and return a self-contained
     // result. Builder is in an undefined state until reset() is called.
     ArrowExportResult finish();
 
     // Clear data. If keep_schema is true, column structure is preserved
-    // for the next batch (static mode only; dynamic mode always clears).
+    // for the next batch (requires schema to be locked first).
     void reset(bool keep_schema = true);
 
-    size_t num_rows() const noexcept { return num_rows_; }
-    size_t num_columns() const noexcept { return columns_.size(); }
+    // Lock the current schema. After locking:
+    // - Existing columns maintain their positions
+    // - New columns discovered via add_or_get_column() are appended at end
+    // - reset(true) preserves the schema structure
+    // Call after emitting the first batch to ensure consistent column ordering.
+    void lock_schema() noexcept { schema_locked_ = true; }
+
+    // Check if schema is locked.
+    bool is_schema_locked() const noexcept { return schema_locked_; }
+
+    std::size_t num_rows() const noexcept { return num_rows_; }
+    std::size_t num_columns() const noexcept { return columns_.size(); }
 
    private:
     std::vector<ColumnData> columns_;
-    std::unordered_map<std::string, size_t> name_to_index_;
-    size_t num_rows_ = 0;
+    StringViewMap<std::size_t> name_to_index_;
+    std::size_t num_rows_ = 0;
+    std::size_t row_touched_count_ = 0;
     bool schema_declared_ = false;
-    // Tracks which columns were touched in the current row (dynamic mode).
+    bool schema_locked_ = false;
     std::vector<bool> touched_;
 
     void init_column(ColumnData& col, ColumnType type, std::string_view name);
-    void backfill_nulls(ColumnData& col, size_t target_count);
+    void backfill_nulls(ColumnData& col, std::size_t target_count);
 };
 
 }  // namespace dftracer::utils::utilities::common::arrow
diff --git a/include/dftracer/utils/utilities/common/arrow/ipc_reader.h b/include/dftracer/utils/utilities/common/arrow/ipc_reader.h
new file mode 100644
index 00000000..2d88326b
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/arrow/ipc_reader.h
@@ -0,0 +1,99 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_READER_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_READER_H
+
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::common::arrow::detail {
+// Store block info separately from decoder state
+struct IpcBlock {
+    std::int64_t offset;
+    std::int32_t metadata_length;
+    std::int64_t body_length;
+};
+}  // namespace dftracer::utils::utilities::common::arrow::detail
+
+namespace dftracer::utils::utilities::common::arrow {
+
+/**
+ * RAII reader for Arrow IPC file format (.arrow).
+ *
+ * Optimized with:
+ * - Memory-mapped I/O for zero-copy file access
+ * - Shared schema (no deep copy per batch)
+ * - Buffer reuse for decompression
+ *
+ * Supports buffer-level ZSTD decompression compatible with
+ * pyarrow, polars, and this library's IpcWriter.
+ *
+ * Sequence: open() -> num_batches() -> read_batch(i) [or read_all()]
+ *
+ * Move-only. Not thread-safe.
+ */
+class IpcReader {
+   public:
+    IpcReader() = default;
+    ~IpcReader();
+
+    IpcReader(const IpcReader&) = delete;
+    IpcReader& operator=(const IpcReader&) = delete;
+    IpcReader(IpcReader&& other) noexcept;
+    IpcReader& operator=(IpcReader&& other) noexcept;
+
+    // Open file for reading. Returns 0 on success.
+    int open(const std::string& path);
+
+    // Close the file.
+    void close();
+
+    bool is_open() const noexcept { return mapped_data_ != nullptr; }
+
+    // Number of record batches in the file.
+    std::size_t num_batches() const noexcept { return num_batches_; }
+
+    // Total rows across all batches.
+    std::int64_t total_rows() const noexcept { return total_rows_; }
+
+    // Read a single batch by index. Returns empty result on error.
+    ArrowExportResult read_batch(std::size_t index);
+
+    // Read all batches and return as a vector.
+    std::vector<ArrowExportResult> read_all();
+
+    // Iterate over all batches, calling callback for each.
+    // Returns 0 on success, non-zero if callback returns non-zero or on error.
+    int for_each_batch(std::function<int(ArrowExportResult&)> callback);
+
+   private:
+    // Memory-mapped file data
+    void* mapped_data_ = nullptr;
+    std::size_t mapped_size_ = 0;
+    int fd_ = -1;
+
+    // Decoder state
+    void* decoder_ = nullptr;  // ArrowIpcDecoder*
+
+    // Shared schema (not deep-copied per batch)
+    std::shared_ptr<void> shared_schema_;  // ArrowSchema*, ref-counted
+
+    // Block metadata
+    std::vector<detail::IpcBlock> blocks_;
+    std::size_t num_batches_ = 0;
+    std::int64_t total_rows_ = 0;
+
+    void reset_state() noexcept;
+    int read_footer();
+};
+
+}  // namespace dftracer::utils::utilities::common::arrow
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+#endif  // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_READER_H
diff --git a/include/dftracer/utils/utilities/common/arrow/ipc_writer.h b/include/dftracer/utils/utilities/common/arrow/ipc_writer.h
index 2e0dc572..41319e1a 100644
--- a/include/dftracer/utils/utilities/common/arrow/ipc_writer.h
+++ b/include/dftracer/utils/utilities/common/arrow/ipc_writer.h
@@ -1,21 +1,74 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_WRITER_H
 #define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_IPC_WRITER_H
 
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
 
+#include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/utilities/common/arrow/arrow_export.h>
 
+#include <atomic>
 #include <cstdio>
+#include <memory>
 #include <string>
+#include <vector>
 
 namespace dftracer::utils::utilities::common::arrow {
 
 /**
- * RAII wrapper for writing Arrow IPC file format (.arrows).
+ * Compression type for Arrow IPC buffer-level compression.
  *
- * Sequence: open() -> write_batch() [1..N] -> close()
- * The first write_batch() call writes the schema; subsequent calls append
- * record batches. close() finalizes the file footer.
+ * Buffer-level compression means each buffer in a record batch is compressed
+ * independently. This is the standard Arrow IPC compression format and is
+ * readable by pyarrow, polars, and other Arrow implementations.
+ */
+enum class IpcCompression {
+    NONE,  // Uncompressed (maximum compatibility)
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+    ZSTD,  // zstd compression (best ratio/speed)
+#endif
+};
+
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+constexpr IpcCompression DEFAULT_ARROW_IPC_COMPRESSION = IpcCompression::ZSTD;
+#else
+constexpr IpcCompression DEFAULT_ARROW_IPC_COMPRESSION = IpcCompression::NONE;
+#endif
+
+class BufferPool {
+   public:
+    static constexpr std::size_t DEFAULT_BUFFER_CAPACITY = 4 * 1024 * 1024;
+
+    struct Slot {
+        std::vector<uint8_t> data;
+        std::atomic<bool> in_use{false};
+    };
+
+    explicit BufferPool(std::size_t num_slots = 4,
+                        std::size_t initial_capacity = DEFAULT_BUFFER_CAPACITY);
+    ~BufferPool() = default;
+
+    BufferPool(const BufferPool&) = delete;
+    BufferPool& operator=(const BufferPool&) = delete;
+    BufferPool(BufferPool&&) = default;
+    BufferPool& operator=(BufferPool&&) = default;
+
+    Slot* acquire(std::size_t min_capacity = 0);
+    void release(Slot* slot);
+    std::size_t size() const { return slots_.size(); }
+
+   private:
+    std::vector<std::unique_ptr<Slot>> slots_;
+};
+
+/**
+ * Async Arrow IPC file writer (.arrow).
+ *
+ * Uses Executor::current() for async I/O, so it must be called from within
+ * an executor. Supports buffer-level compression (zstd) compatible with
+ * pyarrow, polars, nanoarrow, and other Arrow IPC readers.
+ *
+ * Usage: open() -> write_batch() [1..N] -> close()
  *
  * Move-only. Not thread-safe.
  */
@@ -29,28 +82,40 @@ class IpcWriter {
     IpcWriter(IpcWriter&& other) noexcept;
     IpcWriter& operator=(IpcWriter&& other) noexcept;
 
-    // Open path for writing. Returns 0 on success.
-    int open(const std::string& path);
-
-    // Write one record batch. First call also writes the schema.
-    // Returns 0 on success.
-    int write_batch(ArrowExportResult& batch);
+    coro::CoroTask<int> open(
+        const std::string& path,
+        IpcCompression compression = DEFAULT_ARROW_IPC_COMPRESSION,
+        std::size_t pool_slots = 4);
 
-    // Finalize footer and close. Returns 0 on success.
-    int close();
+    coro::CoroTask<int> write_batch(ArrowExportResult& batch);
+    coro::CoroTask<int> write_batches(std::vector<ArrowExportResult>& batches);
+    coro::CoroTask<int> close();
 
-    bool is_open() const noexcept { return file_ != nullptr; }
+    bool is_open() const noexcept { return fd_ >= 0; }
 
    private:
-    std::FILE* file_ = nullptr;
+    int fd_ = -1;
+    off_t write_offset_ = 0;
+    BufferPool buffer_pool_;
     bool schema_written_ = false;
-    // Heap-allocated nanoarrow structs stored as void* to avoid pulling
-    // nanoarrow_ipc.h into every translation unit that includes this header.
-    void* writer_ = nullptr;  // ArrowIpcWriter*
-    void* stream_ =
-        nullptr;  // ArrowIpcOutputStream* (owned by writer_ after init)
+    IpcCompression compression_ = DEFAULT_ARROW_IPC_COMPRESSION;
+    void* batch_blocks_ = nullptr;
+    void* schema_copy_ = nullptr;
 
     void reset_state() noexcept;
+
+    struct CompressedBatch {
+        std::vector<uint8_t> header;
+        BufferPool::Slot* body_slot;
+        std::size_t body_size;
+        std::int32_t metadata_length;
+        std::int64_t body_length;
+    };
+
+    coro::CoroTask<CompressedBatch> compress_batch(ArrowExportResult& batch);
+    coro::CoroTask<int> write_compressed(CompressedBatch& cb);
+    coro::CoroTask<int> write_schema(ArrowExportResult& batch);
+    coro::CoroTask<int> write_footer();
 };
 
 }  // namespace dftracer::utils::utilities::common::arrow
diff --git a/include/dftracer/utils/utilities/common/arrow/parallel_reader.h b/include/dftracer/utils/utilities/common/arrow/parallel_reader.h
new file mode 100644
index 00000000..356ef27b
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/arrow/parallel_reader.h
@@ -0,0 +1,95 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARALLEL_READER_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARALLEL_READER_H
+
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils {
+class CoroScope;
+}
+
+namespace dftracer::utils::utilities::common::arrow {
+
+using dftracer::utils::CoroScope;
+
+/**
+ * Result from reading a single Arrow IPC file.
+ * Batches are stored in shared_ptr to allow copying through std::shared_future.
+ */
+struct ArrowFileReadResult {
+    std::string path;
+    std::shared_ptr<std::vector<ArrowExportResult>> batches;
+    std::int64_t total_rows = 0;
+    std::string error;
+    bool success = true;
+
+    ArrowFileReadResult()
+        : batches(std::make_shared<std::vector<ArrowExportResult>>()) {}
+};
+
+/**
+ * Result from reading multiple Arrow IPC files in parallel.
+ */
+struct ParallelReadResult {
+    std::vector<ArrowFileReadResult> file_results;
+    std::int64_t total_rows = 0;
+    std::int64_t total_batches = 0;
+    std::size_t files_read = 0;
+    std::size_t files_failed = 0;
+};
+
+/**
+ * Read a single Arrow IPC file as a coroutine.
+ *
+ * @param path Path to the Arrow IPC file.
+ * @return ArrowFileReadResult with batches or error.
+ */
+coro::CoroTask<ArrowFileReadResult> read_arrow_file_async(std::string path);
+
+/**
+ * Read multiple Arrow IPC files in parallel.
+ *
+ * Collects all results before returning. For streaming results as they
+ * complete, use read_arrow_files_streaming instead.
+ *
+ * @param paths List of file paths to read.
+ * @return ParallelReadResult with all results.
+ */
+coro::CoroTask<ParallelReadResult> read_arrow_files_parallel(
+    std::vector<std::string> paths);
+
+/**
+ * Callback type for streaming file results.
+ * Return false to cancel remaining reads.
+ */
+using FileResultCallback = std::function<bool(ArrowFileReadResult&&)>;
+
+/**
+ * Read multiple Arrow IPC files in parallel, streaming results via callback.
+ *
+ * Results are delivered in completion order (whichever file finishes first).
+ * This is more memory-efficient for large numbers of files.
+ *
+ * Must be run within a CoroScope (via runtime.scope() or run_coro_scope).
+ *
+ * @param scope CoroScope for spawning parallel tasks.
+ * @param paths List of file paths to read.
+ * @param callback Called for each file result. Return false to cancel.
+ * @return Summary stats (files_read, files_failed, total_rows, total_batches).
+ */
+coro::CoroTask<ParallelReadResult> read_arrow_files_streaming(
+    CoroScope& scope, std::vector<std::string> paths,
+    FileResultCallback callback);
+
+}  // namespace dftracer::utils::utilities::common::arrow
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+#endif  // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARALLEL_READER_H
diff --git a/include/dftracer/utils/utilities/common/arrow/partition_router.h b/include/dftracer/utils/utilities/common/arrow/partition_router.h
new file mode 100644
index 00000000..41f4352c
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/arrow/partition_router.h
@@ -0,0 +1,94 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_ROUTER_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_ROUTER_H
+
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+#include <dftracer/utils/utilities/common/arrow/partition_writer.h>
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace dftracer::utils::utilities::common::arrow {
+
+struct RouterWriteStats {
+    std::unordered_map<std::string, PartitionWriteStats> partitions;
+    int64_t total_rows = 0;
+    int64_t total_uncompressed_bytes = 0;
+};
+
+struct PartitionConfig {
+    enum class Mode {
+        NONE,
+        COLUMN,
+        BUCKETED,
+        VIEW,
+    };
+
+    Mode mode = Mode::NONE;
+    std::vector<std::string> partition_columns;
+    int num_buckets = 0;
+    std::vector<std::pair<std::string, std::optional<std::string>>> views;
+};
+
+using PredicateEvaluator =
+    std::function<bool(const std::unordered_map<std::string, std::string>&)>;
+
+/**
+ * Routes Arrow record batches to partitioned output directories.
+ * Supports column-based, bucketed, and view-based partitioning.
+ */
+class PartitionRouter {
+   public:
+    PartitionRouter() = default;
+    ~PartitionRouter();
+
+    PartitionRouter(const PartitionRouter&) = delete;
+    PartitionRouter& operator=(const PartitionRouter&) = delete;
+    PartitionRouter(PartitionRouter&& other) noexcept;
+    PartitionRouter& operator=(PartitionRouter&& other) noexcept;
+
+    int open(const std::string& output_dir, const PartitionConfig& config,
+             int64_t chunk_size_bytes,
+             IpcCompression compression = DEFAULT_ARROW_IPC_COMPRESSION);
+
+    void register_predicate(const std::string& view_name,
+                            PredicateEvaluator evaluator);
+
+    coro::CoroTask<int> write_batch(ArrowExportResult& batch);
+    coro::CoroTask<RouterWriteStats> close();
+
+    bool is_open() const noexcept { return is_open_; }
+
+   private:
+    std::string output_dir_;
+    PartitionConfig config_;
+    int64_t chunk_size_bytes_ = 0;
+    IpcCompression compression_ = DEFAULT_ARROW_IPC_COMPRESSION;
+    bool is_open_ = false;
+
+    std::unordered_map<std::string, std::unique_ptr<PartitionWriter>> writers_;
+    std::unordered_map<std::string, PredicateEvaluator> predicates_;
+
+    coro::CoroTask<PartitionWriter*> get_or_create_writer(
+        const std::string& partition_key);
+    std::string partition_path(const std::string& partition_key) const;
+    int compute_bucket(const std::vector<std::string>& values) const;
+
+    coro::CoroTask<int> route_none(ArrowExportResult& batch);
+    coro::CoroTask<int> route_column(ArrowExportResult& batch);
+    coro::CoroTask<int> route_bucketed(ArrowExportResult& batch);
+    coro::CoroTask<int> route_view(ArrowExportResult& batch);
+};
+
+}  // namespace dftracer::utils::utilities::common::arrow
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+#endif  // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_ROUTER_H
diff --git a/include/dftracer/utils/utilities/common/arrow/partition_writer.h b/include/dftracer/utils/utilities/common/arrow/partition_writer.h
new file mode 100644
index 00000000..4c3cfe50
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/arrow/partition_writer.h
@@ -0,0 +1,76 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_WRITER_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_WRITER_H
+
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+#include <dftracer/utils/utilities/common/arrow/ipc_writer.h>
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::common::arrow {
+
+struct PartitionWriteStats {
+    std::vector<std::string> files;
+    std::vector<int64_t> row_counts;
+    int64_t total_rows = 0;
+    int64_t total_uncompressed_bytes = 0;
+};
+
+/**
+ * Async wrapper around IpcWriter with automatic file rotation.
+ * Writes part-NNNNN.arrow files, rotating when size threshold is exceeded.
+ */
+class PartitionWriter {
+   public:
+    PartitionWriter() = default;
+    ~PartitionWriter();
+
+    PartitionWriter(const PartitionWriter&) = delete;
+    PartitionWriter& operator=(const PartitionWriter&) = delete;
+    PartitionWriter(PartitionWriter&& other) noexcept;
+    PartitionWriter& operator=(PartitionWriter&& other) noexcept;
+
+    coro::CoroTask<int> open(
+        const std::string& output_dir, int64_t chunk_size_bytes,
+        IpcCompression compression = DEFAULT_ARROW_IPC_COMPRESSION);
+
+    coro::CoroTask<int> write_batch(ArrowExportResult& batch);
+    coro::CoroTask<PartitionWriteStats> close();
+
+    bool is_open() const noexcept { return is_open_; }
+    int64_t current_file_bytes() const noexcept { return current_file_bytes_; }
+    int64_t total_bytes() const noexcept { return total_bytes_; }
+    int64_t total_rows() const noexcept { return total_rows_; }
+    size_t file_count() const noexcept { return file_index_; }
+
+   private:
+    std::string output_dir_;
+    int64_t chunk_size_bytes_ = 0;
+    IpcCompression compression_ = DEFAULT_ARROW_IPC_COMPRESSION;
+
+    IpcWriter writer_;
+    bool is_open_ = false;
+    size_t file_index_ = 0;
+
+    int64_t current_file_bytes_ = 0;
+    int64_t current_file_rows_ = 0;
+    int64_t total_bytes_ = 0;
+    int64_t total_rows_ = 0;
+
+    std::vector<std::string> files_;
+    std::vector<int64_t> row_counts_;
+
+    std::string generate_filename() const;
+    coro::CoroTask<int> rotate_file();
+    int64_t calculate_uncompressed_size(ArrowExportResult& batch);
+};
+
+}  // namespace dftracer::utils::utilities::common::arrow
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+#endif  // DFTRACER_UTILS_UTILITIES_COMMON_ARROW_PARTITION_WRITER_H
diff --git a/include/dftracer/utils/utilities/common/json/json.h b/include/dftracer/utils/utilities/common/json/json.h
index 1ece20b3..1dfa622b 100644
--- a/include/dftracer/utils/utilities/common/json/json.h
+++ b/include/dftracer/utils/utilities/common/json/json.h
@@ -5,7 +5,8 @@
  * @file json.h
  * @brief Common JSON utilities for the dftracer-utils library.
  *
- * Provides JsonValue - a lightweight zero-cost wrapper around yyjson_val*.
+ * Provides JsonValue - a lightweight zero-cost wrapper around simdjson DOM
+ * elements.
  */
 
 #include <dftracer/utils/utilities/common/json/json_value.h>
@@ -14,10 +15,9 @@
 
 namespace dftracer::utils::utilities::common::json {
 
-/// Stack buffer size for yyjson_alc_pool used in per-line JSON parsing.
-/// 4KB is sufficient for typical trace events (few hundred bytes each).
-/// If a line exceeds this, yyjson silently falls back to malloc.
-inline constexpr std::size_t YYJSON_LINE_POOL_SIZE = 4096;
+/// Default capacity for simdjson parser buffer.
+/// 1MB is sufficient for most JSON documents.
+inline constexpr std::size_t SIMDJSON_DEFAULT_CAPACITY = 1 << 20;
 
 }  // namespace dftracer::utils::utilities::common::json
 
diff --git a/include/dftracer/utils/utilities/common/json/json_doc_guard.h b/include/dftracer/utils/utilities/common/json/json_doc_guard.h
index 3debb463..2c5a0431 100644
--- a/include/dftracer/utils/utilities/common/json/json_doc_guard.h
+++ b/include/dftracer/utils/utilities/common/json/json_doc_guard.h
@@ -1,37 +1,60 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMMON_JSON_JSON_DOC_GUARD_H
 #define DFTRACER_UTILS_UTILITIES_COMMON_JSON_JSON_DOC_GUARD_H
 
-#include <yyjson.h>
+#include <simdjson.h>
+
+#include <string>
 
 namespace dftracer::utils::utilities::common::json {
 
-/// RAII guard for yyjson_doc to prevent leaks on exceptions or
-/// early co_return from coroutines.
+/// Owns a simdjson DOM parser; the parsed document is stored inside the
+/// parser, so root() is only meaningful after parse() has returned true.
 struct JsonDocGuard {
-    yyjson_doc* doc = nullptr;
+    simdjson::dom::parser parser;
+    bool valid = false;
 
-    explicit JsonDocGuard(yyjson_doc* d) : doc(d) {}
-    ~JsonDocGuard() {
-        if (doc) yyjson_doc_free(doc);
-    }
+    JsonDocGuard() = default;
 
-    JsonDocGuard(const JsonDocGuard&) = delete;
-    JsonDocGuard& operator=(const JsonDocGuard&) = delete;
-    JsonDocGuard(JsonDocGuard&& other) noexcept : doc(other.doc) {
-        other.doc = nullptr;
-    }
-    JsonDocGuard& operator=(JsonDocGuard&& other) noexcept {
-        if (this != &other) {
-            if (doc) yyjson_doc_free(doc);
-            doc = other.doc;
-            other.doc = nullptr;
-        }
-        return *this;
+    bool parse(const char* data, std::size_t len) {
+        auto result = parser.parse(data, len);
+        valid = !result.error();
+        return valid;
     }
 
-    explicit operator bool() const { return doc != nullptr; }
+    simdjson::dom::element root() const { return parser.doc.root(); }
+
+    explicit operator bool() const { return valid; }
 };
 
+/// Render an On-Demand scalar as text for bloom filter insertion.
+/// Strings, numbers and booleans are handled; any other type yields "".
+inline std::string ondemand_value_to_string(simdjson::ondemand::value& val) {
+    auto kind = val.type();
+    if (kind.error()) return {};
+
+    if (kind.value() == simdjson::ondemand::json_type::string) {
+        auto text = val.get_string();
+        if (text.error()) return {};
+        return std::string(text.value());
+    }
+    if (kind.value() == simdjson::ondemand::json_type::number) {
+        // Integer readings are attempted before falling back to double.
+        auto as_u64 = val.get_uint64();
+        if (!as_u64.error()) return std::to_string(as_u64.value());
+        auto as_i64 = val.get_int64();
+        if (!as_i64.error()) return std::to_string(as_i64.value());
+        auto as_f64 = val.get_double();
+        if (!as_f64.error()) return std::to_string(as_f64.value());
+        return {};
+    }
+    if (kind.value() == simdjson::ondemand::json_type::boolean) {
+        auto flag = val.get_bool();
+        if (flag.error()) return {};
+        return flag.value() ? "true" : "false";
+    }
+    return {};
+}
+
 }  // namespace dftracer::utils::utilities::common::json
 
 #endif  // DFTRACER_UTILS_UTILITIES_COMMON_JSON_JSON_DOC_GUARD_H
diff --git a/include/dftracer/utils/utilities/common/json/json_value.h b/include/dftracer/utils/utilities/common/json/json_value.h
index 8f6d5489..8f051c0a 100644
--- a/include/dftracer/utils/utilities/common/json/json_value.h
+++ b/include/dftracer/utils/utilities/common/json/json_value.h
@@ -4,7 +4,7 @@
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/utilities/utility.h>
 #include <dftracer/utils/utilities/text/shared.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <cstdint>
 #include <cstring>
@@ -18,35 +18,45 @@
 namespace dftracer::utils::utilities::common::json {
 
 /**
- * Lightweight zero-cost wrapper around yyjson_val* with convenient accessors.
+ * Lightweight wrapper around simdjson::dom::element with convenient accessors.
  *
- * Provides pure lazy evaluation:
+ * Provides:
  * - Fluent chaining: json["args"]["hhash"]
  * - Template get<T>() with auto-casting
  * - Default values for missing/null fields
- * - Zero overhead - just pointer navigation
+ * - Zero overhead - just element navigation
  *
- * IMPORTANT: JsonValue is only valid while the yyjson_doc is alive.
+ * IMPORTANT: JsonValue is only valid while the simdjson::dom::document is
+ * alive.
  */
 class JsonValue {
    private:
-    yyjson_val* val_;
+    simdjson::dom::element elem_;
+    bool valid_ = false;
 
    public:
-    explicit JsonValue(yyjson_val* val = nullptr) : val_(val) {}
-
-    bool is_null() const { return !val_ || yyjson_is_null(val_); }
-    bool is_bool() const { return val_ && yyjson_is_bool(val_); }
-    bool is_string() const { return val_ && yyjson_is_str(val_); }
-    bool is_uint() const { return val_ && yyjson_is_uint(val_); }
-    bool is_int() const { return val_ && yyjson_is_int(val_); }
-    bool is_number() const { return val_ && yyjson_is_num(val_); }
-    bool is_object() const { return val_ && yyjson_is_obj(val_); }
-    bool is_array() const { return val_ && yyjson_is_arr(val_); }
-    bool exists() const { return val_ != nullptr; }
+    JsonValue() : valid_(false) {}
+    explicit JsonValue(simdjson::dom::element elem)
+        : elem_(elem), valid_(true) {}
+
+    bool is_null() const { return !valid_ || elem_.is_null(); }
+    bool is_bool() const { return valid_ && elem_.is_bool(); }
+    bool is_string() const { return valid_ && elem_.is_string(); }
+    bool is_uint() const { return valid_ && elem_.is_uint64(); }
+    bool is_int() const { return valid_ && elem_.is_int64(); }
+    bool is_number() const {
+        return valid_ &&
+               (elem_.is_int64() || elem_.is_uint64() || elem_.is_double());
+    }
+    bool is_object() const { return valid_ && elem_.is_object(); }
+    bool is_array() const { return valid_ && elem_.is_array(); }
+    bool exists() const { return valid_; }
 
     JsonValue operator[](const char* key) const {
-        return JsonValue(val_ ? yyjson_obj_get(val_, key) : nullptr);
+        if (!valid_ || !elem_.is_object()) return JsonValue();
+        auto result = elem_[key];
+        if (result.error()) return JsonValue();
+        return JsonValue(result.value_unsafe());
     }
 
     JsonValue operator[](const std::string& key) const {
@@ -54,8 +64,10 @@ class JsonValue {
     }
 
     JsonValue operator[](std::string_view key) const {
-        std::string key_str(key);
-        return (*this)[key_str.c_str()];
+        if (!valid_ || !elem_.is_object()) return JsonValue();
+        auto result = elem_[key];
+        if (result.error()) return JsonValue();
+        return JsonValue(result.value_unsafe());
     }
 
     JsonValue at(const char* path) const;
@@ -64,49 +76,43 @@ class JsonValue {
 
     template <typename T>
     T get(const T& default_val = T{}) const {
+        if (!valid_) return default_val;
+
         if constexpr (std::is_same_v<T, bool>) {
-            return val_ && yyjson_is_bool(val_) ? yyjson_get_bool(val_)
-                                                : default_val;
+            auto r = elem_.get_bool();
+            return r.error() ? default_val : r.value_unsafe();
         } else if constexpr (std::is_same_v<T, std::string>) {
-            return (val_ && yyjson_is_str(val_))
-                       ? std::string(yyjson_get_str(val_))
-                       : default_val;
+            auto r = elem_.get_string();
+            return r.error() ? default_val : std::string(r.value_unsafe());
         } else if constexpr (std::is_same_v<T, std::string_view>) {
-            if (val_ && yyjson_is_str(val_)) {
-                const char* str = yyjson_get_str(val_);
-                std::size_t len = yyjson_get_len(val_);
-                return std::string_view(str, len);
-            }
-            return default_val;
+            auto r = elem_.get_string();
+            return r.error() ? default_val : r.value_unsafe();
         } else if constexpr (std::is_same_v<T, const char*>) {
-            return (val_ && yyjson_is_str(val_)) ? yyjson_get_str(val_)
-                                                 : default_val;
+            auto r = elem_.get_c_str();
+            return r.error() ? default_val : r.value_unsafe();
         } else if constexpr (std::is_same_v<T, std::uint64_t>) {
-            if (!val_) return default_val;
-            if (yyjson_is_uint(val_)) return yyjson_get_uint(val_);
-            if (yyjson_is_int(val_)) {
-                auto v = yyjson_get_int(val_);
-                return v >= 0 ? static_cast<std::uint64_t>(v) : default_val;
-            }
+            auto r = elem_.get_uint64();
+            if (!r.error()) return r.value_unsafe();
+            auto ri = elem_.get_int64();
+            if (!ri.error() && ri.value_unsafe() >= 0)
+                return static_cast<std::uint64_t>(ri.value_unsafe());
             return default_val;
         } else if constexpr (std::is_same_v<T, std::int64_t>) {
-            if (!val_) return default_val;
-            if (yyjson_is_int(val_)) return yyjson_get_int(val_);
-            if (yyjson_is_uint(val_)) {
-                auto v = yyjson_get_uint(val_);
-                return v <= static_cast<uint64_t>(
-                                std::numeric_limits<int64_t>::max())
-                           ? static_cast<std::int64_t>(v)
-                           : default_val;
-            }
+            auto r = elem_.get_int64();
+            if (!r.error()) return r.value_unsafe();
+            auto ru = elem_.get_uint64();
+            if (!ru.error() &&
+                ru.value_unsafe() <=
+                    static_cast<uint64_t>(std::numeric_limits<int64_t>::max()))
+                return static_cast<std::int64_t>(ru.value_unsafe());
             return default_val;
         } else if constexpr (std::is_same_v<T, double>) {
-            if (!val_) return default_val;
-            if (yyjson_is_real(val_)) return yyjson_get_real(val_);
-            if (yyjson_is_int(val_))
-                return static_cast<double>(yyjson_get_int(val_));
-            if (yyjson_is_uint(val_))
-                return static_cast<double>(yyjson_get_uint(val_));
+            auto r = elem_.get_double();
+            if (!r.error()) return r.value_unsafe();
+            auto ri = elem_.get_int64();
+            if (!ri.error()) return static_cast<double>(ri.value_unsafe());
+            auto ru = elem_.get_uint64();
+            if (!ru.error()) return static_cast<double>(ru.value_unsafe());
             return default_val;
         } else if constexpr (std::is_same_v<T, float>) {
             return static_cast<float>(
@@ -126,56 +132,61 @@ class JsonValue {
 
     template <typename T>
     std::optional<T> get_optional() const {
-        if (!val_) return std::nullopt;
+        if (!valid_) return std::nullopt;
 
         if constexpr (std::is_same_v<T, std::string>) {
-            return yyjson_is_str(val_)
-                       ? std::optional(std::string(yyjson_get_str(val_)))
-                       : std::nullopt;
+            auto r = elem_.get_string();
+            return r.error() ? std::nullopt
+                             : std::optional(std::string(r.value_unsafe()));
         } else if constexpr (std::is_same_v<T, std::string_view>) {
-            if (yyjson_is_str(val_)) {
-                const char* str = yyjson_get_str(val_);
-                std::size_t len = yyjson_get_len(val_);
-                return std::optional(std::string_view(str, len));
-            }
-            return std::nullopt;
+            auto r = elem_.get_string();
+            return r.error() ? std::nullopt : std::optional(r.value_unsafe());
         } else if constexpr (std::is_same_v<T, const char*>) {
-            return yyjson_is_str(val_) ? std::optional(yyjson_get_str(val_))
-                                       : std::nullopt;
+            auto r = elem_.get_c_str();
+            return r.error() ? std::nullopt : std::optional(r.value_unsafe());
         } else if constexpr (std::is_same_v<T, std::uint64_t>) {
-            if (yyjson_is_uint(val_)) return yyjson_get_uint(val_);
-            if (yyjson_is_int(val_)) {
-                auto v = yyjson_get_int(val_);
-                return v >= 0 ? std::optional(static_cast<std::uint64_t>(v))
-                              : std::nullopt;
-            }
+            auto r = elem_.get_uint64();
+            if (!r.error()) return r.value_unsafe();
+            auto ri = elem_.get_int64();
+            if (!ri.error() && ri.value_unsafe() >= 0)
+                return static_cast<std::uint64_t>(ri.value_unsafe());
             return std::nullopt;
         } else if constexpr (std::is_same_v<T, std::int64_t>) {
-            if (yyjson_is_int(val_)) return yyjson_get_int(val_);
-            return std::nullopt;
+            auto r = elem_.get_int64();
+            return r.error() ? std::nullopt : std::optional(r.value_unsafe());
         } else if constexpr (std::is_same_v<T, double>) {
-            if (yyjson_is_real(val_)) return yyjson_get_real(val_);
-            if (yyjson_is_int(val_))
-                return static_cast<double>(yyjson_get_int(val_));
-            if (yyjson_is_uint(val_))
-                return static_cast<double>(yyjson_get_uint(val_));
+            auto r = elem_.get_double();
+            if (!r.error()) return r.value_unsafe();
+            auto ri = elem_.get_int64();
+            if (!ri.error()) return static_cast<double>(ri.value_unsafe());
+            auto ru = elem_.get_uint64();
+            if (!ru.error()) return static_cast<double>(ru.value_unsafe());
             return std::nullopt;
         } else if constexpr (std::is_same_v<T, bool>) {
-            return yyjson_is_bool(val_) ? std::optional(yyjson_get_bool(val_))
-                                        : std::nullopt;
+            auto r = elem_.get_bool();
+            return r.error() ? std::nullopt : std::optional(r.value_unsafe());
         } else {
             static_assert(!sizeof(T),
                           "Unsupported type for JsonValue::get_optional<T>()");
         }
     }
 
-    yyjson_val* raw() const { return val_; }
-    explicit operator yyjson_val*() const { return val_; }
+    template <typename Fn>
+    void for_each_member(Fn&& fn) const {
+        // Calls fn(key, JsonValue) once per member; no-op for non-objects.
+        if (!(valid_ && elem_.is_object())) return;
+        auto members = elem_.get_object();
+        if (members.error()) return;
+        for (auto member : members.value_unsafe())
+            fn(member.key, JsonValue(member.value));
+    }
+
+    simdjson::dom::element raw() const { return elem_; }
     explicit operator bool() const { return exists(); }
 };
 
-using JsonParserInput = yyjson_val*;
 using JsonParserOutput = JsonValue;
+using JsonParserInput = simdjson::dom::element;
 
 class JsonParserUtility
     : public utilities::Utility<JsonParserInput, JsonParserOutput> {
@@ -199,7 +210,8 @@ class StringJsonParserUtility
     : public utilities::Utility<StringJsonParserInput, JsonParserOutput> {
    private:
     utilities::text::Text content_;
-    std::shared_ptr<yyjson_doc> owned_doc_;
+    simdjson::dom::parser parser_;
+    simdjson::dom::document doc_;
 
    public:
     coro::CoroTask<JsonParserOutput> process(
diff --git a/include/dftracer/utils/utilities/common/json/parser.h b/include/dftracer/utils/utilities/common/json/parser.h
new file mode 100644
index 00000000..3907bd7f
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/json/parser.h
@@ -0,0 +1,241 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_JSON_PARSER_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_JSON_PARSER_H
+
+#include <simdjson.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <string_view>
+
+namespace dftracer::utils::utilities::common::json {
+
+/**
+ * @brief On-Demand JSON parser for zero-copy parsing.
+ *
+ * Key design principles:
+ * 1. On-Demand API for lazy field access - only parses what you use
+ * 2. Parser is reused across rows (internal buffer management)
+ * 3. No per-value allocation: returned string_views alias existing buffers
+ * 4. Forward-only iteration: once a field is accessed, it's consumed
+ *
+ * Usage pattern for batch processing:
+ * @code
+ *   JsonParser parser;
+ *
+ *   for (auto& line : input_lines) {
+ *       // parse() copies to internal padded buffer
+ *       if (!parser.parse(line)) continue;
+ *
+ *       // Access fields directly from parser
+ *       auto name = parser.get_string("name");
+ *       auto ts = parser.get_int64("ts");
+ *
+ *       // Iterate over 'args' object
+ *       parser.for_each_field("args", [](std::string_view key, auto& val) {
+ *           // process nested fields
+ *       });
+ *   }
+ * @endcode
+ *
+ * @note string_view values are only valid until the next parse() call.
+ */
+class JsonParser {
+   public:
+    static constexpr std::size_t DEFAULT_CAPACITY = 1 << 20;  // 1MB
+
+    explicit JsonParser(std::size_t capacity = DEFAULT_CAPACITY);
+
+    // Copying is disabled; moves are compiler-generated.
+    // NOTE(review): active_ is a document_reference. If parse() binds it to
+    // doc_, a defaulted move would leave it referring to the moved-from
+    // object's document - confirm, or re-parse after moving a parser.
+    JsonParser(const JsonParser&) = delete;
+    JsonParser& operator=(const JsonParser&) = delete;
+    JsonParser(JsonParser&&) = default;
+    JsonParser& operator=(JsonParser&&) = default;
+
+    /**
+     * @brief Parse a JSON line.
+     *
+     * Copies the input to an internal padded buffer for SIMD processing.
+     * Previous parse results become invalid after this call.
+     *
+     * @param json_line The JSON string to parse.
+     * @return true on success, false on parse error.
+     */
+    bool parse(std::string_view json_line);
+
+    /// @brief Parse from pre-padded string (avoids copy).
+    bool parse_padded(simdjson::padded_string_view json);
+
+    /// @brief Reports valid_, which set_borrowed_document() also sets to true.
+    bool is_valid() const { return valid_; }
+
+    // Direct field access from root object
+    // Returns nullopt if field missing or wrong type
+
+    std::optional<std::int64_t> get_int64(std::string_view key);
+    std::optional<std::uint64_t> get_uint64(std::string_view key);
+    std::optional<double> get_double(std::string_view key);
+    std::optional<bool> get_bool(std::string_view key);
+    std::optional<std::string_view> get_string(std::string_view key);
+
+    /**
+     * @brief Iterate over all fields in the root object.
+     *
+     * @param fn Callback: void(std::string_view key, simdjson::ondemand::value
+     * val)
+     *
+     * @note This consumes the document. After calling, field access methods
+     *       will return nullopt. Call parse() again to re-parse.
+     */
+    template <typename Fn>
+    void for_each_field(Fn&& fn);
+
+    /**
+     * @brief Iterate over fields of a nested object.
+     *
+     * @param object_key The field containing the nested object.
+     * @param fn Callback: void(std::string_view key, simdjson::ondemand::value
+     * val)
+     * @return true if object found and iterated, false otherwise.
+     */
+    template <typename Fn>
+    bool for_each_field(std::string_view object_key, Fn&& fn);
+
+    /**
+     * @brief Rewind document for re-iteration.
+     *
+     * After accessing fields, the document position advances. Call this
+     * to reset to the beginning for another pass.
+     */
+    void rewind();
+
+    /// @brief Get the owned raw document for advanced usage.
+    /// Always returns doc_; it does not follow a document installed through
+    /// set_borrowed_document(), which only rebinds active_.
+    simdjson::ondemand::document& raw_document() { return doc_; }
+
+    /**
+     * @brief Borrow an externally-owned parsed document.
+     *
+     * After this call, for_each_field/rewind/get_* operate on the borrowed
+     * reference. The caller must keep the underlying document alive until
+     * another parse() / set_borrowed_document() call. Intended for bridging
+     * iterate_many output (document_reference) to consumers that accept a
+     * JsonParser&.
+     */
+    void set_borrowed_document(
+        simdjson::ondemand::document_reference ref) noexcept {
+        active_ = ref;
+        valid_ = true;
+    }
+
+   private:
+    simdjson::ondemand::parser parser_;
+    simdjson::padded_string padded_json_;
+    simdjson::ondemand::document doc_;
+    simdjson::ondemand::document_reference active_;
+    bool valid_ = false;
+};
+
+// Template implementations
+
+template <typename Fn>
+void JsonParser::for_each_field(Fn&& fn) {
+    // Walks the members of the active document's root object in order.
+    // A member whose key or value fails to decode is skipped.
+    if (!valid_) return;
+
+    auto root = active_.get_object();
+    if (root.error()) return;
+
+    for (auto member : root.value()) {
+        if (member.error()) continue;
+
+        auto name = member.unescaped_key();
+        if (!name.error()) {
+            auto payload = member.value();
+            if (!payload.error()) fn(name.value(), payload.value());
+        }
+    }
+}
+
+template <typename Fn>
+bool JsonParser::for_each_field(std::string_view object_key, Fn&& fn) {
+    // Returns false when no valid document is held or `object_key` does
+    // not resolve to an object; returns true once the members were walked.
+    if (!valid_) return false;
+
+    auto nested = active_[object_key].get_object();
+    if (nested.error()) return false;
+
+    for (auto member : nested.value()) {
+        if (member.error()) continue;
+
+        auto name = member.unescaped_key();
+        if (!name.error()) {
+            auto payload = member.value();
+            if (!payload.error()) fn(name.value(), payload.value());
+        }
+    }
+    return true;
+}
+
+/**
+ * @brief Typed extraction helpers for simdjson::ondemand::value.
+ *
+ * Meant for for_each_field callbacks; a failed read yields std::nullopt.
+ */
+struct JsonValueHelper {
+    static std::optional<std::int64_t> get_int64(
+        simdjson::ondemand::value& val) {
+        if (auto r = val.get_int64(); !r.error()) return r.value();
+        return std::nullopt;
+    }
+
+    static std::optional<std::uint64_t> get_uint64(
+        simdjson::ondemand::value& val) {
+        if (auto r = val.get_uint64(); !r.error()) return r.value();
+        return std::nullopt;
+    }
+
+    static std::optional<double> get_double(simdjson::ondemand::value& val) {
+        if (auto r = val.get_double(); !r.error()) return r.value();
+        return std::nullopt;
+    }
+
+    static std::optional<bool> get_bool(simdjson::ondemand::value& val) {
+        if (auto r = val.get_bool(); !r.error()) return r.value();
+        return std::nullopt;
+    }
+
+    static std::optional<std::string_view> get_string(
+        simdjson::ondemand::value& val) {
+        if (auto r = val.get_string(); !r.error()) return r.value();
+        return std::nullopt;
+    }
+
+    static bool is_null(simdjson::ondemand::value& val) {
+        auto r = val.is_null();
+        return !r.error() && r.value();
+    }
+
+    static std::optional<simdjson::ondemand::json_type> get_type(
+        simdjson::ondemand::value& val) {
+        if (auto r = val.type(); !r.error()) return r.value();
+        return std::nullopt;
+    }
+
+    static std::optional<std::string> to_json_string(
+        simdjson::ondemand::value& val) {
+        auto s = simdjson::to_json_string(val);
+        return s.error() ? std::nullopt : std::optional(std::string(s.value()));
+    }
+};
+
+}  // namespace dftracer::utils::utilities::common::json
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMMON_JSON_PARSER_H
diff --git a/include/dftracer/utils/utilities/common/query/ast.h b/include/dftracer/utils/utilities/common/query/ast.h
index 2f8e3c03..b76aa97d 100644
--- a/include/dftracer/utils/utilities/common/query/ast.h
+++ b/include/dftracer/utils/utilities/common/query/ast.h
@@ -1,6 +1,8 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMMON_QUERY_AST_H
 #define DFTRACER_UTILS_UTILITIES_COMMON_QUERY_AST_H
 
+#include <dftracer/utils/core/common/transparent_string_hash.h>
+
 #include <cstdint>
 #include <memory>
 #include <string>
@@ -92,6 +94,9 @@ const char* compare_op_str(CompareOp op);
 /// Serialize an AST back to query DSL string.
 std::string to_string(const QueryNode& node);
 
+/// Collect all field names referenced in a query AST.
+dftracer::utils::StringViewSet collect_fields(const QueryNode& node);
+
 }  // namespace dftracer::utils::utilities::common::query
 
 #endif  // DFTRACER_UTILS_UTILITIES_COMMON_QUERY_AST_H
diff --git a/include/dftracer/utils/utilities/common/query/evaluator.h b/include/dftracer/utils/utilities/common/query/evaluator.h
index f4249a9a..aa33c682 100644
--- a/include/dftracer/utils/utilities/common/query/evaluator.h
+++ b/include/dftracer/utils/utilities/common/query/evaluator.h
@@ -1,12 +1,10 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMMON_QUERY_EVALUATOR_H
 #define DFTRACER_UTILS_UTILITIES_COMMON_QUERY_EVALUATOR_H
 
+#include <dftracer/utils/core/common/transparent_string_hash.h>
 #include <dftracer/utils/utilities/common/json/json_value.h>
 #include <dftracer/utils/utilities/common/query/ast.h>
 
-#include <string>
-#include <unordered_map>
-
 namespace dftracer::utils::utilities::common::query {
 
 using json::JsonValue;
@@ -15,7 +13,7 @@ using json::JsonValue;
 bool evaluate(const QueryNode& node, const JsonValue& event);
 
 /// Typed key-value map for non-JSON evaluation contexts.
-using ValueMap = std::unordered_map<std::string, LiteralValue>;
+using ValueMap = dftracer::utils::StringViewMap<LiteralValue>;
 
 /// Evaluate against a typed key-value map.
 /// Missing fields evaluate to false.
diff --git a/include/dftracer/utils/utilities/common/query/query.h b/include/dftracer/utils/utilities/common/query/query.h
index 2c9938bf..d60af516 100644
--- a/include/dftracer/utils/utilities/common/query/query.h
+++ b/include/dftracer/utils/utilities/common/query/query.h
@@ -1,6 +1,7 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMMON_QUERY_QUERY_H
 #define DFTRACER_UTILS_UTILITIES_COMMON_QUERY_QUERY_H
 
+#include <dftracer/utils/core/common/transparent_string_hash.h>
 #include <dftracer/utils/utilities/common/query/ast.h>
 #include <dftracer/utils/utilities/common/query/evaluator.h>
 #include <dftracer/utils/utilities/common/query/parser.h>
@@ -33,13 +34,21 @@ class Query {
     const std::string& source() const { return source_; }
     /// Serialize AST back to query DSL string.
     std::string to_string() const;
+    /// Fields referenced by this query, precomputed at construction.
+    const dftracer::utils::StringViewSet& fields() const { return fields_; }
+    bool references(std::string_view field) const {
+        return fields_.count(field) > 0;
+    }
 
    private:
     Query(QueryNodePtr root, std::string source)
-        : root_(std::move(root)), source_(std::move(source)) {}
+        : root_(std::move(root)),
+          source_(std::move(source)),
+          fields_(collect_fields(*root_)) {}
 
     QueryNodePtr root_;
     std::string source_;
+    dftracer::utils::StringViewSet fields_;
 };
 
 /// Parse a query string, throwing QueryParseError on failure.
diff --git a/include/dftracer/utils/utilities/common/serialization/binary_codec.h b/include/dftracer/utils/utilities/common/serialization/binary_codec.h
new file mode 100644
index 00000000..bd7652f0
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/serialization/binary_codec.h
@@ -0,0 +1,210 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_SERIALIZATION_BINARY_CODEC_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_SERIALIZATION_BINARY_CODEC_H
+
+#include <cstdint>
+#include <cstring>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+
+namespace dftracer::utils::utilities::common::serialization {
+
+// =============================================================================
+// Binary Writer Utilities
+// =============================================================================
+
+inline void put_u8(std::string& out, std::uint8_t v) {
+    out += static_cast<char>(v);
+}
+
+inline void put_be16(std::string& out, std::uint16_t v) {
+    const char bytes[2] = {static_cast<char>(v >> 8), static_cast<char>(v)};
+    out.append(bytes, sizeof bytes);
+}
+
+inline void put_be32(std::string& out, std::uint32_t v) {
+    // Most significant byte first.
+    for (int shift = 24; shift >= 0; shift -= 8) {
+        out.push_back(static_cast<char>(v >> shift));
+    }
+}
+
+inline void put_be64(std::string& out, std::uint64_t v) {
+    // Emit the eight bytes from most significant to least significant;
+    // the cast to char keeps only the low 8 bits of each shifted value.
+    char bytes[8];
+    for (int i = 0; i < 8; ++i) {
+        const int shift = 8 * (7 - i);
+        bytes[i] = static_cast<char>(v >> shift);
+    }
+    out.append(bytes, sizeof bytes);
+}
+
+inline void put_double(std::string& out, double v) {
+    std::uint64_t raw = 0;  // bit pattern of v, emitted big-endian
+    std::memcpy(&raw, &v, sizeof raw);
+    put_be64(out, raw);
+}
+
+inline void put_str(std::string& out, std::string_view s) {
+    put_be16(out, static_cast<std::uint16_t>(s.size()));  // len mod 2^16
+    out.append(s.data(), s.size());
+}
+
+inline void put_varint(std::string& out, std::uint64_t v) {
+    // 7 payload bits per byte, low group first; high bit = "more follows".
+    for (; v >= 0x80; v >>= 7) {
+        out.push_back(static_cast<char>(v | 0x80));
+    }
+    out.push_back(static_cast<char>(v));
+}
+
+inline void put_blob(std::string& out, std::span<const std::uint8_t> data) {
+    put_be32(out, static_cast<std::uint32_t>(data.size()));  // len mod 2^32
+    out.append(reinterpret_cast<const char*>(data.data()), data.size());
+}
+
+// =============================================================================
+// Raw Pointer Writer Utilities (for pre-sized buffers)
+// =============================================================================
+
+inline char* write_varint(char* p, std::uint64_t v) {
+    // 7 payload bits per byte, low group first; high bit = "more follows".
+    for (; v >= 0x80; v >>= 7) {
+        *p++ = static_cast<char>(v | 0x80);
+    }
+    *p++ = static_cast<char>(v);
+    return p;
+}
+
+inline char* write_be16(char* p, std::uint16_t v) {
+    *p++ = static_cast<char>(v >> 8);  // high byte
+    *p++ = static_cast<char>(v);       // low byte
+    return p;
+}
+
+inline char* write_be32(char* p, std::uint32_t v) {
+    // Big-endian: the most significant byte lands at the lowest address.
+    for (int shift = 24; shift >= 0; shift -= 8) {
+        *p++ = static_cast<char>(v >> shift);
+    }
+    return p;
+}
+
+inline char* write_be64(char* p, std::uint64_t v) {
+    // Big-endian: the most significant byte lands at the lowest address.
+    // Returns the position one past the eight bytes written.
+    constexpr int kBytes = 8;
+
+    for (int i = 0; i < kBytes; ++i) {
+        const int shift = 8 * (kBytes - 1 - i);
+        p[i] = static_cast<char>(v >> shift);
+    }
+    return p + kBytes;
+}
+
+inline char* write_double(char* p, double v) {
+    std::uint64_t raw = 0;  // bit pattern of v, written big-endian
+    std::memcpy(&raw, &v, sizeof raw);
+    return write_be64(p, raw);
+}
+
+inline char* write_str(char* p, std::string_view s) {
+    // Layout: u16 big-endian length (s.size() mod 2^16), then the raw bytes.
+    *p++ = static_cast<char>(s.size() >> 8);
+    *p++ = static_cast<char>(s.size());
+    // An empty view may carry a null data(); memcpy must never be given it.
+    if (!s.empty()) std::memcpy(p, s.data(), s.size());
+    return p + s.size();
+}
+
+// =============================================================================
+// Binary Reader Class
+// =============================================================================
+
+class BinaryReader {
+   public:
+    /// Bounds-checked cursor over `data`. Only the view is stored, so the
+    /// bytes must outlive the reader; short reads throw std::runtime_error.
+    explicit BinaryReader(std::string_view data) : data_(data) {}
+
+    std::uint8_t u8() { return static_cast<std::uint8_t>(take(1)[0]); }
+
+    std::uint16_t be16() { return static_cast<std::uint16_t>(be(2)); }
+
+    std::uint32_t be32() { return static_cast<std::uint32_t>(be(4)); }
+
+    std::uint64_t be64() { return be(8); }
+
+    double f64() {
+        std::uint64_t bits = be64();
+        double v;
+        std::memcpy(&v, &bits, 8);
+        return v;
+    }
+
+    /// Length-prefixed bytes: u32 big-endian length, then that many bytes.
+    std::string_view blob() {
+        auto len = be32();
+        return take(len);
+    }
+
+    /// Length-prefixed bytes: u16 big-endian length, then that many bytes.
+    std::string_view str() {
+        auto len = be16();
+        return take(len);
+    }
+
+    /// Decodes a base-128 varint (7 payload bits per byte, low group first).
+    /// @throws std::runtime_error if the input ends mid-value or the
+    ///         encoding continues past the 10 bytes a 64-bit value can use.
+    std::uint64_t varint() {
+        std::uint64_t v = 0;
+        for (unsigned shift = 0; off_ < data_.size(); shift += 7) {
+            // A shift of 64 or more on a 64-bit value is undefined, so an
+            // over-long encoding is rejected instead of being decoded.
+            if (shift >= 64) {
+                throw std::runtime_error("binary_codec: varint too long");
+            }
+            auto b = static_cast<std::uint8_t>(data_[off_++]);
+            v |= static_cast<std::uint64_t>(b & 0x7F) << shift;
+            if ((b & 0x80) == 0) return v;
+        }
+        throw std::runtime_error("binary_codec: truncated varint");
+    }
+
+    bool has_remaining() const { return off_ < data_.size(); }
+
+    std::string_view remaining() const { return data_.substr(off_); }
+
+    std::size_t offset() const { return off_; }
+
+   private:
+    /// Consumes exactly `n` bytes, or throws without advancing.
+    std::string_view take(std::size_t n) {
+        // Compare with the bytes left so `off_ + n` can never wrap around.
+        if (n > data_.size() - off_) {
+            throw std::runtime_error("binary_codec: truncated data");
+        }
+        auto s = data_.substr(off_, n);
+        off_ += n;
+        return s;
+    }
+
+    /// Folds the next `n` (at most 8) bytes into an integer, first byte high.
+    std::uint64_t be(std::size_t n) {
+        std::uint64_t v = 0;
+        for (const char c : take(n)) {
+            v = (v << 8) | static_cast<std::uint8_t>(c);
+        }
+        return v;
+    }
+
+    std::string_view data_;
+    std::size_t off_ = 0;
+};
+
+}  // namespace dftracer::utils::utilities::common::serialization
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMMON_SERIALIZATION_BINARY_CODEC_H
diff --git a/include/dftracer/utils/utilities/common/statistics/log2_histogram.h b/include/dftracer/utils/utilities/common/statistics/log2_histogram.h
index 9cb9da9e..262ce62c 100644
--- a/include/dftracer/utils/utilities/common/statistics/log2_histogram.h
+++ b/include/dftracer/utils/utilities/common/statistics/log2_histogram.h
@@ -5,10 +5,6 @@
 #include <cstdint>
 #include <string>
 
-// Forward declaration for direct JSON serialization
-struct yyjson_mut_doc;
-struct yyjson_mut_val;
-
 namespace dftracer::utils::utilities::common::statistics {
 
 /**
@@ -36,9 +32,6 @@ class Log2Histogram {
     std::string to_json() const;
     static Log2Histogram from_json(const std::string& json);
 
-    /// Serialize directly to yyjson mutable array (avoids string roundtrip)
-    yyjson_mut_val* to_yyjson(yyjson_mut_doc* doc) const;
-
     std::uint64_t total_count() const { return total_count_; }
     const std::array<std::uint64_t, NUM_BINS>& bins() const { return bins_; }
 
diff --git a/include/dftracer/utils/utilities/common/statistics/timestamp_histogram.h b/include/dftracer/utils/utilities/common/statistics/timestamp_histogram.h
new file mode 100644
index 00000000..16cea6f0
--- /dev/null
+++ b/include/dftracer/utils/utilities/common/statistics/timestamp_histogram.h
@@ -0,0 +1,59 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_TIMESTAMP_HISTOGRAM_H
+#define DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_TIMESTAMP_HISTOGRAM_H
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+namespace dftracer::utils::utilities::common::statistics {
+
+class TimestampHistogram {
+   public:
+    static constexpr std::uint64_t BIN_WIDTH_US = 100'000;  // 100 ms, in us
+
+    TimestampHistogram() = default;  // no bins, total_count_ == 0
+
+    void add(std::uint64_t timestamp_us);
+    void merge(const TimestampHistogram& other);
+
+    std::uint64_t count_in_range(std::uint64_t ts_start_us,
+                                 std::uint64_t ts_end_us) const;
+    double selectivity(std::uint64_t ts_start_us,
+                       std::uint64_t ts_end_us) const;
+    std::vector<double> expansion_weights(std::uint64_t bucket_start_us,
+                                          std::uint64_t bucket_end_us,
+                                          std::size_t num_sub_buckets) const;
+
+    std::vector<std::uint8_t> serialize() const;
+    static TimestampHistogram deserialize(const std::uint8_t* data,
+                                          std::size_t len);
+
+    std::uint64_t total_count() const { return total_count_; }
+    bool empty() const { return bins_.empty(); }  // true when no bin is stored
+    std::size_t num_bins() const { return bins_.size(); }  // stored bins only
+
+    const std::vector<std::pair<std::uint64_t, std::uint64_t>>& bins() const {
+        return bins_;  // read-only view of the raw storage
+    }
+
+    static std::uint64_t bin_index(std::uint64_t timestamp_us) {
+        return timestamp_us / BIN_WIDTH_US;  // integer division rounds down
+    }
+
+    static std::uint64_t bin_start_us(std::uint64_t bin_idx) {
+        return bin_idx * BIN_WIDTH_US;  // first microsecond inside the bin
+    }
+
+    static std::uint64_t bin_end_us(std::uint64_t bin_idx) {
+        return (bin_idx + 1) * BIN_WIDTH_US;  // start of the following bin
+    }
+
+   private:
+    // Sorted by bin_index. Sparse: only non-zero bins stored.
+    std::vector<std::pair<std::uint64_t, std::uint64_t>> bins_;
+    std::uint64_t total_count_ = 0;
+};
+
+}  // namespace dftracer::utils::utilities::common::statistics
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMMON_STATISTICS_TIMESTAMP_HISTOGRAM_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h
new file mode 100644
index 00000000..38f3e0f5
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h
@@ -0,0 +1,33 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_AUGMENTATION_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_AUGMENTATION_H
+
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h>
+
+#include <cmath>
+#include <cstdint>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+struct AugmentationConfig {
+    std::uint64_t source_interval_us;  // interval stored in index
+    std::uint64_t target_interval_us;  // interval requested by user
+};
+
+// Augment a batch to match target interval.
+// - If source > target: expand (split buckets, approximate with CI)
+// - If source < target: shrink (merge buckets, lossless)
+// - If source == target: pass through
+AggregationBatch augment_batch(const AggregationBatch& input,
+                               const AugmentationConfig& config);
+
+// Poisson CI via normal approximation; the default z = 1.96 gives ~95%.
+inline CountConfidenceInterval compute_poisson_ci(double count,
+                                                  double confidence = 1.96) {
+    const double half_width = confidence * std::sqrt(count);
+    // std::fmax lives in <cmath>, which this header already includes.
+    return {std::fmax(0.0, count - half_width), count + half_width};
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_AUGMENTATION_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h
index 8b3d8d7a..142af76e 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h
@@ -1,6 +1,9 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_CONFIG_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_CONFIG_H
 
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/utilities/hash/fnv1a_hasher_utility.h>
+
 #include <cstdint>
 #include <string>
 #include <vector>
@@ -17,10 +20,13 @@ struct AggregationConfig {
     std::uint64_t time_interval_us = 1000000;
     bool use_relative_time = false;
     std::uint64_t reference_timestamp = 0;
+    bool normalize_time = false;  // Normalize time_bucket to 0-based time_range
 
     std::vector<std::string> extra_group_keys;
     std::vector<std::string> custom_metric_fields;
 
+    bool track_default_args = true;
+
     bool compute_statistics = true;
 
     bool compute_percentiles = false;
@@ -50,6 +56,46 @@ struct AggregationConfig {
 #endif
         return s;
     }
+
+    // Compute hash of config fields that affect aggregation output.
+    // Used to detect if cached aggregation data matches current config.
+    std::uint32_t compute_hash() const {
+        hash::Fnv1aHashBuilder h;
+        // Length-prefix lists and strings so adjacent fields cannot run together.
+        const auto add_len = [&h](std::size_t n) {
+            h.update_value(static_cast<std::uint64_t>(n));
+        };
+        const auto add_str = [&](const auto& s) {
+            add_len(s.size());
+            h.update(s);
+        };
+
+        h.update_value(time_interval_us);
+        h.update_value(use_relative_time);
+        if (use_relative_time) {
+            h.update_value(reference_timestamp);
+        }
+        add_len(extra_group_keys.size());
+        for (const auto& k : extra_group_keys) add_str(k);
+        add_len(custom_metric_fields.size());
+        for (const auto& m : custom_metric_fields) add_str(m);
+
+        h.update_value(track_default_args);
+        h.update_value(compute_statistics);
+        h.update_value(compute_percentiles);
+        if (compute_percentiles) {
+            h.update_value(sketch_accuracy);
+            for (double p : percentiles) h.update_value(p);
+        }
+        add_len(boundary_events.size());
+        for (const auto& be : boundary_events) {
+            add_str(be.event_name);
+            add_str(be.value_field);
+            add_str(be.output_name);
+        }
+        h.update_value(track_process_parents);
+        return h.finish32();
+    }
 };
 
 }  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h
new file mode 100644
index 00000000..f8663000
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h
@@ -0,0 +1,28 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_LOGIC_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_LOGIC_H
+
+#include <dftracer/utils/utilities/common/json/json_value.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_key.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_map.h>
+#include <dftracer/utils/utilities/composites/dft/event.h>
+
+#include <cstdint>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+std::uint64_t compute_time_bucket(std::uint64_t timestamp,
+                                  std::uint64_t duration,
+                                  const AggregationConfig& config);
+
+AggregationKey build_aggregation_key(const DFTracerEvent& ev,
+                                     const AggregationConfig& config);
+
+void update_aggregation_entry(const DFTracerEvent& ev,
+                              const AggregationConfig& config,
+                              AggregationMap& aggregations,
+                              const AggregationKey& key);
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_LOGIC_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h
new file mode 100644
index 00000000..e3bd4722
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h
@@ -0,0 +1,26 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_MERGE_OPERATOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_MERGE_OPERATOR_H
+
+#include <rocksdb/merge_operator.h>
+
+#include <string>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+class AggregationMergeOperator : public ::rocksdb::MergeOperator {
+   public:
+    bool FullMergeV2(const MergeOperationInput& merge_in,
+                     MergeOperationOutput* merge_out) const override;
+
+    bool PartialMerge(const ::rocksdb::Slice& key,
+                      const ::rocksdb::Slice& left_operand,
+                      const ::rocksdb::Slice& right_operand,
+                      std::string* new_value,
+                      ::rocksdb::Logger* logger) const override;
+
+    const char* Name() const override { return "AggregationMergeOperator"; }
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_MERGE_OPERATOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h
index a2147de6..e3fb9576 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h
@@ -16,6 +16,7 @@ namespace dftracer::utils::utilities::composites::dft::aggregators {
 using common::statistics::DDSketch;
 
 struct MetricStats {
+    std::uint64_t count = 0;
     std::uint64_t total = 0;
     std::uint64_t min = std::numeric_limits<std::uint64_t>::max();
     std::uint64_t max = 0;
@@ -30,7 +31,8 @@ struct MetricStats {
         : sketch_accuracy_(relative_accuracy) {}
 
     MetricStats(const MetricStats& other)
-        : total(other.total),
+        : count(other.count),
+          total(other.total),
           min(other.min),
           max(other.max),
           mean(other.mean),
@@ -43,6 +45,7 @@ struct MetricStats {
 
     MetricStats& operator=(const MetricStats& other) {
         if (this != &other) {
+            count = other.count;
             total = other.total;
             min = other.min;
             max = other.max;
@@ -60,13 +63,11 @@ struct MetricStats {
     MetricStats(MetricStats&&) = default;
     MetricStats& operator=(MetricStats&&) = default;
 
-    void update(std::uint64_t value, std::uint64_t count,
-                bool compute_percentiles = false);
-    void merge_from(const MetricStats& other, std::uint64_t n1,
-                    std::uint64_t n2, std::uint64_t n);
-    double get_stddev(std::uint64_t count) const;
-    double get_skewness(std::uint64_t count) const;
-    double get_kurtosis(std::uint64_t count) const;
+    void update(std::uint64_t value, bool compute_percentiles = false);
+    void merge_from(const MetricStats& other);
+    double get_stddev() const;
+    double get_skewness() const;
+    double get_kurtosis() const;
 };
 
 using CustomMetricsMap =
@@ -146,13 +147,9 @@ struct AggregationMetrics {
     void update_timestamp_clamped(std::uint64_t event_ts, std::uint64_t dur,
                                   std::uint64_t bucket_start,
                                   std::uint64_t bucket_size);
-    void update_custom_metric(const std::string& name, std::uint64_t value,
+    void update_custom_metric(std::string_view name, std::uint64_t value,
                               bool compute_percentiles = false);
 
-    double get_stddev_duration() const;
-    double get_stddev_size() const;
-    double get_custom_stddev(const std::string& name) const;
-
     void merge_from(const AggregationMetrics& other);
 };
 
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h
index 06cf4bbd..f103c32e 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h
@@ -5,6 +5,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -12,6 +13,12 @@
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
+enum class AggMapType : std::uint8_t {
+    EVENT = 0,
+    PROFILE = 1,
+    SYSTEM = 2,
+};
+
 class AssociationTracker;
 
 struct BoundaryTimeRange {
@@ -33,9 +40,11 @@ struct ChunkAggregationOutput {
     std::string file_path;
     bool success = false;
     std::shared_ptr<AssociationTracker> local_tracker;
+    std::uint64_t min_time_bucket = std::numeric_limits<std::uint64_t>::max();
+    std::uint64_t max_time_bucket = 0;
 };
 
-struct EventAggregatorUtilityOutput {
+struct EventAggregatorOutput {
     AggregationMap aggregations;
     AggregationMap profile_aggregations;
     AggregationMap system_aggregations;
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h
new file mode 100644
index 00000000..08d596a3
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h
@@ -0,0 +1,356 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_SERIALIZATION_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_SERIALIZATION_H
+
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h>
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <string_view>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+inline constexpr std::uint16_t AGG_KEY_NUM_SHARDS = 4096;
+
+inline constexpr std::uint8_t METRIC_FMT_COMPACT = 0;
+inline constexpr std::uint8_t METRIC_FMT_FULL = 1;
+inline constexpr std::uint8_t METRIC_FMT_FULL_WITH_SKETCH = 2;
+
+// Intern dictionary: 0xFFFD + varint(id) -> string value
+inline constexpr char AGG_INTERN_DICT_PREFIX[] = "\xFF\xFD";
+inline constexpr std::size_t AGG_INTERN_DICT_PREFIX_LEN = 2;
+
+// Global config: 0xFFFE -> time_interval_us (8) + config_hash (4)
+inline constexpr char AGG_GLOBAL_CONFIG_KEY[] = "\xFF\xFE";
+inline constexpr std::size_t AGG_GLOBAL_CONFIG_LEN = 12;
+
+struct AggGlobalConfig {
+    std::uint64_t time_interval_us = 0;
+    std::uint32_t config_hash = 0;
+};
+
+inline std::string serialize_agg_global_config(const AggGlobalConfig& cfg) {
+    // Layout (big-endian): bytes 0-7 time_interval_us, bytes 8-11 config_hash.
+    std::string encoded(AGG_GLOBAL_CONFIG_LEN, '\0');
+    std::size_t pos = 0;
+    // 64-bit interval, most significant byte first.
+    for (int shift = 56; shift >= 0; shift -= 8) {
+        encoded[pos++] =
+            static_cast<char>((cfg.time_interval_us >> shift) & 0xFF);
+    }
+    // 32-bit hash, most significant byte first.
+    for (int shift = 24; shift >= 0; shift -= 8) {
+        encoded[pos++] = static_cast<char>((cfg.config_hash >> shift) & 0xFF);
+    }
+
+    return encoded;
+}
+
+/// Decode the 12-byte record produced by serialize_agg_global_config:
+/// bytes 0-7 hold time_interval_us and bytes 8-11 hold config_hash, both
+/// most significant byte first. Input shorter than the record yields a
+/// default (all-zero) config; bytes beyond the record are ignored.
+inline AggGlobalConfig deserialize_agg_global_config(std::string_view data) {
+    AggGlobalConfig parsed;
+    if (data.size() < AGG_GLOBAL_CONFIG_LEN) {
+        // Too short to hold a full record.
+        return parsed;
+    }
+
+    // Fold `width` bytes starting at `first` into an integer. Each char
+    // goes through uint8_t first so bytes >= 0x80 are not sign-extended.
+    const auto read_be = [data](std::size_t first, std::size_t width) {
+        std::uint64_t acc = 0;
+        for (std::size_t i = 0; i < width; ++i) {
+            acc = (acc << 8) | static_cast<std::uint8_t>(data[first + i]);
+        }
+        return acc;
+    };
+
+    // Bytes 0-7: the interval.
+    parsed.time_interval_us = read_be(0, 8);
+
+    // Bytes 8-11: the hash. Only four bytes were folded, so the narrowing
+    // cast cannot drop set bits.
+    parsed.config_hash = static_cast<std::uint32_t>(read_be(8, 4));
+
+    return parsed;
+}
+
+// Per-file: 0xFFFF + file_id (4) -> empty value (presence = aggregated)
+inline constexpr char AGG_FILE_KEY_PREFIX[] = "\xFF\xFF";
+inline constexpr std::size_t AGG_FILE_KEY_PREFIX_LEN = 2;
+inline constexpr std::size_t AGG_FILE_KEY_LEN =
+    AGG_FILE_KEY_PREFIX_LEN + sizeof(std::int32_t);
+
+inline std::string make_agg_file_key(std::int32_t file_id) {
+    // Shift the unsigned bit pattern; >> on a negative int is not portable.
+    const auto bits = static_cast<std::uint32_t>(file_id);
+    std::string key{AGG_FILE_KEY_PREFIX[0], AGG_FILE_KEY_PREFIX[1]};
+    key.push_back(static_cast<char>((bits >> 24) & 0xFF));
+    key.push_back(static_cast<char>((bits >> 16) & 0xFF));
+    key.push_back(static_cast<char>((bits >> 8) & 0xFF));
+    key.push_back(static_cast<char>(bits & 0xFF));
+    return key;
+}
+
+void serialize_agg_key_into(std::string& out, std::uint32_t config_hash,
+                            AggMapType map_type, const AggregationKey& key);
+
+void serialize_agg_key_into(
+    std::string& out, std::uint32_t config_hash, AggMapType map_type,
+    std::string_view cat, std::string_view name, std::uint64_t pid,
+    std::uint64_t tid, std::string_view hhash, std::string_view fhash,
+    std::uint64_t time_bucket,
+    const std::vector<std::pair<std::string_view, std::string_view>>*
+        extra_keys = nullptr);
+std::string serialize_agg_key(std::uint32_t config_hash, AggMapType map_type,
+                              const AggregationKey& key);
+
+struct DeserializedAggKey {
+    std::uint32_t config_hash;
+    AggMapType map_type;
+    AggregationKey key;
+};
+DeserializedAggKey deserialize_agg_key(std::string_view data);
+
+/// Key view with resolved strings from the intern table.
+/// Lifetime: valid as long as aggregation_intern() exists (process lifetime).
+struct AggKeyView {
+    AggMapType map_type;
+    std::string_view cat;
+    std::string_view name;
+    std::uint64_t pid;
+    std::uint64_t tid;
+    std::string_view hhash;
+    std::string_view fhash;
+    std::uint64_t time_bucket;
+};
+
+/// Parse aggregation key: reads varint intern IDs and resolves to strings.
+/// Returns false on a short, truncated, or overlong-varint key.
+inline bool parse_agg_key_view(std::string_view data, AggKeyView& out) {
+    if (data.size() < 6) return false;
+
+    const auto* p = reinterpret_cast<const std::uint8_t*>(data.data());
+    const auto* end = p + data.size();
+
+    p += 2;  // shard
+    out.map_type = static_cast<AggMapType>(*p++);
+
+    bool ok = true;  // cleared when a varint is truncated or overlong
+    auto read_varint = [&]() -> std::uint64_t {
+        std::uint64_t v = 0;
+        for (unsigned shift = 0; p < end && shift < 64; shift += 7) {
+            auto b = *p++;
+            v |= static_cast<std::uint64_t>(b & 0x7F) << shift;
+            if ((b & 0x80) == 0) return v;
+        }
+        ok = false;
+        return v;
+    };
+
+    auto& intern = aggregation_intern();
+    auto cat_id = static_cast<std::uint32_t>(read_varint());
+    auto name_id = static_cast<std::uint32_t>(read_varint());
+    out.pid = read_varint();
+    out.tid = read_varint();
+    auto hhash_id = static_cast<std::uint32_t>(read_varint());
+    auto fhash_id = static_cast<std::uint32_t>(read_varint());
+    out.time_bucket = read_varint();
+    if (!ok) return false;  // never resolve IDs decoded from a malformed key
+
+    out.cat = intern.resolve(cat_id);
+    out.name = intern.resolve(name_id);
+    out.hhash = hhash_id ? intern.resolve(hhash_id) : std::string_view{};
+    out.fhash = fhash_id ? intern.resolve(fhash_id) : std::string_view{};
+
+    return true;
+}
+
+void serialize_agg_value_into(std::string& out,
+                              const AggregationMetrics& metrics);
+std::string serialize_agg_value(const AggregationMetrics& metrics);
+AggregationMetrics deserialize_agg_value(std::string_view data);
+
+/// Lightweight metrics view for Arrow export - only the fields needed.
+struct AggMetricsView {
+    std::uint64_t count;
+    std::uint64_t dur_total;
+    std::uint64_t dur_min;
+    std::uint64_t dur_max;
+    std::uint64_t size_total;
+    std::uint64_t size_min;
+    std::uint64_t size_max;
+    std::uint64_t ts;
+    std::uint64_t te;
+};
+
+/// Full metrics view including mean/m2 for stddev computation.
+/// Use for iter_aggregation which needs mean and stddev columns.
+struct AggMetricsFullView {
+    std::uint64_t count;
+    std::uint64_t dur_total;
+    std::uint64_t dur_min;
+    std::uint64_t dur_max;
+    double dur_mean;
+    double dur_m2;  // For Welford's stddev: stddev = sqrt(m2 / count)
+    std::uint64_t size_total;
+    std::uint64_t size_min;
+    std::uint64_t size_max;
+    double size_mean;
+    double size_m2;
+    std::uint64_t ts;
+    std::uint64_t te;
+
+    double dur_stddev() const {
+        return count > 1 ? std::sqrt(dur_m2 / static_cast<double>(count)) : 0.0;
+    }
+    double size_stddev() const {
+        return count > 1 ? std::sqrt(size_m2 / static_cast<double>(count))
+                         : 0.0;
+    }
+};
+
+/// Fast value parser for Arrow export - skips mean/m2/m3/m4/sketch.
+inline bool parse_agg_value_view(std::string_view data, AggMetricsView& out) {
+    const auto* p = reinterpret_cast<const std::uint8_t*>(data.data());
+    const auto* end = p + data.size();
+
+    auto read_varint = [&]() -> std::uint64_t {
+        std::uint64_t v = 0;
+        int shift = 0;
+        while (p < end && shift < 64) {  // shift >= 64 would be undefined
+            std::uint8_t b = *p++;
+            v |= static_cast<std::uint64_t>(b & 0x7F) << shift;
+            if ((b & 0x80) == 0) break;
+            shift += 7;
+        }
+        return v;
+    };
+
+    // Advance n bytes, clamped to `end`, so corrupt data cannot overrun it.
+    auto skip = [&](std::uint64_t n) {
+        p = (n <= static_cast<std::uint64_t>(end - p)) ? p + n : end;
+    };
+    auto read_metric_stats_partial =
+        [&](std::uint64_t& total, std::uint64_t& min, std::uint64_t& max) {
+            auto fmt = read_varint();
+            if (fmt == METRIC_FMT_COMPACT) {
+                auto val = read_varint();
+                total = min = max = val;
+                return;
+            }
+            read_varint();  // skip count
+            total = read_varint();
+            min = read_varint();
+            max = read_varint();
+            skip(8);  // mean
+            skip(8);  // m2
+            if (fmt == METRIC_FMT_FULL_WITH_SKETCH) {
+                skip(read_varint());  // sketch payload
+            }
+        };
+
+    if (p >= end) return false;
+    out.count = read_varint();
+    read_metric_stats_partial(out.dur_total, out.dur_min, out.dur_max);
+    read_metric_stats_partial(out.size_total, out.size_min, out.size_max);
+    out.ts = read_varint();
+    out.te = read_varint();
+
+    return true;
+}
+
+/// Full value parser for iter_aggregation - includes mean/m2 for stddev.
+inline bool parse_agg_value_full_view(std::string_view data,
+                                      AggMetricsFullView& out) {
+    const auto* p = reinterpret_cast<const std::uint8_t*>(data.data());
+    const auto* end = p + data.size();
+
+    auto read_varint = [&]() -> std::uint64_t {
+        std::uint64_t v = 0;
+        int shift = 0;
+        while (p < end && shift < 64) {  // shift >= 64 would be undefined
+            std::uint8_t b = *p++;
+            v |= static_cast<std::uint64_t>(b & 0x7F) << shift;
+            if ((b & 0x80) == 0) break;
+            shift += 7;
+        }
+        return v;
+    };
+
+    auto read_f64 = [&]() -> double {
+        if (end - p < 8) return 0.0;  // compare sizes; p + 8 could overrun
+        std::uint64_t bits = 0;
+        for (int i = 0; i < 8; ++i) {  // least significant byte first
+            bits |= static_cast<std::uint64_t>(*p++) << (i * 8);
+        }
+        double result;
+        std::memcpy(&result, &bits, sizeof(result));
+        return result;
+    };
+
+    auto read_metric_stats_full = [&](std::uint64_t& total, std::uint64_t& min,
+                                      std::uint64_t& max, double& mean,
+                                      double& m2) {
+        auto fmt = read_varint();
+        if (fmt == METRIC_FMT_COMPACT) {
+            auto val = read_varint();
+            total = min = max = val;
+            mean = static_cast<double>(val);
+            m2 = 0.0;
+            return;
+        }
+        read_varint();  // skip count (use outer count)
+        total = read_varint();
+        min = read_varint();
+        max = read_varint();
+        mean = read_f64();
+        m2 = read_f64();
+        if (fmt == METRIC_FMT_FULL_WITH_SKETCH) {
+            auto len = read_varint();  // sketch bytes: skip, clamped to end
+            p = (len <= static_cast<std::uint64_t>(end - p)) ? p + len : end;
+        }
+    };
+
+    if (p >= end) return false;
+
+    out.count = read_varint();
+    read_metric_stats_full(out.dur_total, out.dur_min, out.dur_max,
+                           out.dur_mean, out.dur_m2);
+    read_metric_stats_full(out.size_total, out.size_min, out.size_max,
+                           out.size_mean, out.size_m2);
+    out.ts = read_varint();
+    out.te = read_varint();
+
+    return true;
+}
+
+/// Load intern dictionary from RocksDB into aggregation_intern().
+void load_intern_dictionary(dftracer::utils::rocksdb::RocksDatabase& db);
+
+/// Flush any new intern entries to RocksDB as 0xFFFD keys.
+void flush_intern_dictionary(
+    dftracer::utils::rocksdb::RocksDatabase& db,
+    dftracer::utils::rocksdb::RocksDatabase::Batch& batch);
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+namespace dftracer::utils::utilities::indexer {
+class IndexBatchSink;
+}
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+/// Sink-backed overload: flushes new intern entries via
+/// `IndexBatchSink::insert_aggregation_put`. Used by the distributed SST
+/// pipeline where the visitor writes to an SST instead of a live DB.
+void flush_intern_dictionary(
+    dftracer::utils::utilities::indexer::IndexBatchSink& sink);
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_SERIALIZATION_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h
new file mode 100644
index 00000000..bdc9b8e2
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h
@@ -0,0 +1,130 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_VISITOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_VISITOR_H
+
+#include <dftracer/utils/core/common/transparent_string_hash.h>
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h>
+#include <dftracer/utils/utilities/composites/dft/dft_event_visitor.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+class AggregationVisitor : public DftEventVisitor {
+   public:
+    /// Legacy mode: flush directly to a live RocksDatabase via Merge/Put
+    /// during parse. FLUSH_THRESHOLD commits the visitor's own batch to
+    /// bound memory. Used by `aggregator_utility.cpp`,
+    /// `dftracer_aggregator.cpp`, `dftracer_organize.cpp`.
+    AggregationVisitor(std::shared_ptr<rocksdb::RocksDatabase> db,
+                       std::uint32_t config_hash, AggregationConfig config,
+                       std::string file_path);
+
+    /// Distributed mode: flush to a per-visitor SstWriterContext rooted at
+    /// `staging_dir`. FLUSH_THRESHOLD emits partial SSTs (mixed Put+Merge)
+    /// so the in-memory map never exceeds the threshold. At
+    /// `on_file_complete`, the writer context is committed and its
+    /// Artifacts are embedded in the ChunkAggregationOutput so the worker
+    /// / coordinator can forward them to the main `SstArtifactRegistry`.
+    ///
+    /// `staging_dir` is typically the same node-local dir the rest of the
+    /// SST pipeline uses. `batch_id_prefix` is joined with a per-file
+    /// suffix to form a unique SstWriterContext root (so concurrent
+    /// per-file visitors never collide).
+    AggregationVisitor(std::string staging_dir, std::string batch_id_prefix,
+                       std::uint32_t config_hash, AggregationConfig config,
+                       std::string file_path);
+
+    void begin(std::size_t num_checkpoints) override;
+    void on_checkpoint(std::size_t checkpoint_idx) override;
+    void on_event(const EventRecord& record) override;
+    coro::CoroTask<void> on_file_complete() override;
+    bool needs_args_map() const override { return true; }
+
+    ChunkAggregationOutput take_output();
+    void flush_to_batch(rocksdb::RocksDatabase::Batch& batch);
+
+    const std::unordered_set<std::string>& observed_extra_keys() const {
+        return observed_extra_keys_;
+    }
+    const std::unordered_set<std::string>& observed_custom_metrics() const {
+        return observed_custom_metrics_;
+    }
+
+    /// Distributed mode only: one or more per-flush SST artifact sets
+    /// produced by this visitor after `on_file_complete`. Each flush
+    /// emits its own SST(s) because `SstFileWriter` requires strictly
+    /// ascending keys and merge operands for the same key across flushes
+    /// would violate that invariant. Empty in legacy mode.
+    std::vector<indexer::IndexDatabaseSstWriterContext::Artifacts>&
+    aggregation_artifacts() noexcept {
+        return sst_artifacts_;
+    }
+
+   private:
+    void seal_local_buffer();
+    void handle_system_event(const EventRecord& record);
+
+    // Legacy (RocksDatabase-backed) mode.
+    std::shared_ptr<rocksdb::RocksDatabase> db_;
+    std::vector<rocksdb::RocksDatabase::Batch> pending_batches_;
+
+    // Distributed (SST-backed) mode. The visitor rotates sst_sink_ per
+    // flush to keep each SST's key space strictly ascending (merge
+    // operands for the same key across flushes must live in separate
+    // SSTs). `sst_staging_dir_` and `sst_batch_prefix_` persist across
+    // rotations so the next SstWriterContext can be constructed at
+    // flush time.
+    std::unique_ptr<indexer::IndexDatabaseSstWriterContext> sst_sink_;
+    std::string sst_staging_dir_;
+    std::string sst_batch_prefix_;
+    std::size_t sst_flush_counter_ = 0;
+    std::vector<indexer::IndexDatabaseSstWriterContext::Artifacts>
+        sst_artifacts_;
+
+    std::uint32_t config_hash_;
+    AggregationConfig config_;
+    std::string file_path_;
+
+    std::shared_ptr<AssociationTracker> tracker_;
+    std::size_t events_processed_ = 0;
+
+    static constexpr std::size_t FLUSH_THRESHOLD = 65536;
+    std::unordered_map<std::string, AggregationMetrics, TransparentStringHash,
+                       TransparentStringEqual>
+        local_buffer_;
+    std::string key_buf_;
+    std::string val_buf_;
+
+    AggregationMetrics* last_entry_ = nullptr;
+    std::string_view last_key_;
+
+    // System metrics buffer (keyed by hhash + time_bucket)
+    std::unordered_map<std::string, SystemAggregationMetrics,
+                       TransparentStringHash, TransparentStringEqual>
+        system_buffer_;
+    std::string system_key_buf_;
+    std::string system_val_buf_;
+
+    std::unordered_set<std::string> observed_extra_keys_;
+    std::unordered_set<std::string> observed_custom_metrics_;
+    std::unordered_set<std::string> observed_system_metrics_;
+
+    std::uint64_t min_time_bucket_ = std::numeric_limits<std::uint64_t>::max();
+    std::uint64_t max_time_bucket_ = 0;
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATION_VISITOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h
index ad4b7e2b..bdd9fc5f 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h
@@ -6,7 +6,7 @@
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
-using AggregatorSummaryInput = EventAggregatorUtilityOutput;
+using AggregatorSummaryInput = EventAggregatorOutput;
 using AggregatorSummaryOutput = void;
 
 class AggregatorSummaryUtility
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h
new file mode 100644
index 00000000..9ef45743
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h
@@ -0,0 +1,164 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATOR_TYPES_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATOR_TYPES_H
+
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/utilities/common/query/query.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_key.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+#endif
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+/// Context for converting aggregation data to dfanalyzer-compatible Arrow
+/// format. Pointer members are non-owning and default to null.
+struct DfanalyzerContext {
+    /// Hash tables for resolving fhash/hhash to file_name/host_name.
+    const std::unordered_map<std::string, std::string>* file_hashes = nullptr;
+    const std::unordered_map<std::string, std::string>* host_hashes = nullptr;
+
+    const common::query::Query* query_filter = nullptr;
+
+    /// Time origin (minimum time_bucket) for normalization.
+    std::uint64_t time_origin = 0;
+
+    /// Time resolution (microseconds per output unit, default 1e6 = seconds).
+    double time_resolution = 1e6;
+
+    /// Time granularity in seconds (bucket width for time_range computation).
+    double time_granularity = 1.0;
+};
+
+enum class AggregationBatchType { EVENT, PROFILE, SYSTEM };  // batch row kind
+
+struct CountConfidenceInterval {
+    double lower = 0.0;  // lower end of the interval
+    double upper = 0.0;  // upper end of the interval
+};
+
+struct AggregationEntry {
+    AggregationKey key;
+    AggregationMetrics metrics;
+    bool is_approximated = false;
+    CountConfidenceInterval count_ci;
+
+    AggregationEntry() = default;
+    AggregationEntry(AggregationKey k, AggregationMetrics m)
+        : key(std::move(k)), metrics(std::move(m)) {}
+
+    /// Create a ValueMap from the key and metrics for query evaluation.
+    /// Includes cat, name, pid, tid, time_bucket, hhash/fhash (only when
+    /// non-empty), extra_keys, and metrics (count, dur_*, size_*, ts, te).
+    common::query::ValueMap to_value_map() const {
+        common::query::ValueMap fields;
+        // Key fields
+        fields["cat"] = std::string(key.cat());
+        fields["name"] = std::string(key.name());
+        fields["pid"] = static_cast<std::uint64_t>(key.pid);
+        fields["tid"] = static_cast<std::uint64_t>(key.tid);
+        if (!key.hhash().empty()) {
+            fields["hhash"] = std::string(key.hhash());
+        }
+        if (!key.fhash().empty()) {
+            fields["fhash"] = std::string(key.fhash());
+        }
+        fields["time_bucket"] = key.time_bucket;
+        // Include extra_keys (args fields used for grouping)
+        if (key.extra_keys) {
+            for (const auto& [key_id, value_id] : *key.extra_keys) {
+                auto key_str =
+                    std::string(aggregation_intern().resolve(key_id));
+                auto value_str =
+                    std::string(aggregation_intern().resolve(value_id));
+                fields[std::move(key_str)] = std::move(value_str);
+            }
+        }
+        // Aggregation metrics
+        fields["count"] = metrics.count;
+        fields["dur_total"] = metrics.duration.total;
+        fields["dur_min"] = metrics.duration.min;
+        fields["dur_max"] = metrics.duration.max;
+        fields["dur_mean"] = metrics.duration.mean;
+        fields["size_total"] = metrics.size.total;
+        fields["size_min"] = metrics.size.min;
+        fields["size_max"] = metrics.size.max;
+        fields["size_mean"] = metrics.size.mean;
+        fields["ts"] = metrics.ts;
+        fields["te"] = metrics.te;
+        // Custom metrics: each stat is exposed as <name>_total/_min/_max/_mean
+        if (metrics.custom_metrics) {
+            for (const auto& [name, stats] : *metrics.custom_metrics) {
+                fields[name + "_total"] = stats.total;
+                fields[name + "_min"] = stats.min;
+                fields[name + "_max"] = stats.max;
+                fields[name + "_mean"] = stats.mean;
+            }
+        }
+        return fields;
+    }
+
+    /// Check if this entry matches a query (evaluated on to_value_map()).
+    bool matches(const common::query::Query& query) const {
+        return query.evaluate(to_value_map());
+    }
+};
+
+struct AggregationBatch {
+    std::vector<AggregationEntry> entries;
+    AggregationBatchType batch_type = AggregationBatchType::EVENT;
+    std::size_t total_events_processed = 0;
+    std::size_t total_files_processed = 0;
+    std::size_t total_bytes_processed = 0;
+    bool has_approximated_entries = false;
+
+    // Optional column overrides: when set, to_arrow() uses these instead of
+    // discovering columns per batch, so all batches in an IPC file share one
+    // schema. Non-owning; filter() copies the pointers, not the pointees.
+    const std::vector<std::uint32_t>* global_extra_key_ids = nullptr;
+    const std::vector<std::string>* global_custom_metric_names = nullptr;
+
+    /// New batch with only entries matching `query`; totals are copied as-is.
+    AggregationBatch filter(const common::query::Query& query) const {
+        AggregationBatch filtered;
+        filtered.batch_type = batch_type;
+        filtered.total_events_processed = total_events_processed;
+        filtered.total_files_processed = total_files_processed;
+        filtered.total_bytes_processed = total_bytes_processed;
+        filtered.has_approximated_entries = has_approximated_entries;
+        filtered.global_extra_key_ids = global_extra_key_ids;
+        filtered.global_custom_metric_names = global_custom_metric_names;
+
+        for (const auto& entry : entries) {
+            if (entry.matches(query)) {
+                filtered.entries.push_back(entry);
+            }
+        }
+        return filtered;
+    }
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    common::arrow::ArrowExportResult to_arrow() const;
+
+    /// Convert to dfanalyzer-compatible Arrow format.
+    /// Outputs columns matching dfanalyzer schema:
+    /// - Events/Profiles: cat, func_name, pid, tid, file_hash, host_hash,
+    ///   file_name, host_name, proc_name, io_cat, acc_pat, count, time, size,
+    ///   time_min, time_max, size_min, size_max, time_range, time_start,
+    ///   time_end
+    /// - System: host_hash, time_range, sys_cpu_*, sys_mem_*
+    common::arrow::ArrowExportResult to_dfanalyzer_arrow(
+        const DfanalyzerContext& ctx) const;
+#endif
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATOR_TYPES_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h
index 6c9068cf..f68b8ed2 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h
@@ -2,21 +2,14 @@
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATOR_UTILITY_H
 
 #include <dftracer/utils/core/utilities/streaming_utility.h>
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
 #include <dftracer/utils/utilities/common/query/query.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_key.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h>
-
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
-#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
-#endif
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h>
 
 #include <cstddef>
-#include <cstdint>
 #include <optional>
 #include <string>
-#include <utility>
-#include <vector>
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
@@ -27,8 +20,7 @@ struct AggregatorInput {
     std::size_t checkpoint_size = 32 * 1024 * 1024;
     std::string index_dir;
     bool force_rebuild = false;
-    std::size_t chunk_size_mb = 64;
-    std::size_t batch_size_mb = 4;
+    std::size_t parallelism = 0;  // 0 = use all available threads
     std::size_t event_batch_size = 10000;
 
     AggregatorInput& with_directory(const std::string& dir);
@@ -36,27 +28,13 @@ struct AggregatorInput {
     AggregatorInput& with_checkpoint_size(std::size_t sz);
     AggregatorInput& with_index_dir(const std::string& dir);
     AggregatorInput& with_force_rebuild(bool force);
-    AggregatorInput& with_chunk_size_mb(std::size_t mb);
-    AggregatorInput& with_batch_size_mb(std::size_t mb);
+    AggregatorInput& with_parallelism(std::size_t n);
     AggregatorInput& with_event_batch_size(std::size_t sz);
 };
 
-enum class AggregationBatchType { EVENT, PROFILE, SYSTEM };
-
-struct AggregationBatch {
-    std::vector<std::pair<AggregationKey, AggregationMetrics>> entries;
-    AggregationBatchType batch_type = AggregationBatchType::EVENT;
-    std::size_t total_events_processed = 0;
-    std::size_t total_files_processed = 0;
-    std::size_t total_bytes_processed = 0;
-
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
-    common::arrow::ArrowExportResult to_arrow() const;
-#endif
-};
-
 class AggregatorUtility
-    : public StreamingUtility<AggregatorInput, AggregationBatch> {
+    : public StreamingUtility<AggregatorInput, AggregationBatch,
+                              tags::NeedsContext> {
    public:
     coro::AsyncGenerator<AggregationBatch> process(
         const AggregatorInput& input) override;
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregators.h b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregators.h
index 9e4a43c7..cbf14485 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/aggregators.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/aggregators.h
@@ -12,7 +12,7 @@
 #include <dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/chunk_mapper_utility.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h>
 
 #endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_AGGREGATORS_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h
index 40b69386..7b957024 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h
@@ -15,13 +15,13 @@
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
 struct AssociationResolverInput {
-    EventAggregatorUtilityOutput aggregations;
+    EventAggregatorOutput aggregations;
     std::vector<std::shared_ptr<AssociationTracker>> trackers;
     AggregationConfig config;
 };
 
 struct AssociationResolverOutput {
-    EventAggregatorUtilityOutput aggregations;
+    EventAggregatorOutput aggregations;
     std::unordered_set<std::uint64_t> root_pids;
     std::uint64_t trace_duration = 0;
     BoundaryTimeRangesMap boundary_ranges;
@@ -36,10 +36,9 @@ class AssociationResolverUtility
         const AssociationResolverInput& input) override;
 
    private:
-    void compute_trace_metadata(
-        const AssociationTracker& tracker,
-        const EventAggregatorUtilityOutput& aggregations,
-        AssociationResolverOutput& output);
+    void compute_trace_metadata(const AssociationTracker& tracker,
+                                const EventAggregatorOutput& aggregations,
+                                AssociationResolverOutput& output);
 };
 
 }  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h b/include/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h
index ddee89ae..00b4444f 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h
@@ -1,20 +1,18 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_ASSOCIATION_TRACKER_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_ASSOCIATION_TRACKER_H
 
-#include <dftracer/utils/utilities/common/json/json_value.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
 
 #include <cstdint>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
-// Import JsonValue from common json namespace
-using dftracer::utils::utilities::common::json::JsonValue;
-
 struct BoundaryInterval {
     std::string name;
     std::string value;
@@ -34,7 +32,9 @@ class AssociationTracker {
    public:
     AssociationTracker() = default;
 
-    void extract_from_event(const JsonValue& json, const JsonValue& args,
+    void extract_from_event(std::string_view name, std::uint64_t pid,
+                            std::uint64_t ts, std::uint64_t dur,
+                            const ArgsMap& args,
                             const AggregationConfig& config);
     void finalize();
 
@@ -51,6 +51,9 @@ class AssociationTracker {
 
     std::unordered_set<std::uint64_t> get_root_pids() const;
     void merge(const AssociationTracker& other);
+
+    std::string serialize() const;
+    static AssociationTracker deserialize(std::string_view data);
 };
 
 }  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h
index 0691c45e..b6eb202e 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h
@@ -30,14 +30,14 @@ using dftracer::utils::utilities::composites::dft::DFTracerEvent;
 struct ChunkAggregatorInput {
     std::string file_path;
     std::string index_path;
-    std::size_t start_byte;
-    std::size_t end_byte;
-    std::size_t start_line;
-    std::size_t end_line;
+    std::size_t start_byte = 0;
+    std::size_t end_byte = 0;
+    std::size_t start_line = 0;
+    std::size_t end_line = 0;
     AggregationConfig config;
     std::optional<common::query::Query> query;
-    std::size_t checkpoint_size;
-    int chunk_index;
+    std::size_t checkpoint_size = 0;
+    int chunk_index = 0;
 
     std::size_t batch_size = 4 * 1024 * 1024;
 
@@ -87,17 +87,6 @@ struct ChunkAggregatorInput {
 class ChunkAggregatorUtility
     : public utilities::Utility<ChunkAggregatorInput, ChunkAggregationOutput,
                                 utilities::tags::Parallelizable> {
-   private:
-    std::uint64_t compute_time_bucket(std::uint64_t timestamp,
-                                      std::uint64_t duration,
-                                      const AggregationConfig& config) const;
-
-    AggregationKey build_key(const DFTracerEvent& ev,
-                             const AggregationConfig& config) const;
-
-    void update_entry(const DFTracerEvent& ev, const AggregationConfig& config,
-                      AggregationMap& aggregations, const AggregationKey& key);
-
    public:
     ChunkAggregatorUtility() = default;
 
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h b/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h
new file mode 100644
index 00000000..077de51a
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h
@@ -0,0 +1,132 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_H
+
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h>
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace dftracer::utils::rocksdb {
+class RocksDatabase;
+}
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+class EventAggregator {
+   public:
+    EventAggregator();
+
+    EventAggregator(std::shared_ptr<rocksdb::RocksDatabase> db,
+                    std::uint32_t config_hash);
+
+    void merge_chunk(ChunkAggregationOutput&& chunk_output);
+
+    EventAggregatorOutput finalize();
+
+    using ScanCallback = std::function<bool(AggMapType, const AggregationKey&,
+                                            AggregationMetrics&)>;
+    std::size_t scan(ScanCallback callback) const;
+    std::size_t scan_shard_range(std::uint16_t shard_begin,
+                                 std::uint16_t shard_end,
+                                 ScanCallback callback) const;
+
+    /// Type-erased raw scan. Use the templated overload below for zero-
+    /// allocation calls.
+    using RawScanCallbackFn = bool (*)(void* ctx, std::string_view key_bytes,
+                                       std::string_view value_bytes);
+    std::size_t scan_shard_range_raw_fn(std::uint16_t shard_begin,
+                                        std::uint16_t shard_end,
+                                        RawScanCallbackFn fn, void* ctx) const;
+
+    /// Zero-allocation wrapper: passes a callable object `(sv, sv) -> bool`
+    /// (const or non-const) to the raw scan via a captureless `+[]` adapter.
+    template <typename F>
+    std::size_t scan_shard_range_raw(std::uint16_t shard_begin,
+                                     std::uint16_t shard_end,
+                                     F&& callback) const {
+        auto adapter =
+            +[](void* ctx, std::string_view k, std::string_view v) -> bool {
+            return (*static_cast<std::remove_reference_t<F>*>(ctx))(k, v);
+        };
+        return scan_shard_range_raw_fn(
+            shard_begin, shard_end, adapter,
+            const_cast<void*>(static_cast<const void*>(&callback)));
+    }
+
+    /// Move trackers out without materializing the full aggregation map.
+    std::vector<std::shared_ptr<AssociationTracker>> take_trackers();
+
+    /// Merge fresh trackers with any persisted tracker from the DB,
+    /// persist the result, and return the merged tracker.
+    std::unique_ptr<AssociationTracker> build_global_tracker();
+
+    struct ObservedColumns {
+        std::vector<std::uint32_t> extra_key_ids;
+        std::vector<std::string> custom_metric_names;
+    };
+    ObservedColumns observed_columns();
+    void add_observed_extra_key(const std::string& key);
+    void add_observed_custom_metric(const std::string& name);
+
+    std::size_t total_events() const { return total_events_.load(); }
+    std::size_t total_bytes() const { return total_bytes_.load(); }
+    std::size_t total_files() const { return unique_files_.size(); }
+
+    void update_time_bounds(std::uint64_t time_bucket);
+    std::uint64_t min_time_bucket() const;
+    std::uint64_t max_time_bucket() const;
+
+    struct TimeBoundsResult {
+        std::uint64_t min_time_bucket = 0;
+        std::uint64_t max_time_bucket = 0;
+        bool valid = false;
+    };
+    TimeBoundsResult query_time_bounds() const;
+
+    bool is_rocksdb_mode() const { return rocksdb_mode_; }
+    std::shared_ptr<rocksdb::RocksDatabase> db() const { return db_; }
+    std::uint32_t config_hash() const { return config_hash_; }
+
+    static std::shared_ptr<rocksdb::RocksDatabase> open_with_merge_operator(
+        const std::string& index_path);
+
+    /// Read-only variant for multi-process concurrent scan (e.g. MPI ranks
+    /// covering disjoint shard-prefix ranges of a shared unified index).
+    /// Multi-process RocksDB writes are forbidden; read-only opens do not
+    /// hold the exclusive LOCK, so N ranks can open the same DB at once.
+    static std::shared_ptr<rocksdb::RocksDatabase>
+    open_read_only_with_merge_operator(const std::string& index_path);
+
+   private:
+    void merge_chunk_memory(ChunkAggregationOutput&& chunk_output);
+    void merge_chunk_rocksdb(ChunkAggregationOutput&& chunk_output);
+
+    bool rocksdb_mode_ = false;
+
+    // In-memory state
+    EventAggregatorOutput state_;
+    std::unordered_set<std::string> unique_files_;
+
+    // RocksDB state
+    std::shared_ptr<rocksdb::RocksDatabase> db_;
+    std::uint32_t config_hash_ = 0;
+    std::atomic<std::size_t> total_events_{0};
+    std::atomic<std::size_t> total_bytes_{0};
+    std::vector<std::shared_ptr<AssociationTracker>> trackers_;
+
+    std::set<std::uint32_t> observed_extra_key_ids_;
+    std::set<std::string> observed_custom_metric_names_;
+
+    std::atomic<std::uint64_t> min_time_bucket_{UINT64_MAX};
+    std::atomic<std::uint64_t> max_time_bucket_{0};
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h
deleted file mode 100644
index ab14e8d9..00000000
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_UTILITY_H
-#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_UTILITY_H
-
-#include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/core/utilities/utility.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h>
-
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-namespace dftracer::utils::utilities::composites::dft::aggregators {
-
-struct EventAggregatorUtilityInput {
-    std::vector<ChunkAggregationOutput> chunk_outputs;
-};
-
-class EventAggregatorUtility
-    : public utilities::Utility<EventAggregatorUtilityInput,
-                                EventAggregatorUtilityOutput> {
-   public:
-    coro::CoroTask<EventAggregatorUtilityOutput> process(
-        const EventAggregatorUtilityInput& input) override;
-
-    void merge_chunk(ChunkAggregationOutput&& chunk_output);
-    EventAggregatorUtilityOutput finalize();
-
-   private:
-    EventAggregatorUtilityOutput state_;
-    std::unordered_set<std::string> unique_files_;
-};
-
-}  // namespace dftracer::utils::utilities::composites::dft::aggregators
-
-#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_EVENT_AGGREGATOR_UTILITY_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h b/include/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h
index d56ce0e5..a95d9bc3 100644
--- a/include/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h
@@ -1,50 +1,68 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_PERFETTO_TRACE_WRITER_UTILITY_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_PERFETTO_TRACE_WRITER_UTILITY_H
 
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
+#include <dftracer/utils/core/utilities/tags/parallelizable.h>
 #include <dftracer/utils/core/utilities/utility.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_key.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h>
 
+#include <atomic>
 #include <cstdint>
+#include <memory>
 #include <string>
-#include <string_view>
+#include <unordered_set>
 #include <vector>
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
+class EventAggregator;
+
 enum class PerfettoEventFormat { COUNTER, ASYNC, REGULAR };
 
 struct PerfettoTraceWriterInput {
     std::string output_path;
-    AssociationResolverOutput resolver_output;
+    const EventAggregator* aggregator = nullptr;
+    const AssociationTracker* tracker = nullptr;
+    const AggregationConfig* agg_config = nullptr;
+    std::unique_ptr<AssociationTracker> owned_tracker;
+    std::unordered_set<std::uint64_t> root_pids;
+    std::uint64_t trace_duration = 0;
+    BoundaryTimeRangesMap boundary_ranges;
     bool compute_statistics = true;
     bool compute_percentiles = false;
     std::vector<double> percentiles;
     bool compress = false;
     int compression_level = 6;
     PerfettoEventFormat format = PerfettoEventFormat::COUNTER;
+    /// Workers add their per-shard key count here if non-null.
+    std::atomic<std::size_t>* keys_written = nullptr;
+    /// Concatenate shards into `output_path` and unlink them on SHARDED
+    /// layouts (typically NFS). Callers that read shards directly leave false.
+    bool merge_on_sharded = false;
+    /// Total shard-prefix range (half-open) this invocation is responsible
+    /// for. Defaults cover the whole key space. MPI drivers set a disjoint
+    /// range per rank so N ranks collectively cover `[0, AGG_KEY_NUM_SHARDS)`
+    /// without overlap. Local coroutine workers within a single process
+    /// further subdivide this range.
+    std::uint16_t shard_begin = 0;
+    std::uint16_t shard_end = 0;  // 0 means "use AGG_KEY_NUM_SHARDS"
+    /// Emit the JSON array prologue (`[\n` + trace_metadata + root_process
+    /// markers) to this invocation's output. MPI drivers set this on rank 0
+    /// only so concatenated rank outputs produce exactly one array open.
+    bool emit_header = true;
+    /// Emit the JSON array epilogue (`]`). MPI drivers set this on the last
+    /// rank only.
+    bool emit_footer = true;
 };
 
 using PerfettoTraceWriterOutput = bool;
 
 class PerfettoTraceWriterUtility
-    : public utilities::Utility<PerfettoTraceWriterInput,
-                                PerfettoTraceWriterOutput> {
-   private:
-    std::uint64_t generate_synthetic_tid(const AggregationKey& key) const;
-    void append_json_string(std::string& buffer, std::string_view str) const;
-    void append_double(std::string& buffer, double value) const;
-    void append_metric_stats(std::string& buffer, const MetricStats& stats,
-                             std::uint64_t count, bool compute_statistics,
-                             bool compute_percentiles,
-                             const std::vector<double>& percentiles) const;
-    void append_event_args(std::string& buffer, const AggregationKey& key,
-                           const AggregationMetrics& metrics,
-                           bool compute_statistics, bool compute_percentiles,
-                           const std::vector<double>& percentiles,
-                           std::uint64_t real_tid = 0) const;
-
+    : public utilities::Utility<
+          PerfettoTraceWriterInput, PerfettoTraceWriterOutput,
+          utilities::tags::Parallelizable, utilities::tags::NeedsContext> {
    public:
     coro::CoroTask<bool> process(
         const PerfettoTraceWriterInput& input) override;
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h
new file mode 100644
index 00000000..32d406b1
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h
@@ -0,0 +1,206 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_H
+
+#include <dftracer/utils/core/common/transparent_string_hash.h>
+#include <dftracer/utils/utilities/common/statistics/ddsketch.h>
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+using common::statistics::DDSketch;
+
+struct FloatMetricStats {  // running summary of one double-valued metric
+    std::uint64_t count = 0;  // number of samples folded in
+    double total = 0.0;  // sum of all samples
+    double min = std::numeric_limits<double>::max();  // sentinel until first sample
+    double max = std::numeric_limits<double>::lowest();  // sentinel until first sample
+    double mean = 0.0;  // running mean, maintained by update()/merge_from()
+    double m2 = 0.0;  // running sum of squared deviations from the mean
+    std::unique_ptr<DDSketch> sketch;  // null until percentiles are requested or copied/merged in
+    double sketch_accuracy_ = 0.01;  // argument passed to DDSketch when it is created lazily
+
+    explicit FloatMetricStats(double relative_accuracy = 0.01)
+        : sketch_accuracy_(relative_accuracy) {}
+
+    FloatMetricStats(const FloatMetricStats& other)  // deep copy: clones the sketch
+        : count(other.count),
+          total(other.total),
+          min(other.min),
+          max(other.max),
+          mean(other.mean),
+          m2(other.m2),
+          sketch(other.sketch ? std::make_unique<DDSketch>(*other.sketch)
+                              : nullptr),
+          sketch_accuracy_(other.sketch_accuracy_) {}
+
+    FloatMetricStats& operator=(const FloatMetricStats& other) {  // deep copy
+        if (this != &other) {
+            count = other.count;
+            total = other.total;
+            min = other.min;
+            max = other.max;
+            mean = other.mean;
+            m2 = other.m2;
+            sketch = other.sketch ? std::make_unique<DDSketch>(*other.sketch)
+                                  : nullptr;
+            sketch_accuracy_ = other.sketch_accuracy_;
+        }
+        return *this;
+    }
+
+    FloatMetricStats(FloatMetricStats&&) = default;
+    FloatMetricStats& operator=(FloatMetricStats&&) = default;
+
+    void update(double value, bool compute_percentiles = false) {  // fold in one sample
+        count++;
+        total += value;
+        if (value < min) min = value;
+        if (value > max) max = value;
+
+        // Welford's online mean/variance
+        double delta = value - mean;  // deviation from the old mean
+        mean += delta / static_cast<double>(count);
+        double delta2 = value - mean;  // deviation from the updated mean
+        m2 += delta * delta2;
+
+        if (compute_percentiles) {  // the sketch is only paid for when requested
+            if (!sketch) {
+                sketch = std::make_unique<DDSketch>(sketch_accuracy_);
+            }
+            sketch->add(value);
+        }
+    }
+
+    void merge_from(const FloatMetricStats& other) {  // fold another summary into this one
+        if (other.count == 0) return;  // nothing to merge
+        if (count == 0) {  // this side is empty: take a full copy (incl. sketch_accuracy_)
+            *this = other;
+            return;
+        }
+
+        std::uint64_t new_count = count + other.count;
+        double delta = other.mean - mean;  // gap between the two partial means
+        double new_mean = mean + delta * static_cast<double>(other.count) /
+                                     static_cast<double>(new_count);
+        double new_m2 = m2 + other.m2 +  // combined M2 = both M2s + between-group term
+                        delta * delta * static_cast<double>(count) *
+                            static_cast<double>(other.count) /
+                            static_cast<double>(new_count);
+
+        count = new_count;
+        total += other.total;
+        if (other.min < min) min = other.min;
+        if (other.max > max) max = other.max;
+        mean = new_mean;
+        m2 = new_m2;
+
+        if (other.sketch) {  // a sketch present only on this side is left untouched
+            if (!sketch) {
+                sketch = std::make_unique<DDSketch>(*other.sketch);  // clone theirs
+            } else {
+                sketch->merge(*other.sketch);  // delegates to DDSketch::merge
+            }
+        }
+    }
+
+    double get_stddev() const {  // sample standard deviation (divisor count - 1)
+        if (count < 2) return 0.0;  // undefined for fewer than two samples; report 0
+        return std::sqrt(m2 / static_cast<double>(count - 1));
+    }
+};
+
+using FloatMetricsMap =  // metric name -> FloatMetricStats
+    std::unordered_map<std::string, FloatMetricStats, TransparentStringHash,
+                       TransparentStringEqual>;  // queried with std::string_view keys by update_metric()
+
+struct SystemAggregationMetrics {  // per-bucket collection of named FloatMetricStats
+    std::uint64_t count = 0;  // not touched by update_metric()/update_timestamp(); summed by merge_from()
+
+    // Timestamp bounds for this bucket
+    std::uint64_t ts = std::numeric_limits<std::uint64_t>::max();  // smallest timestamp seen
+    std::uint64_t te = 0;  // largest timestamp seen
+
+    // Named system metrics (aggregated as mean per bucket)
+    std::unique_ptr<FloatMetricsMap> metrics;  // null until update_metric() or merge_from() allocates it
+
+    double sketch_accuracy = 0.01;  // handed to every FloatMetricStats created by this object
+
+    explicit SystemAggregationMetrics(double relative_accuracy = 0.01)
+        : sketch_accuracy(relative_accuracy) {}
+
+    SystemAggregationMetrics(const SystemAggregationMetrics& other)  // deep copy of the map
+        : count(other.count),
+          ts(other.ts),
+          te(other.te),
+          metrics(other.metrics
+                      ? std::make_unique<FloatMetricsMap>(*other.metrics)
+                      : nullptr),
+          sketch_accuracy(other.sketch_accuracy) {}
+
+    SystemAggregationMetrics& operator=(const SystemAggregationMetrics& other) {  // deep copy
+        if (this != &other) {
+            count = other.count;
+            ts = other.ts;
+            te = other.te;
+            metrics = other.metrics
+                          ? std::make_unique<FloatMetricsMap>(*other.metrics)
+                          : nullptr;
+            sketch_accuracy = other.sketch_accuracy;
+        }
+        return *this;
+    }
+
+    SystemAggregationMetrics(SystemAggregationMetrics&&) = default;
+    SystemAggregationMetrics& operator=(SystemAggregationMetrics&&) = default;
+
+    void update_metric(std::string_view name, double value,  // fold one sample into metric `name`
+                       bool compute_percentiles = false) {
+        if (!metrics) {  // lazily allocate the map
+            metrics = std::make_unique<FloatMetricsMap>();
+        }
+        auto it = metrics->find(name);  // string_view lookup; no std::string is built on a hit
+        if (it == metrics->end()) {  // first sample for this name: create its entry
+            it = metrics
+                     ->emplace(std::string(name),
+                               FloatMetricStats(sketch_accuracy))
+                     .first;
+        }
+        it->second.update(value, compute_percentiles);
+    }
+
+    void update_timestamp(std::uint64_t event_ts) {  // widen [ts, te] to include event_ts
+        if (event_ts < ts) ts = event_ts;
+        if (event_ts > te) te = event_ts;
+    }
+
+    void merge_from(const SystemAggregationMetrics& other) {  // fold another bucket into this one
+        count += other.count;
+        if (other.ts < ts) ts = other.ts;
+        if (other.te > te) te = other.te;
+
+        if (other.metrics) {  // nothing more to do when the other side has no metrics
+            if (!metrics) {
+                metrics = std::make_unique<FloatMetricsMap>();
+            }
+            for (const auto& [name, stats] : *other.metrics) {
+                auto it = metrics->find(name);
+                if (it == metrics->end()) {  // unseen name: start from an empty entry
+                    it = metrics
+                             ->emplace(name, FloatMetricStats(sketch_accuracy))
+                             .first;
+                }
+                it->second.merge_from(stats);  // an empty entry becomes a copy of `stats`
+            }
+        }
+    }
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h
new file mode 100644
index 00000000..1fee3c47
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h
@@ -0,0 +1,26 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_MERGE_OPERATOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_MERGE_OPERATOR_H
+
+#include <rocksdb/merge_operator.h>
+
+#include <string>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+class SystemMetricsMergeOperator : public ::rocksdb::MergeOperator {  // NOTE(review): presumably merges serialized SystemAggregationMetrics values; confirm in the .cpp
+   public:
+    bool FullMergeV2(const MergeOperationInput& merge_in,  // declared only; body is out of line
+                     MergeOperationOutput* merge_out) const override;
+
+    bool PartialMerge(const ::rocksdb::Slice& key,  // declared only; body is out of line
+                      const ::rocksdb::Slice& left_operand,
+                      const ::rocksdb::Slice& right_operand,
+                      std::string* new_value,
+                      ::rocksdb::Logger* logger) const override;
+
+    const char* Name() const override { return "SystemMetricsMergeOperator"; }  // fixed operator name
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_MERGE_OPERATOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h
new file mode 100644
index 00000000..db5a16f8
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h
@@ -0,0 +1,37 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_SERIALIZATION_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_SERIALIZATION_H
+
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h>
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+// System metrics key: [hhash:var][time_bucket:varint]
+// Simpler key than regular aggregation since system metrics are host-level
+
+struct SystemMetricKey {  // decoded form of a system-metrics key
+    std::string hhash;  // NOTE(review): presumably the host hash; confirm against the writer
+    std::uint64_t time_bucket = 0;  // bucket component of the key
+};
+
+void serialize_system_key_into(std::string& out, std::string_view hhash,  // NOTE(review): append vs overwrite of `out` is not visible here
+                               std::uint64_t time_bucket);
+std::string serialize_system_key(std::string_view hhash,  // returns the encoded key by value
+                                 std::uint64_t time_bucket);
+
+struct DeserializedSystemKey {  // result wrapper for deserialize_system_key()
+    SystemMetricKey key;
+};
+DeserializedSystemKey deserialize_system_key(std::string_view data);  // NOTE(review): behaviour on malformed input is not visible here
+
+void serialize_system_value_into(std::string& out,  // NOTE(review): append vs overwrite of `out` is not visible here
+                                 const SystemAggregationMetrics& metrics);
+std::string serialize_system_value(const SystemAggregationMetrics& metrics);  // returns the encoded value by value
+SystemAggregationMetrics deserialize_system_value(std::string_view data);  // NOTE(review): behaviour on malformed input is not visible here
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_AGGREGATORS_SYSTEM_METRICS_SERIALIZATION_H
diff --git a/include/dftracer/utils/utilities/composites/dft/args_map.h b/include/dftracer/utils/utilities/composites/dft/args_map.h
new file mode 100644
index 00000000..7bd92dfc
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/args_map.h
@@ -0,0 +1,216 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_ARGS_MAP_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_ARGS_MAP_H
+
+#include <dftracer/utils/core/common/string_intern.h>
+#include <dftracer/utils/core/common/transparent_string_hash.h>
+
+#include <cstdint>
+#include <limits>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <variant>
+
+namespace dftracer::utils::utilities::composites::dft {
+
+using ArgsValue = std::variant<std::monostate, std::string, std::int64_t,
+                               std::uint64_t, double, bool>;  // monostate = null / absent
+
+class ArgsValueProxy {  // non-owning, read-only view of one ArgsValue (may be empty)
+    const ArgsValue* val_;  // borrowed; the pointee must outlive this proxy
+
+   public:
+    explicit ArgsValueProxy(const ArgsValue* v = nullptr) : val_(v) {}
+
+    bool exists() const {  // true when a value is attached and it is not monostate
+        return val_ != nullptr &&
+               !std::holds_alternative<std::monostate>(*val_);
+    }
+    explicit operator bool() const { return exists(); }
+    bool is_null() const { return !exists(); }
+    bool is_string() const {
+        return val_ && std::holds_alternative<std::string>(*val_);
+    }
+    bool is_uint() const {  // strictly the std::uint64_t alternative
+        return val_ && std::holds_alternative<std::uint64_t>(*val_);
+    }
+    bool is_int() const {  // strictly the std::int64_t alternative
+        return val_ && std::holds_alternative<std::int64_t>(*val_);
+    }
+    bool is_number() const {  // int64, uint64 or double; bool does not count
+        return val_ && (std::holds_alternative<std::int64_t>(*val_) ||
+                        std::holds_alternative<std::uint64_t>(*val_) ||
+                        std::holds_alternative<double>(*val_));
+    }
+    bool is_bool() const { return val_ && std::holds_alternative<bool>(*val_); }
+    bool is_object() const { return false; }  // ArgsValue has no object alternative
+    bool is_array() const { return false; }  // ArgsValue has no array alternative
+
+    template <typename T>
+    T get(const T& default_val = T{}) const {  // value as T, or default_val when absent / not representable
+        if (!val_ || std::holds_alternative<std::monostate>(*val_))
+            return default_val;
+
+        if constexpr (std::is_same_v<T, bool>) {
+            if (auto* p = std::get_if<bool>(val_)) return *p;
+            return default_val;
+        } else if constexpr (std::is_same_v<T, std::string>) {
+            if (auto* p = std::get_if<std::string>(val_)) return *p;
+            return default_val;
+        } else if constexpr (std::is_same_v<T, std::string_view>) {
+            if (auto* p = std::get_if<std::string>(val_))
+                return std::string_view(*p);  // view into the stored string
+            return default_val;
+        } else if constexpr (std::is_same_v<T, const char*>) {
+            if (auto* p = std::get_if<std::string>(val_)) return p->c_str();  // points into the stored string
+            return default_val;
+        } else if constexpr (std::is_same_v<T, std::uint64_t>) {
+            if (auto* p = std::get_if<std::uint64_t>(val_)) return *p;
+            if (auto* p = std::get_if<std::int64_t>(val_)) {
+                if (*p >= 0) return static_cast<std::uint64_t>(*p);  // negatives fall through to default
+            }
+            if (auto* p = std::get_if<double>(val_)) {
+                // 2^64 itself is out of range, so the bound is exclusive.
+                if (*p >= 0 && *p < 0x1p64)
+                    return static_cast<std::uint64_t>(*p);
+            }
+            return default_val;
+        } else if constexpr (std::is_same_v<T, std::int64_t>) {
+            if (auto* p = std::get_if<std::int64_t>(val_)) return *p;
+            if (auto* p = std::get_if<std::uint64_t>(val_)) {
+                if (*p <=
+                    static_cast<uint64_t>(std::numeric_limits<int64_t>::max()))
+                    return static_cast<std::int64_t>(*p);
+            }
+            if (auto* p = std::get_if<double>(val_))
+                if (*p >= -0x1p63 && *p < 0x1p63)  // [-2^63, 2^63); NaN fails both tests
+                    return static_cast<std::int64_t>(*p);
+            return default_val;
+        } else if constexpr (std::is_same_v<T, double>) {
+            if (auto* p = std::get_if<double>(val_)) return *p;
+            if (auto* p = std::get_if<std::int64_t>(val_))
+                return static_cast<double>(*p);
+            if (auto* p = std::get_if<std::uint64_t>(val_))
+                return static_cast<double>(*p);
+            return default_val;
+        } else if constexpr (std::is_same_v<T, float>) {
+            return static_cast<float>(
+                get<double>(static_cast<double>(default_val)));
+        } else if constexpr (std::is_integral_v<T> && std::is_unsigned_v<T>) {
+            return static_cast<T>(  // narrower unsigned types go through uint64 and are narrowed
+                get<std::uint64_t>(static_cast<std::uint64_t>(default_val)));
+        } else if constexpr (std::is_integral_v<T> && std::is_signed_v<T>) {
+            return static_cast<T>(  // narrower signed types go through int64 and are narrowed
+                get<std::int64_t>(static_cast<std::int64_t>(default_val)));
+        } else {
+            static_assert(!sizeof(T),
+                          "Unsupported type for ArgsValueProxy::get<T>()");
+        }
+    }
+
+    template <typename T>
+    std::optional<T> get_optional() const {  // like get<T>() but nullopt instead of a default
+        if (!val_ || std::holds_alternative<std::monostate>(*val_))
+            return std::nullopt;
+
+        if constexpr (std::is_same_v<T, std::string>) {
+            if (auto* p = std::get_if<std::string>(val_)) return *p;
+            return std::nullopt;
+        } else if constexpr (std::is_same_v<T, std::string_view>) {
+            if (auto* p = std::get_if<std::string>(val_))
+                return std::string_view(*p);
+            return std::nullopt;
+        } else if constexpr (std::is_same_v<T, std::uint64_t>) {  // unlike get<T>(), doubles are not converted here
+            if (auto* p = std::get_if<std::uint64_t>(val_)) return *p;
+            if (auto* p = std::get_if<std::int64_t>(val_)) {
+                if (*p >= 0) return static_cast<std::uint64_t>(*p);
+            }
+            return std::nullopt;
+        } else if constexpr (std::is_same_v<T, std::int64_t>) {  // unlike get<T>(), doubles are not converted here
+            if (auto* p = std::get_if<std::int64_t>(val_)) return *p;
+            if (auto* p = std::get_if<std::uint64_t>(val_)) {
+                if (*p <=
+                    static_cast<uint64_t>(std::numeric_limits<int64_t>::max()))
+                    return static_cast<std::int64_t>(*p);
+            }
+            return std::nullopt;
+        } else if constexpr (std::is_same_v<T, double>) {
+            if (auto* p = std::get_if<double>(val_)) return *p;
+            if (auto* p = std::get_if<std::int64_t>(val_))
+                return static_cast<double>(*p);
+            if (auto* p = std::get_if<std::uint64_t>(val_))
+                return static_cast<double>(*p);
+            return std::nullopt;
+        } else if constexpr (std::is_same_v<T, bool>) {
+            if (auto* p = std::get_if<bool>(val_)) return *p;
+            return std::nullopt;
+        } else {
+            static_assert(
+                !sizeof(T),
+                "Unsupported type for ArgsValueProxy::get_optional<T>()");
+        }
+    }
+};
+
+class ArgsMap {  // string-keyed map of ArgsValue with an explicit validity flag
+    using Map = dftracer::utils::StringViewMap<ArgsValue>;
+    Map data_;
+    bool valid_ = false;  // while false, every lookup and for_each_member() sees nothing
+
+    static dftracer::utils::StringIntern& key_intern() {  // one interner shared by all ArgsMap instances
+        static dftracer::utils::StringIntern instance;  // NOTE(review): thread-safety of intern() is not visible here; confirm before inserting concurrently
+        return instance;
+    }
+
+   public:
+    ArgsMap() = default;
+
+    bool exists() const { return valid_; }
+    explicit operator bool() const { return valid_; }
+
+    void set_valid(bool v) { valid_ = v; }  // insert() never sets this; callers must
+
+    void insert(std::string_view key, ArgsValue value) {  // does not change valid_
+        auto interned = std::string(key_intern().intern(key));  // interned key is copied into an owning std::string
+        data_.emplace(std::move(interned), std::move(value));  // NOTE(review): presumably keeps the first value on a duplicate key; confirm for StringViewMap
+    }
+
+    void clear() {  // drops all entries and marks the map invalid again
+        data_.clear();
+        valid_ = false;
+    }
+
+    ArgsValueProxy operator[](std::string_view key) const {  // read-only lookup; never inserts
+        if (!valid_) return ArgsValueProxy{};  // invalid map behaves as empty
+        auto it = data_.find(key);
+        return it != data_.end() ? ArgsValueProxy{&it->second}  // proxy points into data_
+                                 : ArgsValueProxy{};
+    }
+
+    ArgsValueProxy operator[](const char* key) const {  // forwards to the string_view overload
+        return (*this)[std::string_view(key)];
+    }
+
+    ArgsValueProxy operator[](const std::string& key) const {  // forwards to the string_view overload
+        return (*this)[std::string_view(key)];
+    }
+
+    ArgsValueProxy at(const char* key) const { return (*this)[key]; }  // same as operator[]; does not throw on a miss
+    ArgsValueProxy at(const std::string& key) const { return (*this)[key]; }
+    ArgsValueProxy at(std::string_view key) const { return (*this)[key]; }
+
+    template <typename Fn>
+    void for_each_member(Fn&& fn) const {  // calls fn(key_view, proxy) for each entry
+        if (!valid_) return;  // invalid map: fn is never called
+        for (const auto& [k, v] : data_) {
+            fn(std::string_view(k), ArgsValueProxy{&v});
+        }
+    }
+
+    const Map& raw() const { return data_; }  // direct access; ignores valid_
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_ARGS_MAP_H
diff --git a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h
index 87acb4fa..be5feb3c 100644
--- a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h
+++ b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_config.h
@@ -1,6 +1,8 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_COMPARATOR_COMPARISON_CONFIG_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_COMPARATOR_COMPARISON_CONFIG_H
 
+#include <simdjson.h>
+
 #include <cstddef>
 #include <optional>
 #include <string>
@@ -87,8 +89,10 @@ struct ComparisonConfig {
     std::size_t executor_threads = 0;
     /// Checkpoint size for index building (0 = default).
     std::size_t checkpoint_size = 0;
-    /// Directory for `.dftindex` stores.
-    std::string index_dir;
+    /// Directory for baseline `.dftindex` store (empty = co-located).
+    std::string baseline_index_dir;
+    /// Directory for variant `.dftindex` store (empty = co-located).
+    std::string variant_index_dir;
     /// Force rebuild of existing indexes.
     bool force_rebuild = false;
 
@@ -109,7 +113,7 @@ struct ComparisonConfig {
     void resolve();
 
    private:
-    static bool parse_node(void* yyjson_val_ptr, ComparisonNode& node,
+    static bool parse_node(simdjson::dom::element val, ComparisonNode& node,
                            std::string& error);
     void resolve_node(ComparisonNode& node, const std::string& parent_query,
                       const std::vector<std::string>& parent_metrics,
diff --git a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_result.h b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_result.h
index 446d077d..a9d3c480 100644
--- a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_result.h
+++ b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_result.h
@@ -1,10 +1,10 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_COMPARATOR_COMPARISON_RESULT_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_COMPARATOR_COMPARISON_RESULT_H
 
+#include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_key.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_map.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.h>
-
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 #include <dftracer/utils/utilities/common/arrow/arrow_export.h>
 #endif
diff --git a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h
index 61c55f83..dbef2dc1 100644
--- a/include/dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h
@@ -10,15 +10,15 @@
 
 namespace dftracer::utils::utilities::composites::dft::comparator {
 
-using aggregators::EventAggregatorUtilityOutput;
+using aggregators::EventAggregatorOutput;
 
 /// Paired baseline/variant aggregation outputs for a single comparison
 /// node.
 struct ComparisonVisitorPair {
     /// Aggregation output for the baseline run.
-    EventAggregatorUtilityOutput baseline;
+    EventAggregatorOutput baseline;
     /// Aggregation output for the variant run.
-    EventAggregatorUtilityOutput variant;
+    EventAggregatorOutput variant;
     /// Resolved config node for this visitor.
     ComparisonNode node;
 };
diff --git a/include/dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h b/include/dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h
new file mode 100644
index 00000000..88cf08c0
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h
@@ -0,0 +1,326 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_DISPATCHER_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_DISPATCHER_H
+
+#include <dftracer/utils/core/coro/coro.h>
+#include <dftracer/utils/core/coro/when_all.h>
+#include <dftracer/utils/core/pipeline/executor.h>
+#include <dftracer/utils/utilities/common/json/json.h>
+#include <dftracer/utils/utilities/composites/dft/dft_event_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/event.h>
+#include <dftracer/utils/utilities/composites/dft/parse_inflated.h>
+#include <dftracer/utils/utilities/indexer/index_visitor.h>
+#include <simdjson.h>
+
+#include <cstddef>
+#include <cstring>
+#include <functional>
+#include <memory>
+#include <string_view>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft {
+
+class DftEventDispatcher : public indexer::IndexVisitor {  // parses incoming bytes/lines and fans events out to DftEventVisitors
+   public:
+    using VisitorList = std::vector<std::reference_wrapper<DftEventVisitor>>;  // non-owning; visitors must outlive the dispatcher
+
+    static constexpr std::size_t FLUSH_THRESHOLD = 4 * 1024 * 1024;  // 4MB; on_chunk() flushes once the accumulator reaches this
+
+    explicit DftEventDispatcher(VisitorList visitors, bool force_serial = false)
+        : visitors_(std::move(visitors)), force_serial_(force_serial) {
+        for (auto& v : visitors_) {  // one visitor asking for the args map enables it for all
+            if (v.get().needs_args_map()) {
+                needs_args_map_ = true;
+                break;
+            }
+        }
+    }
+
+    void begin(std::size_t num_checkpoints) override {  // forwarded to every visitor
+        for (auto& v : visitors_) {
+            v.get().begin(num_checkpoints);
+        }
+    }
+
+    coro::CoroTask<void> on_checkpoint(std::size_t checkpoint_idx) override {
+        co_await flush_batch(pending_checkpoint_idx_);  // drain bytes buffered under the last on_chunk() index first
+        line_number_ = 0;  // line numbers restart at each checkpoint
+        for (auto& v : visitors_) {
+            v.get().on_checkpoint(checkpoint_idx);
+        }
+    }
+
+    coro::CoroTask<void> on_chunk(const char* data, std::size_t len,  // buffers raw bytes; parsing happens in flush_batch()
+                                  std::size_t checkpoint_idx) override {
+        if (len == 0) co_return;
+        ensure_accum();
+        accum_->append(data, len);
+        pending_checkpoint_idx_ = checkpoint_idx;  // remembered for later flush()/on_checkpoint()
+        if (accum_->size() >= FLUSH_THRESHOLD) {
+            co_await flush_batch(checkpoint_idx);
+        }
+    }
+
+    coro::CoroTask<void> flush() override {  // parse whatever is still buffered
+        co_await flush_batch(pending_checkpoint_idx_);
+    }
+
+    bool wants_drain() const noexcept override {  // true if any visitor wants a drain
+        for (const auto& v : visitors_) {
+            if (v.get().wants_drain()) return true;
+        }
+        return false;
+    }
+
+    coro::CoroTask<void> drain_pending() override {  // drains, in order, only the visitors that ask for it
+        for (auto& v : visitors_) {
+            if (v.get().wants_drain()) {
+                co_await v.get().drain_pending();
+            }
+        }
+    }
+
+    void on_line(std::string_view line, indexer::SharedLineBuffer buffer,  // per-line path: parse one line and dispatch it
+                 std::size_t checkpoint_idx) override {
+        std::size_t ln = line_number_++;  // counted even when the line is skipped below
+
+        if (line.empty()) return;
+
+        auto result = parser_.parse(line.data(), line.size());
+        if (result.error()) return;  // unparsable lines are dropped silently
+
+        auto root = result.value_unsafe();
+        if (!root.is_object()) return;  // only JSON objects are treated as events
+
+        common::json::JsonValue json(root);
+        DFTracerEvent ev;
+        simdjson::dom::element args_dom{};
+        bool has_args = false;
+        bool ok = false;
+        if (needs_args_map_) {  // parse() path, then look up "args" separately
+            ok = DFTracerEvent::parse(json, ev);
+            if (ok) {
+                auto args_r = root["args"];
+                if (!args_r.error() && args_r.value_unsafe().is_object()) {
+                    args_dom = args_r.value_unsafe();
+                    has_args = true;
+                }
+            }
+        } else {  // parse_scalars() reports args_dom/has_args through its out-parameters
+            ok = DFTracerEvent::parse_scalars(root, ev, args_dom, has_args);
+        }
+        if (ok) {
+            EventRecord record{ev, json,     line,    buffer, checkpoint_idx,  // holds references to the locals ev/json
+                               ln, args_dom, has_args};
+            for (auto& v : visitors_) {
+                v.get().on_event(record);
+            }
+        }
+    }
+
+    void finalize(indexer::IndexDatabaseWriterContext& writer,  // intentionally a no-op
+                  int file_id) override {
+        (void)writer;
+        (void)file_id;
+    }
+
+   private:
+    void ensure_accum() {  // lazily allocate the accumulator, seeded with any carried-over partial document
+        if (!accum_) {
+            accum_ = std::make_shared<std::string>();
+            accum_->reserve(FLUSH_THRESHOLD + FLUSH_THRESHOLD / 4);  // headroom beyond the flush threshold
+            if (!partial_doc_.empty()) {
+                accum_->append(partial_doc_.data(), partial_doc_.size());
+                partial_doc_.clear();
+            }
+        }
+    }
+
+    coro::CoroTask<void> flush_batch(std::size_t checkpoint_idx) {  // parse the accumulated bytes and dispatch the events
+        if (!accum_ || accum_->empty()) co_return;
+
+        std::size_t total = accum_->size();  // payload length, excluding the padding added below
+        strip_array_delimiters(accum_->data(), total);  // delegates to the namespace-level helper
+        accum_->resize(total + simdjson::SIMDJSON_PADDING, '\0');  // zero padding after the payload for simdjson
+        auto chunk_buffer = std::move(accum_);  // ownership leaves accum_; the next on_chunk() allocates a new one
+        accum_ = nullptr;
+
+        std::size_t partial = 0;  // unparsed trailing bytes reported by the flush
+        Executor* exec = Executor::current();  // may be null; then preferred_slice_count() returns 1
+        std::size_t num_slices = preferred_slice_count(exec, total);
+
+        if (!force_serial_ && num_slices >= 2 &&  // num_slices >= 2 implies exec != nullptr
+            all_visitors_parallelizable()) {
+            partial = co_await parallel_flush(*exec, chunk_buffer, total,
+                                              checkpoint_idx, num_slices);
+        } else {
+            partial = serial_flush(chunk_buffer, total, checkpoint_idx);
+        }
+
+        if (partial > 0 && partial <= total) {  // keep the incomplete tail for the next batch
+            partial_doc_.assign(chunk_buffer->data() + total - partial,
+                                chunk_buffer->data() + total);
+        }
+    }
+
+    bool all_visitors_parallelizable() {  // true when every visitor returns a non-null parallel slice
+        if (visitor_parallel_clones_cached_) return parallelizable_cached_;
+        visitor_parallel_clones_cached_ = true;  // probe once, reuse the answer afterwards
+        for (auto& v : visitors_) {
+            if (!v.get().create_parallel_slice()) {  // the probe slice is discarded immediately
+                parallelizable_cached_ = false;
+                return false;
+            }
+        }
+        parallelizable_cached_ = true;
+        return true;
+    }
+
+    std::size_t preferred_slice_count(Executor* exec, std::size_t total) const {  // 1 means "do not parallelize"
+        if (!exec) return 1;
+        const std::size_t MIN_SLICE_BYTES = 256 * 1024;  // a slice must be worth at least this many bytes
+        std::size_t cap = exec->get_num_threads();
+        if (cap < 2) return 1;
+        if (cap > 8) cap = 8;  // never more than 8 slices per batch
+        std::size_t by_size = total / MIN_SLICE_BYTES;
+        if (by_size < 2) return 1;
+        return std::min(by_size, cap);  // NOTE(review): std::min needs <algorithm>, which this header does not include directly
+    }
+
+    std::size_t serial_flush(std::shared_ptr<std::string> chunk_buffer,  // single-threaded parse via parse_buffer()
+                             std::size_t total, std::size_t checkpoint_idx) {
+        return parse_buffer(parser_, chunk_buffer, total, checkpoint_idx,  // its result is used as the partial-tail size
+                            line_number_, needs_args_map_,
+                            [this](const EventRecord& record) {
+                                for (auto& v : visitors_)
+                                    v.get().on_event(record);
+                            });
+    }
+
+    coro::CoroTask<std::size_t> parallel_flush(  // split at newlines, parse slices concurrently, merge in slice order
+        Executor& /*exec*/, std::shared_ptr<std::string> chunk_buffer,
+        std::size_t total, std::size_t checkpoint_idx, std::size_t num_slices) {
+        std::vector<std::pair<std::size_t, std::size_t>> ranges;  // [start, end) byte ranges
+        ranges.reserve(num_slices);
+        const char* data = chunk_buffer->data();
+        std::size_t cursor = 0;  // start of the next slice
+        std::size_t partial_tail = 0;
+        for (std::size_t i = 0; i < num_slices; ++i) {
+            std::size_t target =
+                (i + 1 == num_slices) ? total : (i + 1) * (total / num_slices);
+            std::size_t end = target;
+            if (i + 1 == num_slices) {
+                end = total;  // the last slice always runs to the end of the payload
+            } else {
+                while (end < total && data[end] != '\n') ++end;  // extend to the next newline
+                if (end < total) ++end;  // include the newline in this slice
+            }
+            if (end <= cursor) continue;  // an earlier slice already covered this target
+            ranges.emplace_back(cursor, end);
+            cursor = end;
+        }
+        if (cursor < total) {  // NOTE(review): looks unreachable, the last iteration always advances cursor to total
+            partial_tail = total - cursor;
+        }
+
+        std::vector<std::vector<std::unique_ptr<DftEventVisitor>>> slice_vis(  // slice_vis[s][i]: visitor i's clone for slice s
+            ranges.size());
+        for (std::size_t s = 0; s < ranges.size(); ++s) {
+            slice_vis[s].reserve(visitors_.size());
+            for (auto& v : visitors_) {
+                slice_vis[s].push_back(v.get().create_parallel_slice());
+            }
+        }
+
+        std::vector<coro::CoroTask<std::size_t>> slice_tasks;
+        slice_tasks.reserve(ranges.size());
+        for (std::size_t s = 0; s < ranges.size(); ++s) {
+            slice_tasks.push_back(
+                make_slice_task(chunk_buffer, ranges[s].first, ranges[s].second,
+                                checkpoint_idx, slice_vis[s]));  // slice_vis[s] is passed by reference
+        }
+        auto slice_truncs = co_await coro::when_all(std::move(slice_tasks));  // one truncated-byte count per slice
+
+        if (!slice_truncs.empty()) {  // only the last slice's truncation is carried over
+            std::size_t last_trunc = slice_truncs.back();
+            std::size_t last_len = ranges.back().second - ranges.back().first;
+            if (last_trunc > 0 && last_trunc <= last_len) {
+                partial_tail = last_trunc;
+            }
+        }
+
+        std::vector<std::size_t> running_offsets(visitors_.size(), 0);  // NOTE(review): restarts at 0 on every call; line_number_ is neither read nor advanced here
+        for (std::size_t s = 0; s < ranges.size(); ++s) {  // merge in slice order
+            for (std::size_t i = 0; i < visitors_.size(); ++i) {
+                if (!slice_vis[s][i]) continue;
+                slice_vis[s][i]->set_line_offset(running_offsets[i]);
+                running_offsets[i] += slice_vis[s][i]->parallel_event_count();
+                visitors_[i].get().merge_parallel_slice(*slice_vis[s][i]);
+            }
+        }
+
+        co_return partial_tail;
+    }
+
+    static coro::CoroTask<std::size_t> make_slice_task(  // parse one byte range with its own parser; returns truncated bytes
+        std::shared_ptr<std::string> chunk_buffer, std::size_t start,
+        std::size_t end, std::size_t checkpoint_idx,
+        std::vector<std::unique_ptr<DftEventVisitor>>& slice_vis) {
+        std::size_t truncated = 0;
+        try {
+            simdjson::dom::parser local_parser;  // per-task parser; the member parser_ is not used here
+            simdjson::dom::document_stream stream;
+            auto err = local_parser
+                           .parse_many(chunk_buffer->data() + start,
+                                       end - start, end - start)  // batch size = whole slice
+                           .get(stream);
+            if (!err) {
+                std::size_t slice_ln = 0;  // slice-local index, counts dispatched events only
+                for (auto it = stream.begin(); it != stream.end(); ++it) {
+                    if ((*it).error()) continue;  // skip documents that fail to parse
+                    auto root = (*it).value_unsafe();
+                    if (!root.is_object()) continue;
+                    common::json::JsonValue json(root);
+                    DFTracerEvent ev;
+                    simdjson::dom::element args_dom{};
+                    bool has_args = false;
+                    if (DFTracerEvent::parse_scalars(root, ev, args_dom,  // NOTE(review): always parse_scalars here, unlike on_line(), which uses parse() when needs_args_map_ is set; confirm this is intended
+                                                     has_args)) {
+                        std::string_view src = it.source();
+                        EventRecord record{
+                            ev,           json,           src,
+                            chunk_buffer, checkpoint_idx, slice_ln,
+                            args_dom,     has_args};
+                        ++slice_ln;
+                        for (auto& v : slice_vis) {
+                            if (v) v->on_event(record);
+                        }
+                    }
+                }
+                truncated = stream.truncated_bytes();
+            }
+        } catch (...) {  // NOTE(review): swallows every exception; the rest of the slice is dropped and 0 is returned
+        }
+        co_return truncated;
+    }
+
+    static void strip_array_delimiters(char* buf, std::size_t len) {  // thin wrapper over the namespace-level helper
+        ::dftracer::utils::utilities::composites::dft::strip_array_delimiters(
+            buf, len);
+    }
+
+    VisitorList visitors_;  // fan-out targets, in dispatch order
+    bool needs_args_map_ = false;  // set in the constructor if any visitor needs the args map
+    bool force_serial_ = false;  // when true, flush_batch() never takes the parallel path
+    simdjson::dom::parser parser_;  // used by on_line() and serial_flush()
+    std::size_t line_number_ = 0;  // reset on each checkpoint
+    std::shared_ptr<std::string> accum_;  // bytes buffered by on_chunk(); null between batches
+    std::vector<char> partial_doc_;  // incomplete trailing document carried into the next batch
+    std::size_t pending_checkpoint_idx_ = 0;  // checkpoint index of the most recent on_chunk()
+    bool visitor_parallel_clones_cached_ = false;  // whether the probe below has run
+    bool parallelizable_cached_ = false;  // cached result of all_visitors_parallelizable()
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_DISPATCHER_H
diff --git a/include/dftracer/utils/utilities/composites/dft/dft_event_visitor.h b/include/dftracer/utils/utilities/composites/dft/dft_event_visitor.h
new file mode 100644
index 00000000..ddc2d2f9
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/dft_event_visitor.h
@@ -0,0 +1,70 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_VISITOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_VISITOR_H
+
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/common/json/json_value.h>
+#include <dftracer/utils/utilities/composites/dft/event.h>
+#include <dftracer/utils/utilities/indexer/index_visitor.h>
+#include <simdjson.h>
+
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace dftracer::utils::utilities::composites::dft {
+
+struct EventRecord {  // per-event bundle passed to DftEventVisitor::on_event
+    const DFTracerEvent& ev;  // borrowed; must outlive the record
+    const common::json::JsonValue& json;  // borrowed, same lifetime rule
+    std::string_view line;  // non-owning view of the event's raw text
+    indexer::SharedLineBuffer line_buffer;  // keeps line data alive
+    std::size_t checkpoint_idx;
+    std::size_t line_number;  // slice-local when parsed in a parallel slice
+    simdjson::dom::element args_dom{};  // meaningful only if has_args
+    bool has_args{false};  // set when the event's "args" is an object
+};
+
+class DftEventVisitor {
+   public:
+    virtual ~DftEventVisitor() = default;
+    // Receives the checkpoint count; presumably called once up front (confirm).
+    virtual void begin(std::size_t num_checkpoints) = 0;
+    // Receives a checkpoint index; ordering vs on_event: TODO confirm.
+    virtual void on_checkpoint(std::size_t checkpoint_idx) = 0;
+    // `record` holds references and views: copy anything kept past the call.
+    virtual void on_event(const EventRecord& record) = 0;
+
+    // Hint that the visitor has accumulated work that should be drained.
+    // Cheap (no allocation/co_await): the dispatcher polls this after every
+    // on_event and only co_awaits drain_pending() when true.
+    virtual bool wants_drain() const noexcept { return false; }
+
+    // Drain any accumulated work via async operations (e.g. channel send).
+    // Suspends the calling coroutine when downstream is full, providing
+    // real backpressure without blocking an executor thread.
+    virtual coro::CoroTask<void> drain_pending() { co_return; }
+    // Default is an empty coroutine; presumably awaited once per file (confirm).
+    virtual coro::CoroTask<void> on_file_complete() { co_return; }
+    // Default nullptr; presumably opts this visitor out of slicing (confirm).
+    virtual std::unique_ptr<DftEventVisitor> create_parallel_slice() const {
+        return nullptr;
+    }
+    virtual void merge_parallel_slice(DftEventVisitor& /*slice*/) {}
+
+    /// In parallel-flush mode, slices receive events with slice-local line
+    /// numbers (0..N-1). The dispatcher calls this on the slice before
+    /// merge_parallel_slice with the cumulative successful-event count of
+    /// prior slices, so the slice can renumber its stored line indices.
+    virtual void set_line_offset(std::size_t /*offset*/) {}
+
+    /// Successful events processed by this slice. Used by the dispatcher to
+    /// propagate line offsets across slices in byte order.
+    virtual std::size_t parallel_event_count() const { return 0; }
+
+    virtual bool needs_args_map() const { return false; }
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_DFT_EVENT_VISITOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/event.h b/include/dftracer/utils/utilities/composites/dft/event.h
index 86ec78b6..b9754c45 100644
--- a/include/dftracer/utils/utilities/composites/dft/event.h
+++ b/include/dftracer/utils/utilities/composites/dft/event.h
@@ -1,52 +1,30 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_EVENT_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_EVENT_H
 
-/**
- * @file event.h
- * @brief Common DFTracer event representation and parser.
- *
- * Provides DFTracerEvent, a lightweight struct capturing the core fields
- * of a Chrome Tracing / DFTracer event.  All string fields are string_view
- * into the yyjson document (valid only while the doc lives).
- */
-
 #include <dftracer/utils/utilities/common/json/json_value.h>
+#include <dftracer/utils/utilities/common/json/parser.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
 
 #include <cstdint>
+#include <string>
 #include <string_view>
 
 namespace dftracer::utils::utilities::composites::dft {
 
 using common::json::JsonValue;
 
-/**
- * Parsed DFTracer trace event.
- *
- * All string_view fields point into the yyjson document memory and are
- * only valid while the document is alive.
- *
- * Typical usage:
- *   DFTracerEvent ev;
- *   if (DFTracerEvent::parse(json, ev)) {
- *       if (ev.is_complete()) { ... }
- *   }
- */
 struct DFTracerEvent {
-    // Core Chrome Tracing fields
     std::uint64_t id = 0;
     std::string_view name;
     std::string_view cat;
-    std::string_view ph;  // "X" (complete), "M" (metadata), "B"/"E", etc.
+    std::string_view ph;
     std::uint64_t pid = 0;
     std::uint64_t tid = 0;
     std::uint64_t ts = 0;
     std::uint64_t dur = 0;
 
-    // Args subtree, lazy, just the yyjson_val* pointer.
-    // Access via args["field"] or pass to evaluator/bloom.
-    JsonValue args;
+    ArgsMap args;
 
-    // Convenience predicates
     bool is_metadata() const { return ph == "M"; }
     bool is_counter() const { return ph == "C"; }
     bool is_profile() const { return ph == "C" && cat != "sys"; }
@@ -55,11 +33,6 @@ struct DFTracerEvent {
     bool is_complete() const { return ph == "X"; }
     bool has_id() const { return id != 0; }
 
-    /**
-     * Parse from a JsonValue (wrapping a yyjson_val* object root).
-     * Returns true if the JSON is a valid object with at least "ph".
-     * Fields that are absent get their default (0 / empty).
-     */
     static bool parse(const JsonValue& json, DFTracerEvent& out) {
         auto ph_val = json["ph"];
         if (!ph_val.exists()) return false;
@@ -88,18 +61,172 @@ struct DFTracerEvent {
         if (dur_val.exists()) out.dur = dur_val.get<std::uint64_t>();
 
         auto args_val = json["args"];
-        if (args_val.exists()) out.args = args_val;
+        if (args_val.exists() && args_val.is_object()) {
+            out.args.set_valid(true);
+            args_val.for_each_member([&](std::string_view k, JsonValue v) {
+                if (v.is_string()) {
+                    out.args.insert(k, std::string(v.get<std::string_view>()));
+                } else if (v.is_uint()) {
+                    out.args.insert(k, v.get<std::uint64_t>());
+                } else if (v.is_int()) {
+                    out.args.insert(k, v.get<std::int64_t>());
+                } else if (v.is_number()) {
+                    out.args.insert(k, v.get<double>());
+                } else if (v.is_bool()) {
+                    out.args.insert(k, v.get<bool>());
+                }
+            });
+        }
 
         return true;
     }
 
-    /**
-     * Parse from a raw yyjson_val* root (for call sites that don't use
-     * JsonValue).  Same semantics as the JsonValue overload.
-     */
-    static bool parse(yyjson_val* root, DFTracerEvent& out) {
-        if (!root || !yyjson_is_obj(root)) return false;
-        return parse(JsonValue(root), out);
+    /// One pass over a DOM object: copies each known scalar into `out` when it
+    /// has the expected JSON type and hands an object-valued "args" back via
+    /// out_args/out_has_args, undecoded. Returns true iff "ph" is a string.
+    static bool parse_scalars(simdjson::dom::element root, DFTracerEvent& out,
+                              simdjson::dom::element& out_args,
+                              bool& out_has_args) {
+        out_has_args = false;
+        if (!root.is_object()) return false;
+        const auto put_u64 = [](simdjson::dom::element v, std::uint64_t& dst) {
+            if (v.is_uint64()) dst = v.get_uint64().value_unsafe();
+        };
+        const auto put_str = [](simdjson::dom::element v,
+                                std::string_view& dst) {
+            if (!v.is_string()) return false;
+            dst = v.get_string().value_unsafe();
+            return true;
+        };
+        bool saw_ph = false;
+        for (auto member : root.get_object()) {
+            const std::string_view name = member.key;
+            const simdjson::dom::element value = member.value;
+            // Switching on key length keeps the string compares per key few.
+            switch (name.size()) {
+                case 2:
+                    if (name == "ph") {
+                        if (put_str(value, out.ph)) saw_ph = true;
+                    } else if (name == "id") {
+                        put_u64(value, out.id);
+                    } else if (name == "ts") {
+                        put_u64(value, out.ts);
+                    }
+                    break;
+                case 3:
+                    if (name == "pid") {
+                        put_u64(value, out.pid);
+                    } else if (name == "tid") {
+                        put_u64(value, out.tid);
+                    } else if (name == "cat") {
+                        put_str(value, out.cat);
+                    } else if (name == "dur") {
+                        put_u64(value, out.dur);
+                    }
+                    break;
+                case 4:
+                    if (name == "name") {
+                        put_str(value, out.name);
+                    } else if (name == "args" && value.is_object()) {
+                        out_args = value;
+                        out_has_args = true;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+        return saw_ph;
+    }
+
+    /// On-Demand variant of parse(): fills `out` via parser.for_each_field().
+    /// Missing or mistyped fields are left as-is; string/number/bool members
+    /// of object "args" go into out.args. Returns true iff "ph" was a string.
+    static bool parse_ondemand(common::json::JsonParser& parser,
+                               DFTracerEvent& out) {
+        bool has_ph = false;
+        parser.for_each_field([&](std::string_view key,
+                                  simdjson::ondemand::value val) {
+            if (key == "ph") {
+                auto r = val.get_string();
+                if (!r.error()) {
+                    out.ph = r.value_unsafe();
+                    has_ph = true;
+                }
+            } else if (key == "id") {
+                auto r = val.get_uint64();
+                if (!r.error()) out.id = r.value_unsafe();
+            } else if (key == "name") {
+                auto r = val.get_string();
+                if (!r.error()) out.name = r.value_unsafe();
+            } else if (key == "cat") {
+                auto r = val.get_string();
+                if (!r.error()) out.cat = r.value_unsafe();
+            } else if (key == "pid") {
+                auto r = val.get_uint64();
+                if (!r.error()) out.pid = r.value_unsafe();
+            } else if (key == "tid") {
+                auto r = val.get_uint64();
+                if (!r.error()) out.tid = r.value_unsafe();
+            } else if (key == "ts") {
+                auto r = val.get_uint64();
+                if (!r.error()) out.ts = r.value_unsafe();
+            } else if (key == "dur") {
+                auto r = val.get_uint64();
+                if (!r.error()) out.dur = r.value_unsafe();
+            } else if (key == "args") {
+                auto obj = val.get_object();
+                if (!obj.error()) {
+                    out.args.set_valid(true);
+                    for (auto field : obj.value_unsafe()) {
+                        if (field.error()) continue;
+                        auto fkey = field.unescaped_key();
+                        if (fkey.error()) continue;
+                        auto fval = field.value();
+                        if (fval.error()) continue;
+                        auto type = fval.type();
+                        if (type.error()) continue;
+                        const std::string_view k = fkey.value_unsafe();
+                        switch (type.value_unsafe()) {
+                            case simdjson::ondemand::json_type::string: {
+                                auto r = fval.get_string();
+                                if (!r.error())
+                                    out.args.insert(
+                                        k, std::string(r.value_unsafe()));
+                                break;
+                            }
+                            case simdjson::ondemand::json_type::number: {
+                                // uint64 first so values > INT64_MAX stay
+                                // exact, as in parse(); int64, then double.
+                                auto ru = fval.get_uint64();
+                                if (!ru.error()) {
+                                    out.args.insert(k, ru.value_unsafe());
+                                    break;
+                                }
+                                auto ri = fval.get_int64();
+                                if (!ri.error()) {
+                                    out.args.insert(k, ri.value_unsafe());
+                                    break;
+                                }
+                                auto rd = fval.get_double();
+                                if (!rd.error())
+                                    out.args.insert(k, rd.value_unsafe());
+                                break;
+                            }
+                            case simdjson::ondemand::json_type::boolean: {
+                                auto r = fval.get_bool();
+                                if (!r.error())
+                                    out.args.insert(k, r.value_unsafe());
+                                break;
+                            }
+                            default:
+                                break;
+                        }
+                    }
+                }
+            }
+        });
+        return has_ph;
+    }
 };
 
diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h b/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h
index f6a75b7c..9ffd1c9a 100644
--- a/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h
+++ b/include/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h
@@ -3,6 +3,7 @@
 
 #include <dftracer/utils/utilities/hash/fnv1a_hasher_utility.h>
 
+#include <array>
 #include <cstddef>
 #include <cstdint>
 #include <string_view>
@@ -11,11 +12,25 @@
 namespace dftracer::utils::utilities::composites::dft::indexing {
 
 /**
- * @brief Bloom filter for approximate set membership testing.
+ * @brief Split block Bloom filter for approximate set membership testing.
  *
- * Uses Kirsch-Mitzenmacher optimization: k hash functions derived from
- * 2 base hash values (std::hash with different seeds). Supports
- * serialization to/from binary blobs for RocksDB storage.
+ * Implements the split block Bloom filter from the Apache Parquet spec:
+ * 256-bit blocks of 8 x uint32 words; each insert/query touches exactly
+ * one block (one cache line) and sets/tests one bit in each of the 8
+ * words via a fixed SALT array. Block selection uses Lemire's reduction
+ * on h1; in-block masks use h2 multiplied by SALT.
+ *
+ * References:
+ *  - Apple, J. "Split block Bloom filters." arXiv:2101.01719 (2021).
+ *  - Putze, F., Sanders, P., Singler, J. "Cache-, hash-, and space-
+ *    efficient bloom filters." ACM JEA 14, Article 4 (2009).
+ *  - Apache Parquet Bloom filter spec:
+ *    https://github.com/apache/parquet-format/blob/master/BloomFilter.md
+ *
+ * Differs from canonical Parquet:
+ *  - Underlying hash is FNV1a + SplitMix64 finisher (not xxhash64).
+ *  - Custom 12-byte LE header (num_hashes, num_entries, num_bits) instead
+ *    of Thrift; num_hashes is unused at insert/test (vestigial).
  *
  * Serialization format (self-describing):
  *   [4 bytes: num_hashes (uint32_t LE)]
@@ -58,6 +73,11 @@ class BloomFilter {
     std::size_t num_hashes_;
     std::size_t num_entries_;
     mutable hash::Fnv1aHasherUtility hasher_;
+
+    static constexpr std::size_t LAST_VALUE_CAP = 64;
+    std::array<char, LAST_VALUE_CAP> last_value_buf_{};
+    std::size_t last_value_size_ = 0;
+    bool last_value_valid_ = false;
 };
 
 }  // namespace dftracer::utils::utilities::composites::dft::indexing
diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h
index 7807ed77..103816bc 100644
--- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h
+++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h
@@ -6,7 +6,6 @@
 #include <cstdint>
 #include <optional>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 namespace dftracer::utils::utilities::composites::dft::indexing {
@@ -21,16 +20,64 @@ struct ChunkDimensionStats {
     std::string value_type =
         "string";           ///< "string", "uint", "int", or "double".
 
-    /// Value -> count map. Nullopt when compressed size exceeds cap.
-    /// Uses transparent hash to allow string_view lookups without allocation.
-    std::optional<std::unordered_map<std::string, std::uint64_t,
-                                     utils::TransparentStringHash,
-                                     utils::TransparentStringEqual>>
-        value_counts;
+    std::optional<dftracer::utils::StringViewMap<std::uint64_t>> value_counts;
+
+    // Skips the hash lookup when the same value is observed back-to-back.
+    // Not copied/moved: a copy would point into the original's nodes.
+    const std::string* last_key_ = nullptr;
+    std::uint64_t* last_counter_ = nullptr;
+
+    ChunkDimensionStats() = default;
+    ChunkDimensionStats(const ChunkDimensionStats& other)
+        : dimension(other.dimension),
+          distinct_count(other.distinct_count),
+          min_value(other.min_value),
+          max_value(other.max_value),
+          value_type(other.value_type),
+          value_counts(other.value_counts) {}  // last_* stay null by design
+    ChunkDimensionStats(ChunkDimensionStats&& other) noexcept
+        : dimension(std::move(other.dimension)),
+          distinct_count(other.distinct_count),
+          min_value(std::move(other.min_value)),
+          max_value(std::move(other.max_value)),
+          value_type(std::move(other.value_type)),
+          value_counts(std::move(other.value_counts)) {
+        other.last_key_ = nullptr;  // other's map was just moved from
+        other.last_counter_ = nullptr;
+    }
+    ChunkDimensionStats& operator=(const ChunkDimensionStats& other) {
+        if (this != &other) {
+            dimension = other.dimension;
+            distinct_count = other.distinct_count;
+            min_value = other.min_value;
+            max_value = other.max_value;
+            value_type = other.value_type;
+            value_counts = other.value_counts;
+            last_key_ = nullptr;  // cache may reference the replaced map
+            last_counter_ = nullptr;
+        }
+        return *this;
+    }
+    ChunkDimensionStats& operator=(ChunkDimensionStats&& other) noexcept {
+        if (this != &other) {
+            dimension = std::move(other.dimension);
+            distinct_count = other.distinct_count;
+            min_value = std::move(other.min_value);
+            max_value = std::move(other.max_value);
+            value_type = std::move(other.value_type);
+            value_counts = std::move(other.value_counts);
+            last_key_ = nullptr;  // cache may reference the replaced map
+            last_counter_ = nullptr;
+            other.last_key_ = nullptr;  // other's map was just moved from
+            other.last_counter_ = nullptr;
+        }
+        return *this;
+    }
 
     /// Record a value observation. Updates min/max, distinct_count,
     /// value_counts.
     void observe(std::string_view value);
+    void observe_range_only(std::uint64_t value);
 
     /// Serialize value_counts to binary format:
     /// [u32 LE num_entries] [u16 LE key_len, key bytes, u64 LE count]*
@@ -41,11 +88,11 @@ struct ChunkDimensionStats {
     std::optional<std::vector<std::uint8_t>> compress_value_counts(
         std::size_t cap_bytes = 4096) const;
 
-    static std::unordered_map<std::string, std::uint64_t>
+    static dftracer::utils::StringViewMap<std::uint64_t>
     deserialize_value_counts(const std::uint8_t* data, std::size_t len);
 
     /// Decompress zlib-compressed value_counts, then deserialize.
-    static std::unordered_map<std::string, std::uint64_t>
+    static dftracer::utils::StringViewMap<std::uint64_t>
     decompress_value_counts(const std::uint8_t* data, std::size_t len);
 };
 
@@ -57,8 +104,23 @@ struct ChunkDimensionStatsResult {
     std::string min_value;
     std::string max_value;
     std::string value_type;
-    // NULL in DB → nullopt here
-    std::optional<std::unordered_map<std::string, std::uint64_t>> value_counts;
+    mutable std::optional<dftracer::utils::StringViewMap<std::uint64_t>>
+        value_counts;
+    // Raw compressed value_counts. Populated when value_counts is left
+    // un-decoded so callers can lazily decode on first access.
+    mutable std::vector<std::uint8_t> compressed_value_counts;
+
+    bool has_value_counts_payload() const {  // decoded map or raw bytes
+        return value_counts.has_value() || !compressed_value_counts.empty();
+    }
+
+    void ensure_value_counts_decoded() const {  // decode on first use
+        if (value_counts || compressed_value_counts.empty()) return;
+        value_counts = ChunkDimensionStats::decompress_value_counts(
+            compressed_value_counts.data(), compressed_value_counts.size());
+        compressed_value_counts.clear();  // raw bytes are not kept afterwards
+        compressed_value_counts.shrink_to_fit();
+    }  // NOTE(review): unsynchronized writes; confirm single-threaded use
 };
 
 }  // namespace dftracer::utils::utilities::composites::dft::indexing
diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h
index a49ebe64..c38b64ba 100644
--- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h
@@ -58,13 +58,10 @@ struct ChunkIndexerConfig {
 };
 
 // Hash resolution maps (collected once per file from metadata events)
-using HashResolveMap =
-    std::shared_ptr<std::unordered_map<std::string, std::string>>;
+using HashResolveMap = std::shared_ptr<StringViewMap<std::string>>;
 
 // Hash resolution entry: dimension -> {hash -> resolved_value}
-using HashResolutions =
-    std::unordered_map<std::string,
-                       std::unordered_map<std::string, std::string>>;
+using HashResolutions = StringViewMap<StringViewMap<std::string>>;
 
 // Tracks which dimensions have been indexed per chunk for incremental updates
 struct IndexedDimensions {
diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h
index cb0f0378..07acd8cc 100644
--- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h
@@ -5,6 +5,7 @@
 #include <dftracer/utils/core/utilities/utility.h>
 #include <dftracer/utils/utilities/common/query/query.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/bloom_filter_cache.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
 
 #include <cstdint>
 #include <string>
@@ -15,11 +16,18 @@ namespace dftracer::utils::utilities::composites::dft::indexing {
 using common::query::Query;
 
 /// Input for chunk pruning: index path, file path, query, optional cache.
+///
+/// If `external_db` is non-null the utility reuses that handle instead of
+/// opening the RocksDB at `index_path` itself. This lets callers that
+/// prune many files against the same directory-level index amortize the
+/// (expensive) RocksDB open cost to once per batch rather than once per
+/// file.
 struct ChunkPrunerInput {
     std::string index_path;             ///< Path to the `.dftindex` store.
     std::string file_path;              ///< Path to trace file.
     Query query;                        ///< Query to evaluate for pruning.
     BloomFilterCache* cache = nullptr;  ///< Optional bloom filter cache.
+    indexer::IndexDatabase* external_db = nullptr;  ///< Reused DB handle.
 };
 
 /// Result of chunk pruning.
@@ -31,6 +39,26 @@ struct ChunkPrunerOutput {
     bool success = false;  ///< True if pruning completed without error.
 };
 
+/// Input for batched pruning across many files that share the same
+/// `.dftindex` store. Allows a single RocksDB scan per column family to
+/// populate per-file pruner contexts instead of one scan per file.
+struct ChunkPrunerBatchItem {
+    std::string file_path;  ///< Trace file this item refers to.
+    Query query;            ///< Query paired with this file.
+};
+
+struct ChunkPrunerBatchInput {
+    std::string index_path;  ///< `.dftindex` store shared by all items.
+    std::vector<ChunkPrunerBatchItem> items;  ///< One entry per file.
+    BloomFilterCache* cache = nullptr;  ///< Optional bloom filter cache.
+    indexer::IndexDatabase* external_db = nullptr;  ///< Reused DB handle.
+};
+
+struct ChunkPrunerBatchOutput {
+    std::vector<ChunkPrunerOutput> outputs;  ///< Parallel to items[].
+    bool success = false;  ///< Batch-wide; per-file flag is in outputs[].
+};
+
 /// Three-tier chunk pruner: dictionary → min/max range → bloom filter.
 /// Walks the Query AST recursively (AND=intersect, OR=union, NOT=complement).
 class ChunkPrunerUtility
@@ -41,6 +69,10 @@ class ChunkPrunerUtility
 
     coro::CoroTask<ChunkPrunerOutput> process(
         const ChunkPrunerInput& input) override;
+
+    /// Batch-prune many files against the same index with shared RocksDB
+    /// range scans for dim_stats / chunk_statistics.
+    ChunkPrunerBatchOutput process_batch(const ChunkPrunerBatchInput& input);
 };
 
 }  // namespace dftracer::utils::utilities::composites::dft::indexing
diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h
index 37abe99c..203d1e6b 100644
--- a/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h
+++ b/include/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h
@@ -1,8 +1,10 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_CHUNK_STATISTICS_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_CHUNK_STATISTICS_H
 
+#include <dftracer/utils/core/common/transparent_string_hash.h>
 #include <dftracer/utils/utilities/common/statistics/ddsketch.h>
 #include <dftracer/utils/utilities/common/statistics/log2_histogram.h>
+#include <dftracer/utils/utilities/common/statistics/timestamp_histogram.h>
 
 #include <cstdint>
 #include <limits>
@@ -18,14 +20,14 @@ namespace dftracer::utils::utilities::composites::dft::indexing {
  *
  * Tracks event counts by category/name/pid:tid, timestamp ranges,
  * and duration statistics using Welford's online algorithm for variance.
- * Map fields serialize to JSON text via yyjson for storage in the
+ * Map fields serialize to JSON text for storage in the
  * shared `.dftindex` database.
  */
 struct ChunkStatistics {
     std::uint64_t total_events = 0;
-    std::unordered_map<std::string, std::uint64_t> category_counts;
-    std::unordered_map<std::string, std::uint64_t> name_counts;
-    std::unordered_map<std::string, std::uint64_t> pid_tid_counts;
+    StringViewMap<std::uint64_t> category_counts;
+    StringViewMap<std::uint64_t> name_counts;
+    StringViewMap<std::uint64_t> pid_tid_counts;
 
     std::uint64_t min_timestamp_us = std::numeric_limits<std::uint64_t>::max();
     std::uint64_t max_timestamp_us = 0;
@@ -37,13 +39,12 @@ struct ChunkStatistics {
 
     common::statistics::DDSketch duration_sketch{0.01};
     common::statistics::Log2Histogram duration_histogram;
-    std::unordered_map<std::string, common::statistics::DDSketch>
-        name_duration_sketches;
-    std::unordered_map<std::string, common::statistics::Log2Histogram>
-        name_duration_histograms;
-    std::unordered_map<std::string, double> name_duration_sums;
-    std::unordered_map<std::string, double> name_duration_sum_sqs;
-    std::unordered_map<std::string, std::string> name_category;
+    common::statistics::TimestampHistogram timestamp_histogram;
+    StringViewMap<common::statistics::DDSketch> name_duration_sketches;
+    StringViewMap<common::statistics::Log2Histogram> name_duration_histograms;
+    StringViewMap<double> name_duration_sums;
+    StringViewMap<double> name_duration_sum_sqs;
+    StringViewMap<std::string> name_category;
 
     void update_from_event(std::string_view name, std::string_view cat,
                            std::uint64_t pid, std::uint64_t tid,
@@ -62,13 +63,12 @@ struct ChunkStatistics {
     /// Serialize per-name DDSketches to a single binary blob.
     std::vector<std::uint8_t> serialize_name_duration_sketches() const;
 
-    static std::unordered_map<std::string, std::string> parse_string_map_json(
+    static StringViewMap<std::string> parse_string_map_json(
         const std::string& json);
-    static std::unordered_map<std::string, double> parse_double_map_json(
-        const std::string& json);
-    static std::unordered_map<std::string, common::statistics::Log2Histogram>
+    static StringViewMap<double> parse_double_map_json(const std::string& json);
+    static StringViewMap<common::statistics::Log2Histogram>
     parse_histogram_map_json(const std::string& json);
-    static std::unordered_map<std::string, common::statistics::DDSketch>
+    static StringViewMap<common::statistics::DDSketch>
     deserialize_name_duration_sketches(const std::uint8_t* data,
                                        std::size_t len);
 };
diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h b/include/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h
new file mode 100644
index 00000000..14806cce
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h
@@ -0,0 +1,86 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_INDEX_RESOLVER_UTILITY_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_INDEX_RESOLVER_UTILITY_H
+
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
+#include <dftracer/utils/core/utilities/tags/parallelizable.h>
+#include <dftracer/utils/core/utilities/utilities.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
+#include <dftracer/utils/utilities/indexer/index_file_entry_capability.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::indexing {
+
+struct ResolvedFile {
+    std::size_t file_index = 0;  // presumably indexes all_files; confirm
+    std::string file_path;
+    std::int32_t file_id = -1;  // -1 default; presumably "no id yet"
+    indexer::IndexFileEntryCapability capabilities =
+        indexer::IndexFileEntryCapability::NONE;  // none by default
+};
+
+struct FileWorkItem {  // identity fields of ResolvedFile, minus capabilities
+    std::size_t file_index = 0;  // presumably indexes all_files; confirm
+    std::string file_path;
+    std::int32_t file_id = -1;  // -1 default; presumably "no id yet"
+};
+
+struct ResolverInput {
+    std::string directory;
+    std::string index_dir;
+    std::vector<std::string> files;
+    // Which index capabilities the caller needs (presumably per file; confirm).
+    bool require_checkpoints = true;  // the only requirement on by default
+    bool require_bloom = false;
+    bool require_manifest = false;
+    bool require_aggregation = false;
+
+    // Full config for computing hash with stored time_interval
+    std::optional<aggregators::AggregationConfig> aggregation_config;
+};
+
+struct ResolverResult {
+    std::vector<std::string> all_files;
+    std::vector<std::size_t> all_file_sizes;  // likely parallel to all_files
+    std::string index_path;
+    // Per-capability lists of files still missing that artifact (presumably).
+    std::vector<FileWorkItem> needs_checkpoint;
+    std::vector<FileWorkItem> needs_bloom;
+    std::vector<FileWorkItem> needs_manifest;
+    std::vector<FileWorkItem> needs_aggregation;
+    // Files that need no further work (presumably; confirm against resolver).
+    std::vector<ResolvedFile> cached;
+
+    // Aggregation augmentation info (when cached aggregation exists with
+    // different time_interval)
+    bool needs_augmentation = false;
+    std::uint64_t stored_time_interval_us = 0;  // Time interval in cached data
+    // Sum of the four needs_* sizes; a file in several lists counts each time.
+    std::size_t total_needs_work() const {
+        return needs_checkpoint.size() + needs_bloom.size() +
+               needs_manifest.size() + needs_aggregation.size();
+    }
+
+    std::size_t total_cached() const { return cached.size(); }
+};
+
+class IndexResolverUtility  // Utility: ResolverInput -> ResolverResult
+    : public utilities::Utility<ResolverInput, ResolverResult,
+                                utilities::tags::Parallelizable,
+                                utilities::tags::NeedsContext> {
+   private:
+    filesystem::PatternDirectoryScannerUtility scanner_;
+
+   public:
+    coro::CoroTask<ResolverResult> process(const ResolverInput& input) override;
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::indexing
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_INDEX_RESOLVER_UTILITY_H
diff --git a/include/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h b/include/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h
new file mode 100644
index 00000000..1317e98e
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h
@@ -0,0 +1,43 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_RESOLVE_AND_BUILD_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_INDEXING_RESOLVE_AND_BUILD_H
+
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::indexing {
+
+struct ResolveAndBuildInput {  // Input for resolve_and_build_index()
+    std::string directory;
+    std::vector<std::string> files;
+    std::string index_dir;
+
+    std::size_t checkpoint_size = 32 * 1024 * 1024;  // 32 MiB default
+    std::size_t parallelism = 0;  // meaning of 0 not visible here (auto?)
+    bool force_rebuild = false;
+
+    bool require_checkpoints = true;
+    bool require_bloom = false;
+    bool require_manifest = false;
+    bool require_aggregation = false;
+
+    std::optional<aggregators::AggregationConfig> aggregation_config;
+};
+
+// Consolidates the common resolve -> build -> re-resolve pattern. `input` is
+// taken by value. Returns a ResolverResult with:
+//   - all_files: discovered files
+//   - index_path: path to shared index
+//   - cached: fully resolved files ready for use
+//   - needs_checkpoint: files that failed to index (for direct scan fallback)
+coro::CoroTask<ResolverResult> resolve_and_build_index(
+    CoroScope* scope, ResolveAndBuildInput input);
+
+}  // namespace dftracer::utils::utilities::composites::dft::indexing
+
+#endif
diff --git a/include/dftracer/utils/utilities/composites/dft/internal/utils.h b/include/dftracer/utils/utilities/composites/dft/internal/utils.h
index d2c6db5d..fe287abd 100644
--- a/include/dftracer/utils/utilities/composites/dft/internal/utils.h
+++ b/include/dftracer/utils/utilities/composites/dft/internal/utils.h
@@ -11,17 +11,19 @@ namespace dftracer::utils::utilities::composites::dft::internal {
 bool is_data_transfer_op(std::string_view cat, std::string_view name);
 
 /**
- * @brief Determine the root-local RocksDB index path for a given data file.
+ * @brief Determine the root-local RocksDB index path for a given input path.
  *
  * When a custom index directory is provided, the index root is
  * `<index_dir>/.dftindex`. Otherwise, the index root is placed alongside the
- * data file as `<file_dir>/.dftindex`.
+ * input path:
+ * - file path: `<file_dir>/.dftindex`
+ * - directory path: `<directory>/.dftindex`
  *
- * @param file_path Path to the data file (e.g., "data/trace.pfw.gz")
+ * @param path Path to a data file or directory
  * @param index_dir Optional custom directory for the index root.
  * @return Path to the owning `.dftindex` directory.
  */
-std::string determine_index_path(const std::string& file_path,
+std::string determine_index_path(const std::string& path,
                                  const std::string& index_dir = "");
 
 /**
diff --git a/include/dftracer/utils/utilities/composites/dft/parse_inflated.h b/include/dftracer/utils/utilities/composites/dft/parse_inflated.h
new file mode 100644
index 00000000..c3057863
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/parse_inflated.h
@@ -0,0 +1,108 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_PARSE_INFLATED_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_PARSE_INFLATED_H
+
+#include <dftracer/utils/utilities/common/json/json.h>
+#include <dftracer/utils/utilities/composites/dft/dft_event_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/event.h>
+#include <simdjson.h>
+
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace dftracer::utils::utilities::composites::dft {
+
+// Blank out, in place, every line of buf[0, len) whose only content besides
+// spaces/tabs/'\r' is a single '[' or ']'; the '\n' itself is kept. Mirrors
+// the behaviour used by DftEventDispatcher so parse_buffer can consume the
+// stripped buffer with parse_many. Public so direct-organize can reuse it.
+inline void strip_array_delimiters(char* buf, std::size_t len) {
+    for (std::size_t i = 0; i < len;) {
+        std::size_t line_start = i;
+        std::size_t line_end = i;  // ends on the line's '\n', or at len
+        while (line_end < len && buf[line_end] != '\n') ++line_end;
+
+        bool has_bracket = false;  // true only for a lone '[' / ']' line
+        for (std::size_t j = line_start; j < line_end; ++j) {
+            char c = buf[j];
+            if (c == ' ' || c == '\t' || c == '\r') continue;
+            if ((c == '[' || c == ']') && !has_bracket) {
+                has_bracket = true;
+            } else {  // any other byte, or a second bracket
+                has_bracket = false;
+                break;
+            }
+        }
+
+        if (has_bracket) {
+            for (std::size_t j = line_start; j < line_end; ++j) buf[j] = ' ';
+        }
+
+        i = (line_end < len) ? line_end + 1 : len;  // resume after the '\n'
+    }
+}
+
+// Iterate parsed dftracer events from a single inflated buffer. The buffer
+// is assumed to hold concatenated NDJSON-ish events plus the simdjson
+// padding required by parse_many; callers are responsible for stripping any
+// "[" / "]" delimiter lines (see strip_array_delimiters above).
+//
+// `chunk_buffer` is held shared so the EventRecord's string_view can outlive
+// the loop body if a visitor stashes it. `len` excludes the simdjson
+// padding tail. `line_number` is copied into the EventRecord and then
+// incremented, once per successfully parsed event. `needs_args_map` follows
+// the existing dispatcher contract (when any consumer requires the args
+// JSON object materialized, take the slower DFTracerEvent::parse path).
+//
+// Returns the number of trailing bytes parse_many reported as truncated, so
+// the caller can carry them over; 0 for empty input or a parse_many error.
+template <typename Cb>
+std::size_t parse_buffer(simdjson::dom::parser& parser,
+                         std::shared_ptr<std::string> chunk_buffer,
+                         std::size_t len, std::size_t checkpoint_idx,
+                         std::size_t& line_number, bool needs_args_map,
+                         Cb&& cb) {
+    if (!chunk_buffer || len == 0) return 0;  // nothing to parse
+
+    simdjson::dom::document_stream stream;
+    auto err = parser.parse_many(chunk_buffer->data(), len, len).get(stream);
+    if (err) return 0;  // stream setup failed; the error code is dropped
+
+    for (auto it = stream.begin(); it != stream.end(); ++it) {
+        if ((*it).error()) continue;  // skip documents that failed to parse
+        auto root = (*it).value_unsafe();
+        if (!root.is_object()) continue;  // only JSON objects are handled
+        common::json::JsonValue json(root);
+        DFTracerEvent ev;
+        simdjson::dom::element args_dom{};
+        bool has_args = false;
+        bool ok = false;
+        if (needs_args_map) {
+            ok = DFTracerEvent::parse(json, ev);
+            if (ok) {
+                auto args_r = root["args"];
+                if (!args_r.error() && args_r.value_unsafe().is_object()) {
+                    args_dom = args_r.value_unsafe();
+                    has_args = true;
+                }
+            }
+        } else {
+            ok = DFTracerEvent::parse_scalars(root, ev, args_dom, has_args);
+        }
+        if (!ok) continue;  // rejected events do not advance line_number
+
+        std::size_t ln = line_number++;  // record keeps the old value
+        std::string_view src = it.source();
+        EventRecord record{ev, json,     src,     chunk_buffer, checkpoint_idx,
+                           ln, args_dom, has_args};
+        cb(record);
+    }
+
+    std::size_t truncated = stream.truncated_bytes();
+    return (truncated > 0 && truncated <= len) ? truncated : 0;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_PARSE_INFLATED_H
diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h b/include/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h
new file mode 100644
index 00000000..c7172231
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h
@@ -0,0 +1,69 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_GROUP_WRITER_TASK_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_GROUP_WRITER_TASK_H
+
+#include <concurrentqueue.h>
+#include <dftracer/utils/core/coro/channel.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::reorganize {
+
+struct GroupWriterConfig {  // Settings passed to run_group_writer()
+    std::string group_name;
+    std::string group_query;
+    std::string output_dir;
+    std::size_t chunk_size_bytes = 256 * 1024 * 1024;  // 256 MiB
+    bool compress = true;
+    int compression_level = -1;  // Z_DEFAULT_COMPRESSION (level 6)
+    std::shared_ptr<coro::Channel<std::shared_ptr<LineBatch>>> input_channel;
+    const std::vector<SourceFileInfo>* source_files = nullptr;  // may be null
+    bool build_output_index = true;
+
+    std::string index_dir;
+    bool with_aggregation = false;
+    double agg_time_interval_us = 5'000'000.0;  // 5 s, assuming microseconds
+    std::vector<std::string> bloom_dimensions;
+    indexing::ChunkIndexerConfig bloom_config;
+    std::string staging_root;
+    std::shared_ptr<moodycamel::ConcurrentQueue<
+        indexer::IndexDatabaseSstWriterContext::Artifacts>>
+        artifacts_queue;
+    std::shared_ptr<std::atomic<std::size_t>> batch_counter;
+};
+
+struct ChunkMemberLayout {  // gzip-member layout of one chunk file
+    std::string path;
+    std::vector<fileio::parallel::ParallelWriter::MemberSpan> members;
+};
+
+struct GroupWriterResult {  // Returned by run_group_writer()
+    std::string group_name;
+    std::size_t events_written = 0;
+    std::size_t bytes_written = 0;
+    std::size_t chunks_created = 0;
+    std::vector<std::string> output_files;
+    /// Per-chunk-file gzip-member layout captured directly from the writer.
+    /// Lets downstream indexing skip the post-write gzip header re-scan.
+    std::vector<ChunkMemberLayout> chunk_layouts;
+    bool indexed_inline = false;
+    bool success = false;  // defaults to failure
+    std::string error_message;  // presumably set when success is false
+};
+
+coro::CoroTask<GroupWriterResult> run_group_writer(CoroScope* scope,
+                                                   GroupWriterConfig config);
+
+}  // namespace dftracer::utils::utilities::composites::dft::reorganize
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_GROUP_WRITER_TASK_H
diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h b/include/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h
new file mode 100644
index 00000000..7912cb91
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h
@@ -0,0 +1,38 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_MANIFEST_EXTRACTOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_MANIFEST_EXTRACTOR_H
+
+#include <dftracer/utils/core/coro/channel.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h>
+
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::reorganize {
+
+struct ManifestExtractorConfig {  // Input for extract_from_manifest()
+    std::string file_path;
+    std::string index_path;
+    std::size_t source_file_idx = 0;
+    std::vector<PredicateGroup> groups;
+    std::vector<std::shared_ptr<coro::Channel<std::shared_ptr<LineBatch>>>>
+        group_channels;  // presumably one channel per entry in `groups`
+    std::size_t batch_size = 1024;
+};
+
+struct ManifestExtractorResult {  // Outcome of extract_from_manifest()
+    std::size_t events_extracted = 0;
+    std::size_t events_unmatched = 0;
+    bool success = false;  // defaults to failure
+    std::string error_message;  // presumably set when success is false
+};
+
+coro::CoroTask<ManifestExtractorResult> extract_from_manifest(
+    ManifestExtractorConfig config);
+
+}  // namespace dftracer::utils::utilities::composites::dft::reorganize
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_MANIFEST_EXTRACTOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h b/include/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h
new file mode 100644
index 00000000..ab6f73a2
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h
@@ -0,0 +1,106 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_ORGANIZE_VISITOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_ORGANIZE_VISITOR_H
+
+#include <dftracer/utils/core/coro/channel.h>
+#include <dftracer/utils/utilities/common/query/query.h>
+#include <dftracer/utils/utilities/composites/dft/dft_event_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h>
+#include <dftracer/utils/utilities/indexer/index_visitor.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::reorganize {
+
+struct LineRecord {  // Locates one line inside LineBatch::bytes
+    std::uint32_t offset;  // byte offset into LineBatch::bytes
+    std::uint32_t length;  // byte length of the line
+    std::size_t source_file_idx;
+    std::size_t checkpoint_idx;
+    std::size_t source_line_number;
+};
+
+struct LineBatch {  // Lines packed back-to-back in `bytes`, indexed by `lines`
+    std::string bytes;
+    std::vector<LineRecord> lines;
+
+    void reserve(std::size_t n) {
+        lines.reserve(n);
+        bytes.reserve(n * 256);  // budgets 256 bytes per line
+    }
+    std::size_t size() const { return lines.size(); }
+    bool empty() const { return lines.empty(); }
+    void clear() {
+        lines.clear();
+        bytes.clear();
+    }
+
+    std::string_view line_view(std::size_t i) const {  // no bounds check on i
+        const auto& r = lines[i];
+        return std::string_view(bytes.data() + r.offset, r.length);
+    }
+
+    void append_line(std::string_view line, std::size_t source_file_idx,
+                     std::size_t checkpoint_idx,
+                     std::size_t source_line_number) {
+        auto offset = static_cast<std::uint32_t>(bytes.size());  // wraps >4GiB
+        bytes.append(line.data(), line.size());
+        lines.push_back(LineRecord{
+            .offset = offset,
+            .length = static_cast<std::uint32_t>(line.size()),
+            .source_file_idx = source_file_idx,
+            .checkpoint_idx = checkpoint_idx,
+            .source_line_number = source_line_number,
+        });
+    }
+};
+
+struct OrganizeVisitorConfig {  // Constructor argument of OrganizeVisitor
+    std::vector<PredicateGroup> groups;
+    std::vector<std::shared_ptr<coro::Channel<std::shared_ptr<LineBatch>>>>
+        group_channels;  // presumably one channel per entry in `groups`
+    std::size_t source_file_idx = 0;
+    std::size_t batch_size = 1024;
+};
+
+class OrganizeVisitor : public DftEventVisitor {
+   public:
+    explicit OrganizeVisitor(OrganizeVisitorConfig config);
+
+    void begin(std::size_t num_checkpoints) override;
+    void on_checkpoint(std::size_t checkpoint_idx) override;
+    void on_event(const EventRecord& record) override;
+    bool wants_drain() const noexcept override;
+    coro::CoroTask<void> drain_pending() override;
+    coro::CoroTask<void> on_file_complete() override;
+
+    std::unique_ptr<DftEventVisitor> create_parallel_slice() const override;
+    void merge_parallel_slice(DftEventVisitor& slice) override;
+
+    std::size_t events_routed() const { return events_routed_; }
+    std::size_t events_unmatched() const { return events_unmatched_; }
+
+   private:
+    std::size_t evaluate_event(const DFTracerEvent& ev,  // group index? (TBC)
+                               const common::json::JsonValue& json);
+
+    OrganizeVisitorConfig config_;
+    std::vector<std::optional<common::query::Query>> parsed_queries_;
+    std::vector<LineBatch> pending_batches_;  // one per group? (confirm)
+    /// Full LineBatches queued by `merge_parallel_slice` (move-only, no
+    /// byte copy). Drained alongside `pending_batches_` on the next
+    /// `drain_pending` / `on_file_complete` call.
+    std::vector<std::vector<std::shared_ptr<LineBatch>>> drain_queue_;
+    std::size_t current_checkpoint_ = 0;
+    std::size_t events_routed_ = 0;  // exposed via events_routed()
+    std::size_t events_unmatched_ = 0;  // exposed via events_unmatched()
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::reorganize
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_ORGANIZE_VISITOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h b/include/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h
new file mode 100644
index 00000000..bd6fc2e6
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h
@@ -0,0 +1,54 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_RECONSTRUCTOR_UTILITY_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_RECONSTRUCTOR_UTILITY_H
+
+#include <dftracer/utils/core/common/constants.h>
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
+#include <dftracer/utils/core/utilities/utility.h>
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::reorganize {
+
+struct ReconstructorInput {  // Input for ReconstructorUtility::process()
+    std::string input_dir;
+    std::string output_dir;
+    std::size_t checkpoint_size = constants::indexer::DEFAULT_CHECKPOINT_SIZE;
+    std::size_t parallelism = 0;  // meaning of 0 not visible here (auto?)
+    bool compress = true;
+
+    ReconstructorInput& with_input_dir(std::string dir);
+    ReconstructorInput& with_output_dir(std::string dir);
+    ReconstructorInput& with_checkpoint_size(std::size_t sz);
+    ReconstructorInput& with_parallelism(std::size_t n);
+    ReconstructorInput& with_compress(bool c);  // with_* defined out of line
+};
+
+struct ReconstructedFileInfo {  // Element of ReconstructorResult::files
+    std::string original_path;
+    std::string output_path;
+    std::size_t events_written = 0;
+    std::size_t bytes_written = 0;
+};
+
+struct ReconstructorResult {  // Output of ReconstructorUtility::process()
+    std::vector<ReconstructedFileInfo> files;
+    std::size_t total_events = 0;
+    std::size_t total_bytes = 0;
+    std::size_t total_segments = 0;
+    bool success = false;  // defaults to failure
+    std::string error_message;  // presumably set when success is false
+};
+
+class ReconstructorUtility  // Utility: ReconstructorInput -> Result
+    : public utilities::Utility<ReconstructorInput, ReconstructorResult,
+                                utilities::tags::NeedsContext> {
+   public:
+    coro::CoroTask<ReconstructorResult> process(
+        const ReconstructorInput& input) override;
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::reorganize
+
+#endif
diff --git a/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h b/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h
index 92ea2f20..9eb920a0 100644
--- a/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h
+++ b/include/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h
@@ -1,6 +1,7 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_REORGANIZATION_PLANNER_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_REORGANIZE_REORGANIZATION_PLANNER_H
 
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
 #include <dftracer/utils/core/utilities/utility.h>
 
 #include <cstddef>
@@ -47,7 +48,8 @@ struct ReorganizationPlannerInput {
 };
 
 class ReorganizationPlannerUtility
-    : public utilities::Utility<ReorganizationPlannerInput, ExtractionPlan> {
+    : public utilities::Utility<ReorganizationPlannerInput, ExtractionPlan,
+                                utilities::tags::NeedsContext> {
    public:
     ReorganizationPlannerUtility() = default;
 
diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h b/include/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h
index 6163de8d..f981aea0 100644
--- a/include/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h
+++ b/include/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h
@@ -1,6 +1,7 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_STATISTICS_DETAILED_STATISTICS_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_STATISTICS_DETAILED_STATISTICS_H
 
+#include <dftracer/utils/core/common/transparent_string_hash.h>
 #include <dftracer/utils/utilities/common/statistics/ddsketch.h>
 #include <dftracer/utils/utilities/common/statistics/log2_histogram.h>
 
@@ -51,14 +52,14 @@ struct DetailedStatistics {
     DistributionStats duration;
 
     // Per-group-key duration statistics
-    std::unordered_map<std::string, DistributionStats> grouped_duration;
+    StringViewMap<DistributionStats> grouped_duration;
 
     // Per-group-key I/O metrics (only for groups with I/O events)
-    std::unordered_map<std::string, IOEventMetrics> grouped_io;
+    StringViewMap<IOEventMetrics> grouped_io;
 
     // Maps group key -> category string (e.g. "POSIX", "dlio_benchmark")
     // Used by the display layer to split events by category.
-    std::unordered_map<std::string, std::string> group_key_category;
+    StringViewMap<std::string> group_key_category;
 
     // Scan progress
     std::uint64_t events_scanned = 0;
diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h b/include/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h
new file mode 100644
index 00000000..5e258dcb
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h
@@ -0,0 +1,156 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_STATISTICS_SHARED_INDEX_STATISTICS_READER_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_STATISTICS_SHARED_INDEX_STATISTICS_READER_H
+
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
+#include <dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.h>
+#include <dftracer/utils/utilities/composites/dft/statistics/trace_statistics.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::statistics {
+
+struct EntrySnapshot {  // Subset of a ResolvedFile kept per entry
+    std::size_t file_index;
+    int file_id;
+    std::string file_path;
+};
+
+struct SharedIndexBatchRows {  // Maps below are looked up by file_id
+    std::unordered_map<int, std::uint64_t> num_chunks;
+    std::unordered_map<int, utilities::indexer::MergedStatisticsResult>
+        fallback_merged_stats;  // queried only for ids missing from num_chunks
+    std::unordered_map<int, utilities::indexer::ChunkStatistics> merged_stats;
+    std::vector<EntrySnapshot> entries_snapshot;  // input order preserved
+};
+
+inline SharedIndexBatchRows query_shared_index_batch(  // opens index read-only
+    std::string index_path, std::vector<indexing::ResolvedFile> entries,
+    StatisticsQueryType query_type) {
+    SharedIndexBatchRows rows;
+    std::vector<int> file_ids;
+    file_ids.reserve(entries.size());
+    rows.entries_snapshot.reserve(entries.size());
+    for (auto& entry : entries) {  // by-value param: moving out is safe
+        file_ids.push_back(entry.file_id);
+        rows.entries_snapshot.push_back(EntrySnapshot{
+            entry.file_index, entry.file_id, std::move(entry.file_path)});
+    }
+
+    utilities::indexer::IndexDatabase idx_db(
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+    auto scalar_rows = idx_db.query_file_scalar_stats_batch(file_ids);
+    std::unordered_map<int, utilities::indexer::ChunkStatistics*> merge_targets;
+    merge_targets.reserve(file_ids.size());
+
+    std::vector<int> missing_ids;
+    missing_ids.reserve(file_ids.size());
+    rows.num_chunks.reserve(scalar_rows.size());
+    rows.merged_stats.reserve(scalar_rows.size());
+    for (auto& [file_id, merged] : scalar_rows) {  // files with scalar stats
+        rows.num_chunks.emplace(file_id, merged.num_chunks);
+        auto merged_entry =
+            rows.merged_stats.emplace(file_id, std::move(merged.stats));
+        merge_targets.emplace(file_id, &merged_entry.first->second);
+    }
+    for (const auto file_id : file_ids) {  // ids absent from scalar_rows
+        if (rows.num_chunks.find(file_id) == rows.num_chunks.end()) {
+            missing_ids.push_back(file_id);
+        }
+    }
+    const bool needs_categories =
+        query_type == StatisticsQueryType::SUMMARY ||
+        query_type == StatisticsQueryType::CATEGORIES ||
+        query_type == StatisticsQueryType::TOP_N_CATEGORIES;
+    const bool needs_names = query_type == StatisticsQueryType::NAMES ||
+                             query_type == StatisticsQueryType::TOP_N_NAMES;
+    const bool needs_pid_tids = query_type == StatisticsQueryType::SUMMARY ||
+                                query_type == StatisticsQueryType::PID_TIDS;
+
+    if (needs_categories) {  // merged into rows.merged_stats via merge_targets
+        idx_db.merge_file_category_counts_batch_into(file_ids, merge_targets);
+    }
+    if (needs_names) {
+        idx_db.merge_file_name_counts_batch_into(file_ids, merge_targets);
+    }
+    if (needs_pid_tids) {
+        idx_db.merge_file_pid_tid_counts_batch_into(file_ids, merge_targets);
+    }
+    if (!missing_ids.empty()) {  // fall back to the merged-stats query
+        rows.fallback_merged_stats =
+            idx_db.query_merged_statistics_batch(missing_ids);
+    }
+    return rows;
+}
+
+class SharedIndexStatisticsReader {  // batch query + per-file result fan-out
+   public:
+    SharedIndexStatisticsReader() = default;
+
+    coro::CoroTask<SharedIndexBatchRows> query(  // no co_await inside
+        std::string index_path, std::vector<indexing::ResolvedFile> entries,
+        StatisticsQueryType query_type) const {
+        co_return query_shared_index_batch(std::move(index_path),
+                                           std::move(entries), query_type);
+    }
+
+    template <typename Callback>
+    static void process_batch_results(SharedIndexBatchRows& batch_rows,
+                                      Callback& callback) {  // moves stats out
+        for (const auto& [file_index, file_id, file_path] :
+             batch_rows.entries_snapshot) {
+            const auto chunks_it = batch_rows.num_chunks.find(file_id);
+            if (chunks_it != batch_rows.num_chunks.end()) {
+                TraceStatistics stats;
+                stats.file_path = file_path;
+                stats.num_chunks = chunks_it->second;
+                stats.merged = std::move(batch_rows.merged_stats[file_id]);
+                stats.success = stats.num_chunks > 0;  // 0 chunks => failure
+                if (!stats.success) {
+                    stats.error_message =
+                        "No chunk statistics in index for " + file_path;
+                }
+                callback(file_index, std::move(stats));
+                continue;  // scalar path handled this file
+            }
+
+            auto merged_it = batch_rows.fallback_merged_stats.find(file_id);
+            callback(file_index,
+                     build_trace_statistics_from_index(
+                         file_path,
+                         merged_it == batch_rows.fallback_merged_stats.end()
+                             ? nullptr
+                             : &merged_it->second));
+        }
+    }
+
+   private:
+    static TraceStatistics build_trace_statistics_from_index(
+        const std::string& file_path,
+        utilities::indexer::MergedStatisticsResult* merged) {
+        TraceStatistics result;
+        result.file_path = file_path;
+
+        if (merged == nullptr || merged->num_chunks == 0) {  // no chunk stats
+            result.success = false;
+            result.error_message =
+                "No chunk statistics in index for " + file_path;
+            return result;
+        }
+
+        result.num_chunks = merged->num_chunks;
+        result.merged = std::move(merged->stats);  // leaves *merged moved-from
+        result.success = true;
+        return result;
+    }
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::statistics
+
+#endif
diff --git a/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h b/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h
index 7141ba23..c844c49f 100644
--- a/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h
@@ -6,6 +6,7 @@
 #include <dftracer/utils/utilities/composites/dft/statistics/trace_statistics.h>
 
 #include <string>
+#include <vector>
 
 namespace dftracer::utils::utilities::composites::dft::statistics {
 
@@ -15,6 +16,11 @@ struct StatisticsAggregatorInput {
     std::string index_dir;
 };
 
+struct StatisticsAggregatorBatchInput {  // Input for process_batch()
+    std::vector<std::string> file_paths;
+    std::string index_path;
+};
+
 class StatisticsAggregatorUtility
     : public utilities::Utility<StatisticsAggregatorInput, TraceStatistics,
                                 utilities::tags::Parallelizable> {
@@ -23,6 +29,9 @@ class StatisticsAggregatorUtility
 
     coro::CoroTask<TraceStatistics> process(
         const StatisticsAggregatorInput& input) override;
+
+    coro::CoroTask<std::vector<TraceStatistics>> process_batch(
+        const StatisticsAggregatorBatchInput& input);
 };
 
 }  // namespace dftracer::utils::utilities::composites::dft::statistics
diff --git a/include/dftracer/utils/utilities/composites/dft/views/view_reader_utility.h b/include/dftracer/utils/utilities/composites/dft/views/view_reader_utility.h
index 7709f9e8..46c44ac1 100644
--- a/include/dftracer/utils/utilities/composites/dft/views/view_reader_utility.h
+++ b/include/dftracer/utils/utilities/composites/dft/views/view_reader_utility.h
@@ -1,14 +1,15 @@
 #ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VIEWS_VIEW_READER_UTILITY_H
 #define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VIEWS_VIEW_READER_UTILITY_H
 
+#include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/core/utilities/streaming_utility.h>
 #include <dftracer/utils/core/utilities/tags/parallelizable.h>
 #include <dftracer/utils/utilities/common/query/query.h>
 #include <dftracer/utils/utilities/composites/dft/views/view_definition.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
-
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 #include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+#include <dftracer/utils/utilities/common/arrow/column_builder.h>
 #endif
 
 #include <cstddef>
@@ -56,6 +57,8 @@ struct ViewReaderBatch {
 
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
     common::arrow::ArrowExportResult to_arrow() const;
+    common::arrow::ArrowExportResult to_arrow(
+        common::arrow::RecordBatchBuilder& builder) const;
 #endif
 };
 
diff --git a/include/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h b/include/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h
new file mode 100644
index 00000000..b5d96146
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h
@@ -0,0 +1,148 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_BLOOM_VISITOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_BLOOM_VISITOR_H
+
+#include <dftracer/utils/core/common/transparent_string_hash.h>
+#include <dftracer/utils/utilities/composites/dft/dft_event_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
+
+#include <array>
+#include <cstddef>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer {
+class IndexBatchSink;
+class IndexDatabaseWriterContext;
+}  // namespace dftracer::utils::utilities::indexer
+
+namespace dftracer::utils::utilities::composites::dft::visitors {
+
+class BloomVisitor : public DftEventVisitor {
+   public:
+    using HashResolutions = indexing::HashResolutions;
+    using ChunkStatistics = indexing::ChunkStatistics;
+    using ChunkDimensionStats = indexing::ChunkDimensionStats;
+    using ChunkIndexerConfig = indexing::ChunkIndexerConfig;
+
+    /// Fixed bloom filter slots. Indices match DEFAULT_BLOOM_DIMENSIONS
+    /// order: name, cat, pid, tid, hhash, fhash, shash.
+    enum FixedBloom : std::uint8_t {
+        BF_NAME = 0,
+        BF_CAT,
+        BF_PID,
+        BF_TID,
+        BF_HHASH,
+        BF_FHASH,
+        BF_SHASH,
+        BF_COUNT  // slot count; extent of the fixed_blooms arrays
+    };
+
+    /// Fixed dimension_stats slots. Superset of bloom dims plus pid_tid,
+    /// ts, dur (which are observed for range stats but not hashed).
+    enum FixedDim : std::uint8_t {
+        FD_NAME = 0,
+        FD_CAT,
+        FD_PID,
+        FD_TID,
+        FD_PID_TID,
+        FD_HHASH,
+        FD_FHASH,
+        FD_SHASH,
+        FD_TS,
+        FD_DUR,
+        FD_COUNT  // slot count; extent of fixed_dim_stats
+    };
+
+    struct ChunkState {  // state for one checkpoint; element type of chunks_
+        std::array<indexing::BloomFilter, BF_COUNT> fixed_blooms;
+        std::array<ChunkDimensionStats, FD_COUNT> fixed_dim_stats;
+        std::vector<indexing::BloomFilter> extra_blooms;
+        std::vector<ChunkDimensionStats> extra_dim_stats;
+        ChunkStatistics statistics;
+        HashResolutions hash_resolutions;
+        std::size_t events_processed = 0;
+
+        ChunkState();
+    };
+
+    BloomVisitor(ChunkIndexerConfig config,
+                 std::vector<std::string> dimensions);
+    BloomVisitor(const BloomVisitor&) = delete;
+    BloomVisitor& operator=(const BloomVisitor&) = delete;
+    BloomVisitor(BloomVisitor&&) noexcept = default;
+    BloomVisitor& operator=(BloomVisitor&&) noexcept = default;
+
+    void begin(std::size_t num_checkpoints) override;
+    void on_checkpoint(std::size_t checkpoint_idx) override;
+    void on_event(const EventRecord& record) override;
+
+    std::unique_ptr<DftEventVisitor> create_parallel_slice() const override;
+    void merge_parallel_slice(DftEventVisitor& slice) override;
+
+    void finalize(indexer::IndexDatabaseWriterContext& writer, int file_id);
+    /// Emit bloom / stats / dimension records plus name dictionary/postings
+    /// to a sink backend. Skips ROOT_* summaries (rebuilt separately by
+    /// `IndexDatabase::rebuild_root_summaries()`). Works for both the
+    /// RocksDB-backed writer and the SST writer.
+    void finalize_sink_only(indexer::IndexBatchSink& sink, int file_id);
+
+    /// Emit per-checkpoint chunk records (bloom, stats, dim_stats,
+    /// name_chunk_postings) using the current `chunks_` buffer, merge their
+    /// state into the persistent file-level accumulator, then clear
+    /// `chunks_` and advance the base index. Used for mid-chunk slice
+    /// rotation when `chunks_` would otherwise grow unbounded.
+    void flush_per_checkpoint_to_sink(indexer::IndexBatchSink& sink,
+                                      int file_id);
+
+    /// Emit file-level records (file_bloom, scalar_stats, counts,
+    /// dimensions, name_dictionary, name_file_postings) from the persistent
+    /// accumulator. Call once at end-of-file.
+    void finalize_file_to_sink(indexer::IndexBatchSink& sink, int file_id);
+    /// Checkpoints seen so far: already flushed plus currently buffered.
+    std::size_t num_chunks() const { return chunks_base_idx_ + chunks_.size(); }
+
+    /// Total event count across already-flushed chunks plus the currently
+    /// buffered ones. Reflects all events ingested via on_event() so far.
+    std::uint64_t total_events() const {
+        std::uint64_t total = file_acc_.statistics.total_events;
+        for (const auto& chunk : chunks_) {
+            total += chunk.statistics.total_events;
+        }
+        return total;
+    }
+
+   private:
+    void ensure_chunk(std::size_t checkpoint_idx);
+
+    ChunkIndexerConfig config_;
+    std::vector<std::string> extra_dim_names_;
+    std::vector<ChunkState> chunks_;
+    /// Number of checkpoints already flushed and dropped from `chunks_`.
+    /// `chunks_[i]` represents checkpoint `chunks_base_idx_ + i`.
+    std::size_t chunks_base_idx_ = 0;
+
+    struct FileAccumulator {  // file-level state kept across chunk flushes
+        std::array<indexing::BloomFilter, BF_COUNT> fixed_blooms;
+        std::vector<indexing::BloomFilter> extra_blooms;
+        ChunkStatistics statistics;
+        std::size_t num_chunks_emitted = 0;
+        bool initialized = false;
+    };
+    FileAccumulator file_acc_;
+    // NOTE(review): presumably a last-seen pid/tid string cache; confirm.
+    std::uint64_t last_pid_ = UINT64_MAX;
+    std::uint64_t last_tid_ = UINT64_MAX;
+    char last_pid_buf_[24] = {};
+    char last_tid_buf_[24] = {};
+    std::uint8_t last_pid_len_ = 0;
+    std::uint8_t last_tid_len_ = 0;
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::visitors
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_BLOOM_VISITOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h b/include/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h
new file mode 100644
index 00000000..5281a06a
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h
@@ -0,0 +1,57 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_HASH_TABLE_VISITOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_HASH_TABLE_VISITOR_H
+
+#include <dftracer/utils/utilities/composites/dft/dft_event_visitor.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+
+namespace dftracer::utils::utilities::indexer {
+class IndexBatchSink;
+}
+
+namespace dftracer::utils::utilities::composites::dft::visitors {
+
+/// Captures FH/HH/SH/PR metadata events during indexing and stores them
+/// in the HASH_TABLES column family with bidirectional lookups:
+///   - Forward (hash -> name): for resolving hashes in output
+///   - Reverse (name -> hash): for query DSL like `file_name == "/path/..."`
+class HashTableVisitor : public DftEventVisitor {
+   public:
+    /// Hash table types matching dfanalyzer naming conventions
+    enum class HashType : std::uint8_t {
+        FILE = 0,    // fhash <-> file_name
+        HOST = 1,    // hhash <-> host_name
+        STRING = 2,  // shash <-> string value
+        PROC = 3     // phash <-> proc metadata
+    };
+
+    HashTableVisitor() = default;
+    HashTableVisitor(const HashTableVisitor&) = delete;
+    HashTableVisitor& operator=(const HashTableVisitor&) = delete;
+    HashTableVisitor(HashTableVisitor&&) noexcept = default;
+    HashTableVisitor& operator=(HashTableVisitor&&) noexcept = default;
+
+    void begin(std::size_t num_checkpoints) override;
+    void on_checkpoint(std::size_t checkpoint_idx) override;
+    void on_event(const EventRecord& record) override;
+
+    std::unique_ptr<DftEventVisitor> create_parallel_slice() const override;
+    void merge_parallel_slice(DftEventVisitor& slice) override;
+
+    void finalize(indexer::IndexBatchSink& writer, int file_id);
+
+    std::size_t num_entries() const;
+
+   private:  // NOTE(review): presumably one map per HashType
+    std::unordered_map<std::string, std::string> file_hashes_;
+    std::unordered_map<std::string, std::string> host_hashes_;
+    std::unordered_map<std::string, std::string> string_hashes_;
+    std::unordered_map<std::string, std::string> proc_metadata_;
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::visitors
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_HASH_TABLE_VISITOR_H
diff --git a/include/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h b/include/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h
new file mode 100644
index 00000000..752e0448
--- /dev/null
+++ b/include/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h
@@ -0,0 +1,59 @@
+#ifndef DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_MANIFEST_VISITOR_H
+#define DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_MANIFEST_VISITOR_H
+
+#include <dftracer/utils/utilities/composites/dft/dft_event_visitor.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer {
+class IndexBatchSink;
+}
+
+namespace dftracer::utils::utilities::composites::dft::visitors {
+
+class ManifestVisitor : public DftEventVisitor {
+   public:
+    ManifestVisitor() = default;
+
+    void begin(std::size_t num_checkpoints) override;
+    void on_checkpoint(std::size_t checkpoint_idx) override;
+    void on_event(const EventRecord& record) override;
+
+    std::unique_ptr<DftEventVisitor> create_parallel_slice() const override;
+    void merge_parallel_slice(DftEventVisitor& slice) override;
+    void set_line_offset(std::size_t offset) override { line_offset_ = offset; }
+    std::size_t parallel_event_count() const override { return event_count_; }
+
+    void finalize(indexer::IndexBatchSink& writer, int file_id);
+
+    /// Emit per-checkpoint event/metadata line records and clear the
+    /// vectors. Used for mid-chunk slice rotation.
+    void flush_per_checkpoint_to_sink(indexer::IndexBatchSink& sink,
+                                      int file_id);
+
+    /// Emit file-level records (observed pids). Call once at end-of-file.
+    void finalize_file_to_sink(indexer::IndexBatchSink& sink, int file_id);
+
+   private:
+    void ensure_chunk(std::size_t checkpoint_idx);
+
+    using EventKey = std::pair<std::string, std::string>;
+    using LineVec = std::vector<std::uint32_t>;
+
+    std::vector<std::map<EventKey, LineVec>> event_lines_;
+    std::vector<std::map<std::string, LineVec>> metadata_lines_;
+    std::unordered_set<std::uint64_t> observed_pids_;
+    std::size_t event_count_ = 0;  // returned by parallel_event_count()
+    std::size_t line_offset_ = 0;  // assigned by set_line_offset()
+    std::size_t base_idx_ = 0;  // NOTE(review): presumably flushed-chunk count
+};
+
+}  // namespace dftracer::utils::utilities::composites::dft::visitors
+
+#endif  // DFTRACER_UTILS_UTILITIES_COMPOSITES_DFT_VISITORS_MANIFEST_VISITOR_H
diff --git a/include/dftracer/utils/utilities/fileio/chunk_writer.h b/include/dftracer/utils/utilities/fileio/chunk_writer.h
index d61f8472..7c4432e4 100644
--- a/include/dftracer/utils/utilities/fileio/chunk_writer.h
+++ b/include/dftracer/utils/utilities/fileio/chunk_writer.h
@@ -6,6 +6,7 @@
 #include <dftracer/utils/utilities/compression/zlib/streaming_compressor_utility.h>
 
 #include <cstddef>
+#include <functional>
 #include <memory>
 #include <string>
 #include <vector>
@@ -20,6 +21,11 @@ struct ChunkWriterConfig {
     int compression_level = Z_DEFAULT_COMPRESSION;
     bool json_array_wrapper = true;
 
+    using ChunkRotationCallback = std::function<void(
+        std::size_t chunk_index, const std::string& chunk_path,
+        std::size_t event_count, std::size_t byte_count)>;
+    ChunkRotationCallback on_chunk_complete;  // empty (unset) by default
+
     ChunkWriterConfig& with_output_dir(std::string dir) {
         output_dir = std::move(dir);
         return *this;
@@ -44,6 +50,10 @@ struct ChunkWriterConfig {
         json_array_wrapper = enabled;
         return *this;
     }
+    ChunkWriterConfig& with_on_chunk_complete(ChunkRotationCallback callback) {
+        on_chunk_complete = std::move(callback);  // by-value param moved in
+        return *this;  // fluent style, like with_output_dir()
+    }
 };
 
 struct ChunkInfo {
diff --git a/include/dftracer/utils/utilities/fileio/parallel/layout.h b/include/dftracer/utils/utilities/fileio/parallel/layout.h
new file mode 100644
index 00000000..4a16e6e4
--- /dev/null
+++ b/include/dftracer/utils/utilities/fileio/parallel/layout.h
@@ -0,0 +1,59 @@
+#ifndef DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_LAYOUT_H
+#define DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_LAYOUT_H
+
+#include <cstddef>
+#include <string>
+
+namespace dftracer::utils::utilities::fileio::parallel {
+
+enum class FileLayout {
+    SHARDED,  // N files, glob by name; used on NFS
+    STRIPED,  // single file, atomic-offset pwrite; used on local and PFS
+};
+
+enum class FilesystemKind {
+    UNKNOWN,
+    LOCAL,  // ext4, xfs, btrfs, tmpfs, etc.
+    NFS,
+    LUSTRE,
+    GPFS,
+    BEEGFS,
+};
+
+struct LayoutInfo {  // detect_layout() result; defaults = nothing detected
+    FileLayout layout = FileLayout::STRIPED;      // non-NFS default
+    FilesystemKind fs = FilesystemKind::UNKNOWN;  // until detected
+    std::size_t stripe_size = 0;   // 0 if unknown/not applicable
+    std::size_t stripe_count = 0;  // 0 if unknown/not applicable
+};
+
+/// Detect layout for a path (the file need not exist yet; falls back to the
+/// parent directory). NFS maps to SHARDED, everything else to STRIPED.
+LayoutInfo detect_layout(const std::string& path) noexcept;
+
+struct WriterSizing {
+    std::size_t num_workers;      // capped by stripe_count on PFS
+    std::size_t flush_threshold;  // bytes; policy in compute_writer_sizing()
+    std::size_t buffer_capacity;  // presumably flush_threshold + headroom
+};
+
+/// Minimum stripe_size (1 MiB) for which the padded-striped layout is worth
+/// picking. Below this, compressed payloads may not reliably fit one stripe,
+/// so we fall back to the atomic-offset striped writer.
+inline constexpr std::size_t MIN_PADDED_STRIPE_BYTES = 1 * 1024 * 1024;
+
+/// Pure sizing policy. Worker count is capped by stripe_count on PFS.
+/// For the atomic-offset striped writer, flush_threshold = max(default,
+/// stripe_size) to keep pwrites large. For the padded striped writer,
+/// flush_threshold = stripe_size so the compressed result fits in one stripe.
+/// `baseline_workers` should already be capped at any caller-specific limit
+/// (e.g. number of aggregation shards).
+WriterSizing compute_writer_sizing(const LayoutInfo& info,
+                                   std::size_t baseline_workers,
+                                   std::size_t default_flush_bytes,
+                                   std::size_t buffer_headroom_bytes,
+                                   bool padded_layout = false) noexcept;
+
+}  // namespace dftracer::utils::utilities::fileio::parallel
+
+#endif  // DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_LAYOUT_H
diff --git a/include/dftracer/utils/utilities/fileio/parallel/merge.h b/include/dftracer/utils/utilities/fileio/parallel/merge.h
new file mode 100644
index 00000000..b5e4cc4d
--- /dev/null
+++ b/include/dftracer/utils/utilities/fileio/parallel/merge.h
@@ -0,0 +1,20 @@
+#ifndef DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_MERGE_H
+#define DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_MERGE_H
+
+#include <dftracer/utils/core/coro/task.h>
+
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::fileio::parallel {
+
+/// Concatenate `shards` into `target` (truncating), unlinking the shards on
+/// success and leaving them in place on failure. Valid for formats whose
+/// bytes concatenate cleanly (plain JSON/NDJSON, gzip members). Returns 0 on
+/// success, -1 on I/O failure. NOTE(review): by-ref args must outlive the task?
+coro::CoroTask<int> merge_shards(const std::string& target,
+                                 const std::vector<std::string>& shards);
+
+}  // namespace dftracer::utils::utilities::fileio::parallel
+
+#endif  // DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_MERGE_H
diff --git a/include/dftracer/utils/utilities/fileio/parallel/parallel_writer.h b/include/dftracer/utils/utilities/fileio/parallel/parallel_writer.h
new file mode 100644
index 00000000..3f1aeec0
--- /dev/null
+++ b/include/dftracer/utils/utilities/fileio/parallel/parallel_writer.h
@@ -0,0 +1,92 @@
+#ifndef DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_PARALLEL_WRITER_H
+#define DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_PARALLEL_WRITER_H
+
+#include <dftracer/utils/core/common/byte_view.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/fileio/parallel/layout.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <span>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils {
+class CoroScope;
+}
+
+namespace dftracer::utils::utilities::fileio::parallel {
+
+/// Parallel file writer interface. Concrete impls (striped, padded-striped,
+/// sharded) hide the on-disk layout; for gzip output the caller must feed
+/// standalone gzip members so chunks stay valid at any offset.
+class ParallelWriter {
+   public:
+    virtual ~ParallelWriter() = default;
+
+    /// Create/truncate backing storage. `scope` may be null for layouts that
+    /// don't spawn internal coroutines; padded-striped requires a non-null
+    /// scope that outlives close().
+    virtual coro::CoroTask<int> open(std::string path, std::size_t num_workers,
+                                     bool gzip_extension, CoroScope* scope) = 0;
+
+    /// Prologue, written before any worker chunk.
+    virtual coro::CoroTask<int> write_header(ByteView data) = 0;
+
+    /// Striped: placed at an atomic offset. Sharded: appended to shard N.
+    virtual coro::CoroTask<int> write_chunk(std::size_t worker_idx,
+                                            ByteView data) = 0;
+
+    /// Epilogue, written after all workers drain.
+    virtual coro::CoroTask<int> write_footer(ByteView data) = 0;
+
+    virtual coro::CoroTask<int> close() = 0;  // presumably 0 on success
+
+    /// One entry for striped; N entries (read order) for sharded.
+    virtual std::vector<std::string> output_paths() const = 0;
+
+    /// Per-write_chunk layout entry: byte offset + length of one independently
+    /// decompressable gzip member (or raw chunk for non-gzip layouts).
+    struct MemberSpan {
+        std::uint64_t offset;  // byte offset (shard-local for sharded)
+        std::uint64_t length;  // byte length of the member/chunk
+    };
+
+    /// Member offsets recorded by `write_chunk`, sorted by ascending offset.
+    /// Returned span is owned by the writer; valid until destruction.
+    /// Must be called after `close()` (no concurrent writes).
+    /// Empty for layouts that don't expose member boundaries.
+    virtual std::span<const MemberSpan> member_layout() const { return {}; }
+
+    /// Span of the most recent `write_chunk(worker_idx, ...)` call on this
+    /// worker. Caller must invoke immediately after `co_await write_chunk()`
+    /// returns; subsequent calls overwrite. For sharded layouts the offset
+    /// is shard-local; remap with `shard_base_offsets()` after close.
+    virtual std::optional<MemberSpan> last_member(
+        std::size_t /*worker_idx*/) const {
+        return std::nullopt;
+    }
+
+    /// Per-worker base offset to add to a shard-local `MemberSpan.offset` to
+    /// get the merged-file offset. Empty by default (no remap needed for
+    /// single-stream layouts). Call after `close()`.
+    virtual std::vector<std::uint64_t> shard_base_offsets() const { return {}; }
+};
+
+struct WriterConfig {
+    FileLayout layout = FileLayout::STRIPED;  // single-file layout by default
+    std::size_t stripe_size = 0;  // PFS stripe; 0 disables padded layout
+    bool gzip = false;  // NOTE(review): presumably output is gzip members
+};
+
+std::unique_ptr<ParallelWriter> make_writer(const WriterConfig& cfg);
+std::unique_ptr<ParallelWriter> make_striped_writer();
+std::unique_ptr<ParallelWriter> make_sharded_writer();
+std::unique_ptr<ParallelWriter> make_padded_striped_writer(
+    std::size_t stripe_size);
+
+}  // namespace dftracer::utils::utilities::fileio::parallel
+
+#endif  // DFTRACER_UTILS_UTILITIES_FILEIO_PARALLEL_PARALLEL_WRITER_H
diff --git a/include/dftracer/utils/utilities/filesystem/directory_scanner_utility.h b/include/dftracer/utils/utilities/filesystem/directory_scanner_utility.h
index 561c7486..5bec6dc7 100644
--- a/include/dftracer/utils/utilities/filesystem/directory_scanner_utility.h
+++ b/include/dftracer/utils/utilities/filesystem/directory_scanner_utility.h
@@ -3,6 +3,9 @@
 
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/coro/when_all.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
 #include <dftracer/utils/core/utilities/tags/parallelizable.h>
 #include <dftracer/utils/core/utilities/utility.h>
 #include <dftracer/utils/utilities/filesystem/types.h>
@@ -20,13 +23,16 @@ namespace dftracer::utils::utilities::filesystem {
 struct DirectoryScannerUtilityInput {
     fs::path path;
     bool recursive = false;  // Whether to scan subdirectories
+    bool populate_size = true;
 
-    explicit DirectoryScannerUtilityInput(fs::path p, bool rec = false)
-        : path(std::move(p)), recursive(rec) {}
+    explicit DirectoryScannerUtilityInput(fs::path p, bool rec = false,
+                                          bool with_size = true)
+        : path(std::move(p)), recursive(rec), populate_size(with_size) {}
 
     // Equality operator for caching/hashing
     bool operator==(const DirectoryScannerUtilityInput& other) const {
-        return path == other.path && recursive == other.recursive;
+        return path == other.path && recursive == other.recursive &&
+               populate_size == other.populate_size;
     }
 
     bool operator!=(const DirectoryScannerUtilityInput& other) const {
@@ -56,9 +62,9 @@ struct DirectoryScannerUtilityInput {
  * @endcode
  */
 class DirectoryScannerUtility
-    : public utilities::Utility<DirectoryScannerUtilityInput,
-                                std::vector<FileEntry>,
-                                utilities::tags::Parallelizable> {
+    : public utilities::Utility<
+          DirectoryScannerUtilityInput, std::vector<FileEntry>,
+          utilities::tags::Parallelizable, utilities::tags::NeedsContext> {
    public:
     DirectoryScannerUtility() = default;
     ~DirectoryScannerUtility() = default;
@@ -73,7 +79,7 @@ class DirectoryScannerUtility
      */
     coro::CoroTask<std::vector<FileEntry>> process(
         const DirectoryScannerUtilityInput& input) override {
-        std::vector<FileEntry> entries;
+        std::vector<fs::directory_entry> raw_entries;
 
         if (!fs::exists(input.path)) {
             throw fs::filesystem_error(
@@ -91,15 +97,39 @@ class DirectoryScannerUtility
             // Recursive directory iteration
             for (const auto& entry :
                  fs::recursive_directory_iterator(input.path)) {
-                entries.emplace_back(entry.path());
+                raw_entries.push_back(entry);
             }
         } else {
             // Non-recursive directory iteration
             for (const auto& entry : fs::directory_iterator(input.path)) {
-                entries.emplace_back(entry.path());
+                raw_entries.push_back(entry);
             }
         }
 
+        if (!this->has_context()) {
+            std::vector<FileEntry> entries;
+            entries.reserve(raw_entries.size());
+            for (const auto& entry : raw_entries) {
+                entries.emplace_back(entry, input.populate_size);
+            }
+            co_return entries;
+        }
+
+        CoroScope& ctx = this->context();
+        std::vector<coro::SpawnFuture<FileEntry>> tasks;
+        tasks.reserve(raw_entries.size());
+        for (auto& entry : raw_entries) {
+            auto entry_copy = std::move(entry);
+            tasks.push_back(
+                ctx.spawn([entry_copy = std::move(entry_copy),
+                           populate_size = input.populate_size](
+                              CoroScope&) mutable -> coro::CoroTask<FileEntry> {
+                    co_return FileEntry(entry_copy, populate_size);
+                }));
+        }
+        std::vector<FileEntry> entries =
+            co_await coro::when_all(std::move(tasks));
+
         co_return entries;
     }
 };
diff --git a/include/dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h b/include/dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h
index d934856f..e98a2685 100644
--- a/include/dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h
+++ b/include/dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h
@@ -2,6 +2,7 @@
 #define DFTRACER_UTILS_UTILITIES_FILESYSTEM_PATTERN_DIRECTORY_SCANNER_H
 
 #include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/utilities/tags/needs_context.h>
 #include <dftracer/utils/core/utilities/tags/parallelizable.h>
 #include <dftracer/utils/core/utilities/utilities.h>
 #include <dftracer/utils/utilities/filesystem/directory_scanner_utility.h>
@@ -18,14 +19,18 @@ namespace dftracer::utils::utilities::filesystem {
 struct PatternDirectoryScannerUtilityInput {
     std::string path;
     bool recursive = false;
+    bool populate_size = true;
     std::vector<std::string> patterns;  // e.g., {".pfw", ".pfw.gz", "*.txt"}
 
     PatternDirectoryScannerUtilityInput() = default;
 
     PatternDirectoryScannerUtilityInput(std::string p,
                                         std::vector<std::string> pats,
-                                        bool rec = false)
-        : path(std::move(p)), recursive(rec), patterns(std::move(pats)) {}
+                                        bool rec = false, bool with_size = true)
+        : path(std::move(p)),
+          recursive(rec),
+          populate_size(with_size),
+          patterns(std::move(pats)) {}
 
     static PatternDirectoryScannerUtilityInput from_path(std::string p) {
         PatternDirectoryScannerUtilityInput input;
@@ -43,6 +48,11 @@ struct PatternDirectoryScannerUtilityInput {
         recursive = rec;
         return *this;
     }
+
+    PatternDirectoryScannerUtilityInput& with_populate_size(bool with_size) {
+        populate_size = with_size;  // false: scanned entries keep size 0
+        return *this;  // fluent style: returns this input for chaining
+    }
 };
 
 /**
@@ -63,9 +73,9 @@ struct PatternDirectoryScannerUtilityInput {
  * @endcode
  */
 class PatternDirectoryScannerUtility
-    : public utilities::Utility<PatternDirectoryScannerUtilityInput,
-                                std::vector<FileEntry>,
-                                utilities::tags::Parallelizable> {
+    : public utilities::Utility<
+          PatternDirectoryScannerUtilityInput, std::vector<FileEntry>,
+          utilities::tags::Parallelizable, utilities::tags::NeedsContext> {
    private:
     DirectoryScannerUtility base_scanner_;
 
@@ -81,9 +91,15 @@ class PatternDirectoryScannerUtility
     coro::CoroTask<std::vector<FileEntry>> process(
         const PatternDirectoryScannerUtilityInput& input) override {
         // Step 1: Use base DirectoryScanner
-        DirectoryScannerUtilityInput dir_input{input.path, input.recursive};
-        std::vector<FileEntry> all_entries =
-            co_await base_scanner_.process(dir_input);
+        DirectoryScannerUtilityInput dir_input{input.path, input.recursive,
+                                               input.populate_size};
+        std::vector<FileEntry> all_entries;
+        if (this->has_context()) {
+            all_entries =
+                co_await this->context().spawn(base_scanner_, dir_input);
+        } else {
+            all_entries = co_await base_scanner_.process(dir_input);
+        }
 
         // Step 2: Filter by patterns
         std::vector<FileEntry> matched_entries;
diff --git a/include/dftracer/utils/utilities/filesystem/types.h b/include/dftracer/utils/utilities/filesystem/types.h
index bff24a6d..f41ca47d 100644
--- a/include/dftracer/utils/utilities/filesystem/types.h
+++ b/include/dftracer/utils/utilities/filesystem/types.h
@@ -16,16 +16,29 @@ struct FileEntry {
 
     FileEntry() = default;
 
-    explicit FileEntry(const fs::path& p)
+    explicit FileEntry(const fs::path& p, bool populate_size = true)
         : path(p), size(0), is_directory(false), is_regular_file(false) {
         if (fs::exists(p)) {
             is_directory = fs::is_directory(p);
             is_regular_file = fs::is_regular_file(p);
-            if (is_regular_file) {
+            if (populate_size && is_regular_file) {
                 size = fs::file_size(p);
             }
         }
     }
+
+    explicit FileEntry(const fs::directory_entry& entry,
+                       bool populate_size = true)
+        : path(entry.path()),
+          size(0),
+          is_directory(false),
+          is_regular_file(false) {
+        is_directory = entry.is_directory();  // no exists() pre-check
+        is_regular_file = entry.is_regular_file();
+        if (populate_size && is_regular_file) {
+            size = static_cast<std::size_t>(entry.file_size());  // may throw
+        }
+    }
 };
 
 }  // namespace dftracer::utils::utilities::filesystem
diff --git a/include/dftracer/utils/utilities/hash/fnv1a_hasher_utility.h b/include/dftracer/utils/utilities/hash/fnv1a_hasher_utility.h
index 1ba7159b..6b8b6e88 100644
--- a/include/dftracer/utils/utilities/hash/fnv1a_hasher_utility.h
+++ b/include/dftracer/utils/utilities/hash/fnv1a_hasher_utility.h
@@ -9,6 +9,51 @@
 
 namespace dftracer::utils::utilities::hash {
 
+// FNV-1a 64-bit parameters: initial state (offset basis) and multiplier.
+inline constexpr std::uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325ULL;
+inline constexpr std::uint64_t FNV1A_PRIME = 0x00000100000001B3ULL;
+
+// One-shot 64-bit FNV-1a over `len` bytes at `data` (len 0 -> offset basis).
+inline std::uint64_t fnv1a_hash(const void* data, std::size_t len) {
+    std::uint64_t hash = FNV1A_OFFSET_BASIS;
+    const auto* bytes = static_cast<const std::uint8_t*>(data);
+    for (std::size_t i = 0; i < len; ++i) {
+        hash ^= bytes[i];     // xor in the next byte first...
+        hash *= FNV1A_PRIME;  // ...then multiply; wraps modulo 2^64
+    }
+    return hash;
+}
+
+inline std::uint64_t fnv1a_hash(std::string_view data) {
+    return fnv1a_hash(data.data(), data.size());  // hash the view's bytes
+}
+
+// Incremental FNV-1a builder: feed data with update*(), read with finish*().
+struct Fnv1aHashBuilder {
+    std::uint64_t state = FNV1A_OFFSET_BASIS;
+
+    void update(const void* data, std::size_t len) {
+        const auto* bytes = static_cast<const std::uint8_t*>(data);
+        for (std::size_t i = 0; i < len; ++i) {
+            state ^= bytes[i];
+            state *= FNV1A_PRIME;
+        }
+    }
+
+    void update(std::string_view data) { update(data.data(), data.size()); }
+    // Feeds the raw object bytes of `val`: padding or pointer members leak in.
+    template <typename T>
+    void update_value(const T& val) {
+        update(&val, sizeof(val));
+    }
+
+    std::uint64_t finish() const { return state; }
+    std::uint32_t finish32() const {
+        // XOR-fold 64-bit to 32-bit
+        return static_cast<std::uint32_t>(state ^ (state >> 32));
+    }
+};
+
 /**
  * @brief FNV-1a 64-bit streaming hasher utility.
  *
@@ -18,10 +63,7 @@ namespace dftracer::utils::utilities::hash {
  */
 class Fnv1aHasherUtility : public internal::BaseHasherUtility {
    private:
-    static constexpr std::uint64_t FNV_OFFSET_BASIS = 0xcbf29ce484222325ULL;
-    static constexpr std::uint64_t FNV_PRIME = 0x00000100000001B3ULL;
-
-    std::uint64_t state_ = FNV_OFFSET_BASIS;
+    std::uint64_t state_ = FNV1A_OFFSET_BASIS;
 
    public:
     Fnv1aHasherUtility() { reset(); }
@@ -29,14 +71,14 @@ class Fnv1aHasherUtility : public internal::BaseHasherUtility {
     ~Fnv1aHasherUtility() override = default;
 
     void reset() override {
-        state_ = FNV_OFFSET_BASIS;
+        state_ = FNV1A_OFFSET_BASIS;
         current_hash_ = Hash{0};
     }
 
     void update(std::string_view data) override {
         for (unsigned char c : data) {
             state_ ^= c;
-            state_ *= FNV_PRIME;
+            state_ *= FNV1A_PRIME;
         }
         current_hash_ = Hash{static_cast<std::size_t>(state_)};
     }
diff --git a/include/dftracer/utils/utilities/indexer/file_partition.h b/include/dftracer/utils/utilities/indexer/file_partition.h
new file mode 100644
index 00000000..6d0d0dd4
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/file_partition.h
@@ -0,0 +1,54 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_FILE_PARTITION_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_FILE_PARTITION_H
+
+#include <dftracer/utils/utilities/filesystem/types.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <functional>
+#include <queue>
+#include <utility>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer {
+
+/// Greedy Longest-Processing-Time-first (LPT) assignment of files to
+/// `num_workers` buckets: files are visited from largest to smallest and
+/// each one lands in the bucket with the smallest byte total so far.
+///
+/// Used to eliminate straggler tails in the distributed indexer by keeping
+/// per-worker byte totals close. Exactly `num_workers` buckets are returned
+/// (0 is treated as 1); some may be empty. `files` is consumed.
+/// Complexity: O(N log N) for the sort + O(N log K), K = num_workers.
+inline std::vector<std::vector<filesystem::FileEntry>> plan_lpt_partition(
+    std::vector<filesystem::FileEntry> files, std::size_t num_workers) {
+    // Guard against a zero worker count so at least one bucket exists.
+    num_workers = std::max<std::size_t>(num_workers, 1);
+    std::vector<std::vector<filesystem::FileEntry>> buckets(num_workers);
+    if (files.empty()) return buckets;
+
+    const auto by_size_desc = [](const auto& lhs, const auto& rhs) {
+        return lhs.size > rhs.size;
+    };
+    std::sort(files.begin(), files.end(), by_size_desc);
+
+    // (byte total, bucket index) pairs ordered so the lightest bucket is on
+    // top; equal totals resolve to the lower bucket index.
+    using Load = std::pair<std::size_t, std::size_t>;
+    std::priority_queue<Load, std::vector<Load>, std::greater<>> lightest;
+    for (std::size_t slot = 0; slot < num_workers; ++slot) {
+        lightest.emplace(0, slot);
+    }
+
+    for (auto& file : files) {
+        const auto [load, slot] = lightest.top();
+        lightest.pop();
+        lightest.emplace(load + file.size, slot);  // read size before the move
+        buckets[slot].push_back(std::move(file));
+    }
+    return buckets;
+}
+
+}  // namespace dftracer::utils::utilities::indexer
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_FILE_PARTITION_H
diff --git a/include/dftracer/utils/utilities/indexer/index_batch_sink.h b/include/dftracer/utils/utilities/indexer/index_batch_sink.h
new file mode 100644
index 00000000..28f68bc2
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/index_batch_sink.h
@@ -0,0 +1,155 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_BATCH_SINK_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_BATCH_SINK_H
+
+#include <dftracer/utils/core/common/transparent_string_hash.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
+#include <dftracer/utils/utilities/indexer/internal/checkpoint.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <span>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer {
+
+/// Abstract sink that accepts index records for a batch of files.
+///
+/// Two backends implement this:
+///   - `IndexDatabaseWriterContext`: writes directly to a live RocksDB.
+///   - `IndexDatabaseSstWriterContext`: writes to SST files for later bulk
+///     ingest (process-safe fan-out for distributed indexing).
+///
+/// The interface covers file metadata, checkpoints, manifest event ranges
+/// and metadata lines, plus the bloom/stats/dimension, name-dictionary,
+/// hash-table, aggregation and system-metrics writes declared below.
+class IndexBatchSink {
+   public:
+    using IndexerCheckpoint = internal::IndexerCheckpoint;
+    using ChunkStatistics = composites::dft::indexing::ChunkStatistics;
+    using ChunkDimensionStats = composites::dft::indexing::ChunkDimensionStats;
+
+    virtual ~IndexBatchSink() = default;
+
+    virtual void insert_file_metadata(int file_id,
+                                      std::uint64_t checkpoint_size,
+                                      std::uint64_t total_lines,
+                                      std::uint64_t total_uc_size) = 0;
+
+    virtual void insert_checkpoint(int file_id,
+                                   const IndexerCheckpoint& checkpoint) = 0;
+
+    virtual void insert_event_range(
+        int file_id, std::uint64_t checkpoint_idx, std::string_view cat,
+        std::string_view name, std::span<const std::uint32_t> line_numbers) = 0;
+
+    virtual void insert_metadata_lines(
+        int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type,
+        std::span<const std::uint32_t> line_numbers) = 0;
+
+    virtual void insert_file_pids(
+        int file_id, const std::unordered_set<std::uint64_t>& pids) = 0;
+
+    // Bloom / stats / dimension CFs --------------------------------------
+
+    virtual void insert_chunk_bloom_filter(
+        int file_id, std::uint64_t checkpoint_idx, std::string_view dimension,
+        std::span<const unsigned char> blob_data,
+        std::uint64_t num_entries) = 0;
+
+    virtual void insert_file_bloom_filter(
+        int file_id, std::string_view dimension,
+        std::span<const unsigned char> blob_data,
+        std::uint64_t num_entries) = 0;
+
+    virtual void insert_chunk_statistics(int file_id,
+                                         std::uint64_t checkpoint_idx,
+                                         const ChunkStatistics& stats) = 0;
+
+    virtual void insert_file_scalar_stats(int file_id,
+                                          const ChunkStatistics& stats,
+                                          std::uint64_t num_chunks) = 0;
+
+    virtual void insert_file_category_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) = 0;
+
+    virtual void insert_file_pid_tid_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) = 0;
+
+    virtual void insert_file_name_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) = 0;
+
+    virtual void insert_index_dimension(int file_id,
+                                        std::string_view dimension) = 0;
+
+    virtual void insert_chunk_dimension_stats(
+        int file_id, std::uint64_t checkpoint_idx,
+        const ChunkDimensionStats& stats,
+        std::size_t value_counts_cap = 4096) = 0;
+
+    // Name dictionary + postings. `name_id` is a 64-bit FNV1a hash of `name`
+    // (deterministic, stateless), so multiple workers can emit the same
+    // (name_id, name) pair without coordination. Dictionary duplicates are
+    // dropped via `ingest_behind=true` at bulk-ingest time. Posting keys
+    // include the file_id, which is worker-disjoint, so they need no
+    // coordination either.
+    virtual void insert_name_dictionary_entry(std::uint64_t name_id,
+                                              std::string_view name) = 0;
+
+    virtual void insert_name_file_posting(std::uint64_t name_id,
+                                          int file_id) = 0;
+
+    virtual void insert_name_chunk_posting(std::uint64_t name_id, int file_id,
+                                           std::uint64_t checkpoint_idx) = 0;
+
+    // Content-addressed hash table (FH/HH/SH/PR). Writes both the forward
+    // (hash -> name) and reverse (name -> hash) entries. Deterministic keys
+    // mean different workers emit identical (key, value) pairs for shared
+    // hashes; cross-worker duplicates are resolved at read time via the LSM
+    // sequence number.
+    virtual void insert_hash_table_entry(std::uint8_t type,
+                                         std::string_view hash,
+                                         std::string_view name) = 0;
+
+    // Aggregation column family. The AGGREGATION CF holds a mix of
+    // Merge-operand records (per-`(pid, time_bucket, ...)` aggregated
+    // stats) and Put records (intern-dictionary entries using the
+    // AGG_INTERN_DICT_PREFIX prefix, global-config key, per-file
+    // completion markers, and EventAggregator finalization metadata).
+    // A rocksdb merge_operator collapses Merge operands at read/compaction
+    // time; the concrete writer routes to `db_->merge` / `db_->put` via
+    // the shared WriteBatch, the SST writer buffers `(key, value, op_kind)`
+    // tuples and emits a mixed-op SST on commit.
+    virtual void insert_aggregation_merge(std::string_view key,
+                                          std::string_view operand) = 0;
+
+    virtual void insert_aggregation_put(std::string_view key,
+                                        std::string_view value) = 0;
+
+    // System-metrics column family. Merge-operand only in practice (no
+    // intern dictionary sidecar).
+    virtual void insert_system_metrics_merge(std::string_view key,
+                                             std::string_view operand) = 0;
+
+    // Non-virtual vector overloads that forward to the span variants. NOTE:
+    // overriding a span variant hides these unless re-exposed with `using`.
+    void insert_event_range(int file_id, std::uint64_t checkpoint_idx,
+                            std::string_view cat, std::string_view name,
+                            const std::vector<std::uint32_t>& line_numbers) {
+        insert_event_range(file_id, checkpoint_idx, cat, name,
+                           std::span<const std::uint32_t>(line_numbers));
+    }
+
+    void insert_metadata_lines(int file_id, std::uint64_t checkpoint_idx,
+                               std::string_view meta_type,
+                               const std::vector<std::uint32_t>& line_numbers) {
+        insert_metadata_lines(file_id, checkpoint_idx, meta_type,
+                              std::span<const std::uint32_t>(line_numbers));
+    }
+};
+
+}  // namespace dftracer::utils::utilities::indexer
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_BATCH_SINK_H
diff --git a/include/dftracer/utils/utilities/indexer/index_builder_utility.h b/include/dftracer/utils/utilities/indexer/index_builder_utility.h
index cf86a8af..bca20b15 100644
--- a/include/dftracer/utils/utilities/indexer/index_builder_utility.h
+++ b/include/dftracer/utils/utilities/indexer/index_builder_utility.h
@@ -5,36 +5,51 @@
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/utilities/tags/needs_context.h>
 #include <dftracer/utils/core/utilities/utility.h>
+#include <dftracer/utils/utilities/composites/dft/dft_event_visitor.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+#include <dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h>
 
+#include <array>
 #include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
 #include <string>
+#include <string_view>
 #include <vector>
 
+namespace dftracer::utils {
+class CoroScope;
+}  // namespace dftracer::utils
+
 namespace dftracer::utils::utilities::indexer {
 
-inline std::vector<std::string> default_bloom_dimensions() {
-    return {"name", "cat", "pid", "tid", "hhash", "fhash", "shash"};
-}
+using dftracer::utils::CoroScope;
+
+inline constexpr std::array<std::string_view, 7> DEFAULT_BLOOM_DIMENSIONS = {
+    "name", "cat", "pid", "tid", "hhash", "fhash", "shash",
+};
+
+inline constexpr std::array<std::string_view, 5> DEFAULT_EXTRA_DIMENSIONS = {
+    "ret", "count", "offset", "epoch", "step",
+};
 
 struct IndexBuildConfig {
     std::string file_path;
     std::string index_dir;
     std::size_t checkpoint_size = 32 * 1024 * 1024;
-    std::size_t index_threshold =
-        constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;
     bool force_rebuild = false;
-    bool build_bloom = false;
     bool build_manifest = false;
     composites::dft::indexing::ChunkIndexerConfig bloom_config;
     std::vector<std::string> bloom_dimensions;
+    std::vector<std::reference_wrapper<composites::dft::DftEventVisitor>>
+        extra_dft_visitors;
 
     static IndexBuildConfig for_file(const std::string& path);
     IndexBuildConfig& with_index_dir(const std::string& dir);
     IndexBuildConfig& with_checkpoint_size(std::size_t size);
-    IndexBuildConfig& with_index_threshold(std::size_t threshold);
     IndexBuildConfig& with_force_rebuild(bool force);
-    IndexBuildConfig& with_bloom(bool enable = true);
     IndexBuildConfig& with_manifest(bool enable = true);
     IndexBuildConfig& with_bloom_config(
         const composites::dft::indexing::ChunkIndexerConfig& config);
@@ -53,6 +68,103 @@ struct IndexBuildResult {
     std::string error_message;
 };
 
+struct IndexBuildBatchConfig {
+    std::vector<std::string> file_paths;
+    std::string index_dir;
+    std::size_t checkpoint_size = 32 * 1024 * 1024;
+    std::size_t parallelism = 1;
+    bool force_rebuild = false;
+    bool build_manifest = false;
+    composites::dft::indexing::ChunkIndexerConfig bloom_config;
+    std::vector<std::string> bloom_dimensions;
+    bool use_batch_write = true;
+    bool rebuild_root_summaries = true;
+
+    /// If > 0, process files in sub-batches of this size, flushing parsed
+    /// artifacts to the write phase between sub-batches. Bounds peak memory
+    /// to ~flush_every_files worth of ParsedBloomJob state. 0 = no flush
+    /// (all files parsed before any write).
+    std::size_t flush_every_files = 0;
+
+    /// Factory for creating per-file DftEventVisitors during the parse phase.
+    /// Called once per file with the file path. Caller owns the returned
+    /// visitors and can extract results after the batch completes.
+    using DftVisitorFactory = std::function<
+        std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>(
+            const std::string& file_path)>;
+    DftVisitorFactory dft_visitor_factory;
+
+    /// Optional drain callback invoked once per sub-batch with the extra
+    /// visitors for that sub-batch's files. Lets the caller consume and
+    /// release visitor state immediately, keeping memory bounded by
+    /// flush_every_files instead of accumulating across the whole pipeline.
+    using ExtraVisitorsDrainFn = std::function<void(
+        std::vector<
+            std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>>)>;
+    ExtraVisitorsDrainFn extra_visitors_drain;
+
+    /// If non-empty, parallel to `file_paths`: use these file_ids instead
+    /// of allocating via `get_or_create_file_info`. Used by the distributed
+    /// indexer where the coordinator pre-registers all files. When set,
+    /// the write phase skips the DEFAULT-CF registry open/write step.
+    std::vector<int> preassigned_file_ids;
+
+    /// Optional per-file member slice (cross-rank file splitting). When
+    /// non-empty, must be parallel to `file_paths`. A null/empty entry
+    /// means "process the whole file"; a populated entry restricts the
+    /// build to `[member_begin, member_end)`. The `members` vector must
+    /// outlive the batch (typically stored in a shared member map).
+    struct FileSlice {
+        const std::vector<internal::GzipMember>* members = nullptr;
+        std::size_t member_begin = 0;
+        std::size_t member_end = 0;
+        std::uint64_t checkpoint_idx_base = 0;
+        /// When true, this file's file-scoped data (checkpoints,
+        /// bloom/manifest/hashtable, file_metadata) is NOT persisted by
+        /// the write phase. Aggregation/system-metrics SSTs produced by
+        /// extra visitors are still collected. Set by the MPI driver for
+        /// sliced ranks where `member_begin > 0` to avoid cross-rank key
+        /// collisions on file-scoped CFs.
+        bool skip_file_scoped_writes = false;
+    };
+    std::vector<FileSlice> file_slices;
+
+    /// Optional batch-sink factory. If set, the write phase constructs a
+    /// fresh sink per batch via this factory instead of opening the
+    /// RocksDB-backed writer on `index_dir`. Used by the distributed (SST)
+    /// pipeline to route writes to per-worker SstWriterContext instances.
+    /// `sink_commit` must also be set and is responsible for finalising
+    /// each sink (RocksDB path: call .commit(); SST path: flush + route
+    /// Artifacts to a registry).
+    using SinkFactory = std::function<std::unique_ptr<IndexBatchSink>()>;
+    using SinkCommitFn = std::function<void(IndexBatchSink&)>;
+    SinkFactory sink_factory;
+    SinkCommitFn sink_commit;
+};
+
+struct IndexBuildBatchMetrics {
+    std::uint64_t parse_ns = 0;
+    std::uint64_t write_ns = 0;
+    std::size_t files_enqueued = 0;
+    std::size_t files_parsed = 0;
+    std::size_t files_written = 0;
+};
+
+struct IndexBuildBatchResult {
+    std::vector<IndexBuildResult> results;
+    std::size_t indexed = 0;
+    std::size_t skipped = 0;
+    std::size_t failed = 0;
+    std::uint64_t total_events = 0;
+    IndexBuildBatchMetrics metrics;
+
+    /// Per-file extra visitors created by dft_visitor_factory during parsing.
+    /// Index corresponds to the file index in the original file_paths vector.
+    /// Empty vectors for files that failed or had no factory.
+    std::vector<std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>>
+        extra_visitors;
+};
+
 class IndexBuilderUtility
     : public Utility<IndexBuildConfig, IndexBuildResult, tags::NeedsContext> {
    public:
@@ -60,6 +172,12 @@ class IndexBuilderUtility
         const IndexBuildConfig& config) override;
 };
 
+class IndexBatchBuilderUtility {
+   public:
+    static coro::CoroTask<IndexBuildBatchResult> process(
+        CoroScope* scope, std::shared_ptr<IndexBuildBatchConfig> config);
+};
+
 }  // namespace dftracer::utils::utilities::indexer
 
 #endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_BUILDER_UTILITY_H
diff --git a/include/dftracer/utils/utilities/indexer/index_database.h b/include/dftracer/utils/utilities/indexer/index_database.h
index 76846a7c..bf79d741 100644
--- a/include/dftracer/utils/utilities/indexer/index_database.h
+++ b/include/dftracer/utils/utilities/indexer/index_database.h
@@ -3,67 +3,25 @@
 
 #include <dftracer/utils/core/rocksdb/database.h>
 #include <dftracer/utils/core/rocksdb/db_manager.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/queries/queries.h>
-#include <dftracer/utils/utilities/indexer/internal/checkpoint.h>
+#include <dftracer/utils/utilities/indexer/index_types.h>
 
 #include <cstdint>
 #include <memory>
 #include <optional>
-#include <span>
+#include <shared_mutex>
 #include <string>
 #include <string_view>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 namespace dftracer::utils::utilities::indexer {
 
-/**
- * @brief Unified `.dftindex` RocksDB store combining checkpoint, bloom
- *        filter, manifest, and archive metadata.
- *
- * Schema is additive: call init_base_schema() always, then
- * init_bloom_schema() and/or init_manifest_schema() as needed.
- *
- * All query/insert/delete operations are exposed as methods so callers
- * never need to use the queries:: namespace directly.
- */
+class IndexDatabaseWriterContext;
+class SstArtifactRegistry;
+
 class IndexDatabase {
    public:
-    // Re-export result types so callers don't need query headers
-    using ChunkBloomResult =
-        composites::dft::indexing::queries::ChunkBloomResult;
-    using FileBloomResult = composites::dft::indexing::queries::FileBloomResult;
-    using ChunkStatisticsResult =
-        composites::dft::indexing::queries::ChunkStatisticsResult;
-    using TimeBounds = composites::dft::indexing::queries::TimeBounds;
-    using EventRangeResult =
-        composites::dft::indexing::queries::EventRangeResult;
-    using MetadataLinesResult =
-        composites::dft::indexing::queries::MetadataLinesResult;
-    using ChunkStatistics = composites::dft::indexing::ChunkStatistics;
-    using ChunkDimensionStats = composites::dft::indexing::ChunkDimensionStats;
-    using ChunkDimensionStatsResult =
-        composites::dft::indexing::ChunkDimensionStatsResult;
-    using IndexerCheckpoint = internal::IndexerCheckpoint;
-    struct TarArchiveMetadata {
-        std::string archive_name;
-        std::uint64_t checkpoint_size = 0;
-        std::uint64_t total_lines = 0;
-        std::uint64_t total_uc_size = 0;
-        std::uint64_t total_files = 0;
-    };
-    struct TarFileRecord {
-        std::string file_name;
-        std::uint64_t file_size = 0;
-        std::uint64_t file_mtime = 0;
-        char typeflag = '\0';
-        std::uint64_t data_offset = 0;
-        std::uint64_t uncompressed_offset = 0;
-    };
-
     explicit IndexDatabase(
         const std::string& index_path,
         dftracer::utils::rocksdb::RocksDatabase::OpenMode open_mode =
@@ -77,84 +35,94 @@ class IndexDatabase {
 
     ~IndexDatabase() = default;
 
-    // Schema initialisation — idempotent (CREATE TABLE IF NOT EXISTS)
-    void init_base_schema();
-    void init_bloom_schema();
-    void init_manifest_schema();
+    std::unique_ptr<IndexDatabaseWriterContext> begin_write();
+
+    /// Ingest SST files produced by `IndexDatabaseSstWriterContext` instances.
+    /// File-id ranges across input SSTs must be disjoint for the METADATA,
+    /// CHECKPOINTS, and MANIFEST column families (step-1 scope). No-op on an
+    /// empty registry. Does NOT refresh root summaries; call
+    /// `rebuild_root_summaries()` afterward once all ingest phases are done.
+    ///
+    /// `skip_cfs` optionally holds CF names (e.g. `cf::AGGREGATION`) whose
+    /// SSTs must be left outside the unified DB. Distributed builds use this
+    /// to keep per-worker AGGREGATION / SYSTEM_METRICS SSTs addressable by
+    /// manifest for parallel reads at analyze time.
+    void bulk_ingest(const SstArtifactRegistry& registry,
+                     const std::unordered_set<std::string>& skip_cfs = {});
+
+    /// Recompute ROOT_SCALAR_STATS, ROOT_{CAT,NAME,PID_TID}_COUNTS from the
+    /// current per-file CFs. Call after `bulk_ingest` completes, or whenever
+    /// root-level summaries need to be regenerated from scratch.
+    void rebuild_root_summaries();
+
+    /// Write the aggregation global-config key (0xFFFE) into the
+    /// AGGREGATION CF. Required for `iter_arrow_dfanalyzer_all` to recognise
+    /// the index as aggregator-populated. Distributed builds call this after
+    /// `bulk_ingest(skip_cfs={aggregation, system_metrics})` so the unified
+    /// DB has a config marker even though the AGG SSTs live in the manifest.
+    /// `consolidate_index` invokes it too before the deferred AGG ingest.
+    void write_agg_global_config(std::uint64_t time_interval_us,
+                                 std::uint32_t config_hash = 0);
+
+    /// Write per-file aggregation completion markers (0xFFFF + file_id BE)
+    /// into the AGGREGATION CF. The index resolver treats these as "this
+    /// file has aggregated data"; without them, `ensure_indexed()` concludes
+    /// the aggregation tier is incomplete and re-runs the build. Distributed
+    /// builds must call this after `bulk_ingest`, since per-worker SSTs
+    /// carry data but not markers (markers are written via direct db->put,
+    /// not via the SST sink).
+    void write_agg_file_markers(const std::vector<int>& file_ids);
+
+    /// Merge per-worker AssociationTracker blobs and write the result to
+    /// the AGGREGATION CF under the `__tracker__` key.
+    void write_aggregation_tracker(const std::vector<std::string>& blobs);
+
+    /// Atomically reserve `count` contiguous file_ids, returning the first
+    /// id in the range `[first, first + count)`. Intended for the
+    /// distributed indexer: coordinator hands each worker its own disjoint
+    /// range up front so workers need no cross-worker coordination.
+    int reserve_file_id_range(std::size_t count);
+
+    /// Register a list of trace files in the DEFAULT-CF file registry and
+    /// return the assigned file_ids (parallel to `file_paths`). Idempotent:
+    /// files already registered with a matching hash keep their existing
+    /// id. Used by the distributed indexer's coordinator to pre-register
+    /// every file before dispatching work to SST-backed workers, so workers
+    /// never need to touch the DEFAULT column family themselves.
+    std::vector<int> register_files(const std::vector<std::string>& file_paths,
+                                    bool build_manifest);
+
+    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> db() const {
+        return db_;
+    }
+
+    // Schema initialisation, idempotent
+    void init_schema();
+
+    // -----------------------------------------------------------------------
+    // Read-only query API
+    // -----------------------------------------------------------------------
+
+    IndexFileEntryCapability get_file_capabilities(int file_id) const;
 
-    // Query helpers
     bool has_bloom_data(int file_id) const;
     bool has_manifest_data(int file_id) const;
 
-    int get_or_create_file_info(std::string_view path, std::uint64_t file_hash);
     int get_file_info_id(std::string_view path) const;
     std::optional<std::uint64_t> get_file_hash(std::string_view path) const;
+    std::unordered_map<std::string, int> query_all_file_info_ids() const;
+    std::unordered_map<std::string, FileRegistryEntry> query_all_file_registry()
+        const;
+    std::unordered_set<int> query_files_with_file_scalar_stats() const;
+    std::unordered_set<int> query_files_with_bloom_data() const;
 
-    // Convenience: resolve file path to file_id (handles logical path)
     int find_file(std::string_view file_path) const;
 
-    // Metadata queries
-    void insert_file_metadata(int file_id, std::uint64_t checkpoint_size,
-                              std::uint64_t total_lines,
-                              std::uint64_t total_uc_size);
     std::uint64_t get_checkpoint_size(int file_id) const;
     std::uint64_t get_num_lines(int file_id) const;
     std::uint64_t get_max_bytes(int file_id) const;
-
-    // Returns exact event count from chunk_statistics if bloom was built,
-    // otherwise falls back to num_lines (approximate).
     std::uint64_t get_total_events(int file_id) const;
 
-    void begin_transaction();
-    void commit_transaction();
-    void rollback_transaction() noexcept;
-
-    // -----------------------------------------------------------------------
-    // Bloom insert operations
-    // -----------------------------------------------------------------------
-
-    void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx,
-                                   std::string_view dimension,
-                                   std::span<const unsigned char> blob_data,
-                                   std::uint64_t num_entries);
-
-    void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx,
-                                   std::string_view dimension,
-                                   const void* blob_data, int blob_size,
-                                   std::uint64_t num_entries);
-
-    void insert_file_bloom_filter(int file_id, std::string_view dimension,
-                                  std::span<const unsigned char> blob_data,
-                                  std::uint64_t num_entries);
-
-    void insert_file_bloom_filter(int file_id, std::string_view dimension,
-                                  const void* blob_data, int blob_size,
-                                  std::uint64_t num_entries);
-
-    void insert_chunk_statistics(int file_id, std::uint64_t checkpoint_idx,
-                                 const ChunkStatistics& stats);
-    void insert_checkpoint(int file_id, const IndexerCheckpoint& checkpoint);
-
-    void insert_index_dimension(int file_id, std::string_view dimension);
-
-    void insert_hash_resolution(int file_id, std::string_view dimension,
-                                std::string_view hash_value,
-                                std::string_view resolved_value);
-
-    void insert_chunk_dimension_stats(int file_id, std::uint64_t checkpoint_idx,
-                                      const ChunkDimensionStats& stats,
-                                      std::size_t value_counts_cap = 4096);
-    void insert_tar_archive_metadata(int file_id, std::string_view archive_name,
-                                     std::uint64_t checkpoint_size,
-                                     std::uint64_t total_lines,
-                                     std::uint64_t total_uc_size,
-                                     std::uint64_t total_files);
-    void insert_tar_file(int file_id, const TarFileRecord& record);
-
-    // -----------------------------------------------------------------------
-    // Bloom query operations
-    // -----------------------------------------------------------------------
-
     std::vector<ChunkBloomResult> query_chunk_bloom_filters(
         int file_id, std::string_view dimension) const;
 
@@ -175,6 +143,40 @@ class IndexDatabase {
 
     std::vector<ChunkStatisticsResult> query_chunk_statistics(
         int file_id) const;
+    std::unordered_map<int, std::vector<ChunkStatisticsResult>>
+    query_chunk_statistics_batch(const std::vector<int>& file_ids) const;
+    std::unordered_map<int, MergedStatisticsResult>
+    query_merged_statistics_batch(const std::vector<int>& file_ids) const;
+    std::unordered_map<int, MergedStatisticsResult>
+    query_file_scalar_stats_batch(const std::vector<int>& file_ids) const;
+    std::unordered_map<int, FileMetadataResult> query_file_metadata_batch(
+        const std::vector<int>& file_ids) const;
+    std::unordered_map<int, StringViewMap<std::uint64_t>>
+    query_file_category_counts_batch(const std::vector<int>& file_ids) const;
+    std::unordered_map<int, StringViewMap<std::uint64_t>>
+    query_file_pid_tid_counts_batch(const std::vector<int>& file_ids) const;
+    std::unordered_map<int, NameSummaryResult> query_file_name_summaries_batch(
+        const std::vector<int>& file_ids) const;
+    std::optional<RootStatisticsResult> query_root_scalar_stats() const;
+    StringViewMap<std::uint64_t> query_root_category_counts() const;
+    StringViewMap<std::uint64_t> query_root_pid_tid_counts() const;
+    StringViewMap<std::uint64_t> query_root_name_counts() const;
+    void merge_file_category_counts_batch_into(
+        const std::vector<int>& file_ids,
+        std::unordered_map<int, ChunkStatistics*>& targets) const;
+    void merge_file_pid_tid_counts_batch_into(
+        const std::vector<int>& file_ids,
+        std::unordered_map<int, ChunkStatistics*>& targets) const;
+    void merge_file_name_counts_batch_into(
+        const std::vector<int>& file_ids,
+        std::unordered_map<int, ChunkStatistics*>& targets) const;
+    void merge_root_category_counts_into(ChunkStatistics& target) const;
+    void merge_root_pid_tid_counts_into(ChunkStatistics& target) const;
+    void merge_root_name_counts_into(ChunkStatistics& target) const;
+    std::vector<int> query_name_file_postings(std::string_view name) const;
+    std::vector<std::uint64_t> query_name_chunk_postings(std::string_view name,
+                                                         int file_id) const;
+    bool has_file_scalar_stats(int file_id) const;
     bool find_checkpoint(int file_id, std::size_t target_offset,
                          IndexerCheckpoint& checkpoint) const;
     std::vector<IndexerCheckpoint> query_checkpoints(int file_id) const;
@@ -193,76 +195,89 @@ class IndexDatabase {
 
     std::vector<ChunkDimensionStatsResult> query_chunk_dimension_stats(
         int file_id) const;
+    std::unordered_map<int, std::vector<ChunkDimensionStatsResult>>
+    query_chunk_dimension_stats_batch(const std::vector<int>& file_ids) const;
 
     std::vector<ChunkDimensionStatsResult>
     query_chunk_dimension_stats_for_dimension(int file_id,
                                               std::string_view dimension) const;
 
-    // Global queries (search across all files)
-    std::optional<std::string> query_resolved_by_hash(
-        std::string_view dimension, std::string_view hash_value) const;
+    std::optional<std::uint64_t> query_name_id(std::string_view name) const;
+    std::optional<std::string> query_name_by_id(std::uint64_t name_id) const;
 
-    std::vector<std::string> query_hash_by_resolved(
-        std::string_view dimension, std::string_view resolved_value) const;
+    std::vector<EventRangeResult> query_event_ranges(int file_id) const;
 
-    // -----------------------------------------------------------------------
-    // Bloom delete operations
-    // -----------------------------------------------------------------------
+    std::vector<EventRangeResult> query_event_ranges_for_checkpoint(
+        int file_id, std::uint64_t checkpoint_idx) const;
+
+    std::vector<MetadataLinesResult> query_metadata_lines(int file_id) const;
 
-    void delete_chunk_bloom_filters(int file_id, std::string_view dimension);
-    void delete_file_bloom_filter(int file_id, std::string_view dimension);
-    void delete_chunk_statistics(int file_id);
-    void delete_chunk_dimension_stats(int file_id);
-    void delete_hash_resolutions(int file_id);
+    std::vector<MetadataLinesResult> query_metadata_lines_for_checkpoint(
+        int file_id, std::uint64_t checkpoint_idx) const;
 
     // -----------------------------------------------------------------------
-    // Manifest insert operations
+    // PID manifest query API (for distributed aggregation)
     // -----------------------------------------------------------------------
 
-    void insert_event_range(int file_id, std::uint64_t checkpoint_idx,
-                            std::string_view cat, std::string_view name,
-                            std::span<const std::uint32_t> line_numbers);
+    /// Query the set of PIDs observed in a specific file.
+    std::unordered_set<std::uint64_t> query_file_pids(int file_id) const;
 
-    void insert_event_range(int file_id, std::uint64_t checkpoint_idx,
-                            std::string_view cat, std::string_view name,
-                            const std::vector<std::uint32_t>& line_numbers);
-
-    void insert_metadata_lines(int file_id, std::uint64_t checkpoint_idx,
-                               std::string_view meta_type,
-                               std::span<const std::uint32_t> line_numbers);
-
-    void insert_metadata_lines(int file_id, std::uint64_t checkpoint_idx,
-                               std::string_view meta_type,
-                               const std::vector<std::uint32_t>& line_numbers);
+    /// Query the PIDs for all files at once.
+    /// Returns {file_id -> set of PIDs}.
+    std::unordered_map<int, std::unordered_set<std::uint64_t>>
+    query_all_file_pids() const;
 
     // -----------------------------------------------------------------------
-    // Manifest query operations
+    // Hash table query API (FH/HH/SH/PR mappings)
     // -----------------------------------------------------------------------
 
-    std::vector<EventRangeResult> query_event_ranges(int file_id) const;
+    enum class HashType : std::uint8_t {
+        FILE = 0,    // FH: file hash -> file name
+        HOST = 1,    // HH: host hash -> host name
+        STRING = 2,  // SH: string hash -> string value
+        PROC = 3     // PR: proc hash -> proc metadata
+    };
 
-    std::vector<EventRangeResult> query_event_ranges_for_checkpoint(
-        int file_id, std::uint64_t checkpoint_idx) const;
+    /// Query all entries of a given hash type.
+    /// Returns map of {hash_value -> resolved_name}.
+    std::unordered_map<std::string, std::string> query_hash_table(
+        HashType type) const;
 
-    std::vector<MetadataLinesResult> query_metadata_lines(int file_id) const;
+    /// Resolve a single hash to its name.
+    /// Returns nullopt if hash is not found.
+    std::optional<std::string> resolve_hash(HashType type,
+                                            std::string_view hash) const;
 
-    std::vector<MetadataLinesResult> query_metadata_lines_for_checkpoint(
-        int file_id, std::uint64_t checkpoint_idx) const;
+    /// Query all hash tables at once.
+    /// Returns {type -> {hash -> name}}.
+    std::unordered_map<HashType, std::unordered_map<std::string, std::string>>
+    query_all_hash_tables() const;
 
-    // -----------------------------------------------------------------------
-    // Manifest delete operations
-    // -----------------------------------------------------------------------
-
-    void delete_event_ranges(int file_id);
-    void delete_metadata_lines(int file_id);
+    /// Resolve a name to its hash (reverse lookup for query DSL).
+    /// Returns nullopt if name is not found.
+    std::optional<std::string> resolve_name_to_hash(
+        HashType type, std::string_view name) const;
 
    private:
-    void delete_file_data(int file_id);
+    void ensure_hash_tables_cached() const;
 
     std::string db_path_;
     dftracer::utils::rocksdb::RocksDatabase::OpenMode open_mode_;
     std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> db_;
-    std::unique_ptr<dftracer::utils::rocksdb::RocksDatabase::Batch> txn_batch_;
+
+    struct HashCache {  // one map pair per HashType value
+        std::shared_mutex mutex;
+        bool loaded = false;  // presumably true once maps are filled
+        std::unordered_map<std::string, std::string> file_hash;  // hash->name?
+        std::unordered_map<std::string, std::string> host_hash;
+        std::unordered_map<std::string, std::string> string_hash;
+        std::unordered_map<std::string, std::string> proc_hash;
+        std::unordered_map<std::string, std::string> file_name;  // name->hash?
+        std::unordered_map<std::string, std::string> host_name;
+        std::unordered_map<std::string, std::string> string_name;
+        std::unordered_map<std::string, std::string> proc_name;
+    };
+    mutable std::unique_ptr<HashCache> hash_cache_;
 };
 
 }  // namespace dftracer::utils::utilities::indexer
diff --git a/include/dftracer/utils/utilities/indexer/index_database_sst_writer_context.h b/include/dftracer/utils/utilities/indexer/index_database_sst_writer_context.h
new file mode 100644
index 00000000..7ec980a8
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/index_database_sst_writer_context.h
@@ -0,0 +1,303 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_SST_WRITER_CONTEXT_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_SST_WRITER_CONTEXT_H
+
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+
+#include <cstdint>
+#include <mutex>
+#include <optional>
+#include <span>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer {
+
+/// Per-batch SST emitter that implements `IndexBatchSink` by buffering
+/// (key, value) pairs in memory per column family and flushing them to sorted
+/// SST files on `commit()`.
+///
+/// Usage mirrors `IndexDatabaseWriterContext`: one instance per batch, call
+/// the `insert_*` methods, then `commit()`. Unlike that class, `commit()`
+/// returns `Artifacts` holding the paths of the SST files produced, which a
+/// coordinator later ingests via `IndexDatabase::bulk_ingest()`.
+///
+/// Process-safe: holds no RocksDB handle. Many contexts run concurrently
+/// across threads or processes, provided each is given a disjoint file_id
+/// range so SST key prefixes do not overlap.
+class IndexDatabaseSstWriterContext : public IndexBatchSink {
+   public:
+    struct Artifacts {
+        std::optional<std::string> metadata_sst;
+        std::optional<std::string> checkpoints_sst;
+        std::optional<std::string> manifest_sst;
+        std::optional<std::string> chunk_bloom_sst;
+        std::optional<std::string> file_bloom_sst;
+        std::optional<std::string> chunk_stats_sst;
+        std::optional<std::string> chunk_dim_stats_sst;
+        std::optional<std::string> dimensions_sst;
+        std::optional<std::string> file_scalar_stats_sst;
+        std::optional<std::string> file_cat_counts_sst;
+        std::optional<std::string> file_pid_tid_counts_sst;
+        std::optional<std::string> file_name_counts_sst;
+        std::optional<std::string> name_dictionary_sst;
+        std::optional<std::string> name_file_postings_sst;
+        std::optional<std::string> name_chunk_postings_sst;
+        std::optional<std::string> hash_tables_sst;
+        std::optional<std::string> aggregation_sst;
+        std::optional<std::string> system_metrics_sst;
+
+        bool empty() const noexcept {  // true iff no SST path is set
+            return !metadata_sst.has_value() && !checkpoints_sst.has_value() &&
+                   !manifest_sst.has_value() && !chunk_bloom_sst.has_value() &&
+                   !file_bloom_sst.has_value() &&
+                   !chunk_stats_sst.has_value() &&
+                   !chunk_dim_stats_sst.has_value() &&
+                   !dimensions_sst.has_value() &&
+                   !file_scalar_stats_sst.has_value() &&
+                   !file_cat_counts_sst.has_value() &&
+                   !file_pid_tid_counts_sst.has_value() &&
+                   !file_name_counts_sst.has_value() &&
+                   !name_dictionary_sst.has_value() &&
+                   !name_file_postings_sst.has_value() &&
+                   !name_chunk_postings_sst.has_value() &&
+                   !hash_tables_sst.has_value() &&
+                   !aggregation_sst.has_value() &&
+                   !system_metrics_sst.has_value();
+        }
+
+        /// Move every populated SST file to `dest_dir` (created if missing)
+        /// and return a new Artifacts whose paths point at the new location.
+        /// Uses `fs::rename` when src and dst resolve to the same filesystem
+        /// (O(1), atomic) and falls back to copy + unlink across filesystems.
+        /// Intended for the node-local -> shared FS handoff in the
+        /// distributed indexer. Rvalue-qualified: the original Artifacts is
+        /// left empty.
+        Artifacts move_to(std::string_view dest_dir) &&;
+    };
+
+    /// Build SSTs into a unique subdirectory under `staging_dir`. `batch_id`
+    /// must be unique across concurrent writers pointing at the same staging
+    /// root so paths do not collide.
+    IndexDatabaseSstWriterContext(std::string staging_dir,
+                                  std::string batch_id);
+
+    IndexDatabaseSstWriterContext(const IndexDatabaseSstWriterContext&) =
+        delete;
+    IndexDatabaseSstWriterContext& operator=(
+        const IndexDatabaseSstWriterContext&) = delete;
+
+    IndexDatabaseSstWriterContext(IndexDatabaseSstWriterContext&&) noexcept;
+    IndexDatabaseSstWriterContext& operator=(
+        IndexDatabaseSstWriterContext&&) noexcept;
+
+    ~IndexDatabaseSstWriterContext() override;
+
+    using IndexBatchSink::insert_event_range;  // keep base overloads visible
+    using IndexBatchSink::insert_metadata_lines;  // (same reason)
+
+    void insert_file_metadata(int file_id, std::uint64_t checkpoint_size,
+                              std::uint64_t total_lines,
+                              std::uint64_t total_uc_size) override;
+
+    void insert_checkpoint(int file_id,
+                           const IndexerCheckpoint& checkpoint) override;
+
+    void insert_event_range(
+        int file_id, std::uint64_t checkpoint_idx, std::string_view cat,
+        std::string_view name,
+        std::span<const std::uint32_t> line_numbers) override;
+
+    void insert_metadata_lines(
+        int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type,
+        std::span<const std::uint32_t> line_numbers) override;
+
+    void insert_file_pids(
+        int file_id, const std::unordered_set<std::uint64_t>& pids) override;
+
+    void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx,
+                                   std::string_view dimension,
+                                   std::span<const unsigned char> blob_data,
+                                   std::uint64_t num_entries) override;
+
+    void insert_file_bloom_filter(int file_id, std::string_view dimension,
+                                  std::span<const unsigned char> blob_data,
+                                  std::uint64_t num_entries) override;
+
+    void insert_chunk_statistics(int file_id, std::uint64_t checkpoint_idx,
+                                 const ChunkStatistics& stats) override;
+
+    void insert_file_scalar_stats(int file_id, const ChunkStatistics& stats,
+                                  std::uint64_t num_chunks) override;
+
+    void insert_file_category_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) override;
+
+    void insert_file_pid_tid_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) override;
+
+    void insert_file_name_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) override;
+
+    void insert_index_dimension(int file_id,
+                                std::string_view dimension) override;
+
+    void insert_chunk_dimension_stats(
+        int file_id, std::uint64_t checkpoint_idx,
+        const ChunkDimensionStats& stats,
+        std::size_t value_counts_cap = 4096) override;
+
+    void insert_name_dictionary_entry(std::uint64_t name_id,
+                                      std::string_view name) override;
+
+    void insert_name_file_posting(std::uint64_t name_id, int file_id) override;
+
+    void insert_name_chunk_posting(std::uint64_t name_id, int file_id,
+                                   std::uint64_t checkpoint_idx) override;
+
+    void insert_hash_table_entry(std::uint8_t type, std::string_view hash,
+                                 std::string_view name) override;
+
+    void insert_aggregation_merge(std::string_view key,
+                                  std::string_view operand) override;
+
+    void insert_aggregation_put(std::string_view key,
+                                std::string_view value) override;
+
+    void insert_system_metrics_merge(std::string_view key,
+                                     std::string_view operand) override;
+
+    /// Aggregation / system_metrics buffers hold mixed Put+Merge entries
+    /// in one CF. `is_merge` distinguishes them at emit time so the SST
+    /// records the right operation kind (rocksdb supports mixed-op SSTs).
+    struct MergeableKeyValue {
+        std::string key;
+        std::string value;
+        bool is_merge = true;
+    };
+
+    /// Sort buffers, emit one SST per non-empty column family, return the
+    /// resulting paths. Calling twice or after a move is a no-op.
+    Artifacts commit();
+
+   private:
+    using KeyValue = std::pair<std::string, std::string>;  // (key, value)
+
+    std::string staging_dir_;
+    std::string batch_id_;
+    bool committed_ = false;  // NOTE(review): likely set by commit()
+
+    std::vector<KeyValue> metadata_buf_;  // one buffer per Artifacts field
+    std::vector<KeyValue> checkpoints_buf_;
+    std::vector<KeyValue> manifest_buf_;
+    std::vector<KeyValue> chunk_bloom_buf_;
+    std::vector<KeyValue> file_bloom_buf_;
+    std::vector<KeyValue> chunk_stats_buf_;
+    std::vector<KeyValue> chunk_dim_stats_buf_;
+    std::vector<KeyValue> dimensions_buf_;
+    std::vector<KeyValue> file_scalar_stats_buf_;
+    std::vector<KeyValue> file_cat_counts_buf_;
+    std::vector<KeyValue> file_pid_tid_counts_buf_;
+    std::vector<KeyValue> file_name_counts_buf_;
+    std::vector<KeyValue> name_dictionary_buf_;
+    std::vector<KeyValue> name_file_postings_buf_;
+    std::vector<KeyValue> name_chunk_postings_buf_;
+    std::vector<KeyValue> hash_tables_buf_;
+    std::vector<MergeableKeyValue> aggregation_buf_;
+    std::vector<MergeableKeyValue> system_metrics_buf_;
+};
+
+/// Collector for SST artifacts from concurrent `IndexDatabaseSstWriterContext`
+/// instances, handed by the coordinator to `IndexDatabase::bulk_ingest()`.
+/// `append()` locks `mutex_`; the accessors do not, so read after appends end.
+class SstArtifactRegistry {
+   public:
+    void append(IndexDatabaseSstWriterContext::Artifacts artifacts) {
+        std::lock_guard<std::mutex> lock(mutex_);  // serialise appenders
+        auto move_into = [](std::vector<std::string>& dst,
+                            std::optional<std::string>& src) {
+            if (src) dst.push_back(std::move(*src));  // unset: skipped
+        };
+        move_into(metadata_, artifacts.metadata_sst);
+        move_into(checkpoints_, artifacts.checkpoints_sst);
+        move_into(manifest_, artifacts.manifest_sst);
+        move_into(chunk_bloom_, artifacts.chunk_bloom_sst);
+        move_into(file_bloom_, artifacts.file_bloom_sst);
+        move_into(chunk_stats_, artifacts.chunk_stats_sst);
+        move_into(chunk_dim_stats_, artifacts.chunk_dim_stats_sst);
+        move_into(dimensions_, artifacts.dimensions_sst);
+        move_into(file_scalar_stats_, artifacts.file_scalar_stats_sst);
+        move_into(file_cat_counts_, artifacts.file_cat_counts_sst);
+        move_into(file_pid_tid_counts_, artifacts.file_pid_tid_counts_sst);
+        move_into(file_name_counts_, artifacts.file_name_counts_sst);
+        move_into(name_dictionary_, artifacts.name_dictionary_sst);
+        move_into(name_file_postings_, artifacts.name_file_postings_sst);
+        move_into(name_chunk_postings_, artifacts.name_chunk_postings_sst);
+        move_into(hash_tables_, artifacts.hash_tables_sst);
+        move_into(aggregation_, artifacts.aggregation_sst);
+        move_into(system_metrics_, artifacts.system_metrics_sst);
+    }
+
+    const std::vector<std::string>& metadata() const { return metadata_; }
+    const std::vector<std::string>& checkpoints() const { return checkpoints_; }
+    const std::vector<std::string>& manifest() const { return manifest_; }
+    const std::vector<std::string>& chunk_bloom() const { return chunk_bloom_; }
+    const std::vector<std::string>& file_bloom() const { return file_bloom_; }
+    const std::vector<std::string>& chunk_stats() const { return chunk_stats_; }
+    const std::vector<std::string>& chunk_dim_stats() const {
+        return chunk_dim_stats_;
+    }
+    const std::vector<std::string>& dimensions() const { return dimensions_; }
+    const std::vector<std::string>& file_scalar_stats() const {
+        return file_scalar_stats_;
+    }
+    const std::vector<std::string>& file_cat_counts() const {
+        return file_cat_counts_;
+    }
+    const std::vector<std::string>& file_pid_tid_counts() const {
+        return file_pid_tid_counts_;
+    }
+    const std::vector<std::string>& file_name_counts() const {
+        return file_name_counts_;
+    }
+    const std::vector<std::string>& name_dictionary() const {
+        return name_dictionary_;
+    }
+    const std::vector<std::string>& name_file_postings() const {
+        return name_file_postings_;
+    }
+    const std::vector<std::string>& name_chunk_postings() const {
+        return name_chunk_postings_;
+    }
+    const std::vector<std::string>& hash_tables() const { return hash_tables_; }
+    const std::vector<std::string>& aggregation() const { return aggregation_; }
+    const std::vector<std::string>& system_metrics() const {
+        return system_metrics_;
+    }
+
+   private:
+    std::mutex mutex_;  // taken by append() only
+    std::vector<std::string> metadata_;
+    std::vector<std::string> checkpoints_;
+    std::vector<std::string> manifest_;
+    std::vector<std::string> chunk_bloom_;
+    std::vector<std::string> file_bloom_;
+    std::vector<std::string> chunk_stats_;
+    std::vector<std::string> chunk_dim_stats_;
+    std::vector<std::string> dimensions_;
+    std::vector<std::string> file_scalar_stats_;
+    std::vector<std::string> file_cat_counts_;
+    std::vector<std::string> file_pid_tid_counts_;
+    std::vector<std::string> file_name_counts_;
+    std::vector<std::string> name_dictionary_;
+    std::vector<std::string> name_file_postings_;
+    std::vector<std::string> name_chunk_postings_;
+    std::vector<std::string> hash_tables_;
+    std::vector<std::string> aggregation_;
+    std::vector<std::string> system_metrics_;
+};
+
+}  // namespace dftracer::utils::utilities::indexer
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_SST_WRITER_CONTEXT_H
diff --git a/include/dftracer/utils/utilities/indexer/index_database_writer_context.h b/include/dftracer/utils/utilities/indexer/index_database_writer_context.h
new file mode 100644
index 00000000..2b70f822
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/index_database_writer_context.h
@@ -0,0 +1,184 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_WRITER_CONTEXT_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_WRITER_CONTEXT_H
+
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/core/rocksdb/db_manager.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/queries/queries.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+#include <dftracer/utils/utilities/indexer/index_file_entry_capability.h>
+#include <dftracer/utils/utilities/indexer/internal/checkpoint.h>
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <span>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer {
+
+class IndexDatabase;
+/// IndexBatchSink over a RocksDatabase::Batch; created only by IndexDatabase.
+class IndexDatabaseWriterContext : public IndexBatchSink {
+   public:
+    using IndexBatchSink::ChunkDimensionStats;
+    using IndexBatchSink::ChunkStatistics;
+    using IndexBatchSink::IndexerCheckpoint;
+    using IndexBatchSink::insert_event_range;  // keep base overloads visible
+    using IndexBatchSink::insert_metadata_lines;  // (same reason)
+    struct TarFileRecord {
+        std::string file_name;
+        std::uint64_t file_size = 0;
+        std::uint64_t file_mtime = 0;
+        char typeflag = '\0';
+        std::uint64_t data_offset = 0;
+        std::uint64_t uncompressed_offset = 0;
+    };
+
+    IndexDatabaseWriterContext(IndexDatabaseWriterContext&&) noexcept;
+    IndexDatabaseWriterContext& operator=(
+        IndexDatabaseWriterContext&&) noexcept;
+    IndexDatabaseWriterContext(const IndexDatabaseWriterContext&) = delete;
+    IndexDatabaseWriterContext& operator=(const IndexDatabaseWriterContext&) =
+        delete;
+    ~IndexDatabaseWriterContext() override;
+
+    void commit();  // NOTE(review): presumably applies batch_ to db_
+
+    // Read-through queries (needed by visitors during write)
+    bool has_file_scalar_stats(int file_id) const;
+
+    // Schema initialisation
+    void init_schema();
+
+    // Registry/capability writes
+    int get_or_create_file_info(
+        std::string_view path, std::uint64_t file_hash,
+        IndexFileEntryCapability caps = IndexFileEntryCapability::NONE);
+    void set_file_capabilities(int file_id, IndexFileEntryCapability caps);
+    void set_file_capabilities_by_path(std::string_view logical_path,
+                                       IndexFileEntryCapability caps);
+    void add_file_capability(int file_id, IndexFileEntryCapability cap);
+
+    // Metadata
+    void insert_file_metadata(int file_id, std::uint64_t checkpoint_size,
+                              std::uint64_t total_lines,
+                              std::uint64_t total_uc_size) override;
+
+    // Bloom inserts
+    void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx,
+                                   std::string_view dimension,
+                                   std::span<const unsigned char> blob_data,
+                                   std::uint64_t num_entries) override;
+
+    void insert_chunk_bloom_filter(int file_id, std::uint64_t checkpoint_idx,
+                                   std::string_view dimension,
+                                   const void* blob_data, int blob_size,
+                                   std::uint64_t num_entries);
+
+    void insert_file_bloom_filter(int file_id, std::string_view dimension,
+                                  std::span<const unsigned char> blob_data,
+                                  std::uint64_t num_entries) override;
+
+    void insert_file_bloom_filter(int file_id, std::string_view dimension,
+                                  const void* blob_data, int blob_size,
+                                  std::uint64_t num_entries);
+
+    void insert_chunk_statistics(int file_id, std::uint64_t checkpoint_idx,
+                                 const ChunkStatistics& stats) override;
+    void insert_file_scalar_stats(int file_id, const ChunkStatistics& stats,
+                                  std::uint64_t num_chunks) override;
+    void insert_file_category_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) override;
+    void insert_file_pid_tid_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) override;
+    void insert_file_name_counts(
+        int file_id, const StringViewMap<std::uint64_t>& counts) override;
+    std::uint64_t get_or_create_name_id(std::string_view name);
+    void insert_name_dictionary_entry(std::uint64_t name_id,
+                                      std::string_view name) override;
+    void insert_name_file_posting(std::uint64_t name_id, int file_id) override;
+    void insert_name_chunk_posting(std::uint64_t name_id, int file_id,
+                                   std::uint64_t checkpoint_idx) override;
+    void refresh_root_summaries_after_file_write(
+        int file_id, const ChunkStatistics& stats, std::uint64_t num_chunks,
+        bool had_existing_file_summary, std::uint64_t file_lines = 0,
+        std::uint64_t file_uncompressed_bytes = 0);
+    void rebuild_root_summaries();
+    void insert_checkpoint(int file_id,
+                           const IndexerCheckpoint& checkpoint) override;
+
+    void insert_index_dimension(int file_id,
+                                std::string_view dimension) override;
+
+    /// Insert a hash table entry with bidirectional storage.
+    /// Forward: [type][hash] -> name  (for output resolution)
+    /// Reverse: [type+4][name] -> hash  (for query DSL)
+    /// Type: 0=FILE, 1=HOST, 2=STRING, 3=PROC
+    void insert_hash_table_entry(std::uint8_t type, std::string_view hash,
+                                 std::string_view name) override;
+
+    // Aggregation / system-metrics CF writes.
+    void insert_aggregation_merge(std::string_view key,
+                                  std::string_view operand) override;
+
+    void insert_aggregation_put(std::string_view key,
+                                std::string_view value) override;
+
+    void insert_system_metrics_merge(std::string_view key,
+                                     std::string_view operand) override;
+
+    void insert_chunk_dimension_stats(
+        int file_id, std::uint64_t checkpoint_idx,
+        const ChunkDimensionStats& stats,
+        std::size_t value_counts_cap = 4096) override;
+    void insert_tar_archive_metadata(int file_id, std::string_view archive_name,
+                                     std::uint64_t checkpoint_size,
+                                     std::uint64_t total_lines,
+                                     std::uint64_t total_uc_size,
+                                     std::uint64_t total_files);
+    void insert_tar_file(int file_id, const TarFileRecord& record);
+
+    // Deletes
+    void delete_chunk_bloom_filters(int file_id, std::string_view dimension);
+    void delete_file_bloom_filter(int file_id, std::string_view dimension);
+    void delete_chunk_statistics(int file_id);
+    void delete_chunk_dimension_stats(int file_id);
+    void delete_file_contents(int file_id);
+    void delete_event_ranges(int file_id);
+    void delete_metadata_lines(int file_id);
+
+    // Manifest inserts
+    void insert_event_range(
+        int file_id, std::uint64_t checkpoint_idx, std::string_view cat,
+        std::string_view name,
+        std::span<const std::uint32_t> line_numbers) override;
+
+    void insert_metadata_lines(
+        int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type,
+        std::span<const std::uint32_t> line_numbers) override;
+
+    /// Insert the set of PIDs observed in a file (for distributed aggregation)
+    void insert_file_pids(
+        int file_id, const std::unordered_set<std::uint64_t>& pids) override;
+
+   private:
+    friend class IndexDatabase;
+    explicit IndexDatabaseWriterContext(
+        std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> db);
+
+    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> db_;
+    dftracer::utils::rocksdb::RocksDatabase::Batch batch_;
+    bool committed_ = false;  // NOTE(review): likely set by commit()
+    std::int64_t cached_next_file_id_ = -1;  // -1: presumably not cached yet
+};
+
+}  // namespace dftracer::utils::utilities::indexer
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_DATABASE_WRITER_CONTEXT_H
diff --git a/include/dftracer/utils/utilities/indexer/index_file_entry_capability.h b/include/dftracer/utils/utilities/indexer/index_file_entry_capability.h
new file mode 100644
index 00000000..d7ae0973
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/index_file_entry_capability.h
@@ -0,0 +1,51 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_FILE_ENTRY_CAPABILITY_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_FILE_ENTRY_CAPABILITY_H
+
+#include <cstdint>
+
+namespace dftracer::utils::utilities::indexer {
+
+/// Bit flags recording capabilities of an index file entry. Each enumerator
+/// occupies one bit of the `std::uint8_t` underlying type, so values can be
+/// combined with `|` and tested with `&` / `has_capability()`.
+enum class IndexFileEntryCapability : std::uint8_t {
+    NONE = 0,
+    BLOOM = 1 << 0,
+    MANIFEST = 1 << 1,
+    FILE_SUMMARY = 1 << 2,
+    CHECKPOINTS = 1 << 3,
+    INDEXING_COMPLETE = 1 << 4,
+};
+
+/// Bitwise union of two capability masks. Usable in constant expressions.
+constexpr IndexFileEntryCapability operator|(
+    IndexFileEntryCapability a, IndexFileEntryCapability b) noexcept {
+    return static_cast<IndexFileEntryCapability>(static_cast<std::uint8_t>(a) |
+                                                 static_cast<std::uint8_t>(b));
+}
+
+/// Bitwise intersection of two capability masks.
+constexpr IndexFileEntryCapability operator&(
+    IndexFileEntryCapability a, IndexFileEntryCapability b) noexcept {
+    return static_cast<IndexFileEntryCapability>(static_cast<std::uint8_t>(a) &
+                                                 static_cast<std::uint8_t>(b));
+}
+
+/// Adds the bits of `b` to `a` in place and returns a reference to `a`.
+constexpr IndexFileEntryCapability& operator|=(
+    IndexFileEntryCapability& a, IndexFileEntryCapability b) noexcept {
+    a = a | b;
+    return a;
+}
+
+/// Returns true when `caps` and `flag` share at least one set bit. For a
+/// single-bit `flag` this is the usual "is this capability present" test;
+/// for a multi-bit `flag` it matches if ANY of its bits is present.
+[[nodiscard]] constexpr bool has_capability(
+    IndexFileEntryCapability caps, IndexFileEntryCapability flag) noexcept {
+    return (caps & flag) != IndexFileEntryCapability::NONE;
+}
+
+}  // namespace dftracer::utils::utilities::indexer
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_FILE_ENTRY_CAPABILITY_H
diff --git a/include/dftracer/utils/utilities/indexer/index_types.h b/include/dftracer/utils/utilities/indexer/index_types.h
new file mode 100644
index 00000000..845e6ecc
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/index_types.h
@@ -0,0 +1,81 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_TYPES_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_TYPES_H
+
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/queries/queries.h>
+#include <dftracer/utils/utilities/indexer/index_file_entry_capability.h>
+#include <dftracer/utils/utilities/indexer/internal/checkpoint.h>
+
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer {
+
+using ChunkBloomResult = composites::dft::indexing::queries::ChunkBloomResult;
+using FileBloomResult = composites::dft::indexing::queries::FileBloomResult;
+using ChunkStatisticsResult =
+    composites::dft::indexing::queries::ChunkStatisticsResult;
+using TimeBounds = composites::dft::indexing::queries::TimeBounds;
+using EventRangeResult = composites::dft::indexing::queries::EventRangeResult;
+using MetadataLinesResult =
+    composites::dft::indexing::queries::MetadataLinesResult;
+using ChunkStatistics = composites::dft::indexing::ChunkStatistics;
+using ChunkDimensionStats = composites::dft::indexing::ChunkDimensionStats;
+using ChunkDimensionStatsResult =
+    composites::dft::indexing::ChunkDimensionStatsResult;
+using IndexerCheckpoint = internal::IndexerCheckpoint;
+
+struct MergedStatisticsResult {  // cf. decode_file_scalar_stats_value()
+    ChunkStatistics stats;
+    std::uint64_t num_chunks = 0;
+};
+
+struct RootStatisticsResult {  // cf. decode_root_scalar_stats_value()
+    ChunkStatistics stats;
+    std::uint64_t num_chunks = 0;
+    std::uint64_t num_files = 0;
+    std::uint64_t total_lines = 0;
+    std::uint64_t total_uncompressed_bytes = 0;
+};
+
+struct NameSummaryResult {  // fields match encode_name_summary_value()
+    StringViewMap<std::uint64_t> counts;
+    std::uint64_t other_count = 0;
+    std::uint64_t unique_count = 0;
+};
+
+struct FileMetadataResult {
+    std::uint64_t checkpoint_size = 0;
+    std::uint64_t num_lines = 0;
+    std::uint64_t max_bytes = 0;
+};
+
+struct FileRegistryEntry {
+    int file_id = -1;  // NOTE(review): -1 presumably means "not assigned"
+    IndexFileEntryCapability capabilities = IndexFileEntryCapability::NONE;
+};
+
+struct TarArchiveMetadata {
+    std::string archive_name;
+    std::uint64_t checkpoint_size = 0;
+    std::uint64_t total_lines = 0;
+    std::uint64_t total_uc_size = 0;
+    std::uint64_t total_files = 0;
+};
+
+struct TarFileRecord {
+    std::string file_name;
+    std::uint64_t file_size = 0;
+    std::uint64_t file_mtime = 0;
+    char typeflag = '\0';  // presumably the tar header typeflag; confirm
+    std::uint64_t data_offset = 0;
+    std::uint64_t uncompressed_offset = 0;
+};
+
+}  // namespace dftracer::utils::utilities::indexer
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_TYPES_H
diff --git a/include/dftracer/utils/utilities/indexer/index_visitor.h b/include/dftracer/utils/utilities/indexer/index_visitor.h
index f7dae313..13d815d0 100644
--- a/include/dftracer/utils/utilities/indexer/index_visitor.h
+++ b/include/dftracer/utils/utilities/indexer/index_visitor.h
@@ -1,12 +1,21 @@
 #ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_VISITOR_H
 #define DFTRACER_UTILS_UTILITIES_INDEXER_INDEX_VISITOR_H
 
+#include <dftracer/utils/core/coro/task.h>
+
 #include <cstddef>
+#include <cstring>
+#include <memory>
+#include <string>
 #include <string_view>
 
 namespace dftracer::utils::utilities::indexer {
 
-class IndexDatabase;
+class IndexDatabaseWriterContext;
+
+/// Shared buffer for zero-copy line passing. The string_view passed to
+/// on_line points into this buffer; storing the shared_ptr keeps it alive.
+using SharedLineBuffer = std::shared_ptr<std::string>;
 
 class IndexVisitor {
    public:
@@ -14,11 +23,41 @@ class IndexVisitor {
 
     virtual void begin(std::size_t num_checkpoints) = 0;
 
-    virtual void on_checkpoint(std::size_t checkpoint_idx) = 0;
+    virtual coro::CoroTask<void> on_checkpoint(std::size_t checkpoint_idx) = 0;
+
+    /// Default: copies the chunk and feeds each '\n'-terminated line to
+    /// on_line(); trailing bytes without a newline are not delivered.
+    virtual coro::CoroTask<void> on_chunk(const char* data, std::size_t len,
+                                          std::size_t checkpoint_idx) {
+        auto buffer = std::make_shared<std::string>(data, len);
+        const std::string_view text(*buffer);  // views the owned copy
+        for (std::size_t pos = 0, nl = text.find('\n'); nl != text.npos;
+             pos = nl + 1, nl = text.find('\n', pos)) {
+            on_line(text.substr(pos, nl - pos), buffer, checkpoint_idx);
+            // wants_drain() is documented as polled after each on_line.
+            if (wants_drain()) co_await drain_pending();
+        }
+        co_return;
+    }
+
+    /// Called for each line. The line string_view points into buffer.
+    /// Implementations that need the data to outlive this call should
+    /// store the buffer shared_ptr (zero-copy) rather than copying line.
+    virtual void on_line(std::string_view line, SharedLineBuffer buffer,
+                         std::size_t checkpoint_idx) = 0;
+
+    virtual coro::CoroTask<void> flush() { co_return; }  // default: no-op
+
+    /// Cheap hint that drain_pending() should be called to apply
+    /// backpressure. Default false. Polled after each on_line call.
+    virtual bool wants_drain() const noexcept { return false; }
 
-    virtual void on_line(std::string_view line, std::size_t checkpoint_idx) = 0;
+    /// Drain accumulated work via async ops (e.g. channel send). Suspends
+    /// the calling coroutine when downstream is full -- real backpressure
+    /// without blocking an executor thread.
+    virtual coro::CoroTask<void> drain_pending() { co_return; }
 
-    virtual void finalize(IndexDatabase& db, int file_id) = 0;
+    virtual void finalize(IndexDatabaseWriterContext& writer, int file_id) = 0;
 };
 
 }  // namespace dftracer::utils::utilities::indexer
diff --git a/include/dftracer/utils/utilities/indexer/internal/index_encoding.h b/include/dftracer/utils/utilities/indexer/internal/index_encoding.h
new file mode 100644
index 00000000..6f9a7a4f
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/internal/index_encoding.h
@@ -0,0 +1,133 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_ENCODING_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_ENCODING_H
+
+#include <dftracer/utils/core/rocksdb/key_codec.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
+#include <dftracer/utils/utilities/indexer/internal/checkpoint.h>
+#include <dftracer/utils/utilities/indexer/internal/payload_codec.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <span>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+
+namespace dftracer::utils::utilities::indexer::internal::encoding {
+
+std::string prefix_for_file(int file_id);
+
+/// DEFAULT-CF key holding the monotonically increasing counter for the next
+/// file_id to assign. Used by both `get_or_create_file_info` (single-file
+/// path) and `IndexDatabase::reserve_file_id_range` (distributed pre-alloc).
+inline constexpr std::string_view NEXT_FILE_ID_KEY = "_next_file_id";
+
+std::string metadata_key(int file_id);
+
+std::string checkpoint_key(int file_id, std::uint64_t uc_offset,
+                           std::uint64_t checkpoint_idx);
+
+std::string manifest_event_key(int file_id, std::uint64_t checkpoint_idx,
+                               std::string_view cat, std::string_view name);
+
+std::string manifest_metadata_key(int file_id, std::uint64_t checkpoint_idx,
+                                  std::string_view meta_type);
+
+std::string encode_metadata_record(std::uint64_t checkpoint_size,
+                                   std::uint64_t total_lines,
+                                   std::uint64_t total_uc_size);
+
+std::string encode_checkpoint_value(const IndexerCheckpoint& checkpoint);
+
+std::string encode_event_range_value(std::span<const std::uint32_t> lines);
+
+std::string encode_metadata_value(std::span<const std::uint32_t> lines);
+
+std::string file_pids_key(int file_id);
+
+std::string encode_file_pids_value(
+    const std::unordered_set<std::uint64_t>& pids);
+
+// Bloom / stats / dimension CFs --------------------------------------------
+
+std::string make_dimension_key(int file_id, std::string_view dimension);
+
+std::string chunk_bloom_key(int file_id, std::string_view dimension,
+                            std::uint64_t checkpoint_idx);
+
+std::string file_bloom_key(int file_id, std::string_view dimension);
+
+std::string chunk_stats_key(int file_id, std::uint64_t checkpoint_idx);
+
+std::string file_scalar_stats_key(int file_id);
+std::string file_category_counts_key(int file_id);
+std::string file_pid_tid_counts_key(int file_id);
+std::string file_name_counts_key(int file_id);
+
+std::string chunk_dim_stats_key(int file_id, std::uint64_t checkpoint_idx,
+                                std::string_view dimension);
+
+// Name dictionary + postings (name_id is a 64-bit FNV1a hash of the name) ---
+
+std::string name_lookup_key(std::string_view name);
+std::string name_reverse_key(std::uint64_t name_id);
+
+std::string name_file_posting_key(std::uint64_t name_id, int file_id);
+std::string name_file_owner_key(int file_id, std::uint64_t name_id);
+std::string name_file_owner_prefix(int file_id);
+
+std::string name_chunk_posting_key(std::uint64_t name_id, int file_id,
+                                   std::uint64_t checkpoint_idx);
+std::string name_chunk_owner_key(int file_id, std::uint64_t name_id,
+                                 std::uint64_t checkpoint_idx);
+std::string name_chunk_owner_prefix(int file_id);
+
+// Hash tables (content-addressed) ------------------------------------------
+
+std::string hash_table_forward_key(std::uint8_t type, std::string_view hash);
+std::string hash_table_reverse_key(std::uint8_t type, std::string_view name);
+
+std::string encode_bloom_value(std::span<const unsigned char> blob,
+                               std::uint64_t num_entries);
+
+std::string encode_chunk_statistics_value(
+    const composites::dft::indexing::ChunkStatistics& stats);
+
+std::string encode_chunk_dimension_stats_value(
+    const composites::dft::indexing::ChunkDimensionStats& stats,
+    std::size_t value_counts_cap);
+
+// Count map and name summary encoders are templated so they can accept any
+// map type exposing string keys and uint64 values.
+template <typename Map>
+std::string encode_count_map_value(const Map& counts) {
+    // Payload: u32 entry count, then (string key, u64 count) per entry.
+    std::string encoded;
+    append_u32(encoded, static_cast<std::uint32_t>(counts.size()));
+    for (const auto& [label, total] : counts) {
+        append_string(encoded, label);
+        append_u64(encoded, total);
+    }
+    return encoded;
+}
+
+template <typename Map>
+std::string encode_name_summary_value(const Map& counts,
+                                      std::uint64_t other_count,
+                                      std::uint64_t unique_count) {
+    // Payload: u32 entry count, u64 other, u64 unique, then the entries.
+    std::string encoded;
+    append_u32(encoded, static_cast<std::uint32_t>(counts.size()));
+    append_u64(encoded, other_count);
+    append_u64(encoded, unique_count);
+    for (const auto& [label, total] : counts) {
+        append_string(encoded, label);
+        append_u64(encoded, total);
+    }
+    return encoded;
+}
+
+}  // namespace dftracer::utils::utilities::indexer::internal::encoding
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_ENCODING_H
diff --git a/include/dftracer/utils/utilities/indexer/internal/payload_codec.h b/include/dftracer/utils/utilities/indexer/internal/payload_codec.h
new file mode 100644
index 00000000..ea244c3a
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/internal/payload_codec.h
@@ -0,0 +1,140 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_PAYLOAD_CODEC_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_PAYLOAD_CODEC_H
+
+#include <dftracer/utils/core/rocksdb/key_codec.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer::internal {
+
+inline constexpr std::size_t DECODE_CONTEXT_BUF_SIZE = 256;
+inline thread_local char g_decode_context[DECODE_CONTEXT_BUF_SIZE] = {};
+
+struct DecodeContextGuard {  // scoped override of g_decode_context
+    template <typename... Args>  // fmt/args are printf-style
+    explicit DecodeContextGuard(const char* fmt, Args... args) {
+        std::memcpy(previous_, g_decode_context, DECODE_CONTEXT_BUF_SIZE);
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+        std::snprintf(g_decode_context, DECODE_CONTEXT_BUF_SIZE, fmt, args...);
+#pragma GCC diagnostic pop
+    }  // snprintf output is truncated to the buffer size
+
+    ~DecodeContextGuard() {  // put the saved context back
+        std::memcpy(g_decode_context, previous_, DECODE_CONTEXT_BUF_SIZE);
+    }
+
+    DecodeContextGuard(const DecodeContextGuard&) = delete;
+    DecodeContextGuard& operator=(const DecodeContextGuard&) = delete;
+
+   private:
+    char previous_[DECODE_CONTEXT_BUF_SIZE];  // context saved by the ctor
+};
+
+inline void append_u8(std::string& out, std::uint8_t value) {
+    out.push_back(static_cast<char>(value));
+}
+
+inline void append_u32(std::string& out, std::uint32_t value) {
+    dftracer::utils::rocksdb::KeyCodec::append_be32(out, value);
+}
+
+inline void append_u64(std::string& out, std::uint64_t value) {
+    dftracer::utils::rocksdb::KeyCodec::append_be64(out, value);
+}
+
+inline void append_i64(std::string& out, std::int64_t value) {
+    dftracer::utils::rocksdb::KeyCodec::append_be64(
+        out, static_cast<std::uint64_t>(value));
+}
+
+inline void append_double(std::string& out, double value) {
+    static_assert(sizeof(double) == sizeof(std::uint64_t));
+    std::uint64_t bits = 0;
+    std::memcpy(&bits, &value, sizeof(bits));
+    append_u64(out, bits);
+}
+
+inline void append_string(std::string& out, std::string_view value) {
+    append_u32(out, static_cast<std::uint32_t>(value.size()));  // u32 length
+    out.append(value.data(), value.size());  // NOTE: size > 4 GiB wraps prefix
+}
+
+inline void append_blob(std::string& out, std::span<const unsigned char> blob) {
+    append_u32(out, static_cast<std::uint32_t>(blob.size()));  // u32 length
+    out.append(reinterpret_cast<const char*>(blob.data()), blob.size());
+}
+
+/// Sequential reader over an encoded payload. Every read throws
+/// std::runtime_error ("Corrupt RocksDB payload...") when bytes run out.
+class Cursor {
+   public:
+    /// `data` is viewed, not copied, so it must outlive the cursor.
+    explicit Cursor(std::string_view data) : data_(data) {}
+
+    std::uint8_t u8() { return static_cast<std::uint8_t>(take(1)[0]); }
+
+    std::uint32_t u32() {
+        return dftracer::utils::rocksdb::KeyCodec::decode_be32(take(4));
+    }
+
+    std::uint64_t u64() {
+        return dftracer::utils::rocksdb::KeyCodec::decode_be64(take(8));
+    }
+
+    std::int64_t i64() { return static_cast<std::int64_t>(u64()); }
+
+    /// Reads a double stored as its 64-bit pattern (see append_double).
+    double f64() {
+        const std::uint64_t bits = u64();
+        double value = 0.0;
+        std::memcpy(&value, &bits, sizeof(value));
+        return value;
+    }
+
+    /// Reads a u32 length prefix, then views that many bytes (no copy).
+    std::string_view str_view() { return take(u32()); }
+
+    std::string str() { return std::string(str_view()); }
+
+    /// Reads a u32 length prefix, then copies that many bytes out.
+    std::vector<unsigned char> blob() {
+        const auto bytes = take(u32());
+        return std::vector<unsigned char>(bytes.begin(), bytes.end());
+    }
+
+    /// Number of bytes consumed so far.
+    std::size_t offset() const { return offset_; }
+    bool eof() const { return offset_ >= data_.size(); }
+
+   private:
+    /// Consumes the next `len` bytes; the returned view aliases the input.
+    std::string_view take(std::size_t len) {
+        // offset_ never exceeds data_.size(), so this subtraction cannot
+        // wrap, unlike `offset_ + len` with a corrupt length on 32-bit.
+        if (len > data_.size() - offset_) {
+            std::string err = "Corrupt RocksDB payload";
+            if (g_decode_context[0] != '\0') {
+                err.append(" [").append(g_decode_context).append("]");
+            }
+            throw std::runtime_error(err);
+        }
+        const auto chunk = data_.substr(offset_, len);
+        offset_ += len;
+        return chunk;
+    }
+
+    std::string_view data_;
+    std::size_t offset_ = 0;
+};
+
+}  // namespace dftracer::utils::utilities::indexer::internal
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_PAYLOAD_CODEC_H
diff --git a/include/dftracer/utils/utilities/indexer/internal/statistics_codec.h b/include/dftracer/utils/utilities/indexer/internal/statistics_codec.h
new file mode 100644
index 00000000..003d5ea0
--- /dev/null
+++ b/include/dftracer/utils/utilities/indexer/internal/statistics_codec.h
@@ -0,0 +1,26 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_STATISTICS_CODEC_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_STATISTICS_CODEC_H
+
+#include <dftracer/utils/utilities/indexer/index_types.h>
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+namespace dftracer::utils::utilities::indexer::internal {
+
+std::string encode_file_scalar_stats_value(const ChunkStatistics& stats,
+                                           std::uint64_t num_chunks);
+
+std::string encode_root_scalar_stats_value(
+    const ChunkStatistics& stats, std::uint64_t num_chunks,
+    std::uint64_t num_files, std::uint64_t total_lines = 0,
+    std::uint64_t total_uncompressed_bytes = 0);
+
+MergedStatisticsResult decode_file_scalar_stats_value(std::string_view value);
+
+RootStatisticsResult decode_root_scalar_stats_value(std::string_view value);
+
+}  // namespace dftracer::utils::utilities::indexer::internal
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_STATISTICS_CODEC_H
diff --git a/include/dftracer/utils/utilities/indexer/provenance_database.h b/include/dftracer/utils/utilities/indexer/provenance_database.h
index 974eb771..2353f552 100644
--- a/include/dftracer/utils/utilities/indexer/provenance_database.h
+++ b/include/dftracer/utils/utilities/indexer/provenance_database.h
@@ -71,8 +71,8 @@ class ProvenanceDatabase {
                       std::string_view predicate);
 
     void insert_segment(int file_info_id, int source_idx, int source_checkpoint,
-                        int output_line_start, int output_line_end,
-                        int event_count);
+                        int segment_seq, int output_line_start,
+                        int output_line_end, int event_count);
 
     // -----------------------------------------------------------------------
     // Provenance query operations
diff --git a/include/dftracer/utils/utilities/indexer/visitors/bloom_visitor.h b/include/dftracer/utils/utilities/indexer/visitors/bloom_visitor.h
deleted file mode 100644
index ee450aed..00000000
--- a/include/dftracer/utils/utilities/indexer/visitors/bloom_visitor.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_BLOOM_VISITOR_H
-#define DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_BLOOM_VISITOR_H
-
-#include <dftracer/utils/utilities/common/json/json.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
-#include <dftracer/utils/utilities/indexer/index_visitor.h>
-#include <yyjson.h>
-
-#include <array>
-#include <cstddef>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace dftracer::utils::utilities::indexer {
-
-class BloomVisitor : public IndexVisitor {
-   public:
-    using BloomFilterMap = std::unordered_map<
-        std::string,
-        dftracer::utils::utilities::composites::dft::indexing::BloomFilter>;
-    using HashResolutions =
-        dftracer::utils::utilities::composites::dft::indexing::HashResolutions;
-    using ChunkStatistics =
-        dftracer::utils::utilities::composites::dft::indexing::ChunkStatistics;
-    using ChunkDimensionStats = dftracer::utils::utilities::composites::dft::
-        indexing::ChunkDimensionStats;
-    using ChunkIndexerConfig = dftracer::utils::utilities::composites::dft::
-        indexing::ChunkIndexerConfig;
-
-    struct ChunkState {
-        BloomFilterMap bloom_filters;
-        ChunkStatistics statistics;
-        HashResolutions hash_resolutions;
-        std::unordered_map<std::string, ChunkDimensionStats> dimension_stats;
-        std::size_t events_processed = 0;
-    };
-
-    BloomVisitor(ChunkIndexerConfig config,
-                 std::vector<std::string> dimensions);
-
-    void begin(std::size_t num_checkpoints) override;
-    void on_checkpoint(std::size_t checkpoint_idx) override;
-    void on_line(std::string_view line, std::size_t checkpoint_idx) override;
-    void finalize(IndexDatabase& db, int file_id) override;
-
-    std::size_t num_chunks() const { return chunks_.size(); }
-
-   private:
-    void ensure_chunk(std::size_t checkpoint_idx);
-
-    ChunkIndexerConfig config_;
-    std::vector<std::string> dimensions_;
-    std::vector<ChunkState> chunks_;
-
-    std::array<char, common::json::YYJSON_LINE_POOL_SIZE> yy_buf_{};
-    yyjson_alc yy_alc_{};
-    bool yy_alc_initialized_ = false;
-};
-
-}  // namespace dftracer::utils::utilities::indexer
-
-#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_BLOOM_VISITOR_H
diff --git a/include/dftracer/utils/utilities/indexer/visitors/manifest_visitor.h b/include/dftracer/utils/utilities/indexer/visitors/manifest_visitor.h
deleted file mode 100644
index 4458e6ce..00000000
--- a/include/dftracer/utils/utilities/indexer/visitors/manifest_visitor.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_MANIFEST_VISITOR_H
-#define DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_MANIFEST_VISITOR_H
-
-#include <dftracer/utils/utilities/indexer/index_visitor.h>
-
-#include <cstddef>
-#include <cstdint>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace dftracer::utils::utilities::indexer {
-
-class ManifestVisitor : public IndexVisitor {
-   public:
-    ManifestVisitor() = default;
-
-    void begin(std::size_t num_checkpoints) override;
-    void on_checkpoint(std::size_t checkpoint_idx) override;
-    void on_line(std::string_view line, std::size_t checkpoint_idx) override;
-    void finalize(IndexDatabase& db, int file_id) override;
-
-   private:
-    void ensure_chunk(std::size_t checkpoint_idx);
-
-    using EventKey = std::pair<std::string, std::string>;
-    using LineVec = std::vector<std::uint32_t>;
-
-    std::vector<std::map<EventKey, LineVec>> event_lines_;
-    std::vector<std::map<std::string, LineVec>> metadata_lines_;
-    std::uint32_t chunk_line_ = 0;
-};
-
-}  // namespace dftracer::utils::utilities::indexer
-
-#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_VISITORS_MANIFEST_VISITOR_H
diff --git a/include/dftracer/utils/utilities/reader/internal/stream_config.h b/include/dftracer/utils/utilities/reader/internal/stream_config.h
index 188e063f..85416302 100644
--- a/include/dftracer/utils/utilities/reader/internal/stream_config.h
+++ b/include/dftracer/utils/utilities/reader/internal/stream_config.h
@@ -106,6 +106,12 @@ class StreamConfig {
           end_(end),
           buffer_size_(buffer_size) {}
 
+    bool extend_to_line_boundary() const { return extend_to_line_boundary_; }
+    StreamConfig& extend_to_line_boundary(bool v) {
+        extend_to_line_boundary_ = v;
+        return *this;
+    }
+
     static constexpr std::size_t DEFAULT_BUFFER_SIZE = 4 * 1024 * 1024;  // 4MB
     // ========================================================================
     // Fluent API - Basic Setters
@@ -226,6 +232,8 @@ class StreamConfig {
      * Larger buffers improve I/O performance but use more memory.
      */
     std::size_t buffer_size_ = 4 * 1024 * 1024;  // 4MB default
+
+    bool extend_to_line_boundary_ = false;
 };
 
 }  // namespace dftracer::utils::utilities::reader::internal
diff --git a/include/dftracer/utils/utilities/reader/trace_reader.h b/include/dftracer/utils/utilities/reader/trace_reader.h
index ccd2341a..297899db 100644
--- a/include/dftracer/utils/utilities/reader/trace_reader.h
+++ b/include/dftracer/utils/utilities/reader/trace_reader.h
@@ -2,11 +2,16 @@
 #define DFTRACER_UTILS_UTILITIES_READER_TRACE_READER_H
 
 #include <dftracer/utils/core/common/archive_format.h>
+#include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/core/common/constants.h>
 #include <dftracer/utils/core/coro/async_generator.h>
+#include <dftracer/utils/utilities/common/json/parser.h>
 #include <dftracer/utils/utilities/fileio/lines/line_types.h>
 #include <dftracer/utils/utilities/reader/internal/reader.h>
 #include <dftracer/utils/utilities/reader/internal/stream_type.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+#endif
 
 #include <cstddef>
 #include <memory>
@@ -15,17 +20,21 @@
 
 namespace dftracer::utils::utilities::reader {
 
+using common::json::JsonParser;
 using fileio::lines::Line;
 
+struct JsonLine {  // one item yielded by TraceReader::read_json()
+    std::string_view content;
+    std::size_t line_number = 0;   ///< never left indeterminate
+    JsonParser* parser = nullptr;  ///< non-owning; valid until next next()
+};
+
 /// File-level configuration for TraceReader.
 struct TraceReaderConfig {
     std::string file_path;  ///< Path to trace file (.pfw.gz or plain).
     std::string index_dir;  ///< Directory containing `.dftindex` roots.
     std::size_t checkpoint_size = 32 * 1024 * 1024;  ///< Checkpoint interval.
     bool auto_build_index = false;  ///< Auto-build index if missing.
-    std::size_t index_threshold =
-        constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;  ///< Min size for
-                                                           ///< auto-index.
 };
 
 /// Per-read configuration for range, buffering, and query filtering.
@@ -42,9 +51,30 @@ struct ReadConfig {
 
     /// Query DSL string for event filtering (empty = no filter).
     /// When set and an index exists, chunk pruning skips non-matching
-    /// chunks. Per-event filtering always applies.
+    /// chunks. Per-event filtering always applies unless chunk_prune_only
+    /// is set.
     std::string query;
 
+    /// When true, the query is used only for chunk-level pruning via
+    /// the index. Per-line filtering is skipped (caller handles it).
+    bool chunk_prune_only = false;
+
+    /// When true, the reader skips its own chunk pruner pass entirely and
+    /// trusts the caller's start_line/end_line window. Intended for the
+    /// checkpoint-level work-item dispatcher, which already pruned once
+    /// per file at enumeration time. Without this the pruner would
+    /// re-run per work item (hundreds-of-thousands of RocksDB opens).
+    bool skip_pruning = false;
+
+    bool start_at_checkpoint = false;
+    bool end_at_checkpoint = false;
+
+    /// When true, top-level object values (e.g. `args`) are expanded one
+    /// level into `parent.child` columns with native Arrow types instead
+    /// of being serialized as a JSON string column. One-level only; deeper
+    /// nesting still round-trips as JSON text under the flattened key.
+    bool flatten_objects = false;
+
     bool has_line_range() const { return start_line > 0 || end_line > 0; }
     bool has_byte_range() const { return start_byte > 0 || end_byte > 0; }
 };
@@ -58,10 +88,25 @@ class TraceReader {
     /// Read lines with optional query filtering and chunk pruning.
     coro::AsyncGenerator<Line> read_lines(ReadConfig config = {});
 
+    /// Read parsed JSON lines. Parses each line once with simdjson ondemand,
+    /// applies query filtering, and yields the parsed document.
+    /// The yielded JsonParser is valid until the next next() call.
+    coro::AsyncGenerator<JsonLine> read_json(ReadConfig config = {});
+
     /// Read raw byte chunks.
     coro::AsyncGenerator<std::span<const char>> read_raw(
         ReadConfig config = {});
 
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    /// Direct Arrow batch pipeline: chunk-prune + line-level prefilter +
+    /// simdjson iterate_many + inline row build. Yields complete Arrow
+    /// record batches sized at `batch_size` rows. Emits the final
+    /// partial batch on generator close. Non-normalized schema only
+    /// (dynamic columns follow the first row seen).
+    coro::AsyncGenerator<common::arrow::ArrowExportResult> read_arrow(
+        ReadConfig config = {}, std::size_t batch_size = 10000);
+#endif
+
     /// True if a `.dftindex` database was found at construction time.
     bool has_index() const;
     /// Decompressed size (0 if no index for compressed files).
diff --git a/include/dftracer/utils/utilities/replay/replay.h b/include/dftracer/utils/utilities/replay/replay.h
index fc2383c7..988dfec6 100644
--- a/include/dftracer/utils/utilities/replay/replay.h
+++ b/include/dftracer/utils/utilities/replay/replay.h
@@ -2,8 +2,10 @@
 #define DFTRACER_UTILS_UTILITIES_REPLAY_REPLAY_H
 
 #include <dftracer/utils/call_tree/call_tree.h>
-#include <dftracer/utils/utilities/reader/internal/line_processor.h>
-#include <dftracer/utils/utilities/reader/internal/reader.h>
+#include <dftracer/utils/core/coro/async_generator.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/common/json/parser.h>
 #include <dftracer/utils/utilities/replay/trace.h>
 
 #include <chrono>
@@ -90,6 +92,13 @@ struct ReplayConfig {
     // MPI options
     int mpi_rank = 0;  // MPI rank of this process
     int mpi_size = 1;  // Total number of MPI processes
+
+    // Optional observation hook fired in dispatch_trace after apply_timing
+    // returns and before the executor runs. Used by fidelity tests to
+    // measure dispatch lateness vs. the trace's wall-clock anchor; left
+    // unset in production so the per-event branch is the only cost.
+    std::function<void(const Trace&, std::chrono::steady_clock::time_point)>
+        on_dispatch;
 };
 
 /**
@@ -102,8 +111,9 @@ struct ReplayResult {
     std::size_t failed_events = 0;
     std::chrono::microseconds total_duration{0};
     std::chrono::microseconds execution_duration{0};
-    std::unordered_map<std::string, std::size_t> function_counts;
-    std::unordered_map<std::string, std::size_t> category_counts;
+    // Keys are non-owning views into the replay StringIntern pool.
+    std::unordered_map<std::string_view, std::size_t> function_counts;
+    std::unordered_map<std::string_view, std::size_t> category_counts;
     std::vector<std::string> error_messages;
 
     // Extended statistics
@@ -164,7 +174,10 @@ class PosixExecutor : public TraceExecutor {
     std::string get_name() const override { return "POSIX"; }
 
    private:
-    std::unordered_map<std::string, int> open_files_;
+    // Keys are interned via the replay StringIntern pool.
+    std::unordered_map<std::string_view, int> open_files_;
+    // Scratch buffer reused across reads and writes.
+    std::vector<char> io_buffer_;
 
     bool execute_open(const Trace& trace, const ReplayConfig& config);
     bool execute_close(const Trace& trace, const ReplayConfig& config);
@@ -172,6 +185,9 @@ class PosixExecutor : public TraceExecutor {
     bool execute_write(const Trace& trace, const ReplayConfig& config);
     bool execute_seek(const Trace& trace, const ReplayConfig& config);
     bool execute_stat(const Trace& trace, const ReplayConfig& config);
+
+    // Ensure io_buffer_ has at least `size` bytes; grow with 'A' fill.
+    void ensure_io_buffer(std::size_t size);
 };
 
 /**
@@ -188,9 +204,6 @@ class DFTracerExecutor : public TraceExecutor {
     void sleep_for_duration(double duration_microseconds);
 };
 
-// Forward declaration
-class ReplayLineProcessor;
-
 /**
  * Main replay engine that coordinates trace reading and execution
  *
@@ -205,8 +218,6 @@ class ReplayLineProcessor;
  *   result.print_summary();
  */
 class ReplayEngine {
-    friend class ReplayLineProcessor;
-
    public:
     /**
      * Construct replay engine with configuration
@@ -255,6 +266,43 @@ class ReplayEngine {
     ReplayResult replay_with_call_tree(const std::string& trace_dir,
                                        const std::string& pattern = "*.pfw.gz");
 
+    /**
+     * Process a single trace event already loaded into a JsonParser.
+     * Public so callers driving their own TraceReader::read_json loop can
+     * feed events in directly without going through replay(file).
+     */
+    bool process_trace_line(
+        dftracer::utils::utilities::common::json::JsonParser& parser,
+        ReplayResult& result);
+
+    /**
+     * Stream parsed Trace events from the given trace files. Drives
+     * TraceReader::read_json under the hood; each call yields one event.
+     * Used to plug replay into a producer task inside a Pipeline.
+     */
+    coro::AsyncGenerator<Trace> stream_traces(
+        const std::vector<std::string>& files);
+
+    /**
+     * Drive a producer/consumer pipeline that decouples read+parse from
+     * timing+execute. The producer fills a bounded channel from
+     * stream_traces; a single consumer drains it and dispatches events
+     * (apply_timing → executor->execute). Read latency is hidden behind
+     * the consumer's per-event work + sleep_for, eliminating the
+     * dispatch lateness that the sequential path accumulates on large
+     * gz-compressed traces.
+     *
+     * @param scope Parent CoroScope (typically a Pipeline task scope).
+     * @param files Trace files to replay in order.
+     * @param result Aggregated counts and per-event stats are written
+     *               here. Must outlive the awaited coroutine.
+     * @param channel_capacity Max in-flight parsed Traces. Default 4096.
+     */
+    coro::CoroTask<void> run_pipelined(dftracer::utils::CoroScope& scope,
+                                       const std::vector<std::string>& files,
+                                       ReplayResult& result,
+                                       std::size_t channel_capacity = 4096);
+
    private:
     ReplayConfig config_;
     std::vector<std::unique_ptr<TraceExecutor>> executors_;
@@ -263,14 +311,18 @@ class ReplayEngine {
     bool first_timestamp_set_ = false;
 
     /**
-     * Process a single trace line (JSON)
+     * Update result counts and execute one already-parsed Trace.
+     * Extracted from process_trace_line so the pipeline consumer and the
+     * sync per-line path share the same dispatch semantics.
      */
-    bool process_trace_line(const std::string& line, ReplayResult& result);
+    void dispatch_trace(const Trace& trace, ReplayResult& result);
 
     /**
-     * Parse JSON trace into Trace structure
+     * Populate a Trace from a parsed JsonParser document.
      */
-    bool parse_trace_json(const std::string& json_line, Trace& trace);
+    bool parse_trace_json(
+        dftracer::utils::utilities::common::json::JsonParser& parser,
+        Trace& trace);
 
     /**
      * Apply timing logic before executing trace
@@ -314,21 +366,6 @@ class ReplayEngine {
         ReplayResult& result);
 };
 
-/**
- * Line processor for handling trace lines during replay
- */
-class ReplayLineProcessor
-    : public dftracer::utils::utilities::reader::internal::LineProcessor {
-   public:
-    explicit ReplayLineProcessor(ReplayEngine& engine, ReplayResult& result);
-
-    coro::CoroTask<bool> process(const char* data, std::size_t length) override;
-
-   private:
-    ReplayEngine& engine_;
-    ReplayResult& result_;
-};
-
 }  // namespace dftracer::utils::utilities::replay
 
 #endif  // DFTRACER_UTILS_UTILITIES_REPLAY_REPLAY_H
diff --git a/include/dftracer/utils/utilities/replay/trace.h b/include/dftracer/utils/utilities/replay/trace.h
index c148c4f5..c4c80d7c 100644
--- a/include/dftracer/utils/utilities/replay/trace.h
+++ b/include/dftracer/utils/utilities/replay/trace.h
@@ -6,6 +6,7 @@
 #include <memory>
 #include <optional>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
@@ -34,11 +35,15 @@ using ViewFields = std::unordered_map<std::string, std::string>;
  * Contains all information needed for replay operations
  */
 struct Trace {
-    // Category and function identification
-    std::string cat;        // Category (e.g., "posix", "stdio", "h5py")
-    std::string io_cat;     // I/O category (read, write, metadata)
-    std::string acc_pat;    // Access pattern
-    std::string func_name;  // Function name (e.g., "read", "write", "open")
+    // Category and function identification.
+    // Enum-like strings (cat/func_name) and per-event hashes
+    // (fhash/hhash) are non-owning views into a process-wide StringIntern
+    // pool; the pool keeps them alive for the program lifetime so the
+    // views remain valid past the parser that produced them.
+    std::string_view cat;        // Category (e.g., "posix", "stdio", "h5py")
+    std::string io_cat;          // I/O category (read, write, metadata)
+    std::string acc_pat;         // Access pattern
+    std::string_view func_name;  // Function name (e.g., "read", "write")
 
     // Timing information
     double duration;           // Duration in microseconds
@@ -53,8 +58,8 @@ struct Trace {
     std::uint64_t tid;  // Thread ID
 
     // File identification
-    std::string fhash;       // File hash
-    std::string hhash;       // Host hash
+    std::string_view fhash;  // File hash (interned)
+    std::string_view hhash;  // Host hash (interned)
     std::uint64_t image_id;  // Image ID
 
     // Trace type
diff --git a/pyproject.toml b/pyproject.toml
index 5fb51f6e..a1af280f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,20 @@ dev = [
 requires = ["scikit-build-core >=0.10"]
 build-backend = "scikit_build_core.build"
 
+[tool.uv]
+cache-keys = [
+  { file = "pyproject.toml" },
+  { file = "CMakeLists.txt" },
+  { file = "CMakePresets.json" },
+  { file = "cmake/**/*.cmake" },
+  { file = "cmake/modules/**/*.cmake" },
+  { file = "include/**/*.h" },
+  { file = "src/**/*.h" },
+  { file = "src/**/*.cpp" },
+  { file = "python/**/*.py" },
+  { file = "python/**/*.pyi" },
+]
+
 [tool.scikit-build]
 metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"
 minimum-version = "build-system.requires"
diff --git a/python/dftracer/utils/__init__.py b/python/dftracer/utils/__init__.py
index 25f9b8ae..c95eb0ad 100644
--- a/python/dftracer/utils/__init__.py
+++ b/python/dftracer/utils/__init__.py
@@ -1,11 +1,11 @@
 from importlib.metadata import PackageNotFoundError, version
 from typing import Optional
 
+from .arrow import read_arrow, write_arrow  # noqa: F401
 from .dftracer_utils_ext import (
-    JSON,  # noqa: F401
-    Indexer,  # noqa: F401
+    CheckpointIndexer,  # noqa: F401
     IndexerCheckpoint,  # noqa: F401
-    TraceReader,  # noqa: F401
+    JsonDictValue,  # noqa: F401
 )
 from .dftracer_utils_ext import (
     get_default_runtime as _get_default_native_runtime,
@@ -13,8 +13,14 @@
 from .dftracer_utils_ext import (
     set_default_runtime as _set_default_native_runtime,
 )
+from .indexer import (  # noqa: F401
+    AggregationConfig,
+    Indexer,
+    IndexStatus,
+)
 from .query import Expr, Field  # noqa: F401
 from .runtime import Runtime, TaskHandle  # noqa: F401
+from .trace_reader import TraceReader  # noqa: F401
 
 _default_wrapper: Optional["Runtime"] = None
 
@@ -46,13 +52,19 @@ def set_default_runtime(runtime: Optional["Runtime"]) -> None:
 
 
 __all__ = [
+    "AggregationConfig",
+    "CheckpointIndexer",
     "Expr",
     "Field",
     "Indexer",
     "IndexerCheckpoint",
+    "IndexStatus",
+    "JsonDictValue",
     "TraceReader",
     "Runtime",
     "TaskHandle",
     "get_default_runtime",
+    "read_arrow",
     "set_default_runtime",
+    "write_arrow",
 ]
diff --git a/python/dftracer/utils/arrow.py b/python/dftracer/utils/arrow.py
index e7558e89..37846e33 100644
--- a/python/dftracer/utils/arrow.py
+++ b/python/dftracer/utils/arrow.py
@@ -1,8 +1,11 @@
-"""Arrow data interchange wrappers for DFTracer.
+"""Arrow data interchange and I/O for DFTracer.
 
-Provides ArrowBatch and ArrowTable classes that wrap Arrow C Data Interface
-objects (PyCapsules) with convenience methods for conversion to pandas and
-polars DataFrames.
+Provides:
+- ArrowBatch and ArrowTable classes that wrap Arrow C Data Interface
+  objects (PyCapsules) with convenience methods for conversion to pandas
+  and polars DataFrames.
+- write_arrow() and read_arrow() for Arrow IPC file I/O with Runtime
+  parallelization.
 
 These wrappers are pure Python. The actual Arrow data is produced by the
 C extension (TraceReader.iter_arrow, utility to_arrow methods). Conversion
@@ -12,7 +15,36 @@
 
 from __future__ import annotations
 
-from typing import Any, Iterator, Optional, Tuple
+import os
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union
+
+if TYPE_CHECKING:
+    import pyarrow as pa
+    import pyarrow.ipc as ipc
+
+    from dftracer.utils.dftracer_utils_ext import (
+        read_arrow_files_parallel as _cpp_read_parallel,
+    )
+
+_HAS_PYARROW = False
+_HAS_CPP_READER = False
+
+try:
+    import pyarrow as pa
+    import pyarrow.ipc as ipc
+
+    _HAS_PYARROW = True
+except ImportError:
+    pass
+
+try:
+    from dftracer.utils.dftracer_utils_ext import (
+        read_arrow_files_parallel as _cpp_read_parallel,
+    )
+
+    _HAS_CPP_READER = True
+except ImportError:
+    pass
 
 
 class ArrowBatch:
@@ -98,32 +130,62 @@ def to_polars(self) -> Any:
 class ArrowTable:
     """Wrapper around a collection of Arrow RecordBatches.
 
-    Returned by read_arrow() and utility process() methods. Holds
-    multiple batches with a shared schema. Supports the Arrow PyCapsule
-    stream protocol (__arrow_c_stream__) for zero-copy interchange.
+    Returned by read_arrow() and utility process() methods. Supports
+    the Arrow PyCapsule stream protocol (__arrow_c_stream__) for
+    zero-copy interchange.
 
-    The pyarrow Table is cached on first conversion so subsequent calls
-    to ``to_pandas()``, ``to_polars()``, or ``__arrow_c_stream__()``
-    are safe.
+    Accepts either a pre-built list of batches or a lazy iterator.
+    When constructed from an iterator, batches are not materialized
+    until data access (to_pandas, to_polars, batches, etc.).
 
-    Empty results (no events matched) return an ArrowTable with
-    num_batches=0 and no columns.
+    ``num_rows``, ``num_batches`` and ``empty`` also materialize: when
+    no pyarrow table has been built yet, the iterator is drained and
+    every batch is retained in memory. To count rows of a very large
+    dataset without holding the batches, iterate ``iter_arrow``
+    directly and sum the per-batch row counts.
     """
 
     def __init__(
         self,
-        batches: list[Any],
+        batches: Any,
         schema_capsule: Optional[Any] = None,
     ) -> None:
-        self._batches = list(batches)
+        self._stream: Any = None
+        if isinstance(batches, list):
+            self._batches: Optional[list[Any]] = batches
+            self._iter: Optional[Iterator[Any]] = None
+        elif hasattr(batches, "__arrow_c_stream__"):
+            self._batches = None
+            self._iter = None
+            self._stream = batches
+        else:
+            self._batches = None
+            self._iter = iter(batches)
         self._schema_capsule = schema_capsule
-        self._pa_table: Any = None  # cached pyarrow.Table
+        self._pa_table: Any = None
+
+    def _materialize(self) -> list[Any]:
+        """Return the batch list, building and caching it on first use."""
+        if self._batches is not None:
+            return self._batches
+        if self._stream is not None:
+            self._to_pa_table()
+        # _to_pa_table() drops self._batches once pyarrow owns the buffers, so
+        # a cached table (stream- or batch-built) must be the source here.
+        if self._pa_table is not None:
+            self._batches = list(self._pa_table.to_batches())
+            return self._batches
+        self._batches = list(self._iter) if self._iter is not None else []
+        self._iter = None
+        return self._batches
 
     def _to_pa_table(self) -> Any:
         """Convert to pyarrow Table, caching the result.
 
         Arrow C Data Interface export is single-use (ownership transfer),
-        so we cache the pyarrow table on first conversion.
+        so we cache the pyarrow table on first conversion.  After
+        conversion the batch capsule references are cleared since pyarrow
+        now owns the underlying buffers.
 
         Returns:
             pyarrow.Table: The converted table.
@@ -137,7 +199,13 @@ def _to_pa_table(self) -> Any:
             import pyarrow as pa
         except ImportError:
             raise ImportError("pyarrow is required. Install with: pip install pyarrow") from None
-        pa_batches = [pa.record_batch(b) for b in self._batches]
+        if self._stream is not None:
+            self._pa_table = pa.table(self._stream)
+            self._stream = None
+            return self._pa_table
+        batches = self._materialize()
+        pa_batches = [pa.record_batch(b) for b in batches]
+        self._batches = None
         if not pa_batches:
             schema = pa.schema([])
             if self._schema_capsule is not None:
@@ -154,25 +222,29 @@ def __arrow_c_stream__(self, requested_schema: Any = None) -> Any:
     @property
     def num_batches(self) -> int:
         """Number of batches."""
-        return len(self._batches)
+        return len(self._materialize())
 
     @property
     def num_rows(self) -> int:
         """Total number of rows across all batches."""
-        return sum(b.num_rows for b in self._batches)
+        if self._pa_table is not None:
+            return self._pa_table.num_rows
+        return sum(b.num_rows for b in self._materialize())
 
     @property
     def empty(self) -> bool:
         """True if there are no batches."""
-        return len(self._batches) == 0
+        if self._pa_table is not None:
+            return self._pa_table.num_rows == 0
+        return len(self._materialize()) == 0
 
     def batch(self, i: int) -> Any:
         """Get the i-th batch."""
-        return self._batches[i]
+        return self._materialize()[i]
 
     def batches(self) -> Iterator[Any]:
         """Iterate over batches."""
-        return iter(self._batches)
+        return iter(self._materialize())
 
     def to_pandas(self) -> Any:
         """Convert all batches to a single pandas DataFrame.
@@ -200,6 +272,203 @@ def to_polars(self) -> Any:
             raise ImportError(
                 "polars is required for to_polars(). Install with: pip install polars"
             ) from None
-        if not self._batches:
+        table = self._to_pa_table()
+        if table.num_rows == 0:
             return pl.DataFrame()
-        return pl.concat([pl.from_arrow(self._to_pa_table())])
+        return pl.from_arrow(table)
+
+
+def write_arrow(
+    file_path: str,
+    output_dir: str,
+    view: Optional[Union[str, Dict]] = None,
+    index_dir: str = "",
+    checkpoint_size: int = 32 * 1024 * 1024,
+    compression: str = "zstd",
+    batch_size: int = 10000,
+    chunks: Optional[List[Dict]] = None,
+    parallel: bool = True,
+) -> Dict:
+    """Write trace data to Arrow IPC files.
+
+    If chunks is provided, writes those specific chunks. Otherwise, gets
+    all candidate chunks from the file after bloom filter pruning.
+
+    Args:
+        file_path: Path to the trace file.
+        output_dir: Directory for output Arrow IPC files.
+        view: View definition - string ('io', 'compute', 'dlio') or
+              dict with 'name' and optional 'query'.
+        index_dir: Directory for index files.
+        checkpoint_size: Checkpoint size for indexing.
+        compression: 'zstd' or 'none'.
+        batch_size: Events per batch.
+        chunks: Optional list of specific chunks to write. If None,
+            gets all candidate chunks from the file.
+        parallel: If True (default), process chunks in parallel via Runtime.
+
+    Returns:
+        dict with:
+            - files: List of written Arrow IPC file paths
+            - total_chunks: Number of chunks processed
+            - skipped_chunks: Number of chunks skipped by bloom filter
+            - total_rows: Total rows written
+            - total_events_matched: Total events matched
+
+    Example:
+        >>> from dftracer.utils.arrow import write_arrow
+        >>> result = write_arrow(
+        ...     "trace.pfw.gz",
+        ...     "/output/io_view",
+        ...     view="io",
+        ... )
+        >>> print(f"Wrote {len(result['files'])} files")
+    """
+    from dftracer.utils import TraceReader, get_default_runtime
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    reader = TraceReader(file_path, index_dir=index_dir, checkpoint_size=checkpoint_size)
+
+    if chunks is None:
+        chunks_result = reader.get_view_chunks(view=view)
+
+        if not chunks_result["file_may_match"]:
+            return {
+                "files": [],
+                "total_chunks": 0,
+                "skipped_chunks": chunks_result["skipped_checkpoints"],
+                "total_rows": 0,
+                "total_events_matched": 0,
+            }
+
+        chunks = chunks_result["chunks"]
+        skipped_chunks = chunks_result["skipped_checkpoints"]
+    else:
+        skipped_chunks = 0
+
+    if not chunks:
+        return {
+            "files": [],
+            "total_chunks": 0,
+            "skipped_chunks": skipped_chunks,
+            "total_rows": 0,
+            "total_events_matched": 0,
+        }
+
+    if parallel and len(chunks) > 1:
+        runtime = get_default_runtime()
+
+        def write_chunk(chunk: Dict) -> Dict:
+            r = TraceReader(file_path, index_dir=index_dir, checkpoint_size=checkpoint_size)
+            return r.write_view_chunks(
+                chunks=[chunk],
+                output_dir=output_dir,
+                view=view,
+                compression=compression,
+                batch_size=batch_size,
+            )
+
+        handles = [
+            runtime.submit(write_chunk, chunk, name=f"write:chunk_{i}")
+            for i, chunk in enumerate(chunks)
+        ]
+        batch_results = [h.get() for h in handles]
+    else:
+        result = reader.write_view_chunks(
+            chunks=chunks,
+            output_dir=output_dir,
+            view=view,
+            compression=compression,
+            batch_size=batch_size,
+        )
+        batch_results = [result]
+
+    files = []
+    total_rows = 0
+    total_events_matched = 0
+    for br in batch_results:
+        for r in br.get("results", []):
+            if r.get("rows_written", 0) > 0:
+                files.append(r["output_file"])
+        total_rows += br.get("total_rows", 0)
+        total_events_matched += br.get("total_events_matched", 0)
+
+    return {
+        "files": files,
+        "total_chunks": len(chunks),
+        "skipped_chunks": skipped_chunks,
+        "total_rows": total_rows,
+        "total_events_matched": total_events_matched,
+    }
+
+
+def read_arrow(
+    files: List[str],
+    parallel: bool = True,
+):
+    """Read Arrow IPC files and return a combined pyarrow Table.
+
+    Uses pyarrow for reading with optional parallelization via Runtime.
+    Falls back to C++ reader if pyarrow is not available.
+
+    Args:
+        files: List of Arrow IPC file paths.
+        parallel: If True (default), read files in parallel using Runtime.
+
+    Returns:
+        pyarrow.Table with all data combined (list of batch objects if
+        pyarrow is unavailable), or None if no non-empty input file exists.
+
+    Example:
+        >>> from dftracer.utils.arrow import read_arrow
+        >>> table = read_arrow(["file1.arrow", "file2.arrow"])
+        >>> print(f"Read {table.num_rows} rows")
+    """
+    from dftracer.utils import get_default_runtime
+
+    if not files:
+        return None
+
+    valid_files = [f for f in files if os.path.exists(f) and os.path.getsize(f) > 0]
+    if not valid_files:
+        return None
+
+    if _HAS_PYARROW:
+
+        def read_one(path: str) -> pa.Table:
+            return ipc.open_file(path).read_all()
+
+        if parallel and len(valid_files) > 1:
+            runtime = get_default_runtime()
+            handles = [
+                runtime.submit(read_one, f, name=f"read:{os.path.basename(f)}") for f in valid_files
+            ]
+            tables = [h.get() for h in handles]
+        else:
+            tables = [read_one(f) for f in valid_files]
+
+        if not tables:
+            return None
+
+        return pa.concat_tables(tables)
+
+    if not _HAS_CPP_READER:
+        raise ImportError(
+            "Neither pyarrow nor C++ Arrow IPC reader available. "
+            "Install pyarrow or build dftracer-utils with Arrow IPC support."
+        )
+
+    from dftracer.utils import Runtime as PyRuntime
+
+    runtime = get_default_runtime()
+    native_rt = runtime._native if isinstance(runtime, PyRuntime) else runtime
+
+    result = _cpp_read_parallel(valid_files, runtime=native_rt)
+
+    batches = []
+    for fr in result.get("file_results", []):
+        if fr.get("success"):
+            batches.extend(fr.get("batches", []))
+
+    return batches
diff --git a/python/dftracer/utils/dask.py b/python/dftracer/utils/dask.py
index 36365e20..f48ad108 100644
--- a/python/dftracer/utils/dask.py
+++ b/python/dftracer/utils/dask.py
@@ -1,32 +1,42 @@
 """Dask distributed integration for dftracer-utils."""
 
-from typing import Any, Optional
+import os
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Union
 
 try:
-    from dask.distributed import WorkerPlugin
+    from dask.distributed import Client, WorkerPlugin
 except ImportError:
+    Client: Optional[Any] = None
     WorkerPlugin: Optional[Any] = None
 
-from dftracer.utils import Runtime, get_default_runtime, set_default_runtime
+try:
+    import dask
+    import dask.dataframe as dd
+except ImportError:
+    dask = None  # type: ignore[assignment]  # ty: ignore[invalid-assignment]
+    dd = None  # type: ignore[assignment]  # ty: ignore[invalid-assignment]
+
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None  # type: ignore[assignment]  # ty: ignore[invalid-assignment]
+
+from dftracer.utils import Runtime, TraceReader, get_default_runtime, set_default_runtime
+from dftracer.utils.indexer import AggregationConfig, Indexer
 
 if WorkerPlugin is not None:
 
     class DFTracerUtilsDaskWorkerPlugin(WorkerPlugin):
-        """Creates a persistent Runtime per Dask worker.
-
-        Usage:
-            client = Client("scheduler:8786")
-            client.register_plugin(
-                DFTracerUtilsDaskWorkerPlugin(threads=48)
-            )
-        """
+        """Creates a persistent Runtime per Dask worker."""
 
-        def __init__(self, threads=0):
+        def __init__(self, threads=0, io_threads=0):
             self.threads = threads
+            self.io_threads = io_threads
 
         def setup(self, worker):
             worker._dftracer_prev_runtime = get_default_runtime()
-            rt = Runtime(threads=self.threads)
+            rt = Runtime(threads=self.threads, io_threads=self.io_threads)
             worker.dftracer_utils_runtime = rt
             set_default_runtime(rt)
 
@@ -35,5 +45,1110 @@ def teardown(self, worker):
                 set_default_runtime(worker._dftracer_prev_runtime)
                 del worker._dftracer_prev_runtime
             if hasattr(worker, "dftracer_utils_runtime"):
-                worker.dftracer_utils_runtime.shutdown()
+                # wait=False: don't block on pending tasks during teardown.
+                # Dask may be tearing down because of timeout/cancel, and a
+                # stuck task would hang the worker process indefinitely.
+                try:
+                    worker.dftracer_utils_runtime.shutdown(wait=False)
+                except Exception:
+                    pass
                 del worker.dftracer_utils_runtime
+
+
+def _write_arrow_task(
+    file_path: str,
+    output_dir: str,
+    view: Optional[Union[str, Dict]],
+    index_dir: str,
+    checkpoint_size: int,
+    compression: str,
+    batch_size: int,
+    chunks: List[Dict],
+) -> Dict:
+    """Task function for writing chunks on a Dask worker."""
+    from dftracer.utils.arrow import write_arrow
+
+    return write_arrow(
+        file_path=file_path,
+        output_dir=output_dir,
+        view=view,
+        index_dir=index_dir,
+        checkpoint_size=checkpoint_size,
+        compression=compression,
+        batch_size=batch_size,
+        chunks=chunks,
+        parallel=True,
+    )
+
+
+def distributed_write_arrow(
+    file_path: str,
+    output_dir: str,
+    view: Optional[Union[str, Dict]] = None,
+    index_dir: str = "",
+    checkpoint_size: int = 32 * 1024 * 1024,
+    compression: str = "zstd",
+    batch_size: int = 10000,
+    chunks_per_task: int = 0,
+) -> Dict:
+    """Write trace data to Arrow IPC files using Dask distributed.
+
+    This function:
+    1. Gets candidate chunks after bloom filter pruning (coordinator)
+    2. Distributes chunk processing to Dask workers
+    3. Each worker writes its chunks to Arrow IPC files
+    4. Returns paths to all written files for pyarrow reading
+
+    Args:
+        file_path: Path to the trace file.
+        output_dir: Directory for output Arrow IPC files.
+        view: View definition - string ('io', 'compute', 'dlio') or
+              dict with 'name' and optional 'query'.
+        index_dir: Directory for index files.
+        checkpoint_size: Checkpoint size for indexing.
+        compression: 'zstd' or 'none'.
+        batch_size: Events per batch.
+        chunks_per_task: Number of chunks per Dask task. If 0, uses 1 chunk
+            per task. Higher values batch chunks per worker, processing
+            them in parallel on the worker's Runtime thread pool.
+
+    Returns:
+        dict with:
+            - files: List of written Arrow IPC file paths
+            - total_chunks: Number of chunks processed
+            - skipped_chunks: Number of chunks skipped by bloom filter
+            - total_rows: Total rows written
+            - total_events_matched: Total events matched
+
+    Example:
+        >>> import pyarrow.ipc as ipc
+        >>> import pyarrow as pa
+        >>> from dftracer.utils.dask import distributed_write_arrow
+        >>>
+        >>> result = distributed_write_arrow(
+        ...     "trace.pfw.gz",
+        ...     "/output/io_view",
+        ...     view="io",
+        ...     chunks_per_task=8,  # batch 8 chunks per worker
+        ... )
+        >>> # Read back with pyarrow
+        >>> tables = [ipc.open_file(f).read_all() for f in result["files"]]
+        >>> combined = pa.concat_tables(tables)
+    """
+    if dask is None:
+        raise ImportError("dask is required for distributed_write_arrow")
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    reader = TraceReader(file_path, index_dir=index_dir, checkpoint_size=checkpoint_size)
+    chunks_result = reader.get_view_chunks(view=view)
+
+    if not chunks_result["file_may_match"]:
+        return {
+            "files": [],
+            "total_chunks": 0,
+            "skipped_chunks": chunks_result["skipped_checkpoints"],
+            "total_rows": 0,
+            "total_events_matched": 0,
+        }
+
+    chunks = chunks_result["chunks"]
+    if not chunks:
+        return {
+            "files": [],
+            "total_chunks": 0,
+            "skipped_chunks": chunks_result["skipped_checkpoints"],
+            "total_rows": 0,
+            "total_events_matched": 0,
+        }
+
+    if chunks_per_task <= 0:
+        chunks_per_task = 1
+
+    batches = [chunks[i : i + chunks_per_task] for i in range(0, len(chunks), chunks_per_task)]
+
+    delayed_tasks = [
+        dask.delayed(_write_arrow_task)(
+            file_path,
+            output_dir,
+            view,
+            index_dir,
+            checkpoint_size,
+            compression,
+            batch_size,
+            batch,
+        )
+        for batch in batches
+    ]
+
+    batch_results = dask.compute(*delayed_tasks)
+
+    files = []
+    total_rows = 0
+    total_events_matched = 0
+    for br in batch_results:
+        # Each task returns write_arrow()'s summary dict, which has already
+        # flattened per-chunk output into "files" and has no "results" key.
+        files.extend(br.get("files", []))
+        total_rows += br.get("total_rows", 0)
+        total_events_matched += br.get("total_events_matched", 0)
+
+    return {
+        "files": files,
+        "total_chunks": len(chunks),
+        "skipped_chunks": chunks_result["skipped_checkpoints"],
+        "total_rows": total_rows,
+        "total_events_matched": total_events_matched,
+    }
+
+
+def _assign_files_by_pid(
+    file_pids: Dict[int, set],
+    n_workers: int,
+) -> Dict[int, List[int]]:
+    """Assign files to workers keyed on each file's smallest PID.
+
+    Files that share the same minimum PID land on the same worker, which is
+    intended to reduce cross-worker aggregation during the merge phase.
+
+    Args:
+        file_pids: Dict mapping file_id to set of PIDs in that file.
+        n_workers: Number of workers to distribute to.
+
+    Returns:
+        Dict mapping worker_id to list of file_ids.
+    """
+    if n_workers <= 0:
+        n_workers = 1
+
+    # Bucket each file by hash(min(pids)) % n_workers
+    worker_assignments: Dict[int, List[int]] = defaultdict(list)
+
+    for file_id, pids in file_pids.items():
+        if not pids:
+            # No PIDs known, spread by file_id modulo n_workers
+            worker_id = file_id % n_workers
+        else:
+            # Key on the smallest PID (not a true majority vote) so the
+            # choice does not depend on set iteration order
+            majority_pid = min(pids)  # Use min for determinism
+            worker_id = hash(majority_pid) % n_workers
+        worker_assignments[worker_id].append(file_id)
+
+    return dict(worker_assignments)
+
+
+def _aggregate_files_task(
+    files: List[str],
+    index_path: str,
+    time_granularity: float,
+    time_resolution: float,
+    data_type: str,
+) -> List[bytes]:
+    """Worker task: read pre-indexed data and return Arrow IPC buffers.
+
+    This runs on a Dask worker. It reads from an already-indexed database
+    and returns Arrow IPC buffers for the specified files.
+
+    Args:
+        files: List of trace file paths (used for filtering, not re-indexing).
+        index_path: Path to the .dftindex store (already built by coordinator).
+        time_granularity: Output time bucket width in seconds.
+        time_resolution: Microseconds per output time unit.
+        data_type: 'events', 'profiles', or 'system'.
+
+    Returns:
+        List of Arrow IPC buffer bytes.
+    """
+    if not files:
+        return []
+
+    # Use existing index (read-only) - coordinator already built it
+    indexer = Indexer(
+        files=files,
+        index_dir=os.path.dirname(index_path) if index_path else "",
+        require_checkpoint=False,  # Don't rebuild
+        require_bloom=False,
+        require_manifest=False,
+        require_aggregation=False,  # Already aggregated
+        force_rebuild=False,
+    )
+
+    # Collect Arrow batches as IPC buffers
+    ipc_buffers = []
+    for batch_capsule in indexer.iter_arrow_dfanalyzer(
+        data_type,
+        time_granularity=time_granularity,
+        time_resolution=time_resolution,
+    ):
+        batch = pa.record_batch(batch_capsule)
+        # Serialize to IPC buffer for transfer
+        sink = pa.BufferOutputStream()
+        writer = pa.ipc.new_stream(sink, batch.schema)
+        writer.write_batch(batch)
+        writer.close()
+        ipc_buffers.append(sink.getvalue().to_pybytes())
+
+    return ipc_buffers
+
+
+def _aggregate_files_task_all(
+    files: List[str],
+    index_path: str,
+    time_granularity: float,
+    time_resolution: float,
+    query: Optional[str] = None,
+) -> Dict[str, List[bytes]]:
+    """Worker task: read all aggregation types and return Arrow IPC buffers.
+
+    This runs on a Dask worker. It opens the index with every ``require_*``
+    flag off and ``force_rebuild=False`` so nothing is rebuilt here, makes one
+    ``iter_arrow_dfanalyzer_all`` call, and serializes each returned record
+    batch into its own Arrow IPC stream.
+
+    Args:
+        files: List of trace file paths (used for filtering, not re-indexing).
+        index_path: Path to the .dftindex store (already built by coordinator).
+            Only its parent directory is handed to the Indexer as ``index_dir``.
+        time_granularity: Output time bucket width in seconds.
+        time_resolution: Microseconds per output time unit.
+        query: Optional query filter string (e.g., "pid == 1234 or pid == 5678").
+
+    Returns:
+        Dict with 'events', 'profiles', 'system' keys, each containing
+        a list of Arrow IPC buffer bytes (one IPC stream per record batch).
+        All three keys are always present; a type with no batches maps to [].
+    """
+    if not files:
+        return {"events": [], "profiles": [], "system": []}
+
+    indexer = Indexer(
+        files=files,
+        index_dir=os.path.dirname(index_path) if index_path else "",
+        require_checkpoint=False,
+        require_bloom=False,
+        require_manifest=False,
+        require_aggregation=False,
+        force_rebuild=False,
+    )
+
+    result: Dict[str, List[bytes]] = {}
+    try:
+        all_batches = indexer.iter_arrow_dfanalyzer_all(
+            time_granularity=time_granularity,
+            time_resolution=time_resolution,
+            query=query,
+        )
+
+        # Convert to IPC buffers for each type
+        for data_type in ("events", "profiles", "system"):
+            ipc_buffers: List[bytes] = []
+            for batch_capsule in all_batches.get(data_type, []):
+                batch = pa.record_batch(batch_capsule)
+                sink = pa.BufferOutputStream()
+                # The context manager closes the writer even if write_batch fails.
+                with pa.ipc.new_stream(sink, batch.schema) as writer:
+                    writer.write_batch(batch)
+                # to_pybytes() copies, so the result does not reference the sink.
+                ipc_buffers.append(sink.getvalue().to_pybytes())
+            result[data_type] = ipc_buffers
+    finally:
+        # Release the index handle deterministically instead of waiting for
+        # garbage collection; the coordinator closes its own indexer for the
+        # same reason before dispatching tasks.
+        indexer.close()
+
+    return result
+
+
+def _merge_welford(group):
+    """Merge partial aggregates that share a key into one summary row.
+
+    Used for re-aggregating overlapping keys across workers.
+
+    Despite the name, no mean/variance (Welford) state is merged here: the
+    additive columns are summed and the extrema columns are reduced with
+    min/max.
+
+    Args:
+        group: Mapping-like object (e.g. a pandas DataFrame) whose "count",
+            "time", "size", "time_min", "time_max", "size_min" and "size_max"
+            entries support ``.sum()`` / ``.min()`` / ``.max()``.
+
+    Returns:
+        Dict with the same seven keys. When the summed count is zero, every
+        value is a literal zero (int for count/size fields, float for time
+        fields) rather than a reduction over the group.
+    """
+    n_total = group["count"].sum()
+    if n_total == 0:
+        # Nothing was observed: return neutral values instead of propagating
+        # whatever placeholder min/max the partial rows carried.
+        return {
+            "count": 0,
+            "time": 0.0,
+            "size": 0,
+            "time_min": 0.0,
+            "time_max": 0.0,
+            "size_min": 0,
+            "size_max": 0,
+        }
+
+    return {
+        # Additive statistics: totals over all partial rows.
+        "count": n_total,
+        "time": group["time"].sum(),
+        "size": group["size"].sum(),
+        # Extrema: smallest minimum / largest maximum across partial rows.
+        "time_min": group["time_min"].min(),
+        "time_max": group["time_max"].max(),
+        "size_min": group["size_min"].min(),
+        "size_max": group["size_max"].max(),
+    }
+
+
+def distributed_aggregate(
+    directory: str = "",
+    files: Optional[List[str]] = None,
+    client: Optional["Client"] = None,
+    time_interval_ms: float = 5000.0,
+    time_granularity: float = 1.0,
+    time_resolution: float = 1e6,
+    index_dir: str = "",
+    data_type: str = "events",
+) -> "pa.Table":
+    """Aggregate trace data using Dask distributed workers.
+
+    This function:
+    1. Indexes all files on coordinator to get PID manifests
+    2. Assigns files to workers by PID affinity (minimize cross-worker overlap)
+    3. Each worker aggregates its files using iter_arrow_dfanalyzer
+    4. Gathers partial Arrow tables from workers
+    5. Re-aggregates overlapping keys (same PID/time_range across files)
+
+    When ``client`` is None, steps 2-5 are skipped: the coordinator's own
+    indexer is iterated in-process and its batches are returned as-is.
+
+    Args:
+        directory: Directory containing trace files (.pfw/.pfw.gz).
+        files: Explicit list of files (alternative to directory).
+        client: Dask distributed Client. If None, the aggregation is read
+            directly in the calling process (no Dask tasks are submitted).
+        time_interval_ms: Aggregation time bucket in milliseconds.
+        time_granularity: Output time bucket width in seconds.
+        time_resolution: Microseconds per output time unit.
+        index_dir: Directory for index storage.
+        data_type: Type of data to aggregate - 'events', 'profiles', or 'system'.
+            Any value other than 'system' is re-aggregated with the
+            events/profiles key columns.
+
+    Returns:
+        PyArrow Table with aggregated data. An empty table (no columns) is
+        returned when no files were found or no batches were produced.
+
+    Raises:
+        ImportError: If dask or pyarrow is not available.
+
+    Example:
+        >>> from dask.distributed import Client
+        >>> from dftracer.utils.dask import distributed_aggregate
+        >>>
+        >>> client = Client("scheduler:8786")
+        >>> client.register_plugin(DFTracerUtilsDaskWorkerPlugin(threads=48))
+        >>>
+        >>> table = distributed_aggregate(
+        ...     directory="/traces",
+        ...     client=client,
+        ...     time_interval_ms=5000,
+        ... )
+        >>> df = table.to_pandas()
+    """
+    if dask is None:
+        raise ImportError("dask is required for distributed_aggregate")
+    if pa is None:
+        raise ImportError("pyarrow is required for distributed_aggregate")
+
+    # Step 1: Index on coordinator
+    indexer = Indexer(
+        directory=directory,
+        files=files,
+        index_dir=index_dir,
+        require_checkpoint=True,
+        require_bloom=True,
+        require_manifest=True,
+        require_aggregation=AggregationConfig(
+            time_interval_ms=time_interval_ms,
+            compute_percentiles=False,
+        ),
+        force_rebuild=False,
+    )
+    status = indexer.ensure_indexed()
+
+    if status.total_files == 0:
+        return pa.table({})
+
+    # For local execution (no client), just use iter_arrow_dfanalyzer directly
+    # This avoids RocksDB locking issues when running in a single process
+    if client is None:
+        all_batches = []
+        for batch_capsule in indexer.iter_arrow_dfanalyzer(
+            data_type,
+            time_granularity=time_granularity,
+            time_resolution=time_resolution,
+        ):
+            all_batches.append(pa.record_batch(batch_capsule))
+
+        if not all_batches:
+            return pa.table({})
+
+        return pa.Table.from_batches(all_batches)
+
+    # Step 2: assign files to workers by PID affinity
+    all_files = status.ready + status.needs_work
+    file_id_to_path, file_pids = indexer.query_file_info()
+    index_path = status.index_path
+
+    # Close indexer before distributing (release RocksDB lock)
+    indexer.close()
+
+    worker_nthreads = client.nthreads()
+    n_workers = len(worker_nthreads) or 1
+
+    # Files without a PID entry still need an owner, so give them an empty set.
+    all_file_ids = set(file_id_to_path.keys())
+    full_file_pids = {fid: file_pids.get(fid, set()) for fid in all_file_ids}
+    worker_file_ids = _assign_files_by_pid(full_file_pids, n_workers)
+
+    worker_files: Dict[int, List[str]] = {}
+    for worker_id, fids in worker_file_ids.items():
+        worker_files[worker_id] = [file_id_to_path[fid] for fid in fids if fid in file_id_to_path]
+
+    # Step 3: one task per non-empty bucket, pinned to a concrete worker
+    futures = []
+    worker_list = list(worker_nthreads.keys())
+    for worker_id, wfiles in worker_files.items():
+        if not wfiles:
+            continue
+        worker_addr = worker_list[worker_id % len(worker_list)] if worker_list else None
+        future = client.submit(
+            _aggregate_files_task,
+            wfiles,
+            index_path,
+            time_granularity,
+            time_resolution,
+            data_type,
+            workers=[worker_addr] if worker_addr else None,
+            pure=False,
+        )
+        futures.append(future)
+
+    # Step 4: gather results
+    all_ipc_buffers = client.gather(futures)
+
+    # Deserialize IPC buffers and combine
+    all_batches = []
+    for ipc_buffers in all_ipc_buffers:
+        for buf_bytes in ipc_buffers:
+            reader = pa.ipc.open_stream(pa.BufferReader(buf_bytes))
+            for batch in reader:
+                all_batches.append(batch)
+
+    if not all_batches:
+        return pa.table({})
+
+    combined_table = pa.Table.from_batches(all_batches)
+
+    # Step 5: Re-aggregate overlapping keys with a pandas groupby
+    # This handles cases where the same (pid, tid, time_range, func_name) appears
+    # across multiple files assigned to different workers
+    if data_type == "system":
+        # System metrics: group by host_hash, time_range
+        # NOTE(review): "mean" here is an unweighted mean of per-worker
+        # partial values - confirm that is the intended merge.
+        group_cols = ["host_hash", "time_range"]
+        agg_dict = {
+            "sys_cpu_iowait_pct": "mean",
+            "sys_cpu_user_pct": "mean",
+            "sys_cpu_system_pct": "mean",
+            "sys_cpu_idle_pct": "mean",
+            "sys_core_iowait_pct_max": "max",
+            "sys_core_iowait_pct_p95": "max",
+            "sys_mem_dirty": "mean",
+            "sys_mem_cached": "mean",
+            "sys_mem_available": "mean",
+        }
+    else:
+        # Events/Profiles: group by all key columns
+        group_cols = [
+            "cat",
+            "func_name",
+            "pid",
+            "tid",
+            "file_hash",
+            "host_hash",
+            "time_range",
+        ]
+        agg_dict = {
+            "count": "sum",
+            "time": "sum",
+            "size": "sum",
+            "time_min": "min",
+            "time_max": "max",
+            "size_min": "min",
+            "size_max": "max",
+        }
+
+    # Check if re-aggregation is needed (more than one file)
+    if len(all_files) > 1:
+        df = combined_table.to_pandas()
+
+        # Only aggregate metric columns that are actually present; .agg()
+        # raises KeyError for a column the gathered schema does not carry.
+        agg_spec = {col: how for col, how in agg_dict.items() if col in df.columns}
+
+        # Preserve non-aggregated columns
+        for col in df.columns:
+            if col not in group_cols and col not in agg_spec:
+                agg_spec[col] = "first"
+
+        # Group and aggregate. dropna=False keeps rows whose key columns are
+        # null; the pandas default would silently drop them from the result.
+        result_df = df.groupby(group_cols, as_index=False, dropna=False).agg(agg_spec)
+        return pa.Table.from_pandas(result_df, preserve_index=False)
+
+    return combined_table
+
+
+def distributed_aggregate_all(
+    directory: str = "",
+    files: Optional[List[str]] = None,
+    client: Optional["Client"] = None,
+    time_interval_ms: float = 5000.0,
+    time_granularity: float = 1.0,
+    time_resolution: float = 1e6,
+    index_dir: str = "",
+) -> Dict[str, "pa.Table"]:
+    """Aggregate events, profiles and system data with one index scan.
+
+    The coordinator indexes the inputs first. Without a client the fused
+    ``iter_arrow_dfanalyzer_all`` call is made in-process; with a client the
+    files are bucketed by PID affinity, one task per non-empty bucket is
+    pinned to a worker, and the returned IPC payloads are decoded and
+    concatenated per data type.
+
+    Args:
+        directory: Directory containing trace files (.pfw/.pfw.gz).
+        files: Explicit list of files (alternative to directory).
+        client: Dask distributed Client. If None, uses local execution.
+        time_interval_ms: Aggregation time bucket in milliseconds.
+        time_granularity: Output time bucket width in seconds.
+        time_resolution: Microseconds per output time unit.
+        index_dir: Directory for index storage.
+
+    Returns:
+        Dict with 'events', 'profiles', 'system' keys, each containing a
+        PyArrow Table with aggregated data (an empty table when that type
+        produced no batches). In the distributed path, dictionary-encoded
+        columns are cast to plain strings.
+
+    Example:
+        >>> from dftracer.utils.dask import distributed_aggregate_all
+        >>> tables = distributed_aggregate_all("/traces")
+        >>> events_df = tables["events"].to_pandas()
+        >>> profiles_df = tables["profiles"].to_pandas()
+    """
+    if dask is None:
+        raise ImportError("dask is required for distributed_aggregate_all")
+    if pa is None:
+        raise ImportError("pyarrow is required for distributed_aggregate_all")
+
+    kinds = ("events", "profiles", "system")
+
+    # Build (or reuse) the index on the coordinator.
+    agg_cfg = AggregationConfig(
+        time_interval_ms=time_interval_ms,
+        compute_percentiles=False,
+    )
+    indexer = Indexer(
+        directory=directory,
+        files=files,
+        index_dir=index_dir,
+        require_checkpoint=True,
+        require_bloom=True,
+        require_manifest=True,
+        require_aggregation=agg_cfg,
+        force_rebuild=False,
+    )
+    status = indexer.ensure_indexed()
+
+    if status.total_files == 0:
+        return {kind: pa.table({}) for kind in kinds}
+
+    if client is None:
+        # In-process path: one fused call yields the batches of every type.
+        fused = indexer.iter_arrow_dfanalyzer_all(
+            time_granularity=time_granularity,
+            time_resolution=time_resolution,
+        )
+        local_tables = {}
+        for kind in kinds:
+            imported = [pa.record_batch(capsule) for capsule in fused.get(kind, [])]
+            local_tables[kind] = pa.Table.from_batches(imported) if imported else pa.table({})
+        return local_tables
+
+    # Distributed path: read what the planner needs, then drop the handle.
+    file_id_to_path, file_pids = indexer.query_file_info()
+    index_path = status.index_path
+
+    # The workers reopen the same store, so release the RocksDB lock first.
+    indexer.close()
+
+    worker_nthreads = client.nthreads()
+    n_workers = len(worker_nthreads) or 1
+
+    known_ids = set(file_id_to_path.keys())
+    pids_per_file = {fid: file_pids.get(fid, set()) for fid in known_ids}
+    assignment = _assign_files_by_pid(pids_per_file, n_workers)
+
+    # Per bucket: the file paths it owns and the union of PIDs they contain.
+    worker_files: Dict[int, List[str]] = {}
+    worker_pids: Dict[int, set] = {}
+    for bucket, fids in assignment.items():
+        worker_files[bucket] = [file_id_to_path[fid] for fid in fids if fid in file_id_to_path]
+        worker_pids[bucket] = set().union(*(file_pids[fid] for fid in fids if fid in file_pids))
+
+    worker_list = list(worker_nthreads.keys())
+    futures = []
+    for bucket, paths in worker_files.items():
+        if not paths:
+            continue
+        # Restrict the worker's scan to the PIDs found in its own files.
+        bucket_pids = worker_pids.get(bucket, set())
+        if bucket_pids:
+            pid_conditions = " or ".join(f"pid == {pid}" for pid in sorted(bucket_pids))
+            query = f"({pid_conditions})"
+        else:
+            query = None
+        address = worker_list[bucket % len(worker_list)] if worker_list else None
+        futures.append(
+            client.submit(
+                _aggregate_files_task_all,
+                paths,
+                index_path,
+                time_granularity,
+                time_resolution,
+                query,
+                workers=[address] if address else None,
+                pure=False,
+            )
+        )
+
+    # Every task result is a dict keyed by data type holding IPC payloads.
+    batches_by_type: Dict[str, List] = {kind: [] for kind in kinds}
+    for partial in client.gather(futures):
+        for kind in kinds:
+            for payload in partial.get(kind, []):
+                batches_by_type[kind].extend(pa.ipc.open_stream(pa.BufferReader(payload)))
+
+    tables = {}
+    for kind in kinds:
+        collected = batches_by_type[kind]
+        if collected:
+            merged = pa.Table.from_batches(collected)
+            # Unify dictionary columns from different workers to plain strings
+            for pos, field in enumerate(merged.schema):
+                if pa.types.is_dictionary(field.type):
+                    merged = merged.set_column(pos, field.name, merged.column(pos).cast(pa.string()))
+        else:
+            merged = pa.table({})
+        tables[kind] = merged
+
+    return tables
+
+
+# ---------------------------------------------------------------------------
+# Distributed index build (SST sink path)
+# ---------------------------------------------------------------------------
+
+
+def _build_sst_task(
+    files: List[str],
+    file_ids: List[int],
+    file_slices: Optional[List[Any]],
+    local_staging: str,
+    lustre_staging: str,
+    batch_id: str,
+    index_dir: str,
+    checkpoint_size: int,
+    bloom_dimensions: Optional[List[str]],
+    build_manifest: bool,
+    force_rebuild: bool,
+    parallelism: int,
+    flush_every_files: int,
+    aggregation_config: Optional[Any] = None,
+    enable_det_ids: bool = False,
+) -> tuple:
+    """Dask worker task: build this batch's SSTs, then relocate them if needed.
+
+    ``build_sst_batch`` writes into ``local_staging``. When ``lustre_staging``
+    is non-empty and differs from ``local_staging``, each artifact dict is
+    passed through ``move_artifacts`` into ``<lustre_staging>/<batch_id>/sub_<i>``
+    and the relocated dicts are returned in place of the originals. One INFO
+    record with host name and timings is logged per call.
+
+    Returns ``(artifact_dicts, tracker_blob)``."""
+    import logging as _logging
+    import socket as _socket
+    import time as _time
+
+    from .dftracer_utils_ext import build_sst_batch, move_artifacts
+
+    logger = _logging.getLogger("dftracer.utils.dask._build_sst_task")
+    hostname = _socket.gethostname()
+
+    started = _time.monotonic()
+    if enable_det_ids:
+        from .dftracer_utils_ext import enable_aggregation_deterministic_ids
+
+        enable_aggregation_deterministic_ids()
+
+    # Positional call into the native extension: argument order is significant.
+    built = build_sst_batch(
+        files,
+        file_ids,
+        local_staging,
+        batch_id,
+        index_dir,
+        checkpoint_size,
+        build_manifest,
+        force_rebuild,
+        bloom_dimensions,
+        parallelism,
+        flush_every_files,
+        None,
+        aggregation_config,
+        file_slices,
+    )
+    artifact_dicts, tracker_blob = built
+    built_at = _time.monotonic()
+
+    moved_count = 0
+    needs_move = bool(lustre_staging) and lustre_staging != local_staging
+    if needs_move:
+        # Keep per-sink subdir to avoid aggregation.sst collisions.
+        batch_root = os.path.join(lustre_staging, batch_id)
+        artifact_dicts = [
+            move_artifacts(artifact, os.path.join(batch_root, f"sub_{idx}"))
+            for idx, artifact in enumerate(artifact_dicts)
+        ]
+        moved_count = len(artifact_dicts)
+    moved_at = _time.monotonic()
+
+    # n_files counts distinct file ids; n_slices counts entries in `files`.
+    distinct_files = len(set(file_ids))
+    slice_count = len(files)
+    logger.info(
+        "build host=%s batch=%s n_files=%d n_slices=%d n_artifacts=%d "
+        "build=%.2fs move=%.2fs(n=%d) total=%.2fs",
+        hostname,
+        batch_id,
+        distinct_files,
+        slice_count,
+        len(artifact_dicts),
+        built_at - started,
+        moved_at - built_at,
+        moved_count,
+        moved_at - started,
+    )
+    return artifact_dicts, tracker_blob
+
+
+def _scan_gzip_members_task(paths: List[str]) -> List[List[tuple]]:
+    """Worker task: enumerate gzip members for the given subset of files.
+
+    Delegates to the extension's ``enumerate_gzip_members`` with ``None`` as
+    its second argument and returns that result unchanged.
+    """
+    from .dftracer_utils_ext import enumerate_gzip_members as _enumerate
+
+    member_lists = _enumerate(paths, None)
+    return member_lists
+
+
+def distributed_index(
+    directory: str = "",
+    files: Optional[List[str]] = None,
+    index_path: str = "",
+    local_staging: str = "",
+    lustre_staging: str = "",
+    client: Optional["Client"] = None,
+    checkpoint_size: int = 32 * 1024 * 1024,
+    bloom_dimensions: Optional[List[str]] = None,
+    build_manifest: bool = True,
+    force_rebuild: bool = False,
+    partition: str = "lpt",
+    rebuild_root_summaries: bool = True,
+    parallelism_per_worker: int = 0,
+    flush_every_files: int = 0,
+    aggregation_config: Optional[Any] = None,
+) -> Dict[str, Any]:
+    """Index a set of trace files using Dask workers writing SSTs in parallel.
+
+    Steps:
+      1. Enumerate files + sizes (directory scan, or ``os.path.getsize`` on
+         the explicit list).
+      2. Register all files on the coordinator's IndexDatabase in a single
+         ``register_files`` call; the returned file_ids are parallel to the
+         file list.
+      3. Scan gzip member offsets, round-robin across workers (inline when
+         there is no client).
+      4. Plan work units per worker: ``plan_work_units`` for "lpt", or
+         whole files dealt out in turn for "round_robin".
+      5. Run one ``_build_sst_task`` per non-empty worker bucket, writing
+         SSTs to `local_staging` and (if different) moving them to
+         `lustre_staging`.
+      6. Collect artifact dicts into an SstArtifactRegistry; coordinator
+         calls bulk_ingest, optionally rebuild_root_summaries, and writes the
+         aggregation metadata when aggregation artifacts were produced.
+
+    Args:
+        directory: Directory containing trace files.
+        files: Explicit file list (alternative to directory).
+        index_path: Target .dftindex path (coordinator-writable).
+        local_staging: Per-worker SST build dir. If equal to lustre_staging,
+            no post-build move.
+        lustre_staging: Shared FS dir the coordinator reads SSTs from during
+            ingest. Must be on the same filesystem as index_path for the
+            cheapest ingest. Defaults to ``local_staging`` when empty.
+        client: Dask distributed Client. None -> run tasks inline.
+        checkpoint_size: Forwarded unchanged to ``build_sst_batch``
+            (presumably bytes - confirm against the extension).
+        bloom_dimensions: Forwarded unchanged to ``build_sst_batch``.
+        build_manifest: Forwarded to ``register_files`` and ``build_sst_batch``.
+        force_rebuild: Forwarded unchanged to ``build_sst_batch``.
+        partition: "lpt" (greedy longest-processing-time bin-pack) or
+            "round_robin".
+        rebuild_root_summaries: If True, recompute ROOT_* CFs after ingest.
+        parallelism_per_worker: 0 -> let the plugin/default Runtime choose
+            (one coroutine thread per core).
+        flush_every_files: 0 -> build SSTs once per worker; >0 -> flush
+            mid-batch to bound peak memory.
+        aggregation_config: Optional aggregation settings forwarded to the
+            build tasks; its ``time_interval_ms`` attribute is read when the
+            aggregation metadata is written.
+
+    Returns:
+        dict with ``total_files``, ``per_worker`` (distinct file count per
+        worker bucket; empty list when no files were found), ``index_path``
+        and ``artifact_batches`` (number of artifact dicts ingested).
+
+    Raises:
+        ImportError: If dask is not available.
+        ValueError: If ``index_path`` or ``local_staging`` is empty, if
+            neither ``directory`` nor ``files`` is given, or if ``partition``
+            is not a known strategy.
+    """
+    if dask is None:
+        raise ImportError("dask is required for distributed_index")
+    if not index_path:
+        raise ValueError("index_path is required")
+    if not local_staging:
+        raise ValueError("local_staging is required")
+    if not lustre_staging:
+        lustre_staging = local_staging
+    # Fail fast: reject an unknown strategy before anything is registered in
+    # the index database or scanned on the workers.
+    if partition not in ("lpt", "round_robin"):
+        raise ValueError(f"unknown partition={partition}")
+
+    import logging as _logging
+    import time as _time
+
+    from .dftracer_utils_ext import (
+        IndexDatabase as _IndexDatabase,
+    )
+    from .dftracer_utils_ext import (
+        SstArtifactRegistry as _SstArtifactRegistry,
+    )
+    from .dftracer_utils_ext import (
+        enumerate_gzip_members as _enumerate_gzip_members,
+    )
+    from .dftracer_utils_ext import (
+        plan_work_units as _plan_work_units,
+    )
+    from .dftracer_utils_ext import (
+        scan_files as _scan_files,
+    )
+
+    _log = _logging.getLogger("dftracer.utils.dask.distributed_index")
+    if not _log.handlers:
+        # Only touch the level when no handler was attached to this logger.
+        _log.setLevel(_logging.INFO)
+
+    # 1. Enumerate files + sizes.
+    _t0 = _time.monotonic()
+    if files is None:
+        if not directory:
+            raise ValueError("either directory or files is required")
+        _log.info("distributed_index: scan_files(%s)", directory)
+        entries = _scan_files(directory, [".pfw", ".pfw.gz"], True, None)
+    else:
+        _log.info("distributed_index: sizing %d pre-listed files", len(files))
+        entries = [(p, os.path.getsize(p)) for p in files]
+    _log.info("distributed_index: scanned %d files in %.1fs", len(entries), _time.monotonic() - _t0)
+
+    if not entries:
+        # Same keys as the normal return so callers can index unconditionally.
+        return {
+            "total_files": 0,
+            "per_worker": [],
+            "index_path": index_path,
+            "artifact_batches": 0,
+        }
+
+    n_workers = 1
+    if client is not None:
+        n_workers = len(client.nthreads()) or 1
+    _log.info("distributed_index: %d workers visible", n_workers)
+
+    all_paths = [p for (p, _) in entries]
+
+    # 2. Register all files once on coordinator (one register_files call;
+    #    file_ids are then parallel to `entries`).
+    _t1 = _time.monotonic()
+    _log.info("distributed_index: opening IndexDatabase at %s", index_path)
+    db = _IndexDatabase(index_path)
+    db.init_schema()
+    all_file_ids = db.register_files(all_paths, build_manifest)
+    _log.info(
+        "distributed_index: register_files done (%d files, %.1fs)",
+        len(all_paths),
+        _time.monotonic() - _t1,
+    )
+
+    # 3. SCAN: distribute gzip-member scan across workers (round-robin
+    #    per file_idx). Each worker sends back only its 1/N member maps;
+    #    coordinator stitches into the full map.
+    _t2 = _time.monotonic()
+    member_map: List[List[tuple]] = [[] for _ in range(len(entries))]
+    if client is None:
+        member_map = list(_enumerate_gzip_members(all_paths, None))
+    else:
+        worker_addrs = list(client.nthreads().keys())
+        scan_buckets: List[List[int]] = [[] for _ in range(n_workers)]
+        for i in range(len(all_paths)):
+            scan_buckets[i % n_workers].append(i)
+        scan_futs = []
+        # Remember which file indices each future covers so results can be
+        # written back to the right member_map slots.
+        scan_idx_lists: List[List[int]] = []
+        for w, idxs in enumerate(scan_buckets):
+            if not idxs:
+                continue
+            sub_paths = [all_paths[i] for i in idxs]
+            target = [worker_addrs[w % len(worker_addrs)]] if worker_addrs else None
+            scan_idx_lists.append(idxs)
+            scan_futs.append(
+                client.submit(_scan_gzip_members_task, sub_paths, workers=target, pure=False)
+            )
+        scan_results = client.gather(scan_futs)
+        for idxs, res in zip(scan_idx_lists, scan_results):
+            for i, members in zip(idxs, res):
+                member_map[i] = list(members)
+    _log.info(
+        "distributed_index: gzip-member scan done in %.1fs",
+        _time.monotonic() - _t2,
+    )
+
+    # 4. PLAN: deterministic LPT of work units across workers (mirrors MPI).
+    _t3 = _time.monotonic()
+    if partition == "lpt":
+        per_worker_units = _plan_work_units(member_map, n_workers, 0)
+    else:
+        # partition == "round_robin" (validated on entry).
+        # Whole-file fallback for round_robin (no intra-file slicing).
+        per_worker_units = [[] for _ in range(n_workers)]
+        for i, mv in enumerate(member_map):
+            mlen = max(1, len(mv))
+            per_worker_units[i % n_workers].append((i, 0, mlen, 0))
+    _log.info(
+        "distributed_index: planned in %.2fs (per-worker units=%s)",
+        _time.monotonic() - _t3,
+        [len(u) for u in per_worker_units],
+    )
+
+    # 5. BUILD: each worker receives its (paths, file_ids, file_slices)
+    #    parallel lists. A file split across workers appears once per slice.
+    index_dir = os.path.dirname(index_path.rstrip("/"))
+    os.makedirs(local_staging, exist_ok=True)
+    os.makedirs(lustre_staging, exist_ok=True)
+
+    worker_file_lists: List[List[str]] = []
+    worker_file_ids: List[List[int]] = []
+    worker_slices: List[List[Any]] = []
+    # NOTE(review): multiplier applied to the first member index to form the
+    # third slice field - confirm its meaning against build_sst_batch.
+    CKPT_STRIDE = 1 << 20
+    for w, units in enumerate(per_worker_units):
+        paths_w: List[str] = []
+        ids_w: List[int] = []
+        slices_w: List[Any] = []
+        for file_idx, mb, me, _csz in units:
+            paths_w.append(all_paths[file_idx])
+            ids_w.append(int(all_file_ids[file_idx]))
+            members = member_map[file_idx] or [(0, 0)]
+            # Clamp [mb, me) into the actual member vector. plan_work_units
+            # may have synthesised a single (0, 0) for a non-gzip file; in
+            # that case mb=0, me=1 and the slice is "whole file".
+            if me > len(members):
+                me = len(members)
+            if mb > me:
+                mb = me
+            slices_w.append(
+                (
+                    int(mb),
+                    int(me),
+                    int(mb) * CKPT_STRIDE,
+                    bool(mb != 0),
+                    [(int(mo), int(ms)) for (mo, ms) in members],
+                )
+            )
+        worker_file_lists.append(paths_w)
+        worker_file_ids.append(ids_w)
+        worker_slices.append(slices_w)
+
+    _t_build = _time.monotonic()
+    worker_ids: List[int] = []
+    # Each entry is (artifact_dicts, tracker_blob) returned by _build_sst_task.
+    worker_results: List[Any] = []
+    if client is None:
+        for w, (paths_w, ids_w, slices_w) in enumerate(
+            zip(worker_file_lists, worker_file_ids, worker_slices)
+        ):
+            if not paths_w:
+                continue
+            worker_ids.append(w)
+            worker_results.append(
+                _build_sst_task(
+                    paths_w,
+                    ids_w,
+                    slices_w,
+                    local_staging,
+                    lustre_staging,
+                    f"worker_{w}",
+                    index_dir,
+                    checkpoint_size,
+                    bloom_dimensions,
+                    build_manifest,
+                    force_rebuild,
+                    parallelism_per_worker,
+                    flush_every_files,
+                    aggregation_config,
+                    False,  # enable_det_ids: off for the inline path
+                )
+            )
+    else:
+        worker_addrs = list(client.nthreads().keys())
+        futures = []
+        for w, (paths_w, ids_w, slices_w) in enumerate(
+            zip(worker_file_lists, worker_file_ids, worker_slices)
+        ):
+            if not paths_w:
+                continue
+            target = [worker_addrs[w % len(worker_addrs)]] if worker_addrs else None
+            worker_ids.append(w)
+            futures.append(
+                client.submit(
+                    _build_sst_task,
+                    paths_w,
+                    ids_w,
+                    slices_w,
+                    local_staging,
+                    lustre_staging,
+                    f"worker_{w}",
+                    index_dir,
+                    checkpoint_size,
+                    bloom_dimensions,
+                    build_manifest,
+                    force_rebuild,
+                    parallelism_per_worker,
+                    flush_every_files,
+                    aggregation_config,
+                    True,  # enable_det_ids: on for tasks run on Dask workers
+                    workers=target,
+                    pure=False,
+                )
+            )
+        worker_results = client.gather(futures)
+    _log.info(
+        "distributed_index: build dispatch+gather done in %.1fs (%d workers)",
+        _time.monotonic() - _t_build,
+        len(worker_ids),
+    )
+
+    # 6. Bulk-ingest on coordinator (all CFs).
+    _t_collect = _time.monotonic()
+    registry = _SstArtifactRegistry()
+    total_artifacts = 0
+    tracker_blobs: List[bytes] = []
+    has_aggregation = False
+    for wres in worker_results:
+        # Accept a bare list of artifact dicts as well as the
+        # (artifact_dicts, tracker_blob) pair.
+        if isinstance(wres, tuple) and len(wres) == 2:
+            dicts, tracker_blob = wres
+        else:
+            dicts, tracker_blob = wres, b""
+        if tracker_blob:
+            tracker_blobs.append(tracker_blob)
+        for d in dicts:
+            registry.append(d)
+            total_artifacts += 1
+            if isinstance(d, dict) and (d.get("aggregation_sst") or d.get("system_metrics_sst")):
+                has_aggregation = True
+    _log.info(
+        "distributed_index: collected %d artifacts in %.2fs",
+        total_artifacts,
+        _time.monotonic() - _t_collect,
+    )
+
+    _t_ingest = _time.monotonic()
+    db.bulk_ingest(registry)
+    _log.info(
+        "distributed_index: bulk_ingest done in %.1fs (%d artifacts)",
+        _time.monotonic() - _t_ingest,
+        total_artifacts,
+    )
+    if rebuild_root_summaries:
+        _t_root = _time.monotonic()
+        db.rebuild_root_summaries()
+        _log.info(
+            "distributed_index: rebuild_root_summaries done in %.1fs",
+            _time.monotonic() - _t_root,
+        )
+
+    # Aggregation metadata is written only when aggregation was requested and
+    # at least one worker actually produced an aggregation/system-metrics SST.
+    if aggregation_config is not None and has_aggregation:
+        _t_meta = _time.monotonic()
+        time_interval_ms = getattr(aggregation_config, "time_interval_ms", 0) or 0
+        time_interval_us = int(round(time_interval_ms * 1000.0))
+        db.write_agg_global_config(time_interval_us=time_interval_us)
+        if all_file_ids:
+            db.write_agg_file_markers(list(all_file_ids))
+        if tracker_blobs:
+            db.write_aggregation_tracker(tracker_blobs)
+        _log.info(
+            "distributed_index: agg meta writes done in %.2fs (markers=%d, trackers=%d)",
+            _time.monotonic() - _t_meta,
+            len(all_file_ids),
+            len(tracker_blobs),
+        )
+
+    # A file split into several slices on one worker is counted once there.
+    per_worker_file_counts = [len(set(ids)) for ids in worker_file_ids]
+    return {
+        "total_files": len(entries),
+        "per_worker": per_worker_file_counts,
+        "index_path": index_path,
+        "artifact_batches": total_artifacts,
+    }
diff --git a/python/dftracer/utils/dftracer_utils_ext.pyi b/python/dftracer/utils/dftracer_utils_ext.pyi
index c9324a74..b03572e7 100644
--- a/python/dftracer/utils/dftracer_utils_ext.pyi
+++ b/python/dftracer/utils/dftracer_utils_ext.pyi
@@ -1,7 +1,44 @@
 """Type stubs for dftracer_utils_ext module."""
 
 from types import TracebackType
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+
+from .arrow import ArrowTable
+
+class _ArrowBatchCapsule:
+    """Internal Arrow batch wrapper implementing __arrow_c_array__ protocol."""
+
+    @property
+    def num_rows(self) -> int: ...
+    @property
+    def num_columns(self) -> int: ...
+    def __arrow_c_array__(self, requested_schema: Any = None) -> Tuple[Any, Any]: ...
+
+class _ArrowBatchStream:
+    """Zero-iteration Arrow stream backed by the C++ coroutine channel.
+
+    Implements the Arrow C Data Interface stream protocol. Pass directly
+    to ``pyarrow.RecordBatchReader.from_stream()`` or ``pyarrow.table()``.
+    Single-use: consuming ``__arrow_c_stream__`` once exhausts the object.
+    """
+
+    def __arrow_c_stream__(self, requested_schema: Any = None) -> Any: ...
+
+class JsonDictValue:
+    """Zero-copy wrapper over a parsed DFTracer JSON event.
+
+    Supports dict-like access: ``event['name']``, ``event['args']['ret']``.
+    Call ``.to_dict()`` to materialize a regular Python dict.
+    """
+
+    def __getitem__(self, key: str) -> Any: ...
+    def __len__(self) -> int: ...
+    def __contains__(self, key: str) -> bool: ...
+    def keys(self) -> List[str]: ...
+    def values(self) -> List[Any]: ...
+    def items(self) -> List[Tuple[str, Any]]: ...
+    def get(self, key: str, default: Any = None) -> Any: ...
+    def to_dict(self) -> Dict[str, Any]: ...
 
 # ========== INDEXER ==========
 
@@ -17,7 +54,185 @@ class IndexerCheckpoint:
     num_lines: int
 
 class Indexer:
-    """Indexer for creating and managing root-local ``.dftindex`` stores."""
+    """Indexer with resolve/build pattern for tiered indexing."""
+
+    def __init__(
+        self,
+        directory: str = "",
+        files: Optional[List[str]] = None,
+        index_dir: str = "",
+        require_checkpoint: bool = True,
+        require_bloom: bool = True,
+        require_manifest: bool = True,
+        require_aggregation: bool = False,
+        time_interval_ms: float = 5000.0,
+        group_keys: Optional[List[str]] = None,
+        custom_metric_fields: Optional[List[str]] = None,
+        compute_percentiles: bool = False,
+        checkpoint_size: int = 32 * 1024 * 1024,
+        parallelism: int = 0,
+        force_rebuild: bool = False,
+        runtime: Optional["Runtime"] = None,
+    ) -> None:
+        """Create an indexer for trace files.
+
+        At least one of 'directory' or 'files' must be provided.
+
+        Args:
+            directory: Path to the directory containing trace files.
+            files: List of specific file paths to index.
+            index_dir: Directory for `.dftindex` stores; empty uses directory-local paths.
+            require_checkpoint: If True, build checkpoint index (tier 1).
+            require_bloom: If True, build bloom filter data (tier 2).
+            require_manifest: If True, build manifest data (tier 2).
+            require_aggregation: If True, build aggregation data (tier 3).
+            time_interval_ms: Time interval for aggregation in milliseconds.
+            group_keys: Keys to group by for aggregation.
+            custom_metric_fields: Custom metric fields for aggregation.
+            compute_percentiles: If True, compute percentiles during aggregation.
+            checkpoint_size: Checkpoint interval in bytes for index building (default 32 MB).
+            parallelism: Number of parallel indexers. 0 = auto.
+            force_rebuild: If True, rebuild indices even if they exist.
+            runtime: Runtime instance for thread pool control.
+        """
+        ...
+
+    def resolve(self) -> Dict[str, Any]:
+        """Resolve which files need indexing.
+
+        Returns:
+            Dictionary with 'ready' and 'needs_work' file lists.
+        """
+        ...
+
+    def build(self) -> Dict[str, Any]:
+        """Build indices for files that need work.
+
+        Returns:
+            Dictionary with build status and statistics.
+        """
+        ...
+
+    def ensure_indexed(self) -> Dict[str, Any]:
+        """Ensure all files are indexed by calling resolve then build if needed.
+
+        Returns:
+            Dictionary with 'ready' and 'needs_work' file lists after indexing.
+        """
+        ...
+
+    def get_checkpoint_indexer(self, file_path: str) -> "CheckpointIndexer":
+        """Get a checkpoint indexer for a specific file.
+
+        Args:
+            file_path: Path to the trace file (.pfw/.pfw.gz).
+
+        Returns:
+            CheckpointIndexer instance for checkpoint-level operations.
+        """
+        ...
+
+    def get_hash_table(self, hash_type: str) -> Dict[str, str]:
+        """Get hash table mapping hash values to original strings.
+
+        Args:
+            hash_type: Type of hash table ('file', 'host', or 'string').
+
+        Returns:
+            Dict mapping hash strings to original values.
+
+        Raises:
+            ValueError: If hash_type is not valid.
+        """
+        ...
+
+    def query_file_pids(self, file_id: int) -> set:
+        """Query PIDs observed in a specific file.
+
+        Args:
+            file_id: File identifier (0-based index).
+
+        Returns:
+            Set of PIDs (int) observed in the file.
+        """
+        ...
+
+    def query_all_file_pids(self) -> Dict[int, set]:
+        """Query all file-to-PIDs mappings.
+
+        Returns:
+            Dict mapping file_id to set of PIDs observed in that file.
+        """
+        ...
+
+    def query_file_info(self) -> Tuple[Dict[int, str], Dict[int, set]]:
+        """Query file ID to path mapping and per-file PIDs in one call.
+
+        Returns:
+            Tuple of (file_id_to_path, file_pids).
+        """
+        ...
+
+    def iter_aggregation(self, type: str = "events", batch_size: int = 10000) -> Iterator[Any]:
+        """Iterate over aggregation data as Arrow batches.
+
+        Args:
+            type: 'events', 'profiles', or 'system'
+            batch_size: Number of entries per batch (default 10000)
+
+        Returns:
+            Iterator over Arrow batch capsules.
+        """
+        ...
+
+    def iter_arrow_dfanalyzer(
+        self,
+        type: str = "events",
+        batch_size: int = 10000,
+        time_granularity: float = 1.0,
+        time_resolution: float = 1e6,
+        query: Optional[str] = None,
+    ) -> Iterator[Any]:
+        """Iterate over aggregation data as dfanalyzer-compatible Arrow batches.
+
+        Args:
+            type: 'events', 'profiles', or 'system'
+            batch_size: Number of entries per batch (default 10000)
+            time_granularity: Bucket width in seconds (default 1.0)
+            time_resolution: Microseconds per output time unit (default 1e6)
+            query: Optional query filter (e.g., "pid == 1234 or pid == 5678")
+
+        Returns:
+            Iterator over Arrow batch capsules with dfanalyzer schema.
+        """
+        ...
+
+    def iter_arrow_dfanalyzer_all(
+        self,
+        batch_size: int = 10000,
+        time_granularity: float = 1.0,
+        time_resolution: float = 1e6,
+        query: Optional[str] = None,
+        group_by: Optional[List[str]] = None,
+    ) -> Dict[str, List[Any]]:
+        """Iterate over all aggregation types in a single scan.
+
+        Args:
+            batch_size: Number of entries per batch (default 10000)
+            time_granularity: Bucket width in seconds (default 1.0)
+            time_resolution: Microseconds per output time unit (default 1e6)
+            query: Optional query filter (e.g., "pid == 1234 or pid == 5678")
+            group_by: Optional list of columns to group by for coarse in-scan
+                aggregation. When provided, output schema is reduced to the
+                requested group columns plus aggregated metrics.
+
+        Returns:
+            Dict with 'events', 'profiles', 'system' keys containing Arrow batches.
+        """
+        ...
+
+class CheckpointIndexer:
+    """Checkpoint indexer for single-file checkpoint-level operations."""
 
     def __init__(
         self,
@@ -27,10 +242,9 @@ class Indexer:
         force_rebuild: bool = False,
         build_bloom: bool = False,
         build_manifest: bool = False,
-        index_threshold: int = 8388608,
         runtime: Optional["Runtime"] = None,
     ) -> None:
-        """Create an indexer for a gzip file.
+        """Create a checkpoint indexer for a gzip file.
 
         Args:
             gz_path: Path to the gzip trace file.
@@ -40,9 +254,6 @@ class Indexer:
             force_rebuild: If True, rebuild the index even if it exists.
             build_bloom: If True, build bloom filter data in the index.
             build_manifest: If True, build manifest data in the index.
-            index_threshold: Skip bloom/manifest for files smaller than this
-                (bytes). Set to 0 to disable the threshold and force
-                indexing regardless of file size.
             runtime: Runtime instance for thread pool control.
                 If None, uses the default global Runtime.
         """
@@ -109,7 +320,7 @@ class Indexer:
         """Whether manifest data exists in the `.dftindex` store."""
         ...
 
-    def __enter__(self) -> "Indexer":
+    def __enter__(self) -> "CheckpointIndexer":
         """Enter the runtime context for the with statement."""
         ...
 
@@ -126,140 +337,6 @@ class Indexer:
         """
         ...
 
-# ========== JSON ==========
-
-# Type aliases for JSON values
-_JSONPrimitive = Union[str, int, float, bool, None]
-
-class JSON:
-    """Lazy JSON object that parses on demand using yyjson.
-
-    This implementation provides lazy nested navigation for memory efficiency:
-    - Nested objects/arrays return JSON wrappers (lazy, no conversion)
-    - Primitives (str, int, float, bool, None) are converted immediately
-
-    Example:
-        json_obj = JSON('{"args": {"hhash": "abc"}, "pid": 42}')
-        args = json_obj["args"]  # Returns JSON wrapper (lazy, ~48 bytes)
-        hhash = args["hhash"]     # Returns str (converted)
-        pid = json_obj["pid"]     # Returns int (converted)
-    """
-
-    def __init__(self, json_str: str) -> None:
-        """Create a JSON object from a JSON string.
-
-        The JSON string is stored but not parsed until first access.
-        """
-        ...
-
-    def __contains__(self, key: str) -> bool:
-        """Check if key exists in JSON object."""
-        ...
-
-    def __getitem__(self, key: str) -> Union[_JSONPrimitive, "JSON"]:
-        """Get value by key, raises KeyError if not found.
-
-        Returns:
-            - JSON wrapper for nested objects/arrays (lazy evaluation)
-            - Primitive Python types for values (str, int, float, bool, None)
-
-        Example:
-            obj["nested_object"]  # Returns JSON (lazy wrapper)
-            obj["string_field"]    # Returns str
-            obj["number_field"]    # Returns int or float
-        """
-        ...
-
-    def get(
-        self,
-        key: str,
-        default: Union[_JSONPrimitive, "JSON"] = None,
-    ) -> Union[_JSONPrimitive, "JSON"]:
-        """Get value by key with optional default.
-
-        Returns:
-            - JSON wrapper for nested objects/arrays (lazy evaluation)
-            - Primitive Python types for values
-            - default if key not found
-        """
-        ...
-
-    def keys(self) -> List[str]:
-        """Get all keys from JSON object (only for object types)."""
-        ...
-
-    def values(self) -> List[Union[_JSONPrimitive, "JSON"]]:
-        """Get all values from JSON object (only for object types).
-
-        Returns:
-            - List of values, where nested objects/arrays are JSON wrappers (lazy)
-            - Primitives are converted to Python types
-        """
-        ...
-
-    def items(self) -> List[Tuple[str, Union[_JSONPrimitive, "JSON"]]]:
-        """Get all key-value pairs from JSON object (only for object types).
-
-        Returns:
-            - List of (key, value) tuples
-            - Nested objects/arrays are JSON wrappers (lazy)
-            - Primitives are converted to Python types
-        """
-        ...
-
-    def __len__(self) -> int:
-        """Return the number of key-value pairs in the JSON object.
-
-        Returns 0 if the root is not an object.
-        """
-        ...
-
-    def __bool__(self) -> bool:
-        """Return True if the JSON object is non-empty, False otherwise.
-
-        Returns:
-            - True if object has at least one key-value pair
-            - False if object is empty or root is not an object
-        """
-        ...
-
-    def unwrap(self) -> Union[Dict[str, Any], List[Any], _JSONPrimitive]:
-        """Unwrap the lazy JSON object into native Python dict/list.
-
-        Unlike lazy access via obj[key], this method fully converts the entire
-        JSON structure to native Python objects:
-        - JSON objects -> Python dicts
-        - JSON arrays -> Python lists
-        - Primitives -> Python types (str, int, float, bool, None)
-
-        Returns:
-            Fully converted Python object (dict, list, or primitive)
-        """
-        ...
-
-    def copy(self) -> "JSON":
-        """Return a shallow copy of the JSON object.
-
-        For subtree wrappers: Creates a new wrapper pointing to the same subtree
-        For top-level objects: Creates a new JSON object from the same data
-
-        Returns:
-            New JSON object
-        """
-        ...
-
-    def __str__(self) -> str:
-        """Return the JSON string representation.
-
-        For top-level objects: returns original JSON string
-        For subtrees: serializes the subtree to JSON
-        """
-        ...
-
-    def __repr__(self) -> str:
-        """Return string representation of the object."""
-        ...
-
 # ========== TASK HANDLE ==========
 
 class TaskHandle:
@@ -296,7 +373,7 @@ class Runtime:
     which adds submit(), Python callable support, and error handling.
     """
 
-    def __init__(self, threads: int = 0) -> None: ...
+    def __init__(self, threads: int = 0, io_threads: int = 0) -> None: ...
     def shutdown(self) -> None: ...
     def wait_all(self) -> None: ...
     def get_progress(self) -> Dict[str, Any]: ...
@@ -305,6 +382,8 @@ class Runtime:
     def set_default_task_timeout(self, ms: int = 0) -> None: ...
     @property
     def threads(self) -> int: ...
+    @property
+    def io_threads(self) -> int: ...
     def __enter__(self) -> "Runtime": ...
     def __exit__(
         self,
@@ -323,26 +402,25 @@ class TraceReader:
 
     def __init__(
         self,
-        file_path: str,
+        path: str,
         index_dir: str = "",
         checkpoint_size: int = 33554432,
         auto_build_index: bool = False,
-        index_threshold: int = 8388608,
-        runtime: Optional[Runtime] = None,
+        runtime: Optional[Union[Runtime, object]] = None,
     ) -> None:
         """Create a TraceReader.
 
         Args:
-            file_path: Path to the trace file (.pfw.gz or plain text).
+            path: Path to a trace file (.pfw/.pfw.gz) or a directory.
+                When a directory is given, all iter/read methods discover
+                .pfw and .pfw.gz files recursively and process them in
+                parallel on the Runtime thread pool.
             index_dir: Directory to search for ``.dftindex`` stores.
                 Empty string (default) searches next to the trace file.
             checkpoint_size: Checkpoint interval in bytes for index
                 building (default 32 MB).
             auto_build_index: If True, automatically build an index
-                when none exists and the file exceeds *index_threshold*.
-            index_threshold: Minimum file size in bytes before
-                auto-indexing is triggered (default 8 MB). Set to 0
-                to disable the threshold and always build an index.
+                when none exists.
             runtime: Runtime instance for thread pool control.
                 If None, uses the default global Runtime.
 
@@ -359,7 +437,7 @@ class TraceReader:
         end_byte: int = 0,
         buffer_size: int = 4194304,
         query: Optional[str] = None,
-    ) -> List[str]:
+    ) -> List[memoryview]:
         """Read lines from the trace file and return as a list.
 
         Lines are 1-indexed. Pass ``start_line=0, end_line=0`` (the
@@ -376,7 +454,8 @@ class TraceReader:
         end_byte: int = 0,
         buffer_size: int = 4194304,
         query: Optional[str] = None,
-    ) -> Iterator[str]:
+        memory_budget: int = 0,
+    ) -> Iterator[memoryview]:
         """Return a streaming iterator over decoded lines.
 
         The C++ coroutine runs on the Runtime thread pool and pushes
@@ -384,70 +463,74 @@ class TraceReader:
         """
         ...
 
-    def iter_raw(
+    def iter_json(
         self,
         start_line: int = 0,
         end_line: int = 0,
         start_byte: int = 0,
         end_byte: int = 0,
-        line_aligned: bool = True,
-        multi_line: bool = True,
         buffer_size: int = 4194304,
         query: Optional[str] = None,
-    ) -> Iterator[bytes]:
-        """Return a streaming iterator over raw byte chunks.
+        batch_size: int = 1024,
+        memory_budget: int = 0,
+    ) -> Iterator["JsonDictValue"]:
+        """Return a streaming iterator over parsed JSON events.
 
-        When ``query`` is set and an index exists, chunk-level pruning
-        skips non-matching chunks. No per-event filtering is applied.
+        Each event is parsed once in C++ and yielded as a zero-copy
+        :class:`JsonDictValue` wrapper. No double-parsing overhead.
         """
         ...
 
-    def read_raw(
+    def read_json(
         self,
         start_line: int = 0,
         end_line: int = 0,
         start_byte: int = 0,
         end_byte: int = 0,
-        line_aligned: bool = True,
-        multi_line: bool = True,
         buffer_size: int = 4194304,
         query: Optional[str] = None,
-    ) -> List[bytes]:
-        """Read raw byte chunks and return as a list.
+        batch_size: int = 1024,
+    ) -> List["JsonDictValue"]:
+        """Read all events as parsed :class:`JsonDictValue` wrappers (list).
 
-        When ``query`` is set and an index exists, chunk-level pruning
-        skips non-matching chunks. No per-event filtering is applied.
+        Equivalent to ``list(iter_json(...))``.
         """
         ...
 
-    def iter_lines_json(
+    def iter_raw(
         self,
         start_line: int = 0,
         end_line: int = 0,
         start_byte: int = 0,
         end_byte: int = 0,
+        line_aligned: bool = True,
+        multi_line: bool = True,
         buffer_size: int = 4194304,
         query: Optional[str] = None,
-    ) -> Iterator["JSON"]:
-        """Return iterator over parsed JSON objects.
+        memory_budget: int = 0,
+    ) -> Iterator[memoryview]:
+        """Return a streaming iterator over raw byte chunks.
 
-        Skips non-JSON lines (array delimiters like ``[`` and ``]``).
-        Each yielded item is a lazy :class:`JSON` object.
+        When ``query`` is set and an index exists, chunk-level pruning
+        skips non-matching chunks. No per-event filtering is applied.
         """
         ...
 
-    def read_lines_json(
+    def read_raw(
         self,
         start_line: int = 0,
         end_line: int = 0,
         start_byte: int = 0,
         end_byte: int = 0,
+        line_aligned: bool = True,
+        multi_line: bool = True,
         buffer_size: int = 4194304,
         query: Optional[str] = None,
-    ) -> List["JSON"]:
-        """Read lines and return as list of parsed JSON objects.
+    ) -> List[memoryview]:
+        """Read raw byte chunks and return as a list.
 
-        Equivalent to ``list(self.iter_lines_json(...))``.
+        When ``query`` is set and an index exists, chunk-level pruning
+        skips non-matching chunks. No per-event filtering is applied.
         """
         ...
 
@@ -460,21 +543,42 @@ class TraceReader:
         end_byte: int = 0,
         buffer_size: int = 4194304,
         query: Optional[str] = None,
-    ) -> Iterator[Any]:
+        flatten_objects: bool = False,
+        normalize: bool = False,
+        memory_budget: int = 0,
+    ) -> Iterator["_ArrowBatchCapsule"]:
         """Return iterator over Arrow record batches.
 
         Each batch is an ``_ArrowBatchCapsule`` implementing the Arrow
         PyCapsule protocol (``__arrow_c_array__``).  Wrap with
         :class:`~dftracer.utils.arrow.ArrowBatch` for convenience
         methods, or pass directly to ``pyarrow.record_batch()``.
+        """
+        ...
 
-        Args:
-            batch_size (int): Maximum rows per Arrow batch.
-            start_line (int): First line (0 = beginning).
-            end_line (int): Last line (0 = end of file).
-            start_byte (int): First byte offset (0 = beginning).
-            end_byte (int): Last byte offset (0 = end of file).
-            buffer_size (int): Internal read buffer size in bytes.
+    def iter_arrow_stream(
+        self,
+        batch_size: int = 10000,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+        flatten_objects: bool = False,
+        normalize: bool = False,
+        memory_budget: int = 0,
+    ) -> "_ArrowBatchStream":
+        """Return an Arrow C Data Interface stream over record batches.
+
+        PyArrow can drain the producer channel in a single C-side call::
+
+            rbr = pa.RecordBatchReader.from_stream(reader.iter_arrow_stream())
+            for batch in rbr:
+                ...
+
+        Equivalent data to :meth:`iter_arrow`, but without per-batch
+        Python ↔ C transitions.
         """
         ...
 
@@ -487,19 +591,13 @@ class TraceReader:
         end_byte: int = 0,
         buffer_size: int = 4194304,
         query: Optional[str] = None,
-    ) -> Any:
+        flatten_objects: bool = False,
+        normalize: bool = False,
+    ) -> "ArrowTable":
         """Read all events as an ArrowTable.
 
         Equivalent to collecting all batches from :meth:`iter_arrow`
         into an :class:`~dftracer.utils.arrow.ArrowTable`.
-
-        Args:
-            batch_size (int): Maximum rows per Arrow batch.
-            start_line (int): First line (0 = beginning).
-            end_line (int): Last line (0 = end of file).
-            start_byte (int): First byte offset (0 = beginning).
-            end_byte (int): Last byte offset (0 = end of file).
-            buffer_size (int): Internal read buffer size in bytes.
         """
         ...
 
@@ -521,8 +619,8 @@ class TraceReader:
         ...
 
     @property
-    def file_path(self) -> str:
-        """Path to the trace file."""
+    def path(self) -> str:
+        """Path to the trace file or directory."""
         ...
 
     @property
@@ -540,6 +638,95 @@ class TraceReader:
         """Total line count (reads all lines to compute if needed)."""
         ...
 
+    def write_arrow(
+        self,
+        path: str,
+        views: Optional[List[Union[str, Dict[str, Any]]]] = None,
+        chunk_size_mb: int = 32,
+        compression: str = "zstd",
+        batch_size: int = 10000,
+    ) -> Dict[str, Any]:
+        """Write trace data to Arrow IPC files with optional view-based partitioning.
+
+        Args:
+            path: Output directory for Arrow IPC files.
+            views: List of view definitions. Each can be:
+                - A string: predefined view name ('io', 'compute', 'dlio')
+                - A dict with 'name' and optional 'query', 'include_metadata'
+                If None, writes all events to 'all' partition.
+            chunk_size_mb: Maximum uncompressed size per file in MB.
+            compression: 'zstd' or 'none'.
+            batch_size: Events per Arrow batch.
+
+        Returns:
+            Dict with partitions, total_rows, total_bytes, chunks_scanned, chunks_skipped.
+        """
+        ...
+
+    def get_view_chunks(
+        self,
+        view: Optional[Union[str, Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
+        """Get candidate chunks for a view after bloom filter pruning.
+
+        Args:
+            view: View definition (string or dict with 'name' and optional 'query').
+
+        Returns:
+            Dict with chunks list, total_checkpoints, skipped_checkpoints, file_may_match.
+        """
+        ...
+
+    def write_view_chunk(
+        self,
+        output_file: str,
+        checkpoint_idx: int,
+        start_byte: int,
+        end_byte: int,
+        view: Optional[Union[str, Dict[str, Any]]] = None,
+        compression: str = "zstd",
+        batch_size: int = 10000,
+    ) -> Dict[str, Any]:
+        """Write a single chunk to an Arrow IPC file.
+
+        Args:
+            output_file: Path to output Arrow IPC file.
+            checkpoint_idx: Checkpoint index.
+            start_byte: Start byte offset.
+            end_byte: End byte offset.
+            view: View definition.
+            compression: 'zstd' or 'none'.
+            batch_size: Events per batch.
+
+        Returns:
+            Dict with output_file, events_matched, rows_written, bytes_written.
+        """
+        ...
+
+    def write_view_chunks(
+        self,
+        chunks: List[Dict[str, Any]],
+        output_dir: str,
+        view: Optional[Union[str, Dict[str, Any]]] = None,
+        compression: str = "zstd",
+        batch_size: int = 10000,
+    ) -> Dict[str, Any]:
+        """Write multiple chunks to Arrow IPC files in parallel.
+
+        All chunks are processed concurrently on the Runtime thread pool.
+
+        Args:
+            chunks: List of dicts with checkpoint_idx, start_byte, end_byte.
+            output_dir: Directory for output Arrow IPC files.
+            view: View definition.
+            compression: 'zstd' or 'none'.
+            batch_size: Events per batch.
+
+        Returns:
+            Dict with results list, total_rows, total_events_matched.
+        """
+        ...
+
     def __enter__(self) -> "TraceReader":
         """Enter the runtime context for the with statement."""
         ...
@@ -733,3 +920,205 @@ class ComparatorUtility:
         force_rebuild: bool = False,
         config: str = "",
     ) -> str: ...
+
+# ========== ARROW PARALLEL READER ==========
+
+def read_arrow_files_parallel(
+    paths: List[str],
+    runtime: Optional[Runtime] = None,
+) -> Dict[str, Any]:
+    """Read multiple Arrow IPC files in parallel using the Runtime.
+
+    Args:
+        paths: List of file paths to read.
+        runtime: Optional Runtime object. Uses default if not provided.
+
+    Returns:
+        dict with:
+            - file_results: List of per-file results, each with:
+                - path: File path
+                - success: True if read succeeded
+                - error: Error message if failed, else None
+                - total_rows: Number of rows in file
+                - batches: List of ArrowBatch objects
+            - total_rows: Total rows across all files
+            - total_batches: Total batches across all files
+            - files_read: Number of files read successfully
+            - files_failed: Number of files that failed
+    """
+    ...
+
+# ========== DISTRIBUTED INDEX (SST-based) ==========
+
+class IndexDatabase:
+    """Handle to a .dftindex RocksDB store.
+
+    Used by the distributed indexer coordinator to pre-register files,
+    reserve file_id ranges, bulk-ingest worker-produced SSTs, and rebuild
+    root summaries.
+    """
+
+    def __init__(self, index_path: str) -> None: ...
+    def init_schema(self) -> None: ...
+    def register_files(self, paths: List[str], build_manifest: bool = False) -> List[int]:
+        """Register each path in the DEFAULT-CF file registry and return
+        the assigned file_ids (parallel to `paths`). Idempotent for files
+        with matching hash."""
+        ...
+
+    def reserve_file_id_range(self, count: int) -> int:
+        """Atomically reserve `count` contiguous file_ids; return first."""
+        ...
+
+    def bulk_ingest(
+        self,
+        registry: "SstArtifactRegistry",
+        skip_cfs: Optional[Iterable[str]] = None,
+    ) -> None:
+        """Ingest all SSTs collected in the registry.
+
+        skip_cfs is an optional iterable of CF names whose SSTs are left
+        outside the unified DB. Distributed builds pass
+        {"aggregation", "system_metrics"} to keep per-worker AGG/SYS SSTs
+        addressable via `agg_manifest.json` for parallel reads at analyze
+        time. See `dftracer.utils.dask.consolidate_index` to fold them
+        back into the unified DB later.
+        """
+        ...
+
+    def rebuild_root_summaries(self) -> None:
+        """Recompute ROOT_* summary column families from per-file CFs."""
+        ...
+
+    def write_agg_global_config(self, time_interval_us: int, config_hash: int = 0) -> None:
+        """Write the aggregation global-config marker into the AGGREGATION CF.
+
+        Required for `Indexer.iter_arrow_dfanalyzer_all` on distributed
+        builds (which never materialise the key via worker SSTs) and
+        post-consolidate indices.
+        """
+        ...
+
+    def write_agg_file_markers(self, file_ids: Iterable[int]) -> None:
+        """Write per-file aggregation completion markers into the AGGREGATION CF.
+
+        Each marker is ``\\xFF\\xFF + file_id_be32``. The index resolver uses
+        their presence to decide whether each file has aggregated data; if
+        missing, ``ensure_indexed()`` concludes the aggregation tier is
+        incomplete and re-runs the entire build. ``distributed_index`` must
+        call this after ``bulk_ingest`` so subsequent ``read_trace`` calls
+        do not redundantly re-aggregate.
+        """
+        ...
+
+    def write_aggregation_tracker(self, blobs: List[bytes]) -> None:
+        """Merge serialized AssociationTracker blobs and write the result
+        to the AGGREGATION CF under the ``__tracker__`` key."""
+        ...
+
+class SstArtifactRegistry:
+    """Thread-safe collector for SST artifact paths produced by workers."""
+
+    def __init__(self) -> None: ...
+    def append(self, artifacts_dict: Dict[str, Optional[str]]) -> None:
+        """Add a per-batch Artifacts dict as returned by `build_sst_batch`."""
+        ...
+
+def build_sst_batch(
+    files: List[str],
+    file_ids: List[int],
+    staging_dir: str,
+    batch_id: str,
+    index_dir: str = "",
+    checkpoint_size: int = 33554432,
+    build_manifest: bool = False,
+    force_rebuild: bool = False,
+    bloom_dimensions: Optional[List[str]] = None,
+    parallelism: int = 0,
+    flush_every_files: int = 0,
+    runtime: Optional[Union[Runtime, object]] = None,
+    aggregation_config: Optional[Any] = None,
+    file_slices: Optional[List[Optional[Tuple[int, int, int, bool, List[Tuple[int, int]]]]]] = None,
+) -> Tuple[List[Dict[str, Optional[str]]], bytes]:
+    """Run the indexer pipeline with an SST sink. Returns
+    `(artifact_dicts, tracker_blob)`. `tracker_blob` is the serialized
+    merged AssociationTracker for the batch (empty bytes when
+    `aggregation_config` is None). `file_slices` enables intra-file
+    parallelism; entries are `None` (whole file) or
+    `(member_begin, member_end, checkpoint_idx_base,
+    skip_file_scoped_writes, members)`."""
+    ...
+
+def plan_lpt_partition(
+    entries: List[Tuple[str, int]], num_workers: int
+) -> List[List[Tuple[str, int]]]:
+    """Greedy LPT bin-packing of (path, size) tuples into num_workers
+    buckets, minimising the maximum per-worker total size."""
+    ...
+
+def scan_files(
+    directory: str,
+    patterns: Optional[List[str]] = None,
+    recursive: bool = False,
+    runtime: Optional[Union[Runtime, object]] = None,
+) -> List[Tuple[str, int]]:
+    """Parallel directory scan returning (path, size) tuples for regular
+    files matching the patterns."""
+    ...
+
+def enable_aggregation_deterministic_ids() -> None:
+    """Flip the global aggregation StringIntern into deterministic-id mode
+    so the same string maps to the same 32-bit id in every worker process."""
+    ...
+
+def move_artifacts(artifacts: Dict[str, Optional[str]], dest_dir: str) -> Dict[str, Optional[str]]:
+    """Move every populated SST in `artifacts` into `dest_dir` via the
+    C++ rename/copy helper, returning a fresh dict with the new paths."""
+    ...
+
+def enumerate_gzip_members(
+    files: List[str],
+    runtime: Optional[Union[Runtime, object]] = None,
+) -> List[List[Tuple[int, int]]]:
+    """Cooperative async scan of gzip member offsets. Returns lists of
+    `(c_offset, c_size)` parallel to `files`; empty for non-gzip files."""
+    ...
+
+def plan_work_units(
+    member_map: List[List[Tuple[int, int]]],
+    num_workers: int,
+    target_c_size: int = 0,
+) -> List[List[Tuple[int, int, int, int]]]:
+    """Deterministic LPT assignment of intra-file gzip-member slices across
+    workers. Returns per-worker lists of
+    `(file_idx, member_begin, member_end, c_size)`."""
+    ...
+
+def scan_aggregation_manifest(
+    agg_ssts: List[str],
+    sys_ssts: List[str],
+    scratch_dir: str,
+    meta_index_path: str,
+    batch_size: int = 10000,
+    time_granularity: float = 1.0,
+    time_resolution: float = 1e6,
+    query: Optional[str] = None,
+    group_by: Optional[List[str]] = None,
+    shard_begin: int = 0,
+    shard_end: int = 4096,
+    runtime: Optional[Union[Runtime, object]] = None,
+    file_hashes: Optional[Dict[str, str]] = None,
+    host_hashes: Optional[Dict[str, str]] = None,
+) -> Dict[str, List[_ArrowBatchCapsule]]:
+    """Scan a worker's slice of the distributed aggregation manifest.
+
+    Ingests `agg_ssts` + `sys_ssts` into a scratch IndexDatabase at
+    `scratch_dir` (caller owns the directory lifecycle) and runs the
+    dfanalyzer aggregation scan over `[shard_begin, shard_end)`.
+    `meta_index_path` is the unified .dftindex used to resolve file /
+    host hashes.
+
+    Returns the same dict shape as `Indexer.iter_arrow_dfanalyzer_all`:
+    `{"events": [...], "profiles": [...], "system": [...]}`.
+    """
+    ...
diff --git a/python/dftracer/utils/indexer.py b/python/dftracer/utils/indexer.py
new file mode 100644
index 00000000..c36754f6
--- /dev/null
+++ b/python/dftracer/utils/indexer.py
@@ -0,0 +1,371 @@
+"""Indexer utilities for building and managing trace indexes."""
+
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+from .dftracer_utils_ext import CheckpointIndexer as _NativeCheckpointIndexer
+from .dftracer_utils_ext import Indexer as _NativeIndexer
+from .runtime import Runtime
+
+DEFAULT_CHECKPOINT_SIZE = 32 * 1024 * 1024  # 32MB
+
+FileInfo = Tuple[Dict[int, str], Dict[int, Set[int]]]
+
+
+@dataclass
+class AggregationConfig:
+    """Configuration for aggregation tier indexing.
+
+    Attributes:
+        time_interval_ms: Time bucket size in milliseconds (default 5000).
+        group_keys: Extra grouping dimensions (default None).
+        custom_metric_fields: Extra numeric args fields to aggregate (default None).
+        compute_percentiles: Enable percentile sketch collection (default False).
+    """
+
+    time_interval_ms: float = 5000.0
+    group_keys: Optional[List[str]] = None
+    custom_metric_fields: Optional[List[str]] = None
+    compute_percentiles: bool = False
+
+
+@dataclass
+class IndexStatus:
+    """Status of index resolution.
+
+    Attributes:
+        total_files: Total number of files discovered.
+        ready: Files that are fully indexed for requested tiers.
+        needs_work: Files that need indexing.
+        index_path: Path to the .dftindex store.
+    """
+
+    total_files: int
+    ready: List[str] = field(default_factory=list)
+    needs_work: List[str] = field(default_factory=list)
+    index_path: str = ""
+
+
+class Indexer:
+    """High-level indexer for building and managing trace indexes.
+
+    Supports tiered indexing:
+    - Tier 1: Checkpoints (for random access)
+    - Tier 2: Bloom filters and manifests (for fast filtering)
+    - Tier 3: Aggregation data (config-dependent)
+
+    At least one of 'directory' or 'files' must be provided.
+
+    Args:
+        directory: Directory containing trace files (.pfw/.pfw.gz).
+        files: List of specific file paths to index.
+        index_dir: Directory for .dftindex stores (default: next to files).
+        require_checkpoint: Build checkpoint tier (default True).
+        require_bloom: Build bloom filter tier (default True).
+        require_manifest: Build manifest tier (default True).
+        require_aggregation: Aggregation config or True for defaults (default None).
+        parallelism: Number of parallel workers (0 = all cores).
+        force_rebuild: Force rebuild even if index exists.
+        runtime: Runtime for executor parallelism (default: global runtime).
+
+    Example:
+        >>> indexer = Indexer("/path/to/traces")
+        >>> indexer.ensure_indexed()  # builds checkpoint, bloom, manifest
+
+        >>> # With explicit file list
+        >>> indexer = Indexer(files=["/path/to/trace1.pfw.gz", "/path/to/trace2.pfw.gz"])
+        >>> indexer.ensure_indexed()
+
+        >>> # With aggregation
+        >>> indexer = Indexer(
+        ...     "/path/to/traces",
+        ...     require_aggregation=AggregationConfig(time_interval_ms=1000),
+        ... )
+        >>> indexer.ensure_indexed()  # fused pass with aggregation
+    """
+
+    def __init__(
+        self,
+        directory: str = "",
+        files: Optional[List[str]] = None,
+        index_dir: str = "",
+        require_checkpoint: bool = True,
+        require_bloom: bool = True,
+        require_manifest: bool = True,
+        require_aggregation: Optional[Union[bool, AggregationConfig]] = None,
+        checkpoint_size: int = DEFAULT_CHECKPOINT_SIZE,
+        parallelism: int = 0,
+        force_rebuild: bool = False,
+        runtime: Optional[Runtime] = None,
+    ):
+        # Normalize aggregation config
+        if require_aggregation is True:
+            agg_config = AggregationConfig()
+        elif isinstance(require_aggregation, AggregationConfig):
+            agg_config = require_aggregation
+        else:
+            agg_config = None
+
+        # Build native indexer
+        native_runtime = runtime._native if runtime else None
+        self._native = _NativeIndexer(
+            directory=directory,
+            files=files,
+            index_dir=index_dir,
+            require_checkpoint=require_checkpoint,
+            require_bloom=require_bloom,
+            require_manifest=require_manifest,
+            require_aggregation=agg_config is not None,
+            time_interval_ms=agg_config.time_interval_ms if agg_config else 5000.0,
+            group_keys=agg_config.group_keys if agg_config else None,
+            custom_metric_fields=agg_config.custom_metric_fields if agg_config else None,
+            compute_percentiles=agg_config.compute_percentiles if agg_config else False,
+            checkpoint_size=checkpoint_size,
+            parallelism=parallelism,
+            force_rebuild=force_rebuild,
+            runtime=native_runtime,
+        )
+        self._aggregation_config = agg_config
+        self._file_info_cache: Optional[FileInfo] = None
+        self._closed = False
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return False
+
+    def close(self):
+        """Mark this indexer as closed (only sets the internal ``_closed`` flag)."""
+        self._closed = True
+
+    @property
+    def aggregation_config(self) -> Optional[AggregationConfig]:
+        """Aggregation configuration, if enabled."""
+        return self._aggregation_config
+
+    def resolve(self) -> IndexStatus:
+        """Report which files are already indexed and which still need work.
+
+        Returns:
+            IndexStatus built from the native ``resolve()`` result; ``index_path``
+            is "" when the native result has no such key.
+        """
+        native_status = self._native.resolve()
+        status = IndexStatus(total_files=native_status["total_files"])
+        status.ready = native_status["ready"]
+        status.needs_work = native_status["needs_work"]
+        status.index_path = native_status.get("index_path", "")
+        return status
+
+    def build(self) -> None:
+        """Build all missing index tiers based on require_* flags.
+
+        Runs in parallel on the Runtime executor (fused pass with aggregation) and
+        first drops the cached ``query_file_info()`` result, which may go stale."""
+        self._file_info_cache = None
+        self._native.build()
+
+    def ensure_indexed(self) -> IndexStatus:
+        """Resolve and build if needed, dropping the cached file info first.
+
+        Returns:
+            IndexStatus after building.
+        """
+        # Clear before building so a failed build cannot leave stale file info.
+        self._file_info_cache = None
+        result = self._native.ensure_indexed()
+        return IndexStatus(
+            total_files=result["total_files"],
+            ready=result["ready"],
+            needs_work=result["needs_work"],
+            index_path=result.get("index_path", ""),
+        )
+
+    def get_checkpoint_indexer(self, file_path: str) -> _NativeCheckpointIndexer:
+        """Get a checkpoint indexer for a specific file.
+
+        Returns an indexer for checkpoint-level operations on a single file,
+        such as finding checkpoints for random access.
+
+        Args:
+            file_path: Path to the trace file (.pfw/.pfw.gz).
+
+        Returns:
+            Indexer instance for checkpoint operations (checkpoints, find_checkpoint, etc).
+        """
+        return self._native.get_checkpoint_indexer(file_path)
+
+    def get_hash_table(self, hash_type: str) -> dict:
+        """Query hash table mappings.
+
+        Returns a dictionary mapping hash values to resolved names for the
+        given hash type. This is useful for resolving fhash/hhash values in
+        aggregated data.
+
+        Args:
+            hash_type: One of 'file', 'host', 'string', or 'proc'.
+
+        Returns:
+            dict mapping hash values (str) to resolved names (str).
+
+        Example:
+            >>> indexer = Indexer("/path/to/traces")
+            >>> indexer.ensure_indexed()
+            >>> file_names = indexer.get_hash_table("file")
+            >>> # file_names = {"abc123": "/path/to/data.h5", ...}
+        """
+        return self._native.get_hash_table(hash_type)
+
+    def query_file_pids(self, file_id: int) -> set:
+        """Query PIDs observed in a specific file.
+
+        Args:
+            file_id: Integer file ID from index.
+
+        Returns:
+            set of PIDs (int) observed in the file.
+        """
+        return self._native.query_file_pids(file_id)
+
+    def query_all_file_pids(self) -> dict:
+        """Query PIDs for all indexed files.
+
+        Returns a dictionary mapping file_id to the set of PIDs observed
+        in that file. This is useful for distributed aggregation to assign
+        files to workers by PID affinity.
+
+        Returns:
+            dict mapping file_id (int) to set of PIDs (int).
+        """
+        return self._native.query_all_file_pids()
+
+    def query_file_info(self) -> FileInfo:
+        """Query file distribution info in a single DB open.
+
+        Returns:
+            Tuple of (file_id_to_path, file_pids) where:
+            - file_id_to_path: dict[int, str] mapping DB file ID to path
+            - file_pids: dict[int, set[int]] mapping file ID to PIDs
+        """
+        if self._file_info_cache is None:
+            self._file_info_cache = self._native.query_file_info()
+        return self._file_info_cache
+
+    def iter_aggregation(self, type: str = "events", batch_size: int = 10000):
+        """Iterate over aggregation data as Arrow batches.
+
+        Requires that the index was built with require_aggregation=True.
+        Returns Arrow batches that can be converted to pandas or pyarrow.
+
+        Args:
+            type: Type of aggregation data - 'events', 'profiles', or 'system'
+            batch_size: Number of entries per Arrow batch (default 10000)
+
+        Yields:
+            Arrow batch capsules implementing __arrow_c_array__
+
+        Example:
+            >>> import pyarrow as pa
+            >>> indexer = Indexer("/traces", require_aggregation=True)
+            >>> indexer.ensure_indexed()
+            >>> batches = [pa.record_batch(b) for b in indexer.iter_aggregation("events")]
+            >>> table = pa.concat_tables([pa.Table.from_batches([b]) for b in batches])
+        """
+        return self._native.iter_aggregation(type, batch_size)
+
+    def iter_arrow_dfanalyzer(
+        self,
+        type: str = "events",
+        batch_size: int = 10000,
+        time_granularity: float = 1.0,
+        time_resolution: float = 1e6,
+        query: Optional[str] = None,
+    ):
+        """Return dfanalyzer-compatible Arrow batches for one aggregation type.
+
+        The batches use the dfanalyzer column layout:
+
+        - Events/Profiles: cat, func_name, pid, tid, file_hash, host_hash,
+          file_name, host_name, proc_name, io_cat, acc_pat, count, time, size,
+          time_min, time_max, size_min, size_max, time_range, time_start, time_end
+        - System: host_hash, time_range, ``sys_cpu_*``, ``sys_mem_*``
+
+        Hash resolution, time normalization, and the computed columns
+        (proc_name, io_cat) are produced on the C++ side for performance.
+
+        Args:
+            type: Which aggregation data to read - 'events', 'profiles', or 'system'
+            batch_size: Number of entries per Arrow batch (default 10000)
+            time_granularity: Bucket width in seconds (default 1.0)
+            time_resolution: Microseconds per output time unit (default 1e6)
+            query: Optional query filter string (e.g., "pid == 1234 or pid == 5678");
+                forwarded to the native call only when it is not None
+
+        Yields:
+            Arrow batch capsules implementing __arrow_c_array__
+
+        Example:
+            >>> import pyarrow as pa
+            >>> indexer = Indexer("/traces", require_aggregation=True)
+            >>> indexer.ensure_indexed()
+            >>> batches = list(indexer.iter_arrow_dfanalyzer("events"))
+            >>> table = pa.concat_tables([pa.Table.from_batches([pa.record_batch(b)]) for b in batches])
+        """
+        # Only append ``query`` when one was given, so the native method is
+        # called with four positional arguments in the unfiltered case.
+        native_args = [type, batch_size, time_granularity, time_resolution]
+        if query is not None:
+            native_args.append(query)
+        return self._native.iter_arrow_dfanalyzer(*native_args)
+
+    def iter_arrow_dfanalyzer_all(
+        self,
+        batch_size: int = 10000,
+        time_granularity: float = 1.0,
+        time_resolution: float = 1e6,
+        query: Optional[str] = None,
+        group_by: Optional[List[str]] = None,
+    ):
+        """Iterate over all aggregation types in a single scan.
+
+        This is ~3x faster than calling iter_arrow_dfanalyzer separately for
+        events, profiles, and system because it scans the index only once.
+
+        When ``group_by`` is provided, aggregation collapses dimensions during
+        the scan and emits a reduced schema containing only the requested
+        group columns plus aggregated metrics (``count``, ``time``, ``size``,
+        ``time_sq``, ``size_sq``, ``time_min``, ``time_max``, ``size_min``,
+        ``size_max``, ``time_call_min``, ``time_call_max``, ``size_call_min``,
+        ``size_call_max``, ``time_start``, ``time_end``).
+
+        Args:
+            batch_size: Number of entries per Arrow batch (default 10000)
+            time_granularity: Bucket width in seconds (default 1.0)
+            time_resolution: Microseconds per output time unit (default 1e6)
+            query: Optional query filter string (e.g., "pid == 1234 or pid == 5678")
+            group_by: Optional list of columns to group by for coarse in-scan
+                aggregation. Supported: ``cat``, ``func_name``, ``pid``,
+                ``tid``, ``file_hash``, ``host_hash``, ``file_name``,
+                ``host_name``, ``proc_name``, ``io_cat``, ``acc_pat``,
+                ``time_range``.
+
+        Returns:
+            Dict with 'events', 'profiles', 'system' keys, each containing
+            a list of Arrow batch capsules.
+
+        Example:
+            >>> import pyarrow as pa
+            >>> indexer = Indexer("/traces", require_aggregation=True)
+            >>> indexer.ensure_indexed()
+            >>> all_batches = indexer.iter_arrow_dfanalyzer_all()
+            >>> events = [pa.record_batch(b) for b in all_batches["events"]]
+        """
+        return self._native.iter_arrow_dfanalyzer_all(
+            batch_size,
+            time_granularity,
+            time_resolution,
+            query,
+            group_by,
+        )
diff --git a/python/dftracer/utils/runtime.py b/python/dftracer/utils/runtime.py
index 932f40ce..1dc88b7f 100644
--- a/python/dftracer/utils/runtime.py
+++ b/python/dftracer/utils/runtime.py
@@ -121,18 +121,19 @@ class Runtime:
 
     Example::
 
-        with Runtime(threads=8, python_threads=4) as rt:
+        with Runtime(threads=8, io_threads=8, python_threads=4) as rt:
             h = rt.submit(lambda x: x * 2, 21)
             assert h.get() == 42
 
     Args:
         threads: Number of C++ executor threads (0 = hardware_concurrency).
+        io_threads: Number of C++ I/O threads (0 = hardware_concurrency).
         python_threads: Number of Python ThreadPoolExecutor threads
             (0 = min(32, threads)).
     """
 
-    def __init__(self, threads: int = 0, python_threads: int = 0) -> None:
-        self._native = _NativeRuntime(threads)
+    def __init__(self, threads: int = 0, io_threads: int = 0, python_threads: int = 0) -> None:
+        self._native = _NativeRuntime(threads=threads, io_threads=io_threads)
         self._init_fields(python_threads)
 
     def _init_fields(self, python_threads: int = 0) -> None:
@@ -357,6 +358,11 @@ def threads(self) -> int:
         """Number of C++ worker threads."""
         return self._native.threads
 
+    @property
+    def io_threads(self) -> int:
+        """Number of C++ I/O threads."""
+        return self._native.io_threads
+
     @property
     def python_threads(self) -> int:
         """Number of Python worker threads (0 if pool not yet created)."""
diff --git a/python/dftracer/utils/trace_reader.py b/python/dftracer/utils/trace_reader.py
new file mode 100644
index 00000000..3ec9fd84
--- /dev/null
+++ b/python/dftracer/utils/trace_reader.py
@@ -0,0 +1,413 @@
+"""Python TraceReader wrapping the native C extension.
+
+Delegates all native methods and adds iter_lines_json / read_lines_json
+as shims over iter_arrow via pyarrow.
+"""
+
+from __future__ import annotations
+
+from types import TracebackType
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Type, Union
+
+from .dftracer_utils_ext import (
+    JsonDictValue,
+    _ArrowBatchCapsule,
+)
+from .dftracer_utils_ext import (
+    TraceReader as _NativeTraceReader,
+)
+
+if TYPE_CHECKING:
+    from .arrow import ArrowTable
+    from .runtime import Runtime
+
+
+class TraceReader:
+    __slots__ = ("_native",)
+
+    def __init__(
+        self,
+        path: str,
+        index_dir: str = "",
+        checkpoint_size: int = 33554432,
+        auto_build_index: bool = False,
+        runtime: Optional[Runtime] = None,
+    ) -> None:
+        """Create the native reader for ``path``.
+
+        ``index_dir``, ``checkpoint_size`` and ``auto_build_index`` are always
+        forwarded by keyword; ``runtime`` is forwarded only when it is not
+        None, otherwise that keyword is left out of the native call.
+        """
+        # Collect the keyword arguments once instead of duplicating the call.
+        native_kwargs: Dict[str, Any] = {
+            "index_dir": index_dir,
+            "checkpoint_size": checkpoint_size,
+            "auto_build_index": auto_build_index,
+        }
+        if runtime is not None:
+            native_kwargs["runtime"] = runtime
+        self._native = _NativeTraceReader(path, **native_kwargs)
+
+    # -- properties --
+
+    @property
+    def path(self) -> str:
+        return self._native.path
+
+    @property
+    def index_dir(self) -> str:
+        return self._native.index_dir
+
+    @property
+    def has_index(self) -> bool:
+        return self._native.has_index
+
+    @property
+    def num_lines(self) -> int:
+        return self._native.num_lines
+
+    # -- lines --
+
+    def read_lines(
+        self,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+    ) -> List[memoryview]:
+        return self._native.read_lines(
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            buffer_size=buffer_size,
+            query=query,
+        )
+
+    def iter_lines(
+        self,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+        memory_budget: int = 0,
+    ) -> Iterator[memoryview]:
+        return self._native.iter_lines(
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            buffer_size=buffer_size,
+            query=query,
+            memory_budget=memory_budget,
+        )
+
+    # -- json --
+
+    def iter_json(
+        self,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+        batch_size: int = 1024,
+        memory_budget: int = 0,
+    ) -> Iterator[JsonDictValue]:
+        return self._native.iter_json(
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            buffer_size=buffer_size,
+            query=query,
+            batch_size=batch_size,
+            memory_budget=memory_budget,
+        )
+
+    def read_json(
+        self,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+        batch_size: int = 1024,
+    ) -> List[JsonDictValue]:
+        return self._native.read_json(
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            buffer_size=buffer_size,
+            query=query,
+            batch_size=batch_size,
+        )
+
+    # -- raw --
+
+    def read_raw(
+        self,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        line_aligned: bool = True,
+        multi_line: bool = True,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+    ) -> List[memoryview]:
+        return self._native.read_raw(
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            line_aligned=line_aligned,
+            multi_line=multi_line,
+            buffer_size=buffer_size,
+            query=query,
+        )
+
+    def iter_raw(
+        self,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        line_aligned: bool = True,
+        multi_line: bool = True,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+        memory_budget: int = 0,
+    ) -> Iterator[memoryview]:
+        return self._native.iter_raw(
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            line_aligned=line_aligned,
+            multi_line=multi_line,
+            buffer_size=buffer_size,
+            query=query,
+            memory_budget=memory_budget,
+        )
+
+    # -- arrow --
+
+    def iter_arrow(
+        self,
+        batch_size: int = 10000,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+        flatten_objects: bool = False,
+        normalize: bool = False,
+        memory_budget: int = 0,
+    ) -> Iterator[_ArrowBatchCapsule]:
+        return self._native.iter_arrow(
+            batch_size=batch_size,
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            buffer_size=buffer_size,
+            query=query,
+            flatten_objects=flatten_objects,
+            normalize=normalize,
+            memory_budget=memory_budget,
+        )
+
+    def iter_arrow_stream(
+        self,
+        batch_size: int = 10000,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+        flatten_objects: bool = False,
+        normalize: bool = False,
+        memory_budget: int = 0,
+    ) -> Any:
+        return self._native.iter_arrow_stream(
+            batch_size=batch_size,
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            buffer_size=buffer_size,
+            query=query,
+            flatten_objects=flatten_objects,
+            normalize=normalize,
+            memory_budget=memory_budget,
+        )
+
+    def read_arrow(
+        self,
+        batch_size: int = 10000,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+        flatten_objects: bool = False,
+        normalize: bool = False,
+    ) -> ArrowTable:
+        return self._native.read_arrow(
+            batch_size=batch_size,
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            buffer_size=buffer_size,
+            query=query,
+            flatten_objects=flatten_objects,
+            normalize=normalize,
+        )
+
+    # -- JSON shims via arrow --
+
+    def iter_lines_json(
+        self,
+        batch_size: int = 10000,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """Yield trace rows as dicts by decoding ``iter_arrow`` batches with pyarrow."""
+        try:
+            import pyarrow as pa
+        except ImportError:
+            message = "pyarrow is required for iter_lines_json. Install with: pip install pyarrow"
+            raise ImportError(message) from None
+
+        arrow_batches = self._native.iter_arrow(
+            batch_size=batch_size,
+            start_line=start_line,
+            end_line=end_line,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            buffer_size=buffer_size,
+            query=query,
+        )
+        for capsule in arrow_batches:
+            yield from pa.record_batch(capsule).to_pylist()
+
+    def read_lines_json(
+        self,
+        batch_size: int = 10000,
+        start_line: int = 0,
+        end_line: int = 0,
+        start_byte: int = 0,
+        end_byte: int = 0,
+        buffer_size: int = 4194304,
+        query: Optional[str] = None,
+    ) -> List[Dict[str, Any]]:
+        """Eagerly collect every row produced by ``iter_lines_json`` into a list."""
+        forwarded = {
+            "batch_size": batch_size,
+            "start_line": start_line,
+            "end_line": end_line,
+            "start_byte": start_byte,
+            "end_byte": end_byte,
+            "buffer_size": buffer_size,
+            "query": query,
+        }
+        return [*self.iter_lines_json(**forwarded)]
+
+    # -- metadata --
+
+    def get_max_bytes(self) -> int:
+        return self._native.get_max_bytes()
+
+    def get_num_lines(self) -> int:
+        return self._native.get_num_lines()
+
+    # -- arrow IPC writing --
+
+    def write_arrow(
+        self,
+        path: str,
+        views: Optional[List[Union[str, Dict[str, Any]]]] = None,
+        chunk_size_mb: int = 32,
+        compression: str = "zstd",
+        batch_size: int = 10000,
+    ) -> Dict[str, Any]:
+        return self._native.write_arrow(
+            path,
+            views=views,
+            chunk_size_mb=chunk_size_mb,
+            compression=compression,
+            batch_size=batch_size,
+        )
+
+    def get_view_chunks(
+        self,
+        view: Optional[Union[str, Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
+        return self._native.get_view_chunks(view=view)
+
+    def write_view_chunk(
+        self,
+        output_file: str,
+        checkpoint_idx: int,
+        start_byte: int,
+        end_byte: int,
+        view: Optional[Union[str, Dict[str, Any]]] = None,
+        compression: str = "zstd",
+        batch_size: int = 10000,
+    ) -> Dict[str, Any]:
+        return self._native.write_view_chunk(
+            output_file=output_file,
+            checkpoint_idx=checkpoint_idx,
+            start_byte=start_byte,
+            end_byte=end_byte,
+            view=view,
+            compression=compression,
+            batch_size=batch_size,
+        )
+
+    def write_view_chunks(
+        self,
+        chunks: List[Dict[str, Any]],
+        output_dir: str,
+        view: Optional[Union[str, Dict[str, Any]]] = None,
+        compression: str = "zstd",
+        batch_size: int = 10000,
+    ) -> Dict[str, Any]:
+        return self._native.write_view_chunks(
+            chunks=chunks,
+            output_dir=output_dir,
+            view=view,
+            compression=compression,
+            batch_size=batch_size,
+        )
+
+    # -- context manager --
+
+    def __enter__(self) -> TraceReader:
+        self._native.__enter__()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
+    ) -> None:
+        self._native.__exit__(exc_type, exc_val, exc_tb)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 14b9493a..503bb86e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,15 +5,21 @@
 add_rpath()
 
 need_zlib()
-need_lz4()
+if(DFTRACER_UTILS_ENABLE_LZ4)
+  need_lz4()
+endif()
+if(DFTRACER_UTILS_ENABLE_ZSTD)
+  need_zstd()
+endif()
 need_rocksdb()
 need_argparse()
 need_ghc_filesystem()
 need_cpplogger()
-need_yyjson()
+need_simdjson()
 need_readerwriterqueue()
 need_concurrentqueue()
 need_tl_expected()
+need_unordered_dense()
 
 if(DFTRACER_UTILS_ENABLE_ARROW)
   need_nanoarrow()
@@ -32,6 +38,7 @@ set(DFTRACER_UTILS_CORE_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/constants.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/format_detector.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/filesystem.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/common/memory_budget.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/env.cpp
     # Utilities
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/utils/timer.cpp
@@ -62,7 +69,6 @@ set(DFTRACER_UTILS_CORE_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/database.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/filesystem.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/key_codec.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/async.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/core/rocksdb/db_manager.cpp
 )
 
@@ -94,6 +100,11 @@ endif()
 set(DFTRACER_UTILS_UTILITIES_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/metadata_collector_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/chunk_writer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/layout.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/fileio/parallel/merge.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/chunk_extractor_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/chunk_manifest_mapper_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp
@@ -105,7 +116,13 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES
     # Common utilities (shared across modules)
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/ddsketch.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/log2_histogram.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/json/json_value.cpp
+)
+
+list(APPEND DFTRACER_UTILS_UTILITIES_SOURCES
+    # JSON parser (On-Demand API)
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/json/parser.cpp
     # Query language (generic JSON filtering)
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/query/ast.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/query/parser.cpp
@@ -113,10 +130,17 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/query/query.cpp
     # DFT Aggregators
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/chunk_mapper_utility.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp
@@ -128,17 +152,25 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp
     # DFT Reorganization
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/event_router.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp
 
     # DFT Statistics (trace statistics, aggregation, querying)
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/log2_histogram.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp
 
@@ -155,14 +187,18 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES
 
     # Indexer
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/index_database.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/index_database_writer_context.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/index_builder_utility.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/provenance_database.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp
     # Reader
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/reader/trace_reader.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/indexer_c.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/helpers.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/index_encoding.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/checkpoint_size.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/indexer/internal/error.cpp
     # Indexer factory
@@ -184,6 +220,8 @@ set(DFTRACER_UTILS_UTILITIES_SOURCES
     # Call Tree
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/call_tree.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/call_tree_internal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/call_tree/json_serializer.cpp
     # Replay
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/replay/replay.cpp
@@ -217,7 +255,11 @@ endif()
 
 if(DFTRACER_UTILS_ENABLE_ARROW_IPC)
   list(APPEND DFTRACER_UTILS_UTILITIES_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/ipc_reader.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/ipc_writer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/parallel_reader.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/partition_writer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/utilities/common/arrow/partition_router.cpp
   )
 endif()
 
@@ -245,7 +287,7 @@ else()
 endif()
 
 # Add other dependencies built with CPM
-set(PKG_CONFIG_LIBS_PRIVATE "${PKG_CONFIG_LIBS_PRIVATE} -lyyjson")
+set(PKG_CONFIG_LIBS_PRIVATE "${PKG_CONFIG_LIBS_PRIVATE} -lsimdjson")
 
 # +++++++++++++++++++++++++++++++++++++++++
 # Core Library
@@ -314,13 +356,23 @@ foreach(variant shared static)
   if(TARGET dftracer_utils_core_${variant})
     # Link dependencies using helper functions
     link_cpp_logger(dftracer_utils_core_${variant} ${VARIANT_UPPER})
-    link_yyjson(dftracer_utils_core_${variant} ${VARIANT_UPPER})
+    link_simdjson(dftracer_utils_core_${variant} ${VARIANT_UPPER})
     link_rocksdb(dftracer_utils_core_${variant} ${VARIANT_UPPER})
     link_zlib(dftracer_utils_core_${variant} ${VARIANT_UPPER})
+    link_unordered_dense(dftracer_utils_core_${variant})
 
     # Add stdfs if needed
     add_stdfs_if_needed(dftracer_utils_core_${variant})
 
+    # mpi_utils.cpp is part of the core library sources when MPI is
+    # enabled, so core itself needs the MPI include path and runtime.
+    # Without this the precompiled header pulls in <mpi.h> and fails
+    # with "mpi.h: No such file or directory".
+    if(DFTRACER_UTILS_ENABLE_MPI)
+      target_compile_definitions(dftracer_utils_core_${variant} PUBLIC DFTRACER_UTILS_MPI_ENABLED)
+      target_link_libraries(dftracer_utils_core_${variant} PUBLIC MPI::MPI_CXX)
+    endif()
+
     # Set warnings
     target_set_warnings(dftracer_utils_core_${variant})
 
@@ -416,6 +468,31 @@ foreach(variant shared static)
       string(TOUPPER ${variant} VARIANT_UPPER)
       link_nanoarrow(dftracer_utils_utilities_${variant} ${VARIANT_UPPER})
     endif()
+
+    # Link zstd PRIVATE when ENABLE_ZSTD is on so this target's own sources
+    # (e.g. arrow ipc_writer.cpp, guarded by DFTRACER_UTILS_ENABLE_ZSTD) build.
+    if(DFTRACER_UTILS_ENABLE_ZSTD)
+      if(TARGET zstd::libzstd_shared)
+        target_link_libraries(dftracer_utils_utilities_${variant}
+                              PRIVATE zstd::libzstd_shared)
+      elseif(TARGET zstd::libzstd_static)
+        target_link_libraries(dftracer_utils_utilities_${variant}
+                              PRIVATE zstd::libzstd_static)
+      elseif(TARGET libzstd_shared)
+        target_link_libraries(dftracer_utils_utilities_${variant}
+                              PRIVATE libzstd_shared)
+      elseif(TARGET libzstd_static)
+        target_link_libraries(dftracer_utils_utilities_${variant}
+                              PRIVATE libzstd_static)
+      endif()
+    endif()
+
+    # Lustre stripe query (optional). The config header carries the
+    # DFTRACER_UTILS_HAVE_LUSTREAPI define; we just need to link the lib.
+    if(DFTRACER_UTILS_HAVE_LUSTREAPI)
+      target_link_libraries(dftracer_utils_utilities_${variant}
+                            PRIVATE ${LUSTREAPI_LIBRARY})
+    endif()
   endif()
 endforeach()
 
@@ -755,6 +832,13 @@ if(DFTRACER_UTILS_BUILD_BINARIES)
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_server.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_comparator.cpp)
 
+  set(DFTRACER_MPI_BINARIES "")
+  if(DFTRACER_UTILS_ENABLE_MPI)
+    list(APPEND DFTRACER_MPI_BINARIES
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp)
+  endif()
+
   foreach(bin ${DFTRACER_BINARIES})
     string(REPLACE ".cpp" "" bin_exec ${bin})
     string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/" ""
@@ -791,6 +875,37 @@ if(DFTRACER_UTILS_BUILD_BINARIES)
       create_python_wrapper(${bin_exec})
     endif()
   endforeach()
+
+  foreach(bin ${DFTRACER_MPI_BINARIES})
+    string(REPLACE ".cpp" "" bin_exec ${bin})
+    string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/binaries/" ""
+                   bin_exec ${bin_exec})
+
+    add_executable(${bin_exec} ${bin})
+    set_target_properties(
+      ${bin_exec} PROPERTIES OUTPUT_NAME "${bin_exec}" RUNTIME_OUTPUT_DIRECTORY
+                                                       ${CMAKE_BINARY_DIR}/bin)
+    target_add_rpath(${bin_exec})
+
+    target_link_libraries(${bin_exec}
+                          PRIVATE dftracer_utils argparse::argparse
+                                  MPI::MPI_CXX)
+    target_include_directories(${bin_exec}
+                               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+    add_stdfs_if_needed(${bin_exec})
+    target_set_warnings(${bin_exec})
+
+    if(DFTRACER_UTILS_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+      target_compile_options(${bin_exec} PRIVATE --coverage -fprofile-arcs
+                                                 -ftest-coverage)
+      target_link_libraries(${bin_exec} PRIVATE --coverage)
+    endif()
+
+    install(TARGETS ${bin_exec} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    if(SKBUILD)
+      create_python_wrapper(${bin_exec})
+    endif()
+  endforeach()
 endif()
 
 # ##############################################################################
@@ -871,6 +986,12 @@ if(DFTRACER_UTILS_BUILD_PYTHON)
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/dftracer_utils_ext.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/indexer.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/indexer.h
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/index_database.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/index_database.h
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/sst_distribution.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/sst_distribution.h
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/batch_indexer.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/batch_indexer.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/indexer_checkpoint.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/indexer_checkpoint.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/trace_reader.cpp
@@ -879,13 +1000,19 @@ if(DFTRACER_UTILS_BUILD_PYTHON)
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/runtime.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/task_handle.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/task_handle.h
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/memoryview_batch.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/memoryview_batch.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/trace_reader_iterator.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/trace_reader_iterator.h
-      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/json.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/json.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/json.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/statistics_query.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_helpers.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_helpers.h
+       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_stream_capsule.cpp
+       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_stream_capsule.h
+       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/schema_reconcile.cpp
+       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/schema_reconcile.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/statistics_query.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/statistics_aggregator.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/statistics_aggregator.h
@@ -898,7 +1025,11 @@ if(DFTRACER_UTILS_BUILD_PYTHON)
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/aggregator.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/aggregator.h
       ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/comparator.cpp
-      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/comparator.h)
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/utilities/comparator.h
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/streaming_iterator.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/streaming_iterator.h
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_parallel_reader.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/dftracer/utils/python/arrow_parallel_reader.h)
 
   # Link to unified library to test if this fixes the bus error
   target_link_libraries(dftracer_utils_ext PRIVATE dftracer_utils::shared)
diff --git a/src/dftracer/utils/binaries/common_cli.h b/src/dftracer/utils/binaries/common_cli.h
new file mode 100644
index 00000000..d4afb592
--- /dev/null
+++ b/src/dftracer/utils/binaries/common_cli.h
@@ -0,0 +1,329 @@
+#ifndef DFTRACER_UTILS_BINARIES_COMMON_CLI_H
+#define DFTRACER_UTILS_BINARIES_COMMON_CLI_H
+
+#include <dftracer/utils/core/common/constants.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/platform_compat.h>
+#include <dftracer/utils/core/pipeline/pipeline_config.h>
+
+#include <argparse/argparse.hpp>
+#include <chrono>
+#include <cstddef>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::cli {
+
+class ArgParse;
+
+struct CliSchema {
+    virtual ~CliSchema() = default;
+    virtual void register_on(argparse::ArgumentParser& p) = 0;
+    virtual void parse_from(const argparse::ArgumentParser& p) = 0;
+    virtual bool validate() { return true; }
+};
+
+class ArgParse {
+   public:
+    explicit ArgParse(argparse::ArgumentParser& parser) : parser_(parser) {}
+    virtual ~ArgParse() = default;
+
+    ArgParse(const ArgParse&) = delete;
+    ArgParse& operator=(const ArgParse&) = delete;
+
+    void setup() {
+        for (auto* s : schemas_) s->register_on(parser_);
+        register_args();
+    }
+
+    bool parse(int argc, char** argv) {
+        try {
+            parser_.parse_args(argc, argv);
+        } catch (const std::exception& err) {
+            DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what());
+            std::fprintf(stderr, "%s\n", parser_.help().str().c_str());
+            return false;
+        }
+        for (auto* s : schemas_) s->parse_from(parser_);
+        post_parse();
+        for (auto* s : schemas_) {
+            if (!s->validate()) return false;
+        }
+        return validate();
+    }
+
+    template <typename... Schemas>
+    void schema(Schemas&... args) {
+        (schemas_.push_back(&args), ...);
+    }
+
+   protected:
+    virtual void register_args() {}
+    virtual void post_parse() {}
+    virtual bool validate() { return true; }
+
+    argparse::ArgumentParser& parser() { return parser_; }
+    const argparse::ArgumentParser& parser() const { return parser_; }
+
+   private:
+    argparse::ArgumentParser& parser_;
+    std::vector<CliSchema*> schemas_;
+};
+
+enum class DirMode { DEFAULT_DOT, DEFAULT_EMPTY, REQUIRED };
+
+/// `-d/--directory` option; `mode` picks the default (".", "") or makes
+/// the option mandatory, and `validate()` rejects non-existent paths.
+struct DirectoryArgs : CliSchema {
+    DirMode mode = DirMode::DEFAULT_DOT;
+    std::string help = "Directory containing trace files";
+    std::string value;
+
+    DirectoryArgs() = default;
+    explicit DirectoryArgs(DirMode m) : mode(m) {}
+    DirectoryArgs(DirMode m, std::string h) : mode(m), help(std::move(h)) {}
+
+    // Declares the option and applies the default/required policy of `mode`.
+    void register_on(argparse::ArgumentParser& p) override {
+        auto& option = p.add_argument("-d", "--directory").help(help);
+        if (mode == DirMode::REQUIRED) {
+            option.required();
+        } else if (mode == DirMode::DEFAULT_EMPTY) {
+            option.default_value<std::string>("");
+        } else if (mode == DirMode::DEFAULT_DOT) {
+            option.default_value<std::string>(".");
+        }
+    }
+
+    // Copies the parsed (or default) directory string into `value`.
+    void parse_from(const argparse::ArgumentParser& p) override {
+        value = p.get<std::string>("--directory");
+    }
+
+    bool validate() override {
+        // An empty value is only an error when the option is mandatory.
+        if (value.empty()) {
+            if (mode != DirMode::REQUIRED) return true;
+            DFTRACER_UTILS_LOG_ERROR("%s", "--directory is required");
+            return false;
+        }
+        if (fs::exists(value)) return true;
+        DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s",
+                                 value.c_str());
+        return false;
+    }
+};
+
+struct FilesArgs : CliSchema {
+    std::string help = "Trace files (.pfw, .pfw.gz)";
+    std::vector<std::string> value;
+
+    FilesArgs() = default;
+    explicit FilesArgs(std::string h) : help(std::move(h)) {}
+
+    void register_on(argparse::ArgumentParser& p) override {
+        p.add_argument("--files")
+            .help(help)
+            .nargs(argparse::nargs_pattern::any)
+            .default_value<std::vector<std::string>>({});
+    }
+
+    void parse_from(const argparse::ArgumentParser& p) override {
+        value = p.get<std::vector<std::string>>("--files");
+    }
+};
+
+/// Thread-count and profiling CLI options; apply() feeds a PipelineConfig.
+struct PipelineArgs : CliSchema {
+    std::size_t executor_threads = 0;
+    std::size_t io_threads = 0;
+    bool time_profiling = false;
+
+    PipelineArgs() = default;
+
+    void register_on(argparse::ArgumentParser& p) override {
+        p.add_group("Pipeline");
+        p.add_argument("--executor-threads")
+            .help(
+                "Number of worker threads for parallel processing "
+                "(default: number of CPU cores)")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(
+                dftracer_utils_hardware_concurrency()));
+        // Cast as above: parse_from() reads this with get<std::size_t>(),
+        // so the stored default has to be exactly std::size_t.
+        p.add_argument("--io-threads")
+            .help("Number of I/O threads (default: number of CPU cores)")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(
+                dftracer_utils_hardware_concurrency()));
+        p.add_argument("--time-profiling")
+            .help("Print stage timing breakdown to stderr")
+            .flag();
+    }
+
+    void parse_from(const argparse::ArgumentParser& p) override {
+        executor_threads = p.get<std::size_t>("--executor-threads");
+        io_threads = p.get<std::size_t>("--io-threads");
+        time_profiling = p.get<bool>("--time-profiling");
+    }
+
+    bool validate() override {
+        if (executor_threads > 0) return true;
+        DFTRACER_UTILS_LOG_ERROR(
+            "%s", "--executor-threads must be greater than 0");
+        return false;
+    }
+
+    void apply(PipelineConfig& config) const {
+        config.with_compute_threads(executor_threads);
+        config.with_io_threads(io_threads);
+    }
+};
+
+struct IndexingArgs : CliSchema {
+    std::string index_dir;
+    std::size_t checkpoint_size = 0;
+    bool force = false;
+
+    std::string index_dir_help = "Directory for .dftindex stores";
+    std::string force_help = "Force index recreation";
+    bool with_index_dir = true;
+    bool with_force = true;
+
+    IndexingArgs() = default;
+    explicit IndexingArgs(bool f) : with_force(f) {}
+
+    void register_on(argparse::ArgumentParser& p) override {
+        p.add_group("Indexing");
+        if (with_index_dir) {
+            p.add_argument("--index-dir")
+                .help(index_dir_help)
+                .default_value<std::string>("");
+        }
+        p.add_argument("--checkpoint-size")
+            .help("Checkpoint size for gzip indexing in bytes (default: " +
+                  std::to_string(constants::indexer::DEFAULT_CHECKPOINT_SIZE) +
+                  ")")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(
+                constants::indexer::DEFAULT_CHECKPOINT_SIZE));
+        if (with_force) {
+            p.add_argument("-f", "--force").help(force_help).flag();
+        }
+    }
+
+    void parse_from(const argparse::ArgumentParser& p) override {
+        if (with_index_dir) {
+            index_dir = p.get<std::string>("--index-dir");
+        }
+        checkpoint_size = p.get<std::size_t>("--checkpoint-size");
+        if (with_force) {
+            force = p.get<bool>("--force");
+        }
+    }
+};
+
+struct QueryArgs : CliSchema {
+    std::string query;
+    std::string help =
+        "Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')";
+
+    QueryArgs() = default;
+    explicit QueryArgs(std::string h) : help(std::move(h)) {}
+
+    void register_on(argparse::ArgumentParser& p) override {
+        p.add_group("Query");
+        p.add_argument("--query").help(help).default_value<std::string>("");
+    }
+
+    void parse_from(const argparse::ArgumentParser& p) override {
+        query = p.get<std::string>("--query");
+    }
+};
+
+struct WatchdogArgs : CliSchema {
+    bool disable = false;
+    int global_timeout = 0;
+    int task_timeout = 0;
+    int interval = 1;
+    int warning_threshold = 300;
+    int idle_timeout = 300;
+    int deadlock_timeout = 600;
+
+    void register_on(argparse::ArgumentParser& p) override {
+        p.add_group("Watchdog");
+        p.add_argument("--disable-watchdog")
+            .help("Disable watchdog for hang detection")
+            .flag();
+        p.add_argument("--watchdog-global-timeout")
+            .help(
+                "Watchdog global timeout for pipeline execution in "
+                "seconds (0 = no timeout)")
+            .scan<'d', int>()
+            .default_value(0);
+        p.add_argument("--watchdog-task-timeout")
+            .help("Watchdog default task timeout in seconds (0 = no timeout)")
+            .scan<'d', int>()
+            .default_value(0);
+        p.add_argument("--watchdog-interval")
+            .help("Watchdog check interval in seconds")
+            .scan<'d', int>()
+            .default_value(1);
+        p.add_argument("--watchdog-warning-threshold")
+            .help("Watchdog long-running task warning threshold in seconds")
+            .scan<'d', int>()
+            .default_value(300);
+        p.add_argument("--watchdog-idle-timeout")
+            .help("Watchdog idle timeout in seconds (0 = use default)")
+            .scan<'d', int>()
+            .default_value(300);
+        p.add_argument("--watchdog-deadlock-timeout")
+            .help("Watchdog deadlock timeout in seconds (0 = use default)")
+            .scan<'d', int>()
+            .default_value(600);
+    }
+
+    void parse_from(const argparse::ArgumentParser& p) override {
+        disable = p.get<bool>("--disable-watchdog");
+        global_timeout = p.get<int>("--watchdog-global-timeout");
+        task_timeout = p.get<int>("--watchdog-task-timeout");
+        interval = p.get<int>("--watchdog-interval");
+        warning_threshold = p.get<int>("--watchdog-warning-threshold");
+        idle_timeout = p.get<int>("--watchdog-idle-timeout");
+        deadlock_timeout = p.get<int>("--watchdog-deadlock-timeout");
+    }
+
+    void apply(PipelineConfig& config) const {
+        config.with_watchdog(!disable)
+            .with_global_timeout(std::chrono::seconds(global_timeout))
+            .with_task_timeout(std::chrono::seconds(task_timeout))
+            .with_watchdog_interval(std::chrono::seconds(interval))
+            .with_warning_threshold(std::chrono::seconds(warning_threshold))
+            .with_executor_idle_timeout(std::chrono::seconds(idle_timeout))
+            .with_executor_deadlock_timeout(
+                std::chrono::seconds(deadlock_timeout));
+    }
+};
+
+inline PipelineConfig build_pipeline_config(const std::string& name,
+                                            const PipelineArgs& pipeline) {
+    auto config = PipelineConfig().with_name(name).with_watchdog(false);
+    pipeline.apply(config);
+    return config;
+}
+
+inline PipelineConfig build_pipeline_config(const std::string& name,
+                                            const PipelineArgs& pipeline,
+                                            const WatchdogArgs& watchdog) {
+    auto config = PipelineConfig().with_name(name);
+    pipeline.apply(config);
+    watchdog.apply(config);
+    return config;
+}
+
+}  // namespace dftracer::utils::cli
+
+#endif  // DFTRACER_UTILS_BINARIES_COMMON_CLI_H
diff --git a/src/dftracer/utils/binaries/dftracer_aggregator.cpp b/src/dftracer/utils/binaries/dftracer_aggregator.cpp
index c511cabe..6e1a4d12 100644
--- a/src/dftracer/utils/binaries/dftracer_aggregator.cpp
+++ b/src/dftracer/utils/binaries/dftracer_aggregator.cpp
@@ -1,60 +1,330 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/platform_compat.h>
-#include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
-#include <dftracer/utils/utilities/common/query/query.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregators.h>
-#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
-#include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
-#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
-#include <dftracer/utils/utilities/indexer/internal/indexer.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+
+#include "common_cli.h"
+#include "dftracer/utils/core/utils/timer.h"
 #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
 #include <dftracer/utils/utilities/common/arrow/ipc_writer.h>
 #endif
-#include <unistd.h>
-
-#include <argparse/argparse.hpp>
-#include <atomic>
-#include <chrono>
 #include <sstream>
-#include <thread>
+#include <unordered_set>
 
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::composites::dft::aggregators;
 
-static coro::CoroTask<int> run_aggregator(argparse::ArgumentParser& program) {
-    std::string log_dir = program.get<std::string>("--directory");
-    std::string output_file = program.get<std::string>("--output");
-    double time_interval_ms = program.get<double>("--time-interval");
+class AggregatorArgParse : public cli::ArgParse {  // aggregator CLI definition
+   public:
+    cli::DirectoryArgs directory{
+        cli::DirMode::DEFAULT_DOT,
+        "Input directory containing .pfw or .pfw.gz files"};
+    cli::PipelineArgs pipeline;  // read for executor_threads, time_profiling
+    cli::IndexingArgs indexing;  // read for index_dir, force, checkpoint_size
+    cli::QueryArgs query_args{
+        "Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')"};
+
+    std::string output;  // --output
+    double time_interval = 5000.0;  // --time-interval, in milliseconds
+    std::string group_keys;  // --group-keys, comma-separated
+    std::string metric_fields;  // --metric-fields, comma-separated
+    bool compress = false;  // --compress
+    int compression_level = 1;  // --compression-level
+    std::string boundary_events;  // --boundary-events
+    bool no_track_parents = false;  // --no-track-process-parents
+    std::size_t chunk_size = 4;  // --chunk-size, in MB
+    std::size_t read_batch_size = 4;  // --read-batch-size, in MB
+    std::string event_format = "counter";  // --event-format
+    bool compute_percentiles = false;  // --compute-percentiles
+    std::string percentiles = "0.25,0.5,0.75,0.90";  // --percentiles
+    double relative_accuracy = 0.01;  // --relative-accuracy
+    std::string format = "json";  // --format
+    bool no_default_args = false;  // --no-default-args
+
+    explicit AggregatorArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.index_dir_help =
+            "Directory to store index files (default: system temp directory)";
+        indexing.force_help = "Force index recreation";
+        schema(directory, pipeline, indexing, query_args);
+    }
+
+   protected:
+    void register_args() override {  // declares the aggregator-only options
+        parser()
+            .add_argument("-o", "--output")
+            .help("Output file path for aggregated counters")
+            .default_value<std::string>("aggregated_output.json");
+
+        parser()
+            .add_argument("-t", "--time-interval")
+            .help("Time interval in milliseconds for bucketing (default: 5000)")
+            .scan<'g', double>()
+            .default_value(5000.0);
+
+        parser()
+            .add_argument("-g", "--group-keys")
+            .help(
+                "Comma-separated extra group keys from args (e.g., "
+                "epoch,step,level)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("-m", "--metric-fields")
+            .help(
+                "Comma-separated custom metric fields from args (e.g., "
+                "iter_count,num_events)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--compress")
+            .help("Compress output using gzip")
+            .default_value(false)
+            .implicit_value(true);
+
+        parser()
+            .add_argument("--compression-level")
+            .help("Gzip compression level (0-9, default: 1)")
+            .scan<'d', int>()
+            .default_value(1);
+
+        parser()
+            .add_argument("--boundary-events")
+            .help(
+                "Boundary event configuration: "
+                "event_name:value_field:output_name "
+                "(e.g., \"epoch.block:iter_count:epoch\")")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--no-track-process-parents")
+            .help(
+                "Disable tracking of process parent relationships from "
+                "fork/spawn")
+            .default_value(false)
+            .implicit_value(true);
+
+        parser()
+            .add_argument("--chunk-size")
+            .help(
+                "Target chunk size in MB for parallel processing (default: 4)")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(4));
+
+        parser()
+            .add_argument("--read-batch-size")
+            .help(
+                "Batch read size in MB for stream processing (default: 4, "
+                "higher = "
+                "faster but more memory)")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(4));
+
+        parser()
+            .add_argument("--event-format")
+            .help(
+                "Perfetto event format: 'counter' (ph=C, point-in-time, "
+                "default), "
+                "'async' (ph=b/e, async tracks with overlaps), "
+                "'regular' (ph=X, duration events with original TID)")
+            .default_value<std::string>("counter");
+
+        parser()
+            .add_argument("--compute-percentiles")
+            .help(
+                "Enable percentile/quantile computation using DDSketch (opt-in "
+                "due "
+                "to memory overhead)")
+            .default_value(false)
+            .implicit_value(true);
+
+        parser()
+            .add_argument("--percentiles")
+            .help(
+                "Comma-separated percentiles to compute (e.g., "
+                "\"0.25,0.5,0.75,0.90\" for P25, P50, P75, P90)")
+            .default_value<std::string>("0.25,0.5,0.75,0.90");
+
+        parser()
+            .add_argument("--relative-accuracy")
+            .help(
+                "Relative accuracy for DDSketch percentile estimation "
+                "(default: 0.01 = 1%)")
+            .scan<'g', double>()
+            .default_value(0.01);
+
+        parser()
+            .add_argument("--format")
+            .help(
+                "Output format: 'json' (Perfetto JSON, default) or "
+                "'arrow' (Arrow IPC file, .arrows extension)")
+            .default_value<std::string>("json");
+
+        parser()
+            .add_argument("--no-default-args")
+            .help(
+                "Disable automatic aggregation of numeric event args "
+                "(offset, whence, flags, etc.)")
+            .default_value(false)
+            .implicit_value(true);
+    }
+
+    void post_parse() override {  // copies parsed option values into members
+        output = parser().get<std::string>("--output");
+        time_interval = parser().get<double>("--time-interval");
+        group_keys = parser().get<std::string>("--group-keys");
+        metric_fields = parser().get<std::string>("--metric-fields");
+        compress = parser().get<bool>("--compress");
+        compression_level = parser().get<int>("--compression-level");
+        boundary_events = parser().get<std::string>("--boundary-events");
+        no_track_parents = parser().get<bool>("--no-track-process-parents");
+        chunk_size = parser().get<std::size_t>("--chunk-size");
+        read_batch_size = parser().get<std::size_t>("--read-batch-size");
+        event_format = parser().get<std::string>("--event-format");
+        compute_percentiles = parser().get<bool>("--compute-percentiles");
+        percentiles = parser().get<std::string>("--percentiles");
+        relative_accuracy = parser().get<double>("--relative-accuracy");
+        format = parser().get<std::string>("--format");
+        no_default_args = parser().get<bool>("--no-default-args");
+    }
+};
+
+// Write global config and per-file tracking entries; warn on unindexed files.
+static void write_aggregation_tracking(
+    dftracer::utils::rocksdb::RocksDatabase* db,
+    const AggregationConfig& config,
+    const std::vector<std::string>& processed_files,
+    const std::string& index_path) {
+    namespace rcf = dftracer::utils::rocksdb::cf;
+
+    // Read-only handle on the index, used here only to map paths to file_ids.
+    indexer::IndexDatabase idx_db(
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+    auto batch = db->begin_batch();
+
+    // Global config is written once per call; config_hash is fixed at 0.
+    AggGlobalConfig global_cfg;
+    global_cfg.time_interval_us = config.time_interval_us;
+    global_cfg.config_hash = 0;
+    db->put(batch, rcf::AGGREGATION, std::string_view(AGG_GLOBAL_CONFIG_KEY, 2),
+            serialize_agg_global_config(global_cfg));
+
+    // Per-file marker: the value is empty, the key's presence is the signal.
+    for (const auto& file_path : processed_files) {
+        const int file_id = idx_db.find_file(file_path);
+        if (file_id >= 0) {
+            db->put(batch, rcf::AGGREGATION, make_agg_file_key(file_id), "");
+        } else {
+            DFTRACER_UTILS_LOG_WARN("Not in index, aggregation untracked: %s",
+                                    file_path.c_str());
+        }
+    }
+    db->commit_batch(batch);
+}
+
+// Run one batch index build whose visitor factory makes an AggregationVisitor.
+static coro::CoroTask<indexer::IndexBuildBatchResult> batch_index_and_aggregate(
+    CoroScope* scope, std::vector<std::string> file_paths,
+    std::string index_dir, std::size_t checkpoint_size, bool force_rebuild,
+    std::size_t parallelism, AggregationConfig agg_config,
+    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> agg_db,
+    std::uint32_t config_hash) {
+    using VisitorList =
+        std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>;
+    auto shared_agg_config =
+        std::make_shared<AggregationConfig>(std::move(agg_config));
+    auto build_cfg = std::make_shared<indexer::IndexBuildBatchConfig>();
+    build_cfg->use_batch_write = true;
+    build_cfg->force_rebuild = force_rebuild;
+    build_cfg->parallelism = parallelism;
+    build_cfg->checkpoint_size = checkpoint_size;
+    build_cfg->index_dir = std::move(index_dir);
+    build_cfg->file_paths = std::move(file_paths);
+    build_cfg->dft_visitor_factory =
+        [agg_db, config_hash, shared_agg_config](const std::string& path) {
+            VisitorList list;
+            list.push_back(std::make_unique<AggregationVisitor>(
+                agg_db, config_hash, *shared_agg_config, path));
+            return list;
+        };
+    co_return co_await indexer::IndexBatchBuilderUtility::process(
+        scope, std::move(build_cfg));
+}
+
+// Build the Perfetto writer input: wire in the merger, its global tracker and
+// the output options, then derive boundary ranges and the trace duration.
+static PerfettoTraceWriterInput build_streaming_input(
+    EventAggregator* merger_ptr, const AggregationConfig* agg_config,
+    const std::string* output_file, bool compress_output, int compression_level,
+    PerfettoEventFormat event_format) {
+    auto tracker_owner = merger_ptr->build_global_tracker();
+
+    PerfettoTraceWriterInput input;
+    input.aggregator = merger_ptr;
+    input.agg_config = agg_config;
+    // Take the raw pointer first; ownership then moves into the struct.
+    input.tracker = tracker_owner.get();
+    input.owned_tracker = std::move(tracker_owner);
+    input.root_pids = input.tracker->get_root_pids();
+    input.output_path = *output_file;
+    input.format = event_format;
+    input.compress = compress_output;
+    input.compression_level = compression_level;
+    input.compute_statistics = agg_config->compute_statistics;
+    input.compute_percentiles = agg_config->compute_percentiles;
+    input.percentiles = agg_config->percentiles;
+
+    // Track the overall [min start, max end] span and the per-(name, value)
+    // boundary range; an all-zero range is treated as not yet initialised.
+    std::uint64_t earliest = UINT64_MAX;
+    std::uint64_t latest = 0;
+    const auto& intervals = input.tracker->get_all_intervals();
+    for (const auto& iv : intervals) {
+        earliest = std::min(earliest, iv.start_ts);
+        latest = std::max(latest, iv.end_ts);
+        auto& slot = input.boundary_ranges[iv.name][iv.value];
+        const bool untouched = slot.ts == 0 && slot.te == 0;
+        slot.ts = untouched ? iv.start_ts : std::min(slot.ts, iv.start_ts);
+        slot.te = untouched ? iv.end_ts : std::max(slot.te, iv.end_ts);
+    }
+    // With no intervals earliest stays at UINT64_MAX, so nothing is assigned.
+    if (latest > earliest) {
+        input.trace_duration = latest - earliest;
+    }
+
+    return input;
+}
+
+static coro::CoroTask<int> run_aggregator(const AggregatorArgParse* cli) {
+    auto log_dir = cli->directory.value;
+    auto output_file = cli->output;
+    auto time_interval_ms = cli->time_interval;
     std::uint64_t time_interval_us =
         static_cast<std::uint64_t>(time_interval_ms * 1000.0);
-    std::string group_keys_str = program.get<std::string>("--group-keys");
-    std::string metric_fields_str = program.get<std::string>("--metric-fields");
-    std::string query_str = program.get<std::string>("--query");
-    bool force_rebuild = program.get<bool>("--force");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    bool compress_output = program.get<bool>("--compress");
-    int compression_level = program.get<int>("--compression-level");
-    std::string boundary_events_str =
-        program.get<std::string>("--boundary-events");
-    bool no_track_parents = program.get<bool>("--no-track-process-parents");
-    std::size_t chunk_size_mb = program.get<std::size_t>("--chunk-size");
-    std::size_t batch_size_mb = program.get<std::size_t>("--read-batch-size");
-    std::string event_format_str = program.get<std::string>("--event-format");
-    bool compute_percentiles = program.get<bool>("--compute-percentiles");
-    std::string percentiles_str = program.get<std::string>("--percentiles");
-    double relative_accuracy = program.get<double>("--relative-accuracy");
-    std::string output_format = program.get<std::string>("--format");
+    const auto& group_keys_str = cli->group_keys;
+    const auto& metric_fields_str = cli->metric_fields;
+    const auto& query_str = cli->query_args.query;
+    auto force_rebuild = cli->indexing.force;
+    auto checkpoint_size = cli->indexing.checkpoint_size;
+    auto executor_threads = cli->pipeline.executor_threads;
+    auto index_dir = cli->indexing.index_dir;
+    auto compress_output = cli->compress;
+    auto compression_level = cli->compression_level;
+    const auto& boundary_events_str = cli->boundary_events;
+    auto no_track_parents = cli->no_track_parents;
+    const auto& event_format_str = cli->event_format;
+    auto compute_percentiles = cli->compute_percentiles;
+    const auto& percentiles_str = cli->percentiles;
+    auto relative_accuracy = cli->relative_accuracy;
+    const auto& output_format = cli->format;
 
     if (!AggregationConfig::is_valid_format(output_format)) {
         DFTRACER_UTILS_LOG_ERROR(
@@ -131,31 +401,6 @@ static coro::CoroTask<int> run_aggregator(argparse::ArgumentParser& program) {
         }
     }
 
-    std::string temp_index_dir;
-    if (index_dir.empty()) {
-        try {
-            auto temp_path = fs::temp_directory_path();
-            temp_path /= "dftracer_idx_" + std::to_string(std::time(nullptr)) +
-                         "_" + std::to_string(getpid());
-            temp_index_dir = temp_path.string();
-            fs::create_directories(temp_index_dir);
-            index_dir = temp_index_dir;
-            DFTRACER_UTILS_LOG_INFO("Created temporary index directory: %s",
-                                    index_dir.c_str());
-        } catch (const fs::filesystem_error& e) {
-            temp_index_dir = "/tmp/dftracer_idx_" +
-                             std::to_string(std::time(nullptr)) + "_" +
-                             std::to_string(getpid());
-            fs::create_directories(temp_index_dir);
-            index_dir = temp_index_dir;
-            DFTRACER_UTILS_LOG_WARN(
-                "Failed to get system temp directory, using /tmp: %s",
-                e.what());
-            DFTRACER_UTILS_LOG_INFO("Created temporary index directory: %s",
-                                    index_dir.c_str());
-        }
-    }
-
     log_dir = fs::absolute(log_dir).string();
     output_file = fs::absolute(output_file).string();
 
@@ -220,31 +465,35 @@ static coro::CoroTask<int> run_aggregator(argparse::ArgumentParser& program) {
     agg_config.percentiles = percentiles;
     agg_config.boundary_events = boundary_events;
     agg_config.track_process_parents = !no_track_parents;
+    agg_config.track_default_args = !cli->no_default_args;
 
-    using common::query::Query;
-    std::optional<Query> query;
     if (!query_str.empty()) {
-        auto result = Query::from_string(query_str);
-        if (!result) {
-            DFTRACER_UTILS_LOG_ERROR("Invalid --query: %s",
-                                     result.error().format().c_str());
-            co_return 1;
-        }
-        query = std::move(*result);
+        DFTRACER_UTILS_LOG_WARN(
+            "--query is not yet supported in fused mode, ignoring");
     }
 
-    // Discover input files
-    filesystem::PatternDirectoryScannerUtility scanner;
-    filesystem::PatternDirectoryScannerUtilityInput scan_input{
-        log_dir, {".pfw", ".pfw.gz"}, false};
-    auto matched_entries = co_await scanner.process(scan_input);
-
-    std::vector<std::string> input_files;
-    input_files.reserve(matched_entries.size());
-    for (const auto& entry : matched_entries) {
-        input_files.push_back(entry.path.string());
+    // Use hash=0 for simplicity (no config-based filtering)
+    constexpr std::uint32_t config_hash = 0;
+
+    Timer stages_storage("dftracer_aggregator");
+    Timer* stages = cli->pipeline.time_profiling ? &stages_storage : nullptr;
+    Timer overall(true);
+
+    namespace idx = composites::dft::indexing;
+
+    auto scan_result = std::make_unique<idx::ResolverResult>();
+    {
+        ScopedTimer _t(stages, "scan_and_resolve");
+        idx::IndexResolverUtility resolver;
+        idx::ResolverInput input;
+        input.directory = log_dir;
+        input.index_dir = index_dir;
+        input.require_aggregation = !force_rebuild;
+        input.aggregation_config = agg_config;
+        *scan_result = co_await resolver.process(input);
     }
 
+    auto& input_files = scan_result->all_files;
     if (input_files.empty()) {
         DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s",
                                  log_dir.c_str());
@@ -253,442 +502,234 @@ static coro::CoroTask<int> run_aggregator(argparse::ArgumentParser& program) {
 
     DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size());
 
-    auto pipeline_config = PipelineConfig()
-                               .with_name("DFTracer Aggregator")
-                               .with_compute_threads(executor_threads)
-                               .with_watchdog(false);
+    auto& shared_index_path = scan_result->index_path;
+
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer Aggregator", cli->pipeline);
 
     Pipeline pipeline(pipeline_config);
 
-    auto start_time = std::chrono::high_resolution_clock::now();
+    if (force_rebuild && fs::exists(shared_index_path)) {
+        DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s",
+                                shared_index_path.c_str());
+        fs::remove_all(shared_index_path);
+    }
 
-    EventAggregatorUtility merger;
-    std::atomic<int> global_chunk_idx{0};
+    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> agg_db;
+    std::unique_ptr<EventAggregator> merger;
+    {
+        ScopedTimer _t(stages, "open_rocksdb");
+        agg_db = EventAggregator::open_with_merge_operator(shared_index_path);
+        merger = std::make_unique<EventAggregator>(agg_db, config_hash);
+    }
 
-    if (force_rebuild && !input_files.empty()) {
-        const std::string shared_index_path =
-            composites::dft::internal::determine_index_path(input_files.front(),
-                                                            index_dir);
-        if (fs::exists(shared_index_path)) {
-            DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s",
-                                    shared_index_path.c_str());
-            fs::remove_all(shared_index_path);
+    // Files to process: needs_checkpoint (index + aggregate) +
+    // needs_aggregation
+    const std::size_t num_needing_index = scan_result->needs_checkpoint.size();
+    const std::size_t num_needing_agg_only =
+        force_rebuild ? scan_result->cached.size()
+                      : scan_result->needs_aggregation.size();
+    const std::size_t num_cached =
+        force_rebuild ? 0 : scan_result->total_cached();
+
+    std::vector<std::string> files_to_process;
+    files_to_process.reserve(num_needing_index + num_needing_agg_only);
+    for (auto& item : scan_result->needs_checkpoint) {
+        files_to_process.push_back(std::move(item.file_path));
+    }
+    if (force_rebuild) {
+        for (auto& item : scan_result->cached) {
+            files_to_process.push_back(std::move(item.file_path));
+        }
+    } else {
+        for (auto& item : scan_result->needs_aggregation) {
+            files_to_process.push_back(std::move(item.file_path));
         }
     }
 
-    // Streaming aggregation: file producers -> chunk workers -> merger
-    auto streaming_task = make_task(
-        [&](CoroScope& ctx) -> coro::CoroTask<void> {
-            auto chunk_chan = coro::make_channel<ChunkAggregatorInput>(0);
-            auto result_chan = coro::make_channel<ChunkAggregationOutput>(2);
-
-            co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask<void> {
-                // File producers: one per input file
-                for (const auto& file_path : input_files) {
-                    auto* global_chunk_idx_ptr = &global_chunk_idx;
-                    scope.spawn([file_path, ch = chunk_chan->producer(),
-                                 index_dir, checkpoint_size, force_rebuild,
-                                 agg_config, query, chunk_size_mb,
-                                 batch_size_mb, global_chunk_idx_ptr](
-                                    CoroScope& /*fctx*/) mutable
-                                    -> coro::CoroTask<void> {
-                        [[maybe_unused]] auto producer_guard = ch.guard();
-                        // Build index
-                        std::string index_path =
-                            composites::dft::internal::determine_index_path(
-                                file_path, index_dir);
-                        auto idx_input =
-                            indexer::IndexBuildConfig::for_file(file_path)
-                                .with_checkpoint_size(checkpoint_size)
-                                .with_force_rebuild(false)
-                                .with_index_dir(index_dir);
-                        co_await indexer::IndexBuilderUtility{}.process(
-                            idx_input);
-
-                        // Collect metadata
-                        auto meta_input =
-                            composites::dft::MetadataCollectorUtilityInput::
-                                from_file(file_path)
-                                    .with_checkpoint_size(checkpoint_size)
-                                    .with_force_rebuild(false)
-                                    .with_index(index_path);
-                        auto metadata =
-                            co_await composites::dft::MetadataCollectorUtility{}
-                                .process(meta_input);
-
-                        if (!metadata.success) {
-                            DFTRACER_UTILS_LOG_WARN("Skipping file: %s",
-                                                    file_path.c_str());
-                            co_return;
-                        }
-
-                        // Create chunks for this file
-                        FileChunkMapperUtility file_mapper;
-                        auto mapper_input =
-                            FileChunkMapperInput::from_metadata(metadata)
-                                .with_config(agg_config)
-                                .with_checkpoint_size(checkpoint_size)
-                                .with_target_chunk_size(chunk_size_mb)
-                                .with_batch_size(batch_size_mb * 1024 * 1024);
-                        mapper_input.query = query;
-                        auto file_chunks =
-                            co_await file_mapper.process(mapper_input);
-
-                        int start_idx = global_chunk_idx_ptr->fetch_add(
-                            static_cast<int>(file_chunks.size()));
-                        for (int i = 0;
-                             i < static_cast<int>(file_chunks.size()); ++i) {
-                            file_chunks[i].chunk_index = start_idx + i;
-                        }
+    DFTRACER_UTILS_LOG_INFO(
+        "Files to process: %zu (%zu need indexing, %zu need aggregation only, "
+        "%zu cached)",
+        files_to_process.size(), num_needing_index, num_needing_agg_only,
+        num_cached);
 
-                        for (auto& chunk : file_chunks) {
-                            if (!co_await ch.send(std::move(chunk))) {
-                                co_return;
+    bool write_success = false;
+    std::size_t total_keys = 0;
+    std::atomic<std::size_t> perfetto_keys_written{0};
+
+    auto main_task = make_task(
+        [&](CoroScope& scope) -> coro::CoroTask<void> {
+            if (!files_to_process.empty()) {
+                {
+                    ScopedTimer _t(stages, "index_and_aggregate");
+                    auto batch_result = co_await batch_index_and_aggregate(
+                        &scope, files_to_process, index_dir, checkpoint_size,
+                        force_rebuild, executor_threads, agg_config, agg_db,
+                        config_hash);
+
+                    {
+                        ScopedTimer _vd(stages, "visitor_drain");
+                        for (auto& file_visitors :
+                             batch_result.extra_visitors) {
+                            for (auto& visitor : file_visitors) {
+                                auto* agg_visitor =
+                                    dynamic_cast<AggregationVisitor*>(
+                                        visitor.get());
+                                if (agg_visitor) {
+                                    for (const auto& k :
+                                         agg_visitor->observed_extra_keys())
+                                        merger->add_observed_extra_key(k);
+                                    for (const auto& m :
+                                         agg_visitor->observed_custom_metrics())
+                                        merger->add_observed_custom_metric(m);
+                                    auto output = agg_visitor->take_output();
+                                    merger->merge_chunk(std::move(output));
+                                }
                             }
+                            file_visitors.clear();
                         }
-
-                        co_return;
-                    });
+                    }
                 }
 
-                // Chunk workers: parallel aggregation
-                for (std::size_t w = 0; w < executor_threads; ++w) {
-                    (void)w;
-                    scope.spawn(
-                        [chunk_chan, rp = result_chan->producer(), result_chan](
-                            CoroScope& wctx) mutable -> coro::CoroTask<void> {
-                            [[maybe_unused]] auto producer_guard = rp.guard();
-                            while (auto input =
-                                       co_await wctx.receive(chunk_chan)) {
-                                ChunkAggregatorUtility agg;
-                                auto output = co_await agg.process(*input);
-                                if (!co_await result_chan->send(
-                                        std::move(output))) {
-                                    co_return;
-                                }
-                            }
-                            co_return;
-                        });
+                // Write tracking entries for processed files
+                {
+                    ScopedTimer _wt(stages, "write_tracking");
+                    write_aggregation_tracking(agg_db.get(), agg_config,
+                                               files_to_process,
+                                               shared_index_path);
                 }
-
-                // Streaming merger: incremental merge
-                auto* merger_ptr = &merger;
-                scope.spawn([result_chan, merger_ptr](
-                                CoroScope& mctx) -> coro::CoroTask<void> {
-                    while (auto output = co_await mctx.receive(result_chan)) {
-                        merger_ptr->merge_chunk(std::move(*output));
-                    }
-                    co_return;
-                });
-
-                co_return;
-            });
-
-            co_return;
-        },
-        "StreamingAggregate");
-
-    // Post-processing: finalize, resolve associations, write output
-    bool write_success = false;
-    EventAggregatorUtilityOutput agg_results;
-
-    auto post_task = make_task(
-        [&](CoroScope& /*ctx*/) -> coro::CoroTask<bool> {
-            agg_results = merger.finalize();
-
-            // Resolve associations
-            AssociationResolverInput resolver_input;
-            resolver_input.trackers = std::move(agg_results.trackers);
-            resolver_input.aggregations = std::move(agg_results);
-            resolver_input.config = agg_config;
-
-            AssociationResolverUtility resolver;
-            auto resolver_output =
-                co_await resolver.process(std::move(resolver_input));
-            agg_results = std::move(resolver_output.aggregations);
-
-            if (agg_results.aggregations.empty()) {
-                DFTRACER_UTILS_LOG_WARN("No aggregations to write!");
-                co_return false;
             }
 
-            bool success = false;
+            ScopedTimer _pp(stages, "post_processing");
 
 #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
             if (output_format == AggregationConfig::FORMAT_ARROW) {
                 using namespace utilities::common::arrow;
 
-                DFTRACER_UTILS_LOG_INFO(
-                    "Writing %zu aggregation keys to %s (Arrow IPC)...",
-                    agg_results.aggregations.size(), output_file.c_str());
+                std::unique_ptr<AssociationTracker> global_tracker;
+                {
+                    ScopedTimer _bt(stages, "build_global_tracker");
+                    global_tracker = merger->build_global_tracker();
+                }
+                (void)global_tracker;
+
+                EventAggregator::ObservedColumns obs;
+                {
+                    ScopedTimer _oc(stages, "observed_columns");
+                    obs = merger->observed_columns();
+                }
+                auto& global_extra_key_ids = obs.extra_key_ids;
+                auto& global_custom_metric_names = obs.custom_metric_names;
 
                 IpcWriter ipc;
-                if (ipc.open(output_file) != 0) {
+                if (co_await ipc.open(output_file) != 0) {
                     DFTRACER_UTILS_LOG_ERROR(
                         "Failed to open Arrow IPC file: %s",
                         output_file.c_str());
-                    co_return false;
-                }
+                } else {
+                    ScopedTimer _aw(stages, "arrow_scan_write");
+                    constexpr std::size_t BATCH_ROWS = 10000;
+                    AggregationBatch batch;
+                    batch.entries.reserve(BATCH_ROWS);
+                    batch.global_extra_key_ids = &global_extra_key_ids;
+                    batch.global_custom_metric_names =
+                        &global_custom_metric_names;
+
+                    std::vector<ArrowExportResult> pending_batches;
+                    merger->scan([&](AggMapType, const AggregationKey& key,
+                                     AggregationMetrics& metrics) {
+                        total_keys++;
+                        batch.entries.emplace_back(key, std::move(metrics));
+                        if (batch.entries.size() >= BATCH_ROWS) {
+                            pending_batches.push_back(batch.to_arrow());
+                            batch.entries.clear();
+                        }
+                        return true;
+                    });
+                    if (!batch.entries.empty()) {
+                        pending_batches.push_back(batch.to_arrow());
+                    }
 
-                constexpr std::size_t BATCH_ROWS = 10000;
-                AggregationBatch batch;
-                batch.entries.reserve(BATCH_ROWS);
-
-                bool arrow_write_failed = false;
-                for (auto& [key, metrics] : agg_results.aggregations) {
-                    batch.entries.emplace_back(key, metrics);
-                    if (batch.entries.size() >= BATCH_ROWS) {
-                        auto arrow_batch = batch.to_arrow();
-                        if (ipc.write_batch(arrow_batch) != 0) {
-                            DFTRACER_UTILS_LOG_ERROR(
-                                "Arrow IPC write_batch failed");
-                            arrow_write_failed = true;
+                    write_success = true;
+                    for (auto& ab : pending_batches) {
+                        if (co_await ipc.write_batch(ab) != 0) {
+                            write_success = false;
                             break;
                         }
-                        batch.entries.clear();
                     }
-                }
-                if (arrow_write_failed) {
-                    ipc.close();
-                    co_return false;
-                }
-                if (!batch.entries.empty()) {
-                    auto arrow_batch = batch.to_arrow();
-                    if (ipc.write_batch(arrow_batch) != 0) {
-                        DFTRACER_UTILS_LOG_ERROR(
-                            "Arrow IPC write_batch (final) failed");
-                        ipc.close();
-                        co_return false;
+                    if (write_success) {
+                        write_success = (co_await ipc.close() == 0);
+                    } else {
+                        co_await ipc.close();
                     }
                 }
-
-                success = (ipc.close() == 0);
             } else
 #endif
             {
-                // JSON / Perfetto output path
-                DFTRACER_UTILS_LOG_INFO(
-                    "Writing %zu aggregation keys to %s%s...",
-                    agg_results.aggregations.size(), output_file.c_str(),
-                    compress_output ? " (compressed)" : "");
-
-                PerfettoTraceWriterUtility writer;
-                PerfettoTraceWriterInput writer_input{
-                    output_file,
-                    std::move(resolver_output),
-                    agg_config.compute_statistics,
-                    agg_config.compute_percentiles,
-                    agg_config.percentiles,
-                    compress_output,
-                    compression_level,
-                    event_format};
-                success = co_await writer.process(writer_input);
-            }
-
-            if (success) {
-                DFTRACER_UTILS_LOG_INFO("Output written successfully to: %s",
-                                        output_file.c_str());
-                if (fs::exists(output_file)) {
-                    auto file_size = fs::file_size(output_file);
-                    DFTRACER_UTILS_LOG_INFO("File exists, size: %zu bytes",
-                                            file_size);
-                } else {
-                    DFTRACER_UTILS_LOG_ERROR(
-                        "File does not exist after write!");
-                    success = false;
+                PerfettoTraceWriterInput streaming_input;
+                {
+                    ScopedTimer _si(stages, "build_streaming_input");
+                    streaming_input = build_streaming_input(
+                        merger.get(), &agg_config, &output_file,
+                        compress_output, compression_level, event_format);
+                    streaming_input.keys_written = &perfetto_keys_written;
+                    streaming_input.merge_on_sharded = true;
                 }
-            } else {
-                DFTRACER_UTILS_LOG_ERROR("Failed to write output file");
+                {
+                    ScopedTimer _pw(stages, "perfetto_write");
+                    PerfettoTraceWriterUtility writer;
+                    write_success = co_await scope.spawn(
+                        writer, std::move(streaming_input));
+                }
+                total_keys = perfetto_keys_written.load();
             }
-
-            write_success = success;
-            co_return success;
         },
-        "PostProcess");
+        "AggregatorMain");
 
-    post_task->depends_on(streaming_task);
-    pipeline.set_source(streaming_task);
-    pipeline.set_destination(post_task);
-    pipeline.execute();
+    pipeline.set_source(main_task);
+    {
+        ScopedTimer _t(stages, "pipeline_execute");
+        pipeline.execute();
+    }
+
+    {
+        ScopedTimer _t(stages, "close_rocksdb");
+        merger.reset();
+        agg_db.reset();
+    }
 
-    auto end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double, std::milli> duration = end_time - start_time;
+    overall.stop();
+    double duration_ms = static_cast<double>(overall.elapsed()) / 1e6;
 
     std::printf("\n");
     std::printf("==========================================\n");
     std::printf("Aggregation Results\n");
     std::printf("==========================================\n");
-    std::printf("  Execution time: %.2f seconds\n", duration.count() / 1000.0);
-    std::printf("  Files processed: %zu\n", agg_results.total_files_processed);
-    std::printf("  Bytes processed: %.2f MB\n",
-                static_cast<double>(agg_results.total_bytes_processed) /
-                    (1024.0 * 1024.0));
-    std::printf("  Events processed: %zu\n",
-                agg_results.total_events_processed);
-    std::printf("  Unique aggregation keys: %zu\n",
-                agg_results.aggregations.size());
-    std::printf("  Throughput: %.2f MB/s, %.2f events/s\n",
-                (static_cast<double>(agg_results.total_bytes_processed) /
-                 (1024.0 * 1024.0)) /
-                    (duration.count() / 1000.0),
-                static_cast<double>(agg_results.total_events_processed) /
-                    (duration.count() / 1000.0));
+    std::printf("  Execution time: %.2f seconds\n", duration_ms / 1000.0);
+    std::printf("  Files: %zu total, %zu processed, %zu cached\n",
+                input_files.size(), files_to_process.size(), num_cached);
+    std::printf("  Unique aggregation keys: %zu\n", total_keys);
     std::printf("  Output file: %s\n", output_file.c_str());
     std::printf("  Write status: %s\n", write_success ? "SUCCESS" : "FAILED");
     std::printf("==========================================\n");
 
-    AggregatorSummaryUtility summary_writer;
-    summary_writer.process(agg_results);
+    if (stages) stages->print_stages();
 
-    if (!temp_index_dir.empty() && fs::exists(temp_index_dir)) {
-        DFTRACER_UTILS_LOG_INFO("Cleaning up temporary index directory: %s",
-                                temp_index_dir.c_str());
-        fs::remove_all(temp_index_dir);
-    }
-
-    co_return agg_results.success&& write_success ? 0 : 1;
+    co_return write_success ? 0 : 1;
 }
 
 int main(int argc, char** argv) {
     DFTRACER_UTILS_LOGGER_INIT();
 
-    auto default_checkpoint_size_str =
-        std::to_string(indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE) +
-        " B (" +
-        std::to_string(indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE /
-                       (1024 * 1024)) +
-        " MB)";
-
     argparse::ArgumentParser program("dftracer_aggregator",
                                      DFTRACER_UTILS_PACKAGE_VERSION);
     program.add_description(
         "Aggregate DFTracer events into time-series counters using streaming "
         "coroutine pipeline with minimal memory footprint");
 
-    program.add_argument("-d", "--directory")
-        .help("Input directory containing .pfw or .pfw.gz files")
-        .default_value<std::string>(".");
-
-    program.add_argument("-o", "--output")
-        .help("Output file path for aggregated counters")
-        .default_value<std::string>("aggregated_output.json");
-
-    program.add_argument("-t", "--time-interval")
-        .help("Time interval in milliseconds for bucketing (default: 5000)")
-        .scan<'g', double>()
-        .default_value(5000.0);
-
-    program.add_argument("-g", "--group-keys")
-        .help(
-            "Comma-separated extra group keys from args (e.g., "
-            "epoch,step,level)")
-        .default_value<std::string>("");
-
-    program.add_argument("-m", "--metric-fields")
-        .help(
-            "Comma-separated custom metric fields from args (e.g., "
-            "iter_count,num_events)")
-        .default_value<std::string>("");
-
-    program.add_argument("--query")
-        .help("Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')")
-        .default_value<std::string>("");
-
-    program.add_argument("-f", "--force").help("Force index recreation").flag();
-
-    program.add_argument("--checkpoint-size")
-        .help("Checkpoint size for indexing in bytes (default: " +
-              default_checkpoint_size_str + ")")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(
-            indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE));
-
-    program.add_argument("--executor-threads")
-        .help(
-            "Number of executor threads for parallel processing (default: "
-            "number of CPU cores)")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    program.add_argument("--index-dir")
-        .help("Directory to store index files (default: system temp directory)")
-        .default_value<std::string>("");
-
-    program.add_argument("--compress")
-        .help("Compress output using gzip")
-        .default_value(false)
-        .implicit_value(true);
-
-    program.add_argument("--compression-level")
-        .help("Gzip compression level (0-9, default: 6)")
-        .scan<'d', int>()
-        .default_value(6);
-
-    program.add_argument("--boundary-events")
-        .help(
-            "Boundary event configuration: event_name:value_field:output_name "
-            "(e.g., \"epoch.block:iter_count:epoch\")")
-        .default_value<std::string>("");
-
-    program.add_argument("--no-track-process-parents")
-        .help(
-            "Disable tracking of process parent relationships from fork/spawn")
-        .default_value(false)
-        .implicit_value(true);
-
-    program.add_argument("--chunk-size")
-        .help("Target chunk size in MB for parallel processing (default: 4)")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(4));
-
-    program.add_argument("--read-batch-size")
-        .help(
-            "Batch read size in MB for stream processing (default: 4, higher = "
-            "faster but more memory)")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(4));
-
-    program.add_argument("--event-format")
-        .help(
-            "Perfetto event format: 'counter' (ph=C, point-in-time, default), "
-            "'async' (ph=b/e, async tracks with overlaps), "
-            "'regular' (ph=X, duration events with original TID)")
-        .default_value<std::string>("counter");
-
-    program.add_argument("--compute-percentiles")
-        .help(
-            "Enable percentile/quantile computation using DDSketch (opt-in due "
-            "to memory overhead)")
-        .default_value(false)
-        .implicit_value(true);
-
-    program.add_argument("--percentiles")
-        .help(
-            "Comma-separated percentiles to compute (e.g., "
-            "\"0.25,0.5,0.75,0.90\" for P25, P50, P75, P90)")
-        .default_value<std::string>("0.25,0.5,0.75,0.90");
-
-    program.add_argument("--relative-accuracy")
-        .help(
-            "Relative accuracy for DDSketch percentile estimation "
-            "(default: 0.01 = 1%)")
-        .scan<'g', double>()
-        .default_value(0.01);
-
-    program.add_argument("--format")
-        .help(
-            "Output format: 'json' (Perfetto JSON, default) or "
-            "'arrow' (Arrow IPC file, .arrows extension)")
-        .default_value<std::string>("json");
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what());
-        std::fprintf(stderr, "%s\n", program.help().str().c_str());
-        return 1;
-    }
+    AggregatorArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
-    return run_aggregator(program).get();
+    return run_aggregator(&cli).get();
 }
diff --git a/src/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp b/src/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp
new file mode 100644
index 00000000..f482a80b
--- /dev/null
+++ b/src/dftracer/utils/binaries/dftracer_aggregator_mpi.cpp
@@ -0,0 +1,1199 @@
+// MPI driver for the distributed-SST aggregator.
+//
+// Pipeline DAG:
+//   scan -> phase_a -> phase_b -> phase_c -> merge
+// Each stage is its own task wired via depends_on(); MPI collectives
+// between stages sit inside the task bodies.
+
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/string_intern.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_key.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregators.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h>
+#include <dftracer/utils/utilities/fileio/parallel/layout.h>
+#include <dftracer/utils/utilities/fileio/parallel/merge.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+#include <dftracer/utils/utilities/indexer/index_builder_utility.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h>
+#include <fcntl.h>
+#include <mpi.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <vector>
+
+#include "common_cli.h"
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::utilities;
+using dftracer::utils::utilities::composites::dft::aggregators::
+    AGG_KEY_NUM_SHARDS;
+using dftracer::utils::utilities::composites::dft::aggregators::
+    AggregationConfig;
+using dftracer::utils::utilities::composites::dft::aggregators::
+    AggregationVisitor;
+using dftracer::utils::utilities::composites::dft::aggregators::
+    AssociationTracker;
+using dftracer::utils::utilities::composites::dft::aggregators::EventAggregator;
+using dftracer::utils::utilities::composites::dft::aggregators::
+    PerfettoEventFormat;
+using dftracer::utils::utilities::composites::dft::aggregators::
+    PerfettoTraceWriterInput;
+using dftracer::utils::utilities::composites::dft::aggregators::
+    PerfettoTraceWriterUtility;
+using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBatchSink;
+using dftracer::utils::utilities::indexer::IndexBuildBatchConfig;
+using dftracer::utils::utilities::indexer::IndexDatabase;
+using dftracer::utils::utilities::indexer::IndexDatabaseSstWriterContext;
+using dftracer::utils::utilities::indexer::SstArtifactRegistry;
+using dftracer::utils::utilities::indexer::internal::
+    enumerate_gzip_member_candidates;
+using dftracer::utils::utilities::indexer::internal::GzipMember;
+
+namespace {
+
+// CLI for the MPI aggregator driver: shared arg groups plus staging flags.
+class AggregatorMpiArgParse : public cli::ArgParse {
+   public:
+    // Shared argument groups, registered through schema() in the constructor.
+    cli::DirectoryArgs directory{
+        cli::DirMode::DEFAULT_DOT,
+        "Input directory containing .pfw or .pfw.gz files"};
+    cli::PipelineArgs pipeline;
+    cli::IndexingArgs indexing;
+
+    // Values of the driver-specific flags; filled in by post_parse().
+    std::string output;
+    std::string staging_dir;
+    std::string shared_staging_dir;
+    double time_interval = 5000.0;
+    bool keep_staging = false;
+
+    explicit AggregatorMpiArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.force_help = "Force index recreation";
+        indexing.index_dir_help =
+            "Directory to store the final index (shared across ranks)";
+        schema(directory, pipeline, indexing);
+    }
+
+   protected:
+    void register_args() override {
+        auto& args = parser();
+        args.add_argument("-o", "--output")
+            .help("Output file path for aggregated counters (gzip JSON)")
+            .default_value<std::string>("aggregated_output.json.gz");
+
+        args.add_argument("--staging-dir")
+            .help(
+                "Per-rank SST staging root. Defaults to <index_dir>/_staging. "
+                "Each rank writes to <staging_dir>/rank_<R>.")
+            .default_value<std::string>("");
+
+        args.add_argument("--shared-staging")
+            .help(
+                "Shared-FS staging root. When set and different from "
+                "--staging-dir, each rank moves its SSTs + tracker.bin from "
+                "the (node-local) staging dir to <shared-staging>/rank_<R> "
+                "before the coordinator ingest. Required for multi-node runs "
+                "where --staging-dir points at node-local NVMe.")
+            .default_value<std::string>("");
+
+        args.add_argument("-t", "--time-interval")
+            .help("Time interval in milliseconds for bucketing (default: 5000)")
+            .scan<'g', double>()
+            .default_value(5000.0);
+
+        args.add_argument("--keep-staging")
+            .help("Keep per-rank SST staging dirs after successful ingest")
+            .default_value(false)
+            .implicit_value(true);
+    }
+
+    void post_parse() override {
+        const auto& args = parser();
+        output = args.get<std::string>("--output");
+        staging_dir = args.get<std::string>("--staging-dir");
+        shared_staging_dir = args.get<std::string>("--shared-staging");
+        time_interval = args.get<double>("--time-interval");
+        keep_staging = args.get<bool>("--keep-staging");
+    }
+};
+
+std::vector<std::string> enumerate_inputs(const std::string& dir) {
+    // Sorted .pfw/.gz regular files in dir; unreadable entries are skipped.
+    std::vector<std::string> files;
+    std::error_code ec, entry_ec;  // iteration vs per-entry (ignored) errors
+    for (fs::directory_iterator it(dir, ec), end; !ec && it != end;
+         it.increment(ec)) {
+        if (!it->is_regular_file(entry_ec)) continue;
+        const auto ext = it->path().extension().string();
+        if (ext == ".pfw" || ext == ".gz") files.push_back(it->path().string());
+    }
+    std::sort(files.begin(), files.end());
+    return files;
+}
+
+std::vector<char> pack_paths(const std::vector<std::string>& paths) {
+    std::uint64_t needed = sizeof(std::uint64_t) * (paths.size() + 1);
+    for (const std::string& path : paths) needed += path.size();
+    std::vector<char> packed;
+    packed.reserve(needed);  // exact final size: no reallocation below
+    const auto put_u64 = [&packed](std::uint64_t value) {
+        const char* raw = reinterpret_cast<const char*>(&value);
+        packed.insert(packed.end(), raw, raw + sizeof(value));
+    };
+    put_u64(paths.size());  // header: number of paths
+    for (const std::string& path : paths) {
+        put_u64(path.size());  // length prefix, then the raw bytes
+        packed.insert(packed.end(), path.begin(), path.end());
+    }
+    return packed;
+}
+
+std::vector<std::string> unpack_paths(const std::vector<char>& buf) {
+    // Inverse of pack_paths(); stops at the first truncated/corrupt record.
+    std::vector<std::string> paths;
+    const char* p = buf.data();
+    const char* end = buf.data() + buf.size();
+    auto read_u64 = [&](std::uint64_t& out) -> bool {
+        if (end - p < static_cast<std::ptrdiff_t>(sizeof(out))) return false;
+        std::memcpy(&out, p, sizeof(out));
+        p += sizeof(out);
+        return true;
+    };
+    std::uint64_t n = 0;  // untrusted: capped by buffer size before reserve
+    if (!read_u64(n)) return paths;
+    paths.reserve(std::min<std::uint64_t>(n, buf.size() / sizeof(n)));
+    for (std::uint64_t i = 0; i < n; ++i) {
+        std::uint64_t len = 0;
+        // Unsigned compare: casting len to ptrdiff_t could wrap negative.
+        if (!read_u64(len) || len > static_cast<std::uint64_t>(end - p)) break;
+        paths.emplace_back(p, p + len);
+        p += len;
+    }
+    return paths;
+}
+
+void pack_artifacts(const IndexDatabaseSstWriterContext::Artifacts& a,
+                    std::vector<char>& buf) {
+    // Appends every SST path slot of `a` to `buf`, in a fixed order that
+    // unpack_artifacts() mirrors. A slot is one presence byte followed, when
+    // the path is set, by a host-order u64 length and the path bytes.
+    const std::optional<std::string>* const slots[] = {
+        &a.metadata_sst,
+        &a.checkpoints_sst,
+        &a.manifest_sst,
+        &a.chunk_bloom_sst,
+        &a.file_bloom_sst,
+        &a.chunk_stats_sst,
+        &a.chunk_dim_stats_sst,
+        &a.dimensions_sst,
+        &a.file_scalar_stats_sst,
+        &a.file_cat_counts_sst,
+        &a.file_pid_tid_counts_sst,
+        &a.file_name_counts_sst,
+        &a.name_dictionary_sst,
+        &a.name_file_postings_sst,
+        &a.name_chunk_postings_sst,
+        &a.hash_tables_sst,
+        &a.aggregation_sst,
+        &a.system_metrics_sst,
+    };
+    for (const auto* slot : slots) {
+        buf.push_back(*slot ? 1 : 0);
+        if (!*slot) continue;
+        const std::uint64_t len = (*slot)->size();
+        const char* raw = reinterpret_cast<const char*>(&len);
+        buf.insert(buf.end(), raw, raw + sizeof(len));
+        buf.insert(buf.end(), (*slot)->begin(), (*slot)->end());
+    }
+}
+
+bool unpack_artifacts(const char*& p, const char* end,
+                      IndexDatabaseSstWriterContext::Artifacts& a) {
+    // Decodes one record written by pack_artifacts() from [p, end), advancing
+    // p past what was consumed. Each slot is a presence byte, then (if set) a
+    // u64 length and the path bytes. Returns false on truncated or
+    // inconsistent input; p then points at where decoding stopped.
+    auto read_opt = [&](std::optional<std::string>& s) -> bool {
+        if (p == end) return false;
+        if (*p++ == 0) return true;  // slot absent: s is left as it was
+        std::uint64_t len = 0;
+        if (end - p < static_cast<std::ptrdiff_t>(sizeof(len))) return false;
+        std::memcpy(&len, p, sizeof(len));
+        p += sizeof(len);
+        // Unsigned compare: casting len to ptrdiff_t could wrap negative.
+        if (len > static_cast<std::uint64_t>(end - p)) return false;
+        s.emplace(p, p + len);
+        p += len;
+        return true;
+    };
+    return read_opt(a.metadata_sst) && read_opt(a.checkpoints_sst) &&
+           read_opt(a.manifest_sst) && read_opt(a.chunk_bloom_sst) &&
+           read_opt(a.file_bloom_sst) && read_opt(a.chunk_stats_sst) &&
+           read_opt(a.chunk_dim_stats_sst) && read_opt(a.dimensions_sst) &&
+           read_opt(a.file_scalar_stats_sst) &&
+           read_opt(a.file_cat_counts_sst) &&
+           read_opt(a.file_pid_tid_counts_sst) &&
+           read_opt(a.file_name_counts_sst) &&
+           read_opt(a.name_dictionary_sst) &&
+           read_opt(a.name_file_postings_sst) &&
+           read_opt(a.name_chunk_postings_sst) && read_opt(a.hash_tables_sst) &&
+           read_opt(a.aggregation_sst) && read_opt(a.system_metrics_sst);
+}
+
+std::vector<char> pack_artifact_list(
+    const std::vector<IndexDatabaseSstWriterContext::Artifacts>& main_artifacts,
+    const std::vector<IndexDatabaseSstWriterContext::Artifacts>&
+        agg_artifacts) {
+    // Layout: u64 count of non-empty records, then main records, then agg.
+    using ArtifactList = std::vector<IndexDatabaseSstWriterContext::Artifacts>;
+    const ArtifactList* const groups[] = {&main_artifacts, &agg_artifacts};
+    std::uint64_t kept = 0;
+    for (const ArtifactList* group : groups)
+        for (const auto& rec : *group) kept += rec.empty() ? 0 : 1;
+    std::vector<char> buf;
+    const char* raw = reinterpret_cast<const char*>(&kept);
+    buf.insert(buf.end(), raw, raw + sizeof(kept));
+    for (const ArtifactList* group : groups)
+        for (const auto& rec : *group)
+            if (!rec.empty()) pack_artifacts(rec, buf);
+    return buf;
+}
+
+bool append_artifact_list(const char* p, const char* end,
+                          SstArtifactRegistry& registry) {
+    // Reads a pack_artifact_list() payload from [p, end) into the registry.
+    std::uint64_t remaining = 0;
+    if (end - p < static_cast<std::ptrdiff_t>(sizeof(remaining))) return false;
+    std::memcpy(&remaining, p, sizeof(remaining));
+    p += sizeof(remaining);
+    while (remaining-- > 0) {
+        IndexDatabaseSstWriterContext::Artifacts record;
+        if (!unpack_artifacts(p, end, record)) return false;  // bad record
+        registry.append(std::move(record));
+    }
+    return true;
+}
+
+coro::CoroTask<void> scan_one_file(const std::string& path,
+                                   std::vector<GzipMember>& out) {
+    // Best-effort member scan of one file: `out` is left untouched when the
+    // file cannot be opened or stat'ed, or is shorter than 18 bytes.
+    struct Fd { int v; ~Fd() { if (v >= 0) ::close(v); } };
+    const Fd fd{::open(path.c_str(), O_RDONLY)};  // closed on every exit path
+    struct stat st;
+    if (fd.v < 0 || ::fstat(fd.v, &st) != 0 || st.st_size < 18) co_return;
+    co_await enumerate_gzip_member_candidates(
+        fd.v, static_cast<std::uint64_t>(st.st_size), out);
+}
+
+struct WorkUnit {  // contiguous slice of one file's gzip members
+    std::size_t file_idx;      // index of the file in the per-file member map
+    std::size_t member_begin;  // first member of the slice (inclusive)
+    std::size_t member_end;    // one past the last member of the slice
+    std::uint64_t c_size;      // sum of c_size over the slice's members
+};
+
+// Cuts each file's member list into consecutive slices, closing a slice once
+// its compressed bytes reach target_c_size (a file's last slice may be less).
+// Pure function of its arguments, so every rank derives the same units.
+std::vector<WorkUnit> build_work_units(
+    const std::vector<std::vector<GzipMember>>& per_file_members,
+    std::uint64_t target_c_size) {
+    std::vector<WorkUnit> units;
+    for (std::size_t fi = 0; fi < per_file_members.size(); ++fi) {
+        const auto& members = per_file_members[fi];
+        std::size_t slice_start = 0;
+        std::uint64_t slice_bytes = 0;
+        for (std::size_t next = 1; next <= members.size(); ++next) {
+            slice_bytes += members[next - 1].c_size;
+            // A zero target disables splitting: one slice per non-empty file.
+            const bool full = target_c_size > 0 && slice_bytes >= target_c_size;
+            if (!full && next != members.size()) continue;
+            units.push_back({fi, slice_start, next, slice_bytes});
+            slice_start = next;
+            slice_bytes = 0;
+        }
+    }
+    return units;
+}
+
+std::vector<int> lpt_assign_units(const std::vector<WorkUnit>& units,
+                                  int num_ranks) {
+    // LPT: heaviest unit first, each to the currently least-loaded rank.
+    std::vector<std::size_t> by_size(units.size());
+    for (std::size_t i = 0; i < by_size.size(); ++i) by_size[i] = i;
+    const auto heavier = [&units](std::size_t lhs, std::size_t rhs) {
+        const WorkUnit& x = units[lhs];
+        const WorkUnit& y = units[rhs];
+        if (x.c_size != y.c_size) return x.c_size > y.c_size;
+        if (x.file_idx != y.file_idx) return x.file_idx < y.file_idx;
+        return x.member_begin < y.member_begin;
+    };
+    std::sort(by_size.begin(), by_size.end(), heavier);
+    std::vector<std::uint64_t> loads(num_ranks, 0);
+    std::vector<int> owner(units.size(), 0);
+    for (const std::size_t unit : by_size) {
+        const auto lightest = std::min_element(loads.begin(), loads.end());
+        owner[unit] = static_cast<int>(lightest - loads.begin());
+        *lightest += std::max<std::uint64_t>(units[unit].c_size, 1);
+    }
+    return owner;
+}
+
+// Shared state threaded through the DAG via reference capture.
+struct RunCtx {
+    int rank = 0;  // task_scan scans the files whose index % size == rank
+    int size = 1;  // task_scan sizes its per-rank MPI gather buffers with it
+    const AggregatorMpiArgParse* cli = nullptr;  // presumably caller-owned
+
+    std::string index_dir;  // NOTE(review): path fields are set outside view
+    std::string staging_root;
+    std::string shared_staging_root;
+    std::string final_output;
+    std::string perfetto_shards_dir;
+    std::string my_shard_output;
+
+    std::vector<std::string> all_files;  // task_scan indexes it by file index
+    std::uint64_t nfiles = 0;  // file count task_scan loops over
+    std::vector<std::vector<GzipMember>> member_map;  // [file] -> members
+
+    std::vector<std::string> my_files;  // presumably this rank's share
+    std::vector<int> my_file_ids;
+    std::vector<IndexBuildBatchConfig::FileSlice> my_slices;
+
+    std::vector<IndexDatabaseSstWriterContext::Artifacts> main_artifacts;
+    std::vector<IndexDatabaseSstWriterContext::Artifacts> agg_artifacts;
+
+    bool failed = false;  // task_scan returns immediately when this is set
+    double scan_ms = 0.0;  // presumably per-stage wall time in ms; confirm
+    double phase_a_ms = 0.0;
+    double phase_b_ms = 0.0;
+    double phase_c_ms = 0.0;
+};
+
+// ---- scan task: co-operative gzip member pre-scan + LPT assignment ----
+// Cooperative pre-scan followed by work assignment.
+//
+// Rank r scans every input whose index i satisfies i % ctx.size == r. The
+// per-file member lists are then exchanged between all ranks as raw native
+// uint64 values (MPI_Allgather for the sizes, MPI_Allgatherv for the data),
+// so each rank ends up with the full ctx.member_map and runs
+// build_work_units / lpt_assign_units on it. Units owned by this rank are
+// appended to ctx.my_files, ctx.my_file_ids (file index + 1) and
+// ctx.my_slices. Returns immediately when ctx.failed is already set.
+coro::CoroTask<void> task_scan(RunCtx& ctx, CoroScope& scope) {
+    if (ctx.failed) co_return;
+    const auto started = std::chrono::steady_clock::now();
+    const auto world = static_cast<std::uint64_t>(ctx.size);
+
+    // Round-robin ownership of the scan itself: file idx -> rank idx % size.
+    std::vector<std::size_t> owned;
+    for (std::uint64_t idx = 0; idx < ctx.nfiles; ++idx) {
+        if (static_cast<int>(idx % world) != ctx.rank) continue;
+        owned.push_back(idx);
+    }
+    std::vector<std::vector<GzipMember>> scanned(owned.size());
+
+    // One child task per owned file; each writes into its own result slot.
+    co_await scope.scope([&](CoroScope& child) -> coro::CoroTask<void> {
+        std::size_t slot = 0;
+        for (const std::size_t file_idx : owned) {
+            const std::string& path = ctx.all_files[file_idx];
+            auto& dest = scanned[slot++];
+            child.spawn([path, &dest](CoroScope&) -> coro::CoroTask<void> {
+                co_await scan_one_file(path, dest);
+            });
+        }
+        co_return;
+    });
+
+    // Wire layout: count, then per file {file index, member count,
+    // (c_offset, c_size) * member count}, all as native uint64.
+    std::vector<char> payload;
+    const auto append_u64 = [&payload](std::uint64_t v) {
+        const char* raw = reinterpret_cast<const char*>(&v);
+        payload.insert(payload.end(), raw, raw + sizeof(v));
+    };
+    append_u64(owned.size());
+    for (std::size_t k = 0; k < owned.size(); ++k) {
+        const auto& members = scanned[k];
+        append_u64(static_cast<std::uint64_t>(owned[k]));
+        append_u64(static_cast<std::uint64_t>(members.size()));
+        for (const auto& m : members) {
+            append_u64(m.c_offset);
+            append_u64(m.c_size);
+        }
+    }
+
+    // Exchange payload sizes first, then the payloads themselves.
+    const int local_bytes = static_cast<int>(payload.size());
+    std::vector<int> bytes_per_rank(ctx.size, 0);
+    MPI_Allgather(&local_bytes, 1, MPI_INT, bytes_per_rank.data(), 1, MPI_INT,
+                  MPI_COMM_WORLD);
+    std::vector<int> offsets(ctx.size, 0);
+    int gathered_bytes = 0;
+    for (int r = 0; r < ctx.size; ++r) {
+        offsets[r] = gathered_bytes;
+        gathered_bytes += bytes_per_rank[r];
+    }
+    std::vector<char> all_payload(gathered_bytes);
+    MPI_Allgatherv(payload.data(), local_bytes, MPI_CHAR, all_payload.data(),
+                   bytes_per_rank.data(), offsets.data(), MPI_CHAR,
+                   MPI_COMM_WORLD);
+
+    // Decode every rank's section; a short or out-of-range record ends the
+    // decoding of that section.
+    ctx.member_map.assign(ctx.nfiles, {});
+    for (int r = 0; r < ctx.size; ++r) {
+        const char* cur = all_payload.data() + offsets[r];
+        const char* const stop = cur + bytes_per_rank[r];
+        const auto take_u64 = [&cur, stop](std::uint64_t& v) -> bool {
+            if (stop - cur < static_cast<std::ptrdiff_t>(sizeof(v))) {
+                return false;
+            }
+            std::memcpy(&v, cur, sizeof(v));
+            cur += sizeof(v);
+            return true;
+        };
+        std::uint64_t nrecords = 0;
+        if (!take_u64(nrecords)) continue;
+        for (std::uint64_t k = 0; k < nrecords; ++k) {
+            std::uint64_t file_idx = 0;
+            std::uint64_t nmembers = 0;
+            if (!take_u64(file_idx) || !take_u64(nmembers)) break;
+            if (file_idx >= ctx.nfiles) break;
+            auto& members = ctx.member_map[file_idx];
+            members.resize(nmembers);
+            for (auto& m : members) {
+                if (!take_u64(m.c_offset)) break;
+                if (!take_u64(m.c_size)) break;
+            }
+        }
+    }
+
+    // Fallback for plain .pfw / unreadable / non-dftracer gzip: treat the
+    // whole file as a single member (size 0 when it cannot be stat'ed).
+    std::uint64_t total_compressed = 0;
+    for (std::uint64_t idx = 0; idx < ctx.nfiles; ++idx) {
+        auto& members = ctx.member_map[idx];
+        if (members.empty()) {
+            std::error_code ec;
+            std::uint64_t whole = fs::file_size(ctx.all_files[idx], ec);
+            if (ec) whole = 0;
+            members.push_back({0, whole});
+        }
+        for (const auto& m : members) total_compressed += m.c_size;
+    }
+
+    // Ceiling division of the total compressed bytes over the ranks.
+    const std::uint64_t target_per_rank =
+        (total_compressed + world - 1) / std::max<std::uint64_t>(world, 1);
+    const auto units = build_work_units(ctx.member_map, target_per_rank);
+    const auto owner = lpt_assign_units(units, ctx.size);
+
+    // Disambiguate (file_id, checkpoint_idx) across slices.
+    constexpr std::uint64_t CKPT_STRIDE = 1u << 20;
+    for (std::size_t ui = 0; ui < units.size(); ++ui) {
+        if (owner[ui] != ctx.rank) continue;
+        const auto& unit = units[ui];
+        ctx.my_files.push_back(ctx.all_files[unit.file_idx]);
+        ctx.my_file_ids.push_back(static_cast<int>(unit.file_idx + 1));
+
+        IndexBuildBatchConfig::FileSlice slice;
+        slice.members = &ctx.member_map[unit.file_idx];
+        slice.member_begin = unit.member_begin;
+        slice.member_end = unit.member_end;
+        slice.checkpoint_idx_base =
+            static_cast<std::uint64_t>(unit.member_begin) * CKPT_STRIDE;
+        // Only the first slice of a file persists file-scoped CFs.
+        slice.skip_file_scoped_writes = (unit.member_begin != 0);
+        ctx.my_slices.push_back(slice);
+    }
+
+    const auto elapsed = std::chrono::steady_clock::now() - started;
+    ctx.scan_ms = std::chrono::duration<double, std::milli>(elapsed).count();
+    if (ctx.rank == 0) {
+        std::printf(
+            "[rank 0] pre-scan %.2f ms: files=%llu work_units=%zu "
+            "target_per_rank=%llu bytes total=%llu bytes\n",
+            ctx.scan_ms, static_cast<unsigned long long>(ctx.nfiles),
+            units.size(), static_cast<unsigned long long>(target_per_rank),
+            static_cast<unsigned long long>(total_compressed));
+    }
+    std::printf("[rank %d/%d] files=%zu (work_units)\n", ctx.rank, ctx.size,
+                ctx.my_files.size());
+    std::fflush(stdout);
+    co_return;
+}
+
+// ---- phase_a task: distributed-SST index + aggregate build ----
+// Phase A: build this rank's index + aggregation artifacts.
+//
+// When the rank owns at least one work unit it
+//   1. runs IndexBatchBuilderUtility::process over ctx.my_files/my_slices,
+//      writing SSTs below <staging_root>/rank_<rank>;
+//   2. collects the committed sink artifacts into ctx.main_artifacts and the
+//      AggregationVisitor artifacts into ctx.agg_artifacts;
+//   3. merges the visitors' local trackers and writes them to tracker.bin;
+//   4. if a distinct shared staging root is configured, moves the artifacts
+//      and tracker.bin to <shared_staging_root>/rank_<rank>.
+// All ranks then agree on success via MPI_Allreduce(MIN); on any failure
+// ctx.failed is set on every rank. Returns immediately when ctx.failed is
+// already set.
+coro::CoroTask<void> task_phase_a(RunCtx& ctx, CoroScope& scope) {
+    if (ctx.failed) co_return;
+
+    const auto t0 = std::chrono::steady_clock::now();
+    bool ok = true;
+
+    if (!ctx.my_files.empty()) {
+        const std::string rank_staging =
+            (fs::path(ctx.staging_root) / ("rank_" + std::to_string(ctx.rank)))
+                .string();
+        std::error_code ec;
+        fs::create_directories(rank_staging, ec);
+
+        auto agg_config = std::make_shared<AggregationConfig>();
+        agg_config->time_interval_us =
+            static_cast<std::uint64_t>(ctx.cli->time_interval * 1000.0);
+        agg_config->compute_statistics = true;
+        agg_config->track_process_parents = true;
+        agg_config->track_default_args = true;
+
+        // Atomic: write_phase spawns N concurrent write workers; a plain
+        // size_t would let two workers share an idx and clobber each
+        // other's SSTs ("Bad table magic number" at ingest).
+        auto batch_counter = std::make_shared<std::atomic<std::size_t>>(0);
+        struct SharedArtifacts {
+            std::mutex mu;
+            std::vector<IndexDatabaseSstWriterContext::Artifacts> list;
+        };
+        auto artifacts_shared = std::make_shared<SharedArtifacts>();
+
+        auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+        batch_config->file_paths = ctx.my_files;
+        batch_config->preassigned_file_ids = ctx.my_file_ids;
+        batch_config->file_slices = ctx.my_slices;
+        batch_config->index_dir = ctx.index_dir;
+        batch_config->checkpoint_size = ctx.cli->indexing.checkpoint_size;
+        batch_config->force_rebuild = ctx.cli->indexing.force;
+        batch_config->build_manifest = false;
+        batch_config->parallelism = ctx.cli->pipeline.executor_threads;
+        batch_config->rebuild_root_summaries = false;
+
+        const std::string batch_id = "r" + std::to_string(ctx.rank);
+        // Each call yields a fresh AggregationVisitor for the given file.
+        batch_config->dft_visitor_factory =
+            [rank_staging, batch_id, agg_config](const std::string& file_path)
+            -> std::vector<std::unique_ptr<composites::dft::DftEventVisitor>> {
+            std::vector<std::unique_ptr<composites::dft::DftEventVisitor>> v;
+            v.push_back(std::make_unique<AggregationVisitor>(
+                rank_staging, batch_id + "_agg", 0, *agg_config, file_path));
+            return v;
+        };
+        // Each sink gets a unique "<batch_id>_<n>" name from the counter.
+        batch_config->sink_factory =
+            [rank_staging, batch_id,
+             batch_counter]() -> std::unique_ptr<IndexBatchSink> {
+            const std::size_t idx =
+                batch_counter->fetch_add(1, std::memory_order_relaxed);
+            return std::make_unique<IndexDatabaseSstWriterContext>(
+                rank_staging, batch_id + "_" + std::to_string(idx));
+        };
+        // Commit outside the lock; only the list append is serialized.
+        batch_config->sink_commit = [artifacts_shared](IndexBatchSink& sink) {
+            auto& sst = static_cast<IndexDatabaseSstWriterContext&>(sink);
+            auto a = sst.commit();
+            std::lock_guard<std::mutex> lock(artifacts_shared->mu);
+            if (!a.empty()) artifacts_shared->list.push_back(std::move(a));
+        };
+
+        auto batch_result =
+            co_await IndexBatchBuilderUtility::process(&scope, batch_config);
+
+        // Report only the first failing file, then mark the phase failed.
+        if (batch_result.failed > 0) {
+            for (const auto& r : batch_result.results) {
+                if (!r.success) {
+                    std::fprintf(
+                        stderr, "[rank %d] build failed: %s (file=%s)\n",
+                        ctx.rank, r.error_message.c_str(), r.file_path.c_str());
+                    ok = false;
+                    break;
+                }
+            }
+        }
+
+        if (ok) {
+            {
+                std::lock_guard<std::mutex> lock(artifacts_shared->mu);
+                ctx.main_artifacts = std::move(artifacts_shared->list);
+            }
+
+            // Visit each distinct AggregationVisitor once, even if the same
+            // pointer shows up under several files.
+            std::vector<AggregationVisitor*> seen;
+            for (auto& file_visitors : batch_result.extra_visitors) {
+                for (auto& v : file_visitors) {
+                    auto* agg = dynamic_cast<AggregationVisitor*>(v.get());
+                    if (!agg) continue;
+                    if (std::find(seen.begin(), seen.end(), agg) != seen.end())
+                        continue;
+                    seen.push_back(agg);
+                    for (auto& a : agg->aggregation_artifacts()) {
+                        if (!a.empty())
+                            ctx.agg_artifacts.push_back(std::move(a));
+                    }
+                }
+            }
+
+            AssociationTracker combined;
+            for (auto* agg : seen) {
+                auto out = agg->take_output();
+                if (out.local_tracker) combined.merge(*out.local_tracker);
+            }
+            combined.finalize();
+            const std::string serialized = combined.serialize();
+            const std::string tracker_local =
+                (fs::path(rank_staging) / "tracker.bin").string();
+
+            // Phase B skips a tracker file it cannot open and would parse a
+            // truncated one, so a failed or short write here must fail the
+            // phase rather than be ignored.
+            bool tracker_written = false;
+            FILE* f = std::fopen(tracker_local.c_str(), "wb");
+            if (f) {
+                const std::size_t wrote =
+                    std::fwrite(serialized.data(), 1, serialized.size(), f);
+                const bool closed = (std::fclose(f) == 0);
+                tracker_written = closed && wrote == serialized.size();
+            }
+            if (!tracker_written) {
+                std::fprintf(stderr, "[rank %d] failed to write tracker %s\n",
+                             ctx.rank, tracker_local.c_str());
+                ok = false;
+            }
+
+            // Move per-rank artifacts from node-local staging to shared
+            // staging so rank 0 can ingest them from a path visible on
+            // every node. No-op when the two roots are the same.
+            if (ok && ctx.shared_staging_root != ctx.staging_root) {
+                const std::string rank_shared =
+                    (fs::path(ctx.shared_staging_root) /
+                     ("rank_" + std::to_string(ctx.rank)))
+                        .string();
+                std::error_code mec;
+                fs::create_directories(rank_shared, mec);
+                try {
+                    for (std::size_t i = 0; i < ctx.main_artifacts.size();
+                         ++i) {
+                        const std::string sub = (fs::path(rank_shared) /
+                                                 ("main_" + std::to_string(i)))
+                                                    .string();
+                        ctx.main_artifacts[i] =
+                            std::move(ctx.main_artifacts[i]).move_to(sub);
+                    }
+                    for (std::size_t i = 0; i < ctx.agg_artifacts.size(); ++i) {
+                        const std::string sub = (fs::path(rank_shared) /
+                                                 ("agg_" + std::to_string(i)))
+                                                    .string();
+                        ctx.agg_artifacts[i] =
+                            std::move(ctx.agg_artifacts[i]).move_to(sub);
+                    }
+                } catch (const std::exception& e) {
+                    std::fprintf(stderr,
+                                 "[rank %d] failed to relocate SSTs to shared "
+                                 "staging: %s\n",
+                                 ctx.rank, e.what());
+                    ok = false;
+                }
+                if (ok && !serialized.empty()) {
+                    const std::string tracker_shared =
+                        (fs::path(rank_shared) / "tracker.bin").string();
+                    std::error_code tec;
+                    fs::rename(tracker_local, tracker_shared, tec);
+                    if (tec) {
+                        // rename can fail (e.g. when the two roots are on
+                        // different filesystems); fall back to copy + remove.
+                        fs::copy_file(tracker_local, tracker_shared,
+                                      fs::copy_options::overwrite_existing,
+                                      tec);
+                        if (tec) {
+                            // Rank 0 reads the tracker from shared staging
+                            // only, so a tracker left behind is lost data.
+                            std::fprintf(stderr,
+                                         "[rank %d] failed to relocate "
+                                         "tracker to shared staging: %s\n",
+                                         ctx.rank, tec.message().c_str());
+                            ok = false;
+                        } else {
+                            // Best effort: the copy already succeeded.
+                            std::error_code rec;
+                            fs::remove(tracker_local, rec);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // MIN over the per-rank flags: one failing rank fails the phase for all.
+    int ok_int = ok ? 1 : 0, global = 0;
+    MPI_Allreduce(&ok_int, &global, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    if (!global) {
+        if (ctx.rank == 0)
+            std::fprintf(stderr, "Phase A failed on some rank\n");
+        ctx.failed = true;
+        co_return;
+    }
+
+    ctx.phase_a_ms = std::chrono::duration<double, std::milli>(
+                         std::chrono::steady_clock::now() - t0)
+                         .count();
+    std::printf(
+        "[rank %d/%d] Phase A done in %.2f ms: main_artifacts=%zu "
+        "agg_flushes=%zu\n",
+        ctx.rank, ctx.size, ctx.phase_a_ms, ctx.main_artifacts.size(),
+        ctx.agg_artifacts.size());
+    std::fflush(stdout);
+    co_return;
+}
+
+// ---- phase_b task: Gatherv + rank 0 bulk_ingest + tracker merge ----
+// Phase B: collect every rank's packed artifact list on rank 0 and ingest.
+//
+// All ranks take part in MPI_Gather / MPI_Gatherv. Rank 0 then registers the
+// artifacts, bulk-ingests them into the IndexDatabase at ctx.index_dir,
+// rebuilds root summaries, writes the aggregation config and file markers,
+// merges the per-rank tracker.bin files found under the shared staging root
+// and stores the result under "__tracker__". The outcome is broadcast from
+// rank 0; staging directories are removed unless keep_staging is set.
+// Returns immediately when ctx.failed is already set.
+coro::CoroTask<void> task_phase_b(RunCtx& ctx) {
+    if (ctx.failed) co_return;
+
+    const auto started = std::chrono::steady_clock::now();
+    const bool is_root = (ctx.rank == 0);
+    const std::vector<char> payload =
+        pack_artifact_list(ctx.main_artifacts, ctx.agg_artifacts);
+    const int local_bytes = static_cast<int>(payload.size());
+
+    // Sizes first, so rank 0 can lay out the receive buffer.
+    std::vector<int> bytes_per_rank(ctx.size, 0);
+    MPI_Gather(&local_bytes, 1, MPI_INT, bytes_per_rank.data(), 1, MPI_INT, 0,
+               MPI_COMM_WORLD);
+
+    std::vector<int> offsets(ctx.size, 0);
+    std::vector<char> gathered;
+    if (is_root) {
+        int running = 0;
+        for (int r = 0; r < ctx.size; ++r) {
+            offsets[r] = running;
+            running += bytes_per_rank[r];
+        }
+        gathered.resize(running);
+    }
+    char* const recv_buf = is_root ? gathered.data() : nullptr;
+    MPI_Gatherv(payload.data(), local_bytes, MPI_CHAR, recv_buf,
+                bytes_per_rank.data(), offsets.data(), MPI_CHAR, 0,
+                MPI_COMM_WORLD);
+
+    int ok = 1;
+    if (is_root) {
+        try {
+            SstArtifactRegistry registry;
+            for (int r = 0; r < ctx.size; ++r) {
+                const int nbytes = bytes_per_rank[r];
+                if (nbytes == 0) continue;
+                const char* first = gathered.data() + offsets[r];
+                if (append_artifact_list(first, first + nbytes, registry)) {
+                    continue;
+                }
+                std::fprintf(
+                    stderr,
+                    "[rank 0] failed to parse artifacts from rank %d\n", r);
+                ok = 0;
+                break;
+            }
+            if (ok) {
+                IndexDatabase db(ctx.index_dir);
+                db.bulk_ingest(registry, {});
+                db.rebuild_root_summaries();
+
+                const auto interval_us = static_cast<std::uint64_t>(
+                    ctx.cli->time_interval * 1000.0);
+                db.write_agg_global_config(interval_us);
+
+                // File ids are 1..nfiles.
+                std::vector<int> file_ids;
+                file_ids.reserve(ctx.nfiles);
+                for (std::uint64_t id = 1; id <= ctx.nfiles; ++id) {
+                    file_ids.push_back(static_cast<int>(id));
+                }
+                db.write_agg_file_markers(file_ids);
+
+                // Ranks without a readable or non-empty tracker file are
+                // skipped.
+                AssociationTracker unified;
+                for (int r = 0; r < ctx.size; ++r) {
+                    const std::string tracker_path =
+                        ctx.shared_staging_root + "/rank_" +
+                        std::to_string(r) + "/tracker.bin";
+                    std::ifstream in(tracker_path, std::ios::binary);
+                    if (!in) continue;
+                    std::string blob((std::istreambuf_iterator<char>(in)), {});
+                    if (blob.empty()) continue;
+                    unified.merge(AssociationTracker::deserialize(blob));
+                }
+                unified.finalize();
+                constexpr std::string_view TRACKER_KEY = "__tracker__";
+                db.db()->put(TRACKER_KEY, unified.serialize(),
+                             dftracer::utils::rocksdb::cf::AGGREGATION);
+
+                // Diagnostic: count aggregation CF keys right after ingest,
+                // split by shard-prefixed data vs special 0xFF-prefixed
+                // keys (global config / file markers / tracker). If the
+                // first bucket is 0 after ingest, the aggregation SSTs
+                // never actually landed in the CF.
+                {
+                    std::size_t shard_keys = 0;
+                    std::size_t special_keys = 0;
+                    auto cursor = db.db()->new_iterator(
+                        dftracer::utils::rocksdb::cf::AGGREGATION);
+                    for (cursor->SeekToFirst(); cursor->Valid();
+                         cursor->Next()) {
+                        auto key = cursor->key();
+                        const bool is_shard_key =
+                            key.size() >= 2 &&
+                            static_cast<std::uint8_t>(key[0]) < 0xFF;
+                        if (is_shard_key) {
+                            ++shard_keys;
+                        } else {
+                            ++special_keys;
+                        }
+                    }
+                    std::printf(
+                        "[rank 0] AGGREGATION CF after ingest: shard_keys=%zu "
+                        "special_keys=%zu\n",
+                        shard_keys, special_keys);
+                    std::fflush(stdout);
+                }
+            }
+        } catch (const std::exception& e) {
+            std::fprintf(stderr, "[rank 0] bulk_ingest failed: %s\n", e.what());
+            ok = 0;
+        }
+    }
+    MPI_Bcast(&ok, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+    const bool drop_staging = !ctx.cli->keep_staging;
+    if (is_root && drop_staging) {
+        std::error_code ec;
+        fs::remove_all(ctx.shared_staging_root, ec);
+    }
+    // Each rank drops its own node-local staging dir; rank 0's shared
+    // cleanup above only covers the shared-FS side.
+    if (drop_staging && ctx.shared_staging_root != ctx.staging_root) {
+        const fs::path rank_dir =
+            fs::path(ctx.staging_root) / ("rank_" + std::to_string(ctx.rank));
+        const std::string rank_local = rank_dir.string();
+        std::error_code ec;
+        fs::remove_all(rank_local, ec);
+    }
+
+    if (!ok) {
+        ctx.failed = true;
+        co_return;
+    }
+    const auto elapsed = std::chrono::steady_clock::now() - started;
+    ctx.phase_b_ms = std::chrono::duration<double, std::milli>(elapsed).count();
+    if (is_root) {
+        std::printf("[rank 0] Phase B done in %.2f ms (ingest %d ranks)\n",
+                    ctx.phase_b_ms, ctx.size);
+        std::fflush(stdout);
+    }
+    co_return;
+}
+
+// ---- phase_c task: per-rank shard-prefix perfetto write ----
+// Phase C: each rank writes its own slice of the output as
+// <perfetto_shards_dir>/rank_<NNNNN>.json.gz.
+//
+// The AGG_KEY_NUM_SHARDS shards are split into contiguous ranges
+// [total * rank / size, total * (rank + 1) / size), with the last rank's
+// range ending at the total. Rank 0 emits the header and the last rank the
+// footer. Success is agreed on with MPI_Allreduce(MIN); on failure
+// ctx.failed is set on every rank. Returns immediately when ctx.failed is
+// already set.
+coro::CoroTask<void> task_phase_c(RunCtx& ctx, CoroScope& scope) {
+    if (ctx.failed) co_return;
+
+    const auto started = std::chrono::steady_clock::now();
+    const std::string actual_index_path =
+        (fs::path(ctx.index_dir) / ".dftindex").string();
+
+    const std::uint16_t shards_total = AGG_KEY_NUM_SHARDS;
+    // First shard assigned to rank r (32-bit intermediate, truncated back).
+    const auto first_shard_of = [&ctx, shards_total](int r) {
+        return static_cast<std::uint16_t>(
+            static_cast<std::uint32_t>(shards_total) *
+            static_cast<std::uint32_t>(r) /
+            static_cast<std::uint32_t>(ctx.size));
+    };
+    const bool is_last_rank = (ctx.rank + 1 == ctx.size);
+    const std::uint16_t my_shard_begin = first_shard_of(ctx.rank);
+    const std::uint16_t my_shard_end =
+        is_last_rank ? shards_total : first_shard_of(ctx.rank + 1);
+
+    // Rank 0 creates the shard directory; the barrier keeps the other ranks
+    // from proceeding before that call has returned.
+    if (ctx.rank == 0) {
+        std::error_code ec;
+        fs::create_directories(ctx.perfetto_shards_dir, ec);
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    char shard_name[32];
+    std::snprintf(shard_name, sizeof(shard_name), "/rank_%05d.json.gz",
+                  ctx.rank);
+    ctx.my_shard_output = ctx.perfetto_shards_dir + shard_name;
+
+    AggregationConfig phase_c_config;
+    phase_c_config.time_interval_us =
+        static_cast<std::uint64_t>(ctx.cli->time_interval * 1000.0);
+    phase_c_config.compute_statistics = true;
+    phase_c_config.track_process_parents = true;
+    phase_c_config.track_default_args = true;
+
+    auto agg_db =
+        EventAggregator::open_read_only_with_merge_operator(actual_index_path);
+    composites::dft::aggregators::load_intern_dictionary(*agg_db);
+
+    EventAggregator aggregator(agg_db, 0);
+    auto tracker = aggregator.build_global_tracker();
+
+    PerfettoTraceWriterInput input;
+    input.output_path = ctx.my_shard_output;
+    input.aggregator = &aggregator;
+    input.agg_config = &phase_c_config;
+    // Read what is needed through the pointer before ownership moves into
+    // the input.
+    input.tracker = tracker.get();
+    input.root_pids = tracker->get_root_pids();
+    input.owned_tracker = std::move(tracker);
+    input.compute_statistics = true;
+    input.compute_percentiles = false;
+    input.compress = true;
+    input.compression_level = 6;
+    input.format = PerfettoEventFormat::COUNTER;
+    input.merge_on_sharded = true;
+    input.shard_begin = my_shard_begin;
+    input.shard_end = my_shard_end;
+    input.emit_header = (ctx.rank == 0);
+    input.emit_footer = (ctx.rank == ctx.size - 1);
+
+    PerfettoTraceWriterUtility writer;
+    const bool wrote_ok = co_await scope.spawn(writer, std::move(input));
+
+    int local_ok = wrote_ok ? 1 : 0;
+    int all_ok = 0;
+    MPI_Allreduce(&local_ok, &all_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    if (!all_ok) {
+        if (ctx.rank == 0)
+            std::fprintf(stderr, "Phase C failed on some rank\n");
+        ctx.failed = true;
+        co_return;
+    }
+
+    const auto elapsed = std::chrono::steady_clock::now() - started;
+    ctx.phase_c_ms = std::chrono::duration<double, std::milli>(elapsed).count();
+    std::printf("[rank %d/%d] Phase C scan+write done in %.2f ms\n", ctx.rank,
+                ctx.size, ctx.phase_c_ms);
+    std::fflush(stdout);
+    co_return;
+}
+
+// ---- merge task: striped parallel pwrite (Lustre/SSD) or sharded-serial ----
+// Merge step: combine the per-rank shard files into ctx.final_output.
+//
+// Shard sizes are exchanged with MPI_Allgather so each rank knows its byte
+// offset in the final file. Two strategies, chosen by detect_layout():
+//   - STRIPED: rank 0 creates and pre-sizes the output, then every rank
+//     copies its own shard to its offset with pwrite.
+//   - otherwise: rank 0 alone merges all shards via merge_shards().
+// In both cases the outcome is shared with every rank, so ctx.failed (and
+// the process exit code derived from it) is consistent across ranks.
+// Rank 0 removes the shard directory on success unless keep_staging is set.
+// Returns immediately when ctx.failed is already set.
+coro::CoroTask<void> task_merge(RunCtx& ctx) {
+    if (ctx.failed) co_return;
+    const auto t0 = std::chrono::steady_clock::now();
+
+    auto layout = fileio::parallel::detect_layout(ctx.final_output);
+    const bool striped = layout.layout == fileio::parallel::FileLayout::STRIPED;
+
+    // A shard that cannot be stat'ed contributes zero bytes.
+    std::uint64_t my_sz = 0;
+    {
+        std::error_code ec;
+        my_sz = fs::file_size(ctx.my_shard_output, ec);
+        if (ec) my_sz = 0;
+    }
+    std::vector<std::uint64_t> all_sizes(ctx.size, 0);
+    MPI_Allgather(&my_sz, 1, MPI_UINT64_T, all_sizes.data(), 1, MPI_UINT64_T,
+                  MPI_COMM_WORLD);
+    // This rank's offset is the sum of the sizes of all lower ranks.
+    std::uint64_t my_offset = 0, total_bytes = 0;
+    for (int r = 0; r < ctx.size; ++r) {
+        if (r < ctx.rank) my_offset += all_sizes[r];
+        total_bytes += all_sizes[r];
+    }
+
+    int ok = 1;
+    if (striped) {
+        if (ctx.rank == 0) {
+            int fd = ::open(ctx.final_output.c_str(),
+                            O_CREAT | O_WRONLY | O_TRUNC, 0644);
+            if (fd < 0 ||
+                ::ftruncate(fd, static_cast<off_t>(total_bytes)) != 0) {
+                std::fprintf(stderr, "[rank 0] failed to create %s\n",
+                             ctx.final_output.c_str());
+                ok = 0;
+            }
+            if (fd >= 0) ::close(fd);
+        }
+        MPI_Bcast(&ok, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        if (ok) {
+            int out_fd = ::open(ctx.final_output.c_str(), O_WRONLY);
+            int in_fd = ::open(ctx.my_shard_output.c_str(), O_RDONLY);
+            if (out_fd < 0 || in_fd < 0) {
+                ok = 0;
+            } else {
+                std::vector<char> buf(1 << 20);
+                off_t out_pos = static_cast<off_t>(my_offset);
+                while (ok) {
+                    const ssize_t n = ::read(in_fd, buf.data(), buf.size());
+                    if (n == 0) break;
+                    if (n < 0) {
+                        ok = 0;
+                        break;
+                    }
+                    // pwrite may write fewer bytes than requested; keep
+                    // writing the remainder instead of treating a short
+                    // write as a failure.
+                    const std::size_t chunk = static_cast<std::size_t>(n);
+                    std::size_t done = 0;
+                    while (done < chunk) {
+                        const ssize_t w = ::pwrite(out_fd, buf.data() + done,
+                                                   chunk - done, out_pos);
+                        if (w <= 0) {
+                            ok = 0;
+                            break;
+                        }
+                        done += static_cast<std::size_t>(w);
+                        out_pos += w;
+                    }
+                }
+            }
+            if (in_fd >= 0) ::close(in_fd);
+            if (out_fd >= 0) ::close(out_fd);
+        }
+        int global = 1;
+        MPI_Allreduce(&ok, &global, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+        ok = global;
+    } else {
+        if (ctx.rank == 0) {
+            std::vector<std::string> shards;
+            shards.reserve(ctx.size);
+            for (int r = 0; r < ctx.size; ++r) {
+                char rs[32];
+                std::snprintf(rs, sizeof(rs), "/rank_%05d.json.gz", r);
+                shards.emplace_back(ctx.perfetto_shards_dir + rs);
+            }
+            const int rc = co_await fileio::parallel::merge_shards(
+                ctx.final_output, shards);
+            if (rc != 0) ok = 0;
+        }
+        // Only rank 0 knows the result of the serial merge; share it so the
+        // other ranks do not report success after a failed merge.
+        MPI_Bcast(&ok, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    }
+
+    if (!ok) {
+        if (ctx.rank == 0) std::fprintf(stderr, "merge step failed\n");
+        ctx.failed = true;
+        co_return;
+    }
+
+    if (ctx.rank == 0) {
+        const double merge_ms = std::chrono::duration<double, std::milli>(
+                                    std::chrono::steady_clock::now() - t0)
+                                    .count();
+        std::printf(
+            "[rank 0] merge (%s, %llu bytes from %d ranks) -> %s (%.2f ms)\n",
+            striped ? "parallel-pwrite" : "sharded-serial",
+            static_cast<unsigned long long>(total_bytes), ctx.size,
+            ctx.final_output.c_str(), merge_ms);
+        std::fflush(stdout);
+        if (!ctx.cli->keep_staging) {
+            std::error_code ec;
+            fs::remove_all(ctx.perfetto_shards_dir, ec);
+        }
+    }
+    co_return;
+}
+
+// Driver body executed between MPI_Init_thread and MPI_Finalize.
+//
+// Parses the CLI, scales thread counts by the number of ranks per node,
+// resolves the index/staging/output paths, distributes the input file list
+// from rank 0, and runs the scan -> phase_a -> phase_b -> phase_c -> merge
+// pipeline.
+//
+// Returns 0 on success and 1 when CLI parsing fails, when rank 0 finds no
+// input files, or when any pipeline task set ctx.failed.
+int run(int argc, char** argv) {
+    DFTRACER_UTILS_LOGGER_INIT();
+
+    argparse::ArgumentParser program("dftracer_aggregator_mpi",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "MPI driver for the distributed-SST aggregator. Each rank produces "
+        "per-rank aggregation SSTs; rank 0 bulk-ingests and the ranks jointly "
+        "write the final gzip JSON output.");
+
+    AggregatorMpiArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
+
+    RunCtx ctx;
+    ctx.cli = &cli;
+    MPI_Comm_rank(MPI_COMM_WORLD, &ctx.rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &ctx.size);
+
+    // Per-node rank count via a node-local sub-communicator. Used to
+    // divide executor/io threads so N ranks on one node don't each try
+    // to spin up hardware_concurrency() compute threads (total cores)
+    // and oversubscribe by N.
+    MPI_Comm node_comm = MPI_COMM_NULL;
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, ctx.rank,
+                        MPI_INFO_NULL, &node_comm);
+    int ppn = 1;
+    if (node_comm != MPI_COMM_NULL) {
+        MPI_Comm_size(node_comm, &ppn);
+        MPI_Comm_free(&node_comm);
+    }
+    if (ppn > 1) {
+        const auto hw = dftracer_utils_hardware_concurrency();
+        const auto scaled = std::max<std::size_t>(
+            1, static_cast<std::size_t>(hw) / static_cast<std::size_t>(ppn));
+        // Heuristic: the argparse default for these flags is
+        // hardware_concurrency(). If the user didn't pass the flag, the
+        // parsed value equals the node-wide default -- scale it down.
+        // If the user set an explicit value we leave it alone.
+        if (cli.pipeline.executor_threads == static_cast<std::size_t>(hw)) {
+            cli.pipeline.executor_threads = scaled;
+        }
+        if (cli.pipeline.io_threads == static_cast<std::size_t>(hw)) {
+            cli.pipeline.io_threads = scaled;
+        }
+        if (ctx.rank == 0) {
+            std::printf(
+                "[rank 0] detected ppn=%d, executor_threads=%zu "
+                "io_threads=%zu (hw=%zu)\n",
+                ppn, cli.pipeline.executor_threads, cli.pipeline.io_threads,
+                static_cast<std::size_t>(hw));
+            std::fflush(stdout);
+        }
+    }
+
+    // Deterministic hash-based intern ids so the same string maps to the
+    // same id on every rank, keeping cross-rank aggregation keys identical.
+    composites::dft::aggregators::aggregation_intern()
+        .enable_deterministic_ids();
+
+    // Path resolution: index_dir defaults to the input directory, staging
+    // defaults to <index_dir>/_staging, and shared staging collapses onto
+    // staging when unset or identical.
+    ctx.index_dir = cli.indexing.index_dir;
+    if (ctx.index_dir.empty())
+        ctx.index_dir = fs::absolute(cli.directory.value).string();
+    ctx.staging_root = cli.staging_dir.empty()
+                           ? (fs::path(ctx.index_dir) / "_staging").string()
+                           : cli.staging_dir;
+    ctx.shared_staging_root = (cli.shared_staging_dir.empty() ||
+                               cli.shared_staging_dir == ctx.staging_root)
+                                  ? ctx.staging_root
+                                  : cli.shared_staging_dir;
+    // The final output always carries a ".gz" suffix.
+    ctx.final_output = fs::absolute(cli.output).string();
+    if (ctx.final_output.size() < 3 ||
+        ctx.final_output.substr(ctx.final_output.size() - 3) != ".gz") {
+        ctx.final_output += ".gz";
+    }
+    ctx.perfetto_shards_dir =
+        (fs::path(ctx.index_dir) / "_perfetto_shards").string();
+
+    std::vector<char> packed_files;
+    if (ctx.rank == 0) {
+        const std::string dir = fs::absolute(cli.directory.value).string();
+        ctx.all_files = enumerate_inputs(dir);
+        if (ctx.all_files.empty()) {
+            std::fprintf(stderr,
+                         "[rank 0] no .pfw/.pfw.gz files in %s, aborting\n",
+                         dir.c_str());
+            // Leave packed_files empty so the zero-size check below aborts
+            // on every rank, however pack_paths encodes an empty list.
+        } else {
+            packed_files = pack_paths(ctx.all_files);
+        }
+        std::error_code ec;
+        fs::create_directories(ctx.staging_root, ec);
+        if (ctx.shared_staging_root != ctx.staging_root)
+            fs::create_directories(ctx.shared_staging_root, ec);
+    }
+    // Broadcast the size first; a size of 0 is the "no inputs" signal.
+    std::uint64_t packed_size = packed_files.size();
+    MPI_Bcast(&packed_size, 1, MPI_UINT64_T, 0, MPI_COMM_WORLD);
+    if (packed_size == 0) return 1;
+    if (ctx.rank != 0) packed_files.resize(packed_size);
+    MPI_Bcast(packed_files.data(), static_cast<int>(packed_size), MPI_CHAR, 0,
+              MPI_COMM_WORLD);
+    if (ctx.rank != 0) ctx.all_files = unpack_paths(packed_files);
+    ctx.nfiles = ctx.all_files.size();
+
+    // Build the DAG: scan -> phase_a -> phase_b -> phase_c -> merge.
+    // Each task is scheduled independently; a downstream task only
+    // starts once its parent finishes.
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer MPI", cli.pipeline);
+    Pipeline pipeline(pipeline_config);
+
+    auto scan = make_task(
+        [&ctx](CoroScope& scope) -> coro::CoroTask<void> {
+            co_await task_scan(ctx, scope);
+        },
+        "scan");
+    auto phase_a = make_task(
+        [&ctx](CoroScope& scope) -> coro::CoroTask<void> {
+            co_await task_phase_a(ctx, scope);
+        },
+        "phase_a");
+    auto phase_b = make_task(
+        [&ctx](CoroScope&) -> coro::CoroTask<void> {
+            co_await task_phase_b(ctx);
+        },
+        "phase_b");
+    auto phase_c = make_task(
+        [&ctx](CoroScope& scope) -> coro::CoroTask<void> {
+            co_await task_phase_c(ctx, scope);
+        },
+        "phase_c");
+    auto merge = make_task(
+        [&ctx](CoroScope&) -> coro::CoroTask<void> {
+            co_await task_merge(ctx);
+        },
+        "merge");
+
+    phase_a->depends_on(scan);
+    phase_b->depends_on(phase_a);
+    phase_c->depends_on(phase_b);
+    merge->depends_on(phase_c);
+
+    pipeline.set_source(scan);
+    pipeline.set_destination(merge);
+    pipeline.execute();
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    return ctx.failed ? 1 : 0;
+}
+
+}  // namespace
+
+// Process entry point: initializes MPI with thread support, delegates to
+// run(), and finalizes MPI before returning run()'s exit code.
+// NOTE(review): MPI_THREAD_MULTIPLE is requested but anything down to
+// MPI_THREAD_FUNNELED is accepted -- confirm this matches the threads the
+// pipeline tasks issue MPI calls from.
+int main(int argc, char** argv) {
+    int thread_level = 0;
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &thread_level);
+    const bool threads_ok = (thread_level >= MPI_THREAD_FUNNELED);
+    if (!threads_ok) {
+        std::fprintf(stderr,
+                     "MPI does not support MPI_THREAD_FUNNELED (got %d), "
+                     "aborting\n",
+                     thread_level);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    const int exit_code = run(argc, argv);
+    MPI_Finalize();
+    return exit_code;
+}
diff --git a/src/dftracer/utils/binaries/dftracer_call_tree.cpp b/src/dftracer/utils/binaries/dftracer_call_tree.cpp
index 5c595d20..278e89d0 100644
--- a/src/dftracer/utils/binaries/dftracer_call_tree.cpp
+++ b/src/dftracer/utils/binaries/dftracer_call_tree.cpp
@@ -1,427 +1,538 @@
-/**
- * DFTracer Call Tree Utility
- * Standalone binary for building and analyzing call trees from DFTracer trace
- * files
- */
-
-#include <dftracer/utils/call_tree/call_tree.h>
+// Pipeline-driven call_tree binary.
+//
+// DAG:
+//   scan -> build -> merge -> hierarchy -> write_json
+//
+// scan      : enumerate inputs
+// build     : per-file CoroScope fan-out; each file ingests into its own
+//             local CallTree fragment (no shared mutation)
+// merge     : concatenate fragments into ctx.merged
+// hierarchy : per-process CoroScope fan-out; each ProcessCallTree is
+//             independent so parent-child build runs in parallel
+// write_json: per-worker serialization of process slices, ParallelWriter
+//             feeds io_backend for the actual writes
+
+#include <dftracer/utils/call_tree/internal/call_tree.h>
+#include <dftracer/utils/call_tree/internal/process_call_tree.h>
+#include <dftracer/utils/call_tree/internal/process_key.h>
+#include <dftracer/utils/call_tree/internal/trace_reader.h>
+#include <dftracer/utils/call_tree/json_serializer.h>
+#include <dftracer/utils/core/common/byte_view.h>
 #include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
+#include <dftracer/utils/utilities/fileio/parallel/merge.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <unistd.h>
 
 #include <algorithm>
-#include <argparse/argparse.hpp>
+#include <atomic>
 #include <chrono>
 #include <cstdio>
-#include <iostream>
-#include <map>
+#include <ctime>
+#include <memory>
 #include <string>
 #include <vector>
 
+#include "common_cli.h"
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::call_tree;
 
-/**
- * Collect trace files from directory or file list
- */
-static std::vector<std::string> collect_trace_files(
-    const std::vector<std::string>& inputs, bool recursive) {
-    std::vector<std::string> trace_files;
+namespace {
 
-    for (const auto& input : inputs) {
-        if (fs::is_directory(input)) {
-            if (recursive) {
-                for (const auto& entry :
-                     fs::recursive_directory_iterator(input)) {
-                    if (entry.is_regular_file()) {
-                        std::string path = entry.path().string();
-                        if ((path.size() >= 4 &&
-                             path.substr(path.size() - 4) == ".pfw") ||
-                            (path.size() >= 7 &&
-                             path.substr(path.size() - 7) == ".pfw.gz")) {
-                            trace_files.push_back(path);
-                        }
-                    }
-                }
-            } else {
-                for (const auto& entry : fs::directory_iterator(input)) {
-                    if (entry.is_regular_file()) {
-                        std::string path = entry.path().string();
-                        if ((path.size() >= 4 &&
-                             path.substr(path.size() - 4) == ".pfw") ||
-                            (path.size() >= 7 &&
-                             path.substr(path.size() - 7) == ".pfw.gz")) {
-                            trace_files.push_back(path);
-                        }
-                    }
-                }
-            }
-        } else if (fs::is_regular_file(input)) {
-            trace_files.push_back(input);
-        } else {
-            DFTRACER_UTILS_LOG_ERROR("Input not found or not accessible: %s",
-                                     input.c_str());
-        }
-    }
+// Command-line options of the call_tree binary. `pipeline` is handed to
+// schema() in the constructor; the other members mirror the arguments
+// declared in register_args() and are filled in by post_parse().
+class CallTreeArgParse : public cli::ArgParse {
+   public:
+    cli::PipelineArgs pipeline;
 
-    return trace_files;
-}
+    std::vector<std::string> inputs;  // positional "inputs"
+    bool recursive{false};            // -r / --recursive
+    std::string output;               // -o / --output, "" when not given
+    bool verbose{false};              // -v / --verbose
+    bool no_save{false};              // --no-save
+    bool gzip{false};                 // --gzip
 
-/**
- * Analyze call patterns in the tree
- */
-static void analyze_call_patterns(const std::vector<CallTreeNodeInfo>& nodes) {
-    printf("\n--- Call Pattern Analysis ---\n");
-
-    if (nodes.empty()) {
-        printf("No nodes to analyze\n");
-        return;
+    explicit CallTreeArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        schema(pipeline);
     }
 
-    // Find most frequently called functions
-    std::map<std::string, size_t> call_counts;
-    for (const auto& node : nodes) {
-        call_counts[node.name]++;
+   protected:
+    // Declares the binary-specific arguments. Registration order is kept
+    // as-is because it is the order in which they were originally declared.
+    void register_args() override {
+        parser()
+            .add_argument("inputs")
+            .nargs(argparse::nargs_pattern::at_least_one)
+            .help("Trace files (.pfw, .pfw.gz) or directories");
+        parser().add_argument("-r", "--recursive").flag();
+        parser()
+            .add_argument("-o", "--output")
+            .default_value<std::string>("")
+            .help("Output JSON path (Chrome Tracing)");
+        parser().add_argument("-v", "--verbose").flag();
+        parser().add_argument("--no-save").flag();
+        parser()
+            .add_argument("--gzip")
+            .flag()
+            .help("gzip the output (.gz appended if needed)");
     }
 
-    // Sort by frequency
-    std::vector<std::pair<std::string, size_t>> sorted_calls(
-        call_counts.begin(), call_counts.end());
-    std::sort(sorted_calls.begin(), sorted_calls.end(),
-              [](const auto& a, const auto& b) { return a.second > b.second; });
-
-    printf("Top 10 most frequently called functions:\n");
-    for (size_t i = 0; i < std::min(sorted_calls.size(), size_t(10)); i++) {
-        printf("  %2zu. %-30s : %zu calls\n", i + 1,
-               sorted_calls[i].first.c_str(), sorted_calls[i].second);
+    // Copies the parsed values out of the parser into the public members.
+    void post_parse() override {
+        // Value-carrying arguments first, then the boolean flags.
+        inputs = parser().get<std::vector<std::string>>("inputs");
+        output = parser().get<std::string>("--output");
+        recursive = parser().get<bool>("--recursive");
+        verbose = parser().get<bool>("--verbose");
+        no_save = parser().get<bool>("--no-save");
+        gzip = parser().get<bool>("--gzip");
     }
+};
+
+// Returns true when `path` ends in ".pfw" or ".pfw.gz". The comparison is
+// case-sensitive and looks only at the end of the string.
+bool is_trace_file(const std::string& path) {
+    static constexpr const char* kSuffixes[] = {".pfw", ".pfw.gz"};
+    for (const char* suffix : kSuffixes) {
+        const std::size_t len = std::char_traits<char>::length(suffix);
+        if (path.size() >= len &&
+            path.compare(path.size() - len, len, suffix) == 0)
+            return true;
+    }
+    return false;
 }
 
-/**
- * Analyze timing statistics
- */
-static void analyze_timing(const std::vector<CallTreeNodeInfo>& nodes) {
-    printf("\n--- Timing Analysis ---\n");
+// State shared by the pipeline stages of one run; every stage receives a
+// pointer to the same RunCtx.
+struct RunCtx {
+    const CallTreeArgParse* cli{nullptr};  // parsed options, not owned
 
-    if (nodes.empty()) {
-        printf("No nodes to analyze\n");
-        return;
+    std::vector<std::string> trace_files;  // sorted list built by task_scan
+    std::vector<std::unique_ptr<internal::CallTree>> per_file;  // task_build
+    internal::CallTree merged;                                  // task_merge
+    std::vector<internal::ProcessKey> process_keys;             // task_merge
+
+    std::string output_path;  // destination chosen in run()
+    bool failed{false};       // once set, later stages return early
+
+    // Elapsed time of each stage in milliseconds.
+    double scan_ms{0.0};
+    double build_ms{0.0};
+    double merge_ms{0.0};
+    double hier_ms{0.0};
+    double write_ms{0.0};
+};
+
+// Pipeline stage "scan": expands the command-line inputs into the sorted,
+// duplicate-free list ctx->trace_files.
+//
+// Directories are searched (recursively with --recursive) for names accepted
+// by is_trace_file(); a regular file named directly is taken as-is, without a
+// suffix check. Inputs that are neither are reported and skipped. Sets
+// ctx->failed when no trace file was found, and records the elapsed time in
+// ctx->scan_ms.
+coro::CoroTask<void> task_scan(RunCtx* ctx) {
+    const auto t0 = std::chrono::steady_clock::now();
+    for (const auto& in : ctx->cli->inputs) {
+        std::error_code ec;
+        if (fs::is_directory(in, ec)) {
+            if (ctx->cli->recursive) {
+                for (const auto& e : fs::recursive_directory_iterator(in, ec)) {
+                    if (e.is_regular_file(ec) &&
+                        is_trace_file(e.path().string()))
+                        ctx->trace_files.push_back(e.path().string());
+                }
+            } else {
+                for (const auto& e : fs::directory_iterator(in, ec)) {
+                    if (e.is_regular_file(ec) &&
+                        is_trace_file(e.path().string()))
+                        ctx->trace_files.push_back(e.path().string());
+                }
+            }
+        } else if (fs::is_regular_file(in, ec)) {
+            ctx->trace_files.push_back(in);
+        } else {
+            // Report unusable inputs instead of dropping them silently.
+            DFTRACER_UTILS_LOG_ERROR("Input not found or not accessible: %s",
+                                     in.c_str());
+        }
     }
-
-    // Calculate timing statistics
-    std::vector<std::uint64_t> durations;
-    durations.reserve(nodes.size());
-
-    for (const auto& node : nodes) {
-        durations.push_back(node.duration_us);
+    std::sort(ctx->trace_files.begin(), ctx->trace_files.end());
+    // The same path can arrive twice (listed twice, or listed and also found
+    // in a listed directory). Keep one copy so it is not ingested twice; only
+    // byte-identical path strings are recognized as duplicates.
+    ctx->trace_files.erase(
+        std::unique(ctx->trace_files.begin(), ctx->trace_files.end()),
+        ctx->trace_files.end());
+    if (ctx->trace_files.empty()) {
+        DFTRACER_UTILS_LOG_ERROR("%s", "no trace files found");
+        ctx->failed = true;
     }
-
-    std::sort(durations.begin(), durations.end());
-
-    std::uint64_t total = 0;
-    for (auto d : durations) {
-        total += d;
+    ctx->scan_ms = std::chrono::duration<double, std::milli>(
+                       std::chrono::steady_clock::now() - t0)
+                       .count();
+    if (ctx->cli->verbose && !ctx->failed) {
+        std::printf("[scan] %.2f ms: %zu files\n", ctx->scan_ms,
+                    ctx->trace_files.size());
+        std::fflush(stdout);
     }
-    double avg =
-        static_cast<double>(total) / static_cast<double>(durations.size());
-
-    std::uint64_t min_time = durations.front();
-    std::uint64_t max_time = durations.back();
-    std::uint64_t median = durations[durations.size() / 2];
-    std::uint64_t p95 = durations[static_cast<size_t>(
-        static_cast<double>(durations.size()) * 0.95)];
-    std::uint64_t p99 = durations[static_cast<size_t>(
-        static_cast<double>(durations.size()) * 0.99)];
-
-    printf("Duration statistics (milliseconds):\n");
-    printf("  Min:    %.3f ms\n", static_cast<double>(min_time) / 1000.0);
-    printf("  Max:    %.3f ms\n", static_cast<double>(max_time) / 1000.0);
-    printf("  Mean:   %.3f ms\n", avg / 1000.0);
-    printf("  Median: %.3f ms\n", static_cast<double>(median) / 1000.0);
-    printf("  95th:   %.3f ms\n", static_cast<double>(p95) / 1000.0);
-    printf("  99th:   %.3f ms\n", static_cast<double>(p99) / 1000.0);
+    co_return;
 }
 
-/**
- * Find critical path (longest duration calls)
- */
-static void find_critical_path(const std::vector<CallTreeNodeInfo>& nodes) {
-    printf("\n--- Critical Path (Longest Duration Calls) ---\n");
-
-    if (nodes.empty()) {
-        printf("No nodes to analyze\n");
-        return;
-    }
+// Reads the trace file at `path` into `tree` and adds the reader's
+// `processed` count to the shared counter `total`.
+coro::CoroTask<void> ingest_one_file(std::string path, internal::CallTree* tree,
+                                     std::atomic<std::size_t>* total) {
+    const auto result = co_await internal::read_trace_file_async(
+        std::move(path), tree, nullptr);
+    // Pure counter update; no ordering with other memory is requested.
+    total->fetch_add(result.processed, std::memory_order_relaxed);
+    co_return;
+}
 
-    // Find top 10 longest running calls
-    std::vector<CallTreeNodeInfo> sorted_nodes = nodes;
-    std::sort(sorted_nodes.begin(), sorted_nodes.end(),
-              [](const auto& a, const auto& b) {
-                  return a.duration_us > b.duration_us;
-              });
-
-    printf("Top 10 longest running calls:\n");
-    for (size_t i = 0; i < std::min(sorted_nodes.size(), size_t(10)); i++) {
-        const auto& node = sorted_nodes[i];
-        printf("  %2zu. %-30s [%-15s] - %10.3f ms (level %d)\n", i + 1,
-               node.name.c_str(), node.category.c_str(),
-               static_cast<double>(node.duration_us) / 1000.0, node.level);
+// Spawns one ingest task per trace file on `child`. Entry i of `paths` is
+// read into entry i of `per_file`; each spawned task carries its own copy of
+// the path string.
+coro::CoroTask<void> ingest_all_files(
+    CoroScope* child, const std::vector<std::string>* paths,
+    const std::vector<std::unique_ptr<internal::CallTree>>* per_file,
+    std::atomic<std::size_t>* total) {
+    const std::size_t file_count = paths->size();
+    for (std::size_t idx = 0; idx < file_count; ++idx) {
+        internal::CallTree* fragment = (*per_file)[idx].get();
+        child->spawn([file = (*paths)[idx], fragment,
+                      total](CoroScope&) mutable -> coro::CoroTask<void> {
+            co_await ingest_one_file(std::move(file), fragment, total);
+        });
     }
+    co_return;
 }
 
-/**
- * Analyze by category
- */
-static void analyze_by_category(const std::vector<CallTreeNodeInfo>& nodes) {
-    printf("\n--- Analysis by Category ---\n");
+// Pipeline stage "build": creates one initialized CallTree fragment per trace
+// file, ingests all files through a child scope, and records the elapsed time
+// in ctx->build_ms. Does nothing when ctx->failed is already set.
+coro::CoroTask<void> task_build(RunCtx* ctx, CoroScope* scope) {
+    if (ctx->failed) co_return;
+    const auto started = std::chrono::steady_clock::now();
 
-    if (nodes.empty()) {
-        printf("No nodes to analyze\n");
-        return;
+    const std::size_t file_count = ctx->trace_files.size();
+    auto& fragments = ctx->per_file;
+    fragments.clear();
+    fragments.reserve(file_count);
+    while (fragments.size() < file_count) {
+        auto fragment = std::make_unique<internal::CallTree>();
+        fragment->initialize();
+        fragments.push_back(std::move(fragment));
     }
 
-    std::map<std::string, size_t> category_counts;
-    std::map<std::string, std::uint64_t> category_durations;
-
-    for (const auto& node : nodes) {
-        category_counts[node.category]++;
-        category_durations[node.category] += node.duration_us;
+    // The closure only holds plain pointers, captured by value.
+    // NOTE(review): total_events is read right after the co_await, which
+    // presumes scope() resumes only once every spawned child has finished.
+    std::atomic<std::size_t> total_events{0};
+    co_await scope->scope(
+        [paths = &ctx->trace_files, trees = &ctx->per_file,
+         total = &total_events](CoroScope& child) mutable
+            -> coro::CoroTask<void> {
+            co_await ingest_all_files(&child, paths, trees, total);
+        });
+
+    using Millis = std::chrono::duration<double, std::milli>;
+    ctx->build_ms = Millis(std::chrono::steady_clock::now() - started).count();
+    if (ctx->cli->verbose) {
+        std::printf("[build] %.2f ms: %zu events across %zu files\n",
+                    ctx->build_ms, total_events.load(), file_count);
+        std::fflush(stdout);
     }
+    co_return;
+}
 
-    printf("Nodes by category:\n");
-    for (const auto& [category, count] : category_counts) {
-        double avg_duration =
-            static_cast<double>(category_durations[category]) /
-            static_cast<double>(count) / 1000.0;
-        printf("  %-20s: %6zu nodes, avg duration: %.3f ms\n", category.c_str(),
-               count, avg_duration);
+// Pipeline stage "merge": folds every per-file fragment into ctx->merged,
+// releases the fragments, and stores the merged tree's keys in
+// ctx->process_keys. Does nothing when ctx->failed is already set.
+coro::CoroTask<void> task_merge(RunCtx* ctx) {
+    using Millis = std::chrono::duration<double, std::milli>;
+
+    if (ctx->failed) co_return;
+    const auto started = std::chrono::steady_clock::now();
+
+    auto& merged = ctx->merged;
+    merged.initialize();
+    for (auto& fragment : ctx->per_file) {
+        if (!fragment) continue;
+        merged.merge_from(std::move(*fragment));
+    }
+    ctx->per_file.clear();
+    ctx->process_keys = merged.keys();
+
+    const auto elapsed = std::chrono::steady_clock::now() - started;
+    ctx->merge_ms = Millis(elapsed).count();
+    if (ctx->cli->verbose) {
+        std::printf("[merge] %.2f ms: %zu processes\n", ctx->merge_ms,
+                    ctx->process_keys.size());
+        std::fflush(stdout);
     }
+    co_return;
 }
 
-int main(int argc, char** argv) {
-    DFTRACER_UTILS_LOGGER_INIT();
+// Coroutine wrapper around CallTree::build_hierarchy_for_process for the
+// single process identified by `key`.
+coro::CoroTask<void> hier_one_process(internal::CallTree* tree,
+                                      internal::ProcessKey key) {
+    internal::CallTree& target = *tree;
+    target.build_hierarchy_for_process(key);
+    co_return;
+}
 
-    argparse::ArgumentParser program("dftracer_call_tree",
-                                     DFTRACER_UTILS_PACKAGE_VERSION);
-    program.add_description(
-        "DFTracer Call Tree utility - builds and analyzes call trees from "
-        "DFTracer trace files");
-
-    // Input files/directories
-    program.add_argument("inputs")
-        .help(
-            "Trace files (.pfw, .pfw.gz) or directories containing trace files")
-        .nargs(argparse::nargs_pattern::at_least_one);
-
-    // Processing options
-    program.add_argument("-r", "--recursive")
-        .help("Recursively search directories for trace files")
-        .flag();
-
-    program.add_argument("--pattern")
-        .help("File pattern for trace files (default: *.pfw.gz)")
-        .default_value(std::string("*.pfw.gz"));
-
-    // Output options
-    program.add_argument("-o", "--output")
-        .help(
-            "Output file path for serialized call tree (default: "
-            "auto-generated from input)")
-        .default_value(std::string(""));
-
-    program.add_argument("--json")
-        .help("Also save call tree in JSON (Chrome Tracing) format")
-        .flag();
-
-    program.add_argument("--text")
-        .help("Export call tree to text file")
-        .default_value(std::string(""));
-
-    // Analysis options
-    program.add_argument("--max-depth")
-        .help("Maximum depth for tree printing (0=unlimited)")
-        .default_value(0)
-        .scan<'i', int>();
-
-    program.add_argument("--analyze")
-        .help(
-            "Perform detailed analysis (call patterns, timing, critical path)")
-        .flag();
-
-    program.add_argument("-v", "--verbose")
-        .help("Enable verbose output")
-        .flag();
-
-    program.add_argument("--stats-only")
-        .help("Only print statistics, skip tree traversal")
-        .flag();
-
-    program.add_argument("--no-save")
-        .help("Don't save output files, only print analysis")
-        .flag();
-
-    // Parse arguments
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        std::cerr << err.what() << std::endl;
-        std::cerr << program;
-        return 1;
+// Spawns one hierarchy-building task per entry of `keys` on `child`; every
+// task gets its own copy of the key.
+coro::CoroTask<void> hier_all_processes(
+    CoroScope* child, internal::CallTree* tree,
+    const std::vector<internal::ProcessKey>* keys) {
+    const std::size_t key_count = keys->size();
+    for (std::size_t idx = 0; idx < key_count; ++idx) {
+        internal::ProcessKey key = (*keys)[idx];
+        child->spawn([tree, key](CoroScope&) mutable -> coro::CoroTask<void> {
+            co_await hier_one_process(tree, key);
+        });
     }
+    co_return;
+}
 
-    // Get arguments
-    auto inputs = program.get<std::vector<std::string>>("inputs");
-    bool recursive = program.get<bool>("--recursive");
-    std::string pattern = program.get<std::string>("--pattern");
-    std::string output_path = program.get<std::string>("--output");
-    bool save_json = program.get<bool>("--json");
-    std::string text_file = program.get<std::string>("--text");
-    int max_depth = program.get<int>("--max-depth");
-    bool analyze = program.get<bool>("--analyze");
-    bool verbose = program.get<bool>("--verbose");
-    bool stats_only = program.get<bool>("--stats-only");
-    bool no_save = program.get<bool>("--no-save");
-
-    // Collect trace files
-    printf("=== DFTracer Call Tree Builder ===\n\n");
-
-    auto start_time = std::chrono::high_resolution_clock::now();
-
-    // For single directory input, use load_from_directory
-    // For multiple inputs or files, collect manually
-    CallTree tree;
-    bool loaded = false;
-
-    if (inputs.size() == 1 && fs::is_directory(inputs[0])) {
-        printf("Loading traces from directory: %s\n", inputs[0].c_str());
-        if (verbose) {
-            printf("  Pattern: %s\n", pattern.c_str());
-            printf("  Recursive: %s\n", recursive ? "yes" : "no");
-        }
-
-        loaded = tree.load_from_directory(inputs[0], pattern);
-        if (!loaded) {
-            fprintf(stderr, "Failed to load traces from directory: %s\n",
-                    inputs[0].c_str());
-            return 1;
-        }
-    } else {
-        auto trace_files = collect_trace_files(inputs, recursive);
-        if (trace_files.empty()) {
-            fprintf(stderr, "No trace files found in the specified inputs.\n");
-            return 1;
-        }
+// Pipeline stage "hierarchy": builds the hierarchy of every process in
+// ctx->merged, one spawned task per process key, and records the elapsed time
+// in ctx->hier_ms. Does nothing when ctx->failed is already set.
+coro::CoroTask<void> task_hierarchy(RunCtx* ctx, CoroScope* scope) {
+    using Millis = std::chrono::duration<double, std::milli>;
+
+    if (ctx->failed) co_return;
+    const auto started = std::chrono::steady_clock::now();
+
+    // The closure holds plain pointers captured by value.
+    co_await scope->scope(
+        [merged = &ctx->merged, keys = &ctx->process_keys](
+            CoroScope& child) mutable -> coro::CoroTask<void> {
+            co_await hier_all_processes(&child, merged, keys);
+        });
+
+    const auto elapsed = std::chrono::steady_clock::now() - started;
+    ctx->hier_ms = Millis(elapsed).count();
+    if (ctx->cli->verbose) {
+        std::printf("[hierarchy] %.2f ms\n", ctx->hier_ms);
+        std::fflush(stdout);
+    }
+    co_return;
+}
 
-        printf("Found %zu trace file(s) to process:\n", trace_files.size());
-        if (verbose) {
-            for (const auto& file : trace_files) {
-                printf("  %s\n", file.c_str());
+// Appends every call node reachable from the roots of `pgraph` to `out`, one
+// serialized event per line, each followed by ",\n". The separator after the
+// very last event of the whole output is trimmed later, when the slices are
+// concatenated.
+//
+// Nodes are visited depth-first with an explicit stack; children are pushed
+// in reverse so they are popped in their stored order. Event indices start at
+// `starting_index` and advance by one per node found in `pgraph.calls`.
+// NOTE(review): the index is narrowed to int for serialize_node — confirm
+// that large `starting_index` values cannot overflow it.
+void serialize_process_slice(const internal::ProcessCallTree& pgraph,
+                             const internal::ProcessKey& key,
+                             internal::JsonSerializer& serializer,
+                             std::size_t starting_index, std::string& out) {
+    static constexpr std::size_t EVT_BUF = 16384;
+    char event_text[EVT_BUF];
+    std::size_t next_index = starting_index;
+    for (const std::uint64_t root : pgraph.root_calls) {
+        std::vector<std::uint64_t> pending(1, root);
+        while (!pending.empty()) {
+            const std::uint64_t current = pending.back();
+            pending.pop_back();
+            const auto found = pgraph.calls.find(current);
+            if (found == pgraph.calls.end()) continue;
+            const auto& node = found->second;
+            const int index_arg = static_cast<int>(next_index);
+            ++next_index;
+            const std::size_t length = serializer.serialize_node(
+                event_text, index_arg, *node, key.pid, key.tid);
+            // `length` includes the trailing newline written by
+            // serialize_node; swap that newline for the ",\n" separator.
+            if (length > 0) {
+                out.append(event_text, length - 1);
+                out.append(",\n", 2);
+            }
+            const auto& children = node->get_children();
+            for (auto child = children.rbegin(); child != children.rend();
+                 ++child) {
+                pending.push_back(*child);
             }
         }
-
-        // Load first directory for now (CallTree API expects directory)
-        // This is a limitation of the current API
-        fprintf(stderr,
-                "Note: Multi-file input not yet supported. Use directory input "
-                "instead.\n");
-        return 1;
     }
+}
 
-    printf("Loaded %zu trace files\n", tree.get_num_trace_files());
-    printf("\n");
-
-    // Generate call tree
-    printf("Generating call tree structure...\n");
-    if (!tree.generate()) {
-        fprintf(stderr, "Failed to generate call tree\n");
-        return 1;
+// Serializes the process identified by `key` into (*slice_buffers)[index].
+// The buffer is left untouched when `merged` has no tree for that key.
+coro::CoroTask<void> serialize_slice(const internal::CallTree* merged,
+                                     internal::ProcessKey key,
+                                     const std::string* hostname_hash,
+                                     std::vector<std::string>* slice_buffers,
+                                     std::size_t index,
+                                     std::uint64_t starting_index) {
+    // get() is called through a non-const pointer, hence the const_cast.
+    auto* process_tree = const_cast<internal::CallTree*>(merged)->get(key);
+    if (process_tree == nullptr) {
+        co_return;
     }
-    printf("Call tree generation complete\n\n");
-
-    auto gen_time = std::chrono::high_resolution_clock::now();
-    auto gen_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        gen_time - start_time);
-    if (verbose) {
-        printf("Generation time: %lld ms\n\n",
-               static_cast<long long>(gen_duration.count()));
+    internal::JsonSerializer serializer;
+    // initialize() is given a scratch buffer whose contents are discarded.
+    char scratch[8];
+    serializer.initialize(scratch, *hostname_hash);
+    (void)scratch;
+    std::string& sink = (*slice_buffers)[index];
+    serialize_process_slice(*process_tree, key, serializer, starting_index,
+                            sink);
+    co_return;
+}
+
+// Spawns one serialization task per process key on `child`. Task i writes
+// into (*slice_buffers)[i] and numbers its events starting at i * stride.
+coro::CoroTask<void> serialize_all_slices(
+    CoroScope* child, const internal::CallTree* merged,
+    const std::vector<internal::ProcessKey>* keys,
+    const std::string* hostname_hash, std::vector<std::string>* slice_buffers,
+    std::uint64_t stride) {
+    const std::size_t slice_count = keys->size();
+    for (std::size_t slot = 0; slot < slice_count; ++slot) {
+        std::uint64_t first_event = slot * stride;
+        child->spawn([merged, hostname_hash, slice_buffers, slot, first_event,
+                      key = (*keys)[slot]](
+                         CoroScope&) mutable -> coro::CoroTask<void> {
+            co_await serialize_slice(merged, key, hostname_hash, slice_buffers,
+                                     slot, first_event);
+        });
     }
+    co_return;
+}
 
-    // Print statistics
-    printf("=== Call Tree Statistics ===\n");
-    tree.print_statistics();
+// Pipeline stage "write_json": serializes every process slice through a child
+// scope, writes the header (chunk 0) and the slices (chunks 1..n) through a
+// sharded writer, then merges the shards into ctx->output_path.
+//
+// Skipped when an earlier stage failed or --no-save was given. A failing
+// open, write, close or merge sets ctx->failed. The elapsed time is stored in
+// ctx->write_ms unless the writer could not be opened.
+coro::CoroTask<void> task_write_json(RunCtx* ctx, CoroScope* scope) {
+    if (ctx->failed || ctx->cli->no_save) co_return;
+    const auto t0 = std::chrono::steady_clock::now();
+
+    const std::size_t n = ctx->process_keys.size();
+    std::vector<std::string> slice_buffers(n);
+    // Slice i numbers its events from i * IDX_STRIDE (see
+    // serialize_all_slices).
+    static constexpr std::uint64_t IDX_STRIDE = 1ull << 20;
+
+    // Zero-filled and one byte short, so the name is always NUL-terminated.
+    char hostname[256] = {};
+    gethostname(hostname, sizeof(hostname) - 1);
+    std::string hostname_hash(hostname);
+
+    std::vector<std::string>* slice_buffers_ptr = &slice_buffers;
+    const std::string* hostname_hash_ptr = &hostname_hash;
+    const internal::CallTree* merged = &ctx->merged;
+    const std::vector<internal::ProcessKey>* keys_ptr = &ctx->process_keys;
+
+    co_await scope->scope(
+        [merged, keys_ptr, hostname_hash_ptr,
+         slice_buffers_ptr](CoroScope& child) mutable -> coro::CoroTask<void> {
+            co_await serialize_all_slices(&child, merged, keys_ptr,
+                                          hostname_hash_ptr, slice_buffers_ptr,
+                                          IDX_STRIDE);
+        });
+
+    std::string header;
+    header.append("[\n", 2);
+    {
+        internal::JsonSerializer serializer;
+        char init_buf[8];
+        serializer.initialize(init_buf, hostname_hash);
+        (void)init_buf;
+        char buf[8192];
+        std::time_t now = std::time(nullptr);
+        // localtime_r instead of std::localtime: the latter returns a pointer
+        // to storage shared by all threads. `ts` stays empty on failure.
+        std::tm local_tm{};
+        char ts[64] = {};
+        if (localtime_r(&now, &local_tm) != nullptr) {
+            std::strftime(ts, sizeof(ts), "%Y-%m-%d %H:%M:%S", &local_tm);
+        }
+        // Each entry brings its own separator, so an entry that produced no
+        // output does not leave a stray ",\n" behind.
+        std::size_t w = serializer.serialize_metadata(buf, "timestamp", ts, "M",
+                                                      0, 0, true);
+        if (w > 0) {
+            header.append(buf, w - 1);
+            header.append(",\n", 2);
+        }
+        w = serializer.serialize_metadata(buf, "format", "call_tree", "M", 0, 0,
+                                          true);
+        if (w > 0) {
+            header.append(buf, w - 1);
+            header.append(",\n", 2);
+        }
+    }
 
-    // Print tree structure
-    if (!stats_only) {
-        printf("\n=== Call Tree Structure ===\n");
-        tree.print_depth_first(max_depth);
+    fileio::parallel::WriterConfig wc;
+    wc.layout = fileio::parallel::FileLayout::SHARDED;
+    wc.gzip = ctx->cli->gzip;
+    auto writer = fileio::parallel::make_writer(wc);
+
+    // One worker slot for the header plus one per slice.
+    const std::size_t total_workers = n + 1;
+    if (co_await writer->open(ctx->output_path, total_workers, ctx->cli->gzip,
+                              scope) != 0) {
+        DFTRACER_UTILS_LOG_ERROR("failed to open writer: %s",
+                                 ctx->output_path.c_str());
+        ctx->failed = true;
+        co_return;
     }
 
-    // Perform detailed analysis if requested
-    if (analyze) {
-        printf("\n=== Detailed Analysis ===\n");
-        auto nodes = tree.get_nodes_depth_first();
-        printf("Retrieved %zu nodes for analysis\n", nodes.size());
+    // Close the JSON array before anything is written. The separator to drop
+    // belongs to the last non-empty chunk, which need not be the last slice:
+    // slices can be empty and there may be no slices at all.
+    std::string* last_nonempty = &header;
+    for (auto& slice : slice_buffers) {
+        if (!slice.empty()) last_nonempty = &slice;
+    }
+    if (last_nonempty->size() >= 2 &&
+        last_nonempty->compare(last_nonempty->size() - 2, 2, ",\n") == 0) {
+        last_nonempty->resize(last_nonempty->size() - 2);
+    }
+    std::string& final_chunk = n > 0 ? slice_buffers.back() : header;
+    final_chunk.append("\n]\n", 3);
+
+    if (co_await writer->write_chunk(
+            0, ByteView(header.data(), header.size())) != 0) {
+        ctx->failed = true;
+    }
 
-        analyze_call_patterns(nodes);
-        analyze_timing(nodes);
-        find_critical_path(nodes);
-        analyze_by_category(nodes);
+    for (std::size_t i = 0; i < n && !ctx->failed; ++i) {
+        const std::string& b = slice_buffers[i];
+        if (co_await writer->write_chunk(i + 1, ByteView(b.data(), b.size())) !=
+            0) {
+            ctx->failed = true;
+            break;
+        }
     }
 
-    // Save outputs
-    if (!no_save) {
-        printf("\n=== Saving Outputs ===\n");
+    // The writer is closed even after a failed write.
+    if (co_await writer->close() != 0) ctx->failed = true;
 
-        // Set custom output path if specified
-        if (!output_path.empty()) {
-            tree.set_output_path(output_path);
+    if (!ctx->failed) {
+        auto shards = writer->output_paths();
+        if (co_await fileio::parallel::merge_shards(ctx->output_path, shards) !=
+            0) {
+            DFTRACER_UTILS_LOG_ERROR("merge_shards failed for %s",
+                                     ctx->output_path.c_str());
+            ctx->failed = true;
         }
+    }
 
-        // Save binary format
-        std::string bin_file = tree.get_output_path();
-        printf("Saving binary call tree to: %s\n", bin_file.c_str());
-        if (tree.save_to_file()) {
-            printf("  Successfully saved!\n");
-        } else {
-            fprintf(stderr, "  Failed to save binary file\n");
-        }
+    ctx->write_ms = std::chrono::duration<double, std::milli>(
+                        std::chrono::steady_clock::now() - t0)
+                        .count();
+    if (ctx->cli->verbose) {
+        std::printf("[write] %.2f ms -> %s\n", ctx->write_ms,
+                    ctx->output_path.c_str());
+        std::fflush(stdout);
+    }
+    co_return;
+}
 
-        // Save JSON format if requested
-        if (save_json) {
-            std::string json_file = bin_file;
-            // Replace .calltree extension with .pfw
-            if (json_file.size() >= 9 &&
-                json_file.substr(json_file.size() - 9) == ".calltree") {
-                json_file = json_file.substr(0, json_file.size() - 9) + ".pfw";
-            } else {
-                json_file += ".pfw";
-            }
+// Driver of the call_tree binary: parses the command line, chooses the output
+// path, wires the scan -> build -> merge -> hierarchy -> write_json tasks
+// into a pipeline and executes it.
+//
+// Returns 0 on success and 1 when argument parsing fails, when no safe
+// default output name can be found, or when a stage set ctx.failed.
+int run(int argc, char** argv) {
+    DFTRACER_UTILS_LOGGER_INIT();
 
-            printf("Saving JSON call tree to: %s\n", json_file.c_str());
-            if (tree.save_to_json(json_file)) {
-                printf("  Successfully saved! (Chrome Tracing compatible)\n");
-            } else {
-                fprintf(stderr, "  Failed to save JSON file\n");
-            }
+    argparse::ArgumentParser program("dftracer_call_tree",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "Build a call tree from DFTracer trace files and emit Chrome Tracing "
+        "JSON.");
+
+    CallTreeArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
+
+    RunCtx ctx;
+    ctx.cli = &cli;
+
+    // Appends ".gz" when --gzip is set and the path does not already end so.
+    const auto with_gzip_suffix = [&cli](std::string path) {
+        const bool has_gz =
+            path.size() >= 3 && path.compare(path.size() - 3, 3, ".gz") == 0;
+        if (cli.gzip && !has_gz) path += ".gz";
+        return path;
+    };
+    // True when `candidate` names the same existing file as an input given on
+    // the command line. Files found inside input directories are not checked.
+    const auto clobbers_input = [&cli](const std::string& candidate) {
+        for (const auto& in : cli.inputs) {
+            std::error_code ec;
+            if (fs::equivalent(fs::path(in), fs::path(candidate), ec))
+                return true;
+        }
+        return false;
+    };
+
+    if (cli.output.empty()) {
+        // Default name: "<first input's name or stem>.pfw" in the current
+        // directory.
+        std::string base = "call_tree";
+        if (!cli.inputs.empty()) {
+            const fs::path p(cli.inputs.front());
+            std::error_code ec;  // non-throwing overload; errors mean "file"
+            base = fs::is_directory(p, ec) ? p.filename().string()
+                                           : p.stem().string();
+            if (base.empty()) base = "call_tree";
         }
+        ctx.output_path = with_gzip_suffix(base + ".pfw");
+        if (clobbers_input(ctx.output_path)) {
+            // E.g. `foo.pfw` in the current directory maps back to `foo.pfw`;
+            // the write stage would overwrite the trace that was just read.
+            ctx.output_path = with_gzip_suffix(base + ".call_tree.pfw");
+        }
+        if (!cli.no_save && clobbers_input(ctx.output_path)) {
+            DFTRACER_UTILS_LOG_ERROR(
+                "default output %s would overwrite an input; use --output",
+                ctx.output_path.c_str());
+            return 1;
+        }
+    } else {
+        ctx.output_path = with_gzip_suffix(cli.output);
+    }
 
-        // Save text format if requested
-        if (!text_file.empty()) {
-            printf("Exporting call tree to text file: %s\n", text_file.c_str());
-            if (tree.print_depth_first_to_file(text_file, max_depth)) {
-                printf("  Successfully exported!\n");
-            } else {
-                fprintf(stderr, "  Failed to export text file\n");
-            }
-        }
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer CallTree", cli.pipeline);
+    Pipeline pipeline(pipeline_config);
+
+    // Every task captures the same RunCtx pointer by value; `ctx` is a local
+    // of this function and is still read after execute() returns.
+    RunCtx* ctx_ptr = &ctx;
+    auto scan = make_task(
+        [ctx_ptr](CoroScope&) -> coro::CoroTask<void> {
+            co_await task_scan(ctx_ptr);
+        },
+        "scan");
+    auto build = make_task(
+        [ctx_ptr](CoroScope& scope) -> coro::CoroTask<void> {
+            co_await task_build(ctx_ptr, &scope);
+        },
+        "build");
+    auto merge = make_task(
+        [ctx_ptr](CoroScope&) -> coro::CoroTask<void> {
+            co_await task_merge(ctx_ptr);
+        },
+        "merge");
+    auto hierarchy = make_task(
+        [ctx_ptr](CoroScope& scope) -> coro::CoroTask<void> {
+            co_await task_hierarchy(ctx_ptr, &scope);
+        },
+        "hierarchy");
+    auto write = make_task(
+        [ctx_ptr](CoroScope& scope) -> coro::CoroTask<void> {
+            co_await task_write_json(ctx_ptr, &scope);
+        },
+        "write_json");
+
+    // Linear chain: each stage depends on the one before it.
+    build->depends_on(scan);
+    merge->depends_on(build);
+    hierarchy->depends_on(merge);
+    write->depends_on(hierarchy);
+
+    pipeline.set_source(scan);
+    pipeline.set_destination(write);
+    pipeline.execute();
+
+    if (cli.verbose && !ctx.failed) {
+        std::printf(
+            "[done] scan=%.1fms build=%.1fms merge=%.1fms hierarchy=%.1fms "
+            "write=%.1fms\n",
+            ctx.scan_ms, ctx.build_ms, ctx.merge_ms, ctx.hier_ms, ctx.write_ms);
     }
 
-    auto end_time = std::chrono::high_resolution_clock::now();
-    auto total_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        end_time - start_time);
+    return ctx.failed ? 1 : 0;
+}
 
-    printf("\n=== Completed ===\n");
-    printf("Total execution time: %lld ms\n",
-           static_cast<long long>(total_duration.count()));
+}  // namespace
 
-    return 0;
-}
+// Process entry point: forwards the command line to run() and returns its
+// exit code.
+int main(int argc, char** argv) {
+    const int exit_code = run(argc, argv);
+    return exit_code;
+}
diff --git a/src/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp b/src/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp
new file mode 100644
index 00000000..07e926d2
--- /dev/null
+++ b/src/dftracer/utils/binaries/dftracer_call_tree_mpi.cpp
@@ -0,0 +1,208 @@
+// MPI driver for parallel call-tree construction. Thin DAG over
+// MPICallTreeBuilder; all phase logic lives in the engine.
+
+#include <dftracer/utils/call_tree/mpi/builder.h>
+#include <dftracer/utils/call_tree/mpi/config.h>
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
+#include <mpi.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "common_cli.h"
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::call_tree;
+
+namespace {
+// Command-line options of dftracer_call_tree_mpi.
+class CallTreeMpiArgParse : public cli::ArgParse {
+   public:
+    cli::PipelineArgs pipeline;
+    // Values below are copied out of the parser by post_parse().
+    std::string input_dir;
+    std::string output;
+    std::string staging_dir;
+    bool verbose = false;
+    bool gzip = false;
+    bool keep_staging = false;
+
+    explicit CallTreeMpiArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        schema(pipeline);
+    }
+
+   protected:
+    void register_args() override {
+        parser().add_argument("input").help(
+            "Input directory containing trace files");
+        parser()
+            .add_argument("-o", "--output")
+            .help("Output JSON path")
+            .default_value<std::string>("call_tree.pfw");
+        parser()
+            .add_argument("--staging-dir")
+            .help(
+                "Shared FS staging root for per-rank shards (default "
+                "<output>.shards/)")
+            .default_value<std::string>("");
+        parser().add_argument("--gzip").help("Gzip the final output").flag();
+        parser().add_argument("-v", "--verbose").help("Verbose output").flag();
+        parser().add_argument("--keep-staging").help("Keep staging dir").flag();
+    }
+    // Copies each parsed value into the matching public member.
+    void post_parse() override {
+        input_dir = parser().get<std::string>("input");
+        output = parser().get<std::string>("--output");
+        staging_dir = parser().get<std::string>("--staging-dir");
+        gzip = parser().get<bool>("--gzip");
+        verbose = parser().get<bool>("--verbose");
+        keep_staging = parser().get<bool>("--keep-staging");
+    }
+};
+// State shared by the pipeline tasks that run() builds.
+struct RunCtx {
+    const CallTreeMpiArgParse* cli{nullptr};      // non-owning, set by run()
+    std::unique_ptr<MPICallTreeBuilder> builder;  // runs every phase
+    std::string final_output;                     // absolute output path
+    std::string staging_dir;                      // absolute shard directory
+    bool failed{false};                           // set when a phase fails
+};
+// Per-rank driver; returns 1 if CLI parsing fails or a task sets ctx.failed.
+int run(int argc, char** argv) {
+    DFTRACER_UTILS_LOGGER_INIT();
+
+    argparse::ArgumentParser program("dftracer_call_tree_mpi",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "MPI driver for parallel call-tree construction. Each rank owns a "
+        "slice of PIDs and emits a Chrome Tracing JSON shard; rank 0 merges.");
+
+    CallTreeMpiArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
+    // main() has already initialized MPI before calling run().
+    int rank = 0, size = 1;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    // Scale per-rank threads down when multiple ranks share a node.
+    // ppn is the size of this rank's MPI_COMM_TYPE_SHARED communicator.
+    MPI_Comm node_comm = MPI_COMM_NULL;
+    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
+                        MPI_INFO_NULL, &node_comm);
+    int ppn = 1;
+    if (node_comm != MPI_COMM_NULL) {
+        MPI_Comm_size(node_comm, &ppn);
+        MPI_Comm_free(&node_comm);
+    }
+    if (ppn > 1) {  // only counts still equal to hw are rescaled
+        const auto hw = dftracer_utils_hardware_concurrency();
+        const auto scaled = std::max<std::size_t>(
+            1, static_cast<std::size_t>(hw) / static_cast<std::size_t>(ppn));
+        if (cli.pipeline.executor_threads == static_cast<std::size_t>(hw))
+            cli.pipeline.executor_threads = scaled;
+        if (cli.pipeline.io_threads == static_cast<std::size_t>(hw))
+            cli.pipeline.io_threads = scaled;
+    }
+    // ctx stays on this frame; the task lambdas capture a raw pointer to it.
+    RunCtx ctx;
+    ctx.cli = &cli;
+
+    MPICallTreeConfig builder_cfg;
+    builder_cfg.verbose = cli.verbose;
+    ctx.builder = std::make_unique<MPICallTreeBuilder>(builder_cfg);
+    // --gzip appends ".gz" to the absolute output path unless already there.
+    ctx.final_output = fs::absolute(cli.output).string();
+    if (cli.gzip && (ctx.final_output.size() < 3 ||
+                     ctx.final_output.compare(ctx.final_output.size() - 3, 3,
+                                              ".gz") != 0)) {
+        ctx.final_output += ".gz";
+    }
+    ctx.staging_dir = cli.staging_dir.empty()
+                          ? (ctx.final_output + ".shards")
+                          : fs::absolute(cli.staging_dir).string();
+
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer CallTree MPI", cli.pipeline);
+    Pipeline pipeline(pipeline_config);
+    // Every task is a no-op once an earlier task has set p->failed.
+    RunCtx* p = &ctx;
+    auto discover = make_task(
+        [p](CoroScope& scope) -> coro::CoroTask<void> {
+            if (p->failed) co_return;
+            p->builder->add_trace_directory(p->cli->input_dir);
+            if (p->builder->trace_files().empty()) {
+                if (p->builder->rank() == 0)
+                    std::fprintf(stderr, "no .pfw/.pfw.gz files in %s\n",
+                                 p->cli->input_dir.c_str());
+                p->failed = true;
+                co_return;
+            }
+            if (!co_await p->builder->discover_pids(&scope)) p->failed = true;
+        },
+        "discover");
+    auto build = make_task(
+        [p](CoroScope& scope) -> coro::CoroTask<void> {
+            if (p->failed) co_return;
+            if (!co_await p->builder->build(&scope)) p->failed = true;
+        },
+        "build");
+    auto hierarchy = make_task(
+        [p](CoroScope& scope) -> coro::CoroTask<void> {
+            if (p->failed) co_return;
+            if (!co_await p->builder->hierarchy(&scope)) p->failed = true;
+        },
+        "hierarchy");
+    auto write = make_task(
+        [p](CoroScope& scope) -> coro::CoroTask<void> {
+            if (p->failed) co_return;
+            if (!co_await p->builder->write(&scope, p->final_output,
+                                            p->staging_dir, p->cli->gzip))
+                p->failed = true;
+        },
+        "write");
+    auto merge = make_task(
+        [p](CoroScope&) -> coro::CoroTask<void> {
+            if (p->failed) co_return;
+            if (!co_await p->builder->merge(p->final_output, p->staging_dir,
+                                            p->cli->gzip, p->cli->keep_staging))
+                p->failed = true;
+        },
+        "merge");
+    // Dependencies: discover -> build -> hierarchy -> write -> merge.
+    build->depends_on(discover);
+    hierarchy->depends_on(build);
+    write->depends_on(hierarchy);
+    merge->depends_on(write);
+
+    pipeline.set_source(discover);
+    pipeline.set_destination(merge);
+    pipeline.execute();
+    // All ranks meet here; the exit code reflects this rank's ctx.failed only.
+    MPI_Barrier(MPI_COMM_WORLD);
+    return ctx.failed ? 1 : 0;
+}
+
+}  // namespace
+// Entry point: initializes MPI, runs the driver, then finalizes MPI.
+int main(int argc, char** argv) {
+    int granted = 0;  // thread level MPI actually provides
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &granted);
+    if (granted < MPI_THREAD_FUNNELED) {
+        std::fprintf(stderr,
+                     "MPI does not support MPI_THREAD_FUNNELED (got %d), "
+                     "aborting\n",
+                     granted);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    const int exit_code = run(argc, argv);
+    MPI_Finalize();
+    return exit_code;
+}
diff --git a/src/dftracer/utils/binaries/dftracer_comparator.cpp b/src/dftracer/utils/binaries/dftracer_comparator.cpp
index a9cd9750..ebee6bb2 100644
--- a/src/dftracer/utils/binaries/dftracer_comparator.cpp
+++ b/src/dftracer/utils/binaries/dftracer_comparator.cpp
@@ -1,11 +1,7 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/core/coro/when_all.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/utilities/common/query/query.h>
@@ -14,25 +10,121 @@
 #include <dftracer/utils/utilities/composites/dft/comparator/comparison_result.h>
 #include <dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h>
 #include <dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
-#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
 #include <unistd.h>
 
-#include <argparse/argparse.hpp>
 #include <atomic>
 #include <chrono>
 #include <ctime>
-#include <sstream>
-#include <thread>
-#include <unordered_set>
+
+#include "common_cli.h"
 
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::composites::dft::aggregators;
 using namespace dftracer::utils::utilities::composites::dft::comparator;
+using dftracer::utils::utilities::composites::dft::indexing::
+    IndexResolverUtility;
+using dftracer::utils::utilities::composites::dft::indexing::ResolverInput;
+using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBuildBatchConfig;
+// Command-line options of the comparator binary.
+class ComparatorArgParse : public cli::ArgParse {
+   public:
+    cli::PipelineArgs pipeline;
+    cli::IndexingArgs indexing;
+    cli::QueryArgs query_args{"Query filter (default: all events)"};
+    // Comparator-specific values; post_parse() copies them from the parser.
+    std::string config_path;
+    std::string baseline;
+    std::string variant;
+    std::string baseline_index_dir;
+    std::string variant_index_dir;
+    std::string group_by;
+    std::string format = "table";
+    double time_interval = 5000.0;
+    double threshold = 0.0;
+    bool no_color = false;
+    // NOTE(review): with_index_dir=false likely hides --index-dir; confirm.
+    explicit ComparatorArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.with_index_dir = false;
+        indexing.force_help = "Force index rebuild";
+        schema(pipeline, indexing, query_args);
+    }
+
+   protected:
+    void register_args() override {
+        parser()
+            .add_argument("--config")
+            .help("JSON config file for hierarchical comparison")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--baseline")
+            .help("Baseline trace file or directory")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--variant")
+            .help("Variant trace file or directory")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--baseline-index-dir")
+            .help(
+                "Index directory for baseline (default: co-located with data)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--variant-index-dir")
+            .help("Index directory for variant (default: co-located with data)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--group-by")
+            .help("Comma-separated group keys (default: cat,name)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--format")
+            .help("Output format: table (default) or json")
+            .default_value<std::string>("table");
+
+        parser()
+            .add_argument("-t", "--time-interval")
+            .help("Time interval in milliseconds for bucketing (default: 5000)")
+            .scan<'g', double>()
+            .default_value(5000.0);
+
+        parser()
+            .add_argument("--threshold")
+            .help("Hide changes below this percentage")
+            .scan<'g', double>()
+            .default_value(0.0);
+
+        parser()
+            .add_argument("--no-color")
+            .help("Disable ANSI color output")
+            .flag();
+    }
+    // Copies every parsed value into the matching public member.
+    void post_parse() override {
+        config_path = parser().get<std::string>("--config");
+        baseline = parser().get<std::string>("--baseline");
+        variant = parser().get<std::string>("--variant");
+        baseline_index_dir = parser().get<std::string>("--baseline-index-dir");
+        variant_index_dir = parser().get<std::string>("--variant-index-dir");
+        group_by = parser().get<std::string>("--group-by");
+        format = parser().get<std::string>("--format");
+        time_interval = parser().get<double>("--time-interval");
+        threshold = parser().get<double>("--threshold");
+        no_color = parser().get<bool>("--no-color");
+    }
+};
 
 namespace {
 
@@ -44,156 +136,166 @@ void flatten_nodes(const ComparisonNode& node,
     }
 }
 
-// Run one complete aggregation pipeline for a set of files.
-// Returns EventAggregatorUtilityOutput after the pipeline completes.
-static coro::CoroTask<EventAggregatorUtilityOutput> run_aggregation(
-    const std::vector<std::string>& input_files,
-    const AggregationConfig& agg_config,
-    const std::optional<common::query::Query>& query,
-    const std::string& index_dir, std::size_t checkpoint_size,
-    bool force_rebuild, std::size_t executor_threads) {
+static coro::CoroTask<void> process_file_task(
+    std::string file_path, coro::ChannelProducer<ChunkAggregatorInput> ch,
+    std::string index_dir, std::size_t checkpoint_size, bool force_rebuild,
+    AggregationConfig agg_config, std::optional<common::query::Query> query,
+    std::atomic<int>* global_chunk_idx_ptr) {
     constexpr std::size_t CHUNK_SIZE_MB = 4;
     constexpr std::size_t BATCH_SIZE_MB = 4;
 
-    auto pipeline_config = PipelineConfig()
-                               .with_name("DFTracer Comparator Aggregation")
-                               .with_compute_threads(executor_threads)
-                               .with_watchdog(false);
-    Pipeline pipeline(pipeline_config);
+    [[maybe_unused]] auto producer_guard = ch.guard();
+
+    std::string index_path =
+        composites::dft::internal::determine_index_path(file_path, index_dir);
+
+    auto meta_input =
+        composites::dft::MetadataCollectorUtilityInput::from_file(file_path)
+            .with_checkpoint_size(checkpoint_size)
+            .with_force_rebuild(force_rebuild)
+            .with_index(index_path);
+    auto metadata =
+        co_await composites::dft::MetadataCollectorUtility{}.process(
+            meta_input);
+
+    if (!metadata.success) {
+        DFTRACER_UTILS_LOG_WARN("Skipping file: %s", file_path.c_str());
+        co_return;
+    }
+
+    FileChunkMapperUtility file_mapper;
+    auto mapper_input = FileChunkMapperInput::from_metadata(metadata)
+                            .with_config(agg_config)
+                            .with_checkpoint_size(checkpoint_size)
+                            .with_target_chunk_size(CHUNK_SIZE_MB)
+                            .with_batch_size(BATCH_SIZE_MB * 1024 * 1024);
+    mapper_input.query = query;
+    auto file_chunks = co_await file_mapper.process(mapper_input);
+
+    int start_idx =
+        global_chunk_idx_ptr->fetch_add(static_cast<int>(file_chunks.size()));
+    for (int i = 0; i < static_cast<int>(file_chunks.size()); ++i) {
+        file_chunks[i].chunk_index = start_idx + i;
+    }
 
-    EventAggregatorUtility merger;
+    for (auto& chunk : file_chunks) {
+        if (!co_await ch.send(std::move(chunk))) {
+            co_return;
+        }
+    }
+    co_return;
+}
+// Worker loop: aggregates chunks from chunk_chan and sends them to result_chan.
+static coro::CoroTask<void> chunk_worker_task(
+    std::shared_ptr<coro::Channel<ChunkAggregatorInput>> chunk_chan,
+    coro::ChannelProducer<ChunkAggregationOutput> rp,
+    std::shared_ptr<coro::Channel<ChunkAggregationOutput>> result_chan,
+    CoroScope* wctx_ptr) {
+    [[maybe_unused]] auto rp_guard = rp.guard();  // held until the task ends
+    while (auto chunk = co_await wctx_ptr->receive(chunk_chan)) {
+        ChunkAggregatorUtility aggregator;
+        auto aggregated = co_await aggregator.process(*chunk);
+        if (!co_await result_chan->send(std::move(aggregated))) {
+            break;  // send() reported failure: stop consuming
+        }
+    }
+    co_return;
+}
+
+static coro::CoroTask<EventAggregatorOutput> run_aggregation(
+    CoroScope& ctx, const std::vector<std::string>& input_files,
+    const AggregationConfig& agg_config,
+    const std::optional<common::query::Query>& query,
+    const std::string& index_dir, std::size_t checkpoint_size,
+    bool force_rebuild, std::size_t executor_threads) {
+    EventAggregator merger;
     std::atomic<int> global_chunk_idx{0};
 
-    auto streaming_task = make_task(
-        [&](CoroScope& ctx) -> coro::CoroTask<void> {
-            auto chunk_chan = coro::make_channel<ChunkAggregatorInput>(0);
-            auto result_chan = coro::make_channel<ChunkAggregationOutput>(2);
-
-            co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask<void> {
-                for (const auto& file_path : input_files) {
-                    auto* global_chunk_idx_ptr = &global_chunk_idx;
-                    scope.spawn([file_path, ch = chunk_chan->producer(),
-                                 index_dir, checkpoint_size, force_rebuild,
-                                 agg_config, query, global_chunk_idx_ptr](
-                                    CoroScope& /*fctx*/) mutable
-                                    -> coro::CoroTask<void> {
-                        [[maybe_unused]] auto producer_guard = ch.guard();
-
-                        std::string index_path =
-                            composites::dft::internal::determine_index_path(
-                                file_path, index_dir);
-
-                        auto meta_input =
-                            composites::dft::MetadataCollectorUtilityInput::
-                                from_file(file_path)
-                                    .with_checkpoint_size(checkpoint_size)
-                                    .with_force_rebuild(force_rebuild)
-                                    .with_index(index_path);
-                        auto metadata =
-                            co_await composites::dft::MetadataCollectorUtility{}
-                                .process(meta_input);
-
-                        if (!metadata.success) {
-                            DFTRACER_UTILS_LOG_WARN("Skipping file: %s",
-                                                    file_path.c_str());
-                            co_return;
-                        }
-
-                        FileChunkMapperUtility file_mapper;
-                        auto mapper_input =
-                            FileChunkMapperInput::from_metadata(metadata)
-                                .with_config(agg_config)
-                                .with_checkpoint_size(checkpoint_size)
-                                .with_target_chunk_size(CHUNK_SIZE_MB)
-                                .with_batch_size(BATCH_SIZE_MB * 1024 * 1024);
-                        mapper_input.query = query;
-                        auto file_chunks =
-                            co_await file_mapper.process(mapper_input);
-
-                        int start_idx = global_chunk_idx_ptr->fetch_add(
-                            static_cast<int>(file_chunks.size()));
-                        for (int i = 0;
-                             i < static_cast<int>(file_chunks.size()); ++i) {
-                            file_chunks[i].chunk_index = start_idx + i;
-                        }
-
-                        for (auto& chunk : file_chunks) {
-                            if (!co_await ch.send(std::move(chunk))) {
-                                co_return;
-                            }
-                        }
-                        co_return;
-                    });
-                }
+    co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask<void> {
+        auto chunk_chan = coro::make_channel<ChunkAggregatorInput>(0);
+        auto result_chan = coro::make_channel<ChunkAggregationOutput>(2);
+
+        for (const auto& file_path : input_files) {
+            auto* global_chunk_idx_ptr = &global_chunk_idx;
+            scope.spawn([file_path, ch = chunk_chan->producer(), index_dir,
+                         checkpoint_size, force_rebuild, agg_config, query,
+                         global_chunk_idx_ptr](CoroScope& /*fctx*/) mutable
+                            -> coro::CoroTask<void> {
+                co_await process_file_task(
+                    std::move(file_path), std::move(ch), std::move(index_dir),
+                    checkpoint_size, force_rebuild, std::move(agg_config),
+                    std::move(query), global_chunk_idx_ptr);
+            });
+        }
 
-                for (std::size_t w = 0; w < executor_threads; ++w) {
-                    (void)w;
-                    scope.spawn(
-                        [chunk_chan, rp = result_chan->producer(), result_chan](
+        for (std::size_t w = 0; w < executor_threads; ++w) {
+            (void)w;
+            scope.spawn([chunk_chan, rp = result_chan->producer(), result_chan](
                             CoroScope& wctx) mutable -> coro::CoroTask<void> {
-                            [[maybe_unused]] auto producer_guard = rp.guard();
-                            while (auto input =
-                                       co_await wctx.receive(chunk_chan)) {
-                                ChunkAggregatorUtility agg;
-                                auto output = co_await agg.process(*input);
-                                if (!co_await result_chan->send(
-                                        std::move(output))) {
-                                    co_return;
-                                }
-                            }
-                            co_return;
-                        });
-                }
-
-                auto* merger_ptr = &merger;
-                scope.spawn([result_chan, merger_ptr](
-                                CoroScope& mctx) -> coro::CoroTask<void> {
-                    while (auto output = co_await mctx.receive(result_chan)) {
-                        merger_ptr->merge_chunk(std::move(*output));
-                    }
-                    co_return;
-                });
+                co_await chunk_worker_task(chunk_chan, std::move(rp),
+                                           result_chan, &wctx);
+            });
+        }
 
+        auto* merger_ptr = &merger;
+        scope.spawn(
+            [result_chan, merger_ptr](CoroScope& mctx) -> coro::CoroTask<void> {
+                while (auto output = co_await mctx.receive(result_chan)) {
+                    merger_ptr->merge_chunk(std::move(*output));
+                }
                 co_return;
             });
 
-            co_return;
-        },
-        "StreamingAggregate");
-
-    EventAggregatorUtilityOutput result;
-    auto post_task = make_task(
-        [&](CoroScope& /*ctx*/) -> coro::CoroTask<bool> {
-            result = merger.finalize();
-            co_return result.success;
-        },
-        "Finalize");
+        co_return;
+    });
 
-    post_task->depends_on(streaming_task);
-    pipeline.set_source(streaming_task);
-    pipeline.set_destination(post_task);
-    pipeline.execute();
+    co_return merger.finalize();
+}
 
-    co_return result;
+struct AggSpec {  // one aggregation to run for a comparison node
+    AggregationConfig agg_cfg;
+    std::optional<common::query::Query> query;
+    const ComparisonNode* visitor = nullptr;  // null until assigned
+};
+
+struct NodeAggPlan {
+    ComparisonNode root;         // copy of one config node
+    std::vector<AggSpec> specs;  // aggregations to run for it
+};
+// Sequentially aggregates every spec; results[n][s] matches plans[n].specs[s].
+static coro::CoroTask<void> run_all_aggregations(
+    CoroScope& ctx, const std::vector<std::string>& files,
+    const std::vector<NodeAggPlan>& plans,
+    std::vector<std::vector<EventAggregatorOutput>>& results,
+    const std::string& index_dir, const ComparisonConfig& config) {
+    results.resize(plans.size());
+    for (std::size_t node_idx = 0; node_idx < plans.size(); ++node_idx) {
+        const auto& specs = plans[node_idx].specs;
+        auto& node_results = results[node_idx];
+        node_results.resize(specs.size());
+        for (std::size_t s = 0; s < specs.size(); ++s) {
+            node_results[s] = co_await run_aggregation(
+                ctx, files, specs[s].agg_cfg, specs[s].query, index_dir,
+                config.checkpoint_size, config.force_rebuild,
+                config.executor_threads);
+        }
+    }
 }
 
 }  // namespace
 
-static coro::CoroTask<int> run_comparator(argparse::ArgumentParser& program) {
-    std::string config_path = program.get<std::string>("--config");
-    std::string baseline_path = program.get<std::string>("--baseline");
-    std::string variant_path = program.get<std::string>("--variant");
-    std::string query_str = program.get<std::string>("--query");
-    std::string group_by_str = program.get<std::string>("--group-by");
-    std::string format = program.get<std::string>("--format");
-    bool no_color = program.get<bool>("--no-color");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    bool force_rebuild = program.get<bool>("--force");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    double threshold = program.get<double>("--threshold");
-    double time_interval_ms = program.get<double>("--time-interval");
+static int run_comparator(const ComparatorArgParse* cli) {
+    const auto& config_path = cli->config_path;
+    const auto& baseline_path = cli->baseline;
+    const auto& variant_path = cli->variant;
+    const auto& query_str = cli->query_args.query;
+    const auto& group_by_str = cli->group_by;
+    auto format = cli->format;
+    auto no_color = cli->no_color;
+    auto force_rebuild = cli->indexing.force;
+    auto checkpoint_size = cli->indexing.checkpoint_size;
+    auto threshold = cli->threshold;
+    auto time_interval_ms = cli->time_interval;
 
     ComparisonConfig config;
     if (!config_path.empty()) {
@@ -201,7 +303,7 @@ static coro::CoroTask<int> run_comparator(argparse::ArgumentParser& program) {
         auto parsed = ComparisonConfig::from_json_file(config_path, error);
         if (!parsed) {
             DFTRACER_UTILS_LOG_ERROR("Config error: %s", error.c_str());
-            co_return 1;
+            return 1;
         }
         config = std::move(*parsed);
     } else if (!baseline_path.empty() && !variant_path.empty()) {
@@ -210,15 +312,18 @@ static coro::CoroTask<int> run_comparator(argparse::ArgumentParser& program) {
     } else {
         DFTRACER_UTILS_LOG_ERROR(
             "Must specify --config or both --baseline and --variant");
-        co_return 1;
+        return 1;
     }
 
-    // CLI overrides
     if (!format.empty()) config.format = format;
     config.no_color = no_color;
-    if (executor_threads > 0) config.executor_threads = executor_threads;
+    if (cli->pipeline.executor_threads > 0)
+        config.executor_threads = cli->pipeline.executor_threads;
     if (checkpoint_size > 0) config.checkpoint_size = checkpoint_size;
-    if (!index_dir.empty()) config.index_dir = index_dir;
+    if (!cli->baseline_index_dir.empty())
+        config.baseline_index_dir = cli->baseline_index_dir;
+    if (!cli->variant_index_dir.empty())
+        config.variant_index_dir = cli->variant_index_dir;
     if (force_rebuild) config.force_rebuild = force_rebuild;
     if (threshold > 0.0) config.defaults.threshold_pct = threshold;
     if (time_interval_ms > 0.0)
@@ -229,207 +334,304 @@ static coro::CoroTask<int> run_comparator(argparse::ArgumentParser& program) {
     if (config.executor_threads == 0) {
         config.executor_threads = dftracer_utils_hardware_concurrency();
     }
+
     if (config.checkpoint_size == 0) {
         config.checkpoint_size =
             indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE;
     }
 
-    std::string temp_index_dir;
-    if (config.index_dir.empty()) {
-        try {
-            auto temp_path = fs::temp_directory_path();
-            temp_path /= "dftracer_cmp_" + std::to_string(std::time(nullptr)) +
-                         "_" + std::to_string(getpid());
-            temp_index_dir = temp_path.string();
-            fs::create_directories(temp_index_dir);
-            config.index_dir = temp_index_dir;
-        } catch (const fs::filesystem_error& e) {
-            temp_index_dir = "/tmp/dftracer_cmp_" +
-                             std::to_string(std::time(nullptr)) + "_" +
-                             std::to_string(getpid());
-            fs::create_directories(temp_index_dir);
-            config.index_dir = temp_index_dir;
-            DFTRACER_UTILS_LOG_WARN(
-                "Failed to get system temp directory, using /tmp: %s",
-                e.what());
-        }
-    }
+    // Precompute aggregation plans from config (needed by both Agg tasks)
+    std::vector<NodeAggPlan> agg_plans;
+    for (auto& node : config.nodes) {
+        NodeAggPlan plan;
+        plan.root = node;
 
-    // Enumerate files for both sides
-    auto enumerate_files = [](const std::string& path)
-        -> coro::CoroTask<std::vector<std::string>> {
-        std::vector<std::string> files;
-        if (fs::is_regular_file(path)) {
-            files.push_back(path);
-            co_return files;
-        }
-        filesystem::PatternDirectoryScannerUtility scanner;
-        filesystem::PatternDirectoryScannerUtilityInput scan_input{
-            path, {".pfw", ".pfw.gz"}, false};
-        auto entries = co_await scanner.process(scan_input);
-        files.reserve(entries.size());
-        for (const auto& e : entries) {
-            files.push_back(e.path.string());
-        }
-        co_return files;
-    };
+        std::vector<const ComparisonNode*> visitors;
+        flatten_nodes(node, visitors);
 
-    auto baseline_files = co_await enumerate_files(config.baseline);
-    auto variant_files = co_await enumerate_files(config.variant);
+        for (const auto* visitor : visitors) {
+            AggSpec spec;
+            if (!visitor->composed_query.empty()) {
+                auto result =
+                    common::query::Query::from_string(visitor->composed_query);
+                if (!result) {
+                    DFTRACER_UTILS_LOG_ERROR("Invalid query for node '%s': %s",
+                                             visitor->name.c_str(),
+                                             result.error().format().c_str());
+                    return 1;
+                }
+                spec.query = std::move(*result);
+            }
 
-    if (baseline_files.empty()) {
-        DFTRACER_UTILS_LOG_ERROR("No trace files found in baseline: %s",
-                                 config.baseline.c_str());
-        co_return 1;
-    }
-    if (variant_files.empty()) {
-        DFTRACER_UTILS_LOG_ERROR("No trace files found in variant: %s",
-                                 config.variant.c_str());
-        co_return 1;
+            spec.agg_cfg.time_interval_us = static_cast<std::uint64_t>(
+                config.defaults.time_interval_ms * 1000.0);
+            spec.agg_cfg.extra_group_keys = {};
+            spec.agg_cfg.compute_statistics = true;
+            spec.agg_cfg.compute_percentiles = true;
+            spec.agg_cfg.percentiles = visitor->resolved_percentiles;
+            spec.agg_cfg.sketch_accuracy = 0.01;
+            spec.agg_cfg.track_process_parents = false;
+            spec.visitor = visitor;
+
+            plan.specs.push_back(std::move(spec));
+        }
+        agg_plans.push_back(std::move(plan));
     }
 
-    // Build indexes upfront so parallel aggregation doesn't race on
-    // `.dftindex`.
-    {
-        if (config.force_rebuild && !baseline_files.empty()) {
-            const std::string shared_index_path =
-                composites::dft::internal::determine_index_path(
-                    baseline_files.front(), config.index_dir);
-            if (fs::exists(shared_index_path)) {
-                DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s",
-                                        shared_index_path.c_str());
-                fs::remove_all(shared_index_path);
-            }
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer Comparator", cli->pipeline);
+    Pipeline pipeline(pipeline_config);
+
+    auto resolve_and_build =
+        [&config](CoroScope& scope, const std::string& path,
+                  const std::string& index_dir,
+                  std::vector<std::string>& out_files) -> coro::CoroTask<void> {
+        if (!fs::exists(path)) {
+            DFTRACER_UTILS_LOG_ERROR("Path does not exist: %s", path.c_str());
+            co_return;
         }
-        std::unordered_set<std::string> seen;
-        std::vector<std::string> all_files;
-        for (const auto& f : baseline_files) {
-            if (seen.insert(f).second) all_files.push_back(f);
+
+        IndexResolverUtility resolver;
+        ResolverInput resolve_input;
+        resolve_input.index_dir = index_dir;
+        resolve_input.require_checkpoints = !config.force_rebuild;
+        if (fs::is_regular_file(path)) {
+            resolve_input.files = {path};
+        } else {
+            resolve_input.directory = path;
         }
-        for (const auto& f : variant_files) {
-            if (seen.insert(f).second) all_files.push_back(f);
+
+        auto result = co_await resolver.process(resolve_input);
+        out_files = std::move(result.all_files);
+
+        if (out_files.empty()) {
+            DFTRACER_UTILS_LOG_ERROR("No trace files found in: %s",
+                                     path.c_str());
+            co_return;
         }
-        DFTRACER_UTILS_LOG_INFO("Building indexes for %zu unique files...",
-                                all_files.size());
-        std::vector<indexer::IndexBuildConfig> idx_configs;
-        idx_configs.reserve(all_files.size());
-        for (const auto& file_path : all_files) {
-            idx_configs.push_back(
-                indexer::IndexBuildConfig::for_file(file_path)
-                    .with_checkpoint_size(config.checkpoint_size)
-                    .with_force_rebuild(false)
-                    .with_index_dir(config.index_dir));
+
+        if (result.needs_checkpoint.empty()) {
+            DFTRACER_UTILS_LOG_INFO("All %zu files already indexed",
+                                    out_files.size());
+            co_return;
         }
-        std::vector<coro::CoroTask<indexer::IndexBuildResult>> idx_tasks;
-        idx_tasks.reserve(idx_configs.size());
-        for (const auto& cfg : idx_configs) {
-            idx_tasks.push_back(indexer::IndexBuilderUtility{}.process(cfg));
+
+        auto batch_cfg = std::make_shared<IndexBuildBatchConfig>();
+        batch_cfg->file_paths.reserve(result.needs_checkpoint.size());
+        for (const auto& item : result.needs_checkpoint) {
+            batch_cfg->file_paths.push_back(item.file_path);
         }
-        co_await coro::when_all(std::move(idx_tasks));
+        batch_cfg->index_dir = index_dir;
+        batch_cfg->checkpoint_size = config.checkpoint_size;
+        batch_cfg->parallelism = config.executor_threads;
+        batch_cfg->force_rebuild = config.force_rebuild;
+        batch_cfg->use_batch_write = true;
+        batch_cfg->rebuild_root_summaries = true;
+
+        DFTRACER_UTILS_LOG_INFO("Indexing %zu of %zu files...",
+                                result.needs_checkpoint.size(),
+                                out_files.size());
+        co_await IndexBatchBuilderUtility::process(&scope,
+                                                   std::move(batch_cfg));
+    };
+
+    // Shared state between tasks
+    std::vector<std::string> baseline_files;
+    std::vector<std::string> variant_files;
+    std::vector<std::vector<EventAggregatorOutput>> baseline_results;
+    std::vector<std::vector<EventAggregatorOutput>> variant_results;
+
+    auto baseline_index_path = composites::dft::internal::determine_index_path(
+        config.baseline, config.baseline_index_dir);
+    auto variant_index_path = composites::dft::internal::determine_index_path(
+        config.variant, config.variant_index_dir);
+    bool shared_index = baseline_index_path == variant_index_path;
+
+    std::shared_ptr<Task> enum_index_base;
+    std::shared_ptr<Task> enum_index_var;
+
+    if (shared_index) {
+        auto enum_index_shared = make_task(
+            [&config, &baseline_files, &variant_files,
+             &resolve_and_build](CoroScope& scope) -> coro::CoroTask<void> {
+                co_await resolve_and_build(scope, config.baseline,
+                                           config.baseline_index_dir,
+                                           baseline_files);
+                if (config.baseline == config.variant) {
+                    variant_files = baseline_files;
+                } else {
+                    co_await resolve_and_build(scope, config.variant,
+                                               config.variant_index_dir,
+                                               variant_files);
+                }
+            },
+            "EnumIndex");
+        enum_index_base = enum_index_shared;
+        enum_index_var = enum_index_shared;
+    } else {
+        enum_index_base = make_task(
+            [&config, &baseline_files,
+             &resolve_and_build](CoroScope& scope) -> coro::CoroTask<void> {
+                co_await resolve_and_build(scope, config.baseline,
+                                           config.baseline_index_dir,
+                                           baseline_files);
+            },
+            "EnumIndexBaseline");
+
+        enum_index_var = make_task(
+            [&config, &variant_files,
+             &resolve_and_build](CoroScope& scope) -> coro::CoroTask<void> {
+                co_await resolve_and_build(scope, config.variant,
+                                           config.variant_index_dir,
+                                           variant_files);
+            },
+            "EnumIndexVariant");
     }
 
+    std::shared_ptr<Task> agg_base;
+    std::shared_ptr<Task> agg_var;
+
+    bool same_files = shared_index && config.baseline == config.variant;
+    if (same_files) {
+        auto agg_shared = make_task(
+            [&baseline_files, &baseline_results, &variant_results, &agg_plans,
+             &config](CoroScope& ctx) -> coro::CoroTask<void> {
+                if (baseline_files.empty()) co_return;
+                co_await run_all_aggregations(
+                    ctx, baseline_files, agg_plans, baseline_results,
+                    config.baseline_index_dir, config);
+                variant_results = baseline_results;
+            },
+            "Aggregate");
+        agg_shared->depends_on(enum_index_base);
+        agg_base = agg_shared;
+        agg_var = agg_shared;
+    } else {
+        agg_base = make_task(
+            [&baseline_files, &baseline_results, &agg_plans,
+             &config](CoroScope& ctx) -> coro::CoroTask<void> {
+                if (baseline_files.empty()) co_return;
+                co_await run_all_aggregations(
+                    ctx, baseline_files, agg_plans, baseline_results,
+                    config.baseline_index_dir, config);
+            },
+            "AggBaseline");
+        agg_base->depends_on(enum_index_base);
+
+        agg_var = make_task(
+            [&variant_files, &variant_results, &agg_plans,
+             &config](CoroScope& ctx) -> coro::CoroTask<void> {
+                if (variant_files.empty()) co_return;
+                co_await run_all_aggregations(ctx, variant_files, agg_plans,
+                                              variant_results,
+                                              config.variant_index_dir, config);
+            },
+            "AggVariant");
+        agg_var->depends_on(enum_index_var);
+    }
+
+    // Compare (depends on both Agg tasks)
     ComparisonOutput output;
     output.baseline_path = config.baseline;
     output.variant_path = config.variant;
-    output.baseline_file_count = baseline_files.size();
-    output.variant_file_count = variant_files.size();
+    int result_code = 0;
+
+    auto compare_task = make_task(
+        [&config, &baseline_files, &variant_files, &baseline_results,
+         &variant_results, &agg_plans, &output, &result_code](
+            [[maybe_unused]] CoroScope& ctx) -> coro::CoroTask<void> {
+            if (baseline_files.empty() || variant_files.empty()) {
+                result_code = 1;
+                co_return;
+            }
 
-    auto start_time = std::chrono::high_resolution_clock::now();
+            output.baseline_file_count = baseline_files.size();
+            output.variant_file_count = variant_files.size();
 
-    for (auto& node : config.nodes) {
-        std::vector<const ComparisonNode*> visitors;
-        flatten_nodes(node, visitors);
+            auto start_time = std::chrono::high_resolution_clock::now();
 
-        std::vector<ComparisonVisitorPair> pairs;
-        pairs.reserve(visitors.size());
+            for (std::size_t ni = 0; ni < agg_plans.size(); ++ni) {
+                const auto& plan = agg_plans[ni];
+                std::vector<ComparisonVisitorPair> pairs;
+                pairs.reserve(plan.specs.size());
 
-        for (const auto* visitor : visitors) {
-            using common::query::Query;
-            std::optional<Query> query;
-            if (!visitor->composed_query.empty()) {
-                auto result = Query::from_string(visitor->composed_query);
-                if (!result) {
-                    DFTRACER_UTILS_LOG_ERROR("Invalid query for node '%s': %s",
-                                             visitor->name.c_str(),
-                                             result.error().format().c_str());
-                    co_return 1;
+                for (std::size_t vi = 0; vi < plan.specs.size(); ++vi) {
+                    if (pairs.empty()) {
+                        output.baseline_meta = extract_metadata(
+                            baseline_results[ni][vi].aggregations,
+                            baseline_files.size());
+                        output.variant_meta = extract_metadata(
+                            variant_results[ni][vi].aggregations,
+                            variant_files.size());
+                    }
+
+                    ComparisonVisitorPair pair;
+                    pair.baseline = std::move(baseline_results[ni][vi]);
+                    pair.variant = std::move(variant_results[ni][vi]);
+                    pair.node = *plan.specs[vi].visitor;
+                    pairs.push_back(std::move(pair));
                 }
-                query = std::move(*result);
-            }
 
-            AggregationConfig agg_cfg;
-            agg_cfg.time_interval_us = static_cast<std::uint64_t>(
-                config.defaults.time_interval_ms * 1000.0);
-            agg_cfg.extra_group_keys = {};
-            agg_cfg.compute_statistics = true;
-            agg_cfg.compute_percentiles = true;
-            agg_cfg.percentiles = visitor->resolved_percentiles;
-            agg_cfg.sketch_accuracy = 0.01;
-            agg_cfg.track_process_parents = false;
-
-            auto [base_result, var_result] = co_await coro::when_all(
-                run_aggregation(baseline_files, agg_cfg, query,
-                                config.index_dir, config.checkpoint_size,
-                                config.force_rebuild, config.executor_threads),
-                run_aggregation(variant_files, agg_cfg, query, config.index_dir,
-                                config.checkpoint_size, config.force_rebuild,
-                                config.executor_threads));
-
-            // Extract metadata from first visitor (broadest query)
-            if (pairs.empty()) {
-                output.baseline_meta = extract_metadata(
-                    base_result.aggregations, baseline_files.size());
-                output.variant_meta = extract_metadata(var_result.aggregations,
-                                                       variant_files.size());
+                ComparisonUtilityInput cmp_input;
+                cmp_input.visitors = std::move(pairs);
+                cmp_input.root_node = plan.root;
+                cmp_input.baseline_file_count = baseline_files.size();
+                cmp_input.variant_file_count = variant_files.size();
+
+                ComparisonUtility cmp;
+                auto cmp_output = co_await cmp.process(cmp_input);
+                output.nodes.push_back(std::move(cmp_output.result));
             }
 
-            ComparisonVisitorPair pair;
-            pair.baseline = std::move(base_result);
-            pair.variant = std::move(var_result);
-            pair.node = *visitor;
-            pairs.push_back(std::move(pair));
-        }
+            auto meta_rows = build_metadata_metrics(output.baseline_meta,
+                                                    output.variant_meta);
+            for (auto& n : output.nodes) {
+                n.summary.metrics.insert(n.summary.metrics.begin(),
+                                         meta_rows.begin(), meta_rows.end());
+            }
 
-        ComparisonUtilityInput cmp_input;
-        cmp_input.visitors = std::move(pairs);
-        cmp_input.root_node = node;
-        cmp_input.baseline_file_count = baseline_files.size();
-        cmp_input.variant_file_count = variant_files.size();
+            auto end_time = std::chrono::high_resolution_clock::now();
+            std::chrono::duration<double, std::milli> duration =
+                end_time - start_time;
+            output.execution_time_ms = duration.count();
+
+            if (config.format == "json") {
+                TreeTableFormatter formatter;
+                std::printf("%s\n", formatter.render_json(output).c_str());
+            } else {
+                bool is_tty = isatty(fileno(stdout));
+                FormatterOptions fmt_opts;
+                fmt_opts.use_color = is_tty && !config.no_color;
+                fmt_opts.use_unicode = is_tty;
+                TreeTableFormatter formatter(fmt_opts);
+                formatter.render(stdout, output);
+            }
 
-        ComparisonUtility cmp;
-        auto cmp_output = co_await cmp.process(cmp_input);
-        output.nodes.push_back(std::move(cmp_output.result));
-    }
+            co_return;
+        },
+        "Compare");
 
-    // Inject metadata rows into root SUMMARY.
-    auto meta_rows =
-        build_metadata_metrics(output.baseline_meta, output.variant_meta);
-    for (auto& node : output.nodes) {
-        node.summary.metrics.insert(node.summary.metrics.begin(),
-                                    meta_rows.begin(), meta_rows.end());
+    if (same_files) {
+        compare_task->depends_on(agg_base);
+    } else {
+        compare_task->depends_on({agg_base, agg_var});
     }
 
-    auto end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double, std::milli> duration = end_time - start_time;
-    output.execution_time_ms = duration.count();
-
-    if (config.format == "json") {
-        TreeTableFormatter formatter;
-        std::printf("%s\n", formatter.render_json(output).c_str());
+    if (shared_index) {
+        pipeline.set_source(enum_index_base);
     } else {
-        bool is_tty = isatty(fileno(stdout));
-        FormatterOptions fmt_opts;
-        fmt_opts.use_color = is_tty && !config.no_color;
-        fmt_opts.use_unicode = is_tty;
-        TreeTableFormatter formatter(fmt_opts);
-        formatter.render(stdout, output);
+        pipeline.set_source({enum_index_base, enum_index_var});
     }
+    pipeline.set_destination(compare_task);
 
-    if (!temp_index_dir.empty() && fs::exists(temp_index_dir)) {
-        fs::remove_all(temp_index_dir);
+    try {
+        pipeline.execute();
+    } catch (const PipelineError& e) {
+        DFTRACER_UTILS_LOG_ERROR("Pipeline failed: %s", e.what());
+        result_code = 1;
     }
 
-    co_return 0;
+    return result_code;
 }
 
 int main(int argc, char** argv) {
@@ -440,67 +642,9 @@ int main(int argc, char** argv) {
     program.add_description(
         "Compare DFTracer trace metrics between baseline and variant");
 
-    program.add_argument("--config")
-        .help("JSON config file for hierarchical comparison")
-        .default_value<std::string>("");
-
-    program.add_argument("--baseline")
-        .help("Baseline trace file or directory")
-        .default_value<std::string>("");
-
-    program.add_argument("--variant")
-        .help("Variant trace file or directory")
-        .default_value<std::string>("");
-
-    program.add_argument("--query")
-        .help("Query filter (default: all events)")
-        .default_value<std::string>("");
-
-    program.add_argument("--group-by")
-        .help("Comma-separated group keys (default: cat,name)")
-        .default_value<std::string>("");
-
-    program.add_argument("--format")
-        .help("Output format: table (default) or json")
-        .default_value<std::string>("table");
-
-    program.add_argument("-t", "--time-interval")
-        .help("Time interval in milliseconds for bucketing (default: 5000)")
-        .scan<'g', double>()
-        .default_value(5000.0);
-
-    program.add_argument("--threshold")
-        .help("Hide changes below this percentage")
-        .scan<'g', double>()
-        .default_value(0.0);
-
-    program.add_argument("--no-color").help("Disable ANSI color output").flag();
-
-    program.add_argument("--executor-threads")
-        .help("Number of parallel threads (default: auto)")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    program.add_argument("--index-dir")
-        .help("Directory for index files (default: temp)")
-        .default_value<std::string>("");
-
-    program.add_argument("--force").help("Force index rebuild").flag();
-
-    program.add_argument("--checkpoint-size")
-        .help("Checkpoint size for indexing in bytes")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(
-            indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE));
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what());
-        std::fprintf(stderr, "%s\n", program.help().str().c_str());
-        return 1;
-    }
+    ComparatorArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
-    return run_comparator(program).get();
+    return run_comparator(&cli);
 }
diff --git a/src/dftracer/utils/binaries/dftracer_event_count.cpp b/src/dftracer/utils/binaries/dftracer_event_count.cpp
index c5e91c21..cecbb19c 100644
--- a/src/dftracer/utils/binaries/dftracer_event_count.cpp
+++ b/src/dftracer/utils/binaries/dftracer_event_count.cpp
@@ -1,206 +1,254 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/constants.h>
 #include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
 #include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
-#include <dftracer/utils/utilities/indexer/internal/helpers.h>
-#include <dftracer/utils/utilities/indexer/internal/indexer.h>
 #include <unistd.h>
 
-#include <argparse/argparse.hpp>
+#include <atomic>
 #include <chrono>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "common_cli.h"
 
 using namespace dftracer::utils;
-using namespace dftracer::utils::utilities::indexer::internal;
+using namespace dftracer::utils::utilities::composites::dft::indexing;
+using dftracer::utils::utilities::fileio::lines::sources::
+    async_streaming_gz_lines;
+using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBuildBatchConfig;
+
+class EventCountArgParse : public cli::ArgParse {  // CLI for dftracer_event_count
+   public:
+    cli::DirectoryArgs directory;  // run_event_count reads directory.value
+    cli::PipelineArgs pipeline;    // run_event_count reads executor_threads
+    cli::IndexingArgs indexing;    // index_dir, checkpoint_size, force
+
+    explicit EventCountArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.index_dir_help =  // presumably the --index-dir help; confirm
+            "Directory to store index files (default: system temp directory)";
+        schema(directory, pipeline, indexing);  // NOTE(review): assumed to register these groups
+    }
+};
+
+static int run_event_count(const EventCountArgParse* cli);  // defined after main()
+
+struct EventCountBatchResult {  // totals for one index group's files
+    std::size_t total_events = 0;     // sum of per-file total_events / num_lines
+    std::size_t files_processed = 0;  // files that contributed a count
+    bool is_approximate = false;      // set when any file used num_lines
+};
+
+static EventCountBatchResult process_index_group_event_counts_sync(
+    std::string index_path, std::vector<ResolvedFile> entries) {  // sums counts for entries from the index DB
+    std::vector<int> file_ids;  // file_id of every entry, in input order
+    file_ids.reserve(entries.size());
+    for (const auto& entry : entries) {
+        file_ids.push_back(entry.file_id);
+    }
+
+    EventCountBatchResult batch_result;
+
+    utilities::indexer::IndexDatabase db(  // opened with OpenMode::ReadOnly
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+
+    auto metadata_rows = db.query_file_metadata_batch(file_ids);  // fallback source
+    auto merged_stats = db.query_merged_statistics_batch(file_ids);  // preferred source
+
+    for (const auto file_id : file_ids) {
+        auto merged_it = merged_stats.find(file_id);
+        if (merged_it != merged_stats.end() &&
+            merged_it->second.num_chunks > 0) {  // merged row with >= 1 chunk
+            batch_result.total_events +=
+                static_cast<std::size_t>(merged_it->second.stats.total_events);
+            batch_result.files_processed++;
+            continue;  // skip the metadata fallback for this file
+        }
+
+        auto metadata_it = metadata_rows.find(file_id);
+        if (metadata_it != metadata_rows.end()) {
+            batch_result.total_events +=  // num_lines stands in for the event count
+                static_cast<std::size_t>(metadata_it->second.num_lines);
+            batch_result.files_processed++;
+            batch_result.is_approximate = true;  // result is flagged approximate
+        }  // ids found in neither map add nothing to the totals
+    }
 
-static coro::CoroTask<int> run_event_count(argparse::ArgumentParser& program);
+    return batch_result;
+}
+
+static coro::CoroTask<EventCountBatchResult> process_index_group_event_counts(
+    std::shared_ptr<std::string> index_path,
+    std::shared_ptr<std::vector<ResolvedFile>> entries) {  // coroutine wrapper over the _sync variant
+    co_return process_index_group_event_counts_sync(std::move(*index_path),  // moves out of both pointees;
+                                                    std::move(*entries));  // they are left moved-from
+}
 
 int main(int argc, char** argv) {
     DFTRACER_UTILS_LOGGER_INIT();
 
-    auto default_checkpoint_size_str =
-        std::to_string(Indexer::DEFAULT_CHECKPOINT_SIZE) + " B (" +
-        std::to_string(Indexer::DEFAULT_CHECKPOINT_SIZE / (1024 * 1024)) +
-        " MB)";
-
     argparse::ArgumentParser program("dftracer_event_count",
                                      DFTRACER_UTILS_PACKAGE_VERSION);
     program.add_description(
         "Count valid events in DFTracer .pfw or .pfw.gz files using composable "
         "utilities and pipeline processing");
 
-    program.add_argument("-d", "--directory")
-        .help("Directory containing .pfw or .pfw.gz files")
-        .default_value<std::string>(".");
-
-    program.add_argument("-f", "--force").help("Force index recreation").flag();
-
-    program.add_argument("-c", "--checkpoint-size")
-        .help("Checkpoint size for indexing in bytes (default: " +
-              default_checkpoint_size_str + ")")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(Indexer::DEFAULT_CHECKPOINT_SIZE));
-
-    program.add_argument("--executor-threads")
-        .help(
-            "Number of executor threads for parallel processing (default: "
-            "number of CPU cores)")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    program.add_argument("--index-dir")
-        .help("Directory to store index files (default: system temp directory)")
-        .default_value<std::string>("");
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what());
-        std::cerr << program;
-        return 1;
-    }
+    EventCountArgParse cli(program);  // wraps the parser built above
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;  // exit status 1 when parse() fails
 
-    return run_event_count(program).get();
+    return run_event_count(&cli);  // its return value becomes the exit status
 }
 
-static coro::CoroTask<int> run_event_count(argparse::ArgumentParser& program) {
-    // Parse arguments
-    std::string log_dir = program.get<std::string>("--directory");
-    bool force_rebuild = program.get<bool>("--force");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    std::string index_dir = program.get<std::string>("--index-dir");
-
-    // If no index dir specified, indices are stored next to trace files
-    // (default IndexBuilderUtility behavior). This allows reuse of
-    // indices built by dftracer_index.
-
-    log_dir = fs::absolute(log_dir).string();
-
-    // Discover input files
-    utilities::filesystem::PatternDirectoryScannerUtility scanner;
-    utilities::filesystem::PatternDirectoryScannerUtilityInput scan_input{
-        log_dir, {".pfw", ".pfw.gz"}};
-    auto matched_entries = scanner.process(scan_input).get();
-
-    std::vector<std::string> input_files;
-    input_files.reserve(matched_entries.size());
-    for (const auto& entry : matched_entries) {
-        input_files.push_back(entry.path.string());
-    }
+static int run_event_count(const EventCountArgParse* cli) {
+    const auto log_dir = fs::absolute(cli->directory.value).string();
+    const auto index_dir = cli->indexing.index_dir;
+    const auto checkpoint_size = cli->indexing.checkpoint_size;
+    const auto force_rebuild = cli->indexing.force;
+    const auto executor_threads = cli->pipeline.executor_threads;
+
+    IndexResolverUtility resolver;
+    ResolverInput resolve_input;
+    resolve_input.directory = log_dir;
+    resolve_input.index_dir = index_dir;
+    resolve_input.require_checkpoints = !force_rebuild;
 
-    if (input_files.empty()) {
+    auto resolve_result = resolver.process(resolve_input).get();
+
+    if (resolve_result.all_files.empty()) {
         DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s",
                                  log_dir.c_str());
-        co_return 1;
+        return 1;
     }
 
-    auto pipeline_config = PipelineConfig()
-                               .with_name("DFTracer Event Count")
-                               .with_compute_threads(executor_threads)
-                               .with_watchdog(false);
-
-    Pipeline pipeline(pipeline_config);
-
     auto start_time = std::chrono::high_resolution_clock::now();
 
     std::atomic<std::size_t> total_events{0};
     std::atomic<std::size_t> files_processed{0};
     std::atomic<bool> is_approximate{false};
 
-    auto count_task = make_task(
-        [&](CoroScope& ctx) -> coro::CoroTask<void> {
-            co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask<void> {
-                auto* files_ptr = &input_files;
-                auto* total_events_ptr = &total_events;
-                auto* files_processed_ptr = &files_processed;
-                auto* is_approximate_ptr = &is_approximate;
-                auto file_chan =
-                    coro::make_channel<std::size_t>(executor_threads * 2);
-
-                // Producer
-                scope.spawn([ch = file_chan->producer(),
-                             num_files = input_files.size()](
-                                CoroScope&) mutable -> coro::CoroTask<void> {
-                    auto guard = ch.guard();
-                    for (std::size_t i = 0; i < num_files; ++i) {
-                        if (!co_await ch.send(i)) co_return;
-                    }
-                    co_return;
-                });
+    std::vector<FileWorkItem> direct_scan_items;
+    std::vector<ResolvedFile> indexed_entries =
+        std::move(resolve_result.cached);
+    std::string index_path = resolve_result.index_path;
 
-                // Workers: build index if needed, then read count from DB
-                for (std::size_t w = 0; w < executor_threads; ++w) {
-                    scope.spawn([file_chan, files_ptr, checkpoint_size,
-                                 force_rebuild, &index_dir, total_events_ptr,
-                                 files_processed_ptr, is_approximate_ptr](
-                                    CoroScope&) -> coro::CoroTask<void> {
-                        while (auto fi_opt = co_await file_chan->receive()) {
-                            const auto& fp = (*files_ptr)[*fi_opt];
-
-                            // Build index if needed
-                            utilities::indexer::IndexBuilderUtility builder;
-                            auto config =
-                                utilities::indexer::IndexBuildConfig::for_file(
-                                    fp)
-                                    .with_checkpoint_size(checkpoint_size)
-                                    .with_force_rebuild(force_rebuild)
-                                    .with_index_dir(index_dir);
-                            co_await builder.process(config);
-
-                            // Read event count from index
-                            std::string index_path =
-                                fp + constants::indexer::EXTENSION;
-                            if (!index_dir.empty()) {
-                                auto fname = fs::path(fp).filename();
-                                index_path =
-                                    (fs::path(index_dir) / fname).string() +
-                                    constants::indexer::EXTENSION;
-                            }
+    std::vector<FileWorkItem> needs_checkpoint =
+        std::move(resolve_result.needs_checkpoint);
+
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer Event Count", cli->pipeline);
+    Pipeline pipeline(pipeline_config);
 
-                            if (fs::exists(index_path)) {
-                                try {
-                                    utilities::indexer::IndexDatabase db(
-                                        index_path);
-                                    int fid = db.find_file(fp);
-                                    if (fid >= 0) {
-                                        if (!db.has_bloom_data(fid)) {
-                                            is_approximate_ptr->store(
-                                                true,
-                                                std::memory_order_relaxed);
-                                        }
-                                        total_events_ptr->fetch_add(
-                                            db.get_total_events(fid),
-                                            std::memory_order_relaxed);
-                                        files_processed_ptr->fetch_add(
-                                            1, std::memory_order_relaxed);
-                                        continue;
-                                    }
-                                } catch (...) {
+    auto build_task = make_task(  // stage 1: index needs_checkpoint files
+        [&needs_checkpoint, index_dir, checkpoint_size, executor_threads,
+         force_rebuild](CoroScope& scope) -> coro::CoroTask<void> {
+            if (needs_checkpoint.empty()) {  // nothing to index
+                co_return;
+            }
+            auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+            batch_config->file_paths.reserve(needs_checkpoint.size());
+            for (const auto& item : needs_checkpoint) {
+                batch_config->file_paths.push_back(item.file_path);
+            }
+            batch_config->index_dir = index_dir;
+            batch_config->checkpoint_size = checkpoint_size;
+            batch_config->parallelism = executor_threads;
+            batch_config->force_rebuild = force_rebuild;
+            batch_config->use_batch_write = true;
+            batch_config->rebuild_root_summaries = true;
+            co_await IndexBatchBuilderUtility::process(&scope,
+                                                       std::move(batch_config));
+        },
+        "BatchIndex");
+
+    auto count_task = make_task(  // stage 2: total up event counts
+        [&needs_checkpoint, &indexed_entries, &direct_scan_items, &total_events,
+         &files_processed, &is_approximate, index_dir, index_path,
+         executor_threads](CoroScope& ctx) -> coro::CoroTask<void> {
+            if (!needs_checkpoint.empty()) {  // re-resolve after build_task
+                IndexResolverUtility re_resolver;
+                ResolverInput refresh_input;
+                std::vector<std::string> newly_indexed;
+                newly_indexed.reserve(needs_checkpoint.size());
+                for (const auto& item : needs_checkpoint) {
+                    newly_indexed.push_back(item.file_path);
+                }
+                refresh_input.files = std::move(newly_indexed);
+                refresh_input.index_dir = index_dir;
+                refresh_input.require_checkpoints = true;
+
+                auto refresh_result =
+                    co_await re_resolver.process(refresh_input);
+                for (auto& entry : refresh_result.cached) {  // now indexed
+                    indexed_entries.push_back(std::move(entry));
+                }
+                for (auto& item : refresh_result.needs_checkpoint) {
+                    direct_scan_items.push_back(std::move(item));  // line scan
+                }
+            }
+
+            if (!indexed_entries.empty()) {  // counts come from the index
+                auto idx_path_ptr = std::make_shared<std::string>(index_path);
+                auto entries_ptr = std::make_shared<std::vector<ResolvedFile>>(
+                    std::move(indexed_entries));
+                try {
+                    auto batch_result =
+                        co_await process_index_group_event_counts(
+                            std::move(idx_path_ptr), std::move(entries_ptr));
+                    total_events.fetch_add(batch_result.total_events,
+                                           std::memory_order_relaxed);
+                    files_processed.fetch_add(batch_result.files_processed,
+                                              std::memory_order_relaxed);
+                    if (batch_result.is_approximate) {
+                        is_approximate.store(true, std::memory_order_relaxed);
+                    }
+                } catch (...) {  // NOTE(review): swallowed, nothing logged
+                    is_approximate.store(true, std::memory_order_relaxed);
+                }
+            }
+
+            if (!direct_scan_items.empty()) {  // no usable index: count lines
+                is_approximate.store(true, std::memory_order_relaxed);
+                co_await ctx.scope([&](CoroScope& scope)
+                                       -> coro::CoroTask<void> {
+                    auto file_chan =
+                        coro::make_channel<FileWorkItem>(executor_threads * 2);
+
+                    scope.spawn(
+                        [ch = file_chan->producer(),
+                         items_ptr = &direct_scan_items](
+                            CoroScope&) mutable -> coro::CoroTask<void> {
+                            auto guard = ch.guard();
+                            for (const auto& item : *items_ptr) {
+                                if (!co_await ch.send(item)) {
+                                    co_return;
                                 }
                             }
-
-                            // Fallback for small/unindexed files:
-                            // stream decompress and count lines (approximate)
-                            is_approximate_ptr->store(
-                                true, std::memory_order_relaxed);
-                            {
-                                using utilities::fileio::lines::sources::
-                                    async_streaming_gz_lines;
+                            co_return;
+                        });
+
+                    for (std::size_t w = 0; w < executor_threads; ++w) {
+                        scope.spawn([ch = file_chan->consumer(),
+                                     total_events_ptr = &total_events,
+                                     files_processed_ptr = &files_processed](
+                                        CoroScope&) -> coro::CoroTask<void> {
+                            while (auto item_opt = co_await ch.receive()) {
                                 std::size_t count = 0;
-                                auto gen = async_streaming_gz_lines(fp);
+                                auto gen = async_streaming_gz_lines(
+                                    item_opt->file_path);
                                 while (co_await gen.next()) {
                                     ++count;
                                 }
@@ -209,17 +257,18 @@ static coro::CoroTask<int> run_event_count(argparse::ArgumentParser& program) {
                                 files_processed_ptr->fetch_add(
                                     1, std::memory_order_relaxed);
                             }
-                        }
-                        co_return;
-                    });
-                }
-                co_return;
-            });
+                            co_return;
+                        });
+                    }
+                    co_return;
+                });
+            }
             co_return;
         },
-        "EventCount");
+        "Count");
 
-    pipeline.set_source(count_task);
+    count_task->depends_on(build_task);
+    pipeline.set_source(build_task);
     pipeline.set_destination(count_task);
     pipeline.execute();
 
@@ -235,5 +284,5 @@ static coro::CoroTask<int> run_event_count(argparse::ArgumentParser& program) {
     DFTRACER_UTILS_LOG_DEBUG("Completed in %.2f ms", duration.count());
     DFTRACER_UTILS_LOG_DEBUG("Files processed: %zu", files_processed.load());
 
-    co_return 0;
+    return 0;
 }
diff --git a/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp b/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp
index eb3f4c45..4c496127 100644
--- a/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp
+++ b/src/dftracer/utils/binaries/dftracer_gen_fake_trace.cpp
@@ -1,9 +1,9 @@
 #include <dftracer/utils/core/common/byte_view.h>
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
+#include <dftracer/utils/core/rocksdb/db_manager.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h>
@@ -14,35 +14,41 @@
 #include <dftracer/utils/utilities/hash/hasher_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
 
-#include <argparse/argparse.hpp>
 #include <cstdint>
 #include <cstdio>
 #include <random>
 #include <string>
 #include <vector>
 
+#include "common_cli.h"
+
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::composites::dft;
 using namespace dftracer::utils::utilities::composites::dft::indexing;
 namespace compression = dftracer::utils::utilities::compression;
 namespace util_io = dftracer::utils::utilities::fileio;
-using dftracer::utils::utilities::indexer::IndexBuildConfig;
-using dftracer::utils::utilities::indexer::IndexBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBuildBatchConfig;
 using dftracer::utils::utilities::indexer::IndexDatabase;
 using dftracer::utils::utilities::indexer::internal::get_logical_path;
 
 // ---------------------------------------------------------------------------
-// TraceWriter – compresses via ManualStreamingCompressorUtility and writes
+// TraceWriter - compresses via ManualStreamingCompressorUtility and writes
 //               via StreamingFileWriterUtility.  Natural deflate blocks
 //               provide block boundaries for the gzip indexer.
 // ---------------------------------------------------------------------------
 class TraceWriter {
    public:
-    explicit TraceWriter(const std::string& path) : writer_(path) {}
+    explicit TraceWriter(const std::string& path,
+                         std::size_t flush_threshold = 4 * 1024 * 1024)
+        : writer_(path), flush_threshold_(flush_threshold) {
+        buf_.reserve(flush_threshold * 2);  // headroom past the threshold
+    }
 
     ~TraceWriter() { close(); }
 
@@ -50,16 +56,26 @@ class TraceWriter {
     TraceWriter& operator=(const TraceWriter&) = delete;
 
     void write(const std::string& s) {
-        [this, &s]() -> coro::CoroTask<void> {
-            auto gen = compressor_.compress(ByteView(s));
+        buf_ += s;  // buffered here; compressed later in flush()
+        if (buf_.size() >= flush_threshold_) {
+            flush();
+        }
+    }
+
+    void flush() {  // compress buf_ and write it out; no-op if empty
+        if (buf_.empty()) return;
+        [this]() -> coro::CoroTask<void> {
+            auto gen = compressor_.compress(ByteView(buf_));
             while (auto chunk = co_await gen.next()) {
                 co_await writer_.process(*chunk);
             }
         }()
-                            .get();
+                        .get();
+        buf_.clear();  // data has been handed to writer_
     }
 
     void close() {
+        flush();  // drain buf_ before finalizing the stream
         [this]() -> coro::CoroTask<void> {
             auto gen = compressor_.finalize_stream();
             while (auto chunk = co_await gen.next()) {
@@ -73,6 +89,8 @@ class TraceWriter {
    private:
     compression::zlib::ManualStreamingCompressorUtility compressor_;
     util_io::StreamingFileWriterUtility writer_;
+    std::string buf_;              // uncompressed bytes awaiting flush()
+    std::size_t flush_threshold_;  // buf_ size that triggers flush()
 };
 
 // ---------------------------------------------------------------------------
@@ -216,7 +234,7 @@ struct QuerySpec {
 };
 
 static coro::CoroTask<int> run_verify(
-    const std::vector<std::string>& file_paths,
+    CoroScope& scope, const std::vector<std::string>& file_paths,
     const std::vector<QuerySpec>& queries, std::size_t ckpt_size) {
     // Extra dimensions: arbitrary dot-paths into args
     std::vector<std::string> extra_dims = {"ret", "count", "offset", "epoch",
@@ -237,17 +255,33 @@ static coro::CoroTask<int> run_verify(
     std::printf("Verify: building bloom indices\n");
     std::printf("==========================================\n");
 
+    // Batch-build gzip indexes; skipped when empty (front() needs a path)
+    if (!file_paths.empty()) {
+        std::vector<std::string> abs_paths;
+        abs_paths.reserve(file_paths.size());
+        for (const auto& fp : file_paths) {
+            abs_paths.push_back(fs::absolute(fp).string());
+        }
+
+        auto index_path = internal::determine_index_path(abs_paths.front(), "");
+        dftracer::utils::rocksdb::RocksDBManager::instance().reset(index_path);
+
+        auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+        batch_config->file_paths = std::move(abs_paths);
+        batch_config->checkpoint_size = ckpt_size;
+        batch_config->parallelism = file_paths.size();  // = number of files
+        batch_config->use_batch_write = true;
+        batch_config->rebuild_root_summaries = true;
+
+        co_await IndexBatchBuilderUtility::process(&scope,
+                                                   std::move(batch_config));
+    }
+
     for (const auto& file_path : file_paths) {
         std::string abs_path = fs::absolute(file_path).string();
-
-        // 1. Build gzip index
         std::string index_path = internal::determine_index_path(abs_path, "");
-        auto idx_input = IndexBuildConfig::for_file(abs_path)
-                             .with_checkpoint_size(ckpt_size)
-                             .with_force_rebuild(true);
-        co_await IndexBuilderUtility{}.process(idx_input);
 
-        // 2. Collect metadata
+        // Collect metadata
         auto meta_input = MetadataCollectorUtilityInput::from_file(abs_path)
                               .with_checkpoint_size(ckpt_size)
                               .with_force_rebuild(false)
@@ -265,16 +299,16 @@ static coro::CoroTask<int> run_verify(
             std::string idx_path_bidx =
                 internal::determine_index_path(abs_path, "");
             IndexDatabase idx_db(idx_path_bidx);
-            idx_db.init_base_schema();
-            idx_db.init_bloom_schema();
+            auto writer = idx_db.begin_write();
+            writer->init_schema();
 
             std::uint64_t file_hash_val = 0;
             if (fs::exists(abs_path)) {
                 file_hash_val =
                     static_cast<std::uint64_t>(fs::file_size(abs_path));
             }
-            int fid = idx_db.get_or_create_file_info(get_logical_path(abs_path),
-                                                     file_hash_val);
+            int fid = writer->get_or_create_file_info(
+                get_logical_path(abs_path), file_hash_val);
 
             std::size_t file_size = metadata.uncompressed_size;
             std::size_t num_ckpts = metadata.num_checkpoints;
@@ -299,7 +333,6 @@ static coro::CoroTask<int> run_verify(
                 }
             }
 
-            idx_db.begin_transaction();
             std::unordered_map<std::string, BloomFilter> file_blooms;
             HashResolutions all_hr;
             std::size_t total_events = 0;
@@ -320,7 +353,7 @@ static coro::CoroTask<int> run_verify(
 
                 for (auto& [dim, bloom] : output.bloom_filters) {
                     auto blob = bloom.serialize();
-                    idx_db.insert_chunk_bloom_filter(
+                    writer->insert_chunk_bloom_filter(
                         fid, output.checkpoint_idx, dim, blob.data(),
                         static_cast<int>(blob.size()), bloom.num_entries());
 
@@ -332,8 +365,8 @@ static coro::CoroTask<int> run_verify(
                     }
                 }
 
-                idx_db.insert_chunk_statistics(fid, output.checkpoint_idx,
-                                               output.statistics);
+                writer->insert_chunk_statistics(fid, output.checkpoint_idx,
+                                                output.statistics);
 
                 for (auto& [dim, resolutions] : output.hash_resolutions) {
                     for (auto& [h, resolved] : resolutions) {
@@ -344,20 +377,14 @@ static coro::CoroTask<int> run_verify(
 
             for (auto& [dim, bloom] : file_blooms) {
                 auto blob = bloom.serialize();
-                idx_db.insert_file_bloom_filter(fid, dim, blob.data(),
-                                                static_cast<int>(blob.size()),
-                                                bloom.num_entries());
-            }
-            for (const auto& [dim, resolutions] : all_hr) {
-                for (const auto& [h, resolved] : resolutions) {
-                    idx_db.insert_hash_resolution(fid, dim, h, resolved);
-                }
+                writer->insert_file_bloom_filter(fid, dim, blob.data(),
+                                                 static_cast<int>(blob.size()),
+                                                 bloom.num_entries());
             }
             for (const auto& dim : all_dimensions) {
-                idx_db.insert_index_dimension(fid, dim);
+                writer->insert_index_dimension(fid, dim);
             }
-
-            idx_db.commit_transaction();
+            writer->commit();
 
             std::string basename = fs::path(abs_path).filename().string();
             std::printf("  %s: indexed (%zu events, %zu chunks)\n",
@@ -435,6 +462,117 @@ static coro::CoroTask<int> run_verify(
     co_return 0;
 }
 
+class GenFakeTraceArgParse : public cli::ArgParse {  // fake-trace CLI options
+   public:
+    cli::PipelineArgs pipeline;  // handed to schema() in the constructor
+
+    std::string output_dir;                         // -o, --output-dir
+    int num_ranks = 8;                              // -p, --num-processes
+    int num_hosts = 4;                              // -H, --num-hosts
+    int num_epochs = 500;                           // -e, --num-epochs
+    int steps_per_epoch = 1000;                     // -s, --steps-per-epoch
+    int checkpoint_every = 5;                       // --checkpoint-every
+    int validation_every = 2;                       // --validation-every
+    int num_train_files = 8;                        // --num-train-files
+    int num_val_files = 2;                          // --num-val-files
+    int step_dur_ms = 100;                          // --step-duration-ms
+    std::uint64_t base_seed = 42;                   // --seed
+    bool verify = false;                            // --verify
+    std::size_t checkpoint_size = 2 * 1024 * 1024;  // --checkpoint-size, bytes
+
+    explicit GenFakeTraceArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        schema(pipeline);
+    }
+
+   protected:
+    void register_args() override {  // declares every option read below
+        parser()
+            .add_argument("-o", "--output-dir")
+            .help("Output directory for trace files")
+            .required();
+        parser()
+            .add_argument("-p", "--num-processes")
+            .help("Number of ranks")
+            .scan<'d', int>()
+            .default_value(8);
+        parser()
+            .add_argument("-H", "--num-hosts")
+            .help("Number of hosts")
+            .scan<'d', int>()
+            .default_value(4);
+        parser()
+            .add_argument("-e", "--num-epochs")
+            .help("Training epochs")
+            .scan<'d', int>()
+            .default_value(500);
+        parser()
+            .add_argument("-s", "--steps-per-epoch")
+            .help("Steps per epoch")
+            .scan<'d', int>()
+            .default_value(1000);
+        parser()
+            .add_argument("--checkpoint-every")
+            .help("Checkpoint every N epochs")
+            .scan<'d', int>()
+            .default_value(5);
+        parser()
+            .add_argument("--validation-every")
+            .help("Validate every N epochs")
+            .scan<'d', int>()
+            .default_value(2);
+        parser()
+            .add_argument("--num-train-files")
+            .help("Training data shards")
+            .scan<'d', int>()
+            .default_value(8);
+        parser()
+            .add_argument("--num-val-files")
+            .help("Validation data shards")
+            .scan<'d', int>()
+            .default_value(2);
+        parser()
+            .add_argument("--step-duration-ms")
+            .help("Base step duration in milliseconds")
+            .scan<'d', int>()
+            .default_value(100);
+        parser()
+            .add_argument("--seed")
+            .help("Random seed for duration jitter")
+            .scan<'d', std::uint64_t>()
+            .default_value(static_cast<std::uint64_t>(42));
+        parser()
+            .add_argument("--verify")
+            .help(
+                "After generation, build bloom indices and run queries to "
+                "verify chunk-skipping works")
+            .flag();
+        parser()
+            .add_argument("--checkpoint-size")
+            .help(
+                "Gzip checkpoint size in bytes for indexing (default: 2 MB). "
+                "Smaller values produce more chunks and better demonstrate "
+                "chunk-level bloom filter skipping.")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(2 * 1024 * 1024));
+    }
+
+    void post_parse() override {  // copies parsed values into the fields
+        output_dir = parser().get<std::string>("--output-dir");
+        num_ranks = parser().get<int>("--num-processes");
+        num_hosts = parser().get<int>("--num-hosts");
+        num_epochs = parser().get<int>("--num-epochs");
+        steps_per_epoch = parser().get<int>("--steps-per-epoch");
+        checkpoint_every = parser().get<int>("--checkpoint-every");
+        validation_every = parser().get<int>("--validation-every");
+        num_train_files = parser().get<int>("--num-train-files");
+        num_val_files = parser().get<int>("--num-val-files");
+        step_dur_ms = parser().get<int>("--step-duration-ms");
+        base_seed = parser().get<std::uint64_t>("--seed");
+        verify = parser().get<bool>("--verify");
+        checkpoint_size = parser().get<std::size_t>("--checkpoint-size");
+    }
+};
+
 // ---------------------------------------------------------------------------
 // main
 // ---------------------------------------------------------------------------
@@ -448,96 +586,23 @@ int main(int argc, char** argv) {
         "Produces per-rank .pfw.gz files with known patterns "
         "suitable for testing bloom-filter indexing.");
 
-    program.add_argument("-o", "--output-dir")
-        .help("Output directory for trace files")
-        .required();
-
-    program.add_argument("-p", "--num-processes")
-        .help("Number of ranks")
-        .scan<'d', int>()
-        .default_value(8);
-
-    program.add_argument("-H", "--num-hosts")
-        .help("Number of hosts")
-        .scan<'d', int>()
-        .default_value(4);
-
-    program.add_argument("-e", "--num-epochs")
-        .help("Training epochs")
-        .scan<'d', int>()
-        .default_value(500);
-
-    program.add_argument("-s", "--steps-per-epoch")
-        .help("Steps per epoch")
-        .scan<'d', int>()
-        .default_value(1000);
-
-    program.add_argument("--checkpoint-every")
-        .help("Checkpoint every N epochs")
-        .scan<'d', int>()
-        .default_value(5);
-
-    program.add_argument("--validation-every")
-        .help("Validate every N epochs")
-        .scan<'d', int>()
-        .default_value(2);
-
-    program.add_argument("--num-train-files")
-        .help("Training data shards")
-        .scan<'d', int>()
-        .default_value(8);
-
-    program.add_argument("--num-val-files")
-        .help("Validation data shards")
-        .scan<'d', int>()
-        .default_value(2);
-
-    program.add_argument("--step-duration-ms")
-        .help("Base step duration in milliseconds")
-        .scan<'d', int>()
-        .default_value(100);
-
-    program.add_argument("--seed")
-        .help("Random seed for duration jitter")
-        .scan<'d', std::uint64_t>()
-        .default_value(static_cast<std::uint64_t>(42));
-
-    program.add_argument("--verify")
-        .help(
-            "After generation, build bloom indices and run queries to "
-            "verify chunk-skipping works")
-        .flag();
-
-    program.add_argument("--checkpoint-size")
-        .help(
-            "Gzip checkpoint size in bytes for indexing (default: 2 MB). "
-            "Smaller values produce more chunks and better demonstrate "
-            "chunk-level bloom filter skipping.")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(2 * 1024 * 1024));
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        std::fprintf(stderr, "Error: %s\n", err.what());
-        std::fprintf(stderr, "%s\n", program.help().str().c_str());
-        return 1;
-    }
-
-    const std::string output_dir = program.get<std::string>("--output-dir");
-    const int num_ranks = program.get<int>("--num-processes");
-    const int num_hosts = program.get<int>("--num-hosts");
-    const int num_epochs = program.get<int>("--num-epochs");
-    const int steps_per_epoch = program.get<int>("--steps-per-epoch");
-    const int checkpoint_every = program.get<int>("--checkpoint-every");
-    const int validation_every = program.get<int>("--validation-every");
-    const int num_train_files = program.get<int>("--num-train-files");
-    const int num_val_files = program.get<int>("--num-val-files");
-    const int step_dur_ms = program.get<int>("--step-duration-ms");
-    const std::uint64_t base_seed = program.get<std::uint64_t>("--seed");
-    const bool verify = program.get<bool>("--verify");
-    const std::size_t checkpoint_size =
-        program.get<std::size_t>("--checkpoint-size");
+    GenFakeTraceArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
+
+    const auto& output_dir = cli.output_dir;
+    const int num_ranks = cli.num_ranks;
+    const int num_hosts = cli.num_hosts;
+    const int num_epochs = cli.num_epochs;
+    const int steps_per_epoch = cli.steps_per_epoch;
+    const int checkpoint_every = cli.checkpoint_every;
+    const int validation_every = cli.validation_every;
+    const int num_train_files = cli.num_train_files;
+    const int num_val_files = cli.num_val_files;
+    const int step_dur_ms = cli.step_dur_ms;
+    const std::uint64_t base_seed = cli.base_seed;
+    const bool verify = cli.verify;
+    const std::size_t checkpoint_size = cli.checkpoint_size;
 
     // Convert base step duration to microseconds
     const std::uint64_t step_dur_us =
@@ -607,8 +672,8 @@ int main(int argc, char** argv) {
     // -----------------------------------------------------------------------
     // Generate one file per rank (parallel via pipeline)
     // -----------------------------------------------------------------------
-    auto pipeline_config = PipelineConfig::default_config().with_name(
-        "DFTracer Fake Trace Generator");
+    auto pipeline_config = cli::build_pipeline_config(
+        "DFTracer Fake Trace Generator", cli.pipeline);
     Pipeline pipeline(pipeline_config);
 
     auto* generated_files_ptr = &generated_files;
@@ -632,10 +697,11 @@ int main(int argc, char** argv) {
              host_hashes_ptr, host_names_ptr, train_file_names_ptr,
              train_file_hashes_ptr, val_file_names_ptr, val_file_hashes_ptr,
              ckpt_file_name_ptr, ckref_ptr, script_name_ptr, sref_ptr,
-             rank_event_counts_ptr]([[maybe_unused]] CoroScope& ctx)
-                -> coro::CoroTask<std::size_t> {
+             rank_event_counts_ptr](
+                [[maybe_unused]] CoroScope& ctx) -> coro::CoroTask<void> {
                 const std::string& path = (*generated_files_ptr)[rank];
                 TraceWriter writer(path);
+                writer.write("[\n");
                 const std::string& sref = *sref_ptr;
                 const std::string& ckref = *ckref_ptr;
 
@@ -1058,60 +1124,70 @@ int main(int argc, char** argv) {
                     }
                 }
 
+                writer.write("]\n");
                 writer.close();
                 (*rank_event_counts_ptr)[rank] = rank_events;
-                co_return rank_events;
+                co_return;
             },
             "Rank-" + std::to_string(rank));
         rank_tasks.push_back(task);
     }
 
-    pipeline.set_source(rank_tasks);
-    pipeline.execute();
+    // Summary task (prints results after generation)
+    auto* rank_event_counts_for_summary = &rank_event_counts;
+    auto* generated_files_for_summary = &generated_files;
+    auto summary_task = make_task(
+        [num_ranks, checkpoint_every, validation_every,
+         rank_event_counts_for_summary, generated_files_for_summary,
+         val_file_hashes, train_file_hashes, host_hashes,
+         host_names]([[maybe_unused]] CoroScope& ctx) -> coro::CoroTask<void> {
+            std::size_t total_events = 0;
+            for (int rank = 0; rank < num_ranks; ++rank) {
+                std::printf("  rank %d: %zu events -> %s\n", rank,
+                            (*rank_event_counts_for_summary)[rank],
+                            (*generated_files_for_summary)[rank].c_str());
+                total_events += (*rank_event_counts_for_summary)[rank];
+            }
 
-    std::size_t total_events = 0;
-    for (int rank = 0; rank < num_ranks; ++rank) {
-        std::printf("  rank %d: %zu events -> %s\n", rank,
-                    rank_event_counts[rank], generated_files[rank].c_str());
-        total_events += rank_event_counts[rank];
+            std::printf("\n==========================================\n");
+            std::printf("Generation complete\n");
+            std::printf("==========================================\n");
+            std::printf("  Total events: %zu\n", total_events);
+            std::printf("  Total files:  %d\n", num_ranks);
+            std::printf("\nInteresting queries for bloom filter testing:\n");
+            std::printf(
+                "  1. name=pwrite                   (checkpoint I/O, ~%d%% of "
+                "epochs)\n",
+                checkpoint_every > 0 ? 100 / checkpoint_every : 0);
+            std::printf("  2. fhash=%s  (validation data, ~%d%% of epochs)\n",
+                        val_file_hashes[0].c_str(),
+                        validation_every > 0 ? 100 / validation_every : 0);
+            if (!train_file_hashes.empty()) {
+                std::printf("  3. fhash=%s  (rank-specific train shard)\n",
+                            train_file_hashes[0].c_str());
+            }
+            std::printf("  4. hhash=%s  (host-specific, %s)\n",
+                        host_hashes[0].c_str(), host_names[0].c_str());
+            std::printf(
+                "  5. name=allreduce                (every step, dense)\n");
+            std::printf(
+                "  6. name=fsync                    (checkpoint only, "
+                "sparse)\n");
+            std::printf("==========================================\n");
+            co_return;
+        },
+        "Summary");
+
+    for (const auto& rt : rank_tasks) {
+        summary_task->depends_on(rt);
     }
 
-    // -----------------------------------------------------------------------
-    // Summary banner
-    // -----------------------------------------------------------------------
-    std::printf("\n==========================================\n");
-    std::printf("Generation complete\n");
-    std::printf("==========================================\n");
-    std::printf("  Total events: %zu\n", total_events);
-    std::printf("  Total files:  %d\n", num_ranks);
-    std::printf("\nInteresting queries for bloom filter testing:\n");
-    std::printf(
-        "  1. name=pwrite                   (checkpoint I/O, ~%d%% of "
-        "epochs)\n",
-        checkpoint_every > 0 ? 100 / checkpoint_every : 0);
-    std::printf("  2. fhash=%s  (validation data, ~%d%% of epochs)\n",
-                val_file_hashes[0].c_str(),
-                validation_every > 0 ? 100 / validation_every : 0);
-    if (!train_file_hashes.empty()) {
-        std::printf("  3. fhash=%s  (rank-specific train shard)\n",
-                    train_file_hashes[0].c_str());
-    }
-    std::printf("  4. hhash=%s  (host-specific, %s)\n", host_hashes[0].c_str(),
-                host_names[0].c_str());
-    std::printf("  5. name=allreduce                (every step, dense)\n");
-    std::printf(
-        "  6. name=fsync                    (checkpoint only, sparse)\n");
-    std::printf("==========================================\n");
+    std::shared_ptr<Task> final_task = summary_task;
+    std::shared_ptr<Task> verify_task;
 
-    // -----------------------------------------------------------------------
-    // Verify mode: build bloom indices and run queries
-    // -----------------------------------------------------------------------
     if (verify) {
         std::vector<QuerySpec> test_queries;
 
-        // --- Single-dimension queries ---
-
-        // name dimension
         test_queries.push_back(
             {"name=pwrite (sparse, ckpt only)", {{"name", {"pwrite"}}}});
         test_queries.push_back(
@@ -1120,24 +1196,19 @@ int main(int argc, char** argv) {
             {"name=fsync (sparse, ckpt only)", {{"name", {"fsync"}}}});
         test_queries.push_back(
             {"name=val_forward (periodic)", {{"name", {"val_forward"}}}});
-
-        // cat dimension
         test_queries.push_back(
             {"cat=POSIX (all I/O events)", {{"cat", {"POSIX"}}}});
         test_queries.push_back(
             {"cat=APP (all compute events)", {{"cat", {"APP"}}}});
 
-        // pid dimension (rank-specific)
         std::string pid0 = std::to_string(1000);
         test_queries.push_back(
             {"pid=" + pid0 + " (rank 0 only)", {{"pid", {pid0}}}});
 
-        // tid dimension (io thread vs main thread)
         std::string tid_io_0 = std::to_string(10001);
         test_queries.push_back(
             {"tid=" + tid_io_0 + " (rank 0 io thread)", {{"tid", {tid_io_0}}}});
 
-        // fhash dimension (resolved file names)
         test_queries.push_back({"fhash=" + val_file_names[0] + " (resolved)",
                                 {{"fhash", {val_file_hashes[0]}}}});
         if (!train_file_hashes.empty()) {
@@ -1147,36 +1218,22 @@ int main(int argc, char** argv) {
         }
         test_queries.push_back(
             {"fhash=ckpt (resolved)", {{"fhash", {ckpt_file_hash}}}});
-
-        // hhash dimension (host-specific)
         test_queries.push_back({"hhash=" + host_names[0] + " (resolved)",
                                 {{"hhash", {host_hashes[0]}}}});
-
-        // sref dimension (script hash)
         test_queries.push_back(
             {"shash=train_unet3d (resolved)", {{"shash", {script_hash}}}});
 
-        // --- Multi-dimension AND queries ---
-
-        // name AND cat (checkpoint writes that are POSIX I/O)
         test_queries.push_back({"name=pwrite AND cat=POSIX",
                                 {{"name", {"pwrite"}}, {"cat", {"POSIX"}}}});
-
-        // name AND fhash (fsync on checkpoint file only)
         test_queries.push_back(
             {"name=fsync AND fhash=ckpt",
              {{"name", {"fsync"}}, {"fhash", {ckpt_file_hash}}}});
-
-        // cat AND hhash (POSIX I/O on node-0)
         test_queries.push_back(
             {"cat=POSIX AND hhash=" + host_names[0],
              {{"cat", {"POSIX"}}, {"hhash", {host_hashes[0]}}}});
-
-        // cat AND pid (APP events for rank 0)
         test_queries.push_back(
             {"cat=APP AND pid=" + pid0, {{"cat", {"APP"}}, {"pid", {pid0}}}});
 
-        // name AND hhash AND fhash (read on node-0 for train shard 0)
         if (!train_file_hashes.empty()) {
             test_queries.push_back(
                 {"name=read AND hhash=" + host_names[0] + " AND fhash=shard_0",
@@ -1185,23 +1242,15 @@ int main(int argc, char** argv) {
                   {"fhash", {train_file_hashes[0]}}}});
         }
 
-        // --- OR-within dimension queries ---
-
-        // name = pwrite OR write (all checkpoint write ops)
         test_queries.push_back({"name=pwrite|write (ckpt writes)",
                                 {{"name", {"pwrite", "write"}}}});
-
-        // name = open OR close (all open/close ops)
         test_queries.push_back({"name=open|close (all open/close)",
                                 {{"name", {"open", "close"}}}});
-
-        // fhash = any val file (all validation I/O)
         test_queries.push_back(
             {"fhash=any val file (OR)",
              {{"fhash", std::vector<std::string>(val_file_hashes.begin(),
                                                  val_file_hashes.end())}}});
 
-        // --- Negative tests ---
         test_queries.push_back(
             {"name=NONEXISTENT (expect 0)", {{"name", {"NONEXISTENT"}}}});
         test_queries.push_back(
@@ -1209,7 +1258,25 @@ int main(int argc, char** argv) {
         test_queries.push_back({"name=pwrite AND cat=APP (impossible)",
                                 {{"name", {"pwrite"}}, {"cat", {"APP"}}}});
 
-        return run_verify(generated_files, test_queries, checkpoint_size).get();
+        auto* gf_ptr = &generated_files;
+        verify_task = make_task(
+            [gf_ptr, test_queries = std::move(test_queries),
+             checkpoint_size](CoroScope& ctx) -> coro::CoroTask<int> {
+                co_return co_await run_verify(ctx, *gf_ptr, test_queries,
+                                              checkpoint_size);
+            },
+            "Verify");
+
+        verify_task->depends_on(summary_task);
+        final_task = verify_task;
+    }
+
+    pipeline.set_source(rank_tasks);
+    pipeline.set_destination(final_task);
+    pipeline.execute();
+
+    if (verify && verify_task) {
+        return verify_task->get<int>();
     }
 
     return 0;
diff --git a/src/dftracer/utils/binaries/dftracer_index.cpp b/src/dftracer/utils/binaries/dftracer_index.cpp
index 249f262a..bd02f51e 100644
--- a/src/dftracer/utils/binaries/dftracer_index.cpp
+++ b/src/dftracer/utils/binaries/dftracer_index.cpp
@@ -1,40 +1,122 @@
+#include <concurrentqueue.h>
 #include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/platform_compat.h>
-#include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h>
 #include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
-#include <dftracer/utils/utilities/indexer/internal/indexer.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
 
-#include <argparse/argparse.hpp>
+#include <algorithm>
 #include <atomic>
 #include <chrono>
+#include <memory>
 #include <sstream>
-#include <thread>
+
+#include "common_cli.h"
 
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::composites::dft::indexing;
 using namespace dftracer::utils::utilities::indexer;
 
-static coro::CoroTask<int> run_index(argparse::ArgumentParser& program) {
-    std::string log_dir = program.get<std::string>("--directory");
-    std::string dimensions_str = program.get<std::string>("--dimensions");
-    bool force_rebuild = program.get<bool>("--force");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    std::size_t expected_entries =
-        program.get<std::size_t>("--expected-entries");
-    double false_positive_rate = program.get<double>("--false-positive-rate");
-    bool build_manifest = program.get<bool>("--manifest");
+class IndexArgParse : public cli::ArgParse {  // dftracer_index CLI options
+   public:
+    cli::DirectoryArgs directory;  // shared group; run_index reads .value
+    cli::PipelineArgs pipeline;    // shared group; feeds the pipeline config
+    cli::IndexingArgs indexing;    // shared group; force/checkpoint/index dir
+
+    std::string dimensions;               // --dimensions, comma-separated
+    bool manifest = false;                // --manifest
+    bool rebuild_summaries = false;       // --rebuild-summaries
+    std::size_t read_batch_size = 4;      // --read-batch-size, in MB
+    std::size_t expected_entries = 1024;  // --expected-entries, per chunk
+    double false_positive_rate = 0.01;    // --false-positive-rate
+
+    explicit IndexArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.index_dir_help =
+            "Directory where .dftindex stores are created";  // --index-dir
+        indexing.force_help = "Force index recreation even if already built";
+        schema(directory, pipeline, indexing);
+    }
+
+   protected:
+    void register_args() override {  // declares the tool-only options
+        parser()
+            .add_argument("--dimensions")
+            .help(
+                "Comma-separated extra dimensions to index from args "
+                "(e.g., args.level,args.mode,args.io.size)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--manifest")
+            .help(
+                "Also build manifest data in the .dftindex store "
+                "(per-checkpoint event line routing)")
+            .flag();  // boolean switch, false unless given
+
+        parser()
+            .add_argument("--rebuild-summaries")
+            .help(
+                "Rebuild ROOT_* aggregated summaries after ingest. Off by "
+                "default; ROOT_* CFs are only used by summary tools like "
+                "dftracer_info. Bloom-filter chunk-skipping queries do not "
+                "need them.")
+            .flag();  // boolean switch, false unless given
+
+        parser()
+            .add_argument("--read-batch-size")
+            .help("Batch read size in MB for stream processing (default: 4)")
+            .scan<'d', std::size_t>()  // parsed as a decimal integer
+            .default_value(static_cast<std::size_t>(4));
+
+        parser()
+            .add_argument("--expected-entries")
+            .help(
+                "Expected entries per chunk for bloom filter sizing (default: "
+                "1024)")
+            .scan<'d', std::size_t>()  // parsed as a decimal integer
+            .default_value(static_cast<std::size_t>(1024));
+
+        parser()
+            .add_argument("--false-positive-rate")
+            .help("Bloom filter false positive rate (default: 0.01)")
+            .scan<'g', double>()  // parsed as general floating point
+            .default_value(0.01);
+    }
+
+    void post_parse() override {  // copy parsed values into the fields
+        dimensions = parser().get<std::string>("--dimensions");
+        manifest = parser().get<bool>("--manifest");
+        rebuild_summaries = parser().get<bool>("--rebuild-summaries");
+        read_batch_size = parser().get<std::size_t>("--read-batch-size");
+        expected_entries = parser().get<std::size_t>("--expected-entries");
+        false_positive_rate = parser().get<double>("--false-positive-rate");
+    }
+};
+
+static coro::CoroTask<int> run_index(const IndexArgParse* cli) {
+    const auto log_dir = fs::absolute(cli->directory.value).string();
+    const auto& dimensions_str = cli->dimensions;
+    const auto force_rebuild = cli->indexing.force;
+    const auto checkpoint_size = cli->indexing.checkpoint_size;
+    const auto executor_threads = cli->pipeline.executor_threads;
+    // When --index-dir is not provided, place coord/staging/ingest DBs next to
+    // the input data so they line up with each file's per-file index_path
+    // (which determine_index_path(file, "") resolves to
+    // <file_parent>/.dftindex). The top-level scanner is non-recursive, so all
+    // input files share log_dir.
+    const auto index_dir =
+        cli->indexing.index_dir.empty() ? log_dir : cli->indexing.index_dir;
+    const auto expected_entries = cli->expected_entries;
+    const auto false_positive_rate = cli->false_positive_rate;
+    const auto build_manifest = cli->manifest;
+    const auto rebuild_summaries = cli->rebuild_summaries;
 
     auto split_string = [](const std::string& str) {
         std::vector<std::string> result;
@@ -49,22 +131,30 @@ static coro::CoroTask<int> run_index(argparse::ArgumentParser& program) {
         return result;
     };
 
-    std::vector<std::string> extra_dimensions = split_string(dimensions_str);
+    std::vector<std::string> user_dimensions = split_string(dimensions_str);
+
+    std::vector<std::string> extra_dimensions(
+        dftracer::utils::utilities::indexer::DEFAULT_EXTRA_DIMENSIONS.begin(),
+        dftracer::utils::utilities::indexer::DEFAULT_EXTRA_DIMENSIONS.end());
+    for (const auto& dim : user_dimensions) {
+        if (std::find(extra_dimensions.begin(), extra_dimensions.end(), dim) ==
+            extra_dimensions.end()) {
+            extra_dimensions.push_back(dim);
+        }
+    }
 
     ChunkIndexerConfig indexer_config;
     indexer_config.extra_dimensions = extra_dimensions;
     indexer_config.expected_entries_per_chunk = expected_entries;
     indexer_config.false_positive_rate = false_positive_rate;
 
-    // Default bloom dimensions + any user-supplied extras.
-    std::vector<std::string> all_dimensions =
-        dftracer::utils::utilities::indexer::default_bloom_dimensions();
+    std::vector<std::string> all_dimensions(
+        dftracer::utils::utilities::indexer::DEFAULT_BLOOM_DIMENSIONS.begin(),
+        dftracer::utils::utilities::indexer::DEFAULT_BLOOM_DIMENSIONS.end());
     for (const auto& dim : extra_dimensions) {
         all_dimensions.push_back(dim);
     }
 
-    log_dir = fs::absolute(log_dir).string();
-
     std::printf("==========================================\n");
     std::printf("DFTracer Bloom Indexer\n");
     std::printf("==========================================\n");
@@ -106,98 +196,60 @@ static coro::CoroTask<int> run_index(argparse::ArgumentParser& program) {
 
     DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size());
 
-    auto pipeline_config = PipelineConfig()
-                               .with_name("DFTracer Bloom Indexer")
-                               .with_compute_threads(executor_threads)
-                               .with_watchdog(false);
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer Bloom Indexer", cli->pipeline);
 
     Pipeline pipeline(pipeline_config);
 
     auto start_time = std::chrono::high_resolution_clock::now();
 
-    std::atomic<std::size_t> total_events{0};
-    std::atomic<std::size_t> total_checkpoints_processed{0};
-    std::atomic<std::size_t> total_files_processed{0};
-    std::atomic<std::size_t> total_files_skipped{0};
+    std::vector<int> preassigned_file_ids;
+    {
+        IndexDatabase coord_db(index_dir);
+        coord_db.init_schema();
+        preassigned_file_ids =
+            coord_db.register_files(input_files, build_manifest);
+    }
+
+    const std::string staging_root =
+        (fs::path(index_dir) / ".dftindex_staging").string();
+    fs::create_directories(staging_root);
+
+    auto artifacts_queue = std::make_shared<moodycamel::ConcurrentQueue<
+        IndexDatabaseSstWriterContext::Artifacts>>();
+    auto batch_counter = std::make_shared<std::atomic<std::size_t>>(0);
+
+    auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+    batch_config->file_paths = std::move(input_files);
+    batch_config->preassigned_file_ids = std::move(preassigned_file_ids);
+    batch_config->index_dir = index_dir;
+    batch_config->checkpoint_size = checkpoint_size;
+    batch_config->parallelism = executor_threads;
+    batch_config->force_rebuild = force_rebuild;
+    batch_config->build_manifest = build_manifest;
+    batch_config->bloom_config = indexer_config;
+    batch_config->bloom_dimensions = all_dimensions;
+    batch_config->rebuild_root_summaries = false;
+
+    batch_config->sink_factory =
+        [staging_root, batch_counter]() -> std::unique_ptr<IndexBatchSink> {
+        const std::size_t idx =
+            batch_counter->fetch_add(1, std::memory_order_relaxed);
+        return std::make_unique<IndexDatabaseSstWriterContext>(
+            staging_root, "batch_" + std::to_string(idx));
+    };
+    batch_config->sink_commit = [artifacts_queue](IndexBatchSink& sink) {
+        auto& sst = static_cast<IndexDatabaseSstWriterContext&>(sink);
+        auto a = sst.commit();
+        if (!a.empty()) artifacts_queue->enqueue(std::move(a));
+    };
 
+    IndexBuildBatchResult batch_result;
     auto streaming_task = make_task(
-        [&](CoroScope& ctx) -> coro::CoroTask<void> {
-            co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask<void> {
-                auto* total_events_ptr = &total_events;
-                auto* total_checkpoints_ptr = &total_checkpoints_processed;
-                auto* total_processed_ptr = &total_files_processed;
-                auto* total_skipped_ptr = &total_files_skipped;
-                auto* all_dims_ptr = &all_dimensions;
-                auto* files_ptr = &input_files;
-                auto* index_dir_ptr = &index_dir;
-                // Bounded fan-out: channel limits concurrent file processing
-                // to avoid memory pressure from unbounded coroutine spawning.
-                auto file_chan =
-                    coro::make_channel<std::size_t>(executor_threads * 2);
-
-                // Producer: push file indices into channel
-                scope.spawn([ch = file_chan->producer(),
-                             num_files = input_files.size()](
-                                CoroScope&) mutable -> coro::CoroTask<void> {
-                    auto guard = ch.guard();
-                    for (std::size_t i = 0; i < num_files; ++i) {
-                        if (!co_await ch.send(i)) {
-                            co_return;
-                        }
-                    }
-                    co_return;
-                });
-
-                // Workers: consume from channel, process one file at a time
-                for (std::size_t w = 0; w < executor_threads; ++w) {
-                    scope.spawn([file_chan, files_ptr, indexer_config,
-                                 build_manifest, index_dir_ptr, checkpoint_size,
-                                 force_rebuild, all_dims_ptr, total_events_ptr,
-                                 total_checkpoints_ptr, total_processed_ptr,
-                                 total_skipped_ptr](
-                                    CoroScope&) -> coro::CoroTask<void> {
-                        while (auto fi_opt = co_await file_chan->receive()) {
-                            std::size_t fi = *fi_opt;
-                            const auto& file_path = (*files_ptr)[fi];
-
-                            IndexBuilderUtility builder;
-                            auto config =
-                                IndexBuildConfig::for_file(file_path)
-                                    .with_index_dir(*index_dir_ptr)
-                                    .with_checkpoint_size(checkpoint_size)
-                                    .with_force_rebuild(force_rebuild)
-                                    .with_bloom(true)
-                                    .with_manifest(build_manifest)
-                                    .with_index_threshold(0)
-                                    .with_bloom_config(indexer_config)
-                                    .with_bloom_dimensions(*all_dims_ptr);
-
-                            auto result = co_await builder.process(config);
-
-                            if (result.was_skipped) {
-                                (*total_skipped_ptr)++;
-                            } else if (result.success) {
-                                (*total_processed_ptr)++;
-                                (*total_events_ptr) += result.events_processed;
-                                (*total_checkpoints_ptr) +=
-                                    result.chunks_processed;
-                            } else {
-                                (*total_skipped_ptr)++;
-                                if (!result.error_message.empty()) {
-                                    DFTRACER_UTILS_LOG_ERROR(
-                                        "Index failed for %s: %s",
-                                        file_path.c_str(),
-                                        result.error_message.c_str());
-                                }
-                            }
-                        }
-                        co_return;
-                    });
-                }
-                co_return;
-            });
-
-            co_return;
+        [&batch_result,
+         batch_config](CoroScope& scope) -> coro::CoroTask<void> {
+            batch_result = co_await IndexBatchBuilderUtility::process(
+                &scope, std::move(batch_config));
         },
         "StreamingIndex");
 
@@ -205,6 +257,38 @@ static coro::CoroTask<int> run_index(argparse::ArgumentParser& program) {
     pipeline.set_destination(streaming_task);
     pipeline.execute();
 
+    SstArtifactRegistry registry;
+    {
+        IndexDatabaseSstWriterContext::Artifacts a;
+        while (artifacts_queue->try_dequeue(a)) {
+            registry.append(std::move(a));
+        }
+    }
+    DFTRACER_UTILS_LOG_INFO(
+        "dftracer_index: %zu SSTs in registry (chunk_bloom=%zu file_bloom=%zu)",
+        registry.chunk_bloom().size() + registry.file_bloom().size() +
+            registry.chunk_stats().size(),
+        registry.chunk_bloom().size(), registry.file_bloom().size());
+    {
+        IndexDatabase ingest_db(index_dir);
+        auto t0 = std::chrono::high_resolution_clock::now();
+        ingest_db.bulk_ingest(registry, {});
+        auto t1 = std::chrono::high_resolution_clock::now();
+        if (rebuild_summaries) {
+            ingest_db.rebuild_root_summaries();
+        }
+        auto t2 = std::chrono::high_resolution_clock::now();
+        DFTRACER_UTILS_LOG_INFO(
+            "dftracer_index: bulk_ingest=%.2fms "
+            "rebuild_root_summaries=%.2fms%s",
+            std::chrono::duration<double, std::milli>(t1 - t0).count(),
+            std::chrono::duration<double, std::milli>(t2 - t1).count(),
+            rebuild_summaries ? "" : " (skipped)");
+    }
+
+    std::error_code ec;
+    fs::remove_all(staging_root, ec);
+
     auto end_time = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double, std::milli> duration = end_time - start_time;
 
@@ -213,11 +297,11 @@ static coro::CoroTask<int> run_index(argparse::ArgumentParser& program) {
     std::printf("Bloom Index Results\n");
     std::printf("==========================================\n");
     std::printf("  Execution time: %.2f seconds\n", duration.count() / 1000.0);
-    std::printf("  Files processed: %zu\n", total_files_processed.load());
-    std::printf("  Files skipped: %zu\n", total_files_skipped.load());
-    std::printf("  Checkpoints indexed: %zu\n",
-                total_checkpoints_processed.load());
-    std::printf("  Events processed: %zu\n", total_events.load());
+    std::printf("  Files processed: %zu\n", batch_result.indexed);
+    std::printf("  Files skipped: %zu\n", batch_result.skipped);
+    std::printf("  Files failed: %zu\n", batch_result.failed);
+    std::printf("  Events processed: %zu\n",
+                static_cast<std::size_t>(batch_result.total_events));
     std::printf("  Dimensions indexed: %zu\n", all_dimensions.size());
     std::printf("  Dimensions: ");
     for (std::size_t i = 0; i < all_dimensions.size(); ++i) {
@@ -235,13 +319,6 @@ static coro::CoroTask<int> run_index(argparse::ArgumentParser& program) {
 int main(int argc, char** argv) {
     DFTRACER_UTILS_LOGGER_INIT();
 
-    auto default_checkpoint_size_str =
-        std::to_string(indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE) +
-        " B (" +
-        std::to_string(indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE /
-                       (1024 * 1024)) +
-        " MB)";
-
     argparse::ArgumentParser program("dftracer_index",
                                      DFTRACER_UTILS_PACKAGE_VERSION);
     program.add_description(
@@ -249,67 +326,9 @@ int main(int argc, char** argv) {
         "Creates root-local .dftindex databases enabling fast chunk-skipping "
         "queries.");
 
-    program.add_argument("-d", "--directory")
-        .help("Input directory containing .pfw or .pfw.gz files")
-        .default_value<std::string>(".");
-
-    program.add_argument("--dimensions")
-        .help(
-            "Comma-separated extra dimensions to index from args "
-            "(e.g., args.level,args.mode,args.io.size)")
-        .default_value<std::string>("");
-
-    program.add_argument("-f", "--force")
-        .help("Force index recreation even if already built")
-        .flag();
-
-    program.add_argument("--checkpoint-size")
-        .help("Checkpoint size for gzip indexing in bytes (default: " +
-              default_checkpoint_size_str + ")")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(
-            indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE));
-
-    program.add_argument("--executor-threads")
-        .help("Number of worker threads for parallel processing")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    program.add_argument("--index-dir")
-        .help("Directory where .dftindex stores are created")
-        .default_value<std::string>("");
-
-    program.add_argument("--expected-entries")
-        .help(
-            "Expected entries per chunk for bloom filter sizing (default: "
-            "1024)")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(1024));
-
-    program.add_argument("--false-positive-rate")
-        .help("Bloom filter false positive rate (default: 0.01)")
-        .scan<'g', double>()
-        .default_value(0.01);
-
-    program.add_argument("--read-batch-size")
-        .help("Batch read size in MB for stream processing (default: 4)")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(4));
-
-    program.add_argument("--manifest")
-        .help(
-            "Also build manifest data in the .dftindex store "
-            "(per-checkpoint event line routing)")
-        .flag();
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what());
-        std::fprintf(stderr, "%s\n", program.help().str().c_str());
-        return 1;
-    }
+    IndexArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
-    return run_index(program).get();
+    return run_index(&cli).get();
 }
diff --git a/src/dftracer/utils/binaries/dftracer_info.cpp b/src/dftracer/utils/binaries/dftracer_info.cpp
index 7c4a7191..221098ed 100644
--- a/src/dftracer/utils/binaries/dftracer_info.cpp
+++ b/src/dftracer/utils/binaries/dftracer_info.cpp
@@ -1,156 +1,226 @@
 #include <dftracer/utils/core/common/archive_format.h>
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
+#include <dftracer/utils/core/rocksdb/db_manager.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/core/utils/string.h>
+#include <dftracer/utils/core/utils/timer.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
-#include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
+#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer_factory.h>
 
-#include <argparse/argparse.hpp>
-#include <atomic>
-#include <chrono>
 #include <iomanip>
-#include <iostream>
+#include <memory>
 #include <mutex>
-#include <unordered_set>
+
+#include "common_cli.h"
 
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities::composites::dft;
-using dftracer::utils::utilities::indexer::IndexBuildConfig;
-using dftracer::utils::utilities::indexer::IndexBuilderUtility;
+using namespace dftracer::utils::utilities::composites::dft::indexing;
+using dftracer::utils::utilities::indexer::FileRegistryEntry;
+using dftracer::utils::utilities::indexer::has_capability;
+using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBuildBatchConfig;
 using dftracer::utils::utilities::indexer::IndexDatabase;
-using dftracer::utils::utilities::indexer::internal::Indexer;
+using dftracer::utils::utilities::indexer::IndexFileEntryCapability;
 using dftracer::utils::utilities::indexer::internal::IndexerFactory;
 
+class InfoArgParse : public cli::ArgParse {  // dftracer_info CLI options
+   public:
+    cli::DirectoryArgs directory{cli::DirMode::DEFAULT_EMPTY};
+    cli::FilesArgs files_args{"Compressed files to inspect (GZIP, TAR.GZ)"};
+    cli::PipelineArgs pipeline;
+    cli::IndexingArgs indexing;
+    // Tool-specific options; post_parse() overwrites these defaults.
+    std::string query_type = "summary";
+    bool verbose = false;
+    bool force_rebuild = false;
+    // Adjusts the shared indexing group, then hands all groups to schema().
+    explicit InfoArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.with_force = false;  // presumably replaced by -f below
+        indexing.index_dir_help =
+            "Directory to store index files (default: system temp directory)";
+        schema(directory, files_args, pipeline, indexing);
+    }
+    // Hooks overridden from cli::ArgParse.
+   protected:
+    void register_args() override {
+        parser()
+            .add_argument("--query")
+            .help(
+                "Query type: summary (aggregate all files, default) or "
+                "detailed (per-file output)")
+            .default_value<std::string>("summary");
+
+        parser()
+            .add_argument("-v", "--verbose")
+            .help("Show detailed information including index details")
+            .flag();
+
+        parser()
+            .add_argument("-f", "--force-rebuild")
+            .help("Force rebuild index files")
+            .flag();
+    }
+    // Copies the parsed option values into the public members.
+    void post_parse() override {
+        query_type = parser().get<std::string>("--query");
+        verbose = parser().get<bool>("--verbose");
+        force_rebuild = parser().get<bool>("--force-rebuild");
+    }
+};
+
 static std::string format_size(std::uint64_t bytes) {
     const char* units[] = {"B", "KB", "MB", "GB", "TB"};
     int unit_index = 0;
     double size = static_cast<double>(bytes);
-
     while (size >= 1024.0 && unit_index < 4) {
         size /= 1024.0;
         unit_index++;
     }
-
     std::ostringstream oss;
     oss << std::fixed << std::setprecision(2) << size << " "
         << units[unit_index];
     return oss.str();
 }
 
-/// Fast path: read metadata from the `.dftindex` database.
-/// Returns success=false if index doesn't exist, letting the caller
-/// fall back to direct_scan_info for small/unindexed files.
-static MetadataCollectorUtilityOutput index_based_info(
-    const std::string& file_path) {
-    using utilities::indexer::IndexDatabase;
+using FileRegistry = std::unordered_map<std::string, FileRegistryEntry>;
 
-    MetadataCollectorUtilityOutput meta;
-    meta.file_path = file_path;
+struct RootInfoSummary {  // index-wide totals for the summary view
+    std::size_t file_count = 0;  // from root stats num_files
+    std::uint64_t total_events = 0;  // from root stats.total_events
+    std::uint64_t total_lines = 0;  // from root total_lines
+    std::uint64_t total_uncompressed = 0;  // bytes
+};
 
-    try {
-        std::string index_path = file_path + constants::indexer::EXTENSION;
-        if (!fs::exists(index_path)) {
-            meta.success = false;
-            return meta;
-        }
+static coro::CoroTask<std::shared_ptr<RootInfoSummary>> load_root_info_summary(
+    std::string index_path) {  // reads root scalar stats of this index
+    auto result = std::make_shared<RootInfoSummary>();  // all-zero defaults
 
+    std::optional<dftracer::utils::utilities::indexer::RootStatisticsResult>
+        root;
+    {  // scoped: the read-only handle is destroyed before any reopen below
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        root = db.query_root_scalar_stats();
+    }
+    // Fallback: reopen without ReadOnly, rebuild root summaries, query again.
+    if (!root) {
+        DFTRACER_UTILS_LOG_INFO(
+            "Root scalar stats missing for %s; rebuilding from file "
+            "registry",
+            index_path.c_str());
         IndexDatabase db(index_path);
-        int fid = db.find_file(file_path);
-        if (fid < 0) {
-            meta.success = false;
-            return meta;
-        }
+        auto writer = db.begin_write();
+        writer->rebuild_root_summaries();
+        writer->commit();
+        root = db.query_root_scalar_stats();
+    }
 
-        meta.format = IndexerFactory::detect_format(file_path);
-        meta.compressed_size = fs::file_size(file_path);
-        meta.num_lines = db.get_num_lines(fid);
-        meta.uncompressed_size = db.get_max_bytes(fid);
-        meta.valid_events = db.get_total_events(fid);
+    if (root) {  // still empty after the rebuild: result stays zeroed
+        result->file_count = root->num_files;
+        result->total_events = root->stats.total_events;
+        result->total_lines = root->total_lines;
+        result->total_uncompressed = root->total_uncompressed_bytes;
+    }
+    co_return result;
+}
+/// Opens the index read-only and returns its complete file registry.
+static coro::CoroTask<std::shared_ptr<FileRegistry>> load_file_registry(
+    std::string index_path) {
+    IndexDatabase db(
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+    co_return std::make_shared<FileRegistry>(db.query_all_file_registry());
+}
+/// Builds one MetadataCollectorUtilityOutput per entry from a shared index.
+static std::vector<MetadataCollectorUtilityOutput>
+process_index_group_info_sync(std::string index_path,
+                              std::vector<ResolvedFile> entries) {
+    std::vector<int> file_ids;
+    file_ids.reserve(entries.size());
+    for (const auto& entry : entries) {
+        file_ids.push_back(entry.file_id);
+    }
+    // One read-only open and two batched lookups serve every entry.
+    IndexDatabase db(
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+    auto metadata_rows = db.query_file_metadata_batch(file_ids);
+    auto merged_stats = db.query_merged_statistics_batch(file_ids);
+
+    std::vector<MetadataCollectorUtilityOutput> results;
+    results.reserve(entries.size());
+
+    for (const auto& entry : entries) {
+        MetadataCollectorUtilityOutput meta;
+        meta.file_path = entry.file_path;
         meta.index_path = index_path;
         meta.has_index = true;
         meta.index_valid = true;
+        // A missing metadata row yields a failed result, not a dropped one.
+        auto metadata_it = metadata_rows.find(entry.file_id);
+        if (metadata_it == metadata_rows.end()) {
+            meta.success = false;
+            meta.error_message = "No file metadata found in shared index";
+            results.push_back(std::move(meta));
+            continue;
+        }
+
+        meta.format = IndexerFactory::detect_format(entry.file_path);
+        meta.compressed_size = 0;  // not read here; printer skips the row
+        meta.checkpoint_size = metadata_it->second.checkpoint_size;
+        meta.num_lines = metadata_it->second.num_lines;
+        meta.uncompressed_size = metadata_it->second.max_bytes;
         meta.size_mb =
-            static_cast<double>(meta.compressed_size) / (1024.0 * 1024.0);
+            static_cast<double>(meta.uncompressed_size) / (1024.0 * 1024.0);
         meta.start_line = 1;
         meta.end_line = meta.num_lines;
-        meta.size_per_line =
-            (meta.valid_events > 0)
-                ? meta.size_mb / static_cast<double>(meta.valid_events)
-                : 0;
-        meta.success = true;
-    } catch (...) {
-        meta.success = false;
-    }
 
-    return meta;
-}
+        if (meta.checkpoint_size > 0 && meta.uncompressed_size > 0) {
+            meta.num_checkpoints =  // ceil(uncompressed / checkpoint_size)
+                (meta.uncompressed_size + meta.checkpoint_size - 1) /
+                meta.checkpoint_size;
+        }
 
-/// One streaming decompress pass, count lines with JSON validation,
-/// without creating a `.dftindex` store.
-static coro::CoroTask<MetadataCollectorUtilityOutput> direct_scan_info(
-    std::string file_path) {
-    using dftracer::utils::utilities::fileio::lines::sources::
-        async_streaming_gz_lines;
-
-    MetadataCollectorUtilityOutput meta;
-    meta.file_path = file_path;
-    meta.has_index = false;
-    meta.index_valid = false;
-
-    try {
-        meta.format = dftracer::utils::utilities::indexer::internal::
-            IndexerFactory::detect_format(file_path);
-        meta.compressed_size = fs::file_size(file_path);
-
-        std::size_t total_lines = 0;
-        std::size_t valid_events = 0;
-        std::uint64_t total_bytes = 0;
-
-        auto gen = async_streaming_gz_lines(file_path);
-        while (auto line_opt = co_await gen.next()) {
-            total_lines++;
-            const auto& line = *line_opt;
-            total_bytes += line.content.length();
-            const char* trimmed;
-            std::size_t trimmed_length;
-            if (json_trim_and_validate(line.content.data(),
-                                       line.content.length(), trimmed,
-                                       trimmed_length) &&
-                trimmed_length > 8) {
-                valid_events++;
+        auto stats_it = merged_stats.find(entry.file_id);
+        if (stats_it != merged_stats.end()) {
+            meta.valid_events = stats_it->second.stats.total_events;
+            if (stats_it->second.num_chunks > 0) {  // replaces the estimate
+                meta.num_checkpoints = stats_it->second.num_chunks;
             }
+        } else {  // no merged stats: every line is counted as an event
+            meta.valid_events = meta.num_lines;
         }
 
-        meta.num_lines = total_lines;
-        meta.valid_events = valid_events;
-        meta.uncompressed_size = total_bytes;
-        meta.size_mb =
-            static_cast<double>(meta.compressed_size) / (1024.0 * 1024.0);
-        meta.start_line = 1;
-        meta.end_line = total_lines;
         meta.size_per_line =
-            (valid_events > 0)
-                ? meta.size_mb / static_cast<double>(valid_events)
-                : 0;
+            (meta.valid_events > 0)
+                ? meta.size_mb / static_cast<double>(meta.valid_events)
+                : 0.0;
         meta.success = true;
-    } catch (const std::exception& e) {
-        meta.error_message = e.what();
-        meta.success = false;
+        results.push_back(std::move(meta));
     }
 
-    co_return meta;
+    return results;
+}
+/// Coroutine wrapper that moves *index_path and *entries into the sync helper.
+static coro::CoroTask<std::vector<MetadataCollectorUtilityOutput>>
+process_index_group_info(std::shared_ptr<std::string> index_path,
+                         std::shared_ptr<std::vector<ResolvedFile>> entries) {
+    co_return process_index_group_info_sync(std::move(*index_path),
+                                            std::move(*entries));
+}
 
 static void print_file_info(const MetadataCollectorUtilityOutput& info,
@@ -165,16 +235,16 @@ static void print_file_info(const MetadataCollectorUtilityOutput& info,
         return;
     }
 
-    // Basic Information
     std::printf("Basic Information:\n");
     std::printf("  Format: %s\n", get_format_name(info.format));
     std::printf("  Status: %s\n", "OK");
 
-    // File Size Information
     std::printf("\nFile Size:\n");
-    std::printf("  Compressed:   %12s (%llu bytes)\n",
-                format_size(info.compressed_size).c_str(),
-                (unsigned long long)info.compressed_size);
+    if (info.compressed_size > 0) {
+        std::printf("  Compressed:   %12s (%llu bytes)\n",
+                    format_size(info.compressed_size).c_str(),
+                    (unsigned long long)info.compressed_size);
+    }
     std::printf("  Uncompressed: %12s (%llu bytes)\n",
                 format_size(info.uncompressed_size).c_str(),
                 (unsigned long long)info.uncompressed_size);
@@ -184,78 +254,21 @@ static void print_file_info(const MetadataCollectorUtilityOutput& info,
         double ratio =
             100.0 * (1.0 - static_cast<double>(info.compressed_size) /
                                static_cast<double>(info.uncompressed_size));
-        double compression_factor =
-            static_cast<double>(info.uncompressed_size) /
-            static_cast<double>(info.compressed_size);
-        std::printf(
-            "  Savings:      %12s (%.2f%% reduction)\n",
-            format_size(info.uncompressed_size - info.compressed_size).c_str(),
-            ratio);
-        std::printf("  Ratio:        %.2fx compression\n", compression_factor);
+        std::printf("  Savings:      %.2f%% reduction\n", ratio);
     }
 
-    // Content Information
     std::printf("\nContent:\n");
     std::printf("  Total Lines: %llu\n", (unsigned long long)info.num_lines);
     std::printf("  Valid Events: %zu\n", info.valid_events);
 
-    if (info.num_lines > 0) {
-        std::printf("  Avg Bytes/Line: %.2f bytes\n",
-                    static_cast<double>(info.uncompressed_size) /
-                        static_cast<double>(info.num_lines));
-    }
-
-    if (info.valid_events > 0) {
-        std::printf("  Avg Bytes/Event: %.2f bytes\n",
-                    static_cast<double>(info.uncompressed_size) /
-                        static_cast<double>(info.valid_events));
-    }
-
-    // Index Information (always show if index-capable format)
-    if (info.format == ArchiveFormat::GZIP ||
-        info.format == ArchiveFormat::TAR_GZ) {
+    if (info.has_index && info.index_valid) {
         std::printf("\nIndex Information:\n");
-        std::printf("  Index Store: %s\n", info.index_path.empty()
-                                               ? "(auto-generated)"
-                                               : info.index_path.c_str());
-        std::printf("  Index Status: %s\n",
-                    info.has_index ? (info.index_valid ? "Valid" : "Invalid")
-                                   : "Not Created");
-
-        if (info.has_index && info.index_valid) {
-            std::printf("  Checkpoint Size: %s (%llu bytes)\n",
-                        format_size(info.checkpoint_size).c_str(),
-                        (unsigned long long)info.checkpoint_size);
-            std::printf("  Number of Checkpoints: %zu\n", info.num_checkpoints);
-
-            if (info.num_checkpoints > 0) {
-                std::uint64_t avg_chunk =
-                    info.uncompressed_size / info.num_checkpoints;
-                std::uint64_t lines_per_checkpoint =
-                    info.num_lines / info.num_checkpoints;
-                std::printf("  Avg Chunk Size: %s (%llu bytes)\n",
-                            format_size(avg_chunk).c_str(),
-                            (unsigned long long)avg_chunk);
-                std::printf("  Avg Lines/Checkpoint: %llu\n",
-                            (unsigned long long)lines_per_checkpoint);
-
-                // Calculate index overhead
-                if (fs::exists(info.index_path)) {
-                    std::uint64_t index_size = fs::file_size(info.index_path);
-                    double index_overhead =
-                        100.0 * static_cast<double>(index_size) /
-                        static_cast<double>(info.compressed_size);
-                    std::printf("  Index File Size: %s (%llu bytes)\n",
-                                format_size(index_size).c_str(),
-                                (unsigned long long)index_size);
-                    std::printf("  Index Overhead: %.2f%% of compressed size\n",
-                                index_overhead);
-                }
-            }
-        }
+        std::printf("  Index Store: %s\n", info.index_path.c_str());
+        std::printf("  Checkpoint Size: %s\n",
+                    format_size(info.checkpoint_size).c_str());
+        std::printf("  Checkpoints: %zu\n", info.num_checkpoints);
     }
 
-    // Detailed Statistics (verbose mode)
     if (verbose) {
         std::printf("\nDetailed Statistics:\n");
         std::printf("  Start Line: %zu\n", info.start_line);
@@ -263,283 +276,279 @@ static void print_file_info(const MetadataCollectorUtilityOutput& info,
         std::printf("  Size (MB): %.6f\n", info.size_mb);
         std::printf("  MB per Event: %.8f\n", info.size_per_line);
 
-        // Performance estimates
         if (info.num_checkpoints > 0 && info.num_lines > 0) {
-            std::uint64_t lines_per_checkpoint =
-                info.num_lines / info.num_checkpoints;
+            auto lines_per_ckpt = info.num_lines / info.num_checkpoints;
             std::printf("\nRandom Access Performance:\n");
             std::printf("  Worst-case lines to scan: %llu (1 checkpoint)\n",
-                        (unsigned long long)lines_per_checkpoint);
-            std::printf(
-                "  Best-case lines to scan: 1 (exact checkpoint hit)\n");
+                        (unsigned long long)lines_per_ckpt);
             std::printf("  Avg lines to scan: %llu (0.5 checkpoint)\n",
-                        (unsigned long long)(lines_per_checkpoint / 2));
-        }
-
-        // Memory estimates
-        if (info.checkpoint_size > 0) {
-            std::printf("\nMemory Estimates:\n");
-            std::printf("  Memory for 1 checkpoint: ~%s\n",
-                        format_size(info.checkpoint_size).c_str());
-            if (info.num_checkpoints > 0) {
-                std::uint64_t total_memory_for_all =
-                    info.checkpoint_size * info.num_checkpoints;
-                std::printf("  Memory for all checkpoints: ~%s\n",
-                            format_size(total_memory_for_all).c_str());
-            }
+                        (unsigned long long)(lines_per_ckpt / 2));
         }
     }
 
     std::printf("\n");
 }
 
-int main(int argc, char** argv) {
-    DFTRACER_UTILS_LOGGER_INIT();
+static coro::CoroTask<void> auto_index_and_resolve(  // index, then resolve
+    CoroScope& ctx, std::vector<FileWorkItem>& files_needing_index,
+    const std::string& index_dir, std::size_t checkpoint_size,
+    std::size_t executor_threads,
+    std::unordered_map<std::string, std::vector<ResolvedFile>>&
+        indexed_groups) {  // out: receives the re-resolved cached entries
+    auto index_path = internal::determine_index_path(
+        files_needing_index.front().file_path, index_dir);  // must be non-empty
+    dftracer::utils::rocksdb::RocksDBManager::instance().reset(index_path);
+    // Batch writes and root-summary rebuild need every path to end in ".gz".
+    const bool all_gzip =
+        std::all_of(files_needing_index.begin(), files_needing_index.end(),
+                    [](const FileWorkItem& item) {
+                        return item.file_path.ends_with(".gz");
+                    });
 
-    auto default_checkpoint_size_str =
-        std::to_string(Indexer::DEFAULT_CHECKPOINT_SIZE) + " B (" +
-        std::to_string(Indexer::DEFAULT_CHECKPOINT_SIZE / (1024 * 1024)) +
-        " MB)";
+    {
+        auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+        batch_config->file_paths.reserve(files_needing_index.size());
+        for (const auto& item : files_needing_index) {
+            batch_config->file_paths.push_back(item.file_path);
+        }
+        batch_config->index_dir = index_dir;
+        batch_config->checkpoint_size = checkpoint_size;
+        batch_config->parallelism = executor_threads;
+        batch_config->use_batch_write = all_gzip;
+        batch_config->rebuild_root_summaries = all_gzip;
+
+        auto batch_result = co_await IndexBatchBuilderUtility::process(
+            &ctx, std::move(batch_config));
+        // Failures are logged only; resolution below still runs for all files.
+        for (const auto& result : batch_result.results) {
+            if (!result.success && !result.error_message.empty()) {
+                DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s",
+                                         result.file_path.c_str(),
+                                         result.error_message.c_str());
+            }
+        }
+    }
 
-    argparse::ArgumentParser program("dftracer_info",
-                                     DFTRACER_UTILS_PACKAGE_VERSION);
-    program.add_description(
-        "Display metadata and index information for DFTracer compressed files "
-        "using composable utilities and pipeline processing");
+    std::vector<std::string> newly_indexed;
+    newly_indexed.reserve(files_needing_index.size());
+    for (const auto& item : files_needing_index) {
+        newly_indexed.push_back(item.file_path);
+    }
 
-    program.add_argument("--files")
-        .help("Compressed files to inspect (GZIP, TAR.GZ)")
-        .nargs(argparse::nargs_pattern::any)
-        .default_value<std::vector<std::string>>({});
-
-    program.add_argument("-d", "--directory")
-        .help("Directory containing files to inspect")
-        .default_value<std::string>("");
-
-    program.add_argument("--query")
-        .help(
-            "Query type: summary (aggregate all files, default) or "
-            "detailed (per-file output)")
-        .default_value<std::string>("summary");
-
-    program.add_argument("-v", "--verbose")
-        .help("Show detailed information including index details")
-        .flag();
-
-    program.add_argument("-f", "--force-rebuild")
-        .help("Force rebuild index files")
-        .flag();
-
-    program.add_argument("-c", "--checkpoint-size")
-        .help("Checkpoint size for indexing in bytes (default: " +
-              default_checkpoint_size_str + ")")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(Indexer::DEFAULT_CHECKPOINT_SIZE));
-
-    program.add_argument("--index-dir")
-        .help("Directory to store index files (default: system temp directory)")
-        .default_value<std::string>("");
-
-    program.add_argument("--executor-threads")
-        .help(
-            "Number of executor threads for parallel processing (default: "
-            "number of CPU cores)")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what());
-        std::cerr << program;
-        return 1;
+    IndexResolverUtility resolver;
+    ResolverInput refresh_input;
+    refresh_input.files = std::move(newly_indexed);
+    refresh_input.index_dir = index_dir;
+    refresh_input.require_checkpoints = true;
+
+    auto refresh_result = co_await resolver.process(refresh_input);
+    // Publish only when something resolved; otherwise the map is left as-is.
+    if (!refresh_result.cached.empty()) {
+        indexed_groups[refresh_result.index_path] =
+            std::move(refresh_result.cached);
     }
+}
 
-    // Parse arguments
-    std::string directory = program.get<std::string>("--directory");
-    std::string query_type = program.get<std::string>("--query");
-    bool verbose = program.get<bool>("--verbose");
-    bool force_rebuild = program.get<bool>("--force-rebuild");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
+static coro::CoroTask<int> run_info(CoroScope& ctx, const InfoArgParse* cli) {
+    const auto& directory = cli->directory.value;
+    const auto& query_type = cli->query_type;
+    const auto verbose = cli->verbose;
+    const auto force_rebuild = cli->force_rebuild;
+    const auto checkpoint_size = cli->indexing.checkpoint_size;
+    const auto& index_dir = cli->indexing.index_dir;
+    const auto executor_threads = cli->pipeline.executor_threads;
+    const bool summary_mode = (query_type != "detailed");
 
-    bool summary_mode = (query_type != "detailed");
+    Timer stages_storage("dftracer_info");
+    Timer* stages = cli->pipeline.time_profiling ? &stages_storage : nullptr;
+    Timer overall(true);
 
-    // Collect files to process
     std::vector<std::string> files;
-    if (!directory.empty()) {
-        if (!fs::exists(directory)) {
-            DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s",
-                                     directory.c_str());
-            return 1;
-        }
+    std::vector<FileWorkItem> files_needing_index;
+    std::unordered_map<std::string, std::vector<ResolvedFile>> indexed_groups;
 
-        for (const auto& entry : fs::directory_iterator(directory)) {
-            if (entry.is_regular_file()) {
-                std::string path = entry.path().string();
-                std::string ext = entry.path().extension().string();
-                if (ext == ".gz") {
-                    files.push_back(path);
-                }
+    {
+        ScopedTimer _t(stages, "collect_and_classify");
+
+        if (!directory.empty()) {
+            if (!fs::exists(directory)) {
+                DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s",
+                                         directory.c_str());
+                co_return 1;
             }
-        }
 
-        if (files.empty()) {
-            DFTRACER_UTILS_LOG_ERROR(
-                "No compressed files found in directory: %s",
-                directory.c_str());
-            return 1;
-        }
-    } else {
-        files = program.get<std::vector<std::string>>("--files");
-
-        if (files.empty()) {
-            DFTRACER_UTILS_LOG_ERROR(
-                "%s", "No files or directory specified. Use --help for usage.");
-            std::cerr << program;
-            return 1;
+            auto trusted_index_path =
+                internal::determine_index_path(directory, index_dir);
+            if (!force_rebuild && fs::exists(trusted_index_path)) {
+                if (summary_mode) {
+                    ScopedTimer _rt(stages, "root_summary_read");
+                    auto root_result =
+                        co_await load_root_info_summary(trusted_index_path);
+
+                    if (stages) stages->print_stages();
+
+                    std::printf("==========================================\n");
+                    std::printf("DFTracer File Info Summary\n");
+                    std::printf("==========================================\n");
+                    std::printf("  Total Files:        %zu\n",
+                                root_result->file_count);
+                    std::printf("  Successful:         %zu\n",
+                                root_result->file_count);
+                    std::printf("  Failed:             0\n");
+                    std::printf("  Total Lines:        %llu\n",
+                                (unsigned long long)root_result->total_lines);
+                    std::printf("  Valid Events:       %llu\n",
+                                (unsigned long long)root_result->total_events);
+                    std::printf(
+                        "  Total Uncompressed: %s (%llu bytes)\n",
+                        format_size(root_result->total_uncompressed).c_str(),
+                        (unsigned long long)root_result->total_uncompressed);
+                    if (root_result->total_events > 0) {
+                        std::printf(
+                            "  Avg Bytes/Event:    %.2f bytes\n",
+                            static_cast<double>(
+                                root_result->total_uncompressed) /
+                                static_cast<double>(root_result->total_events));
+                    }
+                    std::printf("  Processing Time:    %.2f ms\n",
+                                static_cast<double>(overall.elapsed()) / 1e6);
+                    std::printf("==========================================\n");
+                    co_return 0;
+                }
+
+                {
+                    ScopedTimer _lr(stages, "load_registry");
+                    auto registry_ptr =
+                        co_await load_file_registry(trusted_index_path);
+
+                    files.reserve(registry_ptr->size());
+                    auto& group = indexed_groups[trusted_index_path];
+                    group.reserve(registry_ptr->size());
+                    std::size_t fi = 0;
+                    for (auto& [logical_path, reg] : *registry_ptr) {
+                        files.push_back(logical_path);
+                        if (has_capability(
+                                reg.capabilities,
+                                IndexFileEntryCapability::FILE_SUMMARY)) {
+                            group.push_back(ResolvedFile{fi, logical_path,
+                                                         reg.file_id,
+                                                         reg.capabilities});
+                        }
+                        ++fi;
+                    }
+                }
+            } else {
+                ScopedTimer _ds(stages, "scan_and_resolve");
+                IndexResolverUtility resolver;
+                auto input = std::make_unique<ResolverInput>();
+                input->directory = directory;
+                input->index_dir = index_dir;
+                auto result = co_await resolver.process(*input);
+                files = std::move(result.all_files);
+                if (!result.cached.empty()) {
+                    indexed_groups[result.index_path] =
+                        std::move(result.cached);
+                }
+                files_needing_index = std::move(result.needs_checkpoint);
+            }
+        } else {
+            ScopedTimer _rs(stages, "resolve_index_state");
+            IndexResolverUtility resolver;
+            auto input = std::make_unique<ResolverInput>();
+            input->files = cli->files_args.value;
+            input->index_dir = index_dir;
+            auto result = co_await resolver.process(*input);
+            files = std::move(result.all_files);
+            if (!result.cached.empty()) {
+                indexed_groups[result.index_path] = std::move(result.cached);
+            }
+            files_needing_index = std::move(result.needs_checkpoint);
         }
     }
 
-    // Small files skip indexing to avoid creating `.dftindex` stores on
-    // metadata-sensitive filesystems (e.g. Lustre).
-    static constexpr std::size_t INDEX_SIZE_THRESHOLD =
-        constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;
-    std::unordered_set<std::string> small_files;
-    for (const auto& file_path : files) {
-        std::error_code ec;
-        auto fsize = fs::file_size(file_path, ec);
-        if (!ec && fsize > 0 && fsize < INDEX_SIZE_THRESHOLD) {
-            small_files.insert(file_path);
-        }
+    if (files.empty()) {
+        DFTRACER_UTILS_LOG_ERROR("%s", "No files found. Use --help for usage.");
+        co_return 1;
     }
 
-    auto start_time = std::chrono::high_resolution_clock::now();
+    std::vector<MetadataCollectorUtilityOutput> all_results;
+    std::mutex results_mutex;
 
-    if (summary_mode) {
-        // Summary: accumulate totals in workers, print once at the end.
-        // No per-file storage, no sort, no per-file print.
-        std::atomic<std::uint64_t> total_compressed{0};
-        std::atomic<std::uint64_t> total_uncompressed{0};
-        std::atomic<std::uint64_t> total_lines{0};
-        std::atomic<std::uint64_t> total_valid_events{0};
-        std::atomic<std::size_t> successful{0};
-        std::atomic<std::size_t> failed{0};
-
-        {
-            auto pipeline_config = PipelineConfig()
-                                       .with_name("DFTracer File Info")
-                                       .with_compute_threads(executor_threads)
-                                       .with_watchdog(false);
-
-            Pipeline pipeline(pipeline_config);
-
-            auto info_task = make_task(
-                [&](CoroScope& ctx) -> coro::CoroTask<void> {
-                    co_await ctx.scope([&](CoroScope& scope)
-                                           -> coro::CoroTask<void> {
-                        auto* files_ptr = &files;
-                        auto* total_compressed_ptr = &total_compressed;
-                        auto* total_uncompressed_ptr = &total_uncompressed;
-                        auto* total_lines_ptr = &total_lines;
-                        auto* total_valid_events_ptr = &total_valid_events;
-                        auto* successful_ptr = &successful;
-                        auto* failed_ptr = &failed;
-
-                        auto file_chan = coro::make_channel<std::size_t>(
-                            executor_threads * 2);
-
-                        scope.spawn(
-                            [ch = file_chan->producer(), files_ptr](
-                                CoroScope&) mutable -> coro::CoroTask<void> {
-                                auto guard = ch.guard();
-                                for (std::size_t i = 0; i < files_ptr->size();
-                                     ++i) {
-                                    if (!co_await ch.send(i)) {
-                                        co_return;
-                                    }
-                                }
-                                co_return;
-                            });
-
-                        for (std::size_t w = 0; w < executor_threads; ++w) {
-                            scope.spawn(
-                                [file_chan, files_ptr, total_compressed_ptr,
-                                 total_uncompressed_ptr, total_lines_ptr,
-                                 total_valid_events_ptr, successful_ptr,
-                                 failed_ptr](
-                                    CoroScope&) -> coro::CoroTask<void> {
-                                    while (auto fi_opt =
-                                               co_await file_chan->receive()) {
-                                        std::size_t fi = *fi_opt;
-                                        const auto& fp = (*files_ptr)[fi];
-
-                                        // Phase 1: build index if
-                                        // needed (skips small files
-                                        // and already-indexed files)
-                                        IndexBuilderUtility builder;
-                                        auto build_config =
-                                            IndexBuildConfig::for_file(fp)
-                                                .with_force_rebuild(false);
-                                        co_await builder.process(build_config);
-
-                                        // Phase 2: read from index,
-                                        // fall back to direct scan
-                                        // for small/unindexed files
-                                        auto info = index_based_info(fp);
-                                        if (!info.success) {
-                                            info =
-                                                co_await direct_scan_info(fp);
-                                        }
-
-                                        if (info.success) {
-                                            total_compressed_ptr->fetch_add(
-                                                info.compressed_size,
-                                                std::memory_order_relaxed);
-                                            total_uncompressed_ptr->fetch_add(
-                                                info.uncompressed_size,
-                                                std::memory_order_relaxed);
-                                            total_lines_ptr->fetch_add(
-                                                info.num_lines,
-                                                std::memory_order_relaxed);
-                                            total_valid_events_ptr->fetch_add(
-                                                info.valid_events,
-                                                std::memory_order_relaxed);
-                                            successful_ptr->fetch_add(
-                                                1, std::memory_order_relaxed);
-                                        } else {
-                                            failed_ptr->fetch_add(
-                                                1, std::memory_order_relaxed);
-                                        }
-                                    }
-                                    co_return;
-                                });
+    if (!indexed_groups.empty()) {
+        ScopedTimer _t(stages, "index_batch_read");
+        co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask<void> {
+            auto* all_results_ptr = &all_results;
+            auto* mutex_ptr = &results_mutex;
+            for (auto& [ip, group] : indexed_groups) {
+                auto idx_path_ptr = std::make_shared<std::string>(ip);
+                auto entries_ptr = std::make_shared<std::vector<ResolvedFile>>(
+                    std::move(group));
+                scope.spawn(
+                    [idx_path_ptr, entries_ptr, all_results_ptr,
+                     mutex_ptr](CoroScope&) mutable -> coro::CoroTask<void> {
+                        auto infos = co_await process_index_group_info(
+                            std::move(idx_path_ptr), std::move(entries_ptr));
+                        std::lock_guard<std::mutex> lock(*mutex_ptr);
+                        for (auto& info : infos) {
+                            all_results_ptr->push_back(std::move(info));
                         }
-                        co_return;
                     });
-                    co_return;
-                },
-                "CollectInfo");
+            }
+            co_return;
+        });
+    }
 
-            pipeline.set_source(info_task);
-            pipeline.set_destination(info_task);
-            pipeline.execute();
+    if (!files_needing_index.empty()) {
+        ScopedTimer _t(stages, "auto_index_and_build");
+        co_await auto_index_and_resolve(ctx, files_needing_index, index_dir,
+                                        checkpoint_size, executor_threads,
+                                        indexed_groups);
+
+        if (!indexed_groups.empty()) {
+            ScopedTimer _t2(stages, "newly_indexed_batch_read");
+            co_await ctx.scope([&](CoroScope& scope) -> coro::CoroTask<void> {
+                auto* all_results_ptr = &all_results;
+                auto* mutex_ptr = &results_mutex;
+                for (auto& [ip, group] : indexed_groups) {
+                    auto idx_path_ptr = std::make_shared<std::string>(ip);
+                    auto entries_ptr =
+                        std::make_shared<std::vector<ResolvedFile>>(
+                            std::move(group));
+                    scope.spawn(
+                        [idx_path_ptr, entries_ptr, all_results_ptr, mutex_ptr](
+                            CoroScope&) mutable -> coro::CoroTask<void> {
+                            auto infos = co_await process_index_group_info(
+                                std::move(idx_path_ptr),
+                                std::move(entries_ptr));
+                            std::lock_guard<std::mutex> lock(*mutex_ptr);
+                            for (auto& info : infos) {
+                                all_results_ptr->push_back(std::move(info));
+                            }
+                        });
+                }
+                co_return;
+            });
         }
+    }
 
-        auto end_time = std::chrono::high_resolution_clock::now();
-        std::chrono::duration<double, std::milli> duration =
-            end_time - start_time;
-
-        auto tc = total_compressed.load();
-        auto tu = total_uncompressed.load();
-        auto tl = total_lines.load();
-        auto tv = total_valid_events.load();
-        auto ok = successful.load();
-        auto bad = failed.load();
+    if (stages) stages->print_stages();
+    if (summary_mode) {
+        std::uint64_t total_uncompressed = 0;
+        std::uint64_t total_lines = 0;
+        std::uint64_t total_events = 0;
+        std::size_t ok = 0;
+        std::size_t bad = 0;
+
+        for (const auto& r : all_results) {
+            if (r.success) {
+                total_uncompressed += r.uncompressed_size;
+                total_lines += r.num_lines;
+                total_events += r.valid_events;
+                ok++;
+            } else {
+                bad++;
+            }
+        }
 
         std::printf("==========================================\n");
         std::printf("DFTracer File Info Summary\n");
@@ -547,174 +556,85 @@ int main(int argc, char** argv) {
         std::printf("  Total Files:        %zu\n", files.size());
         std::printf("  Successful:         %zu\n", ok);
         std::printf("  Failed:             %zu\n", bad);
-        std::printf("  Total Lines:        %llu\n", (unsigned long long)tl);
-        std::printf("  Valid Events:       %llu\n", (unsigned long long)tv);
-        std::printf("  Total Compressed:   %s (%llu bytes)\n",
-                    format_size(tc).c_str(), (unsigned long long)tc);
+        std::printf("  Total Lines:        %llu\n",
+                    (unsigned long long)total_lines);
+        std::printf("  Valid Events:       %llu\n",
+                    (unsigned long long)total_events);
         std::printf("  Total Uncompressed: %s (%llu bytes)\n",
-                    format_size(tu).c_str(), (unsigned long long)tu);
+                    format_size(total_uncompressed).c_str(),
+                    (unsigned long long)total_uncompressed);
 
-        if (tc > 0 && tu > 0 && tc != tu) {
-            double ratio = 100.0 * (1.0 - static_cast<double>(tc) /
-                                              static_cast<double>(tu));
-            std::printf("  Compression:        %.2f%%\n", ratio);
-        }
-
-        if (tv > 0) {
+        if (total_events > 0) {
             std::printf("  Avg Bytes/Event:    %.2f bytes\n",
-                        static_cast<double>(tu) / static_cast<double>(tv));
+                        static_cast<double>(total_uncompressed) /
+                            static_cast<double>(total_events));
         }
 
-        std::printf("  Processing Time:    %.2f seconds\n",
-                    duration.count() / 1000.0);
+        std::printf("  Processing Time:    %.2f ms\n",
+                    static_cast<double>(overall.elapsed()) / 1e6);
         std::printf("==========================================\n");
 
-        return (bad == 0) ? 0 : 1;
+        co_return (bad == 0) ? 0 : 1;
     }
 
-    // Detailed mode: per-file output (original behavior)
-    struct IndexedResult {
-        std::size_t index;
-        MetadataCollectorUtilityOutput info;
-    };
-
-    std::vector<IndexedResult> results;
-    std::mutex results_mutex;
-
-    {
-        auto pipeline_config = PipelineConfig()
-                                   .with_name("DFTracer File Info")
-                                   .with_compute_threads(executor_threads)
-                                   .with_watchdog(false);
-
-        Pipeline pipeline(pipeline_config);
-
-        auto info_task = make_task(
-            [&](CoroScope& ctx) -> coro::CoroTask<void> {
-                co_await ctx.scope([&](CoroScope& scope)
-                                       -> coro::CoroTask<void> {
-                    auto* files_ptr = &files;
-                    auto* results_ptr = &results;
-                    auto* results_mutex_ptr = &results_mutex;
-
-                    auto small_set =
-                        std::make_shared<std::unordered_set<std::string>>(
-                            small_files);
-
-                    auto file_chan =
-                        coro::make_channel<std::size_t>(executor_threads * 2);
-
-                    scope.spawn(
-                        [ch = file_chan->producer(), files_ptr](
-                            CoroScope&) mutable -> coro::CoroTask<void> {
-                            auto guard = ch.guard();
-                            for (std::size_t i = 0; i < files_ptr->size();
-                                 ++i) {
-                                if (!co_await ch.send(i)) {
-                                    co_return;
-                                }
-                            }
-                            co_return;
-                        });
-
-                    for (std::size_t w = 0; w < executor_threads; ++w) {
-                        scope.spawn([file_chan, files_ptr, checkpoint_size,
-                                     force_rebuild, verbose, index_dir,
-                                     small_set, results_ptr, results_mutex_ptr](
-                                        CoroScope&) -> coro::CoroTask<void> {
-                            while (auto fi_opt =
-                                       co_await file_chan->receive()) {
-                                std::size_t fi = *fi_opt;
-                                const auto& file_path = (*files_ptr)[fi];
-                                bool is_small = small_set->count(file_path) > 0;
-
-                                MetadataCollectorUtilityOutput info;
-                                if (is_small) {
-                                    info = co_await direct_scan_info(file_path);
-                                } else {
-                                    auto input =
-                                        MetadataCollectorUtilityInput::
-                                            from_file(file_path)
-                                                .with_checkpoint_size(
-                                                    checkpoint_size)
-                                                .with_force_rebuild(
-                                                    force_rebuild)
-                                                .with_compute_hash(verbose);
-
-                                    if (!index_dir.empty()) {
-                                        input.with_index(
-                                            internal::determine_index_path(
-                                                file_path, index_dir));
-                                    }
-
-                                    MetadataCollectorUtility collector;
-                                    info = co_await collector.process(input);
-                                }
-
-                                std::lock_guard<std::mutex> lock(
-                                    *results_mutex_ptr);
-                                results_ptr->push_back({fi, std::move(info)});
-                            }
-                            co_return;
-                        });
-                    }
-                    co_return;
-                });
-                co_return;
-            },
-            "CollectInfo");
-
-        pipeline.set_source(info_task);
-        pipeline.set_destination(info_task);
-        pipeline.execute();
+    // Count successes while printing so the exit status can reflect failures.
+    std::size_t ok = 0;
+    for (const auto& r : all_results) {
+        print_file_info(r, verbose);
+        if (r.success) ok++;
     }
 
-    std::sort(results.begin(), results.end(),
-              [](const IndexedResult& a, const IndexedResult& b) {
-                  return a.index < b.index;
-              });
-
-    std::uint64_t total_compressed = 0;
-    std::uint64_t total_uncompressed = 0;
-    std::uint64_t total_lines = 0;
-    std::size_t successful = 0;
-
-    for (const auto& r : results) {
-        print_file_info(r.info, verbose);
-        if (r.info.success) {
-            successful++;
-            total_compressed += r.info.compressed_size;
-            total_uncompressed += r.info.uncompressed_size;
-            total_lines += r.info.num_lines;
+    if (files.size() > 1) {
+        std::uint64_t total_uncompressed = 0;
+        std::uint64_t total_lines = 0;
+        for (const auto& r : all_results) {
+            if (r.success) {
+                total_uncompressed += r.uncompressed_size;
+                total_lines += r.num_lines;
+            }
         }
-    }
-
-    auto end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double, std::milli> duration = end_time - start_time;
 
-    if (files.size() > 1) {
         std::printf("==========================================\n");
         std::printf("Summary\n");
         std::printf("==========================================\n");
         std::printf("Total Files: %zu\n", files.size());
-        std::printf("Successful: %zu\n", successful);
-        std::printf("Failed: %zu\n", files.size() - successful);
+        std::printf("Successful: %zu\n", ok);
+        std::printf("Failed: %zu\n", files.size() - ok);
         std::printf("Total Lines: %llu\n", (unsigned long long)total_lines);
-        std::printf("Total Compressed: %s\n",
-                    format_size(total_compressed).c_str());
         std::printf("Total Uncompressed: %s\n",
                     format_size(total_uncompressed).c_str());
+        std::printf("Processing Time: %.2f ms\n",
+                    static_cast<double>(overall.elapsed()) / 1e6);
+    }
 
-        if (total_uncompressed > 0) {
-            double ratio =
-                100.0 * (1.0 - static_cast<double>(total_compressed) /
-                                   static_cast<double>(total_uncompressed));
-            std::printf("Overall Compression: %.2f%%\n", ratio);
-        }
+    co_return (ok == files.size()) ? 0 : 1;  // non-zero if any file failed
+}
 
-        std::printf("Processing Time: %.2f seconds\n",
-                    duration.count() / 1000.0);
-    }
+int main(int argc, char** argv) {
+    DFTRACER_UTILS_LOGGER_INIT();
+    // Program name, version and description handed to argparse.
+    argparse::ArgumentParser program("dftracer_info",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "Display metadata and index information for DFTracer compressed files "
+        "using composable utilities and pipeline processing");
+    // Exit with status 1 when InfoArgParse::parse() reports failure.
+    InfoArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
+    // Pipeline settings come from the parsed cli.pipeline arguments.
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer Info", cli.pipeline);
+    Pipeline pipeline(pipeline_config);
+    // run_info() runs as a single task; its int result is the exit status.
+    auto info_task = make_task(
+        [&cli](CoroScope& ctx) -> coro::CoroTask<int> {
+            co_return co_await run_info(ctx, &cli);
+        },
+        "InfoMain");
 
-    return (successful == files.size()) ? 0 : 1;
+    pipeline.set_source(info_task);  // the one task is source and destination
+    pipeline.set_destination(info_task);
+    pipeline.execute();
+    return info_task->get<int>();
 }
diff --git a/src/dftracer/utils/binaries/dftracer_merge.cpp b/src/dftracer/utils/binaries/dftracer_merge.cpp
index 50264115..5924a5f1 100644
--- a/src/dftracer/utils/binaries/dftracer_merge.cpp
+++ b/src/dftracer/utils/binaries/dftracer_merge.cpp
@@ -1,19 +1,96 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/utilities/utilities.h>
 
-#include <argparse/argparse.hpp>
 #include <chrono>
 
+#include "common_cli.h"
+
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities::composites;
 
+class MergeArgParse : public cli::ArgParse {  // CLI options of dftracer_merge
+   public:
+    cli::DirectoryArgs directory{cli::DirMode::DEFAULT_DOT,
+                                 "Directory containing .pfw or .pfw.gz files"};
+    cli::PipelineArgs pipeline;
+    cli::WatchdogArgs watchdog;
+    // Option values; post_parse() overwrites each one from the parser.
+    bool force = false;
+    std::string output;
+    bool compress = false;
+    bool verbose = false;
+    bool gzip_only = false;
+    bool verify = false;
+    std::size_t channel_capacity = 100;
+    std::size_t batch_size_kb = 256;  // in KB, per the --batch-size help text
+    // Passes the shared argument groups to schema().
+    explicit MergeArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        schema(directory, pipeline, watchdog);
+    }
+
+   protected:
+    void register_args() override {  // declares the merge-specific options
+        parser()
+            .add_argument("-f", "--force")
+            .help("Override existing output file and force index recreation")
+            .flag();
+
+        parser()
+            .add_argument("-o", "--output")
+            .help("Output file path (should have .pfw extension)")
+            .default_value<std::string>("combined.pfw");
+
+        parser()
+            .add_argument("-c", "--compress")
+            .help("Compress output file with gzip")
+            .flag();
+
+        parser()
+            .add_argument("-v", "--verbose")
+            .help("Enable verbose mode")
+            .flag();
+
+        parser()
+            .add_argument("-g", "--gzip-only")
+            .help("Process only .pfw.gz files")
+            .flag();
+
+        parser()
+            .add_argument("--verify")
+            .help("Verify merged output by comparing input/output hashes")
+            .flag();
+
+        parser()
+            .add_argument("--channel-capacity")
+            .help("Channel buffer capacity for batch streaming (default: 100)")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(100));
+
+        parser()
+            .add_argument("--batch-size")
+            .help("Batch byte budget in KB (default: 256)")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(256));
+    }
+    // Copies every merge-specific option from the parser into the members.
+    void post_parse() override {
+        force = parser().get<bool>("--force");
+        output = parser().get<std::string>("--output");
+        compress = parser().get<bool>("--compress");
+        verbose = parser().get<bool>("--verbose");
+        gzip_only = parser().get<bool>("--gzip-only");
+        verify = parser().get<bool>("--verify");
+        channel_capacity = parser().get<std::size_t>("--channel-capacity");
+        batch_size_kb = parser().get<std::size_t>("--batch-size");
+    }
+};
+
+static int run_merge(const MergeArgParse& cli);
+
 int main(int argc, char** argv) {
     DFTRACER_UTILS_LOGGER_INIT();
 
@@ -23,117 +100,24 @@ int main(int argc, char** argv) {
         "Merge DFTracer .pfw or .pfw.gz files into a single JSON array file "
         "using streaming producer-consumer pattern");
 
-    program.add_argument("-d", "--directory")
-        .help("Directory containing .pfw or .pfw.gz files")
-        .default_value<std::string>(".");
-
-    program.add_argument("-o", "--output")
-        .help("Output file path (should have .pfw extension)")
-        .default_value<std::string>("combined.pfw");
-
-    program.add_argument("-f", "--force")
-        .help("Override existing output file and force index recreation")
-        .flag();
-
-    program.add_argument("-c", "--compress")
-        .help("Compress output file with gzip")
-        .flag();
-
-    program.add_argument("-v", "--verbose").help("Enable verbose mode").flag();
-
-    program.add_argument("-g", "--gzip-only")
-        .help("Process only .pfw.gz files")
-        .flag();
-
-    program.add_argument("--executor-threads")
-        .help(
-            "Number of executor threads for parallel processing (default: "
-            "number of CPU cores)")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    program.add_argument("--verify")
-        .help("Verify merged output by comparing input/output hashes")
-        .flag();
-
-    program.add_argument("--channel-capacity")
-        .help("Channel buffer capacity for batch streaming (default: 100)")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(100));
-
-    program.add_argument("--batch-size")
-        .help("Batch byte budget in KB (default: 256)")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(256));
-
-    program.add_argument("--disable-watchdog")
-        .help("Disable watchdog for hang detection")
-        .flag();
-
-    program.add_argument("--watchdog-global-timeout")
-        .help(
-            "Watchdog global timeout for pipeline execution in seconds (0 = no "
-            "timeout)")
-        .scan<'d', int>()
-        .default_value(0);
-
-    program.add_argument("--watchdog-task-timeout")
-        .help("Watchdog default task timeout in seconds (0 = no timeout)")
-        .scan<'d', int>()
-        .default_value(0);
-
-    program.add_argument("--watchdog-interval")
-        .help("Watchdog check interval in seconds")
-        .scan<'d', int>()
-        .default_value(1);
-
-    program.add_argument("--watchdog-warning-threshold")
-        .help("Watchdog long-running task warning threshold in seconds")
-        .scan<'d', int>()
-        .default_value(300);
-
-    program.add_argument("--watchdog-idle-timeout")
-        .help("Watchdog idle timeout in seconds (0 = use default)")
-        .scan<'d', int>()
-        .default_value(300);
-
-    program.add_argument("--watchdog-deadlock-timeout")
-        .help("Watchdog deadlock timeout in seconds (0 = use default)")
-        .scan<'d', int>()
-        .default_value(600);
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what());
-        std::cerr << program << std::endl;
-        return 1;
-    }
+    MergeArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
-    std::string input_dir = program.get<std::string>("--directory");
-    std::string output_file = program.get<std::string>("--output");
-    bool force_override = program.get<bool>("--force");
-    bool compress_output = program.get<bool>("--compress");
-    [[maybe_unused]] bool verbose = program.get<bool>("--verbose");
-    bool gzip_only = program.get<bool>("--gzip-only");
-    bool verify = program.get<bool>("--verify");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    std::size_t channel_capacity =
-        program.get<std::size_t>("--channel-capacity");
-    std::size_t batch_size_kb = program.get<std::size_t>("--batch-size");
-    std::size_t batch_byte_budget = batch_size_kb * 1024;
-    bool disable_watchdog = program.get<bool>("--disable-watchdog");
-    int global_timeout = program.get<int>("--watchdog-global-timeout");
-    int task_timeout = program.get<int>("--watchdog-task-timeout");
-    int watchdog_interval = program.get<int>("--watchdog-interval");
-    int warning_threshold = program.get<int>("--watchdog-warning-threshold");
-    int idle_timeout = program.get<int>("--watchdog-idle-timeout");
-    int deadlock_timeout = program.get<int>("--watchdog-deadlock-timeout");
+    return run_merge(cli);
+}
 
-    input_dir = fs::absolute(input_dir).string();
-    output_file = fs::absolute(output_file).string();
+static int run_merge(const MergeArgParse& cli) {
+    const auto input_dir = fs::absolute(cli.directory.value).string();
+    const auto output_file = fs::absolute(cli.output).string();
+    const auto force_override = cli.force;
+    const auto compress_output = cli.compress;
+    [[maybe_unused]] const auto verbose = cli.verbose;
+    const auto gzip_only = cli.gzip_only;
+    const auto verify = cli.verify;
+    const auto channel_capacity = cli.channel_capacity;
+    const auto batch_size_kb = cli.batch_size_kb;
+    std::size_t batch_byte_budget = batch_size_kb * 1024;
 
     if (output_file.size() < 4 ||
         output_file.substr(output_file.size() - 4) != ".pfw") {
@@ -194,16 +178,12 @@ int main(int argc, char** argv) {
     std::printf("  Verify: %s\n", verify ? "true" : "false");
     std::printf("  Channel capacity: %zu\n", channel_capacity);
     std::printf("  Batch size: %zu KB\n", batch_size_kb);
-    std::printf("  Executor threads: %zu\n", executor_threads);
+    std::printf("  Executor threads: %zu\n", cli.pipeline.executor_threads);
     std::printf("==========================================\n\n");
 
     auto start_time = std::chrono::high_resolution_clock::now();
 
-    // Step 1: Create channel and buffer pool for streaming batches
     auto channel = coro::make_channel<StreamingMergeBatch>(channel_capacity);
-    // Pool size = channel capacity + num producers, so producers never block
-    // waiting for buffers (avoids deadlock when all executor threads are
-    // producers and the consumer can't run to release buffers).
     std::size_t pool_size = channel_capacity + input_files.size();
     auto buf_pool =
         make_buffer_pool<std::string>(pool_size, [batch_byte_budget]() {
@@ -216,23 +196,10 @@ int main(int argc, char** argv) {
     producer_results.resize(input_files.size());
     StreamingFileConsumerOutput consumer_result;
 
-    // Step 2: Create pipeline
-    auto pipeline_config =
-        PipelineConfig()
-            .with_name("DFTracer Merge")
-            .with_compute_threads(executor_threads)
-            .with_watchdog(!disable_watchdog)
-            .with_global_timeout(std::chrono::seconds(global_timeout))
-            .with_task_timeout(std::chrono::seconds(task_timeout))
-            .with_watchdog_interval(std::chrono::seconds(watchdog_interval))
-            .with_warning_threshold(std::chrono::seconds(warning_threshold))
-            .with_executor_idle_timeout(std::chrono::seconds(idle_timeout))
-            .with_executor_deadlock_timeout(
-                std::chrono::seconds(deadlock_timeout));
-
+    auto pipeline_config = cli::build_pipeline_config(
+        "DFTracer Merge", cli.pipeline, cli.watchdog);
     Pipeline pipeline(pipeline_config);
 
-    // Step 3: Create producer tasks
     std::vector<std::shared_ptr<Task>> producer_tasks;
     for (std::size_t i = 0; i < input_files.size(); ++i) {
         auto* input_files_ptr = &input_files;
@@ -259,7 +226,6 @@ int main(int argc, char** argv) {
         producer_tasks.push_back(producer_task);
     }
 
-    // Step 4: Create consumer task
     auto* consumer_result_ptr = &consumer_result;
     auto consumer_task = make_task(
         [channel, buf_pool, output_file, compress_output,
@@ -275,7 +241,6 @@ int main(int argc, char** argv) {
         },
         "Consumer");
 
-    // Step 5: Execute pipeline
     std::vector<std::shared_ptr<Task>> all_tasks;
     all_tasks.insert(all_tasks.end(), producer_tasks.begin(),
                      producer_tasks.end());
diff --git a/src/dftracer/utils/binaries/dftracer_organize.cpp b/src/dftracer/utils/binaries/dftracer_organize.cpp
index 1a1f9259..8d78e4cc 100644
--- a/src/dftracer/utils/binaries/dftracer_organize.cpp
+++ b/src/dftracer/utils/binaries/dftracer_organize.cpp
@@ -1,49 +1,612 @@
+#include <concurrentqueue.h>
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/platform_compat.h>
+#include <dftracer/utils/core/common/memory_budget.h>
+#include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
-#include <dftracer/utils/utilities/composites/dft/reorganize/event_router.h>
+#include <dftracer/utils/core/utils/timer.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h>
 #include <dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h>
+#include <dftracer/utils/utilities/fileio/parallel/layout.h>
 #include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
-#include <dftracer/utils/utilities/indexer/internal/helpers.h>
-#include <dftracer/utils/utilities/indexer/internal/indexer.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
-#include <argparse/argparse.hpp>
+#include <algorithm>
 #include <atomic>
-#include <chrono>
-#include <cstdio>
+#include <fstream>
+#include <mutex>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
+#include "common_cli.h"
+
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::composites;
 using namespace dftracer::utils::utilities::composites::dft;
+using namespace dftracer::utils::utilities::composites::dft::indexing;
 using namespace dftracer::utils::utilities::composites::dft::reorganize;
+using namespace dftracer::utils::utilities::composites::dft::aggregators;
 using namespace dftracer::utils::utilities::indexer;
 
+class OrganizeArgParse : public cli::ArgParse {  // dftracer_organize CLI
+   public:
+    cli::DirectoryArgs directory{cli::DirMode::DEFAULT_EMPTY};
+    cli::FilesArgs files_args;
+    cli::PipelineArgs pipeline;
+    cli::IndexingArgs indexing;
+
+    std::string output_dir;
+    std::vector<std::string> group_specs;
+    std::size_t chunk_size_mb = 0;  // 0 = auto (one file/group on Lustre)
+    bool no_compress = false;
+    int compression_level = 1;
+    bool with_aggregation = false;
+    double time_interval_ms = 5000.0;
+    std::size_t memory_budget_mb = 0;         // 0 = auto-detect
+    std::size_t estimated_file_bytes_mb = 0;  // 0 = auto from input sizes
+
+    explicit OrganizeArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.force_help = "Force rebuild of indices";
+        schema(directory, files_args, pipeline, indexing);
+    }
+
+   protected:
+    void register_args() override {  // declares the organize-only options
+        parser()
+            .add_argument("-o", "--output")
+            .help("Output directory")
+            .required();
+
+        parser()
+            .add_argument("--groups")
+            .help(
+                "Predicate groups: \"io:cat==\\\"POSIX\\\"\" "
+                "\"compute:cat==\\\"APP\\\"\"")
+            .nargs(argparse::nargs_pattern::at_least_one)
+            .required();
+
+        parser()
+            .add_argument("--chunk-size")
+            .help(
+                "Target chunk size in MB. 0 = auto: one file per group on "
+                "Lustre, 256 MB rotation elsewhere (default: 0)")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(0));
+
+        parser()
+            .add_argument("--no-compress")
+            .help("Write plain .pfw instead of .pfw.gz")
+            .flag();
+
+        parser()
+            .add_argument("--compression-level")
+            .help("Gzip compression level (0-9, default: 1)")
+            .scan<'d', int>()
+            .default_value(1);
+
+        parser()
+            .add_argument("--with-aggregation")
+            .help(
+                "Build aggregation index on organized chunks so downstream "
+                "analyzers skip the first-read aggregation cost")
+            .flag();
+
+        parser()
+            .add_argument("--time-interval-ms")
+            .help(
+                "Aggregation bucket size in ms (used with --with-aggregation, "
+                "default: 5000)")
+            .scan<'g', double>()
+            .default_value(5000.0);
+
+        parser()
+            .add_argument("--memory-budget-mb")
+            .help(  // help text is printed verbatim: no printf-style "%%"
+                "Peak memory budget in MB for input file indexing. 0 = auto "
+                "(50% of detected available memory). Bounds peak RSS on "
+                "large workloads by processing input files in batches.")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(0));
+
+        parser()
+            .add_argument("--estimated-file-bytes-mb")
+            .help(
+                "Per-file peak memory estimate in MB for index build. "
+                "0 = auto (sample input file sizes and apply "
+                "gzip/JSON expansion factor). Combined with "
+                "--memory-budget-mb to derive flush_every_files.")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(0));
+    }
+
+    void post_parse() override {  // copies parsed values into the fields
+        output_dir = parser().get<std::string>("--output");
+        group_specs = parser().get<std::vector<std::string>>("--groups");
+        chunk_size_mb = parser().get<std::size_t>("--chunk-size");
+        no_compress = parser().get<bool>("--no-compress");
+        compression_level = parser().get<int>("--compression-level");
+        with_aggregation = parser().get<bool>("--with-aggregation");
+        time_interval_ms = parser().get<double>("--time-interval-ms");
+        memory_budget_mb = parser().get<std::size_t>("--memory-budget-mb");
+        estimated_file_bytes_mb =
+            parser().get<std::size_t>("--estimated-file-bytes-mb");
+    }
+};
+
 namespace {
 
-coro::CoroTask<int> run_organize(const std::string& output_dir,
-                                 const std::string& index_dir,
-                                 const std::vector<std::string>& files,
-                                 const std::vector<PredicateGroup>& groups,
-                                 std::size_t checkpoint_size,
-                                 bool force_rebuild, bool no_compress,
-                                 std::size_t executor_threads,
-                                 std::size_t chunk_size_mb) {
+// Output chunk path -> gzip member spans captured while writing it.
+using ChunkLayoutMap = std::unordered_map<
+    std::string, std::vector<fileio::parallel::ParallelWriter::MemberSpan>>;
+
+struct OrganizeResult {
+    std::size_t total_events_written = 0;
+    std::size_t total_events_unmatched = 0;
+    std::size_t chunks_created = 0;
+    std::size_t source_files_processed = 0;
+    std::vector<std::string> output_files;
+    /// Per-chunk-file gzip-member layout, captured during Phase 3 by the
+    /// striped writer so Phase 4 indexing can slice without re-scanning.
+    ChunkLayoutMap chunk_layouts;
+    std::unordered_set<std::string> inline_indexed_groups;
+    bool success = false;
+};
+
+struct GroupRuntime {
+    std::string name;
+    std::string group_index_dir;
+    std::string staging_root;
+    std::shared_ptr<
+        moodycamel::ConcurrentQueue<IndexDatabaseSstWriterContext::Artifacts>>
+        artifacts_queue;
+    std::shared_ptr<std::atomic<std::size_t>> batch_counter;
+    std::atomic<bool> indexed_inline{false};  // set by run_group_writer_task
+};
+
+static coro::CoroTask<void> run_group_writer_task(
+    CoroScope* inner_scope, GroupWriterConfig writer_config,
+    std::atomic<std::size_t>* total_events_ptr,
+    std::atomic<std::size_t>* chunks_ptr,
+    std::vector<std::string>* output_files_ptr, std::mutex* output_mutex_ptr,
+    ChunkLayoutMap* chunk_layouts_ptr, GroupRuntime* runtime_ptr) {
+    auto writer_result = co_await run_group_writer(inner_scope, writer_config);
+
+    if (writer_result.success) {
+        total_events_ptr->fetch_add(writer_result.events_written);
+        chunks_ptr->fetch_add(writer_result.chunks_created);
+        if (runtime_ptr) {
+            runtime_ptr->indexed_inline.store(writer_result.indexed_inline,
+                                              std::memory_order_release);
+        }
+
+        std::lock_guard<std::mutex> lock(*output_mutex_ptr);
+        for (const auto& f : writer_result.output_files) {
+            output_files_ptr->push_back(f);
+        }
+        if (chunk_layouts_ptr) {
+            for (auto& cl : writer_result.chunk_layouts) {
+                (*chunk_layouts_ptr)[cl.path] = std::move(cl.members);
+            }
+        }
+    } else {
+        DFTRACER_UTILS_LOG_ERROR("GroupWriter failed for %s: %s",
+                                 writer_config.group_name.c_str(),
+                                 writer_result.error_message.c_str());
+    }
+}
+
+static coro::CoroTask<void> run_group_indexing(
+    CoroScope* scope, const std::string& group_output_dir,
+    const std::vector<std::string>& chunk_files,
+    const AggregationConfig* agg_config, std::size_t checkpoint_size,
+    std::size_t parallelism, std::size_t flush_every_files,
+    const ChunkLayoutMap* writer_layouts) {
+    if (chunk_files.empty()) co_return;
+
+    const std::string index_path =
+        dft::internal::determine_index_path(group_output_dir);
+    fs::create_directories(index_path);
+
+    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> agg_db;
+    std::unique_ptr<EventAggregator> merger;
+    if (agg_config) {
+        agg_db = EventAggregator::open_with_merge_operator(index_path);
+        merger = std::make_unique<EventAggregator>(agg_db, /*config_hash=*/0u);
+    }
+
+    std::vector<int> chunk_file_ids;
+    {
+        IndexDatabase coord_db(index_path);
+        coord_db.init_schema();
+        chunk_file_ids =
+            coord_db.register_files(chunk_files, /*build_manifest=*/true);
+    }
+
+    // Per-chunk-file gzip member layout. Prefer writer-captured layout (no
+    // I/O, exact); fall back to a post-write scan for chunks the writer
+    // didn't track (sharded/padded layouts return empty spans).
+    auto member_map = std::make_shared<std::vector<std::vector<
+        dftracer::utils::utilities::indexer::internal::GzipMember>>>(
+        chunk_files.size());
+    std::size_t scanned = 0;
+    std::size_t from_writer = 0;
+    for (std::size_t fi = 0; fi < chunk_files.size(); ++fi) {
+        if (writer_layouts) {
+            auto it = writer_layouts->find(chunk_files[fi]);
+            if (it != writer_layouts->end() && !it->second.empty()) {
+                auto& dst = (*member_map)[fi];
+                dst.reserve(it->second.size());
+                for (const auto& span : it->second) {
+                    dst.push_back({span.offset, span.length});
+                }
+                ++from_writer;
+                continue;
+            }
+        }
+        int fd = ::open(chunk_files[fi].c_str(), O_RDONLY);
+        if (fd < 0) continue;
+        struct stat st;
+        if (::fstat(fd, &st) == 0 && st.st_size >= 18) {
+            co_await dftracer::utils::utilities::indexer::internal::
+                enumerate_gzip_member_candidates(
+                    fd, static_cast<std::uint64_t>(st.st_size),
+                    (*member_map)[fi]);
+        }
+        ::close(fd);
+        ++scanned;
+    }
+    DFTRACER_UTILS_LOG_INFO(
+        "Phase 4 group '%s': layouts from writer=%zu, rescanned=%zu",
+        group_output_dir.c_str(), from_writer, scanned);
+
+    // Build per-file slices targeting ~total_c / parallelism bytes each.
+    std::vector<std::string> sliced_file_paths;
+    std::vector<int> sliced_file_ids;
+    std::vector<IndexBuildBatchConfig::FileSlice> sliced_slices;
+    for (std::size_t fi = 0; fi < chunk_files.size(); ++fi) {
+        const auto& members = (*member_map)[fi];
+        if (members.size() <= 1) {
+            sliced_file_paths.push_back(chunk_files[fi]);
+            sliced_file_ids.push_back(chunk_file_ids[fi]);
+            sliced_slices.push_back({});
+            continue;
+        }
+        std::uint64_t total_c = 0;
+        for (const auto& m : members) total_c += m.c_size;
+        const std::size_t target_units =
+            std::max<std::size_t>(parallelism, std::size_t(1));
+        const std::uint64_t target_c =
+            (total_c + target_units - 1) / target_units;
+        std::size_t begin = 0;
+        std::uint64_t accum = 0;
+        bool first_slice_for_file = true;
+        for (std::size_t i = 0; i < members.size(); ++i) {
+            accum += members[i].c_size;
+            const bool is_last = (i + 1 == members.size());
+            if ((target_c > 0 && accum >= target_c) || is_last) {
+                IndexBuildBatchConfig::FileSlice s;
+                s.members = &(*member_map)[fi];
+                s.member_begin = begin;
+                s.member_end = i + 1;
+                constexpr std::uint64_t CKPT_STRIDE = 1u << 20;
+                s.checkpoint_idx_base =
+                    static_cast<std::uint64_t>(begin) * CKPT_STRIDE;
+                // Only the first slice persists file-scoped data
+                // (chunk_bloom/file_bloom/manifest/file_metadata). Subsequent
+                // slices contribute aggregation/system_metrics SSTs only.
+                s.skip_file_scoped_writes = !first_slice_for_file;
+                first_slice_for_file = false;
+
+                sliced_file_paths.push_back(chunk_files[fi]);
+                sliced_file_ids.push_back(chunk_file_ids[fi]);
+                sliced_slices.push_back(s);
+                begin = i + 1;
+                accum = 0;
+            }
+        }
+    }
+    DFTRACER_UTILS_LOG_INFO(
+        "Phase 4 group indexing: %zu chunk files -> %zu slices "
+        "(parallelism=%zu)",
+        chunk_files.size(), sliced_file_paths.size(), parallelism);
+
+    const std::string staging_root =
+        (fs::path(index_path) / ".dftindex_staging").string();
+    fs::create_directories(staging_root);
+    auto artifacts_queue = std::make_shared<moodycamel::ConcurrentQueue<
+        IndexDatabaseSstWriterContext::Artifacts>>();
+    auto batch_counter = std::make_shared<std::atomic<std::size_t>>(0);
+
+    auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+    batch_config->file_paths = std::move(sliced_file_paths);
+    batch_config->preassigned_file_ids = std::move(sliced_file_ids);
+    batch_config->file_slices = std::move(sliced_slices);
+    batch_config->index_dir = group_output_dir;
+    batch_config->checkpoint_size = checkpoint_size;
+    batch_config->parallelism = parallelism;
+    batch_config->force_rebuild = false;
+    batch_config->build_manifest = true;
+    batch_config->use_batch_write = true;
+    batch_config->flush_every_files = flush_every_files;
+    batch_config->sink_factory =
+        [staging_root, batch_counter]() -> std::unique_ptr<IndexBatchSink> {
+        const std::size_t idx =
+            batch_counter->fetch_add(1, std::memory_order_relaxed);
+        return std::make_unique<IndexDatabaseSstWriterContext>(
+            staging_root, "batch_" + std::to_string(idx));
+    };
+    batch_config->sink_commit = [artifacts_queue](IndexBatchSink& sink) {
+        auto& sst = static_cast<IndexDatabaseSstWriterContext&>(sink);
+        auto a = sst.commit();
+        if (!a.empty()) artifacts_queue->enqueue(std::move(a));
+    };
+
+    if (agg_config) {
+        auto agg_config_ptr = std::make_shared<AggregationConfig>(*agg_config);
+        batch_config->dft_visitor_factory =
+            [agg_db, agg_config_ptr](const std::string& file_path)
+            -> std::vector<std::unique_ptr<composites::dft::DftEventVisitor>> {
+            std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>
+                visitors;
+            visitors.push_back(std::make_unique<AggregationVisitor>(
+                agg_db, /*config_hash=*/0u, *agg_config_ptr, file_path));
+            return visitors;
+        };
+        auto* merger_ptr = merger.get();
+        batch_config->extra_visitors_drain =
+            [merger_ptr](std::vector<std::vector<
+                             std::unique_ptr<composites::dft::DftEventVisitor>>>
+                             per_file) {
+                for (auto& file_visitors : per_file) {
+                    for (auto& visitor : file_visitors) {
+                        auto* agg_visitor =
+                            dynamic_cast<AggregationVisitor*>(visitor.get());
+                        if (!agg_visitor) continue;
+                        for (const auto& k : agg_visitor->observed_extra_keys())
+                            merger_ptr->add_observed_extra_key(k);
+                        for (const auto& m :
+                             agg_visitor->observed_custom_metrics())
+                            merger_ptr->add_observed_custom_metric(m);
+                        merger_ptr->merge_chunk(agg_visitor->take_output());
+                    }
+                }
+            };
+    }
+
+    auto batch_result = co_await IndexBatchBuilderUtility::process(
+        scope, std::move(batch_config));
+
+    {
+        SstArtifactRegistry registry;
+        IndexDatabaseSstWriterContext::Artifacts a;
+        while (artifacts_queue->try_dequeue(a)) {
+            registry.append(std::move(a));
+        }
+        IndexDatabase ingest_db(index_path);
+        ingest_db.bulk_ingest(registry, {});
+        std::error_code ec;
+        fs::remove_all(staging_root, ec);
+    }
+
+    if (!agg_config) co_return;
+
+    namespace rcf = dftracer::utils::rocksdb::cf;
+    IndexDatabase idx_db(
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+
+    auto batch = agg_db->begin_batch();
+    AggGlobalConfig global_cfg;
+    global_cfg.time_interval_us = agg_config->time_interval_us;
+    global_cfg.config_hash = 0;
+    agg_db->put(batch, rcf::AGGREGATION,
+                std::string_view(AGG_GLOBAL_CONFIG_KEY, 2),
+                serialize_agg_global_config(global_cfg));
+    for (const auto& chunk_path : chunk_files) {
+        int file_id = idx_db.find_file(chunk_path);
+        if (file_id >= 0) {
+            auto key = make_agg_file_key(file_id);
+            agg_db->put(batch, rcf::AGGREGATION, key, "");
+        }
+    }
+    agg_db->commit_batch(batch);
+}
+
+static coro::CoroTask<void> run_manifest_extractor_task(
+    ManifestExtractorConfig extractor_config) {
+    auto extract_result = co_await extract_from_manifest(extractor_config);
+    if (!extract_result.success) {
+        DFTRACER_UTILS_LOG_WARN("ManifestExtractor failed for %s: %s",
+                                extractor_config.file_path.c_str(),
+                                extract_result.error_message.c_str());
+    }
+}
+
+struct ProducerScopeInput {
+    ResolverResult resolver_result;
+    std::vector<FileWorkItem> files_needing_index;
+    std::vector<ResolvedFile>
+        manifest_entries;  // Files with manifest for extraction
+    std::uint64_t checkpoint_size;
+    std::size_t executor_threads;
+    bool force_rebuild;
+    std::size_t flush_every_files;  // per-flush sub-batch inside indexer
+    std::vector<PredicateGroup> groups;
+    std::vector<std::shared_ptr<coro::Channel<std::shared_ptr<LineBatch>>>>
+        group_channels;
+    std::unordered_map<std::string, std::size_t> file_index_map;
+};
+
+static coro::CoroTask<void> run_producer_scope(CoroScope* producer_scope,
+                                               ProducerScopeInput* input) {
+    if (!input->files_needing_index.empty()) {
+        const std::size_t total_files = input->files_needing_index.size();
+        const std::size_t flush_every =
+            std::max(input->flush_every_files, std::size_t(1));
+        std::printf(
+            "  Processing %zu files needing index (flush_every=%zu)...\n",
+            total_files, flush_every);
+
+        auto factory = [input](const std::string& file_path)
+            -> std::vector<std::unique_ptr<DftEventVisitor>> {
+            std::size_t file_idx = 0;
+            if (auto it = input->file_index_map.find(file_path);
+                it != input->file_index_map.end()) {
+                file_idx = it->second;
+            }
+
+            OrganizeVisitorConfig visitor_config;
+            visitor_config.groups = input->groups;
+            visitor_config.group_channels = input->group_channels;
+            visitor_config.source_file_idx = file_idx;
+
+            std::vector<std::unique_ptr<DftEventVisitor>> visitors;
+            visitors.push_back(
+                std::make_unique<OrganizeVisitor>(std::move(visitor_config)));
+            return visitors;
+        };
+
+        auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+        batch_config->file_paths.reserve(total_files);
+        for (const auto& item : input->files_needing_index) {
+            batch_config->file_paths.push_back(item.file_path);
+        }
+        const std::string& input_index_path = input->resolver_result.index_path;
+
+        std::vector<int> preassigned_file_ids;
+        {
+            IndexDatabase coord_db(input_index_path);
+            coord_db.init_schema();
+            preassigned_file_ids = coord_db.register_files(
+                batch_config->file_paths, /*build_manifest=*/true);
+        }
+        const std::string staging_root =
+            (fs::path(input_index_path) / ".dftindex_staging").string();
+        fs::create_directories(staging_root);
+        auto artifacts_queue = std::make_shared<moodycamel::ConcurrentQueue<
+            IndexDatabaseSstWriterContext::Artifacts>>();
+        auto batch_counter = std::make_shared<std::atomic<std::size_t>>(0);
+
+        batch_config->preassigned_file_ids = std::move(preassigned_file_ids);
+        batch_config->index_dir = input_index_path;
+        batch_config->checkpoint_size = input->checkpoint_size;
+        batch_config->parallelism = input->executor_threads;
+        batch_config->force_rebuild = input->force_rebuild;
+        batch_config->build_manifest = true;
+        batch_config->use_batch_write = true;
+        batch_config->rebuild_root_summaries = false;
+        batch_config->flush_every_files = flush_every;
+        batch_config->dft_visitor_factory = factory;
+        batch_config->sink_factory =
+            [staging_root, batch_counter]() -> std::unique_ptr<IndexBatchSink> {
+            const std::size_t idx =
+                batch_counter->fetch_add(1, std::memory_order_relaxed);
+            return std::make_unique<IndexDatabaseSstWriterContext>(
+                staging_root, "batch_" + std::to_string(idx));
+        };
+        batch_config->sink_commit = [artifacts_queue](IndexBatchSink& sink) {
+            auto& sst = static_cast<IndexDatabaseSstWriterContext&>(sink);
+            auto a = sst.commit();
+            if (!a.empty()) artifacts_queue->enqueue(std::move(a));
+        };
+
+        co_await IndexBatchBuilderUtility::process(producer_scope,
+                                                   batch_config);
+
+        SstArtifactRegistry registry;
+        {
+            IndexDatabaseSstWriterContext::Artifacts a;
+            while (artifacts_queue->try_dequeue(a)) {
+                registry.append(std::move(a));
+            }
+        }
+        {
+            IndexDatabase ingest_db(input_index_path);
+            ingest_db.bulk_ingest(registry, {});
+            ingest_db.rebuild_root_summaries();
+        }
+        std::error_code ec;
+        fs::remove_all(staging_root, ec);
+    }
+
+    if (!input->manifest_entries.empty()) {
+        std::printf("  Processing %zu files via manifest extraction...\n",
+                    input->manifest_entries.size());
+
+        const auto& index_path = input->resolver_result.index_path;
+        for (const auto& entry : input->manifest_entries) {
+            ManifestExtractorConfig extractor_config;
+            extractor_config.file_path = entry.file_path;
+            extractor_config.index_path = index_path;
+            extractor_config.source_file_idx = entry.file_index;
+            extractor_config.groups = input->groups;
+            extractor_config.group_channels = input->group_channels;
+
+            producer_scope->spawn(
+                [extractor_config](CoroScope&) -> coro::CoroTask<void> {
+                    co_await run_manifest_extractor_task(extractor_config);
+                });
+        }
+    }
+}
+
+coro::CoroTask<int> run_organize(const OrganizeArgParse* cli) {
+    const auto& output_dir = cli->output_dir;
+    const auto& index_dir = cli->indexing.index_dir;
+    const auto checkpoint_size = cli->indexing.checkpoint_size;
+    const auto force_rebuild = cli->indexing.force;
+    const auto no_compress = cli->no_compress;
+    const auto compression_level = cli->compression_level;
+    const auto executor_threads = cli->pipeline.executor_threads;
+    const bool time_profiling = cli->pipeline.time_profiling;
+
+    auto groups = parse_group_specs(cli->group_specs);
+    if (groups.empty()) {
+        DFTRACER_UTILS_LOG_ERROR("%s", "No groups specified.");
+        co_return 1;
+    }
+
+    fs::create_directories(output_dir);
+
+    std::size_t chunk_size_mb = cli->chunk_size_mb;
+    if (chunk_size_mb == 0) {
+        auto layout = fileio::parallel::detect_layout(output_dir);
+        if (layout.fs == fileio::parallel::FilesystemKind::LUSTRE) {
+            chunk_size_mb = 0;  // single file per group, no rotation
+        } else {
+            chunk_size_mb = 256;
+        }
+    }
+
     std::printf("==========================================\n");
-    std::printf("DFTracer Trace Reorganizer\n");
+    std::printf("DFTracer Trace Reorganizer (Streaming)\n");
     std::printf("==========================================\n");
-    std::printf("  Input files: %zu\n", files.size());
     std::printf("  Output directory: %s\n", output_dir.c_str());
-    std::printf("  Chunk size: %zu MB\n", chunk_size_mb);
+    if (chunk_size_mb == 0) {
+        std::printf("  Chunk size: auto (one file per group)\n");
+    } else {
+        std::printf("  Chunk size: %zu MB\n", chunk_size_mb);
+    }
     std::printf("  Compress: %s\n", no_compress ? "false" : "true");
     std::printf("  Executor threads: %zu\n", executor_threads);
     std::printf("  Groups: %zu\n", groups.size());
@@ -53,203 +616,444 @@ coro::CoroTask<int> run_organize(const std::string& output_dir,
     }
     std::printf("==========================================\n\n");
 
-    auto start_time = std::chrono::high_resolution_clock::now();
+    Timer stages_storage("dftracer_organize");
+    Timer* stages = time_profiling ? &stages_storage : nullptr;
+    Timer overall(true);
 
-    // Step 1: Build indices
-    std::printf("Step 1: Building indices...\n");
-    {
-        auto pipeline_config = PipelineConfig()
-                                   .with_name("Organize: Build IDX")
-                                   .with_compute_threads(executor_threads)
-                                   .with_watchdog(false);
-
-        Pipeline pipeline(pipeline_config);
-
-        std::atomic<std::size_t> built_count{0};
-        std::atomic<std::size_t> skipped_count{0};
-
-        auto build_task = make_task(
-            [&](CoroScope& ctx) -> coro::CoroTask<void> {
-                co_await ctx.scope([&](CoroScope& scope)
-                                       -> coro::CoroTask<void> {
-                    auto* built_ptr = &built_count;
-                    auto* skipped_ptr = &skipped_count;
-                    for (std::size_t i = 0; i < files.size(); ++i) {
-                        const auto file_path = files[i];
-                        scope.spawn([file_path, index_dir, checkpoint_size,
-                                     force_rebuild, built_ptr, skipped_ptr](
-                                        CoroScope&) -> coro::CoroTask<void> {
-                            auto config =
-                                IndexBuildConfig::for_file(file_path)
-                                    .with_index_dir(index_dir)
-                                    .with_checkpoint_size(checkpoint_size)
-                                    .with_force_rebuild(force_rebuild)
-                                    .with_manifest(true)
-                                    .with_index_threshold(0);
-
-                            IndexBuilderUtility builder;
-                            auto result = co_await builder.process(config);
-
-                            if (result.was_skipped) {
-                                (*skipped_ptr)++;
-                            } else if (result.success) {
-                                (*built_ptr)++;
-                            } else {
-                                DFTRACER_UTILS_LOG_ERROR(
-                                    "IDX build failed for %s: %s",
-                                    file_path.c_str(),
-                                    result.error_message.c_str());
-                            }
-                            co_return;
-                        });
-                    }
-                    co_return;
-                });
+    OrganizeResult result;
+
+    auto pipeline_config =
+        cli::build_pipeline_config("Organize: Streaming", cli->pipeline);
+
+    Pipeline pipeline(pipeline_config);
+
+    auto* cli_ptr = cli;
+    auto* groups_ptr = &groups;
+    auto* result_ptr = &result;
+
+    auto organize_task = make_task(
+        [cli_ptr, groups_ptr, result_ptr, output_dir, index_dir,
+         checkpoint_size, force_rebuild, no_compress, compression_level,
+         executor_threads, chunk_size_mb,
+         stages](CoroScope& ctx) -> coro::CoroTask<void> {
+            // Phase 1: Scan & Partition
+            DFTRACER_UTILS_LOG_INFO("%s", "Phase 1 begin: scan & partition");
+            std::printf("Phase 1: Scanning and partitioning files...\n");
+
+            IndexResolverUtility resolver;
+            ResolverInput resolver_input;
+            ResolverResult resolve_result;
+            {
+                ScopedTimer _t(stages, "phase1_scan_partition");
+                resolver_input.directory = cli_ptr->directory.value;
+                resolver_input.files = cli_ptr->files_args.value;
+                resolver_input.index_dir = index_dir;
+                resolver_input.require_manifest = true;
+
+                resolve_result = co_await resolver.process(resolver_input);
+            }
+
+            if (resolve_result.all_files.empty()) {
+                DFTRACER_UTILS_LOG_ERROR(
+                    "%s", "No input files. Use --files or --directory.");
                 co_return;
-            },
-            "BuildIDX");
+            }
 
-        pipeline.set_source(build_task);
-        pipeline.set_destination(build_task);
-        pipeline.execute();
+            std::printf("  Total files: %zu\n",
+                        resolve_result.all_files.size());
+            std::printf("  Files needing index: %zu\n",
+                        resolve_result.needs_checkpoint.size() +
+                            resolve_result.needs_manifest.size());
+            std::printf("  Already indexed: %zu\n",
+                        resolve_result.cached.size());
+            DFTRACER_UTILS_LOG_INFO(
+                "Phase 1 complete: %zu files (%zu need index, %zu cached)",
+                resolve_result.all_files.size(),
+                resolve_result.needs_checkpoint.size() +
+                    resolve_result.needs_manifest.size(),
+                resolve_result.cached.size());
 
-        std::printf("  Built: %zu, Skipped: %zu\n", built_count.load(),
-                    skipped_count.load());
-    }
+            // Build file index map for source_file_idx lookup
+            std::unordered_map<std::string, std::size_t> file_index_map;
+            for (std::size_t i = 0; i < resolve_result.all_files.size(); ++i) {
+                file_index_map[resolve_result.all_files[i]] = i;
+            }
 
-    // Step 2: Build extraction plan
-    std::printf("Step 2: Building extraction plan...\n");
-    ReorganizationPlannerUtility planner;
-    ReorganizationPlannerInput planner_input;
-    planner_input.source_files = files;
-    planner_input.groups = groups;
-    planner_input.index_dir = index_dir;
-    planner_input.checkpoint_size = checkpoint_size;
-
-    ExtractionPlan plan;
-    try {
-        plan = co_await planner.process(planner_input);
-    } catch (const std::exception& e) {
-        DFTRACER_UTILS_LOG_ERROR("Planning failed: %s", e.what());
-        co_return 1;
-    }
+            // Build source file info for provenance tracking
+            std::vector<SourceFileInfo> source_files;
+            source_files.reserve(resolve_result.all_files.size());
+            for (std::size_t i = 0; i < resolve_result.all_files.size(); ++i) {
+                source_files.push_back(SourceFileInfo{
+                    .file_path = resolve_result.all_files[i],
+                    .index_path = resolve_result.index_path,
+                    .num_checkpoints = 0,
+                });
+            }
 
-    std::printf("  Groups: %zu\n", plan.groups.size());
-    std::printf("  Source files: %zu\n", plan.source_files.size());
-    std::printf("  Extraction tasks: %zu\n", plan.tasks.size());
-    std::printf("  Total events: %zu\n", plan.total_events);
+            // Phase 2: Set up per-group channels and shared writer state
+            DFTRACER_UTILS_LOG_INFO("%s",
+                                    "Phase 2 begin: setup streaming pipeline");
+            std::printf("Phase 2: Starting streaming pipeline...\n");
 
-    if (plan.tasks.empty()) {
-        std::printf("No events to extract.\n");
-        co_return 0;
-    }
+            std::vector<
+                std::shared_ptr<coro::Channel<std::shared_ptr<LineBatch>>>>
+                group_channels;
+            std::atomic<std::size_t> total_events_written{0};
+            std::atomic<std::size_t> chunks_created{0};
+            std::vector<std::string> all_output_files;
+            std::mutex output_files_mutex;
+            // chunk_layouts lives on result_ptr (not a local) so Phase 4,
+            // a separate task, can read it. Each group writer writes distinct
+            // chunk paths, so its inserts are key-disjoint; they happen under
+            // output_files_mutex, which is already taken for output_files.
 
-    // Step 3: Route events (parallel)
-    std::printf("Step 3: Routing events...\n");
-    EventRouterResult router_result;
-    {
-        auto pipeline_config = PipelineConfig()
-                                   .with_name("Organize: Route Events")
-                                   .with_compute_threads(executor_threads)
-                                   .with_watchdog(false);
-
-        Pipeline pipeline(pipeline_config);
-
-        EventRouterConfig router_config;
-        router_config.plan = std::move(plan);
-        router_config.output_dir = output_dir;
-        router_config.index_dir = index_dir;
-        router_config.chunk_size_bytes = chunk_size_mb * 1024 * 1024;
-        router_config.checkpoint_size = checkpoint_size;
-        router_config.executor_threads = executor_threads;
-        router_config.compress = !no_compress;
-
-        auto* router_config_ptr = &router_config;
-        auto* router_result_ptr = &router_result;
-
-        auto route_task = make_task(
-            [router_config_ptr,
-             router_result_ptr](CoroScope& scope) -> coro::CoroTask<void> {
-                *router_result_ptr =
-                    co_await route_events(scope, *router_config_ptr);
-            },
-            "RouteEvents");
-
-        pipeline.set_source(route_task);
-        pipeline.set_destination(route_task);
-        pipeline.execute();
-    }
+            {
+                ScopedTimer _t(stages, "phase2_setup_channels");
+                group_channels.reserve(groups_ptr->size());
+
+                for (std::size_t i = 0; i < groups_ptr->size(); ++i) {
+                    group_channels.push_back(
+                        std::make_shared<
+                            coro::Channel<std::shared_ptr<LineBatch>>>(
+                            executor_threads * 4));
+                }
+            }
 
-    std::printf("  Events written: %zu\n", router_result.total_events_written);
-    std::printf("  Chunks created: %zu\n", router_result.chunks_created);
-    std::printf("  Source files processed: %zu\n",
-                router_result.source_files_processed);
+            auto* total_events_ptr = &total_events_written;
+            auto* chunks_ptr = &chunks_created;
+            auto* output_files_ptr = &all_output_files;
+            auto* output_mutex_ptr = &output_files_mutex;
+            auto* chunk_layouts_ptr = &result_ptr->chunk_layouts;
+            const auto* source_files_ptr = &source_files;
 
-    // Step 4: Build `.dftindex` stores for output chunk files.
-    if (!router_result.output_files.empty()) {
-        std::printf("Step 4: Building .dftindex stores...\n");
-        auto pipeline_config = PipelineConfig()
-                                   .with_name("Organize: Build Index Stores")
-                                   .with_compute_threads(executor_threads)
-                                   .with_watchdog(false);
+            DFTRACER_UTILS_LOG_INFO("%s", "Phase 2 complete");
+            DFTRACER_UTILS_LOG_INFO("%s",
+                                    "Phase 3 begin: producers + group writers");
+            std::vector<std::unique_ptr<GroupRuntime>> group_runtimes;
+            group_runtimes.reserve(groups_ptr->size());
+            for (const auto& group : *groups_ptr) {
+                auto rt = std::make_unique<GroupRuntime>();
+                rt->name = group.name;
+                const std::string group_output_dir =
+                    output_dir + "/" + group.name;
+                fs::create_directories(group_output_dir);
+                rt->group_index_dir =
+                    dft::internal::determine_index_path(group_output_dir);
+                fs::create_directories(rt->group_index_dir);
+                rt->staging_root =
+                    (fs::path(rt->group_index_dir) / ".dftindex_staging")
+                        .string();
+                fs::create_directories(rt->staging_root);
+                rt->artifacts_queue =
+                    std::make_shared<moodycamel::ConcurrentQueue<
+                        IndexDatabaseSstWriterContext::Artifacts>>();
+                rt->batch_counter =
+                    std::make_shared<std::atomic<std::size_t>>(0);
+                group_runtimes.push_back(std::move(rt));
+            }
 
-        Pipeline pipeline(pipeline_config);
+            for (std::size_t g = 0; g < groups_ptr->size(); ++g) {
+                const auto& group = (*groups_ptr)[g];
+                auto channel = group_channels[g];
+                GroupRuntime* runtime = group_runtimes[g].get();
 
-        auto* output_files_ptr = &router_result.output_files;
+                GroupWriterConfig writer_config;
+                writer_config.group_name = group.name;
+                writer_config.group_query = group.query;
+                writer_config.output_dir = output_dir;
+                writer_config.chunk_size_bytes = chunk_size_mb * 1024 * 1024;
+                writer_config.compress = !no_compress;
+                writer_config.compression_level = compression_level;
+                writer_config.input_channel = channel;
+                writer_config.source_files = source_files_ptr;
+                writer_config.build_output_index = true;
+                writer_config.index_dir = runtime->group_index_dir;
+                writer_config.staging_root = runtime->staging_root;
+                writer_config.artifacts_queue = runtime->artifacts_queue;
+                writer_config.batch_counter = runtime->batch_counter;
+                writer_config.with_aggregation = cli_ptr->with_aggregation;
+                writer_config.agg_time_interval_us =
+                    cli_ptr->time_interval_ms * 1000.0;
+                writer_config.bloom_dimensions = std::vector<std::string>(
+                    indexer::DEFAULT_BLOOM_DIMENSIONS.begin(),
+                    indexer::DEFAULT_BLOOM_DIMENSIONS.end());
+                writer_config.bloom_config.build_manifest = true;
 
-        auto index_store_task = make_task(
-            [output_files_ptr, output_dir,
-             checkpoint_size](CoroScope& ctx) -> coro::CoroTask<void> {
+                ctx.spawn(
+                    [writer_config, total_events_ptr, chunks_ptr,
+                     output_files_ptr, output_mutex_ptr, chunk_layouts_ptr,
+                     runtime](CoroScope& inner_scope) -> coro::CoroTask<void> {
+                        co_await run_group_writer_task(
+                            &inner_scope, writer_config, total_events_ptr,
+                            chunks_ptr, output_files_ptr, output_mutex_ptr,
+                            chunk_layouts_ptr, runtime);
+                    });
+            }
+
+            // Phase 3b & 3c: Run producers in a nested scope
+            // When this scope completes, all producers have finished
+            const auto total_source_files = resolve_result.all_files.size();
+            const std::size_t memory_budget = compute_memory_budget(
+                cli_ptr->memory_budget_mb * 1024ULL * 1024ULL);
+            const std::size_t per_file_bytes = estimate_per_file_bytes(
+                resolve_result.all_file_sizes,
+                cli_ptr->estimated_file_bytes_mb * 1024ULL * 1024ULL);
+            const std::size_t phase3_flush_every =
+                compute_file_batch_size(memory_budget, per_file_bytes, 4);
+            {
+                ScopedTimer _t(stages, "phase3_producers");
+                auto producer_input = std::make_shared<ProducerScopeInput>();
+                producer_input->resolver_result = std::move(resolve_result);
+                producer_input->files_needing_index =
+                    std::move(producer_input->resolver_result.needs_checkpoint);
+                for (auto& item :
+                     producer_input->resolver_result.needs_manifest) {
+                    producer_input->files_needing_index.push_back(
+                        std::move(item));
+                }
+                producer_input->manifest_entries =
+                    std::move(producer_input->resolver_result.cached);
+                producer_input->checkpoint_size = checkpoint_size;
+                producer_input->executor_threads = executor_threads;
+                producer_input->force_rebuild = force_rebuild;
+                producer_input->flush_every_files = phase3_flush_every;
+                std::printf(
+                    "  Memory budget: %.2f GB; per-file peak estimate: %.2f "
+                    "GB; flush_every: %zu files\n",
+                    memory_budget / (1024.0 * 1024.0 * 1024.0),
+                    per_file_bytes / (1024.0 * 1024.0 * 1024.0),
+                    phase3_flush_every);
+                producer_input->groups = *groups_ptr;
+                producer_input->group_channels = group_channels;
+                producer_input->file_index_map = std::move(file_index_map);
+
+                // GCC 12 coroutine bug: capturing a shared_ptr by value in a
+                // coroutine lambda corrupts its refcount. Use a raw pointer;
+                // producer_input stays alive until the co_await below returns.
+                auto* producer_input_raw = producer_input.get();
                 co_await ctx.scope(
-                    [&](CoroScope& scope) -> coro::CoroTask<void> {
-                        for (const auto& out_file : *output_files_ptr) {
-                            scope.spawn([out_file, checkpoint_size](CoroScope&)
-                                            -> coro::CoroTask<void> {
-                                auto config =
-                                    IndexBuildConfig::for_file(out_file)
-                                        .with_index_dir("")
-                                        .with_checkpoint_size(checkpoint_size)
-                                        .with_force_rebuild(true)
-                                        .with_manifest(true)
-                                        .with_index_threshold(0);
-
-                                IndexBuilderUtility builder;
-                                co_await builder.process(config);
-                                co_return;
-                            });
-                        }
-                        co_return;
+                    [producer_input_raw](
+                        CoroScope& producer_scope) -> coro::CoroTask<void> {
+                        co_await run_producer_scope(&producer_scope,
+                                                    producer_input_raw);
                     });
-                co_return;
-            },
-            "BuildSidecars");
+            }
+
+            // Producers done - close channels to signal EOF to writers
+            for (auto& channel : group_channels) {
+                channel->close();
+            }
+            DFTRACER_UTILS_LOG_INFO("%s",
+                                    "Phase 3 producers complete; waiting for "
+                                    "group writers to drain");
+
+            // Wait for all writers to complete
+            co_await ctx.join_all();
+
+            for (auto& rt : group_runtimes) {
+                if (!rt->indexed_inline.load(std::memory_order_acquire)) {
+                    continue;
+                }
+                SstArtifactRegistry registry;
+                IndexDatabaseSstWriterContext::Artifacts a;
+                while (rt->artifacts_queue->try_dequeue(a)) {
+                    registry.append(std::move(a));
+                }
+                IndexDatabase ingest_db(rt->group_index_dir);
+                ingest_db.bulk_ingest(registry, {});
+                std::error_code ec;
+                fs::remove_all(rt->staging_root, ec);
+                result_ptr->inline_indexed_groups.insert(rt->name);
+            }
+
+            result_ptr->total_events_written = total_events_written.load();
+            result_ptr->chunks_created = chunks_created.load();
+            result_ptr->source_files_processed = total_source_files;
+            result_ptr->output_files = std::move(all_output_files);
+
+            result_ptr->success = true;
+            DFTRACER_UTILS_LOG_INFO(
+                "Phase 3 complete: %zu chunks created, %zu events written",
+                result_ptr->chunks_created, result_ptr->total_events_written);
+        },
+        "OrganizeStreaming");
+
+    auto index_task = make_task(
+        [cli_ptr, groups_ptr, result_ptr, output_dir, checkpoint_size,
+         executor_threads](CoroScope& ctx) -> coro::CoroTask<void> {
+            if (!result_ptr->success) co_return;
+
+            AggregationConfig agg_config;
+            const AggregationConfig* agg_ptr = nullptr;
+            if (cli_ptr->with_aggregation) {
+                agg_config.time_interval_us = static_cast<std::uint64_t>(
+                    cli_ptr->time_interval_ms * 1000.0);
+                agg_config.compute_statistics = true;
+                agg_config.track_process_parents = true;
+                agg_config.track_default_args = true;
+                agg_ptr = &agg_config;
+                DFTRACER_UTILS_LOG_INFO(
+                    "Phase 4 begin: indexes + aggregation "
+                    "(time_interval=%.2f ms)",
+                    cli_ptr->time_interval_ms);
+                std::printf(
+                    "Phase 4: Building indexes + aggregation "
+                    "(time_interval=%.2f ms) ...\n",
+                    cli_ptr->time_interval_ms);
+            } else {
+                DFTRACER_UTILS_LOG_INFO("%s", "Phase 4 begin: indexes");
+                std::printf("Phase 4: Building indexes ...\n");
+            }
+
+            const std::size_t phase4_memory_budget = compute_memory_budget(
+                cli_ptr->memory_budget_mb * 1024ULL * 1024ULL);
+            const std::size_t override_per_file_bytes =
+                cli_ptr->estimated_file_bytes_mb * 1024ULL * 1024ULL;
+
+            filesystem::PatternDirectoryScannerUtility chunk_scanner;
+            for (const auto& group : *groups_ptr) {
+                const std::string group_dir = output_dir + "/" + group.name;
+                if (!fs::exists(group_dir)) continue;
+                if (result_ptr->inline_indexed_groups.count(group.name)) {
+                    DFTRACER_UTILS_LOG_INFO(
+                        "Phase 4: skipping group '%s' (indexed inline)",
+                        group.name.c_str());
+                    continue;
+                }
+
+                filesystem::PatternDirectoryScannerUtilityInput scan_input{
+                    group_dir, {".pfw", ".pfw.gz"}, false};
+                auto entries = co_await chunk_scanner.process(scan_input);
+                if (entries.empty()) continue;
+
+                std::sort(entries.begin(), entries.end(),
+                          [](const filesystem::FileEntry& a,
+                             const filesystem::FileEntry& b) {
+                              return a.path.string() < b.path.string();
+                          });
+
+                std::vector<std::string> chunk_files;
+                std::vector<std::size_t> chunk_sizes;
+                chunk_files.reserve(entries.size());
+                chunk_sizes.reserve(entries.size());
+                for (const auto& e : entries) {
+                    chunk_files.push_back(e.path.string());
+                    chunk_sizes.push_back(e.size);
+                }
+
+                const std::size_t per_file_bytes = estimate_per_file_bytes(
+                    chunk_sizes, override_per_file_bytes);
+                const std::size_t flush_every = compute_file_batch_size(
+                    phase4_memory_budget, per_file_bytes, 4);
+                std::printf(
+                    "  %s: %zu chunks; per-file peak: %.2f GB; "
+                    "flush_every: %zu chunks\n",
+                    group.name.c_str(), chunk_files.size(),
+                    per_file_bytes / (1024.0 * 1024.0 * 1024.0), flush_every);
+                DFTRACER_UTILS_LOG_INFO(
+                    "Phase 4: indexing group '%s' (%zu chunks)",
+                    group.name.c_str(), chunk_files.size());
+                co_await run_group_indexing(
+                    &ctx, group_dir, chunk_files, agg_ptr, checkpoint_size,
+                    executor_threads, flush_every, &result_ptr->chunk_layouts);
+                DFTRACER_UTILS_LOG_INFO("Phase 4: group '%s' complete",
+                                        group.name.c_str());
+            }
+            DFTRACER_UTILS_LOG_INFO("%s", "Phase 4 complete");
+        },
+        "OrganizeIndexing");
+
+    index_task->depends_on(organize_task);
+
+    pipeline.set_source(organize_task);
+    pipeline.set_destination(index_task);
+    pipeline.execute();
 
-        pipeline.set_source(index_store_task);
-        pipeline.set_destination(index_store_task);
-        pipeline.execute();
+    if (result.success) {
+        const std::string manifest_path = output_dir + "/manifest.json";
+        std::ofstream manifest_out(manifest_path);
+        if (manifest_out.is_open()) {
+            auto escape = [](const std::string& s) {
+                std::string out;
+                out.reserve(s.size());
+                for (char c : s) {
+                    switch (c) {
+                        case '"':
+                            out += "\\\"";
+                            break;
+                        case '\\':
+                            out += "\\\\";
+                            break;
+                        case '\n':
+                            out += "\\n";
+                            break;
+                        case '\r':
+                            out += "\\r";
+                            break;
+                        case '\t':
+                            out += "\\t";
+                            break;
+                        default:
+                            out += c;
+                            break;
+                    }
+                }
+                return out;
+            };
+            manifest_out << "{\n";
+            manifest_out << "  \"version\": 1,\n";
+            manifest_out << "  \"tool\": \"dftracer_organize\",\n";
+            manifest_out << "  \"groups\": {\n";
+            for (std::size_t i = 0; i < groups.size(); ++i) {
+                manifest_out << "    \"" << escape(groups[i].name) << "\": \""
+                             << escape(groups[i].name) << "\"";
+                if (i + 1 < groups.size()) manifest_out << ",";
+                manifest_out << "\n";
+            }
+            manifest_out << "  },\n";
+            manifest_out << "  \"group_queries\": {\n";
+            for (std::size_t i = 0; i < groups.size(); ++i) {
+                manifest_out << "    \"" << escape(groups[i].name) << "\": \""
+                             << escape(groups[i].query) << "\"";
+                if (i + 1 < groups.size()) manifest_out << ",";
+                manifest_out << "\n";
+            }
+            manifest_out << "  }\n";
+            manifest_out << "}\n";
+        } else {
+            DFTRACER_UTILS_LOG_WARN("Failed to write manifest at %s",
+                                    manifest_path.c_str());
+        }
     }
 
-    auto end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double, std::milli> duration = end_time - start_time;
+    overall.stop();
+    double duration_ms = static_cast<double>(overall.elapsed()) / 1e6;
 
     std::printf("\n==========================================\n");
     std::printf("Reorganization Complete\n");
     std::printf("==========================================\n");
-    std::printf("  Time: %.2f seconds\n", duration.count() / 1000.0);
-    std::printf("  Input files: %zu\n", files.size());
-    std::printf("  Events routed: %zu\n", router_result.total_events_written);
-    std::printf("  Chunks created: %zu\n", router_result.chunks_created);
+    std::printf("  Time: %.2f seconds\n", duration_ms / 1000.0);
+    std::printf("  Input files: %zu\n", result.source_files_processed);
+    std::printf("  Events routed: %zu\n", result.total_events_written);
+    std::printf("  Chunks created: %zu\n", result.chunks_created);
+    if (result.success) {
+        std::printf("  Manifest: %s/manifest.json\n", output_dir.c_str());
+    }
     std::printf("  Output files:\n");
-    for (const auto& f : router_result.output_files) {
+    for (const auto& f : result.output_files) {
         if (fs::exists(f)) {
             std::printf(
                 "    %s (%.2f MB)\n", f.c_str(),
                 static_cast<double>(fs::file_size(f)) / (1024.0 * 1024.0));
         }
     }
+    if (stages) {
+        std::printf("\n  Stage Timing:\n");
+        stages->print_stages("    ");
+    }
     std::printf("==========================================\n");
 
-    co_return router_result.success ? 0 : 1;
+    co_return result.success ? 0 : 1;
 }
 
 }  // namespace
@@ -263,106 +1067,9 @@ int main(int argc, char** argv) {
         "Reorganize DFTracer trace files by routing events to "
         "predicate-based groups with chunked output.");
 
-    program.add_argument("--files")
-        .help("Input trace files (.pfw, .pfw.gz)")
-        .nargs(argparse::nargs_pattern::any)
-        .default_value<std::vector<std::string>>({});
-
-    program.add_argument("-d", "--directory")
-        .help("Directory containing trace files")
-        .default_value<std::string>("");
-
-    program.add_argument("-o", "--output").help("Output directory").required();
-
-    program.add_argument("--groups")
-        .help(
-            "Predicate groups: \"io:cat==\\\"POSIX\\\"\" "
-            "\"compute:cat==\\\"APP\\\"\"")
-        .nargs(argparse::nargs_pattern::at_least_one)
-        .required();
-
-    program.add_argument("--chunk-size")
-        .help("Target chunk size in MB (default: 256)")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(256));
-
-    program.add_argument("--checkpoint-size")
-        .help("Checkpoint size for indexing in bytes")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(
-            indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE));
-
-    program.add_argument("--index-dir")
-        .help("Directory for .dftindex stores")
-        .default_value<std::string>("");
-
-    program.add_argument("-f", "--force")
-        .help("Force rebuild of indices")
-        .flag();
-
-    program.add_argument("--no-compress")
-        .help("Write plain .pfw instead of .pfw.gz")
-        .flag();
-
-    program.add_argument("--executor-threads")
-        .help("Worker threads")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what());
-        std::cerr << program;
-        return 1;
-    }
-
-    std::string directory = program.get<std::string>("--directory");
-    std::string output_dir = program.get<std::string>("--output");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    auto group_specs = program.get<std::vector<std::string>>("--groups");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    std::size_t chunk_size_mb = program.get<std::size_t>("--chunk-size");
-    bool force_rebuild = program.get<bool>("--force");
-    bool no_compress = program.get<bool>("--no-compress");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-
-    fs::create_directories(output_dir);
-
-    auto groups = parse_group_specs(group_specs);
-    if (groups.empty()) {
-        DFTRACER_UTILS_LOG_ERROR("%s", "No groups specified.");
-        return 1;
-    }
-
-    std::vector<std::string> files;
-    if (!directory.empty()) {
-        if (!fs::exists(directory)) {
-            DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s",
-                                     directory.c_str());
-            return 1;
-        }
-        filesystem::PatternDirectoryScannerUtility scanner;
-        filesystem::PatternDirectoryScannerUtilityInput scan_input{
-            directory, {".pfw", ".pfw.gz"}, false};
-        auto matched = scanner.process(scan_input).get();
-        for (const auto& entry : matched) {
-            files.push_back(entry.path.string());
-        }
-    } else {
-        files = program.get<std::vector<std::string>>("--files");
-    }
-
-    if (files.empty()) {
-        DFTRACER_UTILS_LOG_ERROR("%s",
-                                 "No input files. Use --files or --directory.");
-        return 1;
-    }
+    OrganizeArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
-    return run_organize(output_dir, index_dir, files, groups, checkpoint_size,
-                        force_rebuild, no_compress, executor_threads,
-                        chunk_size_mb)
-        .get();
+    return run_organize(&cli).get();
 }
diff --git a/src/dftracer/utils/binaries/dftracer_pgzip.cpp b/src/dftracer/utils/binaries/dftracer_pgzip.cpp
index c84d7160..ac5a1791 100644
--- a/src/dftracer/utils/binaries/dftracer_pgzip.cpp
+++ b/src/dftracer/utils/binaries/dftracer_pgzip.cpp
@@ -1,16 +1,12 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/coro.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/utilities/compression/zlib/types.h>
 #include <zlib.h>
 
-#include <argparse/argparse.hpp>
 #include <chrono>
 #include <cstdio>
 #include <fstream>
@@ -18,8 +14,52 @@
 #include <mutex>
 #include <vector>
 
+#include "common_cli.h"
+
 using namespace dftracer::utils;
 
+class PgzipArgParse : public cli::ArgParse {  // dftracer_pgzip CLI options
+   public:
+    cli::DirectoryArgs directory{cli::DirMode::DEFAULT_DOT,
+                                 "Directory containing .pfw files"};
+    cli::PipelineArgs pipeline;
+    cli::WatchdogArgs watchdog;
+
+    bool verbose = false;                           // --verbose
+    int compression_level = Z_DEFAULT_COMPRESSION;  // --compression-level
+    std::size_t chunk_size = 4 * 1024 * 1024;       // --chunk-size, in bytes
+
+    explicit PgzipArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        schema(directory, pipeline, watchdog);  // shared argument groups
+    }
+
+   protected:
+    void register_args() override {  // pgzip-specific arguments
+        parser()
+            .add_argument("-v", "--verbose")
+            .help("Enable verbose output")
+            .flag();
+
+        parser()
+            .add_argument("-l", "--compression-level")
+            .help("Compression level (0-9, default: Z_DEFAULT_COMPRESSION)")
+            .scan<'d', int>()  // NOTE(review): 0-9 range not enforced here
+            .default_value(Z_DEFAULT_COMPRESSION);
+
+        parser()
+            .add_argument("--chunk-size")
+            .help("Chunk size in bytes for parallel compression (default: 4MB)")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(4 * 1024 * 1024));
+    }
+
+    void post_parse() override {  // copy parsed values into the fields
+        verbose = parser().get<bool>("--verbose");
+        compression_level = parser().get<int>("--compression-level");
+        chunk_size = parser().get<std::size_t>("--chunk-size");
+    }
+};
+
 namespace {
 
 struct FileResult {
@@ -41,7 +81,6 @@ struct CompressedChunk {
     std::string data;
 };
 
-// Top-level coroutine: reads file and sends chunks to channel.
 static coro::CoroTask<void> chunk_reader(
     coro::ChannelProducer<ChunkWork> producer, const std::string* file_path,
     std::size_t chunk_size) {
@@ -67,9 +106,8 @@ static coro::CoroTask<void> chunk_reader(
     co_return;
 }
 
-// Top-level coroutine: compresses chunks and sends to output channel.
 static coro::CoroTask<void> chunk_compressor(
-    std::shared_ptr<coro::Channel<ChunkWork>> input_chan,
+    coro::ChannelConsumer<ChunkWork> input_chan,
     coro::ChannelProducer<CompressedChunk> out_producer,
     int compression_level) {
     auto guard = out_producer.guard();
@@ -83,7 +121,7 @@ static coro::CoroTask<void> chunk_compressor(
 
     std::string out_buf(64 * 1024, '\0');
 
-    while (auto work = co_await input_chan->receive()) {
+    while (auto work = co_await input_chan.receive()) {
         std::string compressed;
         compressed.reserve(work->data.size());
 
@@ -117,9 +155,8 @@ static coro::CoroTask<void> chunk_compressor(
     co_return;
 }
 
-// Top-level coroutine: receives compressed chunks and writes in order.
 static coro::CoroTask<void> chunk_writer(
-    std::shared_ptr<coro::Channel<CompressedChunk>> output_chan,
+    coro::ChannelConsumer<CompressedChunk> output_chan,
     const std::string* output_path) {
     std::ofstream ofs(*output_path, std::ios::binary);
     if (!ofs.is_open()) co_return;
@@ -127,7 +164,7 @@ static coro::CoroTask<void> chunk_writer(
     std::size_t next_expected = 0;
     std::map<std::size_t, std::string> pending;
 
-    while (auto chunk = co_await output_chan->receive()) {
+    while (auto chunk = co_await output_chan.receive()) {
         if (chunk->index == next_expected) {
             ofs.write(chunk->data.data(),
                       static_cast<std::streamsize>(chunk->data.size()));
@@ -150,7 +187,6 @@ static coro::CoroTask<void> chunk_writer(
     co_return;
 }
 
-// Compress a single file using parallel chunk compression.
 static coro::CoroTask<FileResult> compress_file_parallel(
     CoroScope& ctx, const std::string& file_path, int compression_level,
     std::size_t num_workers, std::size_t chunk_size) {
@@ -182,27 +218,29 @@ static coro::CoroTask<FileResult> compress_file_parallel(
         const auto* file_path_ptr = &file_path;
         const auto* output_path_ptr = &result.output_path;
 
-        co_await ctx.scope([input_chan, output_chan, file_path_ptr,
+        co_await ctx.scope([&input_chan, &output_chan, file_path_ptr,
                             output_path_ptr, compression_level, num_workers,
                             chunk_size](
                                CoroScope& scope) -> coro::CoroTask<void> {
-            scope.spawn([input_chan, file_path_ptr,
-                         chunk_size](CoroScope&) -> coro::CoroTask<void> {
-                co_await chunk_reader(input_chan->producer(), file_path_ptr,
-                                      chunk_size);
+            scope.spawn([ch = input_chan->producer(), file_path_ptr,
+                         chunk_size](
+                            CoroScope&) mutable -> coro::CoroTask<void> {
+                co_await chunk_reader(std::move(ch), file_path_ptr, chunk_size);
             });
 
             for (std::size_t w = 0; w < num_workers; ++w) {
-                scope.spawn([input_chan, output_chan, compression_level](
-                                CoroScope&) -> coro::CoroTask<void> {
-                    co_await chunk_compressor(
-                        input_chan, output_chan->producer(), compression_level);
+                scope.spawn([in_ch = input_chan->consumer(),
+                             out_ch = output_chan->producer(),
+                             compression_level](
+                                CoroScope&) mutable -> coro::CoroTask<void> {
+                    co_await chunk_compressor(in_ch, std::move(out_ch),
+                                              compression_level);
                 });
             }
 
-            scope.spawn([output_chan,
+            scope.spawn([ch = output_chan->consumer(),
                          output_path_ptr](CoroScope&) -> coro::CoroTask<void> {
-                co_await chunk_writer(output_chan, output_path_ptr);
+                co_await chunk_writer(ch, output_path_ptr);
             });
 
             co_return;
@@ -230,85 +268,12 @@ static coro::CoroTask<FileResult> compress_file_parallel(
 
 }  // namespace
 
-int main(int argc, char** argv) {
-    DFTRACER_UTILS_LOGGER_INIT();
-
-    argparse::ArgumentParser program("dftracer_pgzip",
-                                     DFTRACER_UTILS_PACKAGE_VERSION);
-    program.add_description(
-        "Parallel gzip compression for DFTracer .pfw files. "
-        "Splits each file into chunks and compresses them in parallel "
-        "as independent gzip members.");
-
-    program.add_argument("-d", "--directory")
-        .help("Directory containing .pfw files")
-        .default_value<std::string>(".");
-
-    program.add_argument("-v", "--verbose")
-        .help("Enable verbose output")
-        .flag();
-
-    program.add_argument("--executor-threads")
-        .help("Number of worker threads (default: number of CPU cores)")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    program.add_argument("-l", "--compression-level")
-        .help("Compression level (0-9, default: Z_DEFAULT_COMPRESSION)")
-        .scan<'d', int>()
-        .default_value(Z_DEFAULT_COMPRESSION);
-
-    program.add_argument("--chunk-size")
-        .help("Chunk size in bytes for parallel compression (default: 4MB)")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(4 * 1024 * 1024));
-
-    program.add_argument("--disable-watchdog")
-        .help("Disable watchdog for hang detection")
-        .flag();
-
-    program.add_argument("--watchdog-global-timeout")
-        .help("Watchdog global timeout in seconds (0 = no timeout)")
-        .scan<'d', int>()
-        .default_value(0);
-
-    program.add_argument("--watchdog-task-timeout")
-        .help("Watchdog default task timeout in seconds (0 = no timeout)")
-        .scan<'d', int>()
-        .default_value(0);
-
-    program.add_argument("--watchdog-idle-timeout")
-        .help("Watchdog idle timeout in seconds (0 = use default)")
-        .scan<'d', int>()
-        .default_value(300);
-
-    program.add_argument("--watchdog-deadlock-timeout")
-        .help("Watchdog deadlock timeout in seconds (0 = use default)")
-        .scan<'d', int>()
-        .default_value(600);
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what());
-        std::cerr << program << std::endl;
-        return 1;
-    }
-
-    std::string input_dir = program.get<std::string>("--directory");
-    bool verbose = program.get<bool>("--verbose");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    int compression_level = program.get<int>("--compression-level");
-    std::size_t chunk_size = program.get<std::size_t>("--chunk-size");
-    bool disable_watchdog = program.get<bool>("--disable-watchdog");
-    int global_timeout = program.get<int>("--watchdog-global-timeout");
-    int task_timeout = program.get<int>("--watchdog-task-timeout");
-    int idle_timeout = program.get<int>("--watchdog-idle-timeout");
-    int deadlock_timeout = program.get<int>("--watchdog-deadlock-timeout");
-
-    input_dir = fs::absolute(input_dir).string();
+static int run_pgzip(const PgzipArgParse& cli) {
+    const auto input_dir = fs::absolute(cli.directory.value).string();
+    const auto verbose = cli.verbose;
+    const auto executor_threads = cli.pipeline.executor_threads;
+    const auto compression_level = cli.compression_level;
+    const auto chunk_size = cli.chunk_size;
 
     std::vector<std::string> input_files;
     for (const auto& entry : fs::directory_iterator(input_dir)) {
@@ -338,17 +303,8 @@ int main(int argc, char** argv) {
 
     auto start_time = std::chrono::high_resolution_clock::now();
 
-    auto pipeline_config =
-        PipelineConfig()
-            .with_name("DFTracer Parallel Gzip")
-            .with_compute_threads(executor_threads)
-            .with_watchdog(!disable_watchdog)
-            .with_global_timeout(std::chrono::seconds(global_timeout))
-            .with_task_timeout(std::chrono::seconds(task_timeout))
-            .with_executor_idle_timeout(std::chrono::seconds(idle_timeout))
-            .with_executor_deadlock_timeout(
-                std::chrono::seconds(deadlock_timeout));
-
+    auto pipeline_config = cli::build_pipeline_config(
+        "DFTracer Parallel Gzip", cli.pipeline, cli.watchdog);
     Pipeline pipeline(pipeline_config);
 
     std::vector<FileResult> results;
@@ -364,7 +320,7 @@ int main(int argc, char** argv) {
             auto file_chan =
                 coro::make_channel<std::size_t>(executor_threads * 2);
 
-            co_await ctx.scope([file_chan, files_ptr, results_ptr, mutex_ptr,
+            co_await ctx.scope([&file_chan, files_ptr, results_ptr, mutex_ptr,
                                 compression_level, executor_threads, chunk_size,
                                 verbose](
                                    CoroScope& scope) -> coro::CoroTask<void> {
@@ -379,11 +335,11 @@ int main(int argc, char** argv) {
                     });
 
                 for (std::size_t w = 0; w < executor_threads; ++w) {
-                    scope.spawn([file_chan, files_ptr, results_ptr, mutex_ptr,
-                                 compression_level, executor_threads,
-                                 chunk_size, verbose](
+                    scope.spawn([ch = file_chan->consumer(), files_ptr,
+                                 results_ptr, mutex_ptr, compression_level,
+                                 executor_threads, chunk_size, verbose](
                                     CoroScope& wctx) -> coro::CoroTask<void> {
-                        while (auto fi_opt = co_await file_chan->receive()) {
+                        while (auto fi_opt = co_await ch.receive()) {
                             const auto& path = (*files_ptr)[*fi_opt];
 
                             auto result = co_await compress_file_parallel(
@@ -473,3 +429,20 @@ int main(int argc, char** argv) {
 
     return successful == input_files.size() ? 0 : 1;
 }
+
+int main(int argc, char** argv) {  // entry point: parse CLI, run pgzip
+    DFTRACER_UTILS_LOGGER_INIT();
+
+    argparse::ArgumentParser program("dftracer_pgzip",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "Parallel gzip compression for DFTracer .pfw files. "
+        "Splits each file into chunks and compresses them in parallel "
+        "as independent gzip members.");
+
+    PgzipArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;  // exit status 1 on falsy parse()
+
+    return run_pgzip(cli);  // its result is the exit status
+}
diff --git a/src/dftracer/utils/binaries/dftracer_reconstruct.cpp b/src/dftracer/utils/binaries/dftracer_reconstruct.cpp
index f40d0db1..fede952f 100644
--- a/src/dftracer/utils/binaries/dftracer_reconstruct.cpp
+++ b/src/dftracer/utils/binaries/dftracer_reconstruct.cpp
@@ -1,330 +1,95 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/platform_compat.h>
-#include <dftracer/utils/core/coro/async_mutex.h>
-#include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/core/io/io.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
-#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
-#include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
-#include <dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.h>
-#include <dftracer/utils/utilities/composites/indexed_file_reader_utility.h>
-#include <dftracer/utils/utilities/composites/types.h>
-#include <dftracer/utils/utilities/fileio/chunk_writer.h>
-#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
-#include <dftracer/utils/utilities/indexer/internal/indexer.h>
-#include <dftracer/utils/utilities/reader/internal/stream_config.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h>
 
-#include <algorithm>
-#include <argparse/argparse.hpp>
 #include <chrono>
 #include <cstdio>
-#include <cstring>
 #include <string>
-#include <unordered_map>
-#include <vector>
+
+#include "common_cli.h"
 
 using namespace dftracer::utils;
-using namespace dftracer::utils::utilities;
-using namespace dftracer::utils::utilities::composites;
-using namespace dftracer::utils::utilities::composites::dft;
 using namespace dftracer::utils::utilities::composites::dft::reorganize;
-using dftracer::utils::utilities::fileio::ChunkWriter;
-using dftracer::utils::utilities::fileio::ChunkWriterConfig;
 
-namespace {
+class ReconstructArgParse : public cli::ArgParse {  // reconstruct CLI options
+   public:
+    cli::DirectoryArgs directory{cli::DirMode::REQUIRED,
+                                 "Directory containing reorganized files"};
+    cli::PipelineArgs pipeline;
 
-struct SegmentInterval {
-    int line_start;
-    int line_end;
-    std::string original_path;
-    int source_checkpoint;
-};
+    std::size_t checkpoint_size = 0;  // --checkpoint-size, in bytes
+    std::string output_dir;           // -o / --output
+    bool no_compress = false;         // --no-compress
 
-const SegmentInterval* find_segment(
-    const std::vector<SegmentInterval>& intervals, int line_number) {
-    auto it = std::upper_bound(
-        intervals.begin(), intervals.end(), line_number,
-        [](int ln, const SegmentInterval& seg) { return ln < seg.line_start; });
-    if (it != intervals.begin()) {
-        --it;
-        if (line_number >= it->line_start && line_number < it->line_end) {
-            return &(*it);
-        }
+    explicit ReconstructArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        schema(directory, pipeline);  // shared argument groups
     }
-    return nullptr;
-}
 
-std::string output_filename(const std::string& original_path) {
-    auto p = fs::path(original_path).filename().string();
-    if (p.size() > 3 && p.substr(p.size() - 3) == ".gz") {
-        p = p.substr(0, p.size() - 3);
+   protected:
+    void register_args() override {  // reconstruct-specific arguments
+        parser()
+            .add_argument("--checkpoint-size")
+            .help("Checkpoint size for gzip indexing in bytes (default: " +
+                  std::to_string(constants::indexer::DEFAULT_CHECKPOINT_SIZE) +
+                  ")")
+            .scan<'d', std::size_t>()
+            .default_value(static_cast<std::size_t>(
+                constants::indexer::DEFAULT_CHECKPOINT_SIZE));
+
+        parser()
+            .add_argument("-o", "--output")
+            .help("Output directory")
+            .required();
+
+        parser()
+            .add_argument("--no-compress")
+            .help("Write plain .pfw instead of .pfw.gz")
+            .flag();
     }
-    return p;
-}
 
-}  // namespace
+    void post_parse() override {  // copy parsed values into the fields
+        checkpoint_size = parser().get<std::size_t>("--checkpoint-size");
+        output_dir = parser().get<std::string>("--output");
+        no_compress = parser().get<bool>("--no-compress");
+    }
+};
 
-static coro::CoroTask<int> run_reconstruct(const std::string& directory,
-                                           const std::string& output_dir,
-                                           std::size_t checkpoint_size,
-                                           bool no_compress,
-                                           std::size_t executor_threads) {
+static coro::CoroTask<int> run_reconstruct(const ReconstructArgParse* cli,
+                                           CoroScope& scope) {
     std::printf("==========================================\n");
     std::printf("DFTracer Trace Reconstructor\n");
     std::printf("==========================================\n");
-
-    std::vector<std::string> reorg_files;
-    if (fs::exists(directory)) {
-        filesystem::PatternDirectoryScannerUtility scanner;
-        filesystem::PatternDirectoryScannerUtilityInput scan_input{
-            directory, {".pfw", ".pfw.gz"}, true};
-        auto matched = co_await scanner.process(scan_input);
-        for (const auto& entry : matched) {
-            reorg_files.push_back(entry.path.string());
-        }
-    }
-
-    if (reorg_files.empty()) {
-        DFTRACER_UTILS_LOG_ERROR("%s", "No reorganized files found.");
-        co_return 1;
-    }
-
-    std::printf("  Input directory: %s\n", directory.c_str());
-    std::printf("  Reorganized files: %zu\n", reorg_files.size());
-    std::printf("  Output directory: %s\n", output_dir.c_str());
+    std::printf("  Input directory: %s\n", cli->directory.value.c_str());
+    std::printf("  Output directory: %s\n", cli->output_dir.c_str());
+    std::printf("  Compress: %s\n", cli->no_compress ? "false" : "true");
+    std::printf("  Executor threads: %zu\n", cli->pipeline.executor_threads);
+    std::printf("==========================================\n\n");
 
     auto start_time = std::chrono::high_resolution_clock::now();
 
-    std::printf("\nStep 1: Building reconstruction plan...\n");
-    ReconstructionPlannerUtility planner;
-    ReconstructionPlannerInput planner_input;
-    planner_input.reorganized_files = reorg_files;
-    planner_input.index_dir = "";
+    ReconstructorInput input;
+    input.input_dir = cli->directory.value;
+    input.output_dir = cli->output_dir;
+    input.checkpoint_size = cli->checkpoint_size;
+    input.parallelism = cli->pipeline.executor_threads;
+    input.compress = !cli->no_compress;
 
-    ReconstructionPlan plan;
-    try {
-        plan = co_await planner.process(planner_input);
-    } catch (const std::exception& e) {
-        DFTRACER_UTILS_LOG_ERROR("Planning failed: %s", e.what());
-        co_return 1;
-    }
-
-    if (plan.files.empty()) {
-        std::printf("No files with provenance found.\n");
-        co_return 0;
-    }
-
-    std::printf("  Original files to reconstruct: %zu\n", plan.files.size());
-    std::printf("  Total segments: %zu\n", plan.total_segments);
-    std::printf("  Total events: %zu\n", plan.total_events);
-
-    // Step 2: For each reorganized file, extract lines and write directly
-    // to output via ChunkWriter (streaming, no full buffering)
-    std::printf("\nStep 2: Extracting and writing...\n");
-
-    // Build per-reorg-file segment intervals
-    std::unordered_map<std::string, std::vector<SegmentInterval>>
-        per_reorg_segments;
-    for (const auto& [orig_path, recon] : plan.files) {
-        for (const auto& [ckpt, segs] : recon.checkpoint_segments) {
-            for (const auto& seg : segs) {
-                SegmentInterval si;
-                si.line_start = seg.output_line_start;
-                si.line_end = seg.output_line_end;
-                si.original_path = orig_path;
-                si.source_checkpoint = seg.source_checkpoint;
-                per_reorg_segments[seg.reorg_file].push_back(std::move(si));
-            }
-        }
-    }
-
-    for (auto& [file, segs] : per_reorg_segments) {
-        std::sort(segs.begin(), segs.end(),
-                  [](const SegmentInterval& a, const SegmentInterval& b) {
-                      return a.line_start < b.line_start;
-                  });
-    }
-
-    std::unordered_map<std::string, std::unique_ptr<ChunkWriter>> writers;
-    std::unordered_map<std::string, std::unique_ptr<coro::AsyncMutex>>
-        writer_mutexes;
-    for (const auto& [orig_path, recon] : plan.files) {
-        std::string fname = output_filename(orig_path);
-        std::string base = fname;
-        if (base.size() > 4 && base.substr(base.size() - 4) == ".pfw") {
-            base = base.substr(0, base.size() - 4);
-        }
-
-        auto config =
-            ChunkWriterConfig()
-                .with_output_dir(output_dir)
-                .with_base_name(base)
-                .with_chunk_size(std::numeric_limits<std::size_t>::max())
-                .with_compression(!no_compress);
-        writers[orig_path] = std::make_unique<ChunkWriter>(config);
-        writer_mutexes[orig_path] = std::make_unique<coro::AsyncMutex>();
-    }
+    ReconstructorUtility reconstructor;
+    auto result = co_await scope.spawn(reconstructor, std::move(input));
 
-    for (auto& [path, writer] : writers) {
-        co_await writer->open();
-    }
-
-    // Process reorganized files (parallel via pipeline)
-    {
-        auto pipeline_config = PipelineConfig()
-                                   .with_name("Reconstruct: Extract")
-                                   .with_compute_threads(executor_threads)
-                                   .with_watchdog(false);
-        Pipeline pipeline(pipeline_config);
-
-        auto* per_reorg_ptr = &per_reorg_segments;
-        auto* writers_ptr = &writers;
-        auto* mutexes_ptr = &writer_mutexes;
-
-        auto extract_task = make_task(
-            [per_reorg_ptr, writers_ptr, mutexes_ptr, checkpoint_size,
-             executor_threads](CoroScope& scope) -> coro::CoroTask<void> {
-                auto permits = coro::make_channel<bool>(executor_threads * 2);
-                for (std::size_t i = 0; i < executor_threads * 2; ++i) {
-                    permits->try_send(true);
-                }
-
-                std::vector<coro::SpawnFuture<void>> futures;
-
-                for (const auto& [reorg_file, intervals] : *per_reorg_ptr) {
-                    auto* intervals_ptr = &intervals;
-                    auto reorg_file_copy = reorg_file;
-                    futures.push_back(scope.spawn([reorg_file_copy,
-                                                   intervals_ptr, writers_ptr,
-                                                   mutexes_ptr, checkpoint_size,
-                                                   permits](CoroScope& s)
-                                                      -> coro::CoroTask<void> {
-                        co_await s.receive(permits);
-                        try {
-                            std::string index_path =
-                                internal::determine_index_path(reorg_file_copy,
-                                                               "");
-
-                            MetadataCollectorUtility meta_collector;
-                            auto meta_input =
-                                MetadataCollectorUtilityInput::from_file(
-                                    reorg_file_copy)
-                                    .with_index(index_path)
-                                    .with_checkpoint_size(checkpoint_size);
-                            auto meta =
-                                co_await meta_collector.process(meta_input);
-
-                            auto reader_input =
-                                IndexedReadInput::from_file(reorg_file_copy)
-                                    .with_index(index_path)
-                                    .with_checkpoint_size(checkpoint_size);
-                            IndexedFileReaderUtility reader_utility;
-                            auto reader =
-                                co_await reader_utility.process(reader_input);
-
-                            auto stream = reader->stream(
-                                reader::internal::StreamConfig()
-                                    .stream_type(reader::internal::StreamType::
-                                                     MULTI_LINES_BYTES)
-                                    .range_type(
-                                        reader::internal::RangeType::BYTE_RANGE)
-                                    .buffer_size(4 * 1024 * 1024)
-                                    .from(0)
-                                    .to(meta.uncompressed_size));
-
-                            struct PendingLine {
-                                const char* data;
-                                std::size_t len;
-                            };
-                            std::unordered_map<std::string,
-                                               std::vector<PendingLine>>
-                                batch;
-                            int event_number = 0;
-
-                            while (!stream->done()) {
-                                auto chunk = co_await stream->read_async();
-                                if (chunk.empty()) break;
-
-                                const char* data = chunk.data();
-                                std::size_t bytes_read = chunk.size();
-                                std::size_t pos = 0;
-
-                                while (pos < bytes_read) {
-                                    const char* line_start = data + pos;
-                                    const char* newline =
-                                        static_cast<const char*>(
-                                            std::memchr(line_start, '\n',
-                                                        bytes_read - pos));
-                                    if (!newline) break;
-                                    std::size_t line_len =
-                                        static_cast<std::size_t>(newline -
-                                                                 line_start);
-
-                                    if (line_len > 0 && line_start[0] == '{') {
-                                        const auto* seg = find_segment(
-                                            *intervals_ptr, event_number);
-                                        if (seg) {
-                                            batch[seg->original_path].push_back(
-                                                {line_start, line_len});
-                                        }
-                                        event_number++;
-                                    }
-
-                                    pos = static_cast<std::size_t>(newline -
-                                                                   data) +
-                                          1;
-                                }
-
-                                for (auto& [orig, lines] : batch) {
-                                    if (lines.empty()) continue;
-                                    auto wit = writers_ptr->find(orig);
-                                    auto mit = mutexes_ptr->find(orig);
-                                    if (wit != writers_ptr->end() &&
-                                        mit != mutexes_ptr->end()) {
-                                        co_await mit->second->lock();
-                                        for (const auto& l : lines) {
-                                            co_await wit->second->write_line(
-                                                ByteView(l.data, l.len));
-                                        }
-                                        mit->second->unlock();
-                                    }
-                                    lines.clear();
-                                }
-                            }
-
-                            permits->try_send(true);
-                        } catch (...) {
-                            permits->try_send(true);
-                            throw;
-                        }
-                    }));
-                }
-
-                for (auto& f : futures) {
-                    co_await f;
-                }
-            },
-            "ExtractLines");
-
-        pipeline.set_source(extract_task);
-        pipeline.set_destination(extract_task);
-        pipeline.execute();
+    if (!result.success) {
+        DFTRACER_UTILS_LOG_ERROR("Reconstruction failed: %s",
+                                 result.error_message.c_str());
+        co_return 1;
     }
 
-    // Close all writers
-    std::size_t files_written = 0;
-    for (auto& [path, writer] : writers) {
-        co_await writer->close();
-        std::string fname = output_filename(path);
-        std::printf("  %s: %zu events\n", fname.c_str(),
-                    writer->total_events_written());
-        files_written++;
+    for (const auto& file : result.files) {
+        std::string fname = fs::path(file.output_path).filename().string();
+        std::printf("  %s: %zu events\n", fname.c_str(), file.events_written);
     }
 
     auto end_time = std::chrono::high_resolution_clock::now();
@@ -334,7 +99,8 @@ static coro::CoroTask<int> run_reconstruct(const std::string& directory,
     std::printf("Reconstruction Complete\n");
     std::printf("==========================================\n");
     std::printf("  Time: %.2f seconds\n", duration.count() / 1000.0);
-    std::printf("  Files reconstructed: %zu\n", files_written);
+    std::printf("  Files reconstructed: %zu\n", result.files.size());
+    std::printf("  Total events: %zu\n", result.total_events);
     std::printf("==========================================\n");
 
     co_return 0;
@@ -348,46 +114,27 @@ int main(int argc, char** argv) {
     program.add_description(
         "Reconstruct original trace files from reorganized output.");
 
-    program.add_argument("-d", "--directory")
-        .help("Directory containing reorganized files")
-        .required();
-
-    program.add_argument("-o", "--output").help("Output directory").required();
+    ReconstructArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
-    program.add_argument("--checkpoint-size")
-        .help("Checkpoint size for indexing in bytes")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(
-            indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE));
+    fs::create_directories(cli.output_dir);
 
-    program.add_argument("--no-compress")
-        .help("Write plain .pfw instead of .pfw.gz")
-        .flag();
-
-    program.add_argument("--executor-threads")
-        .help("Worker threads")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what());
-        std::cerr << program;
-        return 1;
-    }
+    auto pipeline_config =
+        cli::build_pipeline_config("Reconstruct", cli.pipeline);
+    Pipeline pipeline(pipeline_config);
 
-    std::string directory = program.get<std::string>("--directory");
-    std::string output_dir = program.get<std::string>("--output");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    bool no_compress = program.get<bool>("--no-compress");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
+    int exit_code = 0;
+    auto* cli_ptr = &cli;
+    auto task = make_task(
+        [cli_ptr, &exit_code](CoroScope& scope) -> coro::CoroTask<void> {
+            exit_code = co_await run_reconstruct(cli_ptr, scope);
+        },
+        "ReconstructMain");
 
-    fs::create_directories(output_dir);
+    pipeline.set_source(task);
+    pipeline.set_destination(task);
+    pipeline.execute();
 
-    return run_reconstruct(directory, output_dir, checkpoint_size, no_compress,
-                           executor_threads)
-        .get();
+    return exit_code;
 }
diff --git a/src/dftracer/utils/binaries/dftracer_replay.cpp b/src/dftracer/utils/binaries/dftracer_replay.cpp
index b8d37a6d..8eae75b7 100644
--- a/src/dftracer/utils/binaries/dftracer_replay.cpp
+++ b/src/dftracer/utils/binaries/dftracer_replay.cpp
@@ -1,7 +1,23 @@
+// Pipeline-driven replay binary.
+//
+// DAG:
+//   scan -> execute
+//
+// scan    : enumerate inputs (.pfw / .pfw.gz; directory recursion optional)
+// execute : either ReplayEngine::run_pipelined (producer+consumer with
+//           Channel<Trace> so read/parse latency is hidden behind the
+//           consumer's apply_timing+execute) or, when --use-call-tree is
+//           set, the legacy replay_with_call_tree path that builds a
+//           hierarchical tree first
+
 #include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/mpi/mpi_utils.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/utilities/replay/replay.h>
 
 #include <argparse/argparse.hpp>
@@ -11,548 +27,566 @@
 #endif
 
 #include <chrono>
+#include <cinttypes>
+#include <cstdio>
 #include <cstdlib>
-#include <iomanip>
-#include <iostream>
 #include <sstream>
+#include <string>
+#include <vector>
+
+#include "common_cli.h"
 
 using namespace dftracer::utils;
+using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::replay;
 
-/**
- * Collect trace files from directory or file list
- */
-static std::vector<std::string> collect_trace_files(
-    const std::vector<std::string>& inputs, bool recursive) {
-    std::vector<std::string> trace_files;
+namespace {
+
+class ReplayArgParse : public cli::ArgParse {
+   public:
+    cli::PipelineArgs pipeline;
+
+    std::vector<std::string> inputs;
+    bool no_timing = false;
+    bool dry_run = false;
+    bool dftracer_mode = false;
+    bool no_sleep = false;
+    bool verbose = false;
+    bool recursive = false;
+    bool use_call_tree = false;
+    bool hierarchical_replay = false;
+    bool respect_call_hierarchy = false;
+
+    std::string filter_pid_csv;
+    std::string exclude_pid_csv;
+    std::string filter_tid_csv;
+    std::string exclude_tid_csv;
+    std::string filter_function_csv;
+    std::string exclude_function_csv;
+    std::string filter_category_csv;
+    std::string exclude_category_csv;
+
+    std::uint64_t start_timestamp = 0;
+    std::uint64_t end_timestamp = UINT64_MAX;
+    std::int64_t min_size = -1;
+    std::int64_t max_size = -1;
+    double sample_rate = 1.0;
+    std::uint64_t sample_seed = 0;
+    std::size_t max_events = 0;
+    std::size_t channel_capacity = 4096;
+
+    explicit ReplayArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        schema(pipeline);
+    }
 
-    for (const auto& input : inputs) {
-        if (fs::is_directory(input)) {
-            if (recursive) {
-                for (const auto& entry :
-                     fs::recursive_directory_iterator(input)) {
-                    if (entry.is_regular_file()) {
-                        std::string path = entry.path().string();
-                        if ((path.size() >= 4 &&
-                             path.substr(path.size() - 4) == ".pfw") ||
-                            (path.size() >= 7 &&
-                             path.substr(path.size() - 7) == ".pfw.gz")) {
-                            trace_files.push_back(path);
-                        }
-                    }
-                }
-            } else {
-                for (const auto& entry : fs::directory_iterator(input)) {
-                    if (entry.is_regular_file()) {
-                        std::string path = entry.path().string();
-                        if ((path.size() >= 4 &&
-                             path.substr(path.size() - 4) == ".pfw") ||
-                            (path.size() >= 7 &&
-                             path.substr(path.size() - 7) == ".pfw.gz")) {
-                            trace_files.push_back(path);
-                        }
-                    }
-                }
-            }
-        } else if (fs::is_regular_file(input)) {
-            trace_files.push_back(input);
-        } else {
-            DFTRACER_UTILS_LOG_ERROR("Input not found or not accessible: %s",
-                                     input.c_str());
+   protected:
+    void register_args() override {
+        auto& p = parser();
+        p.add_argument("inputs")
+            .help(
+                "Trace files (.pfw, .pfw.gz) or directories containing trace "
+                "files")
+            .nargs(argparse::nargs_pattern::at_least_one);
+
+        p.add_argument("--no-timing")
+            .help("Ignore original timing and execute as fast as possible")
+            .flag();
+        p.add_argument("--dry-run")
+            .help("Parse and analyze traces without executing operations")
+            .flag();
+        p.add_argument("--dftracer-mode")
+            .help(
+                "Use DFTracer sleep-based replay (sleep for operation "
+                "duration instead of doing actual I/O)")
+            .flag();
+        p.add_argument("--no-sleep")
+            .help(
+                "When used with --dftracer-mode, disable sleep calls for "
+                "maximum speed")
+            .flag();
+        p.add_argument("--verbose")
+            .help("Enable verbose output and detailed statistics")
+            .flag();
+        p.add_argument("-r", "--recursive")
+            .help("Recursively search directories for trace files")
+            .flag();
+
+        p.add_argument("--use-call-tree")
+            .help("Build and use call tree structure for hierarchical replay")
+            .flag();
+        p.add_argument("--hierarchical-replay")
+            .help(
+                "Replay operations respecting parent-child call hierarchy "
+                "(requires --use-call-tree)")
+            .flag();
+        p.add_argument("--respect-call-hierarchy")
+            .help(
+                "Replay child nodes immediately after parent (requires "
+                "--use-call-tree and --hierarchical-replay)")
+            .flag();
+
+        p.add_argument("--filter-pid")
+            .help("Only replay events from specific PID(s) (comma-separated)")
+            .default_value(std::string(""));
+        p.add_argument("--exclude-pid")
+            .help("Exclude events from specific PID(s) (comma-separated)")
+            .default_value(std::string(""));
+        p.add_argument("--filter-tid")
+            .help("Only replay events from specific TID(s) (comma-separated)")
+            .default_value(std::string(""));
+        p.add_argument("--exclude-tid")
+            .help("Exclude events from specific TID(s) (comma-separated)")
+            .default_value(std::string(""));
+        p.add_argument("--filter-function")
+            .help(
+                "Only replay specific function(s) (comma-separated, e.g., "
+                "'read,write,open')")
+            .default_value(std::string(""));
+        p.add_argument("--exclude-function")
+            .help("Exclude specific function(s) (comma-separated)")
+            .default_value(std::string(""));
+        p.add_argument("--filter-category")
+            .help(
+                "Only replay specific category/categories (comma-separated, "
+                "e.g., 'POSIX,storage')")
+            .default_value(std::string(""));
+        p.add_argument("--exclude-category")
+            .help("Exclude specific category/categories (comma-separated)")
+            .default_value(std::string(""));
+
+        p.add_argument("--start-timestamp")
+            .help("Only replay events after this timestamp (microseconds)")
+            .default_value(std::uint64_t(0))
+            .scan<'u', std::uint64_t>();
+        p.add_argument("--end-timestamp")
+            .help("Only replay events before this timestamp (microseconds)")
+            .default_value(UINT64_MAX)
+            .scan<'u', std::uint64_t>();
+        p.add_argument("--min-size")
+            .help("Only replay operations with size >= this value (bytes)")
+            .default_value(std::int64_t(-1))
+            .scan<'i', std::int64_t>();
+        p.add_argument("--max-size")
+            .help("Only replay operations with size <= this value (bytes)")
+            .default_value(std::int64_t(-1))
+            .scan<'i', std::int64_t>();
+
+        p.add_argument("--sample-rate")
+            .help("Sample rate for replay (0.0-1.0, 1.0=all events, 0.1=10%)")
+            .default_value(1.0)
+            .scan<'g', double>();
+        p.add_argument("--sample-seed")
+            .help("Random seed for sampling (for reproducibility)")
+            .default_value(std::uint64_t(0))
+            .scan<'u', std::uint64_t>();
+        p.add_argument("--max-events")
+            .help("Maximum number of events to replay (0=unlimited)")
+            .default_value(std::size_t(0))
+            .scan<'u', std::size_t>();
+        p.add_argument("--channel-capacity")
+            .help(
+                "Bounded Channel<Trace> capacity between read/parse producer "
+                "and dispatch consumer (default 4096)")
+            .default_value(std::size_t(4096))
+            .scan<'u', std::size_t>();
+    }
+
+    void post_parse() override {
+        auto& p = parser();
+        inputs = p.get<std::vector<std::string>>("inputs");
+        no_timing = p.get<bool>("--no-timing");
+        dry_run = p.get<bool>("--dry-run");
+        dftracer_mode = p.get<bool>("--dftracer-mode");
+        no_sleep = p.get<bool>("--no-sleep");
+        verbose = p.get<bool>("--verbose");
+        recursive = p.get<bool>("--recursive");
+        use_call_tree = p.get<bool>("--use-call-tree");
+        hierarchical_replay = p.get<bool>("--hierarchical-replay");
+        respect_call_hierarchy = p.get<bool>("--respect-call-hierarchy");
+
+        filter_pid_csv = p.get<std::string>("--filter-pid");
+        exclude_pid_csv = p.get<std::string>("--exclude-pid");
+        filter_tid_csv = p.get<std::string>("--filter-tid");
+        exclude_tid_csv = p.get<std::string>("--exclude-tid");
+        filter_function_csv = p.get<std::string>("--filter-function");
+        exclude_function_csv = p.get<std::string>("--exclude-function");
+        filter_category_csv = p.get<std::string>("--filter-category");
+        exclude_category_csv = p.get<std::string>("--exclude-category");
+
+        start_timestamp = p.get<std::uint64_t>("--start-timestamp");
+        end_timestamp = p.get<std::uint64_t>("--end-timestamp");
+        min_size = p.get<std::int64_t>("--min-size");
+        max_size = p.get<std::int64_t>("--max-size");
+        sample_rate = p.get<double>("--sample-rate");
+        sample_seed = p.get<std::uint64_t>("--sample-seed");
+        max_events = p.get<std::size_t>("--max-events");
+        channel_capacity = p.get<std::size_t>("--channel-capacity");
+    }
+
+    bool validate() override {
+        if (no_sleep && !dftracer_mode) {
+            std::fprintf(stderr,
+                         "Error: --no-sleep can only be used with "
+                         "--dftracer-mode\n");
+            return false;
+        }
+        if (hierarchical_replay && !use_call_tree) {
+            std::fprintf(
+                stderr,
+                "Error: --hierarchical-replay requires --use-call-tree\n");
+            return false;
+        }
+        if (respect_call_hierarchy && !hierarchical_replay) {
+            std::fprintf(stderr,
+                         "Error: --respect-call-hierarchy requires "
+                         "--hierarchical-replay\n");
+            return false;
+        }
+        if (sample_rate < 0.0 || sample_rate > 1.0) {
+            std::fprintf(stderr,
+                         "Error: --sample-rate must be between 0.0 and 1.0\n");
+            return false;
         }
+        return true;
     }
+};
 
-    return trace_files;
+// Returns true when `path` ends in ".pfw" or ".pfw.gz" (case-sensitive).
+bool is_trace_file(const std::string& path) {
+    const std::size_t len = path.size();
+    if (len >= 4 && path.compare(len - 4, 4, ".pfw") == 0) return true;
+    return len >= 7 && path.compare(len - 7, 7, ".pfw.gz") == 0;
 }
 
-int main(int argc, char** argv) {
-#ifdef DFTRACER_UTILS_MPI_ENABLED
-    MPI_Init(&argc, &argv);
-    mpi::MPIUtils::instance().initialize();
-#endif
+// Parses a comma-separated ID list; malformed tokens are reported and skipped.
+std::unordered_set<std::uint32_t> parse_csv_uint32(const std::string& csv) {
+    std::unordered_set<std::uint32_t> out;
+    std::istringstream ss(csv);
+    std::string token;
+    while (std::getline(ss, token, ',')) {
+        if (token.empty()) continue;
+        try {
+            out.insert(static_cast<std::uint32_t>(std::stoul(token)));
+        } catch (const std::exception&) {
+            std::fprintf(stderr, "Ignoring invalid ID '%s'\n", token.c_str());
+        }
+    }
+    return out;
+}
 
-    DFTRACER_UTILS_LOGGER_INIT();
+// Splits `csv` on ',' into a set of its non-empty tokens (duplicates collapse).
+std::unordered_set<std::string> parse_csv_string(const std::string& csv) {
+    std::unordered_set<std::string> names;
+    std::istringstream stream(csv);
+    for (std::string item; std::getline(stream, item, ',');) {
+        if (item.empty()) continue;
+        names.insert(item);
+    }
+    return names;
+}
 
-    // Get MPI rank for output control (defaults to rank 0, size 1 without MPI)
+struct RunCtx {
+    const ReplayArgParse* cli = nullptr;
     int mpi_rank = 0;
     int mpi_size = 1;
     bool is_root = true;
-#ifdef DFTRACER_UTILS_MPI_ENABLED
-    mpi_rank = mpi::MPIUtils::instance().get_rank();
-    mpi_size = mpi::MPIUtils::instance().get_world_size();
-    is_root = mpi::MPIUtils::instance().is_root();
-#endif
 
-    argparse::ArgumentParser program("dftracer_replay",
-                                     DFTRACER_UTILS_PACKAGE_VERSION);
-    program.add_description(
-        "DFTracer replay utility - replays I/O operations from DFTracer trace "
-        "files (.pfw, .pfw.gz)");
-
-    // Input files/directories
-    program.add_argument("inputs")
-        .help(
-            "Trace files (.pfw, .pfw.gz) or directories containing trace files")
-        .nargs(argparse::nargs_pattern::at_least_one);
-
-    // Timing options
-    program.add_argument("--no-timing")
-        .help("Ignore original timing and execute as fast as possible")
-        .flag();
-
-    // Execution options
-    program.add_argument("--dry-run")
-        .help("Parse and analyze traces without executing operations")
-        .flag();
-
-    program.add_argument("--dftracer-mode")
-        .help(
-            "Use DFTracer sleep-based replay (sleep for operation duration "
-            "instead of doing actual I/O)")
-        .flag();
-
-    program.add_argument("--no-sleep")
-        .help(
-            "When used with --dftracer-mode, disable sleep calls for maximum "
-            "speed")
-        .flag();
-
-    program.add_argument("--verbose")
-        .help("Enable verbose output and detailed statistics")
-        .flag();
-
-    program.add_argument("-r", "--recursive")
-        .help("Recursively search directories for trace files")
-        .flag();
-
-    // Call tree options
-    program.add_argument("--use-call-tree")
-        .help("Build and use call tree structure for hierarchical replay")
-        .flag();
-
-    program.add_argument("--hierarchical-replay")
-        .help(
-            "Replay operations respecting parent-child call hierarchy "
-            "(requires --use-call-tree)")
-        .flag();
-
-    program.add_argument("--respect-call-hierarchy")
-        .help(
-            "Replay child nodes immediately after parent (requires "
-            "--use-call-tree and --hierarchical-replay)")
-        .flag();
-
-    // Filtering options - Process/Thread
-    program.add_argument("--filter-pid")
-        .help("Only replay events from specific PID(s) (comma-separated)")
-        .default_value(std::string(""));
-
-    program.add_argument("--exclude-pid")
-        .help("Exclude events from specific PID(s) (comma-separated)")
-        .default_value(std::string(""));
-
-    program.add_argument("--filter-tid")
-        .help("Only replay events from specific TID(s) (comma-separated)")
-        .default_value(std::string(""));
-
-    program.add_argument("--exclude-tid")
-        .help("Exclude events from specific TID(s) (comma-separated)")
-        .default_value(std::string(""));
-
-    // Filtering options - Function/Category
-    program.add_argument("--filter-function")
-        .help(
-            "Only replay specific function(s) (comma-separated, e.g., "
-            "'read,write,open')")
-        .default_value(std::string(""));
-
-    program.add_argument("--exclude-function")
-        .help("Exclude specific function(s) (comma-separated)")
-        .default_value(std::string(""));
-
-    program.add_argument("--filter-category")
-        .help(
-            "Only replay specific category/categories (comma-separated, e.g., "
-            "'POSIX,storage')")
-        .default_value(std::string(""));
-
-    program.add_argument("--exclude-category")
-        .help("Exclude specific category/categories (comma-separated)")
-        .default_value(std::string(""));
-
-    // Filtering options - Timestamp
-    program.add_argument("--start-timestamp")
-        .help("Only replay events after this timestamp (microseconds)")
-        .default_value(std::uint64_t(0))
-        .scan<'u', std::uint64_t>();
-
-    program.add_argument("--end-timestamp")
-        .help("Only replay events before this timestamp (microseconds)")
-        .default_value(UINT64_MAX)
-        .scan<'u', std::uint64_t>();
-
-    // Filtering options - Size
-    program.add_argument("--min-size")
-        .help("Only replay operations with size >= this value (bytes)")
-        .default_value(std::int64_t(-1))
-        .scan<'i', std::int64_t>();
-
-    program.add_argument("--max-size")
-        .help("Only replay operations with size <= this value (bytes)")
-        .default_value(std::int64_t(-1))
-        .scan<'i', std::int64_t>();
-
-    // Sampling options
-    program.add_argument("--sample-rate")
-        .help("Sample rate for replay (0.0-1.0, 1.0=all events, 0.1=10%)")
-        .default_value(1.0)
-        .scan<'g', double>();
-
-    program.add_argument("--sample-seed")
-        .help("Random seed for sampling (for reproducibility)")
-        .default_value(std::uint64_t(0))
-        .scan<'u', std::uint64_t>();
-
-    // Resource limits
-    program.add_argument("--max-events")
-        .help("Maximum number of events to replay (0=unlimited)")
-        .default_value(std::size_t(0))
-        .scan<'u', std::size_t>();
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Argument parsing error: %s", err.what());
-        std::cerr << program;
-        return 1;
-    }
-
-    // Helper to parse comma-separated values
-    auto parse_csv_uint32 =
-        [](const std::string& csv) -> std::unordered_set<std::uint32_t> {
-        std::unordered_set<std::uint32_t> result;
-        if (csv.empty()) return result;
-
-        std::istringstream ss(csv);
-        std::string token;
-        while (std::getline(ss, token, ',')) {
-            if (!token.empty()) {
-                result.insert(static_cast<std::uint32_t>(std::stoul(token)));
+    ReplayConfig config;
+    std::vector<std::string> trace_files;
+    ReplayResult result;
+    int exit_code = 0;
+    bool failed = false;
+
+    double scan_ms = 0;
+    double execute_ms = 0;
+};
+
+coro::CoroTask<void> task_scan(RunCtx* ctx) {
+    const auto t0 = std::chrono::steady_clock::now();
+
+    for (const auto& in : ctx->cli->inputs) {
+        std::error_code ec;
+        if (fs::is_directory(in, ec)) {
+            if (ctx->cli->recursive) {
+                for (const auto& e : fs::recursive_directory_iterator(in, ec)) {
+                    if (e.is_regular_file(ec) &&
+                        is_trace_file(e.path().string())) {
+                        ctx->trace_files.push_back(e.path().string());
+                    }
+                }
+            } else {
+                for (const auto& e : fs::directory_iterator(in, ec)) {
+                    if (e.is_regular_file(ec) &&
+                        is_trace_file(e.path().string())) {
+                        ctx->trace_files.push_back(e.path().string());
+                    }
+                }
             }
+        } else if (fs::is_regular_file(in, ec)) {
+            ctx->trace_files.push_back(in);
+        } else {
+            DFTRACER_UTILS_LOG_ERROR("Input not found or not accessible: %s",
+                                     in.c_str());
         }
-        return result;
-    };
+    }
+    std::sort(ctx->trace_files.begin(), ctx->trace_files.end());
 
-    auto parse_csv_string =
-        [](const std::string& csv) -> std::unordered_set<std::string> {
-        std::unordered_set<std::string> result;
-        if (csv.empty()) return result;
+    ctx->scan_ms = std::chrono::duration<double, std::milli>(
+                       std::chrono::steady_clock::now() - t0)
+                       .count();
 
-        std::istringstream ss(csv);
-        std::string token;
-        while (std::getline(ss, token, ',')) {
-            if (!token.empty()) {
-                result.insert(token);
-            }
+    if (ctx->trace_files.empty()) {
+        if (ctx->is_root) {
+            std::fprintf(stderr,
+                         "No trace files found in the specified inputs.\n");
         }
-        return result;
-    };
-
-    // Parse arguments
-    std::vector<std::string> inputs =
-        program.get<std::vector<std::string>>("inputs");
-    bool no_timing = program.get<bool>("--no-timing");
-    bool dry_run = program.get<bool>("--dry-run");
-    bool dftracer_mode = program.get<bool>("--dftracer-mode");
-    bool no_sleep = program.get<bool>("--no-sleep");
-    bool verbose = program.get<bool>("--verbose");
-    bool recursive = program.get<bool>("--recursive");
-
-    // Call tree options
-    bool use_call_tree = program.get<bool>("--use-call-tree");
-    bool hierarchical_replay = program.get<bool>("--hierarchical-replay");
-    bool respect_call_hierarchy = program.get<bool>("--respect-call-hierarchy");
-
-    // Parse filter arguments
-    auto filter_pids =
-        parse_csv_uint32(program.get<std::string>("--filter-pid"));
-    auto exclude_pids =
-        parse_csv_uint32(program.get<std::string>("--exclude-pid"));
-    auto filter_tids =
-        parse_csv_uint32(program.get<std::string>("--filter-tid"));
-    auto exclude_tids =
-        parse_csv_uint32(program.get<std::string>("--exclude-tid"));
-    auto filter_functions =
-        parse_csv_string(program.get<std::string>("--filter-function"));
-    auto exclude_functions =
-        parse_csv_string(program.get<std::string>("--exclude-function"));
-    auto filter_categories =
-        parse_csv_string(program.get<std::string>("--filter-category"));
-    auto exclude_categories =
-        parse_csv_string(program.get<std::string>("--exclude-category"));
-
-    std::uint64_t start_timestamp =
-        program.get<std::uint64_t>("--start-timestamp");
-    std::uint64_t end_timestamp = program.get<std::uint64_t>("--end-timestamp");
-    std::int64_t min_size = program.get<std::int64_t>("--min-size");
-    std::int64_t max_size = program.get<std::int64_t>("--max-size");
-    double sample_rate = program.get<double>("--sample-rate");
-    std::uint64_t sample_seed = program.get<std::uint64_t>("--sample-seed");
-    std::size_t max_events = program.get<std::size_t>("--max-events");
-
-    // Validate --no-sleep usage
-    if (no_sleep && !dftracer_mode) {
-        std::cerr << "Error: --no-sleep can only be used with --dftracer-mode"
-                  << std::endl;
-        return 1;
+        ctx->failed = true;
+        ctx->exit_code = 1;
+        co_return;
     }
 
-    // Validate call tree options
-    if (hierarchical_replay && !use_call_tree) {
-        std::cerr << "Error: --hierarchical-replay requires --use-call-tree"
-                  << std::endl;
-        return 1;
+    if (ctx->is_root) {
+        std::printf("Found %zu trace file(s) to replay:\n",
+                    ctx->trace_files.size());
+        for (const auto& file : ctx->trace_files) {
+            std::printf("  %s\n", file.c_str());
+        }
     }
+    co_return;
+}
 
-    if (respect_call_hierarchy && !hierarchical_replay) {
-        std::cerr
-            << "Error: --respect-call-hierarchy requires --hierarchical-replay"
-            << std::endl;
-        return 1;
+void print_configuration(const RunCtx& ctx) {
+    const auto& c = ctx.config;
+    std::printf("\n=== Replay Configuration ===\n");
+    if (ctx.mpi_size > 1) {
+        std::printf("MPI processes: %d\n", ctx.mpi_size);
     }
-
-    // Validate sample rate
-    if (sample_rate < 0.0 || sample_rate > 1.0) {
-        std::cerr << "Error: --sample-rate must be between 0.0 and 1.0"
-                  << std::endl;
-        return 1;
+    std::printf("Maintain timing: %s\n", c.maintain_timing ? "yes" : "no");
+    std::printf("Dry run: %s\n", c.dry_run ? "yes" : "no");
+    if (c.dftracer_mode) {
+        std::printf("DFTracer mode: yes (%s)\n",
+                    c.no_sleep ? "no-sleep" : "sleep-based");
+    } else {
+        std::printf("DFTracer mode: no (actual I/O)\n");
     }
+    if (c.use_call_tree) {
+        std::printf("Call tree mode: yes\n");
+        std::printf("  Hierarchical replay: %s\n",
+                    c.hierarchical_replay ? "yes" : "no");
+        if (c.hierarchical_replay) {
+            std::printf("  Respect call hierarchy: %s\n",
+                        c.respect_call_hierarchy ? "yes" : "no");
+        }
+    }
+}
 
-    // Collect trace files
-    std::vector<std::string> trace_files =
-        collect_trace_files(inputs, recursive);
+void print_filters(const RunCtx& ctx) {  // silent when no filter is active
+    const auto& cfg = ctx.config;
+    const bool has_filter =
+        !cfg.filter_pids.empty() || !cfg.exclude_pids.empty() ||
+        !cfg.filter_tids.empty() || !cfg.exclude_tids.empty() ||
+        !cfg.filter_functions.empty() || !cfg.exclude_functions.empty() ||
+        !cfg.filter_categories.empty() || !cfg.exclude_categories.empty() ||
+        cfg.start_timestamp > 0 || cfg.end_timestamp < UINT64_MAX ||
+        cfg.min_operation_size >= 0 || cfg.max_operation_size >= 0 ||
+        cfg.sampling_rate < 1.0 || cfg.max_events > 0;
+    if (!has_filter) return;  // every filter is at its "disabled" value
+    std::printf("\nActive Filters:\n");
+    auto show_ids = [](const char* title,
+                       const std::unordered_set<std::uint32_t>& ids) {
+        if (ids.empty()) return;
+        std::printf("  %s:", title);
+        for (const auto id : ids) std::printf(" %u", id);
+        std::printf("\n");
+    };
+    auto show_names = [](const char* title,
+                         const std::unordered_set<std::string>& names) {
+        if (names.empty()) return;
+        std::printf("  %s:", title);
+        for (const auto& name : names) std::printf(" %s", name.c_str());
+        std::printf("\n");
+    };
+    show_ids("Filter PIDs", cfg.filter_pids);
+    show_ids("Exclude PIDs", cfg.exclude_pids);
+    show_ids("Filter TIDs", cfg.filter_tids);
+    show_ids("Exclude TIDs", cfg.exclude_tids);
+    show_names("Filter functions", cfg.filter_functions);
+    show_names("Exclude functions", cfg.exclude_functions);
+    show_names("Filter categories", cfg.filter_categories);
+    show_names("Exclude categories", cfg.exclude_categories);
+    if (cfg.start_timestamp > 0)
+        std::printf("  Start timestamp: %" PRIu64 "\n", cfg.start_timestamp);
+    if (cfg.end_timestamp < UINT64_MAX)
+        std::printf("  End timestamp: %" PRIu64 "\n", cfg.end_timestamp);
+    if (cfg.min_operation_size >= 0)
+        std::printf("  Min operation size: %" PRId64 " bytes\n",
+                    cfg.min_operation_size);
+    if (cfg.max_operation_size >= 0)
+        std::printf("  Max operation size: %" PRId64 " bytes\n",
+                    cfg.max_operation_size);
+    if (cfg.sampling_rate < 1.0)
+        std::printf("  Sampling rate: %g%%\n", cfg.sampling_rate * 100.0);
+    if (cfg.max_events > 0) std::printf("  Max events: %zu\n", cfg.max_events);
+}
 
-    if (trace_files.empty()) {
-        if (is_root)
-            std::cerr << "No trace files found in the specified inputs."
-                      << std::endl;
-#ifdef DFTRACER_UTILS_MPI_ENABLED
-        mpi::MPIUtils::instance().finalize();
-        MPI_Finalize();
-#endif
-        return 1;
-    }
+coro::CoroTask<void> task_execute(RunCtx* ctx, CoroScope* scope) {
+    if (ctx->failed) co_return;  // context already marked failed: no replay
 
-    if (is_root) {
-        std::cout << "Found " << trace_files.size()
-                  << " trace file(s) to replay:" << std::endl;
-        for (const auto& file : trace_files) {
-            std::cout << "  " << file << std::endl;
-        }
+    const auto t0 = std::chrono::steady_clock::now();  // execute_ms start
+    // The configuration banner is printed only when ctx->is_root is set.
+    if (ctx->is_root) {
+        print_configuration(*ctx);
+        print_filters(*ctx);
+        std::printf("\n=== Starting Replay ===\n");
     }
 
-    // Configure replay
-    ReplayConfig config;
-    config.maintain_timing = !no_timing;
-    config.dry_run = dry_run;
-    config.dftracer_mode = dftracer_mode;
-    config.no_sleep = no_sleep;
-    config.verbose = verbose;
-
-    // Store MPI info in config
-    config.mpi_rank = mpi_rank;
-    config.mpi_size = mpi_size;
-
-    // Call tree options
-    config.use_call_tree = use_call_tree;
-    config.hierarchical_replay = hierarchical_replay;
-    config.respect_call_hierarchy = respect_call_hierarchy;
-
-    // Apply filters
-    config.filter_pids = filter_pids;
-    config.exclude_pids = exclude_pids;
-    config.filter_tids = filter_tids;
-    config.exclude_tids = exclude_tids;
-    config.filter_functions = filter_functions;
-    config.exclude_functions = exclude_functions;
-    config.filter_categories = filter_categories;
-    config.exclude_categories = exclude_categories;
-
-    // Apply ranges and limits
-    config.start_timestamp = start_timestamp;
-    config.end_timestamp = end_timestamp;
-    config.min_operation_size = min_size;
-    config.max_operation_size = max_size;
-    config.sampling_rate = sample_rate;
-    config.sample_seed = sample_seed;
-    config.max_events = max_events;
-
-    // Print configuration (only on rank 0)
-    if (is_root) {
-        std::cout << "\n=== Replay Configuration ===" << std::endl;
-        if (mpi_size > 1) {
-            std::cout << "MPI processes: " << mpi_size << std::endl;
-        }
-        std::cout << "Maintain timing: "
-                  << (config.maintain_timing ? "yes" : "no") << std::endl;
-        std::cout << "Dry run: " << (config.dry_run ? "yes" : "no")
-                  << std::endl;
-        if (config.dftracer_mode) {
-            std::cout << "DFTracer mode: yes ("
-                      << (config.no_sleep ? "no-sleep" : "sleep-based") << ")"
-                      << std::endl;
-        } else {
-            std::cout << "DFTracer mode: no (actual I/O)" << std::endl;
-        }
-        if (config.use_call_tree) {
-            std::cout << "Call tree mode: yes" << std::endl;
-            std::cout << "  Hierarchical replay: "
-                      << (config.hierarchical_replay ? "yes" : "no")
-                      << std::endl;
-            if (config.hierarchical_replay) {
-                std::cout << "  Respect call hierarchy: "
-                          << (config.respect_call_hierarchy ? "yes" : "no")
-                          << std::endl;
+    ReplayEngine engine(ctx->config);
+    // Replay via the call tree, or via the pipelined trace-file reader.
+    if (ctx->config.use_call_tree) {
+        // Call tree path stays sync — replay_with_call_tree builds the
+        // full tree in memory first, so there's nothing to hide behind a
+        // channel. We just call it from inside this task.
+        if (ctx->cli->inputs.size() != 1 ||
+            !fs::is_directory(ctx->cli->inputs[0])) {
+            if (ctx->is_root) {
+                std::fprintf(stderr,
+                             "Error: --use-call-tree requires exactly one "
+                             "input directory\n");
             }
+            ctx->failed = true;
+            ctx->exit_code = 1;
+            co_return;
         }
-    }
-    // Print active filters
-    // Print active filters (only on rank 0)
-    if (is_root &&
-        (!filter_pids.empty() || !exclude_pids.empty() ||
-         !filter_tids.empty() || !exclude_tids.empty() ||
-         !filter_functions.empty() || !exclude_functions.empty() ||
-         !filter_categories.empty() || !exclude_categories.empty() ||
-         start_timestamp > 0 || end_timestamp < UINT64_MAX || min_size >= 0 ||
-         max_size >= 0 || sample_rate < 1.0 || max_events > 0)) {
-        std::cout << "\nActive Filters:" << std::endl;
-        if (!filter_pids.empty()) {
-            std::cout << "  Filter PIDs: ";
-            for (auto pid : filter_pids) std::cout << pid << " ";
-            std::cout << std::endl;
-        }
-        if (!exclude_pids.empty()) {
-            std::cout << "  Exclude PIDs: ";
-            for (auto pid : exclude_pids) std::cout << pid << " ";
-            std::cout << std::endl;
-        }
-        if (!filter_tids.empty()) {
-            std::cout << "  Filter TIDs: ";
-            for (auto tid : filter_tids) std::cout << tid << " ";
-            std::cout << std::endl;
-        }
-        if (!exclude_tids.empty()) {
-            std::cout << "  Exclude TIDs: ";
-            for (auto tid : exclude_tids) std::cout << tid << " ";
-            std::cout << std::endl;
-        }
-        if (!filter_functions.empty()) {
-            std::cout << "  Filter functions: ";
-            for (const auto& f : filter_functions) std::cout << f << " ";
-            std::cout << std::endl;
-        }
-        if (!exclude_functions.empty()) {
-            std::cout << "  Exclude functions: ";
-            for (const auto& f : exclude_functions) std::cout << f << " ";
-            std::cout << std::endl;
-        }
-        if (!filter_categories.empty()) {
-            std::cout << "  Filter categories: ";
-            for (const auto& c : filter_categories) std::cout << c << " ";
-            std::cout << std::endl;
-        }
-        if (!exclude_categories.empty()) {
-            std::cout << "  Exclude categories: ";
-            for (const auto& c : exclude_categories) std::cout << c << " ";
-            std::cout << std::endl;
-        }
-        if (start_timestamp > 0) {
-            std::cout << "  Start timestamp: " << start_timestamp << std::endl;
-        }
-        if (end_timestamp < UINT64_MAX) {
-            std::cout << "  End timestamp: " << end_timestamp << std::endl;
-        }
-        if (min_size >= 0) {
-            std::cout << "  Min operation size: " << min_size << " bytes"
-                      << std::endl;
-        }
-        if (max_size >= 0) {
-            std::cout << "  Max operation size: " << max_size << " bytes"
-                      << std::endl;
-        }
-        if (sample_rate < 1.0) {
-            std::cout << "  Sampling rate: " << (sample_rate * 100.0) << "%"
-                      << std::endl;
-        }
-        if (max_events > 0) {
-            std::cout << "  Max events: " << max_events << std::endl;
+        if (ctx->is_root) {
+            std::printf("Using call tree hierarchical replay mode\n");
         }
+        ctx->result = engine.replay_with_call_tree(ctx->cli->inputs[0]);
+    } else {
+        // Pipelined path: producer (read+parse) feeds Channel<Trace>;
+        // consumer (apply_timing+execute) drains it.
+        co_await engine.run_pipelined(*scope, ctx->trace_files, ctx->result,
+                                      ctx->cli->channel_capacity);
     }
 
-    // Create replay engine and execute
-    if (is_root) std::cout << "\n=== Starting Replay ===" << std::endl;
+    ctx->execute_ms = std::chrono::duration<double, std::milli>(
+                          std::chrono::steady_clock::now() - t0)
+                          .count();  // elapsed since t0, in milliseconds
 
-    auto start_time = std::chrono::steady_clock::now();
+    if (ctx->is_root) {
+        std::printf("\n=== Replay Completed ===\n");
+        std::printf("Wall clock time: %.3f ms\n", ctx->execute_ms);
+        ctx->result.print_summary(ctx->config.verbose);
+    }
 
-    ReplayEngine engine(config);
-    ReplayResult result;
+    // Exit-code semantics preserved from the previous binary:
+    //   0 = at least one event executed and none failed
+    //   1 = nothing executed (no events / no trace files)
+    //   2 = at least one failed event
+    if (ctx->result.failed_events > 0) {
+        if (ctx->is_root) std::printf("\nReplay completed with errors.\n");
+        ctx->exit_code = 2;
+    } else if (ctx->result.executed_events > 0) {
+        if (ctx->is_root) std::printf("\nReplay completed successfully.\n");
+        ctx->exit_code = 0;
+    } else {
+        if (ctx->is_root) std::printf("\nNo events were executed.\n");
+        ctx->exit_code = 1;
+    }
+    co_return;
+}
+
+int run(int argc, char** argv) {  // returns the process exit code
+    DFTRACER_UTILS_LOGGER_INIT();
+
+    argparse::ArgumentParser program("dftracer_replay",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "DFTracer replay utility - replays I/O operations from DFTracer "
+        "trace files (.pfw, .pfw.gz)");
+
+    ReplayArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
+    // ctx is shared by pointer with both pipeline tasks created below.
+    RunCtx ctx;
+    ctx.cli = &cli;
 
-    if (use_call_tree) {
-        // Call tree mode: expects a single directory containing trace files
-        if (inputs.size() != 1 || !fs::is_directory(inputs[0])) {
-            if (is_root)
-                std::cerr << "Error: --use-call-tree requires exactly one "
-                             "input directory"
-                          << std::endl;
 #ifdef DFTRACER_UTILS_MPI_ENABLED
-            mpi::MPIUtils::instance().finalize();
-            MPI_Finalize();
+    ctx.mpi_rank = mpi::MPIUtils::instance().get_rank();
+    ctx.mpi_size = mpi::MPIUtils::instance().get_world_size();
+    ctx.is_root = mpi::MPIUtils::instance().is_root();
 #endif
-            return 1;
-        }
-        if (is_root)
-            std::cout << "Using call tree hierarchical replay mode"
-                      << std::endl;
-        result = engine.replay_with_call_tree(inputs[0]);
-    } else {
-        // Normal mode: replay trace files directly
-        result = engine.replay(trace_files);
+
+    // Mirror argparse output into ReplayConfig.
+    auto& c = ctx.config;
+    c.maintain_timing = !cli.no_timing;
+    c.dry_run = cli.dry_run;
+    c.dftracer_mode = cli.dftracer_mode;
+    c.no_sleep = cli.no_sleep;
+    c.verbose = cli.verbose;
+    c.mpi_rank = ctx.mpi_rank;
+    c.mpi_size = ctx.mpi_size;
+    c.use_call_tree = cli.use_call_tree;
+    c.hierarchical_replay = cli.hierarchical_replay;
+    c.respect_call_hierarchy = cli.respect_call_hierarchy;
+    c.filter_pids = parse_csv_uint32(cli.filter_pid_csv);
+    c.exclude_pids = parse_csv_uint32(cli.exclude_pid_csv);
+    c.filter_tids = parse_csv_uint32(cli.filter_tid_csv);
+    c.exclude_tids = parse_csv_uint32(cli.exclude_tid_csv);
+    c.filter_functions = parse_csv_string(cli.filter_function_csv);
+    c.exclude_functions = parse_csv_string(cli.exclude_function_csv);
+    c.filter_categories = parse_csv_string(cli.filter_category_csv);
+    c.exclude_categories = parse_csv_string(cli.exclude_category_csv);
+    c.start_timestamp = cli.start_timestamp;
+    c.end_timestamp = cli.end_timestamp;
+    c.min_operation_size = cli.min_size;
+    c.max_operation_size = cli.max_size;
+    c.sampling_rate = cli.sample_rate;
+    c.sample_seed = cli.sample_seed;
+    c.max_events = cli.max_events;
+    // Pipeline settings are derived from the shared CLI pipeline arguments.
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer Replay", cli.pipeline);
+    Pipeline pipeline(pipeline_config);
+    // Two tasks: "scan" is the source, "execute" is declared to depend on it.
+    RunCtx* ctx_ptr = &ctx;
+    auto scan = make_task(
+        [ctx_ptr](CoroScope&) -> coro::CoroTask<void> {
+            co_await task_scan(ctx_ptr);
+        },
+        "scan");
+    auto execute = make_task(
+        [ctx_ptr](CoroScope& scope) -> coro::CoroTask<void> {
+            co_await task_execute(ctx_ptr, &scope);
+        },
+        "execute");
+    execute->depends_on(scan);
+
+    pipeline.set_source(scan);
+    pipeline.set_destination(execute);
+    pipeline.execute();
+    // Per-stage timings go to stderr, only when verbose and ctx.is_root.
+    if (cli.verbose && ctx.is_root) {
+        std::fprintf(stderr, "[done] scan=%.1fms execute=%.1fms\n", ctx.scan_ms,
+                     ctx.execute_ms);
     }
 
-    auto end_time = std::chrono::steady_clock::now();
-    auto total_wall_time =
-        std::chrono::duration_cast<std::chrono::microseconds>(end_time -
-                                                              start_time);
+    return ctx.exit_code;  // written by the tasks through ctx_ptr
+}
 
-    if (is_root) {
-        std::cout << "\n=== Replay Completed ===" << std::endl;
-        std::cout << "Wall clock time: "
-                  << static_cast<double>(total_wall_time.count()) / 1000.0
-                  << " ms" << std::endl;
+}  // namespace
 
-        // Print results
-        result.print_summary(verbose);
-    }
+int main(int argc, char** argv) {  // MPI setup/teardown around run()
+#ifdef DFTRACER_UTILS_MPI_ENABLED
+    MPI_Init(&argc, &argv);
+    mpi::MPIUtils::instance().initialize();
+#endif
 
-    // Return appropriate exit code
-    int exit_code = 0;
-    if (result.failed_events > 0) {
-        if (is_root)
-            std::cout << "\nReplay completed with errors." << std::endl;
-        exit_code = 2;
-    } else if (result.executed_events > 0) {
-        if (is_root)
-            std::cout << "\nReplay completed successfully." << std::endl;
-        exit_code = 0;
-    } else {
-        if (is_root) std::cout << "\nNo events were executed." << std::endl;
-        exit_code = 1;
-    }
+    int rc = run(argc, argv);  // every return of run() reaches the teardown
 
 #ifdef DFTRACER_UTILS_MPI_ENABLED
     mpi::MPIUtils::instance().finalize();
     MPI_Finalize();
 #endif
-
-    return exit_code;
+    return rc;
 }
diff --git a/src/dftracer/utils/binaries/dftracer_server.cpp b/src/dftracer/utils/binaries/dftracer_server.cpp
index c740b21c..f17b951b 100644
--- a/src/dftracer/utils/binaries/dftracer_server.cpp
+++ b/src/dftracer/utils/binaries/dftracer_server.cpp
@@ -1,11 +1,7 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/io/io_backend.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/server/http_connection.h>
@@ -16,28 +12,65 @@
 #include <dftracer/utils/server/trace_index.h>
 #include <dftracer/utils/server/viz_api.h>
 
-#include <argparse/argparse.hpp>
 #include <cstdint>
 #include <cstdio>
 #include <string>
-#include <thread>
+
+#include "common_cli.h"
 
 using namespace dftracer::utils;
 using namespace dftracer::utils::server;
 
-static coro::CoroTask<int> run_server(argparse::ArgumentParser& program) {
-    std::string bind_addr = program.get<std::string>("--bind");
-    uint16_t port = program.get<uint16_t>("--port");
-    std::string directory = program.get<std::string>("--directory");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-
-    // When no explicit index dir is given, default to the trace
-    // directory so `.dftindex` stores persist across restarts
-    // and don't need to be rebuilt every time.
+class ServerArgParse : public cli::ArgParse {  // dftracer_server CLI options
+   public:
+    cli::DirectoryArgs directory{cli::DirMode::REQUIRED};
+    cli::PipelineArgs pipeline;
+    // Server-specific values; post_parse() overwrites these initial values.
+    std::string index_dir;
+    std::string bind_addr = "0.0.0.0";
+    uint16_t port = 8080;
+    // The shared argument groups are handed to schema() at construction.
+    explicit ServerArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        schema(directory, pipeline);
+    }
+
+   protected:
+    void register_args() override {  // declares the server-only arguments
+        parser()
+            .add_argument("--index-dir")
+            .help(
+                "Directory for root-local .dftindex stores (default: same as "
+                "--directory)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("-b", "--bind")
+            .help("Bind address")
+            .default_value<std::string>("0.0.0.0");
+
+        parser()
+            .add_argument("-p", "--port")
+            .help("Listen port")
+            .scan<'d', uint16_t>()
+            .default_value(static_cast<uint16_t>(8080));
+    }
+
+    void post_parse() override {  // copies parsed values into the members
+        index_dir = parser().get<std::string>("--index-dir");
+        bind_addr = parser().get<std::string>("--bind");
+        port = parser().get<uint16_t>("--port");
+    }
+};
+
+static coro::CoroTask<int> run_server(const ServerArgParse* cli) {
+    const auto& bind_addr = cli->bind_addr;
+    auto port = cli->port;
+    const auto& dir = cli->directory.value;
+    auto index_dir = cli->index_dir;
+    auto executor_threads = cli->pipeline.executor_threads;
+
     if (index_dir.empty()) {
-        index_dir = directory;
+        index_dir = dir;
         std::fprintf(stderr, "Using trace directory for indexes: %s\n",
                      index_dir.c_str());
     } else {
@@ -45,28 +78,22 @@ static coro::CoroTask<int> run_server(argparse::ArgumentParser& program) {
     }
 
     auto pipeline_config =
-        PipelineConfig()
-            .with_name("DFTracer Server")
-            .with_compute_threads(executor_threads)
-            .with_watchdog(false)  // Server is long-lived; no watchdog
-            .with_global_timeout(std::chrono::seconds(0))  // Run forever
-            .with_task_timeout(std::chrono::seconds(0))  // No per-task timeout
-            .with_io_backend(
-                io::IoBackendType::THREADPOOL)  // Thread pool IO for server
-            .with_io_batch_size(1);
+        cli::build_pipeline_config("DFTracer Server", cli->pipeline);
+    pipeline_config.with_io_backend(io::IoBackendType::THREADPOOL)
+        .with_io_batch_size(1)
+        .with_watchdog(false)
+        .with_global_timeout(std::chrono::seconds(0))
+        .with_task_timeout(std::chrono::seconds(0));
 
     Pipeline pipeline(pipeline_config);
 
-    // Build trace index (scan directory, load bloom indexes)
-    TraceIndex trace_index(directory, index_dir, executor_threads);
+    TraceIndex trace_index(dir, index_dir, executor_threads);
     co_await trace_index.initialize();
 
-    // Set up router
     Router router;
     register_trace_api(router, trace_index);
     register_viz_api(router, trace_index);
 
-    // Start TCP listener
     TcpListener listener(bind_addr, port);
     if (!listener.start()) {
         DFTRACER_UTILS_LOG_ERROR("Failed to bind to %s:%u", bind_addr.c_str(),
@@ -77,9 +104,8 @@ static coro::CoroTask<int> run_server(argparse::ArgumentParser& program) {
     std::fprintf(stderr, "DFTracer server listening on %s:%u\n",
                  bind_addr.c_str(), port);
     std::fprintf(stderr, "Serving %zu trace files from %s\n",
-                 trace_index.file_count(), directory.c_str());
+                 trace_index.file_count(), dir.c_str());
 
-    // Register listen fd so signal handler can unblock accept().
     g_listen_fd.store(listener.fd(), std::memory_order_release);
 
     auto server_task = make_task(
@@ -97,10 +123,6 @@ static coro::CoroTask<int> run_server(argparse::ArgumentParser& program) {
     pipeline.set_source(server_task);
     pipeline.set_destination(server_task);
 
-    // Run until SIGINT/SIGTERM.
-    // Signal handler sets g_shutdown_requested, which causes accept_loop
-    // to break, CoroScope drains in-flight handlers, and
-    // pipeline.execute() returns.
     pipeline.execute();
 
     std::fprintf(stderr, "Server shut down gracefully\n");
@@ -117,40 +139,11 @@ int main(int argc, char** argv) {
         "Serve DFTracer trace data over HTTP. Query, filter, and stream "
         "trace events via REST API.");
 
-    program.add_argument("-b", "--bind")
-        .help("Bind address")
-        .default_value<std::string>("0.0.0.0");
-
-    program.add_argument("-p", "--port")
-        .help("Listen port")
-        .scan<'d', uint16_t>()
-        .default_value(static_cast<uint16_t>(8080));
-
-    program.add_argument("-d", "--directory")
-        .help("Directory containing trace files")
-        .required();
-
-    program.add_argument("--index-dir")
-        .help(
-            "Directory for root-local .dftindex stores (default: same as "
-            "--directory)")
-        .default_value<std::string>("");
-
-    program.add_argument("--executor-threads")
-        .help("Number of worker threads")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
+    ServerArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
     install_signal_handlers();
 
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what());
-        std::cerr << program;
-        return 1;
-    }
-
-    return run_server(program).get();
+    return run_server(&cli).get();
 }
diff --git a/src/dftracer/utils/binaries/dftracer_split.cpp b/src/dftracer/utils/binaries/dftracer_split.cpp
index 8be38958..03ddf4c9 100644
--- a/src/dftracer/utils/binaries/dftracer_split.cpp
+++ b/src/dftracer/utils/binaries/dftracer_split.cpp
@@ -1,8 +1,6 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
+#include <dftracer/utils/core/rocksdb/db_manager.h>
 #include <dftracer/utils/core/task_graph/task_graph.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
@@ -14,10 +12,11 @@
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
 #include <unistd.h>
 
-#include <argparse/argparse.hpp>
 #include <chrono>
 #include <cinttypes>
 
+#include "common_cli.h"
+
 using namespace dftracer::utils;
 using namespace dftracer::utils::task_graph;
 using Metadata = utilities::composites::dft::MetadataCollectorUtilityOutput;
@@ -26,142 +25,83 @@ using ChunkManifest =
 using ExtractInput = utilities::composites::dft::ChunkExtractorUtilityInput;
 using ExtractResult = utilities::composites::dft::ChunkExtractorUtilityOutput;
 
-int main(int argc, char** argv) {
-    DFTRACER_UTILS_LOGGER_INIT();
-
-    auto default_checkpoint_size_str =
-        std::to_string(dftracer::utils::utilities::indexer::internal::Indexer::
-                           DEFAULT_CHECKPOINT_SIZE) +
-        " B (" +
-        std::to_string(dftracer::utils::utilities::indexer::internal::Indexer::
-                           DEFAULT_CHECKPOINT_SIZE /
-                       (1024 * 1024)) +
-        " MB)";
+class SplitArgParse : public cli::ArgParse {  // dftracer_split CLI options
+   public:
+    cli::DirectoryArgs directory;
+    cli::PipelineArgs pipeline;
+    cli::IndexingArgs indexing;
+    cli::WatchdogArgs watchdog;
+    // Split-specific values; post_parse() overwrites these initial values.
+    std::string app_name = "app";
+    std::string output_dir = "./split";
+    int chunk_size_mb = 4;
+    bool compress = true;
+    bool verbose = false;
+    bool verify = false;
+    // force_help is assigned before the indexing group is passed to schema().
+    explicit SplitArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.force_help =
+            "Override existing files and force index recreation";
+        schema(directory, pipeline, indexing, watchdog);
+    }
 
-    argparse::ArgumentParser program("dftracer_split",
-                                     DFTRACER_UTILS_PACKAGE_VERSION);
-    program.add_description(
-        "Split DFTracer traces into equal-sized chunks using explicit pipeline "
-        "with maximum parallelism");
+   protected:
+    void register_args() override {  // declares the split-only arguments
+        parser()
+            .add_argument("-n", "--app-name")
+            .help("Application name for output files")
+            .default_value<std::string>("app");
+
+        parser()
+            .add_argument("-o", "--output")
+            .help("Output directory for split files")
+            .default_value<std::string>("./split");
+
+        parser()
+            .add_argument("-s", "--chunk-size")
+            .help("Chunk size in MB")
+            .scan<'d', int>()
+            .default_value(4);
+        // NOTE(review): default is true, so -c looks like a no-op — confirm.
+        parser()
+            .add_argument("-c", "--compress")
+            .help("Compress output files with gzip")
+            .flag()
+            .default_value(true);
+
+        parser()
+            .add_argument("-v", "--verbose")
+            .help("Enable verbose mode")
+            .flag();
+
+        parser()
+            .add_argument("--verify")
+            .help("Verify output chunks match input by comparing event IDs")
+            .flag();
+    }
 
-    program.add_argument("-n", "--app-name")
-        .help("Application name for output files")
-        .default_value<std::string>("app");
-
-    program.add_argument("-d", "--directory")
-        .help("Input directory containing .pfw or .pfw.gz files")
-        .default_value<std::string>(".");
-
-    program.add_argument("-o", "--output")
-        .help("Output directory for split files")
-        .default_value<std::string>("./split");
-
-    program.add_argument("-s", "--chunk-size")
-        .help("Chunk size in MB")
-        .scan<'d', int>()
-        .default_value(4);
-
-    program.add_argument("-f", "--force")
-        .help("Override existing files and force index recreation")
-        .flag();
-
-    program.add_argument("-c", "--compress")
-        .help("Compress output files with gzip")
-        .flag()
-        .default_value(true);
-
-    program.add_argument("-v", "--verbose").help("Enable verbose mode").flag();
-
-    program.add_argument("--checkpoint-size")
-        .help("Checkpoint size for indexing in bytes (default: " +
-              default_checkpoint_size_str + ")")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(
-            dftracer::utils::utilities::indexer::internal::Indexer::
-                DEFAULT_CHECKPOINT_SIZE));
-
-    program.add_argument("--executor-threads")
-        .help(
-            "Number of executor threads for parallel processing (default: "
-            "number "
-            "of CPU cores)")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    program.add_argument("--index-dir")
-        .help("Directory to store index files (default: system temp directory)")
-        .default_value<std::string>("");
-
-    program.add_argument("--verify")
-        .help("Verify output chunks match input by comparing event IDs")
-        .flag();
-
-    program.add_argument("--disable-watchdog")
-        .help("Disable watchdog for hang detection")
-        .flag();
-
-    program.add_argument("--watchdog-global-timeout")
-        .help(
-            "Watchdog global timeout for pipeline execution in seconds (0 = no "
-            "timeout)")
-        .scan<'d', int>()
-        .default_value(0);
-
-    program.add_argument("--watchdog-task-timeout")
-        .help("Watchdog default task timeout in seconds (0 = no timeout)")
-        .scan<'d', int>()
-        .default_value(0);
-
-    program.add_argument("--watchdog-interval")
-        .help("Watchdog check interval in seconds")
-        .scan<'d', int>()
-        .default_value(1);
-
-    program.add_argument("--watchdog-warning-threshold")
-        .help("Watchdog long-running task warning threshold in seconds")
-        .scan<'d', int>()
-        .default_value(300);
-
-    program.add_argument("--watchdog-idle-timeout")
-        .help("Watchdog idle timeout in seconds (0 = use default)")
-        .scan<'d', int>()
-        .default_value(300);
-
-    program.add_argument("--watchdog-deadlock-timeout")
-        .help("Watchdog deadlock timeout in seconds (0 = use default)")
-        .scan<'d', int>()
-        .default_value(600);
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error occurred: %s", err.what());
-        std::cerr << program << std::endl;
-        return 1;
+    void post_parse() override {  // copies parsed values into the members
+        app_name = parser().get<std::string>("--app-name");
+        output_dir = parser().get<std::string>("--output");
+        chunk_size_mb = parser().get<int>("--chunk-size");
+        compress = parser().get<bool>("--compress");
+        verbose = parser().get<bool>("--verbose");
+        verify = parser().get<bool>("--verify");
     }
+};
+
+static coro::CoroTask<int> run_split(const SplitArgParse* cli) {
+    const auto log_dir = fs::absolute(cli->directory.value).string();
+    const auto output_dir = fs::absolute(cli->output_dir).string();
+    const auto& app_name = cli->app_name;
+    const auto chunk_size_mb = cli->chunk_size_mb;
+    const auto force = cli->indexing.force;
+    const auto compress = cli->compress;
+    const auto verify = cli->verify;
+    const auto checkpoint_size = cli->indexing.checkpoint_size;
+    const auto executor_threads = cli->pipeline.executor_threads;
+    auto index_dir = cli->indexing.index_dir;
 
-    // Parse arguments
-    std::string app_name = program.get<std::string>("--app-name");
-    std::string log_dir = program.get<std::string>("--directory");
-    std::string output_dir = program.get<std::string>("--output");
-    int chunk_size_mb = program.get<int>("--chunk-size");
-    bool force = program.get<bool>("--force");
-    bool compress = program.get<bool>("--compress");
-    bool verify = program.get<bool>("--verify");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    bool disable_watchdog = program.get<bool>("--disable-watchdog");
-    int global_timeout = program.get<int>("--watchdog-global-timeout");
-    int task_timeout = program.get<int>("--watchdog-task-timeout");
-    int watchdog_interval = program.get<int>("--watchdog-interval");
-    int warning_threshold = program.get<int>("--watchdog-warning-threshold");
-    int idle_timeout = program.get<int>("--watchdog-idle-timeout");
-    int deadlock_timeout = program.get<int>("--watchdog-deadlock-timeout");
-
-    // Setup temp index directory
     std::string temp_index_dir;
     if (index_dir.empty()) {
         temp_index_dir = fs::temp_directory_path() /
@@ -173,9 +113,6 @@ int main(int argc, char** argv) {
                                 index_dir.c_str());
     }
 
-    log_dir = fs::absolute(log_dir).string();
-    output_dir = fs::absolute(output_dir).string();
-
     std::printf("==========================================\n");
     std::printf("DFTracer Split (Explicit Pipeline)\n");
     std::printf("==========================================\n");
@@ -193,20 +130,8 @@ int main(int argc, char** argv) {
         fs::create_directories(output_dir);
     }
 
-    // Create pipeline with configuration
-    auto pipeline_config =
-        PipelineConfig()
-            .with_name("DFTracer Split")
-            .with_compute_threads(executor_threads)
-            .with_watchdog(!disable_watchdog)
-            .with_global_timeout(std::chrono::seconds(global_timeout))
-            .with_task_timeout(std::chrono::seconds(task_timeout))
-            .with_watchdog_interval(std::chrono::seconds(watchdog_interval))
-            .with_warning_threshold(std::chrono::seconds(warning_threshold))
-            .with_executor_idle_timeout(std::chrono::seconds(idle_timeout))
-            .with_executor_deadlock_timeout(
-                std::chrono::seconds(deadlock_timeout));
-
+    auto pipeline_config = cli::build_pipeline_config(
+        "DFTracer Split", cli->pipeline, cli->watchdog);
     Pipeline pipeline(pipeline_config);
 
     auto start_time = std::chrono::high_resolution_clock::now();
@@ -227,7 +152,7 @@ int main(int argc, char** argv) {
     if (input_files.empty()) {
         DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in %s",
                                  log_dir.c_str());
-        return 1;
+        co_return 1;
     }
 
     DFTRACER_UTILS_LOG_INFO("Found %zu input files", input_files.size());
@@ -247,30 +172,53 @@ int main(int argc, char** argv) {
     auto graph = TaskGraph::builder(
         {.name = "DFTracerSplit", .max_concurrency = executor_threads});
 
-    DFTRACER_UTILS_LOG_INFO("%s", "Creating file processing tasks...");
+    DFTRACER_UTILS_LOG_INFO("%s", "Creating batch index task...");
 
     auto* input_files_ptr = &input_files;
+    auto batch_index_task = make_task(
+        [input_files_ptr, checkpoint_size, index_dir,
+         executor_threads](CoroScope& ctx) -> coro::CoroTask<void> {
+            auto index_path =
+                utilities::composites::dft::internal::determine_index_path(
+                    input_files_ptr->front(), index_dir);
+            dftracer::utils::rocksdb::RocksDBManager::instance().reset(
+                index_path);
+
+            auto batch_config =
+                std::make_shared<utilities::indexer::IndexBuildBatchConfig>();
+            batch_config->file_paths = *input_files_ptr;
+            batch_config->index_dir = index_dir;
+            batch_config->checkpoint_size = checkpoint_size;
+            batch_config->parallelism = executor_threads;
+            batch_config->use_batch_write = true;
+            batch_config->rebuild_root_summaries = true;
+
+            auto result =
+                co_await utilities::indexer::IndexBatchBuilderUtility::process(
+                    &ctx, std::move(batch_config));
+            for (const auto& r : result.results) {
+                if (!r.success && !r.error_message.empty()) {
+                    DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s",
+                                             r.file_path.c_str(),
+                                             r.error_message.c_str());
+                }
+            }
+        },
+        "BatchIndex");
+    graph.add(batch_index_task);
+
+    DFTRACER_UTILS_LOG_INFO("%s", "Creating file processing tasks...");
+
     auto file_metadata = graph.parallel<Metadata>(
         input_files.size(),
-        [input_files_ptr, checkpoint_size, force, index_dir, verify](
+        [input_files_ptr, checkpoint_size, index_dir, verify](
             CoroScope&, std::size_t idx) -> coro::CoroTask<Metadata> {
             const auto& file_path = (*input_files_ptr)[idx];
 
-            // Determine index path
             std::string index_path =
                 utilities::composites::dft::internal::determine_index_path(
                     file_path, index_dir);
 
-            // Build index
-            auto idx_input =
-                utilities::indexer::IndexBuildConfig::for_file(file_path)
-                    .with_checkpoint_size(checkpoint_size)
-                    .with_force_rebuild(false)
-                    .with_index_dir(index_dir);
-            co_await utilities::indexer::IndexBuilderUtility{}.process(
-                idx_input);
-
-            // Collect metadata
             auto meta_input =
                 utilities::composites::dft::MetadataCollectorUtilityInput::
                     from_file(file_path)
@@ -285,6 +233,10 @@ int main(int argc, char** argv) {
         },
         {.name = "ProcessFile"});
 
+    for (const auto& meta_task : file_metadata.tasks()) {
+        meta_task->depends_on(batch_index_task);
+    }
+
     DFTRACER_UTILS_LOG_INFO("%s", "Creating chunk mapping task...");
 
     auto manifests_group = graph.reduce<std::vector<ChunkManifest>>(
@@ -359,7 +311,6 @@ int main(int argc, char** argv) {
                 results.push_back(co_await future);
             }
 
-            // Sort by chunk index
             std::sort(results.begin(), results.end(),
                       [](const ExtractResult& a, const ExtractResult& b) {
                           return a.chunk_index < b.chunk_index;
@@ -384,20 +335,15 @@ int main(int argc, char** argv) {
             std::vector<Metadata> all_metadata;
         };
 
-        // Verification task receives both extraction results and metadata.
-        // Both are passed via combiner so the scheduler keeps parent
-        // results alive until this task consumes them.
         task_verify_chunks = make_task(
             [](CoroScope&, const VerifyInput& input)
                 -> coro::CoroTask<
                     utilities::composites::ChunkVerificationUtilityOutput> {
-                // Sum output hashes from extraction results
                 std::size_t output_hash = 0;
                 for (const auto& chunk : input.chunks) {
                     output_hash += chunk.event_hash;
                 }
 
-                // Sum input hashes from metadata (computed during collection)
                 std::size_t input_hash = 0;
                 for (const auto& meta : input.all_metadata) {
                     if (!meta.success) continue;
@@ -411,8 +357,6 @@ int main(int argc, char** argv) {
             },
             "VerifyChunks");
 
-        // Depend on both extract results and metadata tasks.
-        // The combiner collects parent outputs into the typed struct.
         task_verify_chunks->depends_on(task_extract_chunks);
         for (const auto& meta_task : file_metadata.tasks()) {
             task_verify_chunks->depends_on(meta_task);
@@ -420,8 +364,6 @@ int main(int argc, char** argv) {
 
         task_verify_chunks->with_combiner(
             [](const std::vector<std::any>& inputs) -> std::any {
-                // inputs[0] = ExtractChunksOutput (from extract task)
-                // inputs[1..N] = Metadata (from each metadata task)
                 auto chunks = std::any_cast<ExtractChunksOutput>(inputs[0]);
 
                 std::vector<Metadata> all_metadata;
@@ -441,12 +383,10 @@ int main(int argc, char** argv) {
     // Phase 4: Execute Pipeline
     DFTRACER_UTILS_LOG_INFO("%s", "Executing pipeline...");
 
-    pipeline.set_source(file_metadata.tasks());
+    pipeline.set_source(batch_index_task);
     pipeline.set_destination(final_task);
     pipeline.execute();
 
-    // Get results from the destination task only (intermediate task values
-    // are released after pipeline execution)
     auto end_time = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double, std::milli> duration = end_time - start_time;
 
@@ -476,7 +416,6 @@ int main(int argc, char** argv) {
         std::printf("    Output hash: 0x%016" PRIx64 "\n",
                     verify_result.output_hash);
     } else {
-        // Without verification: extract task IS the destination, safe to read
         auto extraction_results =
             task_extract_chunks->get<ExtractChunksOutput>();
 
@@ -503,12 +442,27 @@ int main(int argc, char** argv) {
 
     std::printf("==========================================\n");
 
-    // Cleanup temporary index directory if created
     if (!temp_index_dir.empty() && fs::exists(temp_index_dir)) {
         DFTRACER_UTILS_LOG_INFO("Cleaning up temporary index directory: %s",
                                 temp_index_dir.c_str());
         fs::remove_all(temp_index_dir);
     }
 
-    return exit_code;
+    co_return exit_code;
+}
+
+int main(int argc, char** argv) {
+    DFTRACER_UTILS_LOGGER_INIT();
+
+    argparse::ArgumentParser program("dftracer_split",
+                                     DFTRACER_UTILS_PACKAGE_VERSION);
+    program.add_description(
+        "Split DFTracer traces into equal-sized chunks using explicit pipeline "
+        "with maximum parallelism");
+
+    SplitArgParse cli(program);  // local `cli` object, not the cli namespace
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;  // exit code 1 when parse() fails
+
+    return run_split(&cli).get();  // process exit code is run_split's result
+}
diff --git a/src/dftracer/utils/binaries/dftracer_stats.cpp b/src/dftracer/utils/binaries/dftracer_stats.cpp
index 5b9e322d..c5e05210 100644
--- a/src/dftracer/utils/binaries/dftracer_stats.cpp
+++ b/src/dftracer/utils/binaries/dftracer_stats.cpp
@@ -1,45 +1,55 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
+#include <dftracer/utils/core/rocksdb/db_manager.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
+#include <dftracer/utils/core/utilities/behaviors/behavior_chain.h>
+#include <dftracer/utils/core/utilities/utility_executor.h>
+#include <dftracer/utils/core/utils/timer.h>
 #include <dftracer/utils/utilities/common/json/json.h>
 #include <dftracer/utils/utilities/common/json/json_value.h>
 #include <dftracer/utils/utilities/common/query/query.h>
 #include <dftracer/utils/utilities/composites/dft/event.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/queries/queries.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h>
+#include <dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h>
 #include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
 #include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
+#include <dftracer/utils/utilities/indexer/internal/index_batch_writer.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
+#include <dftracer/utils/utilities/indexer/internal/transaction_scope.h>
 
 #include <algorithm>
-#include <argparse/argparse.hpp>
 #include <array>
 #include <atomic>
 #include <chrono>
+#include <cinttypes>
 #include <cstdio>
-#include <iostream>
+#include <memory>
 #include <mutex>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
+#include "common_cli.h"
+
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::composites::dft;
@@ -50,35 +60,297 @@ using common::query::Query;
 using dftracer::utils::utilities::composites::dft::DFTracerEvent;
 using dftracer::utils::utilities::fileio::lines::sources::
     async_streaming_gz_lines;
-using dftracer::utils::utilities::indexer::IndexBuildConfig;
-using dftracer::utils::utilities::indexer::IndexBuilderUtility;
+using dftracer::utils::utilities::indexer::ChunkStatistics;
+using dftracer::utils::utilities::indexer::FileRegistryEntry;
+using dftracer::utils::utilities::indexer::has_capability;
 using dftracer::utils::utilities::indexer::IndexDatabase;
+using dftracer::utils::utilities::indexer::IndexFileEntryCapability;
+using dftracer::utils::utilities::indexer::RootStatisticsResult;
+namespace cli = dftracer::utils::cli;
+
+struct StatsConfig {
+    std::string directory;
+    std::string index_dir;  // directory where .dftindex stores are created
+    bool json_output = false;
+    std::uint64_t top_n = 0;           // top-N result limit; 0 = show all
+    std::uint64_t top_n_pid_tid = 10;  // max PID:TID pairs shown; 0 = show all
+    bool no_auto_index = false;        // don't build missing .dftindex stores
+    std::size_t checkpoint_size = 0;
+    std::size_t executor_threads = 0;
+    std::optional<Query> query;  // parsed --query; empty when not supplied
+    std::vector<std::string> filter_names;
+    std::vector<std::string> filter_cats;
+    std::vector<std::string> group_by;  // DETAILED reports default to {"name"}
+    StatisticsQueryType report_type = StatisticsQueryType::SUMMARY;
+};
+
+class StatsArgParse : public cli::ArgParse {
+   public:
+    cli::DirectoryArgs directory{cli::DirMode::DEFAULT_EMPTY};
+    cli::FilesArgs files_args{"Trace files to inspect (.pfw, .pfw.gz)"};
+    cli::PipelineArgs pipeline;
+    cli::IndexingArgs indexing;
+    cli::QueryArgs query_args;
+
+    bool json_output = false;
+    std::string report_str = "summary";
+    std::uint64_t top_n = 0;
+    std::uint64_t top_n_pid_tid = 10;
+    bool no_auto_index = false;
+    std::vector<std::string> group_by;
+    std::vector<std::string> filter_names;
+    std::vector<std::string> filter_cats;
+
+    explicit StatsArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.with_force = false;
+        indexing.index_dir_help =
+            "Directory where .dftindex stores are created";
+        schema(directory, files_args, pipeline, indexing, query_args);
+    }
+
+    bool to_config(StatsConfig& config) const {  // false on a bad --query
+        config.directory = directory.value;
+        config.index_dir = indexing.index_dir;
+        config.json_output = json_output;
+        config.top_n = top_n;
+        config.top_n_pid_tid = top_n_pid_tid;
+        config.no_auto_index = no_auto_index;
+        config.checkpoint_size = indexing.checkpoint_size;
+        config.executor_threads = pipeline.executor_threads;
+        config.filter_names = filter_names;
+        config.filter_cats = filter_cats;
+        config.group_by = group_by;
+
+        const auto& query_str_val = query_args.query;
+        if (!query_str_val.empty()) {  // an empty --query leaves config.query
+            auto result = Query::from_string(query_str_val);
+            if (!result) {
+                DFTRACER_UTILS_LOG_ERROR("Invalid --query: %s",
+                                         result.error().format().c_str());
+                return false;  // malformed --query aborts the conversion
+            }
+            config.query = std::move(*result);
+        }
+
+        config.report_type = parse_report_type_str(report_str);
+
+        if (config.report_type == StatisticsQueryType::DETAILED &&
+            config.group_by.empty()) {
+            config.group_by.push_back("name");  // default grouping dimension
+        }
+
+        return true;
+    }
+
+   protected:
+    void register_args() override {  // options specific to the stats tool
+        parser().add_argument("--json").help("Output in JSON format").flag();
+
+        parser()
+            .add_argument("--report")
+            .help(
+                "Report type: summary, categories, names, pid_tids, "
+                "time_range, "
+                "duration, top-names, top-categories, detailed")
+            .default_value<std::string>("summary");
+
+        parser()
+            .add_argument("--top-n")
+            .help(
+                "Number of results for top-N queries (0 = show all, "
+                "default: 0)")
+            .scan<'d', std::uint64_t>()
+            .default_value(static_cast<std::uint64_t>(0));
+
+        parser()
+            .add_argument("--top-n-pid-tid")
+            .help("Max PID:TID pairs to display (0 = show all, default: 10)")
+            .scan<'d', std::uint64_t>()
+            .default_value(static_cast<std::uint64_t>(10));
+
+        parser()
+            .add_argument("--no-auto-index")
+            .help(
+                "Disable automatic index building for files missing .dftindex")
+            .flag();
+
+        parser()
+            .add_argument("--group-by")
+            .help(
+                "Group detailed statistics by dimension(s): name, cat, pid, "
+                "tid, fhash, hhash, pid_tid. Multiple values create composite "
+                "keys.")
+            .nargs(argparse::nargs_pattern::at_least_one)
+            .default_value<std::vector<std::string>>({});
+
+        parser()
+            .add_argument("--filter-names")
+            .help("Filter by event names")
+            .nargs(argparse::nargs_pattern::any)
+            .default_value<std::vector<std::string>>({});
+
+        parser()
+            .add_argument("--filter-cats")
+            .help("Filter by event categories")
+            .nargs(argparse::nargs_pattern::any)
+            .default_value<std::vector<std::string>>({});
+    }
+
+    void post_parse() override {  // copy parsed option values into members
+        json_output = parser().get<bool>("--json");
+        report_str = parser().get<std::string>("--report");
+        top_n = parser().get<std::uint64_t>("--top-n");
+        top_n_pid_tid = parser().get<std::uint64_t>("--top-n-pid-tid");
+        no_auto_index = parser().get<bool>("--no-auto-index");
+        group_by = parser().get<std::vector<std::string>>("--group-by");
+        filter_names = parser().get<std::vector<std::string>>("--filter-names");
+        filter_cats = parser().get<std::vector<std::string>>("--filter-cats");
+    }
+
+    bool validate() override {  // checks --report and --group-by values
+        const std::vector<std::string> valid_reports = {
+            "summary",   "categories",     "names",
+            "pid_tids",  "time_range",     "duration",
+            "top-names", "top-categories", "detailed"};
+        if (std::find(valid_reports.begin(), valid_reports.end(), report_str) ==
+            valid_reports.end()) {
+            DFTRACER_UTILS_LOG_ERROR("Invalid --report value: %s",
+                                     report_str.c_str());
+            return false;
+        }
+
+        const std::vector<std::string> valid_dims = {
+            "name", "cat", "pid", "tid", "fhash", "hhash", "pid_tid"};
+        for (const auto& dim : group_by) {
+            if (std::find(valid_dims.begin(), valid_dims.end(), dim) ==
+                valid_dims.end()) {
+                DFTRACER_UTILS_LOG_ERROR(
+                    "Invalid --group-by dimension: %s. Valid: name, cat, pid, "
+                    "tid, fhash, hhash, pid_tid",
+                    dim.c_str());
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+   private:
+    static StatisticsQueryType parse_report_type_str(const std::string& s) {
+        if (s == "summary") return StatisticsQueryType::SUMMARY;
+        if (s == "categories") return StatisticsQueryType::CATEGORIES;
+        if (s == "names") return StatisticsQueryType::NAMES;
+        if (s == "pid_tids") return StatisticsQueryType::PID_TIDS;
+        if (s == "time_range") return StatisticsQueryType::TIME_RANGE;
+        if (s == "duration") return StatisticsQueryType::DURATION_STATS;
+        if (s == "top-names") return StatisticsQueryType::TOP_N_NAMES;
+        if (s == "top-categories") return StatisticsQueryType::TOP_N_CATEGORIES;
+        if (s == "detailed") return StatisticsQueryType::DETAILED;
+        return StatisticsQueryType::SUMMARY;  // fallback for unknown strings
+    }
+};
+
+using indexing::FileWorkItem;
+using indexing::IndexResolverUtility;
+using indexing::ResolvedFile;
+using indexing::ResolverInput;
+using indexing::ResolverResult;
+
+struct IndexPartition {
+    std::vector<FileWorkItem> files_needing_index;
+    std::vector<ResolvedFile> indexed_entries;
+    ResolverResult resolver_result;
+    std::vector<std::pair<std::size_t, TraceStatistics>> precomputed_failures;
+    std::vector<std::pair<std::size_t, TraceStatistics>> precomputed_successes;
+};
+
+struct AggregateStatsResult {
+    std::vector<std::pair<std::size_t, TraceStatistics>> indexed_stats;
+    TraceStatistics total;  // presumably the merge of all files; confirm
+    std::size_t successful_count = 0;
+    std::size_t failed_count = 0;
+    std::int64_t read_elapsed_ns = 0;  // NOTE: signed; ns_to_ms() is unsigned
+    std::unordered_map<std::string, std::uint64_t> read_counters;
+};
+
+struct IndexedRootSnapshot {
+    std::vector<std::string> logical_files;
+    IndexPartition partition;
+};
+
+static void append_failed_stats_result(
+    std::vector<std::pair<std::size_t, TraceStatistics>>& results,
+    std::size_t file_index, const std::string& file_path,
+    const std::string& error_message) {
+    TraceStatistics failed;
+    failed.file_path = file_path;
+    failed.success = false;  // record this file as a failure
+    failed.error_message = error_message;
+    results.emplace_back(file_index, std::move(failed));  // (index, stats)
+}
+
+static void append_empty_indexed_stats_result(
+    std::vector<std::pair<std::size_t, TraceStatistics>>& results,
+    std::size_t file_index, const std::string& file_path,
+    const std::string& index_path) {
+    TraceStatistics stats;
+    stats.file_path = file_path;
+    stats.index_path = index_path;
+    stats.success = true;  // a successful result...
+    stats.num_chunks = 0;  // ...with no chunks; merged stats stay default
+    results.emplace_back(file_index, std::move(stats));
+}
 
-// Files below this compressed size are scanned directly without building
-// `.dftindex` stores. At 8 MB compressed (~160 MB
-// uncompressed with typical 20x JSON compression), a file has only a
-// handful of 32 MB checkpoints — the indexing overhead exceeds the
-// benefit of bloom-filter skip.
-static constexpr std::size_t INDEX_SIZE_THRESHOLD =
-    constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;
-
-static StatisticsQueryType parse_report_type_str(const std::string& s) {
-    if (s == "summary") return StatisticsQueryType::SUMMARY;
-    if (s == "categories") return StatisticsQueryType::CATEGORIES;
-    if (s == "names") return StatisticsQueryType::NAMES;
-    if (s == "pid_tids") return StatisticsQueryType::PID_TIDS;
-    if (s == "time_range") return StatisticsQueryType::TIME_RANGE;
-    if (s == "duration") return StatisticsQueryType::DURATION_STATS;
-    if (s == "top-names") return StatisticsQueryType::TOP_N_NAMES;
-    if (s == "top-categories") return StatisticsQueryType::TOP_N_CATEGORIES;
-    if (s == "detailed") return StatisticsQueryType::DETAILED;
-    return StatisticsQueryType::SUMMARY;
+static double ns_to_ms(std::uint64_t ns) {
+    return static_cast<double>(ns) / 1'000'000.0;  // 1e6 ns per millisecond
+}
+
+static coro::CoroTask<std::optional<TraceStatistics>>
+process_index_group_root_summary(std::string index_path,
+                                 std::size_t expected_indexed_files,
+                                 StatisticsQueryType report_type) {
+    IndexDatabase idx_db(
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+    auto scalar_stats = idx_db.query_root_scalar_stats();  // may be empty
+
+    if (!scalar_stats || scalar_stats->num_files != expected_indexed_files) {
+        co_return std::nullopt;  // no root stats, or file count mismatch
+    }
+
+    TraceStatistics result;
+    result.file_path = index_path;  // the index path doubles as file_path
+    result.index_path = index_path;
+    result.num_chunks = scalar_stats->num_chunks;
+    result.merged = scalar_stats->stats;
+    result.success = true;
+
+    const bool needs_categories =
+        report_type == StatisticsQueryType::SUMMARY ||
+        report_type == StatisticsQueryType::CATEGORIES ||
+        report_type == StatisticsQueryType::TOP_N_CATEGORIES;
+    const bool needs_names = report_type == StatisticsQueryType::NAMES ||
+                             report_type == StatisticsQueryType::TOP_N_NAMES;
+    const bool needs_pid_tids = report_type == StatisticsQueryType::SUMMARY ||
+                                report_type == StatisticsQueryType::PID_TIDS;
+
+    if (needs_categories) {  // SUMMARY, CATEGORIES, TOP_N_CATEGORIES
+        idx_db.merge_root_category_counts_into(result.merged);
+    }
+    if (needs_names) {  // NAMES, TOP_N_NAMES
+        idx_db.merge_root_name_counts_into(result.merged);
+    }
+    if (needs_pid_tids) {  // SUMMARY, PID_TIDS
+        idx_db.merge_root_pid_tid_counts_into(result.merged);
+    }
+
+    co_return result;  // other report types get the scalar stats only
+}
 
 using CountPair = std::pair<std::string, std::uint64_t>;
 
-static std::vector<CountPair> sorted_by_count_desc(
-    const std::unordered_map<std::string, std::uint64_t>& counts) {
+template <typename Map>
+static std::vector<CountPair> sorted_by_count_desc(const Map& counts) {
     std::vector<CountPair> sorted(counts.begin(), counts.end());
     std::sort(sorted.begin(), sorted.end(),
               [](const CountPair& a, const CountPair& b) {
@@ -456,214 +728,14 @@ static void print_text_detailed(
     std::printf("\n");
 }
 
-// Direct-scan a small .pfw.gz file without any persisted index store.
-// Streams lines via async_streaming_gz_lines, parses each with yyjson,
-// and accumulates stats via ChunkStatistics::update_from_event().
-static coro::CoroTask<TraceStatistics> direct_scan_trace_statistics(
-    std::string file_path) {
-    TraceStatistics result;
-    result.file_path = file_path;
-
-    try {
-        auto gen = async_streaming_gz_lines(file_path);
-        ChunkStatistics stats;
-
-        while (auto line = co_await gen.next()) {
-            if (line->content.empty()) continue;
-
-            yyjson_doc* doc = yyjson_read_opts(
-                const_cast<char*>(line->content.data()), line->content.size(),
-                YYJSON_READ_NOFLAG, nullptr, nullptr);
-            if (!doc) continue;
-
-            yyjson_val* root = yyjson_doc_get_root(doc);
-            if (root && yyjson_is_obj(root)) {
-                DFTracerEvent ev;
-                if (DFTracerEvent::parse(root, ev) && !ev.is_metadata()) {
-                    stats.update_from_event(ev.name, ev.cat, ev.pid, ev.tid,
-                                            ev.ts, ev.dur);
-                }
-            }
-            yyjson_doc_free(doc);
-        }
-
-        result.merged = stats;
-        result.num_chunks = 1;
-        result.success = true;
-    } catch (const std::exception& e) {
-        result.success = false;
-        result.error_message = "Failed to scan: " + file_path + ": " + e.what();
-    }
-
-    co_return result;
-}
-
-// Direct-scan a small .pfw.gz for the detailed query path.
-// Applies name/category filters and group-by dimensions.
-static coro::CoroTask<DetailedStatistics> direct_scan_detailed_statistics(
-    std::string file_path, const std::vector<std::string>* filter_names_ptr,
-    const std::vector<std::string>* filter_cats_ptr,
-    const std::vector<std::string>* group_by_ptr) {
-    DetailedStatistics result;
-
-    // Build filter sets from pointer args (pointers are safe: caller's scope
-    // outlives this coroutine).
-    std::unordered_set<std::string_view> name_filter;
-    std::unordered_set<std::string_view> cat_filter;
-    for (const auto& n : *filter_names_ptr) name_filter.insert(n);
-    for (const auto& c : *filter_cats_ptr) cat_filter.insert(c);
-    bool has_name_filter = !name_filter.empty();
-    bool has_cat_filter = !cat_filter.empty();
-    bool has_grouping = !group_by_ptr->empty();
-
-    // I/O event names (same list as chunk_detail_scanner_utility.cpp)
-    static constexpr auto IO_EVENTS = std::to_array<std::string_view>(
-        {"read", "write", "pread", "pwrite", "pread64", "pwrite64", "readv",
-         "writev"});
-    auto is_io = [](std::string_view name) {
-        return std::find(IO_EVENTS.begin(), IO_EVENTS.end(), name) !=
-               IO_EVENTS.end();
-    };
-
-    try {
-        auto gen = async_streaming_gz_lines(file_path);
-
-        while (auto line = co_await gen.next()) {
-            if (line->content.empty()) continue;
-
-            yyjson_doc* doc = yyjson_read_opts(
-                const_cast<char*>(line->content.data()), line->content.size(),
-                YYJSON_READ_NOFLAG, nullptr, nullptr);
-            if (!doc) continue;
-
-            yyjson_val* root = yyjson_doc_get_root(doc);
-            if (root && yyjson_is_obj(root)) {
-                DFTracerEvent ev;
-                if (DFTracerEvent::parse(root, ev) && !ev.is_metadata()) {
-                    std::string_view name_sv = ev.name;
-                    std::string_view cat_sv = ev.cat;
-
-                    bool passes = true;
-                    if (has_name_filter &&
-                        name_filter.find(name_sv) == name_filter.end()) {
-                        passes = false;
-                    }
-                    if (passes && has_cat_filter &&
-                        cat_filter.find(cat_sv) == cat_filter.end()) {
-                        passes = false;
-                    }
-
-                    if (passes) {
-                        double dur = static_cast<double>(ev.dur);
-                        result.duration.update(dur);
-
-                        std::string io_key;
-
-                        if (has_grouping) {
-                            // Build group key inline (same logic as
-                            // chunk_detail_scanner_utility.cpp)
-                            constexpr std::size_t KEY_BUF_SIZE =
-                                utilities::common::json::YYJSON_LINE_POOL_SIZE;
-                            char buf[KEY_BUF_SIZE];
-                            char* p = buf;
-                            char* end = buf + KEY_BUF_SIZE - 1;
-
-                            auto append_sv = [&](std::string_view sv) {
-                                std::size_t n =
-                                    std::min(sv.size(),
-                                             static_cast<std::size_t>(end - p));
-                                std::memcpy(p, sv.data(), n);
-                                p += n;
-                            };
-                            auto append_uint = [&](std::uint64_t v) {
-                                p += std::snprintf(
-                                    p, static_cast<std::size_t>(end - p + 1),
-                                    "%llu", static_cast<unsigned long long>(v));
-                            };
-
-                            for (std::size_t i = 0; i < group_by_ptr->size();
-                                 ++i) {
-                                if (p >= end) break;
-                                if (i > 0) *p++ = '|';
-                                const auto& dim = (*group_by_ptr)[i];
-                                if (dim == "name") {
-                                    append_sv(ev.name);
-                                } else if (dim == "cat") {
-                                    append_sv(ev.cat);
-                                } else if (dim == "pid") {
-                                    append_uint(ev.pid);
-                                } else if (dim == "tid") {
-                                    append_uint(ev.tid);
-                                } else if (dim == "pid_tid") {
-                                    append_uint(ev.pid);
-                                    if (p < end) *p++ = ':';
-                                    append_uint(ev.tid);
-                                } else if (dim == "fhash") {
-                                    if (ev.args.exists())
-                                        append_sv(ev.args["fhash"]
-                                                      .get<std::string_view>());
-                                } else if (dim == "hhash") {
-                                    if (ev.args.exists())
-                                        append_sv(ev.args["hhash"]
-                                                      .get<std::string_view>());
-                                }
-                            }
-                            std::string key(buf, p);
-                            result.grouped_duration[key].update(dur);
-                            result.group_key_category.emplace(
-                                key, std::string(cat_sv));
-                            io_key = std::move(key);
-                        } else {
-                            io_key = "__global__";
-                        }
-
-                        if (is_io(name_sv) && ev.args.exists()) {
-                            auto ret_opt =
-                                ev.args["ret"].get_optional<std::int64_t>();
-                            if (ret_opt.has_value() && ret_opt.value() > 0) {
-                                double ret =
-                                    static_cast<double>(ret_opt.value());
-                                auto& io = result.grouped_io[io_key];
-                                io.duration.update(dur);
-                                io.size.update(ret);
-                                if (dur > 0) {
-                                    io.bandwidth.update(ret * 1e6 / dur);
-                                }
-                                auto offset_opt =
-                                    ev.args["offset"]
-                                        .get_optional<std::uint64_t>();
-                                if (offset_opt.has_value()) {
-                                    io.offset.update(static_cast<double>(
-                                        offset_opt.value()));
-                                }
-                            }
-                        }
-
-                        result.events_scanned++;
-                    }
-                }
-            }
-            yyjson_doc_free(doc);
-        }
-
-        result.chunks_scanned = 1;
-    } catch (const std::exception&) {
-        // Return empty result on open/read failure (matches original behaviour)
-    }
-
-    co_return result;
-}
-
 // Per-chunk scanning coroutine for parallel detailed stats.
-// Scans a single chunk and merges results into shared file_detailed.
+// Scans a single chunk; returns its stats on success, std::nullopt otherwise.
-static coro::CoroTask<void> scan_chunk_detailed(
+static coro::CoroTask<std::optional<DetailedStatistics>> scan_chunk_detailed(
     std::string file_path, std::string index_path, std::size_t checkpoint_size,
     std::size_t file_size, std::size_t num_ckpts, std::uint64_t ckpt_idx,
     const std::vector<std::string>* filter_names_ptr,
     const std::vector<std::string>* filter_cats_ptr,
-    const std::vector<std::string>* group_by_ptr,
-    std::shared_ptr<DetailedStatistics> file_detailed,
-    std::shared_ptr<std::mutex> chunk_mutex) {
+    const std::vector<std::string>* group_by_ptr) {
     std::size_t start_byte = 0;
     std::size_t end_byte = file_size;
 
@@ -689,11 +761,10 @@ static coro::CoroTask<void> scan_chunk_detailed(
     auto scan_output = co_await scanner.process(scan_input);
 
     if (scan_output.success) {
-        std::lock_guard<std::mutex> lock(*chunk_mutex);
-        file_detailed->merge(scan_output.stats);
+        co_return scan_output.stats;
     }
 
-    co_return;
+    co_return std::nullopt;
 }
 
 // Per-file detailed stats coroutine. Spawns parallel chunk scans,
@@ -761,67 +832,77 @@ static coro::CoroTask<void> process_file_detailed(
         }
     }
 
-    // Scan candidate chunks in parallel
-    auto file_detailed = std::make_shared<DetailedStatistics>();
-    file_detailed->chunks_skipped =
+    // Scan candidate chunks in parallel, then merge sequentially per file.
+    DetailedStatistics file_detailed;
+    file_detailed.chunks_skipped =
         total_checkpoints - candidate_checkpoints.size();
-    auto chunk_mutex = std::make_shared<std::mutex>();
-
-    co_await fctx.scope([file_path, index_path, checkpoint_size, file_size,
-                         num_ckpts, filter_names_ptr, filter_cats_ptr,
-                         group_by_ptr, file_detailed, chunk_mutex,
-                         candidates = std::move(candidate_checkpoints)](
-                            CoroScope& chunk_scope) -> coro::CoroTask<void> {
-        for (auto ckpt_idx : candidates) {
-            chunk_scope.spawn(
-                [file_path, index_path, checkpoint_size, file_size, num_ckpts,
-                 ckpt_idx, filter_names_ptr, filter_cats_ptr, group_by_ptr,
-                 file_detailed,
-                 chunk_mutex](CoroScope& /*cctx*/) -> coro::CoroTask<void> {
-                    co_return co_await scan_chunk_detailed(
-                        file_path, index_path, checkpoint_size, file_size,
-                        num_ckpts, ckpt_idx, filter_names_ptr, filter_cats_ptr,
-                        group_by_ptr, file_detailed, chunk_mutex);
-                });
+    std::vector<std::uint64_t> candidates = std::move(candidate_checkpoints);
+    std::vector<std::optional<DetailedStatistics>> chunk_results(
+        candidates.size());
+
+    const auto* file_path_ptr = &file_path;
+    const auto* index_path_ptr = &index_path;
+    auto* candidates_ptr = &candidates;
+    auto* chunk_results_ptr = &chunk_results;
+    co_await fctx.scope(
+        [file_path_ptr, index_path_ptr, checkpoint_size, file_size, num_ckpts,
+         filter_names_ptr, filter_cats_ptr, group_by_ptr, candidates_ptr,
+         chunk_results_ptr](CoroScope& chunk_scope) -> coro::CoroTask<void> {
+            for (std::size_t result_idx = 0;
+                 result_idx < candidates_ptr->size(); ++result_idx) {
+                std::uint64_t ckpt_idx = (*candidates_ptr)[result_idx];
+                chunk_scope.spawn(
+                    [file_path_ptr, index_path_ptr, checkpoint_size, file_size,
+                     num_ckpts, ckpt_idx, filter_names_ptr, filter_cats_ptr,
+                     group_by_ptr, chunk_results_ptr,
+                     result_idx](CoroScope& /*cctx*/) -> coro::CoroTask<void> {
+                        (*chunk_results_ptr)[result_idx] =
+                            co_await scan_chunk_detailed(
+                                *file_path_ptr, *index_path_ptr,
+                                checkpoint_size, file_size, num_ckpts, ckpt_idx,
+                                filter_names_ptr, filter_cats_ptr,
+                                group_by_ptr);
+                        co_return;
+                    });
+            }
+            co_return;
+        });
+
+    for (const auto& chunk_result : chunk_results) {
+        if (chunk_result.has_value()) {
+            file_detailed.merge(*chunk_result);
         }
-        co_return;
-    });
+    }
 
     // Hash resolution (sequential, all chunks done)
     std::unordered_map<std::string, std::string> hash_resolutions;
     if (needs_hash_resolution && fs::exists(index_path)) {
         try {
             IndexDatabase idx_db(index_path);
-            auto logical =
-                utilities::indexer::internal::get_logical_path(file_path);
-            int file_info_id = idx_db.get_file_info_id(logical);
-            if (file_info_id >= 0) {
-                auto resolve_hashes = [&](const std::string& dim) {
-                    for (const auto& [key, _] :
-                         file_detailed->grouped_duration) {
-                        if (hash_resolutions.count(key) == 0) {
-                            auto resolved =
-                                idx_db.query_resolved_by_hash(dim, key);
-                            if (resolved.has_value()) {
-                                hash_resolutions[key] = resolved.value();
-                            }
+            auto resolve_hashes = [&](IndexDatabase::HashType hash_type) {
+                for (const auto& [key, _] : file_detailed.grouped_duration) {
+                    if (hash_resolutions.count(key) == 0) {
+                        auto resolved = idx_db.resolve_hash(hash_type, key);
+                        if (resolved.has_value()) {
+                            hash_resolutions[key] = resolved.value();
                         }
                     }
-                    for (const auto& [key, _] : file_detailed->grouped_io) {
-                        if (hash_resolutions.count(key) == 0) {
-                            auto resolved =
-                                idx_db.query_resolved_by_hash(dim, key);
-                            if (resolved.has_value()) {
-                                hash_resolutions[key] = resolved.value();
-                            }
+                }
+                for (const auto& [key, _] : file_detailed.grouped_io) {
+                    if (hash_resolutions.count(key) == 0) {
+                        auto resolved = idx_db.resolve_hash(hash_type, key);
+                        if (resolved.has_value()) {
+                            hash_resolutions[key] = resolved.value();
                         }
                     }
-                };
+                }
+            };
 
-                for (const auto& dim : *group_by_ptr) {
-                    if (dim == "fhash" || dim == "hhash") {
-                        resolve_hashes(dim);
-                    }
+            for (const auto& dim : *group_by_ptr) {
+                if (dim == "fhash") {
+                    resolve_hashes(IndexDatabase::HashType::FILE);
+                } else if (dim == "hhash") {
+                    resolve_hashes(IndexDatabase::HashType::HOST);
                 }
             }
         } catch (const std::exception& e) {
@@ -832,7 +913,7 @@ static coro::CoroTask<void> process_file_detailed(
 
     // Output per-file results
     if (json_output) {
-        std::string detail_json = file_detailed->to_json();
+        std::string detail_json = file_detailed.to_json();
         std::string json_obj = std::string("{\"file_path\": \"") + file_path +
                                "\", \"detailed\": " + detail_json + "}";
         std::lock_guard<std::mutex> lock(*output_mutex_ptr);
@@ -840,14 +921,14 @@ static coro::CoroTask<void> process_file_detailed(
     } else {
         std::lock_guard<std::mutex> lock(*output_mutex_ptr);
         print_text_detailed(
-            file_path, *file_detailed,
-            file_detailed->chunks_scanned + file_detailed->chunks_skipped,
-            top_n, hash_resolutions);
+            file_path, file_detailed,
+            file_detailed.chunks_scanned + file_detailed.chunks_skipped, top_n,
+            hash_resolutions);
     }
 
     {
         std::lock_guard<std::mutex> lock(*aggregate_mutex_ptr);
-        aggregate_detailed_ptr->merge(*file_detailed);
+        aggregate_detailed_ptr->merge(file_detailed);
     }
 
     co_return;
@@ -855,16 +936,12 @@ static coro::CoroTask<void> process_file_detailed(
 
 static void run_detailed_query_workers(
     CoroScope& scope, const std::vector<std::string>* files_ptr,
-    const std::vector<std::string>* small_files_ptr,
-    std::size_t executor_threads, std::string index_dir,
+    std::size_t executor_threads, const std::string* index_dir_ptr,
     std::size_t checkpoint_size, bool needs_hash_resolution, bool json_output,
     std::size_t top_n, const common::query::Query* qp,
     const std::vector<std::string>* fn, const std::vector<std::string>* fc,
     const std::vector<std::string>* gb, DetailedStatistics* ad, std::mutex* am,
     std::mutex* om, std::vector<std::pair<std::size_t, std::string>>* jr) {
-    auto small_set = std::make_shared<std::unordered_set<std::string>>(
-        small_files_ptr->begin(), small_files_ptr->end());
-
     auto file_chan = coro::make_channel<std::size_t>(executor_threads * 2);
 
     scope.spawn([ch = file_chan->producer(),
@@ -879,98 +956,122 @@ static void run_detailed_query_workers(
     });
 
     for (std::size_t w = 0; w < executor_threads; ++w) {
-        scope.spawn([file_chan, files_ptr, index_dir, checkpoint_size,
-                     needs_hash_resolution, json_output, top_n, small_set, qp,
-                     fn, fc, gb, ad, am, om,
+        scope.spawn([ch = file_chan->consumer(), files_ptr, index_dir_ptr,
+                     checkpoint_size, needs_hash_resolution, json_output, top_n,
+                     qp, fn, fc, gb, ad, am, om,
                      jr](CoroScope& fctx) -> coro::CoroTask<void> {
-            while (auto fi_opt = co_await file_chan->receive()) {
+            while (auto fi_opt = co_await ch.receive()) {
                 std::size_t fi = *fi_opt;
-                const auto& file_path = (*files_ptr)[fi];
-                bool is_small = small_set->count(file_path) > 0;
-
-                if (is_small) {
-                    auto stats = co_await direct_scan_detailed_statistics(
-                        file_path, fn, fc, gb);
-                    {
-                        std::lock_guard<std::mutex> lock(*am);
-                        ad->merge(stats);
-                    }
-                    continue;
-                }
+                std::string file_path = (*files_ptr)[fi];
                 co_await process_file_detailed(
-                    fctx, file_path, fi, index_dir, checkpoint_size,
-                    needs_hash_resolution, json_output, top_n, qp, fn, fc, gb,
-                    ad, am, om, jr);
+                    fctx, std::move(file_path), fi, *index_dir_ptr,
+                    checkpoint_size, needs_hash_resolution, json_output, top_n,
+                    qp, fn, fc, gb, ad, am, om, jr);
             }
             co_return;
         });
     }
 }
 
-static coro::CoroTask<int> run_stats(argparse::ArgumentParser& program) {
-    std::string directory = program.get<std::string>("--directory");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    bool json_output = program.get<bool>("--json");
-    std::string report_str = program.get<std::string>("--report");
-    std::uint64_t top_n = program.get<std::uint64_t>("--top-n");
-    std::uint64_t top_n_pid_tid = program.get<std::uint64_t>("--top-n-pid-tid");
-    bool no_auto_index = program.get<bool>("--no-auto-index");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    auto query_str = program.get<std::string>("--query");
-    auto group_by = program.get<std::vector<std::string>>("--group-by");
-
-    std::optional<Query> query;
-    std::vector<std::string> filter_names;
-    std::vector<std::string> filter_cats;
-
-    if (!query_str.empty()) {
-        auto result = Query::from_string(query_str);
-        if (!result) {
-            DFTRACER_UTILS_LOG_ERROR("Invalid --query: %s",
-                                     result.error().format().c_str());
-            co_return 1;
-        }
-        query = std::move(*result);
-    }
-
-    auto report_type = parse_report_type_str(report_str);
-
-    // Default --group-by to "name" for detailed query so users always see
-    // per-event breakdowns (not just global I/O aggregates)
-    if (report_type == StatisticsQueryType::DETAILED && group_by.empty()) {
-        group_by.push_back("name");
-    }
+static void print_text_query_output(const TraceStatistics& stats,
+                                    const StatisticsQueryOutput& output) {
+    std::printf("========================================\n");
+    std::printf("File: %s\n", stats.file_path.c_str());
+    std::printf("========================================\n");
+    std::printf("  Chunks: %llu\n", (unsigned long long)stats.num_chunks);
+    std::printf("  Events Scanned: %llu\n",
+                (unsigned long long)output.total_events);
+
+    switch (
+        output.query_type_name == "categories" ? StatisticsQueryType::CATEGORIES
+        : output.query_type_name == "names"    ? StatisticsQueryType::NAMES
+        : output.query_type_name == "pid_tids" ? StatisticsQueryType::PID_TIDS
+        : output.query_type_name == "time_range"
+            ? StatisticsQueryType::TIME_RANGE
+        : output.query_type_name == "duration_stats"
+            ? StatisticsQueryType::DURATION_STATS
+        : output.query_type_name == "top_n_names"
+            ? StatisticsQueryType::TOP_N_NAMES
+        : output.query_type_name == "top_n_categories"
+            ? StatisticsQueryType::TOP_N_CATEGORIES
+            : StatisticsQueryType::SUMMARY) {
+        case StatisticsQueryType::CATEGORIES:
+        case StatisticsQueryType::TOP_N_CATEGORIES:
+            std::printf("\n  Categories (%zu):\n", output.results.size());
+            for (const auto& [name, count] : output.results) {
+                std::printf("    %-40s %llu\n", name.c_str(),
+                            (unsigned long long)count);
+            }
+            break;
+
+        case StatisticsQueryType::NAMES:
+        case StatisticsQueryType::TOP_N_NAMES:
+            std::printf("\n  Names (%zu):\n", output.results.size());
+            for (const auto& [name, count] : output.results) {
+                std::printf("    %-40s %llu\n", name.c_str(),
+                            (unsigned long long)count);
+            }
+            break;
 
-    // Validate group-by dimensions
-    const std::vector<std::string> valid_dims = {
-        "name", "cat", "pid", "tid", "fhash", "hhash", "pid_tid"};
-    for (const auto& dim : group_by) {
-        if (std::find(valid_dims.begin(), valid_dims.end(), dim) ==
-            valid_dims.end()) {
-            DFTRACER_UTILS_LOG_ERROR(
-                "Invalid --group-by dimension: %s. Valid: name, cat, pid, "
-                "tid, fhash, hhash, pid_tid",
-                dim.c_str());
-            co_return 1;
-        }
+        case StatisticsQueryType::PID_TIDS:
+            std::printf("\n  Process/Thread Pairs (%zu):\n",
+                        output.results.size());
+            for (const auto& [name, count] : output.results) {
+                std::printf("    %-40s %llu\n", name.c_str(),
+                            (unsigned long long)count);
+            }
+            break;
+
+        case StatisticsQueryType::TIME_RANGE:
+            std::printf("\n  Time Span: %.6f seconds\n",
+                        output.time_span_seconds);
+            std::printf("  Min Timestamp: %llu us\n",
+                        (unsigned long long)output.min_timestamp_us);
+            std::printf("  Max Timestamp: %llu us\n",
+                        (unsigned long long)output.max_timestamp_us);
+            break;
+
+        case StatisticsQueryType::DURATION_STATS:
+            std::printf("\n  Duration (all events):\n");
+            std::printf("    Count: %llu   Mean: %.1f us   Stddev: %.1f us\n",
+                        (unsigned long long)output.duration_count,
+                        output.duration_mean_us, output.duration_stddev_us);
+            std::printf("    Min: %llu us   Max: %llu us\n",
+                        (unsigned long long)output.duration_min_us,
+                        (unsigned long long)output.duration_max_us);
+            break;
+
+        case StatisticsQueryType::SUMMARY:
+        case StatisticsQueryType::DETAILED:
+            break;
     }
+}
 
-    // Collect files
-    std::vector<std::string> files;
+static coro::CoroTask<std::vector<std::string>> collect_files(
+    CoroScope& ctx, const std::vector<std::string>& cli_files,
+    std::string directory) {
     if (!directory.empty()) {
         if (!fs::exists(directory)) {
             DFTRACER_UTILS_LOG_ERROR("Directory does not exist: %s",
                                      directory.c_str());
-            co_return 1;
+            co_return std::vector<std::string>{};
         }
 
-        PatternDirectoryScannerUtility scanner;
+        auto scanner = std::make_shared<PatternDirectoryScannerUtility>();
         PatternDirectoryScannerUtilityInput scan_input{
-            directory, {".pfw", ".pfw.gz"}, false};
-        auto matched = co_await scanner.process(scan_input);
-
+            directory, {".pfw", ".pfw.gz"}, false, false};
+        utilities::behaviors::BehaviorChain<PatternDirectoryScannerUtilityInput,
+                                            std::vector<filesystem::FileEntry>>
+            chain;
+        utilities::behaviors::UtilityExecutor<
+            PatternDirectoryScannerUtilityInput,
+            std::vector<filesystem::FileEntry>, utilities::tags::Parallelizable,
+            utilities::tags::NeedsContext>
+            executor(scanner, std::move(chain));
+        auto matched = co_await executor.execute_with_context(ctx, scan_input);
+
+        std::vector<std::string> files;
+        files.reserve(matched.size());
         for (const auto& entry : matched) {
             files.push_back(entry.path.string());
         }
@@ -978,357 +1079,469 @@ static coro::CoroTask<int> run_stats(argparse::ArgumentParser& program) {
         if (files.empty()) {
             DFTRACER_UTILS_LOG_ERROR("No .pfw or .pfw.gz files found in: %s",
                                      directory.c_str());
-            co_return 1;
         }
-    } else {
-        files = program.get<std::vector<std::string>>("--files");
+        co_return files;
+    }
 
-        if (files.empty()) {
-            DFTRACER_UTILS_LOG_ERROR(
-                "%s", "No files or directory specified. Use --help for usage.");
-            std::cerr << program;
-            co_return 1;
-        }
+    if (cli_files.empty()) {
+        DFTRACER_UTILS_LOG_ERROR(
+            "%s", "No files or directory specified. Use --help for usage.");
     }
+    co_return cli_files;
+}
 
-    // Partition files: large files get indexed, small files are scanned
-    // directly to avoid creating sidecar files on metadata-sensitive
-    // filesystems (e.g. Lustre).
-    std::vector<std::string> files_needing_index;
-    std::vector<std::string> small_files;
-    for (const auto& file_path : files) {
-        std::string index_path =
-            internal::determine_index_path(file_path, index_dir);
-        if (fs::exists(index_path)) {
-            try {
-                IndexDatabase db(index_path);
-                auto logical =
-                    utilities::indexer::internal::get_logical_path(file_path);
-                int fid = db.get_file_info_id(logical);
-                if (fid >= 0 && db.has_bloom_data(fid)) continue;
-            } catch (...) {
-            }
-        }
-        std::error_code ec;
-        auto fsize = fs::file_size(file_path, ec);
-        if (ec || fsize == 0) {
-            continue;  // skip unreadable or empty files
-        }
-        if (fsize < INDEX_SIZE_THRESHOLD) {
-            small_files.push_back(file_path);
+static std::unique_ptr<IndexedRootSnapshot> load_index_root_snapshot_impl(
+    const std::string& index_path) {
+    auto snapshot = std::make_unique<IndexedRootSnapshot>();
+    IndexDatabase db(
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+
+    auto registry = db.query_all_file_registry();
+
+    snapshot->logical_files.reserve(registry.size());
+    snapshot->partition.indexed_entries.reserve(registry.size());
+
+    std::size_t file_index = 0;
+    for (auto& [logical_path, reg] : registry) {
+        snapshot->logical_files.push_back(logical_path);
+        const bool has_summary = has_capability(
+            reg.capabilities, IndexFileEntryCapability::FILE_SUMMARY);
+        if (!has_summary) {
+            append_failed_stats_result(
+                snapshot->partition.precomputed_failures, file_index,
+                logical_path,
+                "File registry entry exists but no file summary data was "
+                "found in the shared index");
         } else {
-            files_needing_index.push_back(file_path);
+            snapshot->partition.indexed_entries.push_back(ResolvedFile{
+                file_index, logical_path, reg.file_id, reg.capabilities});
         }
+        ++file_index;
     }
+    snapshot->partition.resolver_result.index_path = index_path;
+    return snapshot;
+}
 
-    if (!small_files.empty()) {
-        std::printf(
-            "Skipping index for %zu small file(s) (< %zu bytes "
-            "compressed); will scan directly.\n",
-            small_files.size(), INDEX_SIZE_THRESHOLD);
-    }
+static coro::CoroTask<std::unique_ptr<IndexedRootSnapshot>>
+load_index_root_snapshot(std::string index_path) {
+    co_return load_index_root_snapshot_impl(index_path);
+}
 
-    if (!files_needing_index.empty()) {
-        if (no_auto_index) {
-            DFTRACER_UTILS_LOG_ERROR(
-                "Missing index for %zu file(s) and --no-auto-index is "
-                "set. Run dftracer_index first.",
-                files_needing_index.size());
-            for (const auto& f : files_needing_index) {
-                std::fprintf(stderr, "  Missing index: %s\n", f.c_str());
-            }
-            co_return 1;
-        }
+static std::unique_ptr<AggregateStatsResult> load_root_aggregate_impl(
+    const std::string& index_path, StatisticsQueryType report_type) {
+    IndexDatabase idx_db(
+        index_path,
+        dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+    auto scalar_stats = idx_db.query_root_scalar_stats();
+    if (!scalar_stats) {
+        return nullptr;
+    }
 
-        std::printf("Auto-building index for %zu file(s)...\n",
-                    files_needing_index.size());
-
-        auto pipeline_config = PipelineConfig()
-                                   .with_name("DFTracer Stats Auto-Indexer")
-                                   .with_compute_threads(executor_threads)
-                                   .with_watchdog(false);
-
-        Pipeline pipeline(pipeline_config);
-
-        std::atomic<std::size_t> indexed_count{0};
-        std::atomic<std::size_t> failed_count{0};
-
-        auto index_task = make_task(
-            [&](CoroScope& ctx) -> coro::CoroTask<void> {
-                auto file_chan =
-                    coro::make_channel<std::string>(executor_threads * 2);
-
-                co_await ctx.scope([&](CoroScope& scope)
-                                       -> coro::CoroTask<void> {
-                    auto* files_ptr = &files_needing_index;
-                    scope.spawn(
-                        [ch = file_chan->producer(), files_ptr](
-                            CoroScope&) mutable -> coro::CoroTask<void> {
-                            auto guard = ch.guard();
-                            for (const auto& f : *files_ptr) {
-                                if (!co_await ch.send(f)) {
-                                    co_return;
-                                }
-                            }
-                            co_return;
-                        });
-
-                    auto* indexed_count_ptr = &indexed_count;
-                    auto* failed_count_ptr = &failed_count;
-                    std::string index_dir_copy = index_dir;
-                    std::size_t ckpt_size = checkpoint_size;
-                    for (std::size_t w = 0; w < executor_threads; ++w) {
-                        scope.spawn([file_chan, index_dir_copy, ckpt_size,
-                                     indexed_count_ptr, failed_count_ptr](
-                                        CoroScope&) -> coro::CoroTask<void> {
-                            while (auto file_path =
-                                       co_await file_chan->receive()) {
-                                try {
-                                    IndexBuilderUtility builder;
-                                    auto config =
-                                        IndexBuildConfig::for_file(*file_path)
-                                            .with_index_dir(index_dir_copy)
-                                            .with_checkpoint_size(ckpt_size)
-                                            .with_bloom(true)
-                                            .with_index_threshold(0);
-                                    auto result =
-                                        co_await builder.process(config);
-
-                                    if (result.success) {
-                                        (*indexed_count_ptr)++;
-                                    } else {
-                                        (*failed_count_ptr)++;
-                                        DFTRACER_UTILS_LOG_ERROR(
-                                            "Auto-indexing failed "
-                                            "for %s: %s",
-                                            file_path->c_str(),
-                                            result.error_message.c_str());
-                                    }
-                                } catch (const std::exception& e) {
-                                    (*failed_count_ptr)++;
-                                    DFTRACER_UTILS_LOG_ERROR(
-                                        "Auto-indexing exception "
-                                        "for %s: %s",
-                                        file_path->c_str(), e.what());
-                                }
-                            }
-                            co_return;
-                        });
-                    }
-                    co_return;
-                });
+    auto agg = std::make_unique<AggregateStatsResult>();
+    agg->total.success = true;
+    agg->total.file_path = index_path;
+    agg->total.index_path = index_path;
+    agg->total.num_chunks = scalar_stats->num_chunks;
+    agg->total.merged = scalar_stats->stats;
+
+    const bool needs_categories =
+        report_type == StatisticsQueryType::SUMMARY ||
+        report_type == StatisticsQueryType::CATEGORIES ||
+        report_type == StatisticsQueryType::TOP_N_CATEGORIES;
+    const bool needs_names = report_type == StatisticsQueryType::NAMES ||
+                             report_type == StatisticsQueryType::TOP_N_NAMES;
+    const bool needs_pid_tids = report_type == StatisticsQueryType::SUMMARY ||
+                                report_type == StatisticsQueryType::PID_TIDS;
+
+    if (needs_categories) {
+        idx_db.merge_root_category_counts_into(agg->total.merged);
+    }
+    if (needs_names) {
+        idx_db.merge_root_name_counts_into(agg->total.merged);
+    }
+    if (needs_pid_tids) {
+        idx_db.merge_root_pid_tid_counts_into(agg->total.merged);
+    }
 
-                co_return;
-            },
-            "AutoIndex");
+    agg->successful_count = static_cast<std::size_t>(scalar_stats->num_files);
+    return agg;
+}
 
-        pipeline.set_source(index_task);
-        pipeline.set_destination(index_task);
-        pipeline.execute();
+static coro::CoroTask<std::unique_ptr<AggregateStatsResult>>
+load_root_aggregate_result(std::string index_path,
+                           StatisticsQueryType report_type) {
+    co_return load_root_aggregate_impl(index_path, report_type);
+}
 
-        std::printf("Auto-indexing complete: %zu indexed, %zu failed\n",
-                    indexed_count.load(), failed_count.load());
+static IndexPartition build_partition(ResolverResult result) {
+    IndexPartition partition;
+    partition.files_needing_index = std::move(result.needs_checkpoint);
+    partition.indexed_entries = std::move(result.cached);
+    partition.resolver_result = std::move(result);
+
+    // Files that have checkpoints but no bloom get empty stats
+    for (const auto& entry : partition.resolver_result.needs_bloom) {
+        append_empty_indexed_stats_result(partition.precomputed_successes,
+                                          entry.file_index, entry.file_path,
+                                          partition.resolver_result.index_path);
     }
 
-    auto start_time = std::chrono::high_resolution_clock::now();
+    return partition;
+}
 
-    // Detailed query path: scan chunks on-demand with bloom pre-filtering
-    if (report_type == StatisticsQueryType::DETAILED) {
-        bool needs_hash_resolution = false;
-        for (const auto& dim : group_by) {
-            if (dim == "fhash" || dim == "hhash") {
-                needs_hash_resolution = true;
-                break;
+static coro::CoroTask<IndexPartition> resolve_index_state(
+    std::vector<std::string> files, std::string index_dir,
+    StatisticsQueryType report_type) {
+    IndexResolverUtility resolver;
+    ResolverInput input;
+    input.files = std::move(files);
+    input.index_dir = std::move(index_dir);
+    input.require_bloom = report_type == StatisticsQueryType::DETAILED;
+    auto result = co_await resolver.process(input);
+    co_return build_partition(std::move(result));
+}
+
+static coro::CoroTask<indexer::IndexBuildBatchResult> run_batch_build(
+    CoroScope* ctx, std::shared_ptr<indexer::IndexBuildBatchConfig> config) {
+    co_return co_await indexer::IndexBatchBuilderUtility::process(
+        ctx, std::move(config));
+}
+// Batch-builds the index for files lacking one, then re-resolves them.
+static coro::CoroTask<void> auto_index_files(CoroScope& ctx,
+                                             IndexPartition& partition,
+                                             const std::string& index_dir,
+                                             std::size_t checkpoint_size,
+                                             std::size_t executor_threads) {
+    auto index_path = internal::determine_index_path(
+        partition.files_needing_index.front().file_path, index_dir);
+    dftracer::utils::rocksdb::RocksDBManager::instance().reset(index_path);
+    // NOTE(review): front() above assumes files_needing_index is non-empty.
+    std::printf("Auto-building index for %zu file(s)...\n",
+                partition.files_needing_index.size());
+    // Batch write + root-summary rebuild are enabled only for all-.gz input.
+    const bool all_gzip = std::all_of(
+        partition.files_needing_index.begin(),
+        partition.files_needing_index.end(), [](const FileWorkItem& item) {
+            return item.file_path.ends_with(".gz");
+        });
+
+    {
+        auto batch_config = std::make_shared<indexer::IndexBuildBatchConfig>();
+        batch_config->file_paths.reserve(partition.files_needing_index.size());
+        for (const auto& item : partition.files_needing_index) {
+            batch_config->file_paths.push_back(item.file_path);
+        }
+        batch_config->index_dir = index_dir;
+        batch_config->checkpoint_size = checkpoint_size;
+        batch_config->parallelism = executor_threads;
+
+        batch_config->use_batch_write = all_gzip;
+        batch_config->rebuild_root_summaries = all_gzip;
+
+        auto batch_result =
+            co_await run_batch_build(&ctx, std::move(batch_config));
+
+        DFTRACER_UTILS_LOG_INFO(
+            "Shared root auto-index metrics: root=%s files=%zu "
+            "enqueued=%zu parsed=%zu written=%zu "
+            "parse=%.2fms writer_db=%.2fms",
+            index_path.c_str(), partition.files_needing_index.size(),
+            batch_result.metrics.files_enqueued,
+            batch_result.metrics.files_parsed,
+            batch_result.metrics.files_written,
+            ns_to_ms(batch_result.metrics.parse_ns),
+            ns_to_ms(batch_result.metrics.write_ns));
+        // Log only failures that carry an error message.
+        for (const auto& result : batch_result.results) {
+            if (!result.success && !result.error_message.empty()) {
+                DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s",
+                                         result.file_path.c_str(),
+                                         result.error_message.c_str());
             }
         }
 
-        DetailedStatistics aggregate_detailed;
-        std::mutex aggregate_mutex;
-        std::mutex output_mutex;
-        std::vector<std::pair<std::size_t, std::string>> json_results;
-
-        {
-            auto pipeline_config = PipelineConfig()
-                                       .with_name("DFTracer Stats Detailed")
-                                       .with_compute_threads(executor_threads)
-                                       .with_watchdog(false);
-
-            Pipeline pipeline(pipeline_config);
-
-            auto stats_task = make_task(
-                [&](CoroScope& ctx) -> coro::CoroTask<void> {
-                    co_await ctx.scope(
-                        [&](CoroScope& scope) -> coro::CoroTask<void> {
-                            run_detailed_query_workers(
-                                scope, &files, &small_files, executor_threads,
-                                index_dir, checkpoint_size,
-                                needs_hash_resolution, json_output, top_n,
-                                query ? &*query : nullptr, &filter_names,
-                                &filter_cats, &group_by, &aggregate_detailed,
-                                &aggregate_mutex, &output_mutex, &json_results);
-                            co_return;
-                        });
-                    co_return;
-                },
-                "StatsDetailed");
-
-            pipeline.set_source(stats_task);
-            pipeline.set_destination(stats_task);
-            pipeline.execute();
-        }
+        std::printf("Auto-indexing complete: %zu indexed, %zu failed\n",
+                    batch_result.indexed, batch_result.failed);
+    }
 
-        auto end_time = std::chrono::high_resolution_clock::now();
-        std::chrono::duration<double, std::milli> duration =
-            end_time - start_time;
-
-        if (json_output) {
-            std::printf("[\n");
-            std::sort(
-                json_results.begin(), json_results.end(),
-                [](const auto& a, const auto& b) { return a.first < b.first; });
-            for (std::size_t i = 0; i < json_results.size(); ++i) {
-                std::printf("%s%s", json_results[i].second.c_str(),
-                            i + 1 < json_results.size() ? ",\n" : "\n");
-            }
-            std::printf("]\n");
+    // Re-resolve newly indexed files
+    std::vector<std::string> newly_indexed;
+    newly_indexed.reserve(partition.files_needing_index.size());
+    for (const auto& item : partition.files_needing_index) {
+        newly_indexed.push_back(item.file_path);
+    }
+
+    IndexResolverUtility resolver;
+    ResolverInput refresh_input;
+    refresh_input.files = std::move(newly_indexed);
+    refresh_input.index_dir = index_dir;
+    refresh_input.require_checkpoints = true;
+
+    auto refresh_result = co_await resolver.process(refresh_input);
+
+    // Keep BLOOM-capable entries; the rest become empty precomputed results.
+    for (auto& entry : refresh_result.cached) {
+        bool has_bloom = indexer::has_capability(
+            entry.capabilities, indexer::IndexFileEntryCapability::BLOOM);
+        if (has_bloom) {
+            partition.indexed_entries.push_back(std::move(entry));
         } else {
-            std::printf("==========================================\n");
-            std::printf("Consolidated Detailed (%zu files)\n", files.size());
-            std::printf("==========================================\n");
-            std::unordered_map<std::string, std::string> no_resolutions;
-            print_text_detailed(directory, aggregate_detailed,
-                                aggregate_detailed.chunks_scanned +
-                                    aggregate_detailed.chunks_skipped,
-                                top_n, no_resolutions);
-            std::printf("  Processing Time: %.2f ms\n", duration.count());
-            std::printf("==========================================\n");
+            append_empty_indexed_stats_result(partition.precomputed_successes,
+                                              entry.file_index, entry.file_path,
+                                              refresh_result.index_path);
         }
+    }
 
-        co_return 0;
+    // Handle files that still need checkpoints (failed to index)
+    for (const auto& item : refresh_result.needs_checkpoint) {
+        append_failed_stats_result(
+            partition.precomputed_failures, item.file_index, item.file_path,
+            "Auto-index completed but no readable file summary "
+            "data was found in the shared index");
     }
 
-    // Non-detailed path: aggregate statistics per file in parallel
-    std::vector<std::pair<std::size_t, TraceStatistics>> indexed_stats;
-    std::mutex stats_mutex;
+    // Update resolver result
+    partition.resolver_result.index_path = refresh_result.index_path;
 
-    {
-        auto pipeline_config = PipelineConfig()
-                                   .with_name("DFTracer Stats")
-                                   .with_compute_threads(executor_threads)
-                                   .with_watchdog(false);
-
-        Pipeline pipeline(pipeline_config);
-
-        auto stats_task = make_task(
-            [&](CoroScope& ctx) -> coro::CoroTask<void> {
-                co_await ctx.scope([&](CoroScope& scope)
-                                       -> coro::CoroTask<void> {
-                    auto* indexed_stats_ptr = &indexed_stats;
-                    auto* stats_mutex_ptr = &stats_mutex;
-                    auto* files_ptr = &files;
-
-                    // Build set of small files for O(1) lookup.
-                    // shared_ptr so workers keep it alive after this
-                    // scope lambda's coroutine frame is destroyed.
-                    auto small_set =
-                        std::make_shared<std::unordered_set<std::string>>(
-                            small_files.begin(), small_files.end());
-
-                    auto file_chan =
-                        coro::make_channel<std::size_t>(executor_threads * 2);
-
-                    // Producer: push file indices
-                    scope.spawn(
-                        [ch = file_chan->producer(), files_ptr](
-                            CoroScope&) mutable -> coro::CoroTask<void> {
-                            auto guard = ch.guard();
-                            for (std::size_t fi = 0; fi < files_ptr->size();
-                                 ++fi) {
-                                if (!co_await ch.send(fi)) {
-                                    co_return;
-                                }
-                            }
-                            co_return;
-                        });
-
-                    // Workers: N coroutines, each processing one file at a time
-                    for (std::size_t w = 0; w < executor_threads; ++w) {
-                        scope.spawn([file_chan, files_ptr, index_dir, small_set,
-                                     indexed_stats_ptr, stats_mutex_ptr](
-                                        CoroScope&) -> coro::CoroTask<void> {
-                            while (auto fi_opt =
-                                       co_await file_chan->receive()) {
-                                std::size_t fi = *fi_opt;
-                                const auto& file_path = (*files_ptr)[fi];
-                                bool is_small = small_set->count(file_path) > 0;
-
-                                TraceStatistics result;
-                                if (is_small) {
-                                    result =
-                                        co_await direct_scan_trace_statistics(
-                                            file_path);
-                                } else {
-                                    StatisticsAggregatorInput agg_input;
-                                    agg_input.file_path = file_path;
-                                    agg_input.index_dir = index_dir;
-
-                                    StatisticsAggregatorUtility aggregator;
-                                    result =
-                                        co_await aggregator.process(agg_input);
-                                }
-
-                                std::lock_guard<std::mutex> lock(
-                                    *stats_mutex_ptr);
-                                indexed_stats_ptr->emplace_back(
-                                    fi, std::move(result));
-                            }
-                            co_return;
-                        });
-                    }
-                    co_return;
-                });
+    co_return;
+}
+// Runs the DETAILED report: query workers in a scope, then prints the result.
+static coro::CoroTask<int> run_detailed_stats(
+    CoroScope& ctx, const StatsConfig* config_ptr,
+    const std::vector<std::string>* files_ptr) {
+    auto start_time = std::chrono::high_resolution_clock::now();
+    // fhash/hhash group-by dimensions require hash resolution.
+    bool needs_hash_resolution = false;
+    for (const auto& dim : config_ptr->group_by) {
+        if (dim == "fhash" || dim == "hhash") {
+            needs_hash_resolution = true;
+            break;
+        }
+    }
+    // Accumulators and mutexes handed to the workers by pointer.
+    auto aggregate_detailed = std::make_unique<DetailedStatistics>();
+    std::mutex aggregate_mutex;
+    std::mutex output_mutex;
+    auto json_results =
+        std::make_unique<std::vector<std::pair<std::size_t, std::string>>>();
+    // Pointers and value copies, so the lambda below captures no references.
+    auto* filter_names_ptr = &config_ptr->filter_names;
+    auto* filter_cats_ptr = &config_ptr->filter_cats;
+    auto* group_by_ptr = &config_ptr->group_by;
+    auto* aggregate_detailed_ptr = aggregate_detailed.get();
+    auto* aggregate_mutex_ptr = &aggregate_mutex;
+    auto* output_mutex_ptr = &output_mutex;
+    auto* json_results_ptr = json_results.get();
+    auto* query_ptr = config_ptr->query ? &*config_ptr->query : nullptr;
+    const auto* index_dir_for_detailed_ptr = &config_ptr->index_dir;
+    std::size_t checkpoint_size_for_detailed = config_ptr->checkpoint_size;
+    std::size_t executor_threads_for_detailed = config_ptr->executor_threads;
+    bool needs_hash_resolution_for_detailed = needs_hash_resolution;
+    bool json_output_for_detailed = config_ptr->json_output;
+    std::size_t top_n_for_detailed = config_ptr->top_n;
+
+    co_await ctx.scope(
+        [files_ptr, executor_threads_for_detailed, index_dir_for_detailed_ptr,
+         checkpoint_size_for_detailed, needs_hash_resolution_for_detailed,
+         json_output_for_detailed, top_n_for_detailed, query_ptr,
+         filter_names_ptr, filter_cats_ptr, group_by_ptr,
+         aggregate_detailed_ptr, aggregate_mutex_ptr, output_mutex_ptr,
+         json_results_ptr](CoroScope& scope) -> coro::CoroTask<void> {
+            run_detailed_query_workers(
+                scope, files_ptr, executor_threads_for_detailed,
+                index_dir_for_detailed_ptr, checkpoint_size_for_detailed,
+                needs_hash_resolution_for_detailed, json_output_for_detailed,
+                top_n_for_detailed, query_ptr, filter_names_ptr,
+                filter_cats_ptr, group_by_ptr, aggregate_detailed_ptr,
+                aggregate_mutex_ptr, output_mutex_ptr, json_results_ptr);
+            co_return;
+        });
+
+    auto end_time = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> duration = end_time - start_time;
 
+    if (config_ptr->json_output) {
+        std::printf("[\n");
+        std::sort(
+            json_results->begin(), json_results->end(),
+            [](const auto& a, const auto& b) { return a.first < b.first; });
+        for (std::size_t i = 0; i < json_results->size(); ++i) {
+            std::printf("%s%s", (*json_results)[i].second.c_str(),
+                        i + 1 < json_results->size() ? ",\n" : "\n");
+        }
+        std::printf("]\n");
+    } else {
+        std::printf("==========================================\n");
+        std::printf("Consolidated Detailed (%zu files)\n", files_ptr->size());
+        std::printf("==========================================\n");
+        std::unordered_map<std::string, std::string> no_resolutions;
+        print_text_detailed(config_ptr->directory, *aggregate_detailed,
+                            aggregate_detailed->chunks_scanned +
+                                aggregate_detailed->chunks_skipped,
+                            config_ptr->top_n, no_resolutions);
+        std::printf("  Processing Time: %.2f ms\n", duration.count());
+        std::printf("==========================================\n");
+    }
+
+    co_return 0;
+}
+// Merges one index group's stats: root summary first, per-file query fallback.
+static coro::CoroTask<void> process_index_group(
+    const std::string* index_path_ptr,
+    const std::vector<ResolvedFile>* group_ptr,
+    std::vector<std::pair<std::size_t, TraceStatistics>>* indexed_stats_ptr,
+    std::mutex* stats_mutex_ptr, std::size_t expected_indexed_files,
+    bool needs_per_file_results, TraceStatistics* total_ptr,
+    std::mutex* total_mutex_ptr, std::atomic<std::size_t>* successful_ptr,
+    std::atomic<std::size_t>* failed_ptr,
+    StatisticsQueryType report_type_for_reader, Timer* metrics_timer_ptr) {
+    try {
+        metrics_timer_ptr->increment("root_summary_attempts");
+        if (!needs_per_file_results) {
+            auto root_summary = co_await process_index_group_root_summary(
+                *index_path_ptr, expected_indexed_files,
+                report_type_for_reader);
+            if (root_summary && root_summary->success) {
+                metrics_timer_ptr->increment("root_summary_hits");
+                std::lock_guard<std::mutex> lock(*total_mutex_ptr);
+                total_ptr->merged.merge_from(root_summary->merged);
+                total_ptr->num_chunks += root_summary->num_chunks;
+                successful_ptr->fetch_add(group_ptr->size(),
+                                          std::memory_order_relaxed);
                 co_return;
-            },
-            "StatsProcess");
+            }
+        }
+        metrics_timer_ptr->increment("root_summary_misses");
+        metrics_timer_ptr->increment("fallback_groups");
+        metrics_timer_ptr->increment("fallback_files", group_ptr->size());
+        // Fallback: query the group and hand the rows to the callback below.
+        SharedIndexStatisticsReader reader;
+        auto batch_rows = co_await reader.query(*index_path_ptr, *group_ptr,
+                                                report_type_for_reader);
+        auto callback = [indexed_stats_ptr, stats_mutex_ptr,
+                         needs_per_file_results, total_ptr, total_mutex_ptr,
+                         successful_ptr, failed_ptr](std::size_t file_index,
+                                                     TraceStatistics&& stats) {
+            if (needs_per_file_results) {
+                std::lock_guard<std::mutex> lock(*stats_mutex_ptr);
+                indexed_stats_ptr->emplace_back(file_index, std::move(stats));
+                return;
+            }
 
-        pipeline.set_source(stats_task);
-        pipeline.set_destination(stats_task);
-        pipeline.execute();
+            if (stats.success) {
+                std::lock_guard<std::mutex> lock(*total_mutex_ptr);
+                total_ptr->merged.merge_from(stats.merged);
+                total_ptr->num_chunks += stats.num_chunks;
+                successful_ptr->fetch_add(1, std::memory_order_relaxed);
+            } else {
+                failed_ptr->fetch_add(1, std::memory_order_relaxed);
+            }
+        };
+        SharedIndexStatisticsReader::process_batch_results(batch_rows,
+                                                           callback);
+    } catch (const std::exception& e) {
+        DFTRACER_UTILS_LOG_ERROR("Indexed stats batch failed for %s: %s",
+                                 index_path_ptr->c_str(), e.what());
+        if (needs_per_file_results) {
+            std::lock_guard<std::mutex> lock(*stats_mutex_ptr);
+            for (const auto& entry : *group_ptr) {
+                TraceStatistics failed_result;
+                failed_result.file_path = entry.file_path;
+                failed_result.success = false;
+                failed_result.error_message = e.what();
+                indexed_stats_ptr->emplace_back(entry.file_index,
+                                                std::move(failed_result));
+            }
+        }
+        failed_ptr->fetch_add(group_ptr->size(), std::memory_order_relaxed);
     }
+    co_return;
+}
 
-    // Restore original file order
-    std::sort(indexed_stats.begin(), indexed_stats.end(),
-              [](const auto& a, const auto& b) { return a.first < b.first; });
+static coro::CoroTask<AggregateStatsResult> run_aggregate_stats(
+    CoroScope& /*ctx*/, const StatsConfig* config_ptr,
+    std::unique_ptr<IndexPartition> partition_ptr) {
+    auto& partition = *partition_ptr;
+    auto agg = std::make_unique<AggregateStatsResult>();
+    agg->total.success = true;
+    agg->total.file_path = config_ptr->directory;
+    // Per-file rows are kept only for JSON output; text needs just the totals.
+    const bool needs_per_file_results = config_ptr->json_output;
+    std::atomic<std::size_t> successful{0};
+    std::atomic<std::size_t> failed{0};
+    // Account for precomputed failures/successes before reading the index.
+    if (!partition.precomputed_failures.empty()) {
+        if (needs_per_file_results) {
+            agg->indexed_stats.insert(
+                agg->indexed_stats.end(),
+                std::make_move_iterator(partition.precomputed_failures.begin()),
+                std::make_move_iterator(partition.precomputed_failures.end()));
+        }
+        failed.fetch_add(partition.precomputed_failures.size(),
+                         std::memory_order_relaxed);
+    }
+    if (!partition.precomputed_successes.empty()) {
+        if (needs_per_file_results) {
+            agg->indexed_stats.insert(
+                agg->indexed_stats.end(),
+                std::make_move_iterator(
+                    partition.precomputed_successes.begin()),
+                std::make_move_iterator(partition.precomputed_successes.end()));
+        }
+        successful.fetch_add(partition.precomputed_successes.size(),
+                             std::memory_order_relaxed);
+    }
 
-    std::vector<TraceStatistics> all_stats;
-    all_stats.reserve(indexed_stats.size());
-    for (auto& [_, stats] : indexed_stats) {
-        all_stats.push_back(std::move(stats));
+    std::mutex stats_mutex;
+    std::mutex total_mutex;
+    Timer read_timer("stats_read_path", true, false);
+    // Pointer aliases passed to process_index_group below.
+    auto* indexed_stats_ptr = &agg->indexed_stats;
+    auto* stats_mutex_ptr = &stats_mutex;
+    auto* indexed_entries_ptr = &partition.indexed_entries;
+    const auto* index_path_ptr = &partition.resolver_result.index_path;
+    auto* total_ptr = &agg->total;
+    auto* total_mutex_ptr = &total_mutex;
+    auto* successful_ptr = &successful;
+    auto* failed_ptr = &failed;
+    auto* read_timer_ptr = &read_timer;
+    StatisticsQueryType report_type_for_reader = config_ptr->report_type;
+
+    if (!indexed_entries_ptr->empty()) {
+        const auto expected_indexed_files = indexed_entries_ptr->size();
+        co_await process_index_group(
+            index_path_ptr, indexed_entries_ptr, indexed_stats_ptr,
+            stats_mutex_ptr, expected_indexed_files, needs_per_file_results,
+            total_ptr, total_mutex_ptr, successful_ptr, failed_ptr,
+            report_type_for_reader, read_timer_ptr);
     }
+    read_timer.stop();
 
-    // Merge all per-file stats into a single consolidated result
-    TraceStatistics total;
-    total.success = true;
-    total.file_path = directory;
-    std::size_t successful = 0;
-    std::size_t failed = 0;
-
-    for (const auto& stats : all_stats) {
-        if (stats.success) {
-            total.merged.merge_from(stats.merged);
-            total.num_chunks += stats.num_chunks;
-            successful++;
-        } else {
-            failed++;
+    agg->successful_count = successful.load(std::memory_order_relaxed);
+    agg->failed_count = failed.load(std::memory_order_relaxed);
+    agg->read_elapsed_ns = read_timer.elapsed();
+    agg->read_counters = read_timer.counters();
+    co_return std::move(*agg);
+}
+
+static coro::CoroTask<int> output_aggregate_stats(
+    const StatsConfig* config_ptr, const std::vector<std::string>* files_ptr,
+    std::unique_ptr<AggregateStatsResult> agg, Timer overall) {
+    std::vector<TraceStatistics> all_stats;
+    if (config_ptr->json_output) {
+        std::sort(
+            agg->indexed_stats.begin(), agg->indexed_stats.end(),
+            [](const auto& a, const auto& b) { return a.first < b.first; });
+        all_stats.reserve(agg->indexed_stats.size());
+        for (auto& [_, stats] : agg->indexed_stats) {
+            all_stats.push_back(std::move(stats));
         }
     }
 
-    auto end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double, std::milli> duration = end_time - start_time;
+    double duration_ms = static_cast<double>(overall.elapsed()) / 1e6;
 
-    if (json_output) {
-        // For JSON, output per-file results
+    if (config_ptr->json_output) {
         StatisticsQueryUtility query_util;
         std::printf("[\n");
         for (std::size_t i = 0; i < all_stats.size(); ++i) {
@@ -1340,33 +1553,178 @@ static coro::CoroTask<int> run_stats(argparse::ArgumentParser& program) {
             }
             StatisticsQueryInput qi;
             qi.stats = stats;
-            qi.query_type = report_type;
-            qi.top_n = top_n;
+            qi.query_type = config_ptr->report_type;
+            qi.top_n = config_ptr->top_n;
             auto output = co_await query_util.process(qi);
             std::printf("%s%s", output.to_json().c_str(),
                         i + 1 < all_stats.size() ? ",\n" : "\n");
         }
         std::printf("]\n");
     } else {
-        // Text output: print consolidated summary
+        const auto displayed_files =
+            files_ptr->empty() ? agg->successful_count + agg->failed_count
+                               : files_ptr->size();
         std::printf("==========================================\n");
         std::printf("Consolidated (%zu files, %zu successful, %zu failed)\n",
-                    files.size(), successful, failed);
+                    displayed_files, agg->successful_count, agg->failed_count);
         std::printf("==========================================\n");
-        auto detailed = to_detailed(total);
-        std::unordered_map<std::string, std::string> no_resolutions;
-        print_text_detailed(total.file_path, detailed, total.num_chunks, top_n,
-                            no_resolutions, &total, top_n_pid_tid);
-        std::printf("  Processing Time: %.2f ms\n", duration.count());
+        if (config_ptr->report_type == StatisticsQueryType::SUMMARY) {
+            auto detailed = to_detailed(agg->total);
+            std::unordered_map<std::string, std::string> no_resolutions;
+            print_text_detailed(agg->total.file_path, detailed,
+                                agg->total.num_chunks, config_ptr->top_n,
+                                no_resolutions, &agg->total,
+                                config_ptr->top_n_pid_tid);
+        } else {
+            StatisticsQueryUtility query_util;
+            StatisticsQueryInput qi;
+            qi.stats = agg->total;
+            qi.query_type = config_ptr->report_type;
+            qi.top_n = config_ptr->top_n;
+            auto output = co_await query_util.process(qi);
+            print_text_query_output(agg->total, output);
+        }
+        auto counter = [&agg](const char* key) -> std::uint64_t {
+            auto it = agg->read_counters.find(key);
+            return it == agg->read_counters.end() ? 0 : it->second;
+        };
+        DFTRACER_UTILS_LOG_INFO(
+            "Stats read metrics: report=%d elapsed=%.2fms "
+            "root_attempts=%" PRIu64 " root_hits=%" PRIu64
+            " root_misses=%" PRIu64 " fallback_groups=%" PRIu64
+            " fallback_files=%" PRIu64,
+            static_cast<int>(config_ptr->report_type),
+            static_cast<double>(agg->read_elapsed_ns) / 1'000'000.0,
+            counter("root_summary_attempts"), counter("root_summary_hits"),
+            counter("root_summary_misses"), counter("fallback_groups"),
+            counter("fallback_files"));
+        std::printf("  Processing Time: %.2f ms\n", duration_ms);
         std::printf("==========================================\n");
     }
 
     co_return 0;
 }
 
+static coro::CoroTask<int> run_stats(CoroScope& ctx,
+                                     const StatsArgParse* args) {
+    StatsConfig config;
+    if (!args->to_config(config)) {
+        co_return 1;
+    }
+    // Per-stage timing only when time_profiling is set; else stages is null.
+    Timer stages_storage("dftracer_stats");
+    Timer* stages = args->pipeline.time_profiling ? &stages_storage : nullptr;
+    Timer overall(true);
+
+    std::vector<std::string> files;
+    IndexPartition partition;
+    bool used_index_source_of_truth = false;
+    std::unique_ptr<AggregateStatsResult> direct_root_aggregate;
+    // Non-DETAILED directory runs prefer an existing index to a file scan.
+    {
+        ScopedTimer _t(stages, "collect_and_classify");
+        if (!config.directory.empty() &&
+            config.report_type != StatisticsQueryType::DETAILED) {
+            auto trusted_index_path = internal::determine_index_path(
+                config.directory, config.index_dir);
+            const bool trusted_index_exists = fs::exists(trusted_index_path);
+            DFTRACER_UTILS_LOG_DEBUG(
+                "Stats direct-index decision: directory=%s index_path=%s "
+                "exists=%d json=%d report=%d",
+                config.directory.c_str(), trusted_index_path.c_str(),
+                trusted_index_exists ? 1 : 0, config.json_output ? 1 : 0,
+                static_cast<int>(config.report_type));
+            if (trusted_index_exists) {
+                {
+                    ScopedTimer _ra(stages, "root_aggregate_read");
+                    if (!config.json_output) {
+                        direct_root_aggregate =
+                            co_await load_root_aggregate_result(
+                                trusted_index_path, config.report_type);
+                        DFTRACER_UTILS_LOG_DEBUG(
+                            "Stats direct-index aggregate: index_path=%s "
+                            "hit=%d",
+                            trusted_index_path.c_str(),
+                            direct_root_aggregate ? 1 : 0);
+                    }
+                }
+                if (direct_root_aggregate) {
+                    used_index_source_of_truth = true;
+                } else {
+                    ScopedTimer _ls(stages, "load_index_snapshot");
+                    auto snapshot =
+                        co_await load_index_root_snapshot(trusted_index_path);
+                    files = std::move(snapshot->logical_files);
+                    partition = std::move(snapshot->partition);
+                    used_index_source_of_truth = true;
+                }
+            }
+        }
+        // Otherwise collect the files and resolve their index state.
+        if (!used_index_source_of_truth) {
+            {
+                ScopedTimer _cf(stages, "collect_files");
+                files = co_await collect_files(ctx, args->files_args.value,
+                                               config.directory);
+            }
+            if (files.empty()) {
+                co_return 1;
+            }
+            {
+                ScopedTimer _ri(stages, "resolve_index_state");
+                partition = co_await resolve_index_state(
+                    files, config.index_dir, config.report_type);
+            }
+        }
+    }
+    // Missing indexes are built here unless --no-auto-index forbids it.
+    if (!partition.files_needing_index.empty()) {
+        if (config.no_auto_index) {
+            DFTRACER_UTILS_LOG_ERROR(
+                "Missing index for %zu file(s) and --no-auto-index is "
+                "set. Run dftracer_index first.",
+                partition.files_needing_index.size());
+            for (const auto& f : partition.files_needing_index) {
+                std::fprintf(stderr, "  Missing index: %s\n",
+                             f.file_path.c_str());
+            }
+            co_return 1;
+        }
+        ScopedTimer _ai(stages, "auto_index_files");
+        co_await auto_index_files(ctx, partition, config.index_dir,
+                                  config.checkpoint_size,
+                                  config.executor_threads);
+    }
+    // DETAILED has its own worker-based path and output.
+    if (config.report_type == StatisticsQueryType::DETAILED) {
+        if (stages) stages->print_stages();
+        co_return co_await run_detailed_stats(ctx, &config, &files);
+    }
+
+    std::unique_ptr<AggregateStatsResult> agg_ptr =
+        std::move(direct_root_aggregate);
+    if (!agg_ptr) {
+        ScopedTimer _ag(stages, "aggregate_stats");
+        auto agg_val = co_await run_aggregate_stats(
+            ctx, &config,
+            std::make_unique<IndexPartition>(std::move(partition)));
+        agg_ptr = std::make_unique<AggregateStatsResult>(std::move(agg_val));
+    }
+
+    if (stages) stages->print_stages();
+    co_return co_await output_aggregate_stats(&config, &files,
+                                              std::move(agg_ptr), overall);
+}
+
 int main(int argc, char** argv) {
     DFTRACER_UTILS_LOGGER_INIT();
 
+    struct RocksDbExitGuard {
+        ~RocksDbExitGuard() {
+            dftracer::utils::rocksdb::mark_process_exiting_for_rocksdb();
+        }
+    } rocksdb_exit_guard;
+
     argparse::ArgumentParser program("dftracer_stats",
                                      DFTRACER_UTILS_PACKAGE_VERSION);
     program.add_description(
@@ -1374,77 +1732,20 @@ int main(int argc, char** argv) {
         ".dftindex databases. Auto-builds indexes if missing. "
         "Zero-cost reads from RocksDB metadata, no decompression.");
 
-    program.add_argument("--files")
-        .help("Trace files to inspect (.pfw, .pfw.gz)")
-        .nargs(argparse::nargs_pattern::any)
-        .default_value<std::vector<std::string>>({});
-
-    program.add_argument("-d", "--directory")
-        .help("Directory containing trace files")
-        .default_value<std::string>("");
-
-    program.add_argument("--index-dir")
-        .help("Directory where .dftindex stores are created")
-        .default_value<std::string>("");
-
-    program.add_argument("--json").help("Output in JSON format").flag();
-
-    program.add_argument("--report")
-        .help(
-            "Report type: summary, categories, names, pid_tids, time_range, "
-            "duration, top-names, top-categories, detailed")
-        .default_value<std::string>("summary");
-
-    program.add_argument("--top-n")
-        .help(
-            "Number of results for top-N queries (0 = show all, "
-            "default: 0)")
-        .scan<'d', std::uint64_t>()
-        .default_value(static_cast<std::uint64_t>(0));
-
-    program.add_argument("--top-n-pid-tid")
-        .help("Max PID:TID pairs to display (0 = show all, default: 10)")
-        .scan<'d', std::uint64_t>()
-        .default_value(static_cast<std::uint64_t>(10));
-
-    program.add_argument("--no-auto-index")
-        .help("Disable automatic index building for files missing .dftindex")
-        .flag();
-
-    program.add_argument("--checkpoint-size")
-        .help("Checkpoint size for auto-indexing in bytes (default: " +
-              std::to_string(
-                  indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE) +
-              ")")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(
-            indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE));
-
-    program.add_argument("--executor-threads")
-        .help("Number of worker threads for auto-indexing")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    program.add_argument("--query")
-        .help("Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')")
-        .default_value<std::string>("");
-
-    program.add_argument("--group-by")
-        .help(
-            "Group detailed statistics by dimension(s): name, cat, pid, "
-            "tid, fhash, hhash, pid_tid. Multiple values create composite "
-            "keys.")
-        .nargs(argparse::nargs_pattern::at_least_one)
-        .default_value<std::vector<std::string>>({});
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what());
-        std::cerr << program;
-        return 1;
-    }
-
-    return run_stats(program).get();
+    StatsArgParse args(program);
+    args.setup();
+    if (!args.parse(argc, argv)) return 1;
+
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer Stats Main", args.pipeline);
+    Pipeline pipeline(pipeline_config);
+    auto stats_task = make_task(
+        [&args](CoroScope& ctx) -> coro::CoroTask<int> {
+            co_return co_await run_stats(ctx, &args);
+        },
+        "StatsMain");
+    pipeline.set_source(stats_task);  // one task is both source and destination
+    pipeline.set_destination(stats_task);
+    pipeline.execute();
+    return stats_task->get<int>();  // the task's int result is the exit code
 }
diff --git a/src/dftracer/utils/binaries/dftracer_view.cpp b/src/dftracer/utils/binaries/dftracer_view.cpp
index 49fcb15f..11a303ce 100644
--- a/src/dftracer/utils/binaries/dftracer_view.cpp
+++ b/src/dftracer/utils/binaries/dftracer_view.cpp
@@ -1,10 +1,7 @@
 #include <dftracer/utils/core/common/config.h>
-#include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/core/pipeline/pipeline_config.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/utilities/common/query/query.h>
@@ -18,24 +15,121 @@
 #include <dftracer/utils/utilities/indexer/index_database.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
 
-#include <argparse/argparse.hpp>
 #include <atomic>
 #include <cstdio>
 #include <exception>
 #include <fstream>
-#include <iostream>
 #include <mutex>
 #include <string>
-#include <thread>
 #include <vector>
 
+#include "common_cli.h"
+
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::composites::dft;
 using namespace dftracer::utils::utilities::composites::dft::views;
 using namespace dftracer::utils::utilities::filesystem;
-using dftracer::utils::utilities::indexer::IndexBuildConfig;
-using dftracer::utils::utilities::indexer::IndexBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBuildBatchConfig;
+
+class ViewArgParse : public cli::ArgParse {  // CLI options of dftracer_view
+   public:
+    cli::DirectoryArgs directory{cli::DirMode::DEFAULT_EMPTY};
+    cli::FilesArgs files_args;
+    cli::PipelineArgs pipeline;
+    cli::IndexingArgs indexing;
+    cli::QueryArgs query_args;
+
+    std::string preset;         // --preset
+    std::string recipe;         // --recipe
+    std::string save_recipe;    // --save-recipe
+    std::string time_range;     // --time-range, raw "min,max" text
+    double min_duration = 0.0;  // --min-duration, microseconds
+    double max_duration = 0.0;  // --max-duration, microseconds
+    std::string output;         // -o / --output
+    bool stream = false;        // --stream
+    bool no_metadata = false;   // --no-metadata
+    bool no_auto_index = false;  // --no-auto-index
+
+    explicit ViewArgParse(argparse::ArgumentParser& p) : ArgParse(p) {
+        indexing.with_force = false;
+        indexing.index_dir_help =
+            "Directory where .dftindex stores are created";
+        // NOTE(review): presumably registers the shared groups; see common_cli.h
+        schema(directory, files_args, pipeline, indexing, query_args);
+    }
+
+   protected:
+    void register_args() override {  // declares the view-specific flags
+        parser()
+            .add_argument("--preset")
+            .help("Predefined view: io, compute, dlio")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--recipe")
+            .help("Custom view JSON file path")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--save-recipe")
+            .help("Save the constructed view to a JSON file")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--time-range")
+            .help(
+                "Timestamp filter as min,max in microseconds (e.g., "
+                "1000000,2000000)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--min-duration")
+            .help("Minimum event duration in microseconds")
+            .scan<'g', double>()
+            .default_value(static_cast<double>(0.0));
+
+        parser()
+            .add_argument("--max-duration")
+            .help("Maximum event duration in microseconds")
+            .scan<'g', double>()
+            .default_value(static_cast<double>(0.0));
+
+        parser()
+            .add_argument("-o", "--output")
+            .help("Output file path (default: stdout)")
+            .default_value<std::string>("");
+
+        parser()
+            .add_argument("--stream")
+            .help("Stream matching events to stdout as NDJSON")
+            .flag();
+
+        parser()
+            .add_argument("--no-metadata")
+            .help("Exclude metadata events (ph=M) from output")
+            .flag();
+
+        parser()
+            .add_argument("--no-auto-index")
+            .help(
+                "Disable automatic index building for files missing .dftindex")
+            .flag();
+    }
+
+    void post_parse() override {  // copies parsed values into the members above
+        preset = parser().get<std::string>("--preset");
+        recipe = parser().get<std::string>("--recipe");
+        save_recipe = parser().get<std::string>("--save-recipe");
+        time_range = parser().get<std::string>("--time-range");
+        min_duration = parser().get<double>("--min-duration");
+        max_duration = parser().get<double>("--max-duration");
+        output = parser().get<std::string>("--output");
+        stream = parser().get<bool>("--stream");
+        no_metadata = parser().get<bool>("--no-metadata");
+        no_auto_index = parser().get<bool>("--no-auto-index");
+    }
+};
 
 struct ViewContext {
     std::string index_dir;
@@ -53,24 +147,32 @@ struct ViewContext {
     std::atomic<std::size_t>* failed_count;
 };
 
-static coro::CoroTask<void> index_single_file(const std::string& file_path,
-                                              const ViewContext& vctx,
-                                              CoroScope&) {
-    IndexBuilderUtility builder;
-    auto config = IndexBuildConfig::for_file(file_path)
-                      .with_index_dir(vctx.index_dir)
-                      .with_checkpoint_size(vctx.checkpoint_size)
-                      .with_bloom(true)
-                      .with_index_threshold(0);
-    auto result = co_await builder.process(config);
-
-    if (result.success) {
-        (*vctx.indexed_count)++;
-    } else {
-        (*vctx.failed_count)++;
-        DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s",
-                                 file_path.c_str(),
-                                 result.error_message.c_str());
+// Batch-indexes the given files and tallies per-file results into vctx.
+static coro::CoroTask<void> batch_index_files(
+    const std::vector<std::string>& files_needing_index,
+    const ViewContext& vctx, CoroScope& ctx) {
+    auto request = std::make_shared<IndexBuildBatchConfig>();
+    request->file_paths = files_needing_index;
+    request->index_dir = vctx.index_dir;
+    request->checkpoint_size = vctx.checkpoint_size;
+    request->parallelism =
+        std::max<std::size_t>(1, files_needing_index.size());
+    request->use_batch_write = true;
+    request->rebuild_root_summaries = true;
+    auto outcome_set = co_await IndexBatchBuilderUtility::process(
+        &ctx, std::move(request));
+
+    for (const auto& outcome : outcome_set.results) {
+        if (outcome.success) {
+            (*vctx.indexed_count)++;
+            continue;
+        }
+        (*vctx.failed_count)++;
+        // A failure that carries no message is counted without being logged.
+        if (outcome.error_message.empty()) continue;
+        DFTRACER_UTILS_LOG_ERROR("Auto-indexing failed for %s: %s",
+                                 outcome.file_path.c_str(),
+                                 outcome.error_message.c_str());
     }
 }
 
@@ -104,7 +206,6 @@ static coro::CoroTask<void> read_single_chunk(
                 }
             }
         } else {
-            // Non-stream: must copy since string_view won't outlive chunk
             std::lock_guard<std::mutex> lock(*vctx.output_mutex);
             for (const auto& event : batch->events) {
                 vctx.all_events->emplace_back(event);
@@ -120,7 +221,6 @@ static coro::CoroTask<void> process_single_file(const std::string& file_path,
     std::string index_path =
         internal::determine_index_path(file_path, vctx.index_dir);
 
-    // Collect metadata
     auto meta_input = MetadataCollectorUtilityInput::from_file(file_path)
                           .with_checkpoint_size(vctx.checkpoint_size)
                           .with_force_rebuild(false)
@@ -134,7 +234,6 @@ static coro::CoroTask<void> process_single_file(const std::string& file_path,
         co_return;
     }
 
-    // Run ViewBuilderUtility to get candidate chunks
     ViewBuilderInput builder_input;
     builder_input.with_view(vctx.view)
         .with_file_path(file_path)
@@ -157,7 +256,6 @@ static coro::CoroTask<void> process_single_file(const std::string& file_path,
         co_return;
     }
 
-    // Process each candidate chunk
     auto& candidates = build_output.candidates;
     co_await fctx.scope([&file_path, &index_path, &vctx, &candidates](
                             CoroScope& chunk_scope) -> coro::CoroTask<void> {
@@ -172,23 +270,21 @@ static coro::CoroTask<void> process_single_file(const std::string& file_path,
     });
 }
 
-static coro::CoroTask<int> run_view(argparse::ArgumentParser& program) {
-    std::string directory = program.get<std::string>("--directory");
-    std::string index_dir = program.get<std::string>("--index-dir");
-    std::string preset = program.get<std::string>("--preset");
-    std::string recipe_path = program.get<std::string>("--recipe");
-    std::string save_recipe = program.get<std::string>("--save-recipe");
-    std::string output_path = program.get<std::string>("--output");
-    std::string time_range_str = program.get<std::string>("--time-range");
-    double min_duration = program.get<double>("--min-duration");
-    double max_duration = program.get<double>("--max-duration");
-    bool stream_mode = program.get<bool>("--stream");
-    bool no_metadata = program.get<bool>("--no-metadata");
-    bool no_auto_index = program.get<bool>("--no-auto-index");
-    std::size_t checkpoint_size = program.get<std::size_t>("--checkpoint-size");
-    std::size_t executor_threads =
-        program.get<std::size_t>("--executor-threads");
-    auto query_str = program.get<std::string>("--query");
+static coro::CoroTask<int> run_view(const ViewArgParse* cli) {
+    const auto& directory = cli->directory.value;
+    const auto& index_dir = cli->indexing.index_dir;
+    const auto& preset = cli->preset;
+    const auto& recipe_path = cli->recipe;
+    const auto& save_recipe = cli->save_recipe;
+    const auto& output_path = cli->output;
+    const auto& time_range_str = cli->time_range;
+    const auto min_duration = cli->min_duration;
+    const auto max_duration = cli->max_duration;
+    const auto stream_mode = cli->stream;
+    const auto no_metadata = cli->no_metadata;
+    const auto no_auto_index = cli->no_auto_index;
+    const auto checkpoint_size = cli->indexing.checkpoint_size;
+    const auto& query_str = cli->query_args.query;
 
     ViewDefinition view;
 
@@ -285,7 +381,6 @@ static coro::CoroTask<int> run_view(argparse::ArgumentParser& program) {
     if (!view.query) {
         DFTRACER_UTILS_LOG_ERROR(
             "%s", "No view specified. Use --preset, --recipe, or --query.");
-        std::cerr << program;
         co_return 1;
     }
 
@@ -319,12 +414,11 @@ static coro::CoroTask<int> run_view(argparse::ArgumentParser& program) {
             co_return 1;
         }
     } else {
-        files = program.get<std::vector<std::string>>("--files");
+        files = cli->files_args.value;
 
         if (files.empty()) {
             DFTRACER_UTILS_LOG_ERROR(
                 "%s", "No files or directory specified. Use --help for usage.");
-            std::cerr << program;
             co_return 1;
         }
     }
@@ -388,10 +482,8 @@ static coro::CoroTask<int> run_view(argparse::ArgumentParser& program) {
                      &indexed_count,
                      &failed_count};
 
-    auto pipeline_config = PipelineConfig()
-                               .with_name("DFTracer View")
-                               .with_compute_threads(executor_threads)
-                               .with_watchdog(false);
+    auto pipeline_config =
+        cli::build_pipeline_config("DFTracer View", cli->pipeline);
 
     Pipeline pipeline(pipeline_config);
 
@@ -402,19 +494,7 @@ static coro::CoroTask<int> run_view(argparse::ArgumentParser& program) {
         [files_needing_index_ptr, files_ptr,
          &vctx](CoroScope& ctx) -> coro::CoroTask<void> {
             if (!files_needing_index_ptr->empty()) {
-                co_await ctx.scope([files_needing_index_ptr,
-                                    &vctx](CoroScope& scope)
-                                       -> coro::CoroTask<void> {
-                    for (std::size_t i = 0; i < files_needing_index_ptr->size();
-                         ++i) {
-                        const auto file_path = (*files_needing_index_ptr)[i];
-                        scope.spawn([file_path, &vctx](CoroScope& fctx)
-                                        -> coro::CoroTask<void> {
-                            co_await index_single_file(file_path, vctx, fctx);
-                        });
-                    }
-                    co_return;
-                });
+                co_await batch_index_files(*files_needing_index_ptr, vctx, ctx);
 
                 std::printf("Auto-indexing complete: %zu indexed, %zu failed\n",
                             vctx.indexed_count->load(),
@@ -481,97 +561,12 @@ int main(int argc, char** argv) {
         "indices for efficient chunk-skipping. Supports predefined views "
         "(io, compute, dlio), custom recipes, and inline queries.");
 
-    // Input files
-    program.add_argument("--files")
-        .help("Trace files to process (.pfw, .pfw.gz)")
-        .nargs(argparse::nargs_pattern::any)
-        .default_value<std::vector<std::string>>({});
-
-    program.add_argument("-d", "--directory")
-        .help("Directory containing trace files")
-        .default_value<std::string>("");
-
-    // View specification
-    program.add_argument("--preset")
-        .help("Predefined view: io, compute, dlio")
-        .default_value<std::string>("");
-
-    program.add_argument("--recipe")
-        .help("Custom view JSON file path")
-        .default_value<std::string>("");
-
-    program.add_argument("--save-recipe")
-        .help("Save the constructed view to a JSON file")
-        .default_value<std::string>("");
-
-    program.add_argument("--query")
-        .help("Query DSL filter (e.g., 'cat == \"POSIX\" and dur > 1000')")
-        .default_value<std::string>("");
-
-    // Event-level filters
-    program.add_argument("--time-range")
-        .help(
-            "Timestamp filter as min,max in microseconds (e.g., "
-            "1000000,2000000)")
-        .default_value<std::string>("");
-
-    program.add_argument("--min-duration")
-        .help("Minimum event duration in microseconds")
-        .scan<'g', double>()
-        .default_value(static_cast<double>(0.0));
-
-    program.add_argument("--max-duration")
-        .help("Maximum event duration in microseconds")
-        .scan<'g', double>()
-        .default_value(static_cast<double>(0.0));
-
-    // Output
-    program.add_argument("-o", "--output")
-        .help("Output file path (default: stdout)")
-        .default_value<std::string>("");
-
-    program.add_argument("--stream")
-        .help("Stream matching events to stdout as NDJSON")
-        .flag();
-
-    program.add_argument("--no-metadata")
-        .help("Exclude metadata events (ph=M) from output")
-        .flag();
-
-    // Indexing options
-    program.add_argument("--index-dir")
-        .help("Directory where .dftindex stores are created")
-        .default_value<std::string>("");
-
-    program.add_argument("--no-auto-index")
-        .help("Disable automatic index building for files missing .dftindex")
-        .flag();
-
-    program.add_argument("--checkpoint-size")
-        .help("Checkpoint size for auto-indexing in bytes (default: " +
-              std::to_string(
-                  indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE) +
-              ")")
-        .scan<'d', std::size_t>()
-        .default_value(static_cast<std::size_t>(
-            indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE));
-
-    program.add_argument("--executor-threads")
-        .help("Number of worker threads")
-        .scan<'d', std::size_t>()
-        .default_value(
-            static_cast<std::size_t>(dftracer_utils_hardware_concurrency()));
-
-    try {
-        program.parse_args(argc, argv);
-    } catch (const std::exception& err) {
-        DFTRACER_UTILS_LOG_ERROR("Error: %s", err.what());
-        std::cerr << program;
-        return 1;
-    }
+    ViewArgParse cli(program);
+    cli.setup();
+    if (!cli.parse(argc, argv)) return 1;
 
     try {
-        return run_view(program).get();
+        return run_view(&cli).get();
     } catch (const std::exception& e) {
         DFTRACER_UTILS_LOG_ERROR("Fatal: %s", e.what());
         return 1;
diff --git a/src/dftracer/utils/core/common/inflater.h b/src/dftracer/utils/core/common/inflater.h
index 73449e36..f966ac8f 100644
--- a/src/dftracer/utils/core/common/inflater.h
+++ b/src/dftracer/utils/core/common/inflater.h
@@ -22,8 +22,13 @@ class Inflater {
         constants::indexer::INFLATE_BUFFER_SIZE;
 
     z_stream stream;
-    alignas(DFTRACER_OPTIMAL_ALIGNMENT) unsigned char out_buffer[BUFFER_SIZE];
-    alignas(DFTRACER_OPTIMAL_ALIGNMENT) unsigned char in_buffer[BUFFER_SIZE];
+    alignas(DFTRACER_OPTIMAL_ALIGNMENT) unsigned char out_buffer_[BUFFER_SIZE];
+    alignas(DFTRACER_OPTIMAL_ALIGNMENT) unsigned char in_buffer_[BUFFER_SIZE];
+
+    unsigned char* out_buffer() { return out_buffer_; }
+    const unsigned char* out_buffer() const { return out_buffer_; }
+    unsigned char* in_buffer() { return in_buffer_; }
+    const unsigned char* in_buffer() const { return in_buffer_; }
 
    protected:
     int window_bits_;
@@ -31,8 +36,8 @@ class Inflater {
    public:
     Inflater() : window_bits_(constants::indexer::ZLIB_GZIP_WINDOW_BITS) {
         std::memset(&stream, 0, sizeof(stream));
-        std::memset(out_buffer, 0, sizeof(out_buffer));
-        std::memset(in_buffer, 0, sizeof(in_buffer));
+        std::memset(out_buffer_, 0, BUFFER_SIZE);
+        std::memset(in_buffer_, 0, BUFFER_SIZE);
     }
 
     virtual ~Inflater() { inflateEnd(&stream); }
@@ -88,11 +93,10 @@ class Inflater {
     }
 
     coro::CoroTask<bool> read_input(int fd, off_t& offset) {
-        ssize_t n =
-            co_await io::pread(fd, in_buffer, sizeof(in_buffer), offset);
+        ssize_t n = co_await io::pread(fd, in_buffer(), BUFFER_SIZE, offset);
         if (n > 0) {
             offset += n;
-            stream.next_in = in_buffer;
+            stream.next_in = in_buffer();
             stream.avail_in = static_cast<uInt>(n);
             co_return true;
         } else if (n < 0) {
@@ -104,13 +108,14 @@ class Inflater {
     }
 
     std::size_t get_output(unsigned char* buf, std::size_t len) {
-        std::size_t available = sizeof(out_buffer) - stream.avail_out;
+        std::size_t available = BUFFER_SIZE - stream.avail_out;
         std::size_t to_copy = std::min(len, available);
-        std::memcpy(buf, out_buffer, to_copy);
+        std::memcpy(buf, out_buffer(), to_copy);
 
         // Shift remaining data
         if (to_copy < available) {
-            std::memmove(out_buffer, out_buffer + to_copy, available - to_copy);
+            std::memmove(out_buffer(), out_buffer() + to_copy,
+                         available - to_copy);
         }
 
         return to_copy;
@@ -129,8 +134,8 @@ class Inflater {
             return NEED_INPUT;
         }
 
-        stream.next_out = out_buffer;
-        stream.avail_out = sizeof(out_buffer);
+        stream.next_out = out_buffer();
+        stream.avail_out = BUFFER_SIZE;
 
         int ret = inflate(&stream, flush_mode);
 
@@ -150,7 +155,7 @@ class Inflater {
     }
 
     bool needs_input() const { return stream.avail_in == 0; }
-    bool has_output() const { return stream.avail_out < sizeof(out_buffer); }
+    bool has_output() const { return stream.avail_out < BUFFER_SIZE; }
     int get_data_type() const { return stream.data_type; }
     std::size_t get_avail_in() const { return stream.avail_in; }
     std::size_t get_avail_out() const { return stream.avail_out; }
diff --git a/src/dftracer/utils/core/common/memory_budget.cpp b/src/dftracer/utils/core/common/memory_budget.cpp
new file mode 100644
index 00000000..8dc778af
--- /dev/null
+++ b/src/dftracer/utils/core/common/memory_budget.cpp
@@ -0,0 +1,206 @@
+#include <dftracer/utils/core/common/memory_budget.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils {
+
+static constexpr std::size_t FALLBACK_AVAILABLE_BYTES =
+    1ULL * 1024 * 1024 * 1024;  // 1 GiB, returned when every probe yields 0
+
+// Leading decimal integer stored in `path`; 0 when unreadable, empty, "max",
+// or not numeric.
+static std::size_t read_size_from_file(const char *path) {
+    FILE *fp = std::fopen(path, "r");
+    if (fp == nullptr) return 0;
+    char text[64];
+    const std::size_t len = std::fread(text, 1, sizeof(text) - 1, fp);
+    std::fclose(fp);
+    text[len] = '\0';
+    if (len == 0 || std::strncmp(text, "max", 3) == 0) return 0;
+    char *stop = nullptr;
+    const unsigned long long parsed = std::strtoull(text, &stop, 10);
+    return stop == text ? 0 : static_cast<std::size_t>(parsed);
+}
+
+static constexpr std::size_t CGROUP_LIMIT_SENTINEL =
+    1ULL * 1024 * 1024 * 1024 * 1024;  // 1 TiB; limits >= this are ignored
+
+// Reads /proc/self/cgroup: the "0::<path>" entry goes to v2_path, the entry
+// whose controller list names "memory" goes to v1_path; others are ignored.
+static void read_self_cgroup_paths(std::string &v2_path, std::string &v1_path) {
+    FILE *fp = std::fopen("/proc/self/cgroup", "r");
+    if (fp == nullptr) return;
+    char row[1024];
+    while (std::fgets(row, sizeof(row), fp)) {
+        std::size_t len = std::strlen(row);
+        while (len > 0 && (row[len - 1] == '\n' || row[len - 1] == '\r'))
+            row[--len] = '\0';
+        if (std::strncmp(row, "0::", 3) == 0) {
+            v2_path = row + 3;
+            continue;
+        }
+        // Remaining entries are parsed as "<id>:<ctrl>[,<ctrl>...]:<path>".
+        const char *first = std::strchr(row, ':');
+        if (first == nullptr) continue;
+        const char *second = std::strchr(first + 1, ':');
+        if (second == nullptr) continue;
+        const std::string ctrl_list(first + 1, second);
+        for (std::size_t pos = 0;;) {
+            const std::size_t comma = ctrl_list.find(',', pos);
+            const std::size_t stop = std::min(comma, ctrl_list.size());
+            if (ctrl_list.compare(pos, stop - pos, "memory") == 0) {
+                v1_path = second + 1;
+                break;
+            }
+            if (comma == std::string::npos) break;
+            pos = comma + 1;
+        }
+    }
+    std::fclose(fp);
+}
+
+// Remaining bytes under the nearest finite cgroup-v2 limit: walks from
+// /sys/fs/cgroup<cg_path> toward the mount root and returns
+// memory.max - memory.current (0 if already at the limit or none is found).
+static std::size_t cgroup_v2_limit_at(const std::string &cg_path) {
+    const std::size_t root_len = std::strlen("/sys/fs/cgroup");
+    for (std::string dir = "/sys/fs/cgroup" + cg_path;;) {
+        const std::size_t limit =
+            read_size_from_file((dir + "/memory.max").c_str());
+        if (limit > 0 && limit < CGROUP_LIMIT_SENTINEL) {
+            const std::size_t used =
+                read_size_from_file((dir + "/memory.current").c_str());
+            return used < limit ? limit - used : 0;
+        }
+        // No finite limit here; retry in the parent unless at the mount root.
+        if (dir.size() <= root_len) return 0;
+        const std::size_t cut = dir.find_last_of('/');
+        if (cut == std::string::npos || cut < root_len) return 0;
+        dir.resize(cut);
+    }
+}
+
+// Remaining bytes under the nearest finite cgroup-v1 memory limit: walks from
+// /sys/fs/cgroup/memory<cg_path> toward the controller root and returns
+// memory.limit_in_bytes - memory.usage_in_bytes (0 if exhausted or not found).
+static std::size_t cgroup_v1_limit_at(const std::string &cg_path) {
+    const std::size_t root_len = std::strlen("/sys/fs/cgroup/memory");
+    for (std::string dir = "/sys/fs/cgroup/memory" + cg_path;;) {
+        const std::size_t limit =
+            read_size_from_file((dir + "/memory.limit_in_bytes").c_str());
+        if (limit > 0 && limit < CGROUP_LIMIT_SENTINEL) {
+            const std::size_t used =
+                read_size_from_file((dir + "/memory.usage_in_bytes").c_str());
+            return used < limit ? limit - used : 0;
+        }
+        // No finite limit at this level; move to the parent directory unless
+        // the controller root has already been checked.
+        if (dir.size() <= root_len) return 0;
+        const std::size_t cut = dir.find_last_of('/');
+        if (cut == std::string::npos || cut < root_len) return 0;
+        dir.resize(cut);
+    }
+}
+
+// cgroup-v2 probe: uses this process's v2 path, or "/" when none is listed.
+static std::size_t try_cgroups_v2() {
+    std::string unified, legacy;
+    read_self_cgroup_paths(unified, legacy);
+    return cgroup_v2_limit_at(unified.empty() ? std::string("/") : unified);
+}
+
+// cgroup-v1 probe: uses this process's memory path, or "/" when none is listed.
+static std::size_t try_cgroups_v1() {
+    std::string unified, legacy;
+    read_self_cgroup_paths(unified, legacy);
+    return cgroup_v1_limit_at(legacy.empty() ? std::string("/") : legacy);
+}
+
+// MemAvailable from /proc/meminfo scaled by 1024 (the field is listed in kB);
+// 0 when the file or the field is missing.
+static std::size_t try_proc_meminfo() {
+    FILE *fp = std::fopen("/proc/meminfo", "r");
+    if (fp == nullptr) return 0;
+    std::size_t bytes = 0;
+    char row[256];
+    while (std::fgets(row, sizeof(row), fp)) {
+        if (std::strncmp(row, "MemAvailable:", 13) != 0) continue;
+        // strtoull itself skips the padding spaces in front of the number.
+        const unsigned long long kib = std::strtoull(row + 13, nullptr, 10);
+        bytes = static_cast<std::size_t>(kib) * 1024;
+        break;
+    }
+    std::fclose(fp);
+    return bytes;
+}
+
+// Probes cgroup v2, then cgroup v1, then /proc/meminfo and returns the first
+// non-zero answer, or FALLBACK_AVAILABLE_BYTES if all report 0.
+// NOTE(review): a cgroup already at its limit also reports 0 and falls through.
+std::size_t detect_available_memory() {
+    for (auto *probe : {&try_cgroups_v2, &try_cgroups_v1, &try_proc_meminfo}) {
+        if (const std::size_t avail = probe()) return avail;
+    }
+    return FALLBACK_AVAILABLE_BYTES;
+}
+
+std::size_t compute_memory_budget(std::size_t user_override_bytes) {
+    if (user_override_bytes != 0) return user_override_bytes;  // override wins
+    std::size_t share = detect_available_memory();  // else derive from the host
+    share = share * DEFAULT_MEMORY_BUDGET_FRACTION_PERCENT / 100;
+    return std::max(share, MIN_MEMORY_BUDGET_BYTES);  // never below the floor
+}
+
+// Batches that fit in the budget, but never fewer than max(2 * workers, 4).
+std::size_t compute_channel_capacity(std::size_t memory_budget_bytes,
+                                     std::size_t estimated_batch_bytes,
+                                     std::size_t num_workers) {
+    const std::size_t unit = estimated_batch_bytes ? estimated_batch_bytes : 1;
+    return std::max({memory_budget_bytes / unit, num_workers * 2,
+                     std::size_t(4)});
+}
+
+// Files that fit in the budget, but never fewer than max(min_files, 1).
+std::size_t compute_file_batch_size(std::size_t memory_budget_bytes,
+                                    std::size_t estimated_file_bytes,
+                                    std::size_t min_files) {
+    const std::size_t unit = estimated_file_bytes ? estimated_file_bytes : 1;
+    return std::max({memory_budget_bytes / unit, min_files, std::size_t(1)});
+}
+
+// Estimates peak memory per file: user override if non-zero, else the ~p95 of
+// a strided sample of non-zero sizes times PER_FILE_EXPANSION_FACTOR, clamped
+// to [MIN_PER_FILE_PEAK_BYTES, MAX_PER_FILE_PEAK_BYTES].
+std::size_t estimate_per_file_bytes(const std::vector<std::size_t> &file_sizes,
+                                    std::size_t user_override_bytes) {
+    if (user_override_bytes > 0) return user_override_bytes;
+    if (file_sizes.empty()) return MIN_PER_FILE_PEAK_BYTES;
+
+    const std::size_t total = file_sizes.size();
+    const std::size_t sample_count =
+        std::max(std::min(total, PER_FILE_SAMPLE_LIMIT), std::size_t(1));
+    // Round the stride up so the samples span the whole list; rounding down
+    // would stop at sample_count entries and never look at the tail.
+    const std::size_t stride = (total + sample_count - 1) / sample_count;
+
+    std::vector<std::size_t> sizes;
+    sizes.reserve(sample_count);
+    for (std::size_t i = 0; i < total; i += stride) {
+        if (file_sizes[i] > 0) sizes.push_back(file_sizes[i]);
+    }
+    if (sizes.empty()) return MIN_PER_FILE_PEAK_BYTES;
+
+    const std::size_t idx = std::min(sizes.size() * 95 / 100, sizes.size() - 1);
+    std::nth_element(sizes.begin(), sizes.begin() + idx, sizes.end());
+    const std::size_t estimate = sizes[idx] * PER_FILE_EXPANSION_FACTOR;
+    return std::min(std::max(estimate, MIN_PER_FILE_PEAK_BYTES),
+                    MAX_PER_FILE_PEAK_BYTES);
+}
+
+}  // namespace dftracer::utils
diff --git a/src/dftracer/utils/core/io/io_backend_factory.cpp b/src/dftracer/utils/core/io/io_backend_factory.cpp
index 01f5207b..6d17f3b1 100644
--- a/src/dftracer/utils/core/io/io_backend_factory.cpp
+++ b/src/dftracer/utils/core/io/io_backend_factory.cpp
@@ -21,17 +21,16 @@ std::unique_ptr<IoBackend> create_io_backend(Executor& executor,
                                              unsigned batch_threshold) {
     // Explicit backend selection (non-AUTO).
     if (backend_type == IoBackendType::THREADPOOL) {
-        DFTRACER_UTILS_LOG_DEBUG(
-            "I/O backend: using threadpool (%zu threads, forced)", pool_size);
+        DFTRACER_UTILS_LOG_INFO("I/O backend: using threadpool (%zu threads)",
+                                pool_size);
         return std::make_unique<ThreadPoolBackend>(executor, pool_size,
                                                    batch_threshold);
     }
 
 #ifdef __linux__
     if (backend_type == IoBackendType::EPOLL_THREADPOOL) {
-        DFTRACER_UTILS_LOG_DEBUG(
-            "I/O backend: using epoll+threadpool (%zu threads, forced)",
-            pool_size);
+        DFTRACER_UTILS_LOG_INFO(
+            "I/O backend: using epoll+threadpool (%zu threads)", pool_size);
         return std::make_unique<EpollThreadPoolBackend>(executor, pool_size,
                                                         batch_threshold);
     }
@@ -40,9 +39,8 @@ std::unique_ptr<IoBackend> create_io_backend(Executor& executor,
 #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
     defined(__NetBSD__) || defined(__DragonFly__)
     if (backend_type == IoBackendType::KQUEUE_THREADPOOL) {
-        DFTRACER_UTILS_LOG_DEBUG(
-            "I/O backend: using kqueue+threadpool (%zu threads, forced)",
-            pool_size);
+        DFTRACER_UTILS_LOG_INFO(
+            "I/O backend: using kqueue+threadpool (%zu threads)", pool_size);
         return std::make_unique<KqueueThreadPoolBackend>(executor, pool_size,
                                                          batch_threshold);
     }
@@ -53,12 +51,11 @@ std::unique_ptr<IoBackend> create_io_backend(Executor& executor,
         auto uring =
             std::make_unique<IoUringBackend>(executor, 256, batch_threshold);
         if (uring->probe()) {
-            DFTRACER_UTILS_LOG_DEBUG("%s",
-                                     "I/O backend: using io_uring (forced)");
+            DFTRACER_UTILS_LOG_INFO("%s", "I/O backend: using io_uring");
             return uring;
         }
         DFTRACER_UTILS_LOG_ERROR("%s",
-                                 "io_uring forced but runtime probe failed");
+                                 "io_uring selected but runtime probe failed");
         // Fall through to AUTO detection.
     }
 #endif
@@ -69,28 +66,28 @@ std::unique_ptr<IoBackend> create_io_backend(Executor& executor,
         auto uring =
             std::make_unique<IoUringBackend>(executor, 256, batch_threshold);
         if (uring->probe()) {
-            DFTRACER_UTILS_LOG_DEBUG("%s", "I/O backend: using io_uring");
+            DFTRACER_UTILS_LOG_INFO("%s", "I/O backend: using io_uring");
             return uring;
         }
-        DFTRACER_UTILS_LOG_DEBUG("%s",
-                                 "io_uring runtime probe failed, falling back");
+        DFTRACER_UTILS_LOG_INFO("%s",
+                                "io_uring runtime probe failed, falling back");
     }
 #endif
 
 #ifdef __linux__
-    DFTRACER_UTILS_LOG_DEBUG(
-        "I/O backend: using epoll+threadpool (%zu threads)", pool_size);
+    DFTRACER_UTILS_LOG_INFO("I/O backend: using epoll+threadpool (%zu threads)",
+                            pool_size);
     return std::make_unique<EpollThreadPoolBackend>(executor, pool_size,
                                                     batch_threshold);
 #elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
     defined(__NetBSD__) || defined(__DragonFly__)
-    DFTRACER_UTILS_LOG_DEBUG(
+    DFTRACER_UTILS_LOG_INFO(
         "I/O backend: using kqueue+threadpool (%zu threads)", pool_size);
     return std::make_unique<KqueueThreadPoolBackend>(executor, pool_size,
                                                      batch_threshold);
 #else
-    DFTRACER_UTILS_LOG_DEBUG("I/O backend: using threadpool (%zu threads)",
-                             pool_size);
+    DFTRACER_UTILS_LOG_INFO("I/O backend: using threadpool (%zu threads)",
+                            pool_size);
     return std::make_unique<ThreadPoolBackend>(executor, pool_size,
                                                batch_threshold);
 #endif
diff --git a/src/dftracer/utils/core/pipeline/executor.cpp b/src/dftracer/utils/core/pipeline/executor.cpp
index 12029f51..704ddd44 100644
--- a/src/dftracer/utils/core/pipeline/executor.cpp
+++ b/src/dftracer/utils/core/pipeline/executor.cpp
@@ -2,7 +2,6 @@
 #include <dftracer/utils/core/common/platform_compat.h>
 #include <dftracer/utils/core/coro/yield.h>
 #include <dftracer/utils/core/io/io_backend_factory.h>
-#include <dftracer/utils/core/io/io_thread_pool.h>
 #include <dftracer/utils/core/pipeline/executor.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
@@ -32,8 +31,6 @@ Executor* Executor::set_current(Executor* e) noexcept {
     return old;
 }
 
-io::IoThreadPool* Executor::db_pool() noexcept { return db_pool_.get(); }
-
 // Thread-local list of coroutine handles to destroy after the current
 // resume() returns.  FinalAwaiter pushes here instead of the shared
 // destroy_queue_ to avoid another worker freeing the frame while
@@ -61,15 +58,20 @@ Executor::Executor(const ExecutorConfig& config)
     : num_threads_(config.num_threads == 0
                        ? dftracer_utils_hardware_concurrency()
                        : config.num_threads),
-      last_activity_time_(std::chrono::steady_clock::now()),
+      last_activity_ns_(
+          std::chrono::steady_clock::now().time_since_epoch().count()),
       idle_timeout_(config.idle_timeout),
       deadlock_timeout_(config.deadlock_timeout),
-      io_pool_size_(config.io_pool_size),
+      io_pool_size_(config.io_pool_size == 0
+                        ? dftracer_utils_hardware_concurrency()
+                        : config.io_pool_size),
       io_backend_type_(config.io_backend_type),
-      io_batch_threshold_(config.io_batch_threshold),
-      db_pool_size_(config.db_pool_size) {
+      io_batch_threshold_(config.io_batch_threshold) {
     if (num_threads_ == 0) {
-        num_threads_ = 2;  // Fallback if hardware_concurrency returns 0
+        num_threads_ = 2;
+    }
+    if (io_pool_size_ == 0) {
+        io_pool_size_ = 2;
     }
     DFTRACER_UTILS_LOG_DEBUG(
         "Executor created with %zu threads, idle_timeout=%lld s, "
@@ -100,9 +102,6 @@ void Executor::start() {
                                         io_batch_threshold_);
     io_backend_->start();
 
-    db_pool_ = std::make_unique<io::IoThreadPool>(db_pool_size_);
-    db_pool_->start();
-
     // Create all worker contexts first so workers_ is stable before any
     // worker thread can try to iterate/steal from it.
     for (std::size_t i = 0; i < num_threads_; ++i) {
@@ -143,11 +142,6 @@ void Executor::shutdown() {
     // completion thread may still call enqueue() -> wake_all_workers()
     // which accesses WorkerContext cv/mutex, so workers_ must remain
     // alive until the completion thread has exited.
-    if (db_pool_) {
-        db_pool_->stop();
-        db_pool_.reset();
-    }
-
     if (io_backend_) {
         io_backend_->stop();
         io_backend_.reset();
@@ -258,12 +252,7 @@ void Executor::worker_thread(WorkerContext* context) {
                 }
             }
             drain_destroy_queue();
-            std::unique_lock<std::mutex> lock(context->queue_mutex);
-            context->cv.wait(lock, [this, observed_signal] {
-                return !running_.load(std::memory_order_acquire) ||
-                       work_signal_.load(std::memory_order_acquire) !=
-                           observed_signal;
-            });
+            work_signal_.wait(observed_signal, std::memory_order_acquire);
         }
     }
 
@@ -308,33 +297,11 @@ void Executor::signal_global_work() {
     wake_one_worker();
 }
 
-void Executor::wake_one_worker() {
-    const std::size_t worker_count = workers_.size();
-    if (worker_count == 0) {
-        return;
-    }
-
-    const std::size_t worker_index =
-        next_worker_.fetch_add(1, std::memory_order_relaxed) % worker_count;
-    // Lock-then-unlock the worker's mutex before notifying.
-    // This ensures the worker is either before its predicate check (and will
-    // see the updated atomic state) or inside cv.wait (and will receive the
-    // notification). Without this, a notification sent between predicate
-    // evaluation and cv.wait entry is lost, causing the worker to hang.
-    workers_[worker_index]->queue_mutex.lock();
-    workers_[worker_index]->queue_mutex.unlock();
-    workers_[worker_index]->cv.notify_one();
-}
+void Executor::wake_one_worker() { work_signal_.notify_one(); }
 
 void Executor::wake_all_workers() {
-    for (auto& worker : workers_) {
-        // Lock-then-unlock ensures the worker is either before its predicate
-        // check or inside cv.wait before the notification is sent.
-        // See wake_one_worker() for detailed rationale.
-        worker->queue_mutex.lock();
-        worker->queue_mutex.unlock();
-        worker->cv.notify_all();
-    }
+    work_signal_.fetch_add(1, std::memory_order_release);
+    work_signal_.notify_all();
 }
 
 // Helper function for when_all.h (avoids circular dependency)
@@ -382,9 +349,11 @@ bool Executor::is_responsive() const {
 
     if (active >= num_threads_) {
         // All threads busy - check if making progress
-        std::lock_guard<std::mutex> lock(activity_mutex_);
         auto now = std::chrono::steady_clock::now();
-        auto idle_time = now - last_activity_time_;
+        auto last_ns = last_activity_ns_.load(std::memory_order_acquire);
+        auto last_tp = std::chrono::steady_clock::time_point(
+            std::chrono::steady_clock::duration(last_ns));
+        auto idle_time = now - last_tp;
 
         // If all threads busy but no activity for deadlock_timeout,
         // likely deadlocked
@@ -403,8 +372,9 @@ bool Executor::is_responsive() const {
 }
 
 void Executor::mark_activity() {
-    std::lock_guard<std::mutex> lock(activity_mutex_);
-    last_activity_time_ = std::chrono::steady_clock::now();
+    last_activity_ns_.store(
+        std::chrono::steady_clock::now().time_since_epoch().count(),
+        std::memory_order_release);
 }
 
 void Executor::update_task_location(TaskIndex task_id,
diff --git a/src/dftracer/utils/core/pipeline/pipeline.cpp b/src/dftracer/utils/core/pipeline/pipeline.cpp
index b546974d..86b859a5 100644
--- a/src/dftracer/utils/core/pipeline/pipeline.cpp
+++ b/src/dftracer/utils/core/pipeline/pipeline.cpp
@@ -5,7 +5,6 @@
 #include <dftracer/utils/core/tasks/task.h>
 
 #include <any>
-#include <sstream>
 
 namespace dftracer::utils {
 
@@ -20,7 +19,6 @@ Pipeline::Pipeline(const PipelineConfig& config)
     exec_cfg.io_pool_size = config.io_thread_count;
     exec_cfg.io_backend_type = config.io_backend_type;
     exec_cfg.io_batch_threshold = config.io_batch_threshold;
-    exec_cfg.db_pool_size = config.db_pool_size;
 
     std::unique_ptr<Watchdog> watchdog;
     if (config.enable_watchdog) {
diff --git a/src/dftracer/utils/core/rocksdb/async.cpp b/src/dftracer/utils/core/rocksdb/async.cpp
deleted file mode 100644
index 6d3b016b..00000000
--- a/src/dftracer/utils/core/rocksdb/async.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#include <dftracer/utils/core/io/io_thread_pool.h>
-#include <dftracer/utils/core/pipeline/executor.h>
-#include <dftracer/utils/core/rocksdb/async.h>
-
-namespace dftracer::utils::rocksdb {
-
-io::IoThreadPool* get_db_pool() {
-    auto* exec = Executor::current();
-    if (exec == nullptr) {
-        return nullptr;
-    }
-    return exec->db_pool();
-}
-
-void db_async_submit(io::IoThreadPool* pool, std::function<void()> fn) {
-    pool->submit(std::move(fn));
-}
-
-void db_async_resume_on(void* executor, std::coroutine_handle<> h) {
-    auto* exec = static_cast<Executor*>(executor);
-    if (exec != nullptr) {
-        exec->enqueue(h);
-    } else {
-        h.resume();
-    }
-}
-
-void* get_current_executor_opaque() {
-    return static_cast<void*>(Executor::current());
-}
-
-}  // namespace dftracer::utils::rocksdb
diff --git a/src/dftracer/utils/core/rocksdb/database.cpp b/src/dftracer/utils/core/rocksdb/database.cpp
index 1b227a67..63be75a2 100644
--- a/src/dftracer/utils/core/rocksdb/database.cpp
+++ b/src/dftracer/utils/core/rocksdb/database.cpp
@@ -3,6 +3,7 @@
 #include <dftracer/utils/core/rocksdb/database.h>
 #include <dftracer/utils/core/rocksdb/filesystem.h>
 #include <rocksdb/slice.h>
+#include <rocksdb/table.h>
 
 #include <algorithm>
 #include <atomic>
@@ -25,7 +26,11 @@ const ::rocksdb::ReadOptions& read_options() {
 }
 
 const ::rocksdb::WriteOptions& write_options() {
-    static const ::rocksdb::WriteOptions options;
+    static const auto options = [] {
+        ::rocksdb::WriteOptions wo;
+        wo.disableWAL = true;
+        return wo;
+    }();
     return options;
 }
 
@@ -79,10 +84,8 @@ RocksDatabase& RocksDatabase::operator=(RocksDatabase&& other) noexcept {
     return *this;
 }
 
-std::vector<std::string> RocksDatabase::default_column_families() {
-    return {"default",    "checkpoints", "metadata",   "chunk_bloom",
-            "file_bloom", "chunk_stats", "dimensions", "chunk_dim_stats",
-            "manifest",   "provenance",  "archives",   "tar_files"};
+const decltype(cf::ALL)& RocksDatabase::default_column_families() {
+    return cf::ALL;
 }
 
 ::rocksdb::Options RocksDatabase::default_options() {
@@ -92,13 +95,40 @@ ::rocksdb::Options RocksDatabase::default_options() {
     options.allow_concurrent_memtable_write = true;
     options.enable_pipelined_write = true;
     options.max_open_files = Env::rocksdb_max_open_files();
+    options.max_background_jobs = 8;
+    options.max_subcompactions = 8;
+    options.write_buffer_size = 256 * 1024 * 1024;
+    options.max_write_buffer_number = 4;
     return options;
 }
 
 ::rocksdb::ColumnFamilyOptions RocksDatabase::default_column_family_options() {
     ::rocksdb::ColumnFamilyOptions options;
+
+    ::rocksdb::BlockBasedTableOptions bbt;
+    bbt.block_size = 32 * 1024;
+    bbt.format_version = 5;
+    bbt.index_block_restart_interval = 16;
+    options.table_factory.reset(::rocksdb::NewBlockBasedTableFactory(bbt));
+
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+    options.compression = ::rocksdb::kZSTD;
+    options.compression_opts.level = 9;
+    options.compression_opts.max_dict_bytes = 262144;
+    options.compression_opts.zstd_max_train_bytes = 1048576;
+    options.compression_opts.enabled = true;
+    options.bottommost_compression = ::rocksdb::kZSTD;
+    options.bottommost_compression_opts.level = 9;
+    options.bottommost_compression_opts.max_dict_bytes = 262144;
+    options.bottommost_compression_opts.zstd_max_train_bytes = 1048576;
+    options.bottommost_compression_opts.enabled = true;
+#elif defined(DFTRACER_UTILS_ENABLE_LZ4)
     options.compression = ::rocksdb::kLZ4Compression;
     options.bottommost_compression = ::rocksdb::kZlibCompression;
+#else
+    options.compression = ::rocksdb::kZlibCompression;
+    options.bottommost_compression = ::rocksdb::kZlibCompression;
+#endif
     return options;
 }
 
@@ -131,14 +161,17 @@ bool RocksDatabase::open(const std::string& db_path, OpenMode open_mode) {
                 "Failed to list RocksDB column families at '" + db_path_ +
                 "': " + list_status.ToString());
         }
-        column_family_names = default_column_families();
+        column_family_names.reserve(default_column_families().size());
+        for (auto name : default_column_families()) {
+            column_family_names.emplace_back(name);
+        }
     } else {
         if (open_mode_ == OpenMode::ReadWrite) {
             for (const auto& name : default_column_families()) {
                 if (std::find(column_family_names.begin(),
                               column_family_names.end(),
                               name) == column_family_names.end()) {
-                    column_family_names.push_back(name);
+                    column_family_names.emplace_back(name);
                 }
             }
         }
@@ -147,16 +180,22 @@ bool RocksDatabase::open(const std::string& db_path, OpenMode open_mode) {
     std::vector<::rocksdb::ColumnFamilyDescriptor> descriptors;
     descriptors.reserve(column_family_names.size());
     for (const auto& name : column_family_names) {
-        descriptors.emplace_back(name, cf_options);
+        auto opts = cf_options;
+        if (cf_options_override_) {
+            cf_options_override_(name, opts);
+        }
+        descriptors.emplace_back(name, opts);
     }
 
     std::vector<::rocksdb::ColumnFamilyHandle*> handles;
-    auto status =
-        open_mode_ == OpenMode::ReadOnly
-            ? ::rocksdb::DB::OpenForReadOnly(db_options, db_path_, descriptors,
-                                             &handles, &db_, false)
-            : ::rocksdb::DB::Open(db_options, db_path_, descriptors, &handles,
-                                  &db_);
+    ::rocksdb::Status status;
+    if (open_mode_ == OpenMode::ReadOnly) {
+        status = ::rocksdb::DB::OpenForReadOnly(
+            db_options, db_path_, descriptors, &handles, &db_, false);
+    } else {
+        status = ::rocksdb::DB::Open(db_options, db_path_, descriptors,
+                                     &handles, &db_);
+    }
     if (!status.ok()) {
         cleanup_failed_open(db_, handles);
         throw std::runtime_error("Failed to open RocksDB at '" + db_path_ +
@@ -215,7 +254,7 @@ ::rocksdb::DB* RocksDatabase::get() const noexcept { return db_; }
 
 ::rocksdb::ColumnFamilyHandle* RocksDatabase::column_family_handle(
     std::string_view column_family) const {
-    const auto name = column_family.empty() ? std::string("default")
+    const auto name = column_family.empty() ? std::string(cf::DEFAULT)
                                             : std::string(column_family);
     const auto it = column_families_.find(name);
     if (it == column_families_.end() || it->second == nullptr) {
@@ -238,12 +277,42 @@ ::rocksdb::Status RocksDatabase::get(std::string_view key, std::string* value,
                     ::rocksdb::Slice(key.data(), key.size()), value);
 }
 
+::rocksdb::Status RocksDatabase::merge(std::string_view key,
+                                       std::string_view value,
+                                       std::string_view column_family) {
+    const ::rocksdb::Slice operand(value.data(), value.size());
+    return db_->Merge(write_options(), column_family_handle(column_family),
+                      ::rocksdb::Slice(key.data(), key.size()), operand);
+}
+
+void RocksDatabase::set_cf_options_override(CfOptionsOverride override) {
+    cf_options_override_ = std::move(override);
+}
+
+::rocksdb::Status RocksDatabase::merge(Batch& batch,
+                                       std::string_view column_family,
+                                       std::string_view key,
+                                       std::string_view value) {
+    const ::rocksdb::Slice operand(value.data(), value.size());
+    return batch.Merge(column_family_handle(column_family),
+                       ::rocksdb::Slice(key.data(), key.size()), operand);
+}
+
 ::rocksdb::Status RocksDatabase::del(std::string_view key,
                                      std::string_view column_family) {
     return db_->Delete(write_options(), column_family_handle(column_family),
                        ::rocksdb::Slice(key.data(), key.size()));
 }
 
+::rocksdb::Status RocksDatabase::delete_range(std::string_view begin_key,
+                                              std::string_view end_key,
+                                              std::string_view column_family) {
+    const ::rocksdb::Slice begin_slice(begin_key.data(), begin_key.size());
+    const ::rocksdb::Slice end_slice(end_key.data(), end_key.size());
+    auto* const handle = column_family_handle(column_family);
+    return db_->DeleteRange(write_options(), handle, begin_slice, end_slice);
+}
+
 ::rocksdb::Status RocksDatabase::put(Batch& batch,
                                      std::string_view column_family,
                                      std::string_view key,
@@ -272,4 +341,27 @@ std::unique_ptr<::rocksdb::Iterator> RocksDatabase::new_iterator(
         db_->NewIterator(read_options(), column_family_handle(column_family)));
 }
 
+::rocksdb::Status RocksDatabase::compact(std::string_view column_family) {
+    ::rocksdb::CompactRangeOptions range_options;
+    range_options.max_subcompactions = 8;
+    auto* const handle = column_family_handle(column_family);
+    return db_->CompactRange(range_options, handle, nullptr, nullptr);
+}
+
+::rocksdb::Status RocksDatabase::ingest_external_files(
+    std::string_view column_family,
+    const std::vector<std::string>& external_files, bool ingest_behind) {
+    // Nothing to ingest: report success without touching the database.
+    if (external_files.empty()) return ::rocksdb::Status::OK();
+
+    ::rocksdb::IngestExternalFileOptions ingest_options;
+    ingest_options.move_files = false;
+    ingest_options.snapshot_consistency = true;
+    ingest_options.allow_global_seqno = true;
+    ingest_options.allow_blocking_flush = true;
+    ingest_options.ingest_behind = ingest_behind;
+    auto* const handle = column_family_handle(column_family);
+    return db_->IngestExternalFile(handle, external_files, ingest_options);
+}
+
 }  // namespace dftracer::utils::rocksdb
diff --git a/src/dftracer/utils/core/rocksdb/db_manager.cpp b/src/dftracer/utils/core/rocksdb/db_manager.cpp
index a9ae5c57..8c28d4ba 100644
--- a/src/dftracer/utils/core/rocksdb/db_manager.cpp
+++ b/src/dftracer/utils/core/rocksdb/db_manager.cpp
@@ -10,7 +10,8 @@ RocksDBManager& RocksDBManager::instance() {
 }
 
 std::shared_ptr<RocksDatabase> RocksDBManager::get_or_open(
-    const std::string& db_path, RocksDatabase::OpenMode open_mode) {
+    const std::string& db_path, RocksDatabase::OpenMode open_mode,
+    RocksDatabase::CfOptionsOverride cf_override) {
     for (;;) {
         bool needs_upgrade = false;
         bool do_open = false;
@@ -67,9 +68,13 @@ std::shared_ptr<RocksDatabase> RocksDBManager::get_or_open(
 
         std::shared_ptr<RocksDatabase> database;
         try {
-            database = std::make_shared<RocksDatabase>(
-                db_path,
-                needs_upgrade ? RocksDatabase::OpenMode::ReadWrite : open_mode);
+            database = std::make_shared<RocksDatabase>();
+            if (cf_override) {
+                database->set_cf_options_override(std::move(cf_override));
+            }
+            database->open(db_path, needs_upgrade
+                                        ? RocksDatabase::OpenMode::ReadWrite
+                                        : open_mode);
         } catch (...) {
             std::lock_guard<std::mutex> lock(mutex_);
             opening_.erase(db_path);
diff --git a/src/dftracer/utils/core/rocksdb/filesystem.cpp b/src/dftracer/utils/core/rocksdb/filesystem.cpp
index 1d31f791..d0787ede 100644
--- a/src/dftracer/utils/core/rocksdb/filesystem.cpp
+++ b/src/dftracer/utils/core/rocksdb/filesystem.cpp
@@ -607,12 +607,12 @@ class DfTracerFileSystem final : public LocalFileSystemWrapper {
 
     ~DfTracerFileSystem() override { fallback_pool_.stop(); }
 
-    static const char* kClassName() { return "DfTracerFileSystem"; }
+    static const char* class_name() { return "DfTracerFileSystem"; }
 
-    const char* Name() const override { return kClassName(); }
+    const char* Name() const override { return class_name(); }
 
     bool IsInstanceOf(const std::string& name) const override {
-        return name == kClassName() ||
+        return name == class_name() ||
                LocalFileSystemWrapper::IsInstanceOf(name);
     }
 
diff --git a/src/dftracer/utils/core/runtime.cpp b/src/dftracer/utils/core/runtime.cpp
index b09c113e..1f8b3a01 100644
--- a/src/dftracer/utils/core/runtime.cpp
+++ b/src/dftracer/utils/core/runtime.cpp
@@ -77,6 +77,14 @@ TaskHandle Runtime::submit(coro::CoroTask<void> task, std::string name) {
         p->set_value();
     };
 
+    // Set the executor on the task's promise so awaitables (e.g. channels)
+    // that capture `get_root_promise()->get_executor()` can schedule
+    // resumption. Without this, awaiters end up with executor=nullptr because
+    // the wrapping `coro::Coro` doesn't extend PromiseBase and the
+    // root-promise chain stops at the user's CoroTask.
+    if (task.handle()) {
+        task.handle().promise().set_executor(executor_.get());
+    }
     auto coro = wrapper(std::move(task), promise, executor_.get(), tid);
     TaskIndex id = executor_->enqueue_tracked(std::move(coro), name, tid);
 
@@ -142,4 +150,8 @@ void Runtime::shutdown() {
 
 std::size_t Runtime::threads() const { return threads_; }
 
+std::size_t Runtime::io_threads() const {
+    return executor_ == nullptr ? 0 : executor_->get_io_pool_size();
+}
+
 }  // namespace dftracer::utils
diff --git a/src/dftracer/utils/core/utils/timer.cpp b/src/dftracer/utils/core/utils/timer.cpp
index cb3559f0..d7e879db 100644
--- a/src/dftracer/utils/core/utils/timer.cpp
+++ b/src/dftracer/utils/core/utils/timer.cpp
@@ -1,8 +1,11 @@
 #include <dftracer/utils/core/common/logging.h>
 #include <dftracer/utils/core/utils/timer.h>
 
+#include <algorithm>
 #include <cinttypes>
 #include <cstdio>
+#include <utility>
+#include <vector>
 
 namespace dftracer::utils {
 
@@ -56,4 +59,40 @@ std::int64_t Timer::elapsed() const {
     }
 }
 
+void Timer::increment(const std::string& key, std::uint64_t by) {
+    counters_.try_emplace(key).first->second += by;
+}
+
+void Timer::set_counter(const std::string& key, std::uint64_t value) {
+    counters_.insert_or_assign(key, value);
+}
+
+const std::unordered_map<std::string, std::uint64_t>& Timer::counters() const {
+    return counters_;
+}
+
+void Timer::print_stages(const std::string& prefix) const {
+    if (counters_.empty()) return;
+
+    // Copy into a vector and sort so stages print in key-sorted order.
+    using Stage = std::pair<std::string, std::uint64_t>;
+    std::vector<Stage> stages(counters_.begin(), counters_.end());
+    std::sort(stages.begin(), stages.end());
+    std::uint64_t sum_ns = 0;
+    for (const Stage& stage : stages) sum_ns += stage.second;
+
+    if (!name_.empty()) {
+        std::printf("%s%s (%.2f ms)\n", prefix.c_str(), name_.c_str(),
+                    static_cast<double>(sum_ns) / 1e6);
+    }
+    const Stage* const final_stage = &stages.back();
+    for (const Stage& stage : stages) {
+        const char* branch = (&stage == final_stage) ? "\\-- " : "|-- ";
+        const double ms = static_cast<double>(stage.second) / 1e6;
+        const double pct = sum_ns > 0 ? 100.0 * stage.second / sum_ns : 0.0;
+        std::printf("%s%s %-28s %8.2f ms  (%5.1f%%)\n", prefix.c_str(), branch,
+                    stage.first.c_str(), ms, pct);
+    }
+}
+
 }  // namespace dftracer::utils
diff --git a/src/dftracer/utils/core/utils/timer.h b/src/dftracer/utils/core/utils/timer.h
index 950dd88f..235612e1 100644
--- a/src/dftracer/utils/core/utils/timer.h
+++ b/src/dftracer/utils/core/utils/timer.h
@@ -2,7 +2,6 @@
 #define DFTRACER_UTILS_CORE_UTILS_TIMER_H
 
 #include <chrono>
-#include <cstddef>
 #include <cstdint>
 #include <string>
 #include <unordered_map>
@@ -18,6 +17,10 @@ class Timer {
     void start();
     void stop();
     std::int64_t elapsed() const;
+    void increment(const std::string& key, std::uint64_t by = 1);
+    void set_counter(const std::string& key, std::uint64_t value);
+    const std::unordered_map<std::string, std::uint64_t>& counters() const;
+    void print_stages(const std::string& indent = "  ") const;
 
     inline const std::string& name() const { return name_; }
     inline bool is_running() const { return running_; }
@@ -48,6 +51,36 @@ class Timer {
     using Clock = std::chrono::high_resolution_clock;
     Clock::time_point start_time;
     Clock::time_point end_time;
+    std::unordered_map<std::string, std::uint64_t> counters_;
+};
+
+/// Scope-bound stage timer. Captures Clock::now() at construction and, on
+/// destruction, overwrites counter `key` of the target Timer (through
+/// set_counter) with the elapsed nanoseconds. A null Timer* records nothing.
+class ScopedTimer {
+   public:
+    ScopedTimer(Timer* timer, std::string key)
+        : timer_(timer), key_(std::move(key)), start_(Clock::now()) {}
+
+    ScopedTimer(Timer& timer, std::string key)
+        : ScopedTimer(&timer, std::move(key)) {}
+
+    ScopedTimer(const ScopedTimer&) = delete;
+    ScopedTimer& operator=(const ScopedTimer&) = delete;
+
+    ~ScopedTimer() {
+        if (timer_ == nullptr) return;
+        using std::chrono::duration_cast;
+        const auto elapsed = Clock::now() - start_;
+        auto ns = duration_cast<std::chrono::nanoseconds>(elapsed).count();
+        timer_->set_counter(key_, static_cast<std::uint64_t>(ns));
+    }
+
+   private:
+    using Clock = std::chrono::high_resolution_clock;
+    Timer* timer_;
+    std::string key_;
+    Clock::time_point start_;
 };
 
 }  // namespace dftracer::utils
diff --git a/src/dftracer/utils/python/arrow_helpers.cpp b/src/dftracer/utils/python/arrow_helpers.cpp
index 2141dc0b..9e0d1670 100644
--- a/src/dftracer/utils/python/arrow_helpers.cpp
+++ b/src/dftracer/utils/python/arrow_helpers.cpp
@@ -1,3 +1,4 @@
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 
 #define PY_SSIZE_T_CLEAN
@@ -47,6 +48,31 @@ PyObject *wrap_arrow_table(PyObject *batch_list) {
     return table;
 }
 
+// Wrap `stream_obj` in a dftracer.utils.arrow.ArrowTable instance.
+// Returns a new reference, or NULL on failure. A non-NULL `stream_obj`
+// reference is released on every path, whether or not the call succeeds.
+PyObject *wrap_arrow_stream_table(PyObject *stream_obj) {
+    if (!stream_obj) {
+        PyErr_SetString(PyExc_RuntimeError, "stream_obj is NULL");
+        return NULL;
+    }
+
+    PyObject *table = NULL;
+    PyObject *arrow_mod = PyImport_ImportModule("dftracer.utils.arrow");
+    if (arrow_mod) {
+        PyObject *table_type = PyObject_GetAttrString(arrow_mod, "ArrowTable");
+        Py_DECREF(arrow_mod);
+        if (table_type) {
+            // Call ArrowTable(stream_obj); NULL here leaves `table` NULL.
+            table = PyObject_CallFunctionObjArgs(table_type, stream_obj, NULL);
+            Py_DECREF(table_type);
+        }
+    }
+
+    Py_DECREF(stream_obj);
+    return table;
+}
+
 PyObject *arrow_result_to_table(ArrowExportResult result) {
     PyObject *capsule = wrap_arrow_result(std::move(result));
     if (!capsule) return NULL;
diff --git a/src/dftracer/utils/python/arrow_helpers.h b/src/dftracer/utils/python/arrow_helpers.h
index b6b7621f..d4957880 100644
--- a/src/dftracer/utils/python/arrow_helpers.h
+++ b/src/dftracer/utils/python/arrow_helpers.h
@@ -1,6 +1,7 @@
 #ifndef DFTRACER_UTILS_PYTHON_ARROW_HELPERS_H
 #define DFTRACER_UTILS_PYTHON_ARROW_HELPERS_H
 
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 
 #include <Python.h>
@@ -23,6 +24,11 @@ PyObject *wrap_arrow_table(PyObject *batch_list);
 /// Returns a new reference, or NULL on error.
 PyObject *arrow_result_to_table(ArrowExportResult result);
 
+/// Wrap an _ArrowBatchStream (or any __arrow_c_stream__ provider) in an
+/// ArrowTable. Always steals the reference to a non-NULL stream_obj, even
+/// on failure. Returns a new reference, or NULL on error.
+PyObject *wrap_arrow_stream_table(PyObject *stream_obj);
+
 }  // namespace dftracer::utils::python
 
 #endif  // DFTRACER_UTILS_ENABLE_ARROW
diff --git a/src/dftracer/utils/python/arrow_parallel_reader.cpp b/src/dftracer/utils/python/arrow_parallel_reader.cpp
new file mode 100644
index 00000000..2779672c
--- /dev/null
+++ b/src/dftracer/utils/python/arrow_parallel_reader.cpp
@@ -0,0 +1,212 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dftracer/utils/python/arrow_parallel_reader.h>
+#include <dftracer/utils/python/runtime.h>
+#include <dftracer/utils/python/trace_reader_iterator.h>
+#include <dftracer/utils/utilities/common/arrow/parallel_reader.h>
+
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::python {
+
+using utilities::common::arrow::ArrowExportResult;
+using utilities::common::arrow::read_arrow_files_parallel;
+
+static PyObject* py_read_arrow_files_parallel(PyObject* /*self*/,
+                                              PyObject* args,
+                                              PyObject* kwargs) {
+    // Python entry point: read_arrow_files_parallel(paths, runtime=None).
+    static const char* kwlist[] = {"paths", "runtime", nullptr};
+    PyObject* paths_obj = nullptr;
+    PyObject* runtime_obj = nullptr;
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O",
+                                     const_cast<char**>(kwlist), &paths_obj,
+                                     &runtime_obj)) {
+        return nullptr;
+    }
+
+    // Convert paths to vector<string>
+    if (!PyList_Check(paths_obj)) {
+        PyErr_SetString(PyExc_TypeError, "paths must be a list of strings");
+        return nullptr;
+    }
+    const Py_ssize_t n = PyList_Size(paths_obj);
+    std::vector<std::string> paths;
+    paths.reserve(static_cast<std::size_t>(n));
+    for (Py_ssize_t i = 0; i < n; ++i) {
+        PyObject* item = PyList_GetItem(paths_obj, i);
+        if (!PyUnicode_Check(item)) {
+            PyErr_SetString(PyExc_TypeError, "all paths must be strings");
+            return nullptr;
+        }
+        const char* utf8 = PyUnicode_AsUTF8(item);
+        if (!utf8) return nullptr;  // encoding failed; exception is set
+        paths.emplace_back(utf8);
+    }
+
+    // Get runtime
+    Runtime* runtime = nullptr;
+    if (runtime_obj && runtime_obj != Py_None) {
+        if (!PyObject_TypeCheck(runtime_obj, &RuntimeType)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "runtime must be a Runtime object");
+            return nullptr;
+        }
+        runtime = ((RuntimeObject*)runtime_obj)->runtime.get();
+    } else {
+        runtime = get_default_runtime();
+    }
+
+    // Call C++ parallel reader (releases GIL during file I/O)
+    utilities::common::arrow::ParallelReadResult result;
+    bool had_error = false;
+    std::string error_msg;
+
+    Py_BEGIN_ALLOW_THREADS try {
+        auto task = read_arrow_files_parallel(std::move(paths));
+        result = runtime->submit(std::move(task), "read_arrow_files").get();
+    } catch (const std::exception& e) {
+        had_error = true;
+        error_msg = e.what();
+    } catch (...) {
+        had_error = true;
+        error_msg = "Unknown error in read_arrow_files";
+    }
+    Py_END_ALLOW_THREADS
+
+        if (had_error) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    // Build Python result dict
+    PyObject* file_results_list = PyList_New(result.file_results.size());
+    if (!file_results_list) return nullptr;
+
+    for (std::size_t i = 0; i < result.file_results.size(); ++i) {
+        const auto& fr = result.file_results[i];
+        PyObject* fr_dict = PyDict_New();
+        if (!fr_dict) {
+            Py_DECREF(file_results_list);
+            return nullptr;
+        }
+
+        // path
+        PyObject* path_str = PyUnicode_FromString(fr.path.c_str());
+        PyDict_SetItemString(fr_dict, "path", path_str);
+        Py_DECREF(path_str);
+
+        // success
+        PyDict_SetItemString(fr_dict, "success",
+                             fr.success ? Py_True : Py_False);
+
+        // error
+        if (!fr.error.empty()) {
+            PyObject* err_str = PyUnicode_FromString(fr.error.c_str());
+            PyDict_SetItemString(fr_dict, "error", err_str);
+            Py_DECREF(err_str);
+        } else {
+            // PyDict_SetItemString does not steal, so no INCREF is needed.
+            PyDict_SetItemString(fr_dict, "error", Py_None);
+        }
+
+        // total_rows
+        PyObject* rows = PyLong_FromLongLong(fr.total_rows);
+        PyDict_SetItemString(fr_dict, "total_rows", rows);
+        Py_DECREF(rows);
+        // batches - list of ArrowBatchCapsule objects (empty if none held)
+        const std::size_t n_batches = fr.batches ? fr.batches->size() : 0;
+        PyObject* batches_list = PyList_New(n_batches);
+        if (!batches_list) {
+            Py_DECREF(fr_dict);
+            Py_DECREF(file_results_list);
+            return nullptr;
+        }
+
+        for (std::size_t j = 0; j < n_batches; ++j) {
+            ArrowBatchCapsuleObject* capsule =
+                (ArrowBatchCapsuleObject*)ArrowBatchCapsuleType.tp_alloc(
+                    &ArrowBatchCapsuleType, 0);
+            if (!capsule) {
+                Py_DECREF(batches_list);
+                Py_DECREF(fr_dict);
+                Py_DECREF(file_results_list);
+                return nullptr;
+            }
+            // Move the batch into the capsule
+            capsule->result =
+                new ArrowExportResult(std::move((*fr.batches)[j]));
+            PyList_SetItem(batches_list, j, (PyObject*)capsule);
+        }
+
+        PyDict_SetItemString(fr_dict, "batches", batches_list);
+        Py_DECREF(batches_list);
+
+        PyList_SetItem(file_results_list, i, fr_dict);
+    }
+
+    // Build final result dict
+    PyObject* result_dict = PyDict_New();
+    if (!result_dict) {
+        Py_DECREF(file_results_list);
+        return nullptr;
+    }
+
+    PyDict_SetItemString(result_dict, "file_results", file_results_list);
+    Py_DECREF(file_results_list);
+
+    PyObject* total_rows = PyLong_FromLongLong(result.total_rows);
+    PyDict_SetItemString(result_dict, "total_rows", total_rows);
+    Py_DECREF(total_rows);
+
+    PyObject* total_batches = PyLong_FromLongLong(result.total_batches);
+    PyDict_SetItemString(result_dict, "total_batches", total_batches);
+    Py_DECREF(total_batches);
+
+    PyObject* files_read = PyLong_FromSize_t(result.files_read);
+    PyDict_SetItemString(result_dict, "files_read", files_read);
+    Py_DECREF(files_read);
+
+    PyObject* files_failed = PyLong_FromSize_t(result.files_failed);
+    PyDict_SetItemString(result_dict, "files_failed", files_failed);
+    Py_DECREF(files_failed);
+
+    return result_dict;
+}
+
+static PyMethodDef arrow_parallel_reader_methods[] = {
+    {"read_arrow_files_parallel", (PyCFunction)py_read_arrow_files_parallel,
+     METH_VARARGS | METH_KEYWORDS,
+     "Read multiple Arrow IPC files in parallel using the Runtime.\n\n"
+     "Args:\n"
+     "    paths: List of file paths to read.\n"
+     "    runtime: Optional Runtime object. Uses default if not provided.\n\n"
+     "Returns:\n"
+     "    dict with:\n"
+     "        - file_results: List of per-file results, each with:\n"
+     "            - path: File path\n"
+     "            - success: True if read succeeded\n"
+     "            - error: Error message if failed, else None\n"
+     "            - total_rows: Number of rows in file\n"
+     "            - batches: List of ArrowBatch objects\n"
+     "        - total_rows: Total rows across all files\n"
+     "        - total_batches: Total batches across all files\n"
+     "        - files_read: Number of files read successfully\n"
+     "        - files_failed: Number of files that failed"},
+    {nullptr, nullptr, 0, nullptr}};
+
+int init_arrow_parallel_reader(PyObject* m) {
+    // Register every entry of arrow_parallel_reader_methods on module `m`.
+    // Yields 0 when registration succeeds and -1 when it fails.
+    const bool registered =
+        PyModule_AddFunctions(m, arrow_parallel_reader_methods) >= 0;
+    return registered ? 0 : -1;
+}
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
diff --git a/src/dftracer/utils/python/arrow_parallel_reader.h b/src/dftracer/utils/python/arrow_parallel_reader.h
new file mode 100644
index 00000000..eee0041d
--- /dev/null
+++ b/src/dftracer/utils/python/arrow_parallel_reader.h
@@ -0,0 +1,16 @@
+#ifndef DFTRACER_UTILS_PYTHON_ARROW_PARALLEL_READER_H
+#define DFTRACER_UTILS_PYTHON_ARROW_PARALLEL_READER_H
+
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <Python.h>
+
+namespace dftracer::utils::python {
+
+int init_arrow_parallel_reader(PyObject* m);
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+#endif  // DFTRACER_UTILS_PYTHON_ARROW_PARALLEL_READER_H
diff --git a/src/dftracer/utils/python/arrow_stream_capsule.cpp b/src/dftracer/utils/python/arrow_stream_capsule.cpp
new file mode 100644
index 00000000..56029e4c
--- /dev/null
+++ b/src/dftracer/utils/python/arrow_stream_capsule.cpp
@@ -0,0 +1,323 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dftracer/utils/python/arrow_stream_capsule.h>
+#include <dftracer/utils/python/batch_byte_size.h>
+#include <dftracer/utils/python/schema_reconcile.h>
+#include <nanoarrow/nanoarrow.h>
+
+#include <cerrno>
+#include <cstring>
+#include <deque>
+#include <exception>
+#include <mutex>
+#include <optional>
+#include <string>
+
+using ArrowExportResult =
+    dftracer::utils::utilities::common::arrow::ArrowExportResult;
+
+namespace {
+
+// Drain until K consecutive batches add no new columns, bounded by MAX.
+constexpr int STABLE_BATCHES = 5;
+constexpr int MAX_DRAIN = 128;
+
+struct StreamPrivate {
+    std::shared_ptr<ArrowIteratorState> state;
+    dftracer::utils::python::SchemaReconciler reconciler;
+    // Drained during discovery, emitted first from get_next.
+    std::deque<ArrowExportResult> pending;
+    std::string last_error;
+    bool initialized = false;
+    // Sticky: once set, all entry points short-circuit to EIO.
+    bool error_set = false;
+};
+
+static void mark_error(StreamPrivate *p, std::string msg) {  // latch failure
+    if (p->last_error.empty()) p->last_error = std::move(msg);  // keep first
+    p->error_set = true;    // sticky flag tested by the stream entry points
+    p->initialized = true;  // initialize_stream() returns before discovery
+}
+
+static int initialize_stream(StreamPrivate *p) {
+    if (p->error_set) return EIO;  // an earlier call already failed
+    if (p->initialized) return 0;  // discovery already completed
+    auto *astate = p->state.get();
+    // Discovery: buffer batches in `pending` while their schemas are merged.
+    int stable_run = 0;  // consecutive batches for which merge() was false
+    int drained = 0;     // batches received so far in this loop
+    while (stable_run < STABLE_BATCHES && drained < MAX_DRAIN) {
+        auto batch = astate->channel->blocking_receive();
+        if (!batch.has_value()) {
+            // End-of-stream or producer error before discovery converged.
+            std::lock_guard<std::mutex> lock(astate->error_mtx);
+            if (astate->error) {
+                try {
+                    std::rethrow_exception(astate->error);
+                } catch (const std::exception &e) {
+                    mark_error(p, e.what());
+                } catch (...) {
+                    mark_error(p, "unknown error in Arrow stream");
+                }
+                return EIO;
+            }
+            break;  // clean early EOS; finalize with whatever we have
+        }
+        auto dequeued = dftracer::utils::python::byte_size(*batch);
+        astate->bytes_in_queue.fetch_sub(dequeued, std::memory_order_acq_rel);
+
+        bool added = p->reconciler.merge(batch->get_schema());
+        if (!p->reconciler.last_error().empty()) {
+            mark_error(p, p->reconciler.last_error());
+            return EIO;
+        }
+        p->pending.push_back(std::move(*batch));  // emitted later by get_next
+        stable_run = added ? 0 : (stable_run + 1);
+        ++drained;
+    }
+
+    if (p->reconciler.finalize() != 0) {  // non-zero is treated as failure
+        mark_error(p, p->reconciler.last_error().empty()
+                          ? "failed to finalize schema union"
+                          : p->reconciler.last_error());
+        return EIO;
+    }
+    p->initialized = true;
+    return 0;
+}
+
+static int stream_get_schema(struct ArrowArrayStream *s,
+                             struct ArrowSchema *out) {
+    auto *p = static_cast<StreamPrivate *>(s->private_data);
+    int rc = initialize_stream(p);  // no-op once discovery has completed
+    if (rc != 0) return rc;
+    if (p->error_set) return EIO;  // defensive re-check of the sticky flag
+    if (p->reconciler.copy_schema(out) != 0) {  // non-zero means failure
+        mark_error(p, p->reconciler.last_error().empty()
+                          ? "failed to copy locked schema"
+                          : p->reconciler.last_error());
+        return EIO;
+    }
+    return 0;
+}
+
+static int stream_get_next(struct ArrowArrayStream *s, struct ArrowArray *out) {
+    auto *p = static_cast<StreamPrivate *>(s->private_data);
+    if (p->error_set) return EIO;  // sticky failure from an earlier call
+    if (!p->initialized) {
+        int rc = initialize_stream(p);
+        if (rc != 0) return rc;
+    }
+
+    // Drain any discovery-phase batches first, then pull from the channel.
+    std::optional<ArrowExportResult> batch;
+    if (!p->pending.empty()) {
+        batch = std::move(p->pending.front());
+        p->pending.pop_front();
+    } else {
+        auto *astate = p->state.get();
+        batch = astate->channel->blocking_receive();
+        if (!batch.has_value()) {
+            std::lock_guard<std::mutex> lock(astate->error_mtx);
+            if (astate->error) {
+                try {
+                    std::rethrow_exception(astate->error);
+                } catch (const std::exception &e) {
+                    mark_error(p, e.what());
+                } catch (...) {
+                    mark_error(p, "unknown error in Arrow stream");
+                }
+                return EIO;
+            }
+            // End of stream per Arrow C spec: return success with
+            // out->release == nullptr.
+            out->release = nullptr;
+            return 0;
+        }
+        auto dequeued = dftracer::utils::python::byte_size(*batch);
+        astate->bytes_in_queue.fetch_sub(dequeued, std::memory_order_acq_rel);
+    }
+    // reconcile() is expected to populate `out`; non-zero means failure.
+    if (p->reconciler.reconcile(batch->get_schema(), batch->get_array(), out) !=
+        0) {
+        mark_error(p, p->reconciler.last_error().empty()
+                          ? "schema reconciliation failed"
+                          : p->reconciler.last_error());
+        return EIO;
+    }
+    return 0;
+}
+
+static const char *stream_get_last_error(struct ArrowArrayStream *s) {
+    auto *p = static_cast<StreamPrivate *>(s->private_data);
+    if (!p || p->last_error.empty()) return nullptr;  // nothing recorded
+    return p->last_error.c_str();  // storage is owned by p->last_error
+}
+
+static void stream_release(struct ArrowArrayStream *s) {
+    auto *p = static_cast<StreamPrivate *>(s->private_data);
+    if (p) {
+        if (p->state) {
+            p->state->cancelled.store(true, std::memory_order_release);
+            if (p->state->channel) p->state->channel->close();
+            if (p->state->task_future.valid()) {
+                // Release the GIL if this callback was invoked from a
+                // GIL-holding context (e.g. capsule destructor during
+                // GC). If the GIL is not held (pyarrow's C reader path),
+                // PyGILState_Check() is false and we wait without
+                // touching the Python thread state.
+                if (Py_IsInitialized() && PyGILState_Check()) {
+                    Py_BEGIN_ALLOW_THREADS p->state->task_future.wait();
+                    Py_END_ALLOW_THREADS
+                } else {
+                    p->state->task_future.wait();
+                }
+            }
+        }
+        delete p;  // allocated with new in ArrowBatchStream_arrow_c_stream
+    }
+    s->private_data = nullptr;
+    s->release = nullptr;  // marks the stream as released
+}
+
+static void release_stream_capsule(PyObject *capsule) {
+    auto *stream = static_cast<ArrowArrayStream *>(
+        PyCapsule_GetPointer(capsule, "arrow_array_stream"));
+    if (stream && stream->release) {  // release is nullptr once released
+        stream->release(stream);
+    }
+    delete stream;  // allocated with new in ArrowBatchStream_arrow_c_stream
+}
+
+static PyObject *ArrowBatchStream_arrow_c_stream(ArrowBatchStreamObject *self,
+                                                 PyObject *args) {
+    PyObject *requested_schema = Py_None;
+    if (!PyArg_ParseTuple(args, "|O", &requested_schema)) return NULL;
+
+    // Per the PyCapsule protocol, a non-None `requested_schema` means the
+    // caller wants the stream cast to that schema. We only emit our native
+    // schema today; reject explicitly so misuse fails loudly instead of
+    // silently returning arrays that don't match what the caller asked for.
+    if (requested_schema != Py_None) {
+        PyErr_SetString(PyExc_NotImplementedError,
+                        "iter_arrow_stream does not support "
+                        "requested_schema casting; pass None to use the "
+                        "native schema.");
+        return NULL;
+    }
+
+    if (self->consumed || !self->state) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Arrow stream already exported via "
+                        "__arrow_c_stream__; each stream can be "
+                        "exported only once.");
+        return NULL;
+    }
+    // Hand the shared iterator state over to the exported stream.
+    auto *priv = new StreamPrivate;
+    priv->state = self->state;
+    self->consumed = true;  // later calls hit the RuntimeError branch above
+    self->state.reset();    // dealloc of `self` will no longer wait on it
+    // Heap-allocated stream struct; release_stream_capsule deletes it.
+    auto *stream = new ArrowArrayStream;
+    std::memset(stream, 0, sizeof(*stream));
+    stream->get_schema = stream_get_schema;
+    stream->get_next = stream_get_next;
+    stream->get_last_error = stream_get_last_error;
+    stream->release = stream_release;
+    stream->private_data = priv;
+
+    PyObject *capsule =
+        PyCapsule_New(stream, "arrow_array_stream", release_stream_capsule);
+    if (!capsule) {
+        stream->release(stream);  // stream_release deletes priv
+        delete stream;
+        return NULL;
+    }
+    return capsule;
+}
+
+static void ArrowBatchStream_dealloc(ArrowBatchStreamObject *self) {
+    if (self->state) {  // still held here; __arrow_c_stream__ resets it
+        self->state->cancelled.store(true, std::memory_order_release);
+        if (self->state->channel) self->state->channel->close();
+        Py_BEGIN_ALLOW_THREADS if (self->state->task_future.valid()) {
+            self->state->task_future.wait();  // waits with the GIL released
+        }
+        Py_END_ALLOW_THREADS
+    }
+    self->state.~shared_ptr<ArrowIteratorState>();  // built by placement new
+    Py_TYPE(self)->tp_free((PyObject *)self);
+}
+
+static PyMethodDef ArrowBatchStream_methods[] = {
+    {"__arrow_c_stream__", (PyCFunction)ArrowBatchStream_arrow_c_stream,
+     METH_VARARGS, "Export as Arrow C Data Interface stream PyCapsule"},
+    {NULL}};
+
+}  // namespace
+
+PyTypeObject ArrowBatchStreamType = {
+    PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext._ArrowBatchStream",
+    sizeof(ArrowBatchStreamObject), /* tp_basicsize */
+    0,                              /* tp_itemsize */
+    (destructor)ArrowBatchStream_dealloc, /* tp_dealloc */
+    0, /* tp_vectorcall_offset (tp_print before Python 3.8) */
+    0, /* tp_getattr */
+    0, /* tp_setattr */
+    0, /* tp_as_async */
+    0, /* tp_repr */
+    0, /* tp_as_number */
+    0, /* tp_as_sequence */
+    0, /* tp_as_mapping */
+    0, /* tp_hash */
+    0, /* tp_call */
+    0, /* tp_str */
+    0, /* tp_getattro */
+    0, /* tp_setattro */
+    0, /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT, /* tp_flags */
+    "Zero-iteration Arrow stream backed by a C++ coroutine channel",
+    0, /* tp_traverse */
+    0, /* tp_clear */
+    0, /* tp_richcompare */
+    0, /* tp_weaklistoffset */
+    0, /* tp_iter */
+    0, /* tp_iternext */
+    ArrowBatchStream_methods, /* tp_methods */
+    0, /* tp_members */
+    0, /* tp_getset */
+    0, /* tp_base */
+    0, /* tp_dict */
+    0, /* tp_descr_get */
+    0, /* tp_descr_set */
+    0, /* tp_dictoffset */
+    0, /* tp_init */
+    0, /* tp_alloc */
+    0, /* tp_new: left unset; make_arrow_batch_stream() uses tp_alloc */
+};
+
+int init_arrow_batch_stream(PyObject *m) {
+    if (PyType_Ready(&ArrowBatchStreamType) < 0) return -1;
+    Py_INCREF(&ArrowBatchStreamType);  // reference handed to the module
+    if (PyModule_AddObject(m, "_ArrowBatchStream",
+                           (PyObject *)&ArrowBatchStreamType) < 0) {
+        Py_DECREF(&ArrowBatchStreamType);  // not consumed on failure
+        return -1;
+    }
+    return 0;
+}
+
+PyObject *make_arrow_batch_stream(std::shared_ptr<ArrowIteratorState> state) {
+    auto *obj = (ArrowBatchStreamObject *)ArrowBatchStreamType.tp_alloc(
+        &ArrowBatchStreamType, 0);
+    if (!obj) return NULL;  // tp_alloc failed
+    new (&obj->state) std::shared_ptr<ArrowIteratorState>(std::move(state));
+    obj->consumed = false;  // not yet exported via __arrow_c_stream__
+    return (PyObject *)obj;
+}
+
+#endif
diff --git a/src/dftracer/utils/python/arrow_stream_capsule.h b/src/dftracer/utils/python/arrow_stream_capsule.h
new file mode 100644
index 00000000..4f0d7d03
--- /dev/null
+++ b/src/dftracer/utils/python/arrow_stream_capsule.h
@@ -0,0 +1,25 @@
+#ifndef DFTRACER_UTILS_PYTHON_ARROW_STREAM_CAPSULE_H
+#define DFTRACER_UTILS_PYTHON_ARROW_STREAM_CAPSULE_H
+
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dftracer/utils/python/trace_reader_iterator.h>
+
+#include <memory>
+
+typedef struct {
+    PyObject_HEAD std::shared_ptr<ArrowIteratorState> state;
+    bool consumed;
+} ArrowBatchStreamObject;
+
+extern PyTypeObject ArrowBatchStreamType;
+
+int init_arrow_batch_stream(PyObject *m);
+
+PyObject *make_arrow_batch_stream(std::shared_ptr<ArrowIteratorState> state);
+
+#endif
+#endif
diff --git a/src/dftracer/utils/python/batch_byte_size.h b/src/dftracer/utils/python/batch_byte_size.h
new file mode 100644
index 00000000..3921cf41
--- /dev/null
+++ b/src/dftracer/utils/python/batch_byte_size.h
@@ -0,0 +1,55 @@
+#ifndef DFTRACER_UTILS_PYTHON_BATCH_BYTE_SIZE_H
+#define DFTRACER_UTILS_PYTHON_BATCH_BYTE_SIZE_H
+
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/python/memoryview_batch.h>
+#include <dftracer/utils/python/trace_reader_iterator.h>
+
+#include <cstddef>
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+#endif
+
+namespace dftracer::utils::python {
+
+inline std::size_t byte_size(const MemoryViewBatchData &b) {  // by capacity
+    return b.buffer.capacity() + b.offsets.capacity() * sizeof(Py_ssize_t) +
+           b.lengths.capacity() * sizeof(Py_ssize_t);
+}
+
+inline std::size_t byte_size(const JsonDictBatch &b) {  // rough estimate
+    static constexpr std::size_t ESTIMATED_EVENT_BYTES = 512;  // per event
+    return b.events.capacity() * ESTIMATED_EVENT_BYTES;
+}
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+inline std::size_t arrow_array_byte_size(const ArrowArray *arr) {
+    if (!arr || !arr->release) return 0;  // null or already-released array
+    std::size_t total = 0;
+    for (int64_t i = 0; i < arr->n_buffers; ++i) {
+        if (arr->buffers[i]) {
+            // The C Data Interface does not carry buffer sizes, so this is
+            // only an estimate: every non-null buffer (validity, offsets
+            // or data alike) is charged a flat 8 bytes per element of
+            // arr->length.
+            total += static_cast<std::size_t>(arr->length) * 8;
+        }
+    }
+    for (int64_t i = 0; i < arr->n_children; ++i) {
+        total += arrow_array_byte_size(arr->children[i]);  // recurse
+    }
+    return total;
+}
+
+inline std::size_t byte_size(
+    const dftracer::utils::utilities::common::arrow::ArrowExportResult &b) {
+    return arrow_array_byte_size(b.get_array());
+}
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_PYTHON_BATCH_BYTE_SIZE_H
diff --git a/src/dftracer/utils/python/batch_indexer.cpp b/src/dftracer/utils/python/batch_indexer.cpp
new file mode 100644
index 00000000..ef7bb8be
--- /dev/null
+++ b/src/dftracer/utils/python/batch_indexer.cpp
@@ -0,0 +1,2554 @@
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/string_intern.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/coro/when_all.h>
+#include <dftracer/utils/core/rocksdb/db_manager.h>
+#include <dftracer/utils/core/runtime.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/python/batch_indexer.h>
+#include <dftracer/utils/python/indexer.h>
+#include <dftracer/utils/python/runtime.h>
+#include <dftracer/utils/utilities/common/query/query.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregator_types.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/utilities/common/arrow/column_builder.h>
+#endif
+
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <limits>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+using dftracer::utils::CoroScope;
+using dftracer::utils::Runtime;
+using dftracer::utils::coro::CoroTask;
+using namespace dftracer::utils::utilities::composites::dft::indexing;
+using namespace dftracer::utils::utilities::composites::dft::aggregators;
+
+// ---------------------------------------------------------------------------
+// BatchIndexer - directory-level indexer with resolve/build pattern
+// ---------------------------------------------------------------------------
+
+static void Indexer_dealloc(IndexerObject* self) {
+    Py_XDECREF(self->runtime_obj);  // XDECREF: members may still be nullptr
+    Py_XDECREF(self->directory);
+    Py_XDECREF(self->files);
+    Py_XDECREF(self->index_dir);
+    Py_XDECREF(self->group_keys);
+    Py_XDECREF(self->custom_metric_fields);
+    Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+static PyObject* Indexer_new(PyTypeObject* type, PyObject* args,
+                             PyObject* kwds) {
+    IndexerObject* self = (IndexerObject*)type->tp_alloc(type, 0);
+    if (self) {  // set defaults; args/kwds are not read in this function
+        self->runtime_obj = nullptr;
+        self->directory = nullptr;
+        self->files = nullptr;
+        self->index_dir = nullptr;
+        self->require_checkpoint = 1;
+        self->require_bloom = 1;
+        self->require_manifest = 1;
+        self->require_aggregation = 0;
+        self->time_interval_ms = 5000.0;
+        self->group_keys = nullptr;
+        self->custom_metric_fields = nullptr;
+        self->compute_percentiles = 0;
+        self->checkpoint_size = 32 * 1024 * 1024;  // same default as __init__
+        self->parallelism = 0;
+        self->force_rebuild = 0;
+    }
+    return (PyObject*)self;  // nullptr when tp_alloc failed
+}
+
+static int Indexer_init(IndexerObject* self, PyObject* args, PyObject* kwds) {
+    static const char* kwlist[] = {"directory",
+                                   "files",
+                                   "index_dir",
+                                   "require_checkpoint",
+                                   "require_bloom",
+                                   "require_manifest",
+                                   "require_aggregation",
+                                   "time_interval_ms",
+                                   "group_keys",
+                                   "custom_metric_fields",
+                                   "compute_percentiles",
+                                   "checkpoint_size",
+                                   "parallelism",
+                                   "force_rebuild",
+                                   "runtime",
+                                   nullptr};
+
+    const char* directory = "";
+    PyObject* files_obj = Py_None;
+    const char* index_dir = "";
+    int require_checkpoint = 1;
+    int require_bloom = 1;
+    int require_manifest = 1;
+    int require_aggregation = 0;
+    double time_interval_ms = 5000.0;
+    PyObject* group_keys_obj = Py_None;
+    PyObject* custom_metrics_obj = Py_None;
+    int compute_percentiles = 0;
+    Py_ssize_t checkpoint_size = 32 * 1024 * 1024;  // 32MB default
+    Py_ssize_t parallelism = 0;
+    int force_rebuild = 0;
+    PyObject* runtime_arg = nullptr;
+
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwds, "|sOsppppdOOpnnpO", (char**)kwlist, &directory,
+            &files_obj, &index_dir, &require_checkpoint, &require_bloom,
+            &require_manifest, &require_aggregation, &time_interval_ms,
+            &group_keys_obj, &custom_metrics_obj, &compute_percentiles,
+            &checkpoint_size, &parallelism, &force_rebuild, &runtime_arg)) {
+        return -1;
+    }
+
+    // Validate: at least one of directory or files must be provided
+    bool has_directory = directory && directory[0] != '\0';
+    bool has_files = files_obj && files_obj != Py_None &&
+                     PyList_Check(files_obj) && PyList_Size(files_obj) > 0;
+    if (!has_directory && !has_files) {
+        PyErr_SetString(PyExc_ValueError,
+                        "At least one of 'directory' or 'files' must be "
+                        "provided");
+        return -1;
+    }
+    // Negative sizes would wrap around in the size_t casts below.
+    if (checkpoint_size < 0 || parallelism < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "checkpoint_size and parallelism must be >= 0");
+        return -1;
+    }
+
+    // Store runtime (Py_XSETREF releases a reference kept by a prior __init__)
+    if (runtime_arg && runtime_arg != Py_None) {
+        if (PyObject_TypeCheck(runtime_arg, &RuntimeType)) {
+            Py_INCREF(runtime_arg);
+            Py_XSETREF(self->runtime_obj, runtime_arg);
+        } else {
+            PyObject* native = PyObject_GetAttrString(runtime_arg, "_native");
+            if (native && PyObject_TypeCheck(native, &RuntimeType)) {
+                Py_XSETREF(self->runtime_obj, native);
+            } else {
+                Py_XDECREF(native);
+                PyErr_SetString(PyExc_TypeError,
+                                "runtime must be a Runtime instance or None");
+                return -1;
+            }
+        }
+    }
+
+    // Create both strings before storing so a failure leaves them untouched.
+    PyObject* directory_str = PyUnicode_FromString(directory);
+    PyObject* index_dir_str = PyUnicode_FromString(index_dir);
+    if (!directory_str || !index_dir_str) {
+        Py_XDECREF(directory_str);
+        Py_XDECREF(index_dir_str);
+        return -1;
+    }
+    Py_XSETREF(self->directory, directory_str);
+    Py_XSETREF(self->index_dir, index_dir_str);
+    self->require_checkpoint = require_checkpoint;
+    self->require_bloom = require_bloom;
+    self->require_manifest = require_manifest;
+    self->require_aggregation = require_aggregation;
+    self->time_interval_ms = time_interval_ms;
+    self->compute_percentiles = compute_percentiles;
+    self->checkpoint_size = static_cast<std::size_t>(checkpoint_size);
+    self->parallelism = static_cast<std::size_t>(parallelism);
+    self->force_rebuild = force_rebuild;
+
+    // Optional members hold a strong reference, or nullptr when absent/None.
+    if (!has_files) files_obj = nullptr;
+    if (group_keys_obj == Py_None) group_keys_obj = nullptr;
+    if (custom_metrics_obj == Py_None) custom_metrics_obj = nullptr;
+    Py_XINCREF(files_obj);
+    Py_XINCREF(group_keys_obj);
+    Py_XINCREF(custom_metrics_obj);
+    Py_XSETREF(self->files, files_obj);
+    Py_XSETREF(self->group_keys, group_keys_obj);
+    Py_XSETREF(self->custom_metric_fields, custom_metrics_obj);
+
+    return 0;
+}
+
+static Runtime* get_batch_indexer_runtime(IndexerObject* self) {
+    if (self->runtime_obj) {  // a Runtime was stored by Indexer_init
+        return ((RuntimeObject*)self->runtime_obj)->runtime.get();
+    }
+    return get_default_runtime();  // fallback when none was stored
+}
+
+static std::optional<AggregationConfig> build_aggregation_config(
+    IndexerObject* self) {
+    if (!self->require_aggregation) {
+        return std::nullopt;
+    }
+
+    AggregationConfig config;
+    config.time_interval_us =
+        static_cast<std::uint64_t>(self->time_interval_ms * 1000.0);
+
+    // Appends the UTF-8 text of each str item in `list` to `out`. Items that
+    // are not str are skipped, and the exception PyUnicode_AsUTF8 raised for
+    // them is cleared so that no stale exception stays pending afterwards.
+    const auto append_strings = [](PyObject* list, auto& out) {
+        if (!list || !PyList_Check(list)) return;
+        const Py_ssize_t n = PyList_Size(list);
+        for (Py_ssize_t i = 0; i < n; i++) {
+            const char* s = PyUnicode_AsUTF8(PyList_GetItem(list, i));
+            if (s) {
+                out.emplace_back(s);
+            } else {
+                PyErr_Clear();
+            }
+        }
+    };
+    append_strings(self->group_keys, config.extra_group_keys);
+    append_strings(self->custom_metric_fields, config.custom_metric_fields);
+
+    config.compute_percentiles = self->compute_percentiles != 0;
+    return config;
+}
+
+// ---------------------------------------------------------------------------
+// resolve() - check what exists vs needs building
+// ---------------------------------------------------------------------------
+
+// Python method: report which trace files already have the requested index
+// tiers and which still need work. Nothing is built.
+//
+// Returns a new dict with the keys
+//   "total_files" - int, size of the resolver's `all_files` list
+//   "index_path"  - str, index path reported by the resolver
+//   "ready"       - list[str], file paths from the resolver's `cached` list
+//   "needs_work"  - list[str], de-duplicated union (first-seen order) of the
+//                   needs_checkpoint / needs_bloom / needs_manifest /
+//                   needs_aggregation lists
+// Raises RuntimeError when the resolver throws. On any failure the function
+// returns nullptr with a Python exception set.
+static PyObject* Indexer_resolve(IndexerObject* self,
+                                 PyObject* Py_UNUSED(ignored)) {
+    const char* directory = PyUnicode_AsUTF8(self->directory);
+    const char* index_dir = PyUnicode_AsUTF8(self->index_dir);
+    // A failed conversion falls back to "" below; drop its exception so the
+    // result is not returned with an error still pending.
+    if (!directory || !index_dir) PyErr_Clear();
+
+    ResolverInput input;
+    input.directory = directory ? directory : "";
+    input.index_dir = index_dir ? index_dir : "";
+    input.require_checkpoints = self->require_checkpoint;
+    input.require_bloom = self->require_bloom;
+    input.require_manifest = self->require_manifest;
+    input.require_aggregation = self->require_aggregation;
+    input.aggregation_config = build_aggregation_config(self);
+
+    // Add files if provided; entries that are not convertible are skipped.
+    if (self->files && PyList_Check(self->files)) {
+        const Py_ssize_t n = PyList_Size(self->files);
+        for (Py_ssize_t i = 0; i < n; i++) {
+            const char* s = PyUnicode_AsUTF8(PyList_GetItem(self->files, i));
+            if (s) {
+                input.files.emplace_back(s);
+            } else {
+                PyErr_Clear();
+            }
+        }
+    }
+
+    ResolverResult result;
+    std::string error_msg;
+    // Tracked separately from error_msg: an exception whose what() is empty
+    // must still be reported as a failure.
+    bool failed = false;
+
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        Runtime* rt = get_batch_indexer_runtime(self);
+        rt->submit(run_coro_scope(
+                       rt->executor(),
+                       [](CoroScope& scope, ResolverInput in,
+                          ResolverResult* out) -> CoroTask<void> {
+                           IndexResolverUtility resolver;
+                           // Use scope.spawn(utility, input) which auto-binds
+                           // context for utilities with NeedsContext tag
+                           *out = co_await scope.spawn(resolver, std::move(in));
+                       },
+                       std::move(input), &result),
+                   "batch-indexer-resolve")
+            .get();
+    } catch (const std::exception& e) {
+        failed = true;
+        error_msg = e.what();
+    } catch (...) {
+        // Nothing may propagate past Py_END_ALLOW_THREADS: the GIL would stay
+        // released and the exception would cross the C API boundary.
+        failed = true;
+        error_msg = "Unknown error while resolving index";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    // Needs work files (union of all needs_* lists). `seen` holds views into
+    // `result`, which outlives it.
+    std::vector<std::string> needs_work;
+    std::unordered_set<std::string_view> seen;
+    const auto add_unique = [&](const auto& items) {
+        for (const auto& item : items) {
+            if (seen.insert(std::string_view(item.file_path)).second) {
+                needs_work.push_back(item.file_path);
+            }
+        }
+    };
+    add_unique(result.needs_checkpoint);
+    add_unique(result.needs_bloom);
+    add_unique(result.needs_manifest);
+    add_unique(result.needs_aggregation);
+
+    // Builds a list of `count` str objects; `path_at(i)` yields the i-th path.
+    // Returns nullptr with a Python error set on failure.
+    const auto to_str_list = [](std::size_t count,
+                                const auto& path_at) -> PyObject* {
+        PyObject* list = PyList_New(static_cast<Py_ssize_t>(count));
+        if (!list) return nullptr;
+        for (std::size_t i = 0; i < count; ++i) {
+            PyObject* item = PyUnicode_FromString(path_at(i).c_str());
+            if (!item) {
+                Py_DECREF(list);
+                return nullptr;
+            }
+            // PyList_SetItem steals the reference to `item`.
+            PyList_SetItem(list, static_cast<Py_ssize_t>(i), item);
+        }
+        return list;
+    };
+
+    // PyDict_SetItemString does NOT steal references, so the temporary is
+    // released here whether or not the insertion succeeds.
+    const auto set_item = [](PyObject* d, const char* key, PyObject* value) {
+        if (!value) return false;
+        const int rc = PyDict_SetItemString(d, key, value);
+        Py_DECREF(value);
+        return rc == 0;
+    };
+
+    // Build result dict
+    PyObject* dict = PyDict_New();
+    if (!dict) return nullptr;
+
+    const bool ok =
+        set_item(dict, "total_files",
+                 PyLong_FromSize_t(result.all_files.size())) &&
+        set_item(dict, "index_path",
+                 PyUnicode_FromString(result.index_path.c_str())) &&
+        set_item(dict, "ready",
+                 to_str_list(result.cached.size(),
+                             [&](std::size_t i) -> const auto& {
+                                 return result.cached[i].file_path;
+                             })) &&
+        set_item(dict, "needs_work",
+                 to_str_list(needs_work.size(),
+                             [&](std::size_t i) -> const auto& {
+                                 return needs_work[i];
+                             }));
+    if (!ok) {
+        Py_DECREF(dict);
+        return nullptr;
+    }
+
+    return dict;
+}
+
+// ---------------------------------------------------------------------------
+// build() - build missing index tiers
+// ---------------------------------------------------------------------------
+
+// Python method: resolve the configured inputs and build whatever index
+// tiers are missing (or all of them when `force_rebuild` is set).
+//
+// Returns None on success. Raises RuntimeError when the build throws.
+static PyObject* Indexer_build(IndexerObject* self,
+                               PyObject* Py_UNUSED(ignored)) {
+    const char* directory = PyUnicode_AsUTF8(self->directory);
+    const char* index_dir = PyUnicode_AsUTF8(self->index_dir);
+    // A failed conversion falls back to "" below; drop its exception so None
+    // is not returned with an error still pending.
+    if (!directory || !index_dir) PyErr_Clear();
+
+    ResolveAndBuildInput input;
+    input.directory = directory ? directory : "";
+    input.index_dir = index_dir ? index_dir : "";
+    input.require_checkpoints = self->require_checkpoint;
+    input.require_bloom = self->require_bloom;
+    input.require_manifest = self->require_manifest;
+    input.require_aggregation = self->require_aggregation;
+    input.aggregation_config = build_aggregation_config(self);
+    input.checkpoint_size = self->checkpoint_size;
+    input.parallelism = self->parallelism;
+    input.force_rebuild = self->force_rebuild;
+
+    // Add files if provided; entries that are not convertible are skipped.
+    if (self->files && PyList_Check(self->files)) {
+        const Py_ssize_t n = PyList_Size(self->files);
+        for (Py_ssize_t i = 0; i < n; i++) {
+            const char* s = PyUnicode_AsUTF8(PyList_GetItem(self->files, i));
+            if (s) {
+                input.files.emplace_back(s);
+            } else {
+                PyErr_Clear();
+            }
+        }
+    }
+
+    std::string error_msg;
+    // Tracked separately from error_msg: an exception whose what() is empty
+    // must still be reported as a failure.
+    bool failed = false;
+
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        Runtime* rt = get_batch_indexer_runtime(self);
+        rt->submit(run_coro_scope(
+                       rt->executor(),
+                       [](CoroScope& scope,
+                          ResolveAndBuildInput in) -> CoroTask<void> {
+                           co_await resolve_and_build_index(&scope,
+                                                            std::move(in));
+                       },
+                       std::move(input)),
+                   "batch-indexer-build")
+            .get();
+    } catch (const std::exception& e) {
+        failed = true;
+        error_msg = e.what();
+    } catch (...) {
+        // Nothing may propagate past Py_END_ALLOW_THREADS: the GIL would stay
+        // released and the exception would cross the C API boundary.
+        failed = true;
+        error_msg = "Unknown error while building index";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    Py_RETURN_NONE;
+}
+
+// ---------------------------------------------------------------------------
+// ensure_indexed() - resolve + build if needed
+// ---------------------------------------------------------------------------
+
+// Python method: resolve, build when the resolve report lists pending files,
+// then resolve again. Returns the most recent resolve() dict, or nullptr with
+// a Python exception set if any step fails.
+static PyObject* Indexer_ensure_indexed(IndexerObject* self,
+                                        PyObject* Py_UNUSED(ignored)) {
+    PyObject* report = Indexer_resolve(self, nullptr);
+    if (report == nullptr) return nullptr;
+
+    // Borrowed reference; only valid while `report` is alive.
+    PyObject* pending = PyDict_GetItemString(report, "needs_work");
+    const bool work_pending = pending != nullptr && PyList_Size(pending) > 0;
+    if (!work_pending) {
+        return report;
+    }
+
+    // The first report is stale once a build runs.
+    Py_DECREF(report);
+
+    PyObject* build_result = Indexer_build(self, nullptr);
+    if (build_result == nullptr) return nullptr;
+    Py_DECREF(build_result);
+
+    return Indexer_resolve(self, nullptr);
+}
+
+// ---------------------------------------------------------------------------
+// get_checkpoint_indexer() - get a single-file checkpoint indexer
+// ---------------------------------------------------------------------------
+
+// Python method: create a CheckpointIndexer object for one trace file.
+//
+// Args (Python): file_path (str).
+// The index location is derived from `file_path` and this indexer's
+// `index_dir`; the new object shares this indexer's runtime and uses its
+// checkpoint size, with bloom and manifest building disabled.
+// Returns a new reference, or nullptr with a Python exception set.
+static PyObject* Indexer_get_checkpoint_indexer(IndexerObject* self,
+                                                PyObject* args) {
+    const char* file_path = nullptr;
+    if (!PyArg_ParseTuple(args, "s", &file_path)) {
+        return nullptr;
+    }
+
+    // Determine index path using BatchIndexer's index_dir setting
+    const char* index_dir = PyUnicode_AsUTF8(self->index_dir);
+    // A failed conversion falls back to ""; drop its exception so the new
+    // object is not returned with an error still pending.
+    if (!index_dir) PyErr_Clear();
+    std::string index_path = dftracer::utils::utilities::composites::dft::
+        internal::determine_index_path(file_path, index_dir ? index_dir : "");
+
+    // Create the string attributes first so that a failure can be reported
+    // before any partially initialised object exists.
+    PyObject* gz_path_obj = PyUnicode_FromString(file_path);
+    if (!gz_path_obj) {
+        return nullptr;
+    }
+    PyObject* index_path_obj = PyUnicode_FromString(index_path.c_str());
+    if (!index_path_obj) {
+        Py_DECREF(gz_path_obj);
+        return nullptr;
+    }
+
+    // Create IndexerObject
+    CheckpointIndexerObject* indexer =
+        (CheckpointIndexerObject*)CheckpointIndexerType.tp_alloc(
+            &CheckpointIndexerType, 0);
+    if (!indexer) {
+        Py_DECREF(gz_path_obj);
+        Py_DECREF(index_path_obj);
+        return nullptr;
+    }
+
+    indexer->handle = nullptr;
+    indexer->gz_path = gz_path_obj;        // ownership moves to the object
+    indexer->index_path = index_path_obj;  // ownership moves to the object
+    indexer->checkpoint_size = self->checkpoint_size;
+    indexer->build_bloom = 0;
+    indexer->build_manifest = 0;
+
+    // Share runtime reference
+    if (self->runtime_obj) {
+        Py_INCREF(self->runtime_obj);
+        indexer->runtime_obj = self->runtime_obj;
+    } else {
+        indexer->runtime_obj = nullptr;
+    }
+
+    // Create the native handle
+    indexer->handle = dft_indexer_create(file_path, index_path.c_str(),
+                                         self->checkpoint_size, 0);
+    if (!indexer->handle) {
+        Py_DECREF((PyObject*)indexer);
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Failed to create checkpoint indexer");
+        return nullptr;
+    }
+
+    return (PyObject*)indexer;
+}
+
+// Run resolve() and extract its "index_path" entry as a std::string.
+//
+// Returns std::nullopt with a Python exception set when resolve() fails or
+// when the entry is missing, not convertible, or empty (RuntimeError
+// "No index path available" in the latter cases).
+static std::optional<std::string> resolve_index_path(IndexerObject* self) {
+    PyObject* report = Indexer_resolve(self, nullptr);
+    if (report == nullptr) return std::nullopt;
+
+    std::optional<std::string> found;
+    // Borrowed reference; the text must be copied before `report` is released.
+    PyObject* entry = PyDict_GetItemString(report, "index_path");
+    if (entry != nullptr) {
+        const char* text = PyUnicode_AsUTF8(entry);
+        if (text != nullptr && text[0] != '\0') {
+            found.emplace(text);
+        }
+    }
+    Py_DECREF(report);
+
+    if (!found) {
+        PyErr_SetString(PyExc_RuntimeError, "No index path available");
+    }
+    return found;
+}
+
+// Python method: return one of the index's hash tables as a dict[str, str]
+// mapping hash -> name.
+//
+// Args (Python): type (str) - one of "file", "host", "string", "proc".
+// Raises ValueError for any other type and RuntimeError when the index
+// database cannot be queried. Returns nullptr with a Python exception set on
+// failure, including when a hash or name cannot be turned into a str.
+static PyObject* Indexer_get_hash_table(IndexerObject* self, PyObject* args) {
+    const char* type_str = nullptr;
+    if (!PyArg_ParseTuple(args, "s", &type_str)) {
+        return nullptr;
+    }
+
+    using dftracer::utils::utilities::indexer::IndexDatabase;
+    using HashType = IndexDatabase::HashType;
+
+    HashType type;
+    if (std::strcmp(type_str, "file") == 0) {
+        type = HashType::FILE;
+    } else if (std::strcmp(type_str, "host") == 0) {
+        type = HashType::HOST;
+    } else if (std::strcmp(type_str, "string") == 0) {
+        type = HashType::STRING;
+    } else if (std::strcmp(type_str, "proc") == 0) {
+        type = HashType::PROC;
+    } else {
+        PyErr_SetString(PyExc_ValueError,
+                        "type must be 'file', 'host', 'string', or 'proc'");
+        return nullptr;
+    }
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::unordered_map<std::string, std::string> hash_map;
+    std::string error_msg;
+    // Tracked separately from error_msg: an exception whose what() is empty
+    // must still be reported as a failure.
+    bool failed = false;
+
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        hash_map = db.query_hash_table(type);
+    } catch (const std::exception& e) {
+        failed = true;
+        error_msg = e.what();
+    } catch (...) {
+        // Nothing may propagate past Py_END_ALLOW_THREADS: the GIL would stay
+        // released and the exception would cross the C API boundary.
+        failed = true;
+        error_msg = "Unknown error while querying hash table";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    PyObject* dict = PyDict_New();
+    if (!dict) return nullptr;
+
+    for (const auto& [hash, name] : hash_map) {
+        // Either conversion returns NULL on failure (e.g. bytes that are not
+        // valid UTF-8); NULL must never reach PyDict_SetItem or Py_DECREF.
+        PyObject* key = PyUnicode_FromStringAndSize(
+            hash.data(), static_cast<Py_ssize_t>(hash.size()));
+        PyObject* val = key ? PyUnicode_FromStringAndSize(
+                                  name.data(),
+                                  static_cast<Py_ssize_t>(name.size()))
+                            : nullptr;
+        const bool ok = key && val && PyDict_SetItem(dict, key, val) == 0;
+        Py_XDECREF(key);
+        Py_XDECREF(val);
+        if (!ok) {
+            Py_DECREF(dict);
+            return nullptr;
+        }
+    }
+
+    return dict;
+}
+
+// Python method: return the set of process ids recorded in the index for one
+// file id.
+//
+// Args (Python): file_id (int).
+// Returns a new set[int]. Raises RuntimeError when the index database cannot
+// be queried; returns nullptr with a Python exception set on any failure.
+static PyObject* Indexer_query_file_pids(IndexerObject* self, PyObject* args) {
+    int file_id;
+    if (!PyArg_ParseTuple(args, "i", &file_id)) {
+        return nullptr;
+    }
+
+    using dftracer::utils::utilities::indexer::IndexDatabase;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::unordered_set<std::uint64_t> pids;
+    std::string error_msg;
+    // Tracked separately from error_msg: an exception whose what() is empty
+    // must still be reported as a failure.
+    bool failed = false;
+
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        pids = db.query_file_pids(file_id);
+    } catch (const std::exception& e) {
+        failed = true;
+        error_msg = e.what();
+    } catch (...) {
+        // Nothing may propagate past Py_END_ALLOW_THREADS: the GIL would stay
+        // released and the exception would cross the C API boundary.
+        failed = true;
+        error_msg = "Unknown error while querying file pids";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    PyObject* set = PySet_New(nullptr);
+    if (!set) return nullptr;
+
+    for (const auto pid : pids) {
+        // PySet_Add does not steal the reference, so `val` is always released.
+        PyObject* val = PyLong_FromUnsignedLongLong(pid);
+        const bool ok = val && PySet_Add(set, val) == 0;
+        Py_XDECREF(val);
+        if (!ok) {
+            Py_DECREF(set);
+            return nullptr;
+        }
+    }
+
+    return set;
+}
+
+// Python method: return every file id in the index together with the process
+// ids recorded for it, as dict[int, set[int]].
+//
+// Raises RuntimeError when the index database cannot be queried; returns
+// nullptr with a Python exception set on any failure.
+static PyObject* Indexer_query_all_file_pids(IndexerObject* self,
+                                             PyObject* Py_UNUSED(ignored)) {
+    using dftracer::utils::utilities::indexer::IndexDatabase;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::unordered_map<int, std::unordered_set<std::uint64_t>> all_pids;
+    std::string error_msg;
+    // Tracked separately from error_msg: an exception whose what() is empty
+    // must still be reported as a failure.
+    bool failed = false;
+
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        all_pids = db.query_all_file_pids();
+    } catch (const std::exception& e) {
+        failed = true;
+        error_msg = e.what();
+    } catch (...) {
+        // Nothing may propagate past Py_END_ALLOW_THREADS: the GIL would stay
+        // released and the exception would cross the C API boundary.
+        failed = true;
+        error_msg = "Unknown error while querying file pids";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    // Builds a Python set from `pids`; nullptr with an error set on failure.
+    const auto make_pid_set =
+        [](const std::unordered_set<std::uint64_t>& pids) -> PyObject* {
+        PyObject* set = PySet_New(nullptr);
+        if (!set) return nullptr;
+        for (const auto pid : pids) {
+            PyObject* val = PyLong_FromUnsignedLongLong(pid);
+            const bool ok = val && PySet_Add(set, val) == 0;
+            Py_XDECREF(val);
+            if (!ok) {
+                Py_DECREF(set);
+                return nullptr;
+            }
+        }
+        return set;
+    };
+
+    PyObject* dict = PyDict_New();
+    if (!dict) return nullptr;
+
+    for (const auto& [file_id, pids] : all_pids) {
+        PyObject* key = PyLong_FromLong(file_id);
+        PyObject* set = key ? make_pid_set(pids) : nullptr;
+        // PyDict_SetItem does not steal references.
+        const bool ok = key && set && PyDict_SetItem(dict, key, set) == 0;
+        Py_XDECREF(key);
+        Py_XDECREF(set);
+        if (!ok) {
+            Py_DECREF(dict);
+            return nullptr;
+        }
+    }
+
+    return dict;
+}
+
+// Python method: return a 2-tuple `(id_to_path, pid_dict)`.
+//
+//   id_to_path - dict[int, str]: file id -> the file's logical name joined
+//                onto the parent directory of the canonicalised index path
+//   pid_dict   - dict[int, set[int]]: file id -> process ids
+//
+// Raises RuntimeError when the index database cannot be queried or the index
+// path cannot be canonicalised; returns nullptr with a Python exception set on
+// any failure.
+static PyObject* Indexer_query_file_info(IndexerObject* self,
+                                         PyObject* Py_UNUSED(ignored)) {
+    using dftracer::utils::utilities::indexer::IndexDatabase;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::unordered_map<std::string, int> file_ids;
+    std::unordered_map<int, std::unordered_set<std::uint64_t>> all_pids;
+    fs::path data_dir;
+    std::string error_msg;
+    // Tracked separately from error_msg: an exception whose what() is empty
+    // must still be reported as a failure.
+    bool failed = false;
+
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        file_ids = db.query_all_file_info_ids();
+        all_pids = db.query_all_file_pids();
+        // weakly_canonical can throw; it is kept inside the try block so that
+        // a filesystem error becomes a RuntimeError instead of escaping.
+        data_dir = fs::weakly_canonical(fs::path(index_path)).parent_path();
+    } catch (const std::exception& e) {
+        failed = true;
+        error_msg = e.what();
+    } catch (...) {
+        // Nothing may propagate past Py_END_ALLOW_THREADS: the GIL would stay
+        // released and the exception would cross the C API boundary.
+        failed = true;
+        error_msg = "Unknown error while querying file info";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    // Builds a Python set from `pids`; nullptr with an error set on failure.
+    const auto make_pid_set =
+        [](const std::unordered_set<std::uint64_t>& pids) -> PyObject* {
+        PyObject* set = PySet_New(nullptr);
+        if (!set) return nullptr;
+        for (const auto pid : pids) {
+            PyObject* val = PyLong_FromUnsignedLongLong(pid);
+            const bool ok = val && PySet_Add(set, val) == 0;
+            Py_XDECREF(val);
+            if (!ok) {
+                Py_DECREF(set);
+                return nullptr;
+            }
+        }
+        return set;
+    };
+
+    PyObject* id_to_path = PyDict_New();
+    if (!id_to_path) return nullptr;
+    for (const auto& [logical_name, fid] : file_ids) {
+        auto resolved = (data_dir / logical_name).string();
+        PyObject* key = PyLong_FromLong(fid);
+        PyObject* val = key ? PyUnicode_FromStringAndSize(
+                                  resolved.data(),
+                                  static_cast<Py_ssize_t>(resolved.size()))
+                            : nullptr;
+        const bool ok = key && val && PyDict_SetItem(id_to_path, key, val) == 0;
+        Py_XDECREF(key);
+        Py_XDECREF(val);
+        if (!ok) {
+            Py_DECREF(id_to_path);
+            return nullptr;
+        }
+    }
+
+    PyObject* pid_dict = PyDict_New();
+    if (!pid_dict) {
+        Py_DECREF(id_to_path);
+        return nullptr;
+    }
+    for (const auto& [file_id, pids] : all_pids) {
+        PyObject* key = PyLong_FromLong(file_id);
+        PyObject* set = key ? make_pid_set(pids) : nullptr;
+        const bool ok = key && set && PyDict_SetItem(pid_dict, key, set) == 0;
+        Py_XDECREF(key);
+        Py_XDECREF(set);
+        if (!ok) {
+            Py_DECREF(id_to_path);
+            Py_DECREF(pid_dict);
+            return nullptr;
+        }
+    }
+
+    // PyTuple_Pack takes its own references to both dicts.
+    PyObject* result = PyTuple_Pack(2, id_to_path, pid_dict);
+    Py_DECREF(id_to_path);
+    Py_DECREF(pid_dict);
+    return result;
+}
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/python/trace_reader_iterator.h>
+#include <dftracer/utils/utilities/common/arrow/column_builder.h>
+
+// Wrap an exported Arrow batch in a new ArrowBatchCapsule Python object.
+//
+// The batch is moved onto the heap and the returned object stores the
+// pointer in its `result` field. Returns a new reference, or nullptr with a
+// Python exception set when either allocation fails. Must be called with the
+// GIL held.
+static PyObject* create_arrow_batch_capsule(
+    dftracer::utils::utilities::common::arrow::ArrowExportResult&& result) {
+    using dftracer::utils::utilities::common::arrow::ArrowExportResult;
+
+    // Allocate the payload before the Python object: a C++ exception can then
+    // neither leak a half-built object nor cross the C API boundary.
+    std::unique_ptr<ArrowExportResult> payload;
+    try {
+        payload = std::make_unique<ArrowExportResult>(std::move(result));
+    } catch (const std::bad_alloc&) {
+        return PyErr_NoMemory();
+    } catch (const std::exception& e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        return nullptr;
+    }
+
+    auto* obj = (ArrowBatchCapsuleObject*)ArrowBatchCapsuleType.tp_alloc(
+        &ArrowBatchCapsuleType, 0);
+    if (!obj) return nullptr;  // `payload` is released by unique_ptr
+    obj->result = payload.release();
+    return (PyObject*)obj;
+}
+
+namespace {
+
+using dftracer::utils::utilities::common::arrow::ArrowExportResult;
+using dftracer::utils::utilities::common::arrow::ColumnSpec;
+using dftracer::utils::utilities::common::arrow::ColumnType;
+using dftracer::utils::utilities::common::arrow::RecordBatchBuilder;
+
+// Map the Python-facing aggregation type name onto an AggMapType.
+//
+// Writes the match to `out` and returns true for "events", "profiles" or
+// "system". For any other string it sets a Python ValueError, leaves `out`
+// untouched and returns false.
+static bool parse_agg_type_str(const char* type_str, AggMapType& out) {
+    struct Choice {
+        const char* label;
+        AggMapType value;
+    };
+    const Choice choices[] = {
+        {"events", AggMapType::EVENT},
+        {"profiles", AggMapType::PROFILE},
+        {"system", AggMapType::SYSTEM},
+    };
+
+    for (const Choice& choice : choices) {
+        if (strcmp(type_str, choice.label) != 0) continue;
+        out = choice.value;
+        return true;
+    }
+
+    PyErr_SetString(PyExc_ValueError,
+                    "type must be 'events', 'profiles', or 'system'");
+    return false;
+}
+
+// Pairs an opened aggregation database with the EventAggregator that
+// open_agg_db() constructs from it.
+//
+// Member order matters: `agg` is declared after `db`, so it is destroyed
+// first, while this handle's reference to the database is still held.
+struct AggDbHandle {
+    // Shared handle to the RocksDB database holding the aggregation data.
+    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> db;
+    // Aggregator constructed from `db` and the stored config hash.
+    std::unique_ptr<EventAggregator> agg;
+};
+
+// Open the aggregation database at `index_path` and bind an EventAggregator
+// to the configuration stored in it.
+//
+// The database is first opened through
+// EventAggregator::open_with_merge_operator(). If that throws, the
+// RocksDBManager entry for the path is reset, the database is reopened
+// read-only, and the intern dictionary is loaded from it.
+//
+// Returns nullptr and fills `error_msg` when the database is not open or has
+// no global aggregation config. Exceptions thrown by the fallback path or by
+// config deserialization propagate to the caller.
+static std::unique_ptr<AggDbHandle> open_agg_db(const std::string& index_path,
+                                                std::string& error_msg) {
+    namespace rdb = dftracer::utils::rocksdb;
+
+    std::shared_ptr<rdb::RocksDatabase> database;
+    try {
+        database = EventAggregator::open_with_merge_operator(index_path);
+    } catch (...) {
+        auto& manager = rdb::RocksDBManager::instance();
+        manager.reset(index_path);
+        database = manager.get_or_open(index_path,
+                                       rdb::RocksDatabase::OpenMode::ReadOnly);
+        const bool reopened = database && database->is_open();
+        if (reopened) {
+            load_intern_dictionary(*database);
+        }
+    }
+
+    const bool usable = database && database->is_open();
+    if (!usable) {
+        error_msg = "Failed to open aggregation database";
+        return nullptr;
+    }
+
+    // sizeof includes the terminating NUL of the key literal; drop it.
+    auto config_key = std::string_view(AGG_GLOBAL_CONFIG_KEY,
+                                       sizeof(AGG_GLOBAL_CONFIG_KEY) - 1);
+    std::string raw_config;
+    if (!database->get(config_key, &raw_config, rdb::cf::AGGREGATION).ok()) {
+        error_msg = "No aggregation config found - was aggregation enabled?";
+        return nullptr;
+    }
+
+    auto global_cfg = deserialize_agg_global_config(raw_config);
+    auto opened = std::make_unique<AggDbHandle>();
+    opened->db = database;
+    opened->agg =
+        std::make_unique<EventAggregator>(database, global_cfg.config_hash);
+    return opened;
+}
+
+// Parse an optional query string supplied from Python.
+//
+// A null or empty string means "no query" and yields std::nullopt without
+// touching the Python error state. A string that fails to parse also yields
+// std::nullopt, but with a ValueError carrying the parser's message set.
+static std::optional<dftracer::utils::utilities::common::query::Query>
+parse_query_arg(const char* query_str) {
+    using dftracer::utils::utilities::common::query::Query;
+
+    const bool blank = (query_str == nullptr) || (*query_str == '\0');
+    if (blank) {
+        return std::nullopt;
+    }
+
+    auto parsed = Query::from_string(query_str);
+    if (!parsed) {
+        const auto& failure = parsed.error();
+        PyErr_SetString(PyExc_ValueError, failure.message.c_str());
+        return std::nullopt;
+    }
+    return std::move(*parsed);
+}
+
+// Exclusive upper bound of the shard range that parallel_shard_scan() covers
+// (shards 0 .. DFT_NUM_SHARDS - 1).
+// NOTE(review): presumably has to match the shard count used when the
+// aggregation data was written - confirm against the writer.
+constexpr std::uint16_t DFT_NUM_SHARDS = 4096;
+
+// Scan the shard range [outer_begin, outer_end) in parallel.
+//
+// The range is cut into contiguous, non-empty sub-ranges, at most one per
+// runtime thread. `scan_fn(shard_begin, shard_end)` is spawned once per
+// sub-range and its results are appended to `outputs` in ascending shard
+// order. The call blocks until every task has finished. An empty or inverted
+// outer range is a no-op.
+//
+// `scan_fn` is captured by reference and invoked from several tasks, so it
+// has to be safe to call concurrently.
+template <typename Output, typename ScanFn>
+void parallel_shard_scan_range(Runtime* rt, std::uint16_t outer_begin,
+                               std::uint16_t outer_end, ScanFn&& scan_fn,
+                               std::vector<Output>& outputs) {
+    if (outer_end <= outer_begin) return;
+    const std::size_t span = static_cast<std::size_t>(outer_end - outer_begin);
+    // At least one task, so the division below cannot be by zero even if the
+    // runtime reports no threads.
+    const std::size_t max_tasks = std::max<std::size_t>(
+        1, std::min<std::size_t>(rt->threads(), span));
+    const std::size_t shards_per_task = (span + max_tasks - 1) / max_tasks;
+    // Rounding shards_per_task up can leave fewer slices than max_tasks;
+    // spawning only the needed ones keeps every task's begin below its end.
+    const std::size_t num_tasks =
+        (span + shards_per_task - 1) / shards_per_task;
+    rt->submit(run_coro_scope(
+                   rt->executor(),
+                   [&](CoroScope& scope) -> CoroTask<void> {
+                       std::vector<dftracer::utils::coro::SpawnFuture<Output>>
+                           futures;
+                       futures.reserve(num_tasks);
+                       for (std::size_t t = 0; t < num_tasks; ++t) {
+                           auto shard_begin = static_cast<std::uint16_t>(
+                               outer_begin + t * shards_per_task);
+                           auto shard_end =
+                               static_cast<std::uint16_t>(std::min<std::size_t>(
+                                   outer_begin + (t + 1) * shards_per_task,
+                                   outer_end));
+                           futures.push_back(
+                               scope.spawn([&scan_fn, shard_begin, shard_end](
+                                               CoroScope&) -> CoroTask<Output> {
+                                   co_return scan_fn(shard_begin, shard_end);
+                               }));
+                       }
+                       outputs.reserve(num_tasks);
+                       // Await in spawn order so results stay in shard order.
+                       for (auto& f : futures) {
+                           outputs.push_back(co_await f);
+                       }
+                   }),
+               "parallel-shard-scan-range")
+        .get();
+}
+
+// Convenience wrapper: run parallel_shard_scan_range() over every shard,
+// i.e. the range [0, DFT_NUM_SHARDS).
+template <typename Output, typename ScanFn>
+void parallel_shard_scan(Runtime* rt, ScanFn&& scan_fn,
+                         std::vector<Output>& outputs) {
+    constexpr std::uint16_t first_shard = 0;
+    constexpr std::uint16_t past_last_shard = DFT_NUM_SHARDS;
+    parallel_shard_scan_range<Output>(rt, first_shard, past_last_shard,
+                                      std::forward<ScanFn>(scan_fn), outputs);
+}
+
+// Wrap each exported batch in a capsule object and append it to `list`.
+//
+// Every element of `results` is moved from. A batch whose capsule cannot be
+// created is skipped, and the return value of PyList_Append is not checked.
+// NOTE(review): a failure in either step presumably leaves a Python exception
+// pending that this function cannot report - confirm that callers check.
+static void append_results_to_list(PyObject* list,
+                                   std::vector<ArrowExportResult>& results) {
+    for (std::size_t i = 0; i < results.size(); ++i) {
+        PyObject* wrapped = create_arrow_batch_capsule(std::move(results[i]));
+        if (wrapped == nullptr) {
+            continue;
+        }
+        // PyList_Append takes its own reference; release ours afterwards.
+        PyList_Append(list, wrapped);
+        Py_DECREF(wrapped);
+    }
+}
+
+// Parameters for one scan_aggregation_shard_range() call.
+struct AggScanInput {
+    // Aggregator whose shards are scanned; not owned by this struct.
+    const EventAggregator* agg;
+    // Only rows whose key carries this map type are emitted.
+    AggMapType target_type;
+    // Written, as an integer, into the "batch_type" column of every row.
+    AggregationBatchType batch_type;
+    // Number of rows after which a batch is finished and a new one started.
+    Py_ssize_t batch_size;
+    // Shard bounds handed to EventAggregator::scan_shard_range_raw().
+    std::uint16_t shard_begin;
+    std::uint16_t shard_end;
+};
+
+// Result of one scan_aggregation_shard_range() call.
+struct AggScanOutput {
+    // Finished batches in the order they were produced; batches for which
+    // valid() is false are not stored.
+    std::vector<ArrowExportResult> results;
+};
+
+// Scan the shards [input.shard_begin, input.shard_end) of the aggregation
+// store and export the rows of type `input.target_type` as record batches.
+//
+// A batch is finished every `input.batch_size` rows, and once more at the end
+// for any remainder. Rows whose key or value fails to parse, or whose map
+// type differs from the target, are skipped. `dur_min` and `size_min` are
+// reported as 0 for rows whose count is 0.
+//
+// A non-positive `batch_size` is treated as 1. Casting it to std::size_t for
+// reserve() would otherwise request an enormous allocation, while the flush
+// test already fires after every row for such values.
+AggScanOutput scan_aggregation_shard_range(AggScanInput input) {
+    AggScanOutput output;
+
+    // Column order here must match the append order in the callback below.
+    static const std::vector<ColumnSpec> schema = {
+        {"batch_type", ColumnType::INT64},  {"cat", ColumnType::DICT_STRING},
+        {"name", ColumnType::DICT_STRING},  {"pid", ColumnType::UINT64},
+        {"tid", ColumnType::UINT64},        {"hhash", ColumnType::DICT_STRING},
+        {"fhash", ColumnType::DICT_STRING}, {"time_bucket", ColumnType::UINT64},
+        {"count", ColumnType::UINT64},      {"dur_total", ColumnType::UINT64},
+        {"dur_min", ColumnType::UINT64},    {"dur_max", ColumnType::UINT64},
+        {"dur_mean", ColumnType::DOUBLE},   {"dur_std", ColumnType::DOUBLE},
+        {"size_total", ColumnType::UINT64}, {"size_min", ColumnType::UINT64},
+        {"size_max", ColumnType::UINT64},   {"size_mean", ColumnType::DOUBLE},
+        {"size_std", ColumnType::DOUBLE},   {"ts", ColumnType::UINT64},
+        {"te", ColumnType::UINT64},
+    };
+
+    const std::size_t rows_per_batch =
+        input.batch_size > 0 ? static_cast<std::size_t>(input.batch_size) : 1;
+
+    RecordBatchBuilder builder;
+    builder.declare_schema(schema);
+    builder.reserve(rows_per_batch);
+
+    std::size_t row_count = 0;
+
+    // Finishes the rows accumulated so far and keeps the batch if it is valid.
+    const auto emit_batch = [&]() {
+        auto arrow = builder.finish();
+        if (arrow.valid()) {
+            output.results.push_back(std::move(arrow));
+        }
+    };
+
+    input.agg->scan_shard_range_raw(
+        input.shard_begin, input.shard_end,
+        [&](std::string_view key_bytes, std::string_view val_bytes) -> bool {
+            // Every path returns true, so no row ever stops the scan.
+            AggKeyView kv;
+            if (!parse_agg_key_view(key_bytes, kv)) return true;
+            if (kv.map_type != input.target_type) return true;
+
+            AggMetricsFullView mv;
+            if (!parse_agg_value_full_view(val_bytes, mv)) return true;
+
+            std::size_t ci = 0;
+            builder.append_int64(ci++,
+                                 static_cast<std::int64_t>(input.batch_type));
+            builder.append_dict_string(ci++, kv.cat);
+            builder.append_dict_string(ci++, kv.name);
+            builder.append_uint64(ci++, kv.pid);
+            builder.append_uint64(ci++, kv.tid);
+            builder.append_dict_string(ci++, kv.hhash);
+            builder.append_dict_string(ci++, kv.fhash);
+            builder.append_uint64(ci++, kv.time_bucket);
+            builder.append_uint64(ci++, mv.count);
+            builder.append_uint64(ci++, mv.dur_total);
+            builder.append_uint64(ci++, mv.count > 0 ? mv.dur_min : 0);
+            builder.append_uint64(ci++, mv.dur_max);
+            builder.append_double(ci++, mv.dur_mean);
+            builder.append_double(ci++, mv.dur_stddev());
+            builder.append_uint64(ci++, mv.size_total);
+            builder.append_uint64(ci++, mv.count > 0 ? mv.size_min : 0);
+            builder.append_uint64(ci++, mv.size_max);
+            builder.append_double(ci++, mv.size_mean);
+            builder.append_double(ci++, mv.size_stddev());
+            builder.append_uint64(ci++, mv.ts);
+            builder.append_uint64(ci++, mv.te);
+            builder.end_row();
+
+            row_count++;
+            if (row_count >= rows_per_batch) {
+                emit_batch();
+                builder.reset(true);
+                builder.reserve(rows_per_batch);
+                row_count = 0;
+            }
+            return true;
+        });
+
+    // Flush the final, partially filled batch.
+    if (row_count > 0) {
+        emit_batch();
+    }
+
+    return output;
+}
+
+// Coarse classification of an I/O function, stored as a signed 8-bit value.
+//
+// get_io_category() returns only READ, WRITE, SYNC, METADATA and OTHER; PCTL
+// and IPC are never returned by that function.
+// NOTE(review): the numeric values are presumably part of an exported data
+// format (e.g. an `io_cat` column) - confirm before renumbering.
+enum class IOCategory : std::int8_t {
+    READ = 1,
+    WRITE = 2,
+    METADATA = 3,
+    PCTL = 4,
+    IPC = 5,
+    OTHER = 6,
+    SYNC = 7,
+};
+
+// Classify a traced function name into an IOCategory.
+//
+// The match is exact and case-sensitive against fixed lists of read, write,
+// sync and metadata call names; anything else is IOCategory::OTHER.
+inline IOCategory get_io_category(std::string_view func_name) {
+    static constexpr std::string_view kReadCalls[] = {
+        "read", "pread", "readv", "preadv", "fread",
+    };
+    static constexpr std::string_view kWriteCalls[] = {
+        "write", "pwrite", "writev", "pwritev", "fwrite",
+    };
+    static constexpr std::string_view kSyncCalls[] = {
+        "fsync", "fdatasync", "msync", "sync",
+    };
+    static constexpr std::string_view kMetadataCalls[] = {
+        "open",      "open64",     "close",    "fopen",      "fopen64",
+        "fclose",    "stat",       "fstat",    "lstat",      "fstatat",
+        "__xstat",   "__xstat64",  "__lxstat", "__lxstat64", "__fxstat",
+        "__fxstat64", "access",    "lseek",    "lseek64",    "fseek",
+        "ftell",     "seek",       "fcntl",    "ftruncate",  "mkdir",
+        "rmdir",     "unlink",     "remove",   "rename",     "link",
+        "readlink",  "opendir",    "closedir", "readdir",
+    };
+
+    const auto listed_in = [func_name](const auto& names) {
+        for (const std::string_view candidate : names) {
+            if (candidate == func_name) return true;
+        }
+        return false;
+    };
+
+    if (listed_in(kReadCalls)) return IOCategory::READ;
+    if (listed_in(kWriteCalls)) return IOCategory::WRITE;
+    if (listed_in(kSyncCalls)) return IOCategory::SYNC;
+    if (listed_in(kMetadataCalls)) return IOCategory::METADATA;
+    return IOCategory::OTHER;
+}
+
+// Write the decimal representation of `val` into `buf`.
+//
+// No terminating NUL is written. Returns a pointer one past the last digit.
+// The caller must supply room for up to 20 characters, the digit count of
+// the largest 64-bit value.
+inline char* fast_itoa(std::uint64_t val, char* buf) {
+    // Digits come out least-significant first, so fill a scratch buffer from
+    // its end and copy the finished run forward.
+    char scratch[20];
+    char* const scratch_end = scratch + sizeof(scratch);
+    char* cursor = scratch_end;
+    do {
+        *--cursor = '0' + (val % 10);
+        val /= 10;
+    } while (val != 0);
+    return std::copy(cursor, scratch_end, buf);
+}
+
+// Resolves file and host hashes to names using two caller-provided
+// hash -> name tables.
+//
+// Every hash and name is copied into an internal StringIntern at
+// construction, and the lookup tables store views of those interned copies.
+// The resolve_* methods return the mapped name when the hash is known,
+// otherwise the interned hash itself; an empty hash is returned unchanged.
+// NOTE(review): returned views presumably stay valid for the lifetime of
+// this object - this relies on StringIntern keeping its storage stable.
+class HashResolver {
+   public:
+    // Either table pointer may be null, in which case that table is empty.
+    HashResolver(
+        const std::unordered_map<std::string, std::string>* file_hashes,
+        const std::unordered_map<std::string, std::string>* host_hashes)
+        : file_hashes_(file_hashes), host_hashes_(host_hashes) {
+        populate(file_hashes_, file_map_);
+        populate(host_hashes_, host_map_);
+    }
+
+    // Name for a file hash, or the interned hash when it is unknown.
+    std::string_view resolve_file(std::string_view hash) {
+        return lookup(file_map_, hash);
+    }
+
+    // Name for a host hash, or the interned hash when it is unknown.
+    std::string_view resolve_host(std::string_view hash) {
+        return lookup(host_map_, hash);
+    }
+
+    // Intern an arbitrary string in this resolver's pool.
+    std::string_view intern(std::string_view sv) { return intern_.intern(sv); }
+
+   private:
+    using SourceTable = std::unordered_map<std::string, std::string>;
+    using ViewTable = std::unordered_map<std::string_view, std::string_view>;
+
+    // Interns every (hash, name) pair of `source` and records it in `target`.
+    void populate(const SourceTable* source, ViewTable& target) {
+        if (source == nullptr) return;
+        for (const auto& entry : *source) {
+            auto interned_hash = intern_.intern(entry.first);
+            auto interned_name = intern_.intern(entry.second);
+            target[interned_hash] = interned_name;
+        }
+    }
+
+    // Shared body of resolve_file() / resolve_host().
+    std::string_view lookup(const ViewTable& table, std::string_view hash) {
+        if (hash.empty()) return hash;
+        auto canonical = intern_.intern(hash);
+        const auto hit = table.find(canonical);
+        if (hit == table.end()) return canonical;
+        return hit->second;
+    }
+
+    const std::unordered_map<std::string, std::string>* file_hashes_;
+    const std::unordered_map<std::string, std::string>* host_hashes_;
+    dftracer::utils::StringIntern intern_;
+    std::unordered_map<std::string_view, std::string_view> file_map_;
+    std::unordered_map<std::string_view, std::string_view> host_map_;
+};
+
+// Identifies one thread of one process on one host: (host hash, pid, tid).
+// `hhash` is a non-owning view; the referenced characters must outlive the
+// key.
+struct ProcKey {
+    std::string_view hhash;
+    std::uint64_t pid;
+    std::uint64_t tid;
+    // Keys are equal when all three fields are equal.
+    bool operator==(const ProcKey& o) const {
+        if (pid != o.pid || tid != o.tid) {
+            return false;
+        }
+        return hhash == o.hhash;
+    }
+};
+
+// Hash functor for ProcKey, for use with unordered containers.
+//
+// The fields are mixed with the same combiner CoarseKeyHash uses. A plain
+// shift-and-XOR of the field hashes collides for simple nearby keys whenever
+// std::hash of an integer is the identity, as in common implementations:
+// (pid=1002, tid=500) and (pid=1000, tid=501) would then hash alike.
+struct ProcKeyHash {
+    std::size_t operator()(const ProcKey& k) const {
+        auto combine = [](std::size_t h, std::size_t v) {
+            return h ^ (v + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2));
+        };
+        std::size_t h = std::hash<std::string_view>{}(k.hhash);
+        h = combine(h, std::hash<std::uint64_t>{}(k.pid));
+        h = combine(h, std::hash<std::uint64_t>{}(k.tid));
+        return h;
+    }
+};
+
+// Column layout (name and column type, in order) of the dfanalyzer-style
+// record batch: 21 columns.
+//
+// The columns cat .. acc_pat plus time_range carry the same names that
+// parse_group_by_name() accepts; the remaining columns (count, time, size,
+// their min/max variants, time_start, time_end) hold metrics.
+// NOTE(review): code that fills a builder declared with this schema
+// presumably appends values in exactly this order - keep the two in sync.
+static const std::vector<ColumnSpec> DFANALYZER_SCHEMA = {
+    {"cat", ColumnType::DICT_STRING},
+    {"func_name", ColumnType::DICT_STRING},
+    {"pid", ColumnType::INT64},
+    {"tid", ColumnType::INT64},
+    {"file_hash", ColumnType::DICT_STRING},
+    {"host_hash", ColumnType::DICT_STRING},
+    {"file_name", ColumnType::DICT_STRING},
+    {"host_name", ColumnType::DICT_STRING},
+    {"proc_name", ColumnType::DICT_STRING},
+    {"io_cat", ColumnType::INT64},
+    {"acc_pat", ColumnType::INT64},
+    {"count", ColumnType::INT64},
+    {"time", ColumnType::DOUBLE},
+    {"size", ColumnType::INT64},
+    {"time_min", ColumnType::DOUBLE},
+    {"time_max", ColumnType::DOUBLE},
+    {"size_min", ColumnType::INT64},
+    {"size_max", ColumnType::INT64},
+    {"time_range", ColumnType::INT64},
+    {"time_start", ColumnType::INT64},
+    {"time_end", ColumnType::INT64},
+};
+
+// Bit flags naming the fields a result can be grouped by. Each enumerator
+// occupies its own bit, so several fields can be combined in one 32-bit mask
+// (see GroupByConfig::mask). parse_group_by_name() maps the textual field
+// names onto these values.
+enum GroupByField : std::uint32_t {
+    GB_CAT = 1u << 0,
+    GB_FUNC_NAME = 1u << 1,
+    GB_PID = 1u << 2,
+    GB_TID = 1u << 3,
+    GB_FILE_HASH = 1u << 4,
+    GB_HOST_HASH = 1u << 5,
+    GB_FILE_NAME = 1u << 6,
+    GB_HOST_NAME = 1u << 7,
+    GB_PROC_NAME = 1u << 8,
+    GB_IO_CAT = 1u << 9,
+    GB_ACC_PAT = 1u << 10,
+    GB_TIME_RANGE = 1u << 11,
+};
+
+// Selected group-by fields.
+// NOTE(review): `mask` is presumably the bitwise OR of the GroupByField
+// values listed in `order` - confirm against the code that fills it.
+struct GroupByConfig {
+    // Bit set of the selected fields; 0 means none selected.
+    std::uint32_t mask = 0;
+    // Selected fields as an ordered list.
+    std::vector<GroupByField> order;
+    std::vector<std::string> names;  // matches `order`, used for schema
+};
+// Maps a group_by column name to its GroupByField flag; nullopt if unknown.
+inline std::optional<GroupByField> parse_group_by_name(std::string_view name) {
+    // Accepted names; each one is also a column name of DFANALYZER_SCHEMA.
+    static constexpr std::pair<std::string_view, GroupByField> kKnown[] = {
+        {"cat", GB_CAT}, {"func_name", GB_FUNC_NAME},
+        {"pid", GB_PID}, {"tid", GB_TID},
+        {"file_hash", GB_FILE_HASH}, {"host_hash", GB_HOST_HASH},
+        {"file_name", GB_FILE_NAME}, {"host_name", GB_HOST_NAME},
+        {"proc_name", GB_PROC_NAME}, {"io_cat", GB_IO_CAT},
+        {"acc_pat", GB_ACC_PAT}, {"time_range", GB_TIME_RANGE},
+    };
+    for (const auto& [label, flag] : kKnown) {
+        if (label == name) return flag;
+    }
+    return std::nullopt;
+}
+// Grouping key of one coarse row; fields not grouped by keep their defaults.
+struct CoarseKey {
+    std::string_view cat;
+    std::string_view func_name;
+    std::uint64_t pid = 0;
+    std::uint64_t tid = 0;
+    std::string_view file_hash;
+    std::string_view host_hash;
+    std::string_view file_name;
+    std::string_view host_name;
+    std::string_view proc_name;
+    std::int64_t io_cat = 0;
+    std::int64_t acc_pat = 0;
+    std::int64_t time_range = 0;
+    // Field-wise equality; the integer members are compared first.
+    bool operator==(const CoarseKey& b) const {
+        if (pid != b.pid || tid != b.tid || io_cat != b.io_cat ||
+            acc_pat != b.acc_pat || time_range != b.time_range) return false;
+        return cat == b.cat && func_name == b.func_name &&
+               file_hash == b.file_hash && host_hash == b.host_hash &&
+               file_name == b.file_name && host_name == b.host_name &&
+               proc_name == b.proc_name;
+    }
+};
+// Hash over every CoarseKey field, folded hash_combine-style in field order.
+struct CoarseKeyHash {
+    std::size_t operator()(const CoarseKey& k) const {
+        const std::hash<std::string_view> hs{};
+        const std::hash<std::uint64_t> hu{};
+        const std::hash<std::int64_t> hi{};
+        std::size_t seed = hs(k.cat);
+        // Hashes of the remaining fields, in declaration order.
+        const std::size_t rest[] = {
+            hs(k.func_name), hu(k.pid),       hu(k.tid),
+            hs(k.file_hash), hs(k.host_hash), hs(k.file_name),
+            hs(k.host_name), hs(k.proc_name), hi(k.io_cat),
+            hi(k.acc_pat),   hi(k.time_range),
+        };
+        // Mixing step in the style of boost::hash_combine.
+        for (const std::size_t v : rest) {
+            seed ^= v + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+};
+// Metrics accumulated for one coarse group across the records folded into it.
+struct CoarseMetrics {
+    std::uint64_t count = 0;
+    double time_sum = 0.0;  // sum of per-record dur_total / time_resolution
+    double time_sq_sum = 0.0;  // sum of the squares of those per-record values
+    double time_min_val = std::numeric_limits<double>::infinity();
+    double time_max_val = -std::numeric_limits<double>::infinity();
+    double time_call_min_val = std::numeric_limits<double>::infinity();
+    double time_call_max_val = -std::numeric_limits<double>::infinity();
+    std::uint64_t size_sum = 0;
+    double size_sq_sum = 0.0;  // sum of squared per-record size_total
+    std::uint64_t size_min_val = std::numeric_limits<std::uint64_t>::max();
+    std::uint64_t size_max_val = 0;  // extremes of the records' size_min/size_max
+    std::uint64_t size_call_min_val = std::numeric_limits<std::uint64_t>::max();
+    std::uint64_t size_call_max_val = 0;  // *_call_*: extremes of per-record totals
+    bool has_size = false;  // a record with size_total > 0 was folded in
+    std::uint64_t time_start_val = std::numeric_limits<std::uint64_t>::max();
+    std::uint64_t time_end_val = 0;
+    bool has_time_bounds = false;  // a record with ts >= time_origin was folded in
+};
+// Schema of coarse output: the group-by columns in cfg order, then the metrics.
+inline std::vector<ColumnSpec> make_coarse_schema(const GroupByConfig& cfg) {
+    using CT = ColumnType;
+    // Metric columns that follow the group-by columns, in output order.
+    static constexpr std::pair<const char*, CT> kMetrics[] = {
+        {"count", CT::INT64}, {"time", CT::DOUBLE}, {"size", CT::INT64},
+        {"time_sq", CT::DOUBLE}, {"size_sq", CT::DOUBLE},
+        {"time_min", CT::DOUBLE}, {"time_max", CT::DOUBLE},
+        {"size_min", CT::INT64}, {"size_max", CT::INT64},
+        {"time_call_min", CT::DOUBLE}, {"time_call_max", CT::DOUBLE},
+        {"size_call_min", CT::INT64}, {"size_call_max", CT::INT64},
+        {"time_start", CT::INT64}, {"time_end", CT::INT64},
+    };
+
+    std::vector<ColumnSpec> specs;
+    specs.reserve(cfg.order.size() + 16);
+    for (std::size_t i = 0; i < cfg.order.size(); ++i) {
+        std::optional<CT> type;  // stays empty if no case matches
+        switch (cfg.order[i]) {
+            case GB_CAT:
+            case GB_FUNC_NAME:
+            case GB_FILE_HASH:
+            case GB_HOST_HASH:
+            case GB_FILE_NAME:
+            case GB_HOST_NAME:
+            case GB_PROC_NAME:
+                type = CT::DICT_STRING;
+                break;
+            case GB_PID:
+            case GB_TID:
+            case GB_IO_CAT:
+            case GB_ACC_PAT:
+            case GB_TIME_RANGE:
+                type = CT::INT64;
+                break;
+        }
+        if (type) specs.push_back({cfg.names[i], *type});
+    }
+
+    // The metric columns are the same for every group-by selection.
+    for (const auto& [name, type] : kMetrics) specs.push_back({name, type});
+    return specs;
+}
+// Arguments of one scan_dfanalyzer_shards call.
+struct DfanalyzerScanInput {
+    const EventAggregator* agg;
+    const DfanalyzerContext* ctx;
+    std::optional<AggMapType> type_filter;  // unset: all map types are exported
+    Py_ssize_t batch_size;  // a builder is flushed once it holds this many rows
+    std::uint16_t shard_begin;
+    std::uint16_t shard_end;
+    const GroupByConfig* group_by = nullptr;  // null = full granularity
+};
+// Arrow batches produced by one scan, kept separately per map type.
+struct DfanalyzerScanOutput {
+    std::vector<ArrowExportResult> events;
+    std::vector<ArrowExportResult> profiles;
+    std::vector<ArrowExportResult> system;
+};
+// Scans one shard range and exports the matching records as Arrow batches.
+DfanalyzerScanOutput scan_dfanalyzer_shards(DfanalyzerScanInput input) {
+    DfanalyzerScanOutput output;
+    // With a group_by config, records are re-aggregated before being exported.
+    const bool coarse = input.group_by != nullptr;
+    const std::vector<ColumnSpec> coarse_schema =
+        coarse ? make_coarse_schema(*input.group_by)
+               : std::vector<ColumnSpec>{};
+    // New builder: active schema declared, reserve(batch_size) applied.
+    auto make_builder = [&]() {
+        RecordBatchBuilder b;
+        if (coarse) {
+            b.declare_schema(coarse_schema);
+        } else {
+            b.declare_schema(DFANALYZER_SCHEMA);
+        }
+        b.reserve(static_cast<std::size_t>(input.batch_size));
+        return b;
+    };
+
+    RecordBatchBuilder event_builder, profile_builder, system_builder;
+    bool use_events =
+        !input.type_filter || *input.type_filter == AggMapType::EVENT;
+    bool use_profiles =
+        !input.type_filter || *input.type_filter == AggMapType::PROFILE;
+    bool use_system =
+        !input.type_filter || *input.type_filter == AggMapType::SYSTEM;
+
+    if (use_events) event_builder = make_builder();
+    if (use_profiles) profile_builder = make_builder();
+    if (use_system) system_builder = make_builder();
+    // Width of one time_range bucket: time_granularity * time_resolution.
+    auto bucket_width_us = static_cast<std::uint64_t>(
+        input.ctx->time_granularity * input.ctx->time_resolution);
+    std::size_t event_count = 0, profile_count = 0, system_count = 0;
+    // Per-scan caches; stored keys view strings interned via the resolver.
+    HashResolver resolver(input.ctx->file_hashes, input.ctx->host_hashes);
+    std::unordered_map<ProcKey, std::string, ProcKeyHash> proc_name_cache;
+    std::unordered_map<std::string_view, IOCategory> io_cat_cache;
+    // Coarse-mode accumulators, one map per map type.
+    std::unordered_map<CoarseKey, CoarseMetrics, CoarseKeyHash> event_coarse,
+        profile_coarse, system_coarse;
+    // Emits the pending rows (if any) as one batch, then resets the builder.
+    auto flush_builder = [&](RecordBatchBuilder& builder, std::size_t& count,
+                             std::vector<ArrowExportResult>& results) {
+        if (count > 0) {
+            auto arrow = builder.finish();
+            if (arrow.valid()) {
+                results.push_back(std::move(arrow));
+            }
+            builder.reset(true);
+            builder.reserve(static_cast<std::size_t>(input.batch_size));
+            count = 0;
+        }
+    };
+    // Appends one row in DFANALYZER_SCHEMA column order; flushes full batches.
+    auto append_row = [&](RecordBatchBuilder& builder, std::size_t& count,
+                          std::vector<ArrowExportResult>& results,
+                          const AggKeyView& kv, const AggMetricsView& mv,
+                          std::string_view file_name,
+                          std::string_view host_name,
+                          std::string_view proc_name, IOCategory io_cat) {
+        std::size_t ci = 0;
+        builder.append_dict_string(ci++, kv.cat);
+        builder.append_dict_string(ci++, kv.name);
+        builder.append_int64(ci++, static_cast<std::int64_t>(kv.pid));
+        builder.append_int64(ci++, static_cast<std::int64_t>(kv.tid));
+        builder.append_dict_string(ci++, kv.fhash);
+        builder.append_dict_string(ci++, kv.hhash);
+        builder.append_dict_string(ci++, file_name);
+        builder.append_dict_string(ci++, host_name);
+        builder.append_dict_string(ci++, proc_name);
+        builder.append_int64(ci++, static_cast<std::int64_t>(io_cat));
+        builder.append_int64(ci++, 0);  // acc_pat column
+
+        builder.append_int64(ci++, static_cast<std::int64_t>(mv.count));
+        builder.append_double(ci++, static_cast<double>(mv.dur_total) /
+                                        input.ctx->time_resolution);
+        // The size column is null when size_total is 0.
+        if (mv.size_total > 0) {
+            builder.append_int64(ci++,
+                                 static_cast<std::int64_t>(mv.size_total));
+        } else {
+            builder.append_null(ci++);
+        }
+
+        builder.append_double(ci++, mv.count > 0
+                                        ? static_cast<double>(mv.dur_min) /
+                                              input.ctx->time_resolution
+                                        : 0.0);
+        builder.append_double(ci++, mv.count > 0
+                                        ? static_cast<double>(mv.dur_max) /
+                                              input.ctx->time_resolution
+                                        : 0.0);
+
+        if (mv.size_total > 0 && mv.count > 0) {
+            builder.append_int64(ci++, static_cast<std::int64_t>(mv.size_min));
+            builder.append_int64(ci++, static_cast<std::int64_t>(mv.size_max));
+        } else {
+            builder.append_null(ci++);
+            builder.append_null(ci++);
+        }
+
+        auto time_range = bucket_width_us > 0
+                              ? static_cast<std::int64_t>(
+                                    (kv.time_bucket - input.ctx->time_origin) /
+                                    bucket_width_us)
+                              : 0;
+        builder.append_int64(ci++, time_range);
+        builder.append_int64(
+            ci++, static_cast<std::int64_t>(mv.ts - input.ctx->time_origin));
+        builder.append_int64(
+            ci++, static_cast<std::int64_t>(mv.te - input.ctx->time_origin));
+        builder.end_row();
+
+        count++;
+        if (static_cast<Py_ssize_t>(count) >= input.batch_size) {
+            flush_builder(builder, count, results);
+        }
+    };
+    // Folds one record into the CoarseMetrics of its group in `map`.
+    auto accumulate_coarse =
+        [&](std::unordered_map<CoarseKey, CoarseMetrics, CoarseKeyHash>& map,
+            const AggKeyView& kv, const AggMetricsView& mv,
+            std::string_view file_name, std::string_view host_name,
+            std::string_view proc_name, IOCategory io_cat) {
+            const auto& cfg = *input.group_by;
+            // Probe with non-interned views; hash/equality compare by content,
+            // so string_view lifetime doesn't matter for lookup. We only copy
+            // (intern) on first insert.
+            CoarseKey probe;
+            if (cfg.mask & GB_CAT) probe.cat = kv.cat;
+            if (cfg.mask & GB_FUNC_NAME) probe.func_name = kv.name;
+            if (cfg.mask & GB_PID) probe.pid = kv.pid;
+            if (cfg.mask & GB_TID) probe.tid = kv.tid;
+            if (cfg.mask & GB_FILE_HASH) probe.file_hash = kv.fhash;
+            if (cfg.mask & GB_HOST_HASH) probe.host_hash = kv.hhash;
+            if (cfg.mask & GB_FILE_NAME) probe.file_name = file_name;
+            if (cfg.mask & GB_HOST_NAME) probe.host_name = host_name;
+            if (cfg.mask & GB_PROC_NAME) probe.proc_name = proc_name;
+            if (cfg.mask & GB_IO_CAT)
+                probe.io_cat = static_cast<std::int64_t>(io_cat);
+            if (cfg.mask & GB_TIME_RANGE) {
+                probe.time_range =
+                    bucket_width_us > 0
+                        ? static_cast<std::int64_t>(
+                              (kv.time_bucket - input.ctx->time_origin) /
+                              bucket_width_us)
+                        : 0;
+            }
+            // acc_pat is always 0 today; included for completeness.
+
+            auto it = map.find(probe);
+            if (it == map.end()) {
+                // First sighting: promote views referencing unstable DB buffers
+                // to interned copies. file_name/host_name come from the
+                // resolver's intern pool, and proc_name from proc_name_cache;
+                // both already stable across iterations, no copy needed.
+                CoarseKey stable = probe;
+                if (cfg.mask & GB_CAT) stable.cat = resolver.intern(kv.cat);
+                if (cfg.mask & GB_FUNC_NAME)
+                    stable.func_name = resolver.intern(kv.name);
+                if (cfg.mask & GB_FILE_HASH)
+                    stable.file_hash = resolver.intern(kv.fhash);
+                if (cfg.mask & GB_HOST_HASH)
+                    stable.host_hash = resolver.intern(kv.hhash);
+                auto [nit, _] = map.emplace(std::move(stable), CoarseMetrics{});
+                it = nit;
+            }
+            CoarseMetrics& m = it->second;
+            m.count += mv.count;
+            double time_val =
+                static_cast<double>(mv.dur_total) / input.ctx->time_resolution;
+            m.time_sum += time_val;
+            m.time_sq_sum += time_val * time_val;
+            if (time_val < m.time_call_min_val) m.time_call_min_val = time_val;
+            if (time_val > m.time_call_max_val) m.time_call_max_val = time_val;
+            if (mv.count > 0) {
+                double dur_min_v = static_cast<double>(mv.dur_min) /
+                                   input.ctx->time_resolution;
+                double dur_max_v = static_cast<double>(mv.dur_max) /
+                                   input.ctx->time_resolution;
+                if (dur_min_v < m.time_min_val) m.time_min_val = dur_min_v;
+                if (dur_max_v > m.time_max_val) m.time_max_val = dur_max_v;
+            }
+            if (mv.size_total > 0) {
+                m.has_size = true;
+                m.size_sum += mv.size_total;
+                double sz = static_cast<double>(mv.size_total);
+                m.size_sq_sum += sz * sz;
+                if (mv.size_total < m.size_call_min_val)
+                    m.size_call_min_val = mv.size_total;
+                if (mv.size_total > m.size_call_max_val)
+                    m.size_call_max_val = mv.size_total;
+                if (mv.count > 0) {
+                    if (mv.size_min < m.size_min_val)
+                        m.size_min_val = mv.size_min;
+                    if (mv.size_max > m.size_max_val)
+                        m.size_max_val = mv.size_max;
+                }
+            }
+            if (mv.ts >= input.ctx->time_origin) {
+                m.has_time_bounds = true;
+                auto ts_off = mv.ts - input.ctx->time_origin;
+                auto te_off = mv.te - input.ctx->time_origin;
+                if (ts_off < m.time_start_val) m.time_start_val = ts_off;
+                if (te_off > m.time_end_val) m.time_end_val = te_off;
+            }
+        };
+    // Main pass: decode each record, filter it, then append or accumulate.
+    input.agg->scan_shard_range_raw(
+        input.shard_begin, input.shard_end,
+        [&](std::string_view key_bytes, std::string_view val_bytes) -> bool {
+            AggKeyView kv;
+            if (!parse_agg_key_view(key_bytes, kv)) return true;
+
+            if (input.type_filter && kv.map_type != *input.type_filter)
+                return true;
+
+            if (input.ctx->query_filter) {
+                auto& q = *input.ctx->query_filter;
+                dftracer::utils::utilities::common::query::ValueMap fields;
+                if (q.references("cat")) fields["cat"] = std::string(kv.cat);
+                if (q.references("name")) fields["name"] = std::string(kv.name);
+                if (q.references("pid")) fields["pid"] = kv.pid;
+                if (q.references("tid")) fields["tid"] = kv.tid;
+                if (q.references("hhash"))
+                    fields["hhash"] = std::string(kv.hhash);
+                if (q.references("fhash"))
+                    fields["fhash"] = std::string(kv.fhash);
+                if (q.references("time_bucket"))
+                    fields["time_bucket"] = kv.time_bucket;
+                if (!q.evaluate(fields)) return true;
+            }
+
+            AggMetricsView mv;
+            if (!parse_agg_value_view(val_bytes, mv)) return true;
+
+            auto file_name = resolver.resolve_file(kv.fhash);
+            auto host_name = resolver.resolve_host(kv.hhash);
+            // proc_name is "app#<host>#<pid>#<tid>", cached per ProcKey.
+            ProcKey pk{kv.hhash, kv.pid, kv.tid};
+            auto proc_it = proc_name_cache.find(pk);
+            std::string_view proc_name;
+            if (proc_it != proc_name_cache.end()) {
+                proc_name = proc_it->second;
+            } else {
+                std::string pn = "app#";
+                if (!host_name.empty()) {
+                    pn.append(host_name);
+                } else if (!kv.hhash.empty()) {
+                    pn.append(kv.hhash);
+                } else {
+                    pn.append("unknown");
+                }
+                pn.push_back('#');
+                pn.append(std::to_string(kv.pid));
+                pn.push_back('#');
+                pn.append(std::to_string(kv.tid));
+                ProcKey stable_pk{resolver.intern(kv.hhash), kv.pid, kv.tid};
+                auto [it, _] =
+                    proc_name_cache.emplace(stable_pk, std::move(pn));
+                proc_name = it->second;
+            }
+            // get_io_category runs once per distinct event name.
+            auto io_it = io_cat_cache.find(kv.name);
+            IOCategory io_cat;
+            if (io_it != io_cat_cache.end()) {
+                io_cat = io_it->second;
+            } else {
+                io_cat = get_io_category(kv.name);
+                io_cat_cache[resolver.intern(kv.name)] = io_cat;
+            }
+
+            if (coarse) {
+                switch (kv.map_type) {
+                    case AggMapType::EVENT:
+                        if (use_events)
+                            accumulate_coarse(event_coarse, kv, mv, file_name,
+                                              host_name, proc_name, io_cat);
+                        break;
+                    case AggMapType::PROFILE:
+                        if (use_profiles)
+                            accumulate_coarse(profile_coarse, kv, mv, file_name,
+                                              host_name, proc_name, io_cat);
+                        break;
+                    case AggMapType::SYSTEM:
+                        if (use_system)
+                            accumulate_coarse(system_coarse, kv, mv, file_name,
+                                              host_name, proc_name, io_cat);
+                        break;
+                }
+            } else {
+                switch (kv.map_type) {
+                    case AggMapType::EVENT:
+                        append_row(event_builder, event_count, output.events,
+                                   kv, mv, file_name, host_name, proc_name,
+                                   io_cat);
+                        break;
+                    case AggMapType::PROFILE:
+                        append_row(profile_builder, profile_count,
+                                   output.profiles, kv, mv, file_name,
+                                   host_name, proc_name, io_cat);
+                        break;
+                    case AggMapType::SYSTEM:
+                        append_row(system_builder, system_count, output.system,
+                                   kv, mv, file_name, host_name, proc_name,
+                                   io_cat);
+                        break;
+                }
+            }
+            return true;
+        });
+    // Coarse groups are emitted only after the whole range was scanned.
+    if (coarse) {
+        const auto& cfg = *input.group_by;
+        auto flush_coarse = [&](std::unordered_map<CoarseKey, CoarseMetrics,
+                                                   CoarseKeyHash>& map,
+                                RecordBatchBuilder& builder, std::size_t& count,
+                                std::vector<ArrowExportResult>& results) {
+            for (auto& [key, m] : map) {
+                std::size_t ci = 0;
+                for (std::size_t i = 0; i < cfg.order.size(); ++i) {
+                    switch (cfg.order[i]) {
+                        case GB_CAT:
+                            builder.append_dict_string(ci++, key.cat);
+                            break;
+                        case GB_FUNC_NAME:
+                            builder.append_dict_string(ci++, key.func_name);
+                            break;
+                        case GB_PID:
+                            builder.append_int64(
+                                ci++, static_cast<std::int64_t>(key.pid));
+                            break;
+                        case GB_TID:
+                            builder.append_int64(
+                                ci++, static_cast<std::int64_t>(key.tid));
+                            break;
+                        case GB_FILE_HASH:
+                            builder.append_dict_string(ci++, key.file_hash);
+                            break;
+                        case GB_HOST_HASH:
+                            builder.append_dict_string(ci++, key.host_hash);
+                            break;
+                        case GB_FILE_NAME:
+                            builder.append_dict_string(ci++, key.file_name);
+                            break;
+                        case GB_HOST_NAME:
+                            builder.append_dict_string(ci++, key.host_name);
+                            break;
+                        case GB_PROC_NAME:
+                            builder.append_dict_string(ci++, key.proc_name);
+                            break;
+                        case GB_IO_CAT:
+                            builder.append_int64(ci++, key.io_cat);
+                            break;
+                        case GB_ACC_PAT:
+                            builder.append_int64(ci++, key.acc_pat);
+                            break;
+                        case GB_TIME_RANGE:
+                            builder.append_int64(ci++, key.time_range);
+                            break;
+                    }
+                }
+                builder.append_int64(ci++, static_cast<std::int64_t>(m.count));
+                builder.append_double(ci++, m.time_sum);
+                if (m.has_size) {
+                    builder.append_int64(ci++,
+                                         static_cast<std::int64_t>(m.size_sum));
+                } else {
+                    builder.append_null(ci++);
+                }
+                builder.append_double(ci++, m.time_sq_sum);
+                if (m.has_size) {
+                    builder.append_double(ci++, m.size_sq_sum);
+                } else {
+                    builder.append_null(ci++);
+                }
+                builder.append_double(ci++, m.count > 0 ? m.time_min_val : 0.0);
+                builder.append_double(ci++, m.count > 0 ? m.time_max_val : 0.0);
+                if (m.has_size) {
+                    builder.append_int64(
+                        ci++, static_cast<std::int64_t>(m.size_min_val));
+                    builder.append_int64(
+                        ci++, static_cast<std::int64_t>(m.size_max_val));
+                } else {
+                    builder.append_null(ci++);
+                    builder.append_null(ci++);
+                }
+                builder.append_double(ci++,
+                                      m.count > 0 ? m.time_call_min_val : 0.0);
+                builder.append_double(ci++,
+                                      m.count > 0 ? m.time_call_max_val : 0.0);
+                if (m.has_size) {
+                    builder.append_int64(
+                        ci++, static_cast<std::int64_t>(m.size_call_min_val));
+                    builder.append_int64(
+                        ci++, static_cast<std::int64_t>(m.size_call_max_val));
+                } else {
+                    builder.append_null(ci++);
+                    builder.append_null(ci++);
+                }
+                builder.append_int64(
+                    ci++, m.has_time_bounds
+                              ? static_cast<std::int64_t>(m.time_start_val)
+                              : 0);
+                builder.append_int64(
+                    ci++, m.has_time_bounds
+                              ? static_cast<std::int64_t>(m.time_end_val)
+                              : 0);
+                builder.end_row();
+                ++count;
+                if (static_cast<Py_ssize_t>(count) >= input.batch_size) {
+                    flush_builder(builder, count, results);
+                }
+            }
+            flush_builder(builder, count, results);
+        };
+        if (use_events)
+            flush_coarse(event_coarse, event_builder, event_count,
+                         output.events);
+        if (use_profiles)
+            flush_coarse(profile_coarse, profile_builder, profile_count,
+                         output.profiles);
+        if (use_system)
+            flush_coarse(system_coarse, system_builder, system_count,
+                         output.system);
+    } else {
+        if (use_events)
+            flush_builder(event_builder, event_count, output.events);
+        if (use_profiles)
+            flush_builder(profile_builder, profile_count, output.profiles);
+        if (use_system)
+            flush_builder(system_builder, system_count, output.system);
+    }
+
+    return output;
+}
+
+}  // namespace
+// Returns a Python iterator over the Arrow batches of one aggregation map type.
+static PyObject* Indexer_iter_aggregation(IndexerObject* self, PyObject* args,
+                                          PyObject* kwds) {
+    static const char* kwlist[] = {"type", "batch_size", nullptr};
+    const char* type_str = "events";
+    Py_ssize_t batch_size = 10000;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|sn",
+                                     const_cast<char**>(kwlist), &type_str,
+                                     &batch_size)) {
+        return nullptr;
+    }
+
+    AggMapType target_type;
+    if (!parse_agg_type_str(type_str, target_type)) return nullptr;
+
+    // Any type other than EVENT or PROFILE is scanned as SYSTEM.
+    const AggregationBatchType batch_type =
+        target_type == AggMapType::EVENT     ? AggregationBatchType::EVENT
+        : target_type == AggMapType::PROFILE ? AggregationBatchType::PROFILE
+                                             : AggregationBatchType::SYSTEM;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    PyObject* batch_list = PyList_New(0);
+    if (!batch_list) return nullptr;
+
+    std::string error_msg;
+    std::vector<dftracer::utils::utilities::common::arrow::ArrowExportResult>
+        results;
+
+    // The scan runs with the GIL released; no exception may leave this region.
+    Py_BEGIN_ALLOW_THREADS try {
+        auto handle = open_agg_db(index_path, error_msg);
+        if (handle) {
+            Runtime* rt = get_batch_indexer_runtime(self);
+            std::vector<AggScanOutput> outputs;
+            parallel_shard_scan<AggScanOutput>(
+                rt,
+                [&](std::uint16_t shard_begin, std::uint16_t shard_end) {
+                    AggScanInput input;
+                    input.agg = handle->agg.get();
+                    input.target_type = target_type;
+                    input.batch_type = batch_type;
+                    input.batch_size = batch_size;
+                    input.shard_begin = shard_begin;
+                    input.shard_end = shard_end;
+                    return scan_aggregation_shard_range(input);
+                },
+                outputs);
+
+            for (auto& out : outputs) {
+                for (auto& r : out.results) {
+                    results.push_back(std::move(r));
+                }
+            }
+        }
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    } catch (...) {
+        error_msg = "unknown C++ exception during aggregation scan";
+    }
+    Py_END_ALLOW_THREADS
+    if (!error_msg.empty()) {
+        Py_DECREF(batch_list);
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    append_results_to_list(batch_list, results);
+    PyObject* iter = PyObject_GetIter(batch_list);
+    Py_DECREF(batch_list);
+    return iter;
+}
+// Returns a Python iterator over dfanalyzer-layout batches of one map type.
+static PyObject* Indexer_iter_arrow_dfanalyzer(IndexerObject* self,
+                                               PyObject* args, PyObject* kwds) {
+    static const char* kwlist[] = {"type", "batch_size", "time_granularity",
+                                   "time_resolution", "query", nullptr};
+    const char* type_str = "events";
+    Py_ssize_t batch_size = 10000;
+    double time_granularity = 1.0;
+    double time_resolution = 1000000.0;
+    const char* query_str = nullptr;
+
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwds, "|snddz", const_cast<char**>(kwlist), &type_str,
+            &batch_size, &time_granularity, &time_resolution, &query_str)) {
+        return nullptr;
+    }
+    // The scan divides by time_resolution and casts the other two to unsigned.
+    if (batch_size < 0 || !(time_resolution > 0.0) ||
+        !(time_granularity >= 0.0)) {  // negated forms also reject NaN
+        PyErr_SetString(PyExc_ValueError,
+                        "batch_size and time_granularity must be >= 0, "
+                        "time_resolution must be > 0");
+        return nullptr;
+    }
+
+    AggMapType target_type;
+    if (!parse_agg_type_str(type_str, target_type)) return nullptr;
+
+    auto query_opt = parse_query_arg(query_str);
+    if (!query_opt && PyErr_Occurred()) return nullptr;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    PyObject* batch_list = PyList_New(0);
+    if (!batch_list) return nullptr;
+    std::string error_msg;
+    std::vector<ArrowExportResult> results;
+    Py_BEGIN_ALLOW_THREADS try {
+        auto handle = open_agg_db(index_path, error_msg);
+        if (handle) {
+            dftracer::utils::utilities::indexer::IndexDatabase idx_db(
+                index_path,
+                dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+            auto file_hashes =
+                idx_db.query_hash_table(dftracer::utils::utilities::indexer::
+                                            IndexDatabase::HashType::FILE);
+            auto host_hashes =
+                idx_db.query_hash_table(dftracer::utils::utilities::indexer::
+                                            IndexDatabase::HashType::HOST);
+            auto time_bounds = handle->agg->query_time_bounds();
+            DfanalyzerContext ctx;
+            ctx.file_hashes = &file_hashes;
+            ctx.host_hashes = &host_hashes;
+            ctx.query_filter = query_opt ? &*query_opt : nullptr;
+            ctx.time_origin =
+                time_bounds.valid ? time_bounds.min_time_bucket : 0;
+            ctx.time_resolution = time_resolution;
+            ctx.time_granularity = time_granularity;
+
+            Runtime* rt = get_batch_indexer_runtime(self);
+            std::vector<DfanalyzerScanOutput> outputs;
+            parallel_shard_scan<DfanalyzerScanOutput>(
+                rt,
+                [&](std::uint16_t shard_begin, std::uint16_t shard_end) {
+                    DfanalyzerScanInput input;
+                    input.agg = handle->agg.get();
+                    input.ctx = &ctx;
+                    input.type_filter = target_type;
+                    input.batch_size = batch_size;
+                    input.shard_begin = shard_begin;
+                    input.shard_end = shard_end;
+                    return scan_dfanalyzer_shards(input);
+                },
+                outputs);
+            for (auto& out : outputs) {
+                for (auto& r : out.events) results.push_back(std::move(r));
+                for (auto& r : out.profiles) results.push_back(std::move(r));
+                for (auto& r : out.system) results.push_back(std::move(r));
+            }
+        }
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    } catch (...) {  // nothing may propagate while the GIL is released
+        error_msg = "unknown C++ exception during dfanalyzer scan";
+    }
+    Py_END_ALLOW_THREADS
+    if (!error_msg.empty()) {
+        Py_DECREF(batch_list);
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+    append_results_to_list(batch_list, results);
+    PyObject* iter = PyObject_GetIter(batch_list);
+    Py_DECREF(batch_list);
+    return iter;
+}
+
+// Fill `out` from the Python `group_by` argument; false => exception is set.
+static bool parse_group_by_arg(PyObject* obj, GroupByConfig& out) {
+    if (!obj || obj == Py_None) return true;  // nothing requested
+    if (!PySequence_Check(obj)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "group_by must be a sequence of strings or None");
+        return false;
+    }
+    auto add_entry = [&out](PyObject* item) -> bool {  // borrows `item`
+        if (!PyUnicode_Check(item)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "group_by entries must be strings");
+            return false;
+        }
+        Py_ssize_t sz = 0;
+        const char* s = PyUnicode_AsUTF8AndSize(item, &sz);
+        if (!s) return false;
+        std::string_view sv(s, static_cast<std::size_t>(sz));
+        auto field = parse_group_by_name(sv);
+        if (!field) {
+            PyErr_Format(PyExc_ValueError,
+                         "unsupported group_by field: %s", s);
+            return false;
+        }
+        if (!(out.mask & *field)) {  // repeated names keep first position
+            out.mask |= *field;
+            out.order.push_back(*field);
+            out.names.emplace_back(sv);
+        }
+        return true;
+    };
+    const Py_ssize_t n = PySequence_Length(obj);
+    if (n < 0) return false;  // length query raised; do not report success
+    for (Py_ssize_t i = 0; i < n; ++i) {
+        PyObject* item = PySequence_GetItem(obj, i);  // new ref or nullptr
+        const bool ok = item && add_entry(item);
+        Py_XDECREF(item);
+        if (!ok) return false;
+    }
+    return true;
+}
+
+// Python entry point for Indexer.iter_arrow_dfanalyzer_all(): scans every
+// shard once with the GIL released and returns a dict with "events",
+// "profiles" and "system" lists, or nullptr with a Python exception set.
+static PyObject* Indexer_iter_arrow_dfanalyzer_all(IndexerObject* self,
+                                                   PyObject* args,
+                                                   PyObject* kwds) {
+    static const char* kwlist[] = {"batch_size",      "time_granularity",
+                                   "time_resolution", "query",
+                                   "group_by",        nullptr};
+    Py_ssize_t batch_size = 10000;
+    double time_granularity = 1.0;
+    double time_resolution = 1000000.0;
+    const char* query_str = nullptr;
+    PyObject* group_by_obj = nullptr;
+
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwds, "|nddzO", (char**)kwlist, &batch_size,
+            &time_granularity, &time_resolution, &query_str, &group_by_obj)) {
+        return nullptr;
+    }
+
+    auto query_opt = parse_query_arg(query_str);
+    if (!query_opt && PyErr_Occurred()) return nullptr;
+
+    GroupByConfig group_by_cfg;
+    if (!parse_group_by_arg(group_by_obj, group_by_cfg)) return nullptr;
+    const GroupByConfig* group_by_ptr =
+        group_by_cfg.mask != 0 ? &group_by_cfg : nullptr;
+
+    auto idx_opt = resolve_index_path(self);
+    if (!idx_opt) return nullptr;
+    std::string index_path = std::move(*idx_opt);
+
+    std::string error_msg;
+    std::vector<ArrowExportResult> events_results, profiles_results,
+        system_results;
+
+    Py_BEGIN_ALLOW_THREADS try {
+        auto handle = open_agg_db(index_path, error_msg);
+        if (handle) {
+            dftracer::utils::utilities::indexer::IndexDatabase idx_db(
+                index_path,
+                dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+            auto file_hashes =
+                idx_db.query_hash_table(dftracer::utils::utilities::indexer::
+                                            IndexDatabase::HashType::FILE);
+            auto host_hashes =
+                idx_db.query_hash_table(dftracer::utils::utilities::indexer::
+                                            IndexDatabase::HashType::HOST);
+
+            auto time_bounds = handle->agg->query_time_bounds();
+            std::uint64_t time_origin =
+                time_bounds.valid ? time_bounds.min_time_bucket : 0;
+
+            DfanalyzerContext ctx;
+            ctx.file_hashes = &file_hashes;
+            ctx.host_hashes = &host_hashes;
+            ctx.query_filter = query_opt ? &*query_opt : nullptr;
+            ctx.time_origin = time_origin;
+            ctx.time_resolution = time_resolution;
+            ctx.time_granularity = time_granularity;
+
+            // NOTE(review): GIL is released here; confirm this call is safe.
+            Runtime* rt = get_batch_indexer_runtime(self);
+            std::vector<DfanalyzerScanOutput> outputs;
+            parallel_shard_scan<DfanalyzerScanOutput>(
+                rt,
+                [&](std::uint16_t shard_begin, std::uint16_t shard_end) {
+                    DfanalyzerScanInput input;
+                    input.agg = handle->agg.get();
+                    input.ctx = &ctx;
+                    input.type_filter = std::nullopt;
+                    input.batch_size = batch_size;
+                    input.shard_begin = shard_begin;
+                    input.shard_end = shard_end;
+                    input.group_by = group_by_ptr;
+                    return scan_dfanalyzer_shards(input);
+                },
+                outputs);
+
+            for (auto& out : outputs) {
+                for (auto& r : out.events)
+                    events_results.push_back(std::move(r));
+                for (auto& r : out.profiles)
+                    profiles_results.push_back(std::move(r));
+                for (auto& r : out.system)
+                    system_results.push_back(std::move(r));
+            }
+        }
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    } catch (...) {
+        error_msg = "unknown C++ exception during aggregation scan";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    // Results are wrapped only after the scan succeeded, so the error path
+    // above has no Python objects to release.
+    PyObject* result_dict = PyDict_New();
+    if (!result_dict) return nullptr;
+    struct Category {
+        const char* key;
+        std::vector<ArrowExportResult>* batches;
+    };
+    const Category categories[] = {{"events", &events_results},
+                                   {"profiles", &profiles_results},
+                                   {"system", &system_results}};
+    for (const Category& cat : categories) {
+        PyObject* list = PyList_New(0);
+        if (list) append_results_to_list(list, *cat.batches);
+        // A failed insert or a pending exception must surface as nullptr.
+        const bool ok = list && !PyErr_Occurred() &&
+                        PyDict_SetItemString(result_dict, cat.key, list) == 0;
+        Py_XDECREF(list);  // the dict holds its own reference on success
+        if (!ok) {
+            Py_DECREF(result_dict);
+            return nullptr;
+        }
+    }
+    return result_dict;
+}
+
+// ---------------------------------------------------------------------------
+// scan_aggregation_manifest — module-level entry point for analyze_trace.
+//
+// Each Dask worker calls this with its slice of the agg manifest
+// (agg_ssts + sys_ssts) and optionally a [shard_begin, shard_end) range.
+// The function opens a scratch IndexDatabase at `scratch_dir`, ingests the
+// SSTs into its AGGREGATION/SYSTEM_METRICS CFs (nearly free when SSTs live
+// on the same filesystem as `scratch_dir` — RocksDB hard-links them), then
+// runs the same parallel shard scan that `iter_arrow_dfanalyzer_all` uses.
+//
+// AGG_GLOBAL_CONFIG_KEY is not written by worker SSTs, so we construct the
+// EventAggregator with config_hash=0 directly instead of going through
+// `open_agg_db` (which requires the config key). The config hash is used
+// by the aggregator only for write-time validation, not for reads.
+//
+// The scratch DB is NOT cleaned up here — the Python caller owns
+// `scratch_dir` lifetime and should remove it after gathering results.
+// ---------------------------------------------------------------------------
+
+// Appends every str item of `obj` to `out`; None / missing appends nothing.
+// Returns false with a Python exception set on a non-sequence or non-str.
+static bool collect_string_list(PyObject* obj, const char* name,
+                                std::vector<std::string>& out) {
+    if (obj == nullptr || obj == Py_None) return true;
+    PyObject* fast = PySequence_Fast(obj, name);  // `name` is the error text
+    if (fast == nullptr) return false;
+    const Py_ssize_t count = PySequence_Fast_GET_SIZE(fast);
+    out.reserve(static_cast<std::size_t>(count));
+    bool ok = true;
+    for (Py_ssize_t idx = 0; ok && idx < count; ++idx) {
+        PyObject* entry = PySequence_Fast_GET_ITEM(fast, idx);  // borrowed
+        const char* utf8 = nullptr;
+        if (PyUnicode_Check(entry)) {
+            utf8 = PyUnicode_AsUTF8(entry);
+        } else {
+            PyErr_Format(PyExc_TypeError, "%s items must be str", name);
+        }
+        ok = utf8 != nullptr;
+        if (ok) out.emplace_back(utf8);
+    }
+    Py_DECREF(fast);
+    return ok;
+}
+
+// Copies a Python dict[str, str] into `out`; false => exception is set.
+static bool collect_string_string_dict(
+    PyObject* obj, const char* name,
+    std::unordered_map<std::string, std::string>& out) {
+    if (obj == nullptr || obj == Py_None) return true;  // nothing to copy
+    if (!PyDict_Check(obj)) {
+        PyErr_Format(PyExc_TypeError, "%s must be a dict[str, str] or None",
+                     name);
+        return false;
+    }
+    PyObject *key = nullptr, *value = nullptr;  // borrowed via PyDict_Next
+    for (Py_ssize_t cursor = 0; PyDict_Next(obj, &cursor, &key, &value);) {
+        if (!(PyUnicode_Check(key) && PyUnicode_Check(value))) {
+            PyErr_Format(PyExc_TypeError, "%s must map str -> str", name);
+            return false;
+        }
+        const char* key_utf8 = PyUnicode_AsUTF8(key);
+        const char* value_utf8 = PyUnicode_AsUTF8(value);
+        if (key_utf8 == nullptr || value_utf8 == nullptr) return false;
+        out.emplace(key_utf8, value_utf8);
+    }
+    return true;
+}
+
+// Python: scan_aggregation_manifest(agg_ssts, sys_ssts, scratch_dir,
+// meta_index_path, ...) -> dict. Ingests the SSTs into a scratch index under
+// `scratch_dir`, scans [shard_begin, shard_end) with the GIL released and
+// returns {"events", "profiles", "system"} lists, or nullptr on error.
+static PyObject* scan_aggregation_manifest_fn(PyObject* /*self*/,
+                                              PyObject* args, PyObject* kwds) {
+    static const char* kwlist[] = {
+        "agg_ssts",        "sys_ssts",    "scratch_dir",
+        "meta_index_path", "batch_size",  "time_granularity",
+        "time_resolution", "query",       "group_by",
+        "shard_begin",     "shard_end",   "runtime",
+        "file_hashes",     "host_hashes", nullptr};
+
+    PyObject* agg_ssts_obj = nullptr;
+    PyObject* sys_ssts_obj = nullptr;
+    const char* scratch_dir = nullptr;
+    const char* meta_index_path = nullptr;
+    Py_ssize_t batch_size = 10000;
+    double time_granularity = 1.0;
+    double time_resolution = 1000000.0;
+    const char* query_str = nullptr;
+    PyObject* group_by_obj = nullptr;
+    int shard_begin_i = 0;
+    int shard_end_i = DFT_NUM_SHARDS;
+    PyObject* runtime_obj = nullptr;
+    PyObject* file_hashes_obj = nullptr;
+    PyObject* host_hashes_obj = nullptr;
+
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwds, "OOss|nddzOiiOOO", (char**)kwlist, &agg_ssts_obj,
+            &sys_ssts_obj, &scratch_dir, &meta_index_path, &batch_size,
+            &time_granularity, &time_resolution, &query_str, &group_by_obj,
+            &shard_begin_i, &shard_end_i, &runtime_obj, &file_hashes_obj,
+            &host_hashes_obj)) {
+        return nullptr;
+    }
+
+    if (shard_begin_i < 0 || shard_end_i > DFT_NUM_SHARDS ||
+        shard_begin_i >= shard_end_i) {
+        PyErr_Format(PyExc_ValueError,
+                     "shard range [%d, %d) invalid (must be within [0, %d))",
+                     shard_begin_i, shard_end_i, (int)DFT_NUM_SHARDS);
+        return nullptr;
+    }
+
+    std::vector<std::string> agg_ssts;
+    std::vector<std::string> sys_ssts;
+    if (!collect_string_list(agg_ssts_obj, "agg_ssts", agg_ssts))
+        return nullptr;
+    if (!collect_string_list(sys_ssts_obj, "sys_ssts", sys_ssts))
+        return nullptr;
+
+    // Each table counts as preloaded only when its own dict was passed, so
+    // passing just one of them still resolves the other from the meta DB.
+    std::unordered_map<std::string, std::string> file_hashes;
+    std::unordered_map<std::string, std::string> host_hashes;
+    const bool file_preloaded = file_hashes_obj && file_hashes_obj != Py_None;
+    const bool host_preloaded = host_hashes_obj && host_hashes_obj != Py_None;
+    if (!collect_string_string_dict(file_hashes_obj, "file_hashes",
+                                    file_hashes))
+        return nullptr;
+    if (!collect_string_string_dict(host_hashes_obj, "host_hashes",
+                                    host_hashes))
+        return nullptr;
+
+    auto query_opt = parse_query_arg(query_str);
+    if (!query_opt && PyErr_Occurred()) return nullptr;
+
+    GroupByConfig group_by_cfg;
+    if (!parse_group_by_arg(group_by_obj, group_by_cfg)) return nullptr;
+    const GroupByConfig* group_by_ptr =
+        group_by_cfg.mask != 0 ? &group_by_cfg : nullptr;
+
+    Runtime* rt = nullptr;
+    if (runtime_obj && runtime_obj != Py_None) {
+        if (!PyObject_TypeCheck(runtime_obj, &RuntimeType)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "runtime must be a Runtime instance or None");
+            return nullptr;
+        }
+        rt = ((RuntimeObject*)runtime_obj)->runtime.get();
+    } else {
+        rt = get_default_runtime();
+    }
+
+    std::string error_msg;
+    std::vector<ArrowExportResult> events_results, profiles_results,
+        system_results;
+    std::string scratch_index_path = std::string(scratch_dir) + "/.dftindex";
+    std::string meta_index_path_str(meta_index_path);
+
+    Py_BEGIN_ALLOW_THREADS try {
+        namespace rcf = dftracer::utils::rocksdb::cf;
+        using clock = std::chrono::steady_clock;
+        auto ms = [](clock::time_point a, clock::time_point b) -> long long {
+            return std::chrono::duration_cast<std::chrono::milliseconds>(b - a)
+                .count();
+        };
+
+        // Open the scratch index, then ingest this worker's SSTs into its
+        // AGGREGATION and SYSTEM_METRICS column families one file at a time.
+        auto t_start = clock::now();
+        dftracer::utils::utilities::indexer::IndexDatabase scratch_db(
+            scratch_index_path);
+        auto t_scratch_open = clock::now();
+
+        auto raw_db = scratch_db.db();
+        for (const auto& p : agg_ssts) {
+            auto st = raw_db->ingest_external_files(rcf::AGGREGATION, {p},
+                                                    /*ingest_behind=*/false);
+            if (!st.ok()) {
+                error_msg =
+                    "ingest AGGREGATION sst '" + p + "': " + st.ToString();
+                break;
+            }
+        }
+        if (error_msg.empty()) {
+            for (const auto& p : sys_ssts) {
+                auto st = raw_db->ingest_external_files(
+                    rcf::SYSTEM_METRICS, {p}, /*ingest_behind=*/false);
+                if (!st.ok()) {
+                    error_msg = "ingest SYSTEM_METRICS sst '" + p +
+                                "': " + st.ToString();
+                    break;
+                }
+            }
+        }
+        auto t_ingest = clock::now();
+
+        if (error_msg.empty()) {
+            // Worker SSTs carry no global config key, so the aggregator is
+            // built directly with config hash 0 instead of via open_agg_db.
+            auto agg =
+                std::make_unique<EventAggregator>(raw_db, /*cfg_hash=*/0);
+
+            // Tables the caller preloaded are used as-is and only the
+            // missing ones are read from the meta DB; with both preloaded
+            // the meta DB on lustre is never opened. When many dask workers
+            // run scan_aggregation_manifest in parallel, loading the hash
+            // tables N times from the same file is significant lustre
+            // metadata pressure, so the coordinator can load them once.
+            using IndexDb = dftracer::utils::utilities::indexer::IndexDatabase;
+            using OpenMode = dftracer::utils::rocksdb::RocksDatabase::OpenMode;
+            std::unique_ptr<IndexDb> meta_db;
+            if (!file_preloaded || !host_preloaded) {
+                meta_db = std::make_unique<IndexDb>(meta_index_path_str,
+                                                    OpenMode::ReadOnly);
+                if (!file_preloaded)
+                    file_hashes = meta_db->query_hash_table(
+                        IndexDb::HashType::FILE);
+                if (!host_preloaded)
+                    host_hashes = meta_db->query_hash_table(
+                        IndexDb::HashType::HOST);
+            }
+            auto t_hash_tables = clock::now();
+
+            auto time_bounds = agg->query_time_bounds();
+            std::uint64_t time_origin =
+                time_bounds.valid ? time_bounds.min_time_bucket : 0;
+
+            DfanalyzerContext ctx;
+            ctx.file_hashes = &file_hashes;
+            ctx.host_hashes = &host_hashes;
+            ctx.query_filter = query_opt ? &*query_opt : nullptr;
+            ctx.time_origin = time_origin;
+            ctx.time_resolution = time_resolution;
+            ctx.time_granularity = time_granularity;
+
+            std::vector<DfanalyzerScanOutput> outputs;
+            parallel_shard_scan_range<DfanalyzerScanOutput>(
+                rt, static_cast<std::uint16_t>(shard_begin_i),
+                static_cast<std::uint16_t>(shard_end_i),
+                [&](std::uint16_t sb, std::uint16_t se) {
+                    DfanalyzerScanInput input;
+                    input.agg = agg.get();
+                    input.ctx = &ctx;
+                    input.type_filter = std::nullopt;
+                    input.batch_size = batch_size;
+                    input.shard_begin = sb;
+                    input.shard_end = se;
+                    input.group_by = group_by_ptr;
+                    return scan_dfanalyzer_shards(input);
+                },
+                outputs);
+            auto t_scan = clock::now();
+
+            for (auto& out : outputs) {
+                for (auto& r : out.events)
+                    events_results.push_back(std::move(r));
+                for (auto& r : out.profiles)
+                    profiles_results.push_back(std::move(r));
+                for (auto& r : out.system)
+                    system_results.push_back(std::move(r));
+            }
+
+            // Per-call phase timings, written unconditionally to stderr.
+            std::fprintf(
+                stderr,
+                "[scan_aggregation_manifest] n_agg=%zu n_sys=%zu "
+                "scratch_open=%lldms ingest=%lldms hash_tables=%lldms "
+                "scan=%lldms\n",
+                agg_ssts.size(), sys_ssts.size(), ms(t_start, t_scratch_open),
+                ms(t_scratch_open, t_ingest), ms(t_ingest, t_hash_tables),
+                ms(t_hash_tables, t_scan));
+            std::fflush(stderr);
+        }
+    } catch (const std::exception& e) {
+        error_msg = e.what();
+    } catch (...) {
+        error_msg = "unknown C++ exception during aggregation scan";
+    }
+    Py_END_ALLOW_THREADS
+
+    if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return nullptr;
+    }
+
+    // Results are wrapped only after the scan succeeded, so the error path
+    // above has no Python objects to release.
+    PyObject* result_dict = PyDict_New();
+    if (!result_dict) return nullptr;
+    struct Category {
+        const char* key;
+        std::vector<ArrowExportResult>* batches;
+    };
+    const Category categories[] = {{"events", &events_results},
+                                   {"profiles", &profiles_results},
+                                   {"system", &system_results}};
+    for (const Category& cat : categories) {
+        PyObject* list = PyList_New(0);
+        if (list) append_results_to_list(list, *cat.batches);
+        // A failed insert or a pending exception must surface as nullptr.
+        const bool ok = list && !PyErr_Occurred() &&
+                        PyDict_SetItemString(result_dict, cat.key, list) == 0;
+        Py_XDECREF(list);  // the dict holds its own reference on success
+        if (!ok) {
+            Py_DECREF(result_dict);
+            return nullptr;
+        }
+    }
+    return result_dict;
+}
+
+static PyMethodDef BatchIndexerModuleMethods[] = {
+    {"scan_aggregation_manifest", (PyCFunction)scan_aggregation_manifest_fn,
+     METH_VARARGS | METH_KEYWORDS,
+     "scan_aggregation_manifest(agg_ssts, sys_ssts, scratch_dir, "
+     "meta_index_path, batch_size=10000, time_granularity=1.0, "
+     "time_resolution=1e6, query=None, group_by=None, shard_begin=0, "
+     "shard_end=4096, runtime=None, file_hashes=None, host_hashes=None) "
+     "-> dict\n--\n\n"
+     "Scan a worker's slice of the distributed aggregation manifest.\n\n"
+     "Ingests agg_ssts + sys_ssts into a scratch IndexDatabase at scratch_dir "
+     "(caller owns the directory lifecycle) and runs the dfanalyzer "
+     "aggregation scan over [shard_begin, shard_end). meta_index_path is the "
+     "unified .dftindex used to resolve file / host hashes; pass file_hashes "
+     "and host_hashes (dict[str, str]) to skip opening it. Returns the same "
+     "dict shape as Indexer.iter_arrow_dfanalyzer_all."},
+    {nullptr, nullptr, 0, nullptr}};
+#endif
+
+static PyMethodDef Indexer_methods[] = {  // tp_methods table of IndexerType
+    {"get_checkpoint_indexer", (PyCFunction)Indexer_get_checkpoint_indexer,
+     METH_VARARGS,
+     "get_checkpoint_indexer(file_path)\n"
+     "--\n\n"
+     "Get a checkpoint indexer for a specific file.\n\n"
+     "Args:\n"
+     "    file_path: Path to the trace file (.pfw/.pfw.gz)\n\n"
+     "Returns:\n"
+     "    Indexer instance for checkpoint-level operations.\n"},
+    {"resolve", (PyCFunction)Indexer_resolve, METH_NOARGS,
+     "resolve()\n"
+     "--\n\n"
+     "Check what files exist vs need indexing.\n\n"
+     "Returns:\n"
+     "    dict with 'total_files', 'ready', 'needs_work', 'index_path'\n"},
+    {"build", (PyCFunction)Indexer_build, METH_NOARGS,
+     "build()\n"
+     "--\n\n"
+     "Build all missing index tiers based on require_* flags.\n"},
+    {"ensure_indexed", (PyCFunction)Indexer_ensure_indexed, METH_NOARGS,
+     "ensure_indexed()\n"
+     "--\n\n"
+     "Resolve and build if needed.\n\n"
+     "Returns:\n"
+     "    dict with index status after building.\n"},
+    {"get_hash_table", (PyCFunction)Indexer_get_hash_table, METH_VARARGS,
+     "get_hash_table(type)\n"
+     "--\n\n"
+     "Query hash table mappings.\n\n"
+     "Args:\n"
+     "    type: 'file', 'host', 'string', or 'proc'\n\n"
+     "Returns:\n"
+     "    dict mapping hash values to resolved names.\n"},
+    {"query_file_pids", (PyCFunction)Indexer_query_file_pids, METH_VARARGS,
+     "query_file_pids(file_id)\n"
+     "--\n\n"
+     "Query PIDs observed in a specific file.\n\n"
+     "Args:\n"
+     "    file_id: Integer file ID from index.\n\n"
+     "Returns:\n"
+     "    set of PIDs.\n"},
+    {"query_all_file_pids", (PyCFunction)Indexer_query_all_file_pids,
+     METH_NOARGS,
+     "query_all_file_pids()\n"
+     "--\n\n"
+     "Query PIDs for all indexed files.\n\n"
+     "Returns:\n"
+     "    dict mapping file_id to set of PIDs.\n"},
+    {"query_file_info", (PyCFunction)Indexer_query_file_info, METH_NOARGS,
+     "query_file_info()\n"
+     "--\n\n"
+     "Query file ID to path mapping and per-file PIDs in one call.\n\n"
+     "Returns:\n"
+     "    tuple of (dict[int, str], dict[int, set[int]]).\n"},
+#ifdef DFTRACER_UTILS_ENABLE_ARROW  // entries below exist in Arrow builds only
+    {"iter_aggregation", (PyCFunction)Indexer_iter_aggregation,
+     METH_VARARGS | METH_KEYWORDS,
+     "iter_aggregation(type='events', batch_size=10000)\n"
+     "--\n\n"
+     "Iterate over aggregation data as Arrow batches.\n\n"
+     "Args:\n"
+     "    type: 'events', 'profiles', or 'system'\n"
+     "    batch_size: Number of entries per batch (default 10000)\n\n"
+     "Returns:\n"
+     "    Iterator over Arrow batch capsules.\n"},
+    {"iter_arrow_dfanalyzer", (PyCFunction)Indexer_iter_arrow_dfanalyzer,
+     METH_VARARGS | METH_KEYWORDS,
+     "iter_arrow_dfanalyzer(type='events', batch_size=10000, "
+     "time_granularity=1.0, time_resolution=1e6, query=None)\n"
+     "--\n\n"
+     "Iterate over aggregation data as dfanalyzer-compatible Arrow batches.\n\n"
+     "Output schema matches dfanalyzer expectations with resolved hashes,\n"
+     "normalized time_range, and computed columns (proc_name, io_cat).\n\n"
+     "Args:\n"
+     "    type: 'events', 'profiles', or 'system'\n"
+     "    batch_size: Number of entries per batch (default 10000)\n"
+     "    time_granularity: Bucket width in seconds (default 1.0)\n"
+     "    time_resolution: Microseconds per output time unit (default 1e6)\n"
+     "    query: Optional query filter string (e.g., \"pid == 1234\")\n\n"
+     "Returns:\n"
+     "    Iterator over Arrow batch capsules.\n"},
+    {"iter_arrow_dfanalyzer_all",
+     (PyCFunction)Indexer_iter_arrow_dfanalyzer_all,
+     METH_VARARGS | METH_KEYWORDS,
+     "iter_arrow_dfanalyzer_all(batch_size=10000, time_granularity=1.0, "
+     "time_resolution=1e6, query=None, group_by=None)\n"
+     "--\n\n"
+     "Iterate over all aggregation types in a single scan.\n\n"
+     "Returns a dict with 'events', 'profiles', 'system' keys, each "
+     "containing\n"
+     "a list of Arrow batch capsules. This is ~3x faster than calling\n"
+     "iter_arrow_dfanalyzer separately for each type.\n\n"
+     "When group_by is provided, the scan collapses dimensions during "
+     "aggregation\n"
+     "and emits a reduced schema containing only the requested columns plus\n"
+     "aggregated metrics (count, time, size, time_sq, size_sq, time_min,\n"
+     "time_max, size_min, size_max, time_call_min, time_call_max, "
+     "size_call_min,\n"
+     "size_call_max, time_start, time_end). Supported group_by columns: "
+     "cat,\n"
+     "func_name, pid, tid, file_hash, host_hash, file_name, host_name, "
+     "proc_name,\n"
+     "io_cat, acc_pat, time_range.\n\n"
+     "Args:\n"
+     "    batch_size: Number of entries per batch (default 10000)\n"
+     "    time_granularity: Bucket width in seconds (default 1.0)\n"
+     "    time_resolution: Microseconds per output time unit (default 1e6)\n"
+     "    query: Optional query filter string\n"
+     "    group_by: Optional list of columns to group by; enables coarse\n"
+     "        in-scan aggregation (default None = full granularity)\n\n"
+     "Returns:\n"
+     "    dict with 'events', 'profiles', 'system' lists of Arrow capsules.\n"},
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
+    {nullptr}};  // sentinel entry terminating the table
+
+static PyGetSetDef Indexer_getsetters[] = {{nullptr}};  // sentinel only
+
+PyTypeObject IndexerType = {
+    PyVarObject_HEAD_INIT(nullptr, 0) "dftracer_utils_ext.Indexer",  // tp_name
+    sizeof(IndexerObject),        // tp_basicsize
+    0,                            // tp_itemsize
+    (destructor)Indexer_dealloc,  // tp_dealloc
+    0,                            // tp_vectorcall_offset
+    0,                            // tp_getattr
+    0,                            // tp_setattr
+    0,                            // tp_as_async
+    0,                            // tp_repr
+    0,                            // tp_as_number
+    0,                            // tp_as_sequence
+    0,                            // tp_as_mapping
+    0,                            // tp_hash
+    0,                            // tp_call
+    0,                            // tp_str
+    0,                            // tp_getattro
+    0,                            // tp_setattro
+    0,                            // tp_as_buffer
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,  // tp_flags
+    // tp_doc; the "Indexer(" prefix must match tp_name for __text_signature__
+    "Indexer(directory='', files=None, index_dir='',\n"
+    "        require_checkpoint=True, require_bloom=True,\n"
+    "        require_manifest=True, require_aggregation=False,\n"
+    "        time_interval_ms=5000.0, group_keys=None,\n"
+    "        custom_metric_fields=None, compute_percentiles=False,\n"
+    "        parallelism=0, force_rebuild=False, runtime=None)\n"
+    "--\n\n"
+    "Indexer with tiered index building.\n\n"
+    "At least one of 'directory' or 'files' must be provided.\n"
+    "- directory: scan for .pfw/.pfw.gz files\n"
+    "- files: list of specific file paths\n\n"
+    "Supports:\n- Tier 1: Checkpoints (require_checkpoint)\n"
+    "- Tier 2: Bloom filters (require_bloom), Manifests (require_manifest)\n"
+    "- Tier 3: Aggregation (require_aggregation + config params)\n",
+    0,                            // tp_traverse
+    0,                            // tp_clear
+    0,                            // tp_richcompare
+    0,                            // tp_weaklistoffset
+    0,                            // tp_iter
+    0,                            // tp_iternext
+    Indexer_methods,              // tp_methods
+    0,                            // tp_members
+    Indexer_getsetters,           // tp_getset
+    0,                            // tp_base
+    0,                            // tp_dict
+    0,                            // tp_descr_get
+    0,                            // tp_descr_set
+    0,                            // tp_dictoffset
+    (initproc)Indexer_init,       // tp_init
+    0,                            // tp_alloc
+    Indexer_new,                  // tp_new
+};
+
+// Registers the Indexer type (plus, in Arrow builds, the module-level
+// function table) on module `m`; returns 0 on success and -1 on failure.
+int init_indexer(PyObject* m) {
+    PyObject* const type_obj = reinterpret_cast<PyObject*>(&IndexerType);
+    if (PyType_Ready(&IndexerType) < 0) return -1;
+    Py_INCREF(type_obj);  // given up to the module only if the add succeeds
+    if (PyModule_AddObject(m, "Indexer", type_obj) < 0) {
+        Py_DECREF(type_obj);
+        return -1;
+    }
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    if (PyModule_AddFunctions(m, BatchIndexerModuleMethods) < 0) return -1;
+#endif
+    return 0;
+}
diff --git a/src/dftracer/utils/python/batch_indexer.h b/src/dftracer/utils/python/batch_indexer.h
new file mode 100644
index 00000000..d7dd9aa6
--- /dev/null
+++ b/src/dftracer/utils/python/batch_indexer.h
@@ -0,0 +1,38 @@
+#ifndef DFTRACER_UTILS_PYTHON_BATCH_INDEXER_H
+#define DFTRACER_UTILS_PYTHON_BATCH_INDEXER_H
+
+#include <Python.h>
+
+#include <cstddef>
+#include <cstdint>
+
+struct IndexerObject {  // instance struct; holds Python refs and settings
+    PyObject_HEAD
+
+        PyObject* runtime_obj;  // NOTE(review): presumably a Runtime; confirm
+    PyObject* directory;
+    PyObject* files;  // Python list of file paths or None
+    PyObject* index_dir;
+
+    // Tier requirements, stored as int flags
+    int require_checkpoint;
+    int require_bloom;
+    int require_manifest;
+    int require_aggregation;
+
+    // Aggregation config (stored for rebuild)
+    double time_interval_ms;
+    PyObject* group_keys;            // Python list or None
+    PyObject* custom_metric_fields;  // Python list or None
+    int compute_percentiles;
+
+    std::size_t checkpoint_size;  // NOTE(review): unit not shown here; confirm
+    std::size_t parallelism;
+    int force_rebuild;
+};
+
+extern PyTypeObject IndexerType;
+
+int init_indexer(PyObject* m);
+
+#endif  // DFTRACER_UTILS_PYTHON_BATCH_INDEXER_H
diff --git a/src/dftracer/utils/python/dftracer_utils_ext.cpp b/src/dftracer/utils/python/dftracer_utils_ext.cpp
index 77d7b528..9ae169a8 100644
--- a/src/dftracer/utils/python/dftracer_utils_ext.cpp
+++ b/src/dftracer/utils/python/dftracer_utils_ext.cpp
@@ -1,9 +1,14 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/python/batch_indexer.h>
+#include <dftracer/utils/python/index_database.h>
 #include <dftracer/utils/python/indexer.h>
 #include <dftracer/utils/python/indexer_checkpoint.h>
 #include <dftracer/utils/python/json.h>
+#include <dftracer/utils/python/memoryview_batch.h>
 #include <dftracer/utils/python/runtime.h>
+#include <dftracer/utils/python/sst_distribution.h>
 #include <dftracer/utils/python/task_handle.h>
 #include <dftracer/utils/python/trace_reader.h>
 #include <dftracer/utils/python/trace_reader_iterator.h>
@@ -14,11 +19,18 @@
 #include <dftracer/utils/python/utilities/reorganization_planner.h>
 #include <dftracer/utils/python/utilities/statistics_aggregator.h>
 #include <dftracer/utils/python/utilities/statistics_query.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/python/arrow_stream_capsule.h>
+#include <dftracer/utils/python/streaming_iterator.h>
+#endif
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+#include <dftracer/utils/python/arrow_parallel_reader.h>
+#endif
 
 static PyModuleDef dftracer_utils_module = {
     PyModuleDef_HEAD_INIT,
     "dftracer_utils_ext",   /* m_name */
-    "DFTracer utils module with indexer, reader, lazy JSON, "
+    "DFTracer utils module with indexer, reader, "
     "and utility bindings", /* m_doc */
     -1,                     /* m_size */
     NULL,                   /* m_methods */
@@ -33,11 +45,21 @@ PyMODINIT_FUNC PyInit_dftracer_utils_ext(void) {
     m = PyModule_Create(&dftracer_utils_module);
     if (m == NULL) return NULL;
     if (init_indexer_checkpoint(m) < 0) return NULL;
-    if (init_json(m) < 0) return NULL;
+    if (init_checkpoint_indexer(m) < 0) return NULL;
     if (init_indexer(m) < 0) return NULL;
     if (init_task_handle(m) < 0) return NULL;
     if (init_runtime(m) < 0) return NULL;
+    if (dftracer::utils::python::init_memoryview_batch(m) < 0) return NULL;
+    if (init_json_dict_value(m) < 0) return NULL;
     if (init_trace_reader_iterator(m) < 0) return NULL;
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    if (dftracer::utils::python::init_arrow_streaming_iterator(m) < 0)
+        return NULL;
+    if (init_arrow_batch_stream(m) < 0) return NULL;
+#endif
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+    if (dftracer::utils::python::init_arrow_parallel_reader(m) < 0) return NULL;
+#endif
     if (init_trace_reader(m) < 0) return NULL;
     if (init_statistics_query(m) < 0) return NULL;
     if (init_statistics_aggregator(m) < 0) return NULL;
@@ -46,5 +68,7 @@ PyMODINIT_FUNC PyInit_dftracer_utils_ext(void) {
     if (init_reconstruction_planner(m) < 0) return NULL;
     if (init_aggregator(m) < 0) return NULL;
     if (init_comparator(m) < 0) return NULL;
+    if (init_index_database(m) < 0) return NULL;
+    if (init_sst_distribution(m) < 0) return NULL;
     return m;
 }
diff --git a/src/dftracer/utils/python/index_database.cpp b/src/dftracer/utils/python/index_database.cpp
new file mode 100644
index 00000000..9f29b18a
--- /dev/null
+++ b/src/dftracer/utils/python/index_database.cpp
@@ -0,0 +1,363 @@
+#include <dftracer/utils/python/index_database.h>
+#include <dftracer/utils/python/sst_distribution.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+
+#include <new>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+using dftracer::utils::utilities::indexer::IndexDatabase;
+using dftracer::utils::utilities::indexer::SstArtifactRegistry;
+
+// tp_dealloc: destroys the C++ member by hand (it was created with placement
+// new in IndexDatabase_new) and then returns the storage to the type's
+// allocator.
+static void IndexDatabase_dealloc(IndexDatabaseObject *self) {
+    using DbHandle = std::shared_ptr<IndexDatabase>;
+    self->db.~DbHandle();
+    PyTypeObject *type = Py_TYPE(self);
+    type->tp_free(reinterpret_cast<PyObject *>(self));
+}
+
+// tp_new: allocates the Python object and constructs an empty shared_ptr in
+// place. The allocator only provides storage, so the C++ member has to be
+// constructed explicitly before anything assigns to it.
+static PyObject *IndexDatabase_new(PyTypeObject *type, PyObject * /*args*/,
+                                   PyObject * /*kwds*/) {
+    PyObject *raw = type->tp_alloc(type, 0);
+    if (raw == NULL) return NULL;
+    auto *obj = reinterpret_cast<IndexDatabaseObject *>(raw);
+    new (&obj->db) std::shared_ptr<IndexDatabase>();
+    return raw;
+}
+
+// tp_init: IndexDatabase(index_path)
+//
+// Opens the native IndexDatabase for `index_path` and stores it in `self->db`.
+// Returns 0 on success; returns -1 with a Python error set when argument
+// parsing fails or the native constructor throws std::exception
+// (reported as RuntimeError).
+static int IndexDatabase_init(IndexDatabaseObject *self, PyObject *args,
+                              PyObject *kwds) {
+    static const char *keywords[] = {"index_path", NULL};
+    const char *index_path;
+    const int parsed = PyArg_ParseTupleAndKeywords(
+        args, kwds, "s", (char **)keywords, &index_path);
+    if (!parsed) return -1;
+
+    int status = 0;
+    try {
+        self->db = std::make_shared<IndexDatabase>(index_path);
+    } catch (const std::exception &e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        status = -1;
+    }
+    return status;
+}
+
+// init_schema() -> None
+//
+// Calls IndexDatabase::init_schema() with the GIL released. Raises
+// RuntimeError if the handle was never initialised or the native call throws
+// std::exception.
+static PyObject *IndexDatabase_init_schema(IndexDatabaseObject *self,
+                                           PyObject * /*ignored*/) {
+    if (!self->db) {
+        PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised");
+        return NULL;
+    }
+    // Catch while the GIL is still released: an exception escaping the
+    // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the handler
+    // would then call into the interpreter without holding the GIL.
+    bool failed = false;
+    std::string error;
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        self->db->init_schema();
+    } catch (const std::exception &e) {
+        failed = true;
+        error = e.what();
+    }
+    Py_END_ALLOW_THREADS
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error.c_str());
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+// register_files(paths, build_manifest=False) -> list[int]
+//
+// Copies `paths` (a sequence of str) into native strings, calls
+// IndexDatabase::register_files() with the GIL released and returns the
+// resulting ids as a new Python list. Raises RuntimeError if the handle was
+// never initialised or the native call throws std::exception.
+static PyObject *IndexDatabase_register_files(IndexDatabaseObject *self,
+                                              PyObject *args, PyObject *kwds) {
+    static const char *kwlist[] = {"paths", "build_manifest", NULL};
+    PyObject *paths_obj;
+    int build_manifest = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|p", (char **)kwlist,
+                                     &paths_obj, &build_manifest)) {
+        return NULL;
+    }
+    if (!self->db) {
+        PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised");
+        return NULL;
+    }
+
+    PyObject *seq = PySequence_Fast(paths_obj, "paths must be a sequence");
+    if (!seq) return NULL;
+    const Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+    std::vector<std::string> paths;
+    paths.reserve(static_cast<std::size_t>(n));
+    for (Py_ssize_t i = 0; i < n; ++i) {
+        // Borrowed item; copy the UTF-8 text before `seq` is released.
+        PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
+        const char *s = PyUnicode_AsUTF8(item);
+        if (!s) {
+            Py_DECREF(seq);
+            return NULL;
+        }
+        paths.emplace_back(s);
+    }
+    Py_DECREF(seq);
+
+    // Catch while the GIL is still released: an exception escaping the
+    // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the handler
+    // would then call into the interpreter without holding the GIL.
+    std::vector<int> ids;
+    bool failed = false;
+    std::string error;
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        ids = self->db->register_files(paths, build_manifest != 0);
+    } catch (const std::exception &e) {
+        failed = true;
+        error = e.what();
+    }
+    Py_END_ALLOW_THREADS
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error.c_str());
+        return NULL;
+    }
+
+    const Py_ssize_t count = static_cast<Py_ssize_t>(ids.size());
+    PyObject *out = PyList_New(count);
+    if (!out) return NULL;
+    for (Py_ssize_t i = 0; i < count; ++i) {
+        PyObject *id_obj = PyLong_FromLong(ids[static_cast<std::size_t>(i)]);
+        if (!id_obj) {
+            // Never hand a list with NULL slots back to Python.
+            Py_DECREF(out);
+            return NULL;
+        }
+        PyList_SET_ITEM(out, i, id_obj);  // steals the reference
+    }
+    return out;
+}
+
+// reserve_file_id_range(count) -> int
+//
+// Calls IndexDatabase::reserve_file_id_range(count) with the GIL released and
+// returns its result. Raises ValueError for a negative count and RuntimeError
+// if the handle was never initialised or the native call throws
+// std::exception.
+static PyObject *IndexDatabase_reserve_file_id_range(IndexDatabaseObject *self,
+                                                     PyObject *args) {
+    Py_ssize_t count;
+    if (!PyArg_ParseTuple(args, "n", &count)) return NULL;
+    if (count < 0) {
+        PyErr_SetString(PyExc_ValueError, "count must be >= 0");
+        return NULL;
+    }
+    if (!self->db) {
+        PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised");
+        return NULL;
+    }
+    // Catch while the GIL is still released: an exception escaping the
+    // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the handler
+    // would then call into the interpreter without holding the GIL.
+    int first = 0;
+    bool failed = false;
+    std::string error;
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        first =
+            self->db->reserve_file_id_range(static_cast<std::size_t>(count));
+    } catch (const std::exception &e) {
+        failed = true;
+        error = e.what();
+    }
+    Py_END_ALLOW_THREADS
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error.c_str());
+        return NULL;
+    }
+    return PyLong_FromLong(first);
+}
+
+// bulk_ingest(registry, skip_cfs=None) -> None
+//
+// Resolves `registry` to the native SstArtifactRegistry, copies the optional
+// `skip_cfs` sequence of str into a set, and calls
+// IndexDatabase::bulk_ingest() with the GIL released. Raises TypeError when
+// `registry` cannot be resolved and RuntimeError if the handle was never
+// initialised or the native call throws std::exception.
+static PyObject *IndexDatabase_bulk_ingest(IndexDatabaseObject *self,
+                                           PyObject *args, PyObject *kwds) {
+    static const char *kwlist[] = {"registry", "skip_cfs", NULL};
+    PyObject *registry_obj;
+    PyObject *skip_cfs_obj = NULL;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", (char **)kwlist,
+                                     &registry_obj, &skip_cfs_obj)) {
+        return NULL;
+    }
+    if (!self->db) {
+        PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised");
+        return NULL;
+    }
+
+    SstArtifactRegistry *registry = sst_artifact_registry_get(registry_obj);
+    if (!registry) {
+        PyErr_SetString(PyExc_TypeError,
+                        "expected an SstArtifactRegistry instance");
+        return NULL;
+    }
+
+    std::unordered_set<std::string> skip_cfs;
+    if (skip_cfs_obj && skip_cfs_obj != Py_None) {
+        PyObject *seq =
+            PySequence_Fast(skip_cfs_obj, "skip_cfs must be an iterable");
+        if (!seq) return NULL;
+        const Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+        for (Py_ssize_t i = 0; i < n; ++i) {
+            // Borrowed item; copy the UTF-8 text before `seq` is released.
+            PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
+            const char *s = PyUnicode_AsUTF8(item);
+            if (!s) {
+                Py_DECREF(seq);
+                return NULL;
+            }
+            skip_cfs.emplace(s);
+        }
+        Py_DECREF(seq);
+    }
+
+    // Catch while the GIL is still released: an exception escaping the
+    // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the handler
+    // would then call into the interpreter without holding the GIL.
+    bool failed = false;
+    std::string error;
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        self->db->bulk_ingest(*registry, skip_cfs);
+    } catch (const std::exception &e) {
+        failed = true;
+        error = e.what();
+    }
+    Py_END_ALLOW_THREADS
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error.c_str());
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+// write_agg_file_markers(file_ids) -> None
+//
+// Converts `file_ids` (a sequence of int) to native ints and calls
+// IndexDatabase::write_agg_file_markers() with the GIL released. Raises
+// OverflowError for an id that does not fit in a C int, and RuntimeError if
+// the handle was never initialised or the native call throws std::exception.
+static PyObject *IndexDatabase_write_agg_file_markers(IndexDatabaseObject *self,
+                                                      PyObject *args) {
+    PyObject *ids_obj;
+    if (!PyArg_ParseTuple(args, "O", &ids_obj)) return NULL;
+    if (!self->db) {
+        PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised");
+        return NULL;
+    }
+
+    PyObject *seq = PySequence_Fast(ids_obj, "file_ids must be an iterable");
+    if (!seq) return NULL;
+    const Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+    std::vector<int> file_ids;
+    file_ids.reserve(static_cast<std::size_t>(n));
+    for (Py_ssize_t i = 0; i < n; ++i) {
+        PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
+        const long v = PyLong_AsLong(item);
+        if (v == -1 && PyErr_Occurred()) {
+            Py_DECREF(seq);
+            return NULL;
+        }
+        // Reject values that would be truncated by the narrowing to int;
+        // a silently wrapped id would mark the wrong file as aggregated.
+        const int id = static_cast<int>(v);
+        if (static_cast<long>(id) != v) {
+            Py_DECREF(seq);
+            PyErr_SetString(PyExc_OverflowError,
+                            "file_id does not fit in a C int");
+            return NULL;
+        }
+        file_ids.push_back(id);
+    }
+    Py_DECREF(seq);
+
+    // Catch while the GIL is still released: an exception escaping the
+    // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the handler
+    // would then call into the interpreter without holding the GIL.
+    bool failed = false;
+    std::string error;
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        self->db->write_agg_file_markers(file_ids);
+    } catch (const std::exception &e) {
+        failed = true;
+        error = e.what();
+    }
+    Py_END_ALLOW_THREADS
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error.c_str());
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+// write_agg_global_config(time_interval_us, config_hash=0) -> None
+//
+// Forwards both values to IndexDatabase::write_agg_global_config() with the
+// GIL released. Raises RuntimeError if the handle was never initialised or
+// the native call throws std::exception.
+static PyObject *IndexDatabase_write_agg_global_config(
+    IndexDatabaseObject *self, PyObject *args, PyObject *kwds) {
+    static const char *kwlist[] = {"time_interval_us", "config_hash", NULL};
+    unsigned long long time_interval_us = 0;
+    unsigned int config_hash = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "K|I", (char **)kwlist,
+                                     &time_interval_us, &config_hash)) {
+        return NULL;
+    }
+    if (!self->db) {
+        PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised");
+        return NULL;
+    }
+    // Catch while the GIL is still released: an exception escaping the
+    // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the handler
+    // would then call into the interpreter without holding the GIL.
+    bool failed = false;
+    std::string error;
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        self->db->write_agg_global_config(
+            static_cast<std::uint64_t>(time_interval_us),
+            static_cast<std::uint32_t>(config_hash));
+    } catch (const std::exception &e) {
+        failed = true;
+        error = e.what();
+    }
+    Py_END_ALLOW_THREADS
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error.c_str());
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+// write_aggregation_tracker(blobs) -> None
+//
+// Copies every non-empty bytes entry of `blobs` into a native string (None
+// and empty entries are skipped) and calls
+// IndexDatabase::write_aggregation_tracker() with the GIL released. Raises
+// TypeError for entries that are neither bytes nor None, and RuntimeError if
+// the handle was never initialised or the native call throws std::exception.
+static PyObject *IndexDatabase_write_aggregation_tracker(
+    IndexDatabaseObject *self, PyObject *args) {
+    PyObject *blobs_obj;
+    if (!PyArg_ParseTuple(args, "O", &blobs_obj)) return NULL;
+    if (!self->db) {
+        PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised");
+        return NULL;
+    }
+    PyObject *seq = PySequence_Fast(blobs_obj, "blobs must be an iterable");
+    if (!seq) return NULL;
+    const Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+    std::vector<std::string> blobs;
+    blobs.reserve(static_cast<std::size_t>(n));
+    for (Py_ssize_t i = 0; i < n; ++i) {
+        PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
+        if (item == Py_None) continue;
+        if (!PyBytes_Check(item)) {
+            Py_DECREF(seq);
+            PyErr_SetString(PyExc_TypeError,
+                            "blobs entries must be bytes or None");
+            return NULL;
+        }
+        char *buf = nullptr;
+        Py_ssize_t len = 0;
+        if (PyBytes_AsStringAndSize(item, &buf, &len) < 0) {
+            Py_DECREF(seq);
+            return NULL;
+        }
+        // Copy with an explicit length: blobs are binary and may hold NULs.
+        if (len > 0) blobs.emplace_back(buf, static_cast<std::size_t>(len));
+    }
+    Py_DECREF(seq);
+
+    // Catch while the GIL is still released: an exception escaping the
+    // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the handler
+    // would then call into the interpreter without holding the GIL.
+    bool failed = false;
+    std::string error;
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        self->db->write_aggregation_tracker(blobs);
+    } catch (const std::exception &e) {
+        failed = true;
+        error = e.what();
+    }
+    Py_END_ALLOW_THREADS
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error.c_str());
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+// rebuild_root_summaries() -> None
+//
+// Calls IndexDatabase::rebuild_root_summaries() with the GIL released. Raises
+// RuntimeError if the handle was never initialised or the native call throws
+// std::exception.
+static PyObject *IndexDatabase_rebuild_root_summaries(IndexDatabaseObject *self,
+                                                      PyObject * /*ignored*/) {
+    if (!self->db) {
+        PyErr_SetString(PyExc_RuntimeError, "IndexDatabase not initialised");
+        return NULL;
+    }
+    // Catch while the GIL is still released: an exception escaping the
+    // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the handler
+    // would then call into the interpreter without holding the GIL.
+    bool failed = false;
+    std::string error;
+    Py_BEGIN_ALLOW_THREADS
+    try {
+        self->db->rebuild_root_summaries();
+    } catch (const std::exception &e) {
+        failed = true;
+        error = e.what();
+    }
+    Py_END_ALLOW_THREADS
+    if (failed) {
+        PyErr_SetString(PyExc_RuntimeError, error.c_str());
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+// Python method table for IndexDatabaseType. Entries that accept keyword
+// arguments are registered with METH_VARARGS | METH_KEYWORDS; the table is
+// terminated by the {NULL} sentinel.
+static PyMethodDef IndexDatabase_methods[] = {
+    {"init_schema", (PyCFunction)IndexDatabase_init_schema, METH_NOARGS,
+     "Idempotently initialise the schema version key."},
+    {"register_files", (PyCFunction)IndexDatabase_register_files,
+     METH_VARARGS | METH_KEYWORDS,
+     "register_files(paths, build_manifest=False) -> list[int]\n"
+     "Register each path in the DEFAULT-CF file registry and return the "
+     "assigned file_ids. Idempotent for files with matching hash."},
+    {"reserve_file_id_range", (PyCFunction)IndexDatabase_reserve_file_id_range,
+     METH_VARARGS,
+     "reserve_file_id_range(count) -> int\n"
+     "Atomically reserve `count` contiguous file_ids, return the first."},
+    {"bulk_ingest", (PyCFunction)IndexDatabase_bulk_ingest,
+     METH_VARARGS | METH_KEYWORDS,
+     "bulk_ingest(registry, skip_cfs=None) -> None\n"
+     "Ingest all SSTs collected in the SstArtifactRegistry.\n"
+     "skip_cfs is an optional iterable of CF names whose SSTs are left "
+     "outside the unified DB (used by distributed builds to keep "
+     "AGGREGATION/SYSTEM_METRICS SSTs addressable by manifest)."},
+    {"rebuild_root_summaries",
+     (PyCFunction)IndexDatabase_rebuild_root_summaries, METH_NOARGS,
+     "Recompute ROOT_* summary column families from per-file CFs."},
+    {"write_agg_global_config",
+     (PyCFunction)IndexDatabase_write_agg_global_config,
+     METH_VARARGS | METH_KEYWORDS,
+     "write_agg_global_config(time_interval_us, config_hash=0) -> None\n"
+     "Write the AGG_GLOBAL_CONFIG_KEY marker into the AGGREGATION CF. "
+     "Required for `iter_arrow_dfanalyzer_all` on distributed builds "
+     "(which never materialise the key via worker SSTs) or "
+     "post-consolidate indices."},
+    {"write_agg_file_markers",
+     (PyCFunction)IndexDatabase_write_agg_file_markers, METH_VARARGS,
+     "write_agg_file_markers(file_ids) -> None\n"
+     "Write per-file aggregation completion markers (\\xFF\\xFF + file_id) "
+     "into the AGGREGATION CF. Required after distributed_index otherwise "
+     "`ensure_indexed()` concludes aggregation is incomplete and re-runs "
+     "the entire build."},
+    {"write_aggregation_tracker",
+     (PyCFunction)IndexDatabase_write_aggregation_tracker, METH_VARARGS,
+     "write_aggregation_tracker(blobs) -> None\n"
+     "Merge a list of serialized AssociationTracker bytes and write the "
+     "result to the AGGREGATION CF under the `__tracker__` key."},
+    {NULL}};
+
+// Type object for dftracer_utils_ext.IndexDatabase. The initialiser is
+// positional, so every slot up to tp_new is listed in declaration order.
+PyTypeObject IndexDatabaseType = {
+    PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext.IndexDatabase", /* tp_name */
+    sizeof(IndexDatabaseObject),              /* tp_basicsize */
+    0,                                        /* tp_itemsize */
+    (destructor)IndexDatabase_dealloc,        /* tp_dealloc */
+    0,                                        /* tp_vectorcall_offset */
+    0,                                        /* tp_getattr */
+    0,                                        /* tp_setattr */
+    0,                                        /* tp_as_async */
+    0,                                        /* tp_repr */
+    0,                                        /* tp_as_number */
+    0,                                        /* tp_as_sequence */
+    0,                                        /* tp_as_mapping */
+    0,                                        /* tp_hash */
+    0,                                        /* tp_call */
+    0,                                        /* tp_str */
+    0,                                        /* tp_getattro */
+    0,                                        /* tp_setattro */
+    0,                                        /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                       /* tp_flags */
+    "Handle to a .dftindex RocksDB store.",   /* tp_doc */
+    0,                                        /* tp_traverse */
+    0,                                        /* tp_clear */
+    0,                                        /* tp_richcompare */
+    0,                                        /* tp_weaklistoffset */
+    0,                                        /* tp_iter */
+    0,                                        /* tp_iternext */
+    IndexDatabase_methods,                    /* tp_methods */
+    0,                                        /* tp_members */
+    0,                                        /* tp_getset */
+    0,                                        /* tp_base */
+    0,                                        /* tp_dict */
+    0,                                        /* tp_descr_get */
+    0,                                        /* tp_descr_set */
+    0,                                        /* tp_dictoffset */
+    (initproc)IndexDatabase_init,             /* tp_init */
+    0,                                        /* tp_alloc */
+    IndexDatabase_new,                        /* tp_new */
+};
+
+// Finalises IndexDatabaseType and publishes it on module `m` as
+// "IndexDatabase". Returns 0 on success and -1 on failure.
+int init_index_database(PyObject *m) {
+    if (PyType_Ready(&IndexDatabaseType) < 0) return -1;
+
+    PyObject *type_obj = (PyObject *)&IndexDatabaseType;
+    // PyModule_AddObject takes over this reference only when it succeeds,
+    // so it is given back on the failure path.
+    Py_INCREF(type_obj);
+    const int rc = PyModule_AddObject(m, "IndexDatabase", type_obj);
+    if (rc >= 0) return 0;
+
+    Py_DECREF(type_obj);
+    return -1;
+}
diff --git a/src/dftracer/utils/python/index_database.h b/src/dftracer/utils/python/index_database.h
new file mode 100644
index 00000000..66c408fb
--- /dev/null
+++ b/src/dftracer/utils/python/index_database.h
@@ -0,0 +1,23 @@
+#ifndef DFTRACER_UTILS_PYTHON_INDEX_DATABASE_H
+#define DFTRACER_UTILS_PYTHON_INDEX_DATABASE_H
+
+#include <Python.h>
+
+#include <memory>
+
+namespace dftracer::utils::utilities::indexer {
+class IndexDatabase;
+class SstArtifactRegistry;
+}  // namespace dftracer::utils::utilities::indexer
+
+// Instance layout of the Python type dftracer_utils_ext.IndexDatabase.
+//
+// `db` is a C++ object living inside storage obtained from tp_alloc, so it is
+// never constructed or destroyed implicitly: index_database.cpp constructs it
+// with placement new in tp_new and runs its destructor explicitly in
+// tp_dealloc. It stays empty until tp_init assigns the opened database.
+typedef struct {
+    PyObject_HEAD
+        std::shared_ptr<dftracer::utils::utilities::indexer::IndexDatabase>
+            db;
+} IndexDatabaseObject;
+
+extern PyTypeObject IndexDatabaseType;
+
+int init_index_database(PyObject *m);
+
+#endif  // DFTRACER_UTILS_PYTHON_INDEX_DATABASE_H
diff --git a/src/dftracer/utils/python/indexer.cpp b/src/dftracer/utils/python/indexer.cpp
index 071a6986..80b95390 100644
--- a/src/dftracer/utils/python/indexer.cpp
+++ b/src/dftracer/utils/python/indexer.cpp
@@ -1,4 +1,5 @@
 #include <dftracer/utils/core/runtime.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/python/indexer.h>
 #include <dftracer/utils/python/indexer_checkpoint.h>
 #include <dftracer/utils/python/runtime.h>
@@ -9,8 +10,9 @@
 #include <structmember.h>
 
 #include <cstring>
+#include <memory>
 
-static void Indexer_dealloc(IndexerObject *self) {
+static void CheckpointIndexer_dealloc(CheckpointIndexerObject *self) {
     if (self->handle) {
         // The Python wrapper owns only the native indexer handle. The
         // underlying RocksDB instance remains manager-owned and may continue to
@@ -24,7 +26,7 @@ static void Indexer_dealloc(IndexerObject *self) {
     Py_TYPE(self)->tp_free((PyObject *)self);
 }
 
-static void Indexer_release_handle(IndexerObject *self) {
+static void CheckpointIndexer_release_handle(CheckpointIndexerObject *self) {
     if (self->handle) {
         // Releasing the handle drops this wrapper's native indexer state only.
         // Shared RocksDB lifetime is managed separately by RocksDBManager.
@@ -33,10 +35,10 @@ static void Indexer_release_handle(IndexerObject *self) {
     }
 }
 
-static PyObject *Indexer_new(PyTypeObject *type, PyObject *args,
-                             PyObject *kwds) {
-    IndexerObject *self;
-    self = (IndexerObject *)type->tp_alloc(type, 0);
+static PyObject *CheckpointIndexer_new(PyTypeObject *type, PyObject *args,
+                                       PyObject *kwds) {
+    CheckpointIndexerObject *self;
+    self = (CheckpointIndexerObject *)type->tp_alloc(type, 0);
     if (self != NULL) {
         self->handle = NULL;
         self->gz_path = NULL;
@@ -44,18 +46,16 @@ static PyObject *Indexer_new(PyTypeObject *type, PyObject *args,
         self->checkpoint_size = 0;
         self->build_bloom = 0;
         self->build_manifest = 0;
-        self->index_threshold =
-            dftracer::utils::constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;
         self->runtime_obj = NULL;
     }
     return (PyObject *)self;
 }
 
-static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) {
+static int CheckpointIndexer_init(CheckpointIndexerObject *self, PyObject *args,
+                                  PyObject *kwds) {
     static const char *kwlist[] = {
-        "gz_path",         "index_path",  "checkpoint_size",
-        "force_rebuild",   "build_bloom", "build_manifest",
-        "index_threshold", "runtime",     NULL};
+        "gz_path",     "index_path",     "checkpoint_size", "force_rebuild",
+        "build_bloom", "build_manifest", "runtime",         NULL};
     const char *gz_path;
     const char *index_path = NULL;
     std::uint64_t checkpoint_size =
@@ -63,14 +63,12 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) {
     int force_rebuild = 0;
     int build_bloom = 0;
     int build_manifest = 0;
-    std::uint64_t index_threshold =
-        dftracer::utils::constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;
     PyObject *runtime_arg = NULL;
 
-    if (!PyArg_ParseTupleAndKeywords(
-            args, kwds, "s|snpppnO", (char **)kwlist, &gz_path, &index_path,
-            &checkpoint_size, &force_rebuild, &build_bloom, &build_manifest,
-            &index_threshold, &runtime_arg)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|snpppO", (char **)kwlist,
+                                     &gz_path, &index_path, &checkpoint_size,
+                                     &force_rebuild, &build_bloom,
+                                     &build_manifest, &runtime_arg)) {
         return -1;
     }
 
@@ -112,7 +110,6 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) {
     self->checkpoint_size = checkpoint_size;
     self->build_bloom = build_bloom;
     self->build_manifest = build_manifest;
-    self->index_threshold = index_threshold;
 
     const char *index_path_str = PyUnicode_AsUTF8(self->index_path);
     if (!index_path_str) {
@@ -129,72 +126,97 @@ static int Indexer_init(IndexerObject *self, PyObject *args, PyObject *kwds) {
     return 0;
 }
 
-static dftracer::utils::Runtime *get_indexer_runtime(IndexerObject *self) {
+static dftracer::utils::Runtime *get_indexer_runtime(
+    CheckpointIndexerObject *self) {
     if (self->runtime_obj) {
         return ((RuntimeObject *)self->runtime_obj)->runtime.get();
     }
     return get_default_runtime();
 }
 
-static PyObject *Indexer_build(IndexerObject *self,
-                               PyObject *Py_UNUSED(ignored)) {
+// build() -> None
+//
+// Builds the index for this wrapper's gzip file. When bloom or manifest
+// output was requested the work is submitted to the runtime as a
+// single-file IndexBatchBuilderUtility job; otherwise dft_indexer_build() is
+// called on the native handle. Both paths run with the GIL released and
+// report failure as RuntimeError.
+static PyObject *CheckpointIndexer_build(CheckpointIndexerObject *self,
+                                         PyObject *Py_UNUSED(ignored)) {
     if (!self->handle) {
         PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized");
         return NULL;
     }
 
-    using namespace dftracer::utils;
-    using namespace dftracer::utils::utilities::indexer;
+    // Use IndexBatchBuilderUtility when bloom or manifest is requested.
+    // Otherwise, use the simpler dft_indexer_build which only creates
+    // checkpoints.
+    if (self->build_bloom || self->build_manifest) {
+        using namespace dftracer::utils;
+        using namespace dftracer::utils::utilities::indexer;
 
-    const char *gz = PyUnicode_AsUTF8(self->gz_path);
-    const char *idx = PyUnicode_AsUTF8(self->index_path);
-    if (!gz || !idx) {
-        return NULL;
-    }
+        const char *gz = PyUnicode_AsUTF8(self->gz_path);
+        const char *idx = PyUnicode_AsUTF8(self->index_path);
+        if (!gz || !idx) {
+            return NULL;
+        }
 
-    auto config = IndexBuildConfig::for_file(gz)
-                      .with_checkpoint_size(
-                          static_cast<std::size_t>(self->checkpoint_size))
-                      .with_bloom(self->build_bloom != 0)
-                      .with_manifest(self->build_manifest != 0)
-                      .with_index_threshold(
-                          static_cast<std::size_t>(self->index_threshold));
-
-    std::string idx_str(idx);
-    auto pos = idx_str.find_last_of('/');
-    if (pos != std::string::npos) {
-        config.with_index_dir(idx_str.substr(0, pos));
-    }
+        auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+        batch_config->file_paths.emplace_back(gz);
+        batch_config->checkpoint_size =
+            static_cast<std::size_t>(self->checkpoint_size);
+        batch_config->build_manifest = self->build_manifest != 0;
+        batch_config->parallelism = 1;
+        batch_config->use_batch_write = true;
+        batch_config->rebuild_root_summaries = true;
+
+        std::string idx_str(idx);
+        auto pos = idx_str.find_last_of('/');
+        if (pos != std::string::npos) {
+            batch_config->index_dir = idx_str.substr(0, pos);
+        }
 
-    Runtime *rt = get_indexer_runtime(self);
-    IndexBuildResult build_result;
+        Runtime *rt = get_indexer_runtime(self);
+        IndexBuildBatchResult batch_result;
+
+        // Catch while the GIL is still released: an exception escaping the
+        // ALLOW_THREADS region would skip PyEval_RestoreThread(), and the
+        // handler would then call into the interpreter without the GIL.
+        bool build_threw = false;
+        std::string build_error;
+        Py_BEGIN_ALLOW_THREADS
+        try {
+            rt->submit(
+                  run_coro_scope(
+                      rt->executor(),
+                      [](CoroScope &scope,
+                         std::shared_ptr<IndexBuildBatchConfig> cfg,
+                         IndexBuildBatchResult *out) -> coro::CoroTask<void> {
+                          *out = co_await IndexBatchBuilderUtility::process(
+                              &scope, std::move(cfg));
+                      },
+                      batch_config, &batch_result),
+                  "indexer-build")
+                .get();
+        } catch (const std::exception &e) {
+            build_threw = true;
+            build_error = e.what();
+        }
+        Py_END_ALLOW_THREADS
+        if (build_threw) {
+            PyErr_SetString(PyExc_RuntimeError, build_error.c_str());
+            return NULL;
+        }
 
-    try {
-        auto build_coro =
-            [](IndexBuildConfig cfg) -> coro::CoroTask<IndexBuildResult> {
-            IndexBuilderUtility builder;
-            co_return co_await builder.process(cfg);
-        };
-
-        Py_BEGIN_ALLOW_THREADS auto handle =
-            rt->submit(build_coro(config), "indexer-build");
-        build_result = handle.get();
+        if (batch_result.failed > 0 && !batch_result.results.empty()) {
+            const auto &result = batch_result.results[0];
+            if (!result.success) {
+                PyErr_SetString(PyExc_RuntimeError,
+                                result.error_message.c_str());
+                return NULL;
+            }
+        }
+    } else {
+        // Simple checkpoint-only build
+        int result;
+        Py_BEGIN_ALLOW_THREADS result = dft_indexer_build(self->handle);
         Py_END_ALLOW_THREADS
-    } catch (const std::exception &e) {
-        PyErr_SetString(PyExc_RuntimeError, e.what());
-        return NULL;
-    }
 
-    if (!build_result.success) {
-        PyErr_SetString(PyExc_RuntimeError, build_result.error_message.c_str());
-        return NULL;
+            if (result < 0) {
+            PyErr_SetString(PyExc_RuntimeError, "Failed to build index");
+            return NULL;
+        }
     }
 
     Py_RETURN_NONE;
 }
 
-static PyObject *Indexer_need_rebuild(IndexerObject *self,
-                                      PyObject *Py_UNUSED(ignored)) {
+static PyObject *CheckpointIndexer_need_rebuild(CheckpointIndexerObject *self,
+                                                PyObject *Py_UNUSED(ignored)) {
     if (!self->handle) {
         PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized");
         return NULL;
@@ -204,8 +226,8 @@ static PyObject *Indexer_need_rebuild(IndexerObject *self,
     return PyBool_FromLong(result);
 }
 
-static PyObject *Indexer_exists(IndexerObject *self,
-                                PyObject *Py_UNUSED(ignored)) {
+static PyObject *CheckpointIndexer_exists(CheckpointIndexerObject *self,
+                                          PyObject *Py_UNUSED(ignored)) {
     if (!self->handle) {
         PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized");
         return NULL;
@@ -215,8 +237,8 @@ static PyObject *Indexer_exists(IndexerObject *self,
     return PyBool_FromLong(result);
 }
 
-static PyObject *Indexer_get_max_bytes(IndexerObject *self,
-                                       PyObject *Py_UNUSED(ignored)) {
+static PyObject *CheckpointIndexer_get_max_bytes(CheckpointIndexerObject *self,
+                                                 PyObject *Py_UNUSED(ignored)) {
     if (!self->handle) {
         PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized");
         return NULL;
@@ -226,8 +248,8 @@ static PyObject *Indexer_get_max_bytes(IndexerObject *self,
     return PyLong_FromUnsignedLongLong(result);
 }
 
-static PyObject *Indexer_get_num_lines(IndexerObject *self,
-                                       PyObject *Py_UNUSED(ignored)) {
+static PyObject *CheckpointIndexer_get_num_lines(CheckpointIndexerObject *self,
+                                                 PyObject *Py_UNUSED(ignored)) {
     if (!self->handle) {
         PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized");
         return NULL;
@@ -237,7 +259,8 @@ static PyObject *Indexer_get_num_lines(IndexerObject *self,
     return PyLong_FromUnsignedLongLong(result);
 }
 
-static PyObject *Indexer_find_checkpoint(IndexerObject *self, PyObject *args) {
+static PyObject *CheckpointIndexer_find_checkpoint(
+    CheckpointIndexerObject *self, PyObject *args) {
     if (!self->handle) {
         PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized");
         return NULL;
@@ -268,8 +291,8 @@ static PyObject *Indexer_find_checkpoint(IndexerObject *self, PyObject *args) {
     return (PyObject *)cp_obj;
 }
 
-static PyObject *Indexer_get_checkpoints(IndexerObject *self,
-                                         PyObject *Py_UNUSED(ignored)) {
+static PyObject *CheckpointIndexer_get_checkpoints(
+    CheckpointIndexerObject *self, PyObject *Py_UNUSED(ignored)) {
     if (!self->handle) {
         PyErr_SetString(PyExc_RuntimeError, "Indexer not initialized");
         return NULL;
@@ -309,7 +332,8 @@ static PyObject *Indexer_get_checkpoints(IndexerObject *self,
     return list;
 }
 
-static PyObject *Indexer_has_bloom(IndexerObject *self, void *closure) {
+static PyObject *CheckpointIndexer_has_bloom(CheckpointIndexerObject *self,
+                                             void *closure) {
     const char *idx = PyUnicode_AsUTF8(self->index_path);
     const char *gz = PyUnicode_AsUTF8(self->gz_path);
     if (!idx || !gz) {
@@ -330,7 +354,8 @@ static PyObject *Indexer_has_bloom(IndexerObject *self, void *closure) {
     Py_RETURN_FALSE;
 }
 
-static PyObject *Indexer_has_manifest(IndexerObject *self, void *closure) {
+static PyObject *CheckpointIndexer_has_manifest(CheckpointIndexerObject *self,
+                                                void *closure) {
     const char *idx = PyUnicode_AsUTF8(self->index_path);
     const char *gz = PyUnicode_AsUTF8(self->gz_path);
     if (!idx || !gz) {
@@ -351,66 +376,71 @@ static PyObject *Indexer_has_manifest(IndexerObject *self, void *closure) {
     Py_RETURN_FALSE;
 }
 
-static PyObject *Indexer_gz_path(IndexerObject *self, void *closure) {
+static PyObject *CheckpointIndexer_gz_path(CheckpointIndexerObject *self,
+                                           void *closure) {
     Py_INCREF(self->gz_path);
     return self->gz_path;
 }
 
-static PyObject *Indexer_index_path(IndexerObject *self, void *closure) {
+static PyObject *CheckpointIndexer_index_path(CheckpointIndexerObject *self,
+                                              void *closure) {
     Py_INCREF(self->index_path);
     return self->index_path;
 }
 
-static PyObject *Indexer_checkpoint_size(IndexerObject *self, void *closure) {
+static PyObject *CheckpointIndexer_checkpoint_size(
+    CheckpointIndexerObject *self, void *closure) {
     return PyLong_FromUnsignedLongLong(self->checkpoint_size);
 }
 
-static PyObject *Indexer_enter(IndexerObject *self,
-                               PyObject *Py_UNUSED(ignored)) {
+static PyObject *CheckpointIndexer_enter(CheckpointIndexerObject *self,
+                                         PyObject *Py_UNUSED(ignored)) {
     Py_INCREF(self);
     return (PyObject *)self;
 }
 
-static PyObject *Indexer_close(IndexerObject *self,
-                               PyObject *Py_UNUSED(ignored)) {
-    Indexer_release_handle(self);
+static PyObject *CheckpointIndexer_close(CheckpointIndexerObject *self,
+                                         PyObject *Py_UNUSED(ignored)) {
+    CheckpointIndexer_release_handle(self);
     Py_RETURN_NONE;
 }
 
-static PyObject *Indexer_exit(IndexerObject *self, PyObject *args) {
-    Indexer_release_handle(self);
+static PyObject *CheckpointIndexer_exit(CheckpointIndexerObject *self,
+                                        PyObject *args) {
+    CheckpointIndexer_release_handle(self);
     Py_RETURN_NONE;
 }
 
-static PyMethodDef Indexer_methods[] = {
-    {"build", (PyCFunction)Indexer_build, METH_NOARGS,
+static PyMethodDef CheckpointIndexer_methods[] = {
+    {"build", (PyCFunction)CheckpointIndexer_build, METH_NOARGS,
      "build()\n"
      "--\n"
      "\n"
      "Build or rebuild the index.\n"},
-    {"need_rebuild", (PyCFunction)Indexer_need_rebuild, METH_NOARGS,
+    {"need_rebuild", (PyCFunction)CheckpointIndexer_need_rebuild, METH_NOARGS,
      "Check if a rebuild is needed."},
-    {"exists", (PyCFunction)Indexer_exists, METH_NOARGS,
+    {"exists", (PyCFunction)CheckpointIndexer_exists, METH_NOARGS,
      "Check if the .dftindex store exists."},
-    {"get_max_bytes", (PyCFunction)Indexer_get_max_bytes, METH_NOARGS,
+    {"get_max_bytes", (PyCFunction)CheckpointIndexer_get_max_bytes, METH_NOARGS,
      "Get the maximum uncompressed bytes in the indexed file."},
-    {"get_num_lines", (PyCFunction)Indexer_get_num_lines, METH_NOARGS,
+    {"get_num_lines", (PyCFunction)CheckpointIndexer_get_num_lines, METH_NOARGS,
      "Get the total number of lines in the indexed file."},
-    {"find_checkpoint", (PyCFunction)Indexer_find_checkpoint, METH_VARARGS,
+    {"find_checkpoint", (PyCFunction)CheckpointIndexer_find_checkpoint,
+     METH_VARARGS,
      "Find the best checkpoint for a given uncompressed offset.\n"
      "\n"
      "Args:\n"
      "    offset (int): Uncompressed byte offset.\n"},
-    {"get_checkpoints", (PyCFunction)Indexer_get_checkpoints, METH_NOARGS,
-     "Get all checkpoints for this file as a list."},
-    {"close", (PyCFunction)Indexer_close, METH_NOARGS,
+    {"get_checkpoints", (PyCFunction)CheckpointIndexer_get_checkpoints,
+     METH_NOARGS, "Get all checkpoints for this file as a list."},
+    {"close", (PyCFunction)CheckpointIndexer_close, METH_NOARGS,
      "Release this Python wrapper's native indexer handle.\n"
      "\n"
      "The shared RocksDB instance for the same .dftindex path remains managed\n"
      "by the native RocksDBManager cache."},
-    {"__enter__", (PyCFunction)Indexer_enter, METH_NOARGS,
+    {"__enter__", (PyCFunction)CheckpointIndexer_enter, METH_NOARGS,
      "Enter the runtime context for the with statement."},
-    {"__exit__", (PyCFunction)Indexer_exit, METH_VARARGS,
+    {"__exit__", (PyCFunction)CheckpointIndexer_exit, METH_VARARGS,
      "Release this Python wrapper on context exit.\n"
      "\n"
      "This does not force-close the shared RocksDB instance for the same\n"
@@ -418,48 +448,48 @@ static PyMethodDef Indexer_methods[] = {
     {NULL} /* Sentinel */
 };
 
-static PyGetSetDef Indexer_getsetters[] = {
-    {"gz_path", (getter)Indexer_gz_path, NULL, "Path to the gzip file", NULL},
-    {"index_path", (getter)Indexer_index_path, NULL,
+static PyGetSetDef CheckpointIndexer_getsetters[] = {  // getters only
+    {"gz_path", (getter)CheckpointIndexer_gz_path, NULL,
+     "Path to the gzip file", NULL},
+    {"index_path", (getter)CheckpointIndexer_index_path, NULL,
      "Path to the .dftindex store", NULL},
-    {"checkpoint_size", (getter)Indexer_checkpoint_size, NULL,
+    {"checkpoint_size", (getter)CheckpointIndexer_checkpoint_size, NULL,
      "Checkpoint size in bytes", NULL},
-    {"has_bloom", (getter)Indexer_has_bloom, NULL,
+    {"has_bloom", (getter)CheckpointIndexer_has_bloom, NULL,
      "Whether bloom data exists in index", NULL},
-    {"has_manifest", (getter)Indexer_has_manifest, NULL,
+    {"has_manifest", (getter)CheckpointIndexer_has_manifest, NULL,
      "Whether manifest data exists in index", NULL},
     {NULL} /* Sentinel */
 };
 
-PyTypeObject IndexerType = {
-    PyVarObject_HEAD_INIT(NULL, 0) "indexer.Indexer", /* tp_name */
-    sizeof(IndexerObject),                            /* tp_basicsize */
-    0,                                                /* tp_itemsize */
-    (destructor)Indexer_dealloc,                      /* tp_dealloc */
-    0,                                                /* tp_vectorcall_offset */
-    0,                                                /* tp_getattr */
-    0,                                                /* tp_setattr */
-    0,                                                /* tp_as_async */
-    0,                                                /* tp_repr */
-    0,                                                /* tp_as_number */
-    0,                                                /* tp_as_sequence */
-    0,                                                /* tp_as_mapping */
-    0,                                                /* tp_hash */
-    0,                                                /* tp_call */
-    0,                                                /* tp_str */
-    0,                                                /* tp_getattro */
-    0,                                                /* tp_setattro */
-    0,                                                /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,         /* tp_flags */
-    "Indexer(gz_path: str, index_path: str | None = None,\n"
-    "       checkpoint_size: int = 1048576,\n"
-    "       force_rebuild: bool = False, build_bloom: bool = False,\n"
-    "       build_manifest: bool = False,\n"
-    "       index_threshold: int = 1048576,\n"
-    "       runtime: Runtime | None = None)\n"
+PyTypeObject CheckpointIndexerType = {
+    PyVarObject_HEAD_INIT(
+        NULL, 0) "dftracer_utils_ext.CheckpointIndexer", /* tp_name */
+    sizeof(CheckpointIndexerObject),                     /* tp_basicsize */
+    0,                                                   /* tp_itemsize */
+    (destructor)CheckpointIndexer_dealloc,               /* tp_dealloc */
+    0,                                        /* tp_vectorcall_offset */
+    0,                                        /* tp_getattr */
+    0,                                        /* tp_setattr */
+    0,                                        /* tp_as_async */
+    0,                                        /* tp_repr */
+    0,                                        /* tp_as_number */
+    0,                                        /* tp_as_sequence */
+    0,                                        /* tp_as_mapping */
+    0,                                        /* tp_hash */
+    0,                                        /* tp_call */
+    0,                                        /* tp_str */
+    0,                                        /* tp_getattro */
+    0,                                        /* tp_setattro */
+    0,                                        /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+    "CheckpointIndexer(gz_path, index_path=None, checkpoint_size=1048576, "
+    "force_rebuild=False, build_bloom=False, build_manifest=False, "
+    "runtime=None)\n"
     "--\n"
     "\n"
-    "Indexer for creating and managing gzip trace index stores.\n"
+    "Checkpoint indexer for single-file checkpoint-level operations on a "
+    "gzip trace.\n"
     "\n"
     "Args:\n"
     "    gz_path (str): Path to the gzip trace file.\n"
@@ -470,38 +500,37 @@ PyTypeObject IndexerType = {
     "    force_rebuild (bool): If True, rebuild the index even if it\n"
     "        exists.\n"
     "    build_bloom (bool): If True, build bloom filter data in the\n"
-    "        store.\n"
+    "        index.\n"
     "    build_manifest (bool): If True, build manifest data in the\n"
     "        store.\n"
-    "    index_threshold (int): Skip indexing for files smaller than\n"
-    "        this (default 1 MB).\n"
     "    runtime (Runtime or None): Runtime instance for thread pool\n"
     "        control. If None, uses the default global Runtime.\n", /* tp_doc */
-    0,                      /* tp_traverse */
-    0,                      /* tp_clear */
-    0,                      /* tp_richcompare */
-    0,                      /* tp_weaklistoffset */
-    0,                      /* tp_iter */
-    0,                      /* tp_iternext */
-    Indexer_methods,        /* tp_methods */
-    0,                      /* tp_members */
-    Indexer_getsetters,     /* tp_getset */
-    0,                      /* tp_base */
-    0,                      /* tp_dict */
-    0,                      /* tp_descr_get */
-    0,                      /* tp_descr_set */
-    0,                      /* tp_dictoffset */
-    (initproc)Indexer_init, /* tp_init */
-    0,                      /* tp_alloc */
-    Indexer_new,            /* tp_new */
+    0,                                /* tp_traverse */
+    0,                                /* tp_clear */
+    0,                                /* tp_richcompare */
+    0,                                /* tp_weaklistoffset */
+    0,                                /* tp_iter */
+    0,                                /* tp_iternext */
+    CheckpointIndexer_methods,        /* tp_methods */
+    0,                                /* tp_members */
+    CheckpointIndexer_getsetters,     /* tp_getset */
+    0,                                /* tp_base */
+    0,                                /* tp_dict */
+    0,                                /* tp_descr_get */
+    0,                                /* tp_descr_set */
+    0,                                /* tp_dictoffset */
+    (initproc)CheckpointIndexer_init, /* tp_init */
+    0,                                /* tp_alloc */
+    CheckpointIndexer_new,            /* tp_new */
 };
 
-int init_indexer(PyObject *m) {
-    if (PyType_Ready(&IndexerType) < 0) return -1;
+int init_checkpoint_indexer(PyObject *m) {
+    if (PyType_Ready(&CheckpointIndexerType) < 0) return -1;
 
-    Py_INCREF(&IndexerType);
-    if (PyModule_AddObject(m, "Indexer", (PyObject *)&IndexerType) < 0) {
-        Py_DECREF(&IndexerType);
+    Py_INCREF(&CheckpointIndexerType);
+    if (PyModule_AddObject(m, "CheckpointIndexer",
+                           (PyObject *)&CheckpointIndexerType) < 0) {
+        Py_DECREF(&CheckpointIndexerType);
         Py_DECREF(m);
         return -1;
     }
diff --git a/src/dftracer/utils/python/indexer.h b/src/dftracer/utils/python/indexer.h
index d31d0ccf..a2dfd203 100644
--- a/src/dftracer/utils/python/indexer.h
+++ b/src/dftracer/utils/python/indexer.h
@@ -13,12 +13,11 @@ typedef struct {
     std::uint64_t checkpoint_size;
     int build_bloom;
     int build_manifest;
-    std::uint64_t index_threshold;
     PyObject *runtime_obj;  // RuntimeObject* or NULL (uses default)
-} IndexerObject;
+} CheckpointIndexerObject;
 
-extern PyTypeObject IndexerType;
+extern PyTypeObject CheckpointIndexerType;
 
-int init_indexer(PyObject *m);
+int init_checkpoint_indexer(PyObject *m);
 
 #endif
diff --git a/src/dftracer/utils/python/json.cpp b/src/dftracer/utils/python/json.cpp
index 3c827b8a..3f31c602 100644
--- a/src/dftracer/utils/python/json.cpp
+++ b/src/dftracer/utils/python/json.cpp
@@ -1,727 +1,364 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
 #include <dftracer/utils/python/json.h>
 
-#include <cstring>
-#include <iostream>
-
-static void JSON_dealloc(JSONObject* self) {
-    if (self->doc && self->owns_doc) {
-        yyjson_doc_free(self->doc);
-    }
-    Py_TYPE(self)->tp_free((PyObject*)self);
+using dftracer::utils::utilities::composites::dft::ArgsValueProxy;
+
+PyObject *args_value_to_pyobject(const ArgsValue &v) {
+    // Map the active variant alternative onto the matching Python object;
+    // returns a new reference (NULL with an error set if creation fails).
+    auto convert = [](const auto &held) -> PyObject * {
+        using Held = std::decay_t<decltype(held)>;
+        if constexpr (std::is_same_v<Held, std::string>) {
+            return PyUnicode_FromStringAndSize(held.data(), held.size());
+        } else if constexpr (std::is_same_v<Held, std::uint64_t>) {
+            return PyLong_FromUnsignedLongLong(held);
+        } else if constexpr (std::is_same_v<Held, std::int64_t>) {
+            return PyLong_FromLongLong(held);
+        } else if constexpr (std::is_same_v<Held, double>) {
+            return PyFloat_FromDouble(held);
+        } else if constexpr (std::is_same_v<Held, bool>) {
+            return PyBool_FromLong(held ? 1 : 0);
+        } else {
+            // std::monostate and any alternative without a mapping -> None.
+            Py_RETURN_NONE;
+        }
+    };
+    return std::visit(convert, v);
 }
 
-static PyObject* JSON_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
-    JSONObject* self;
-    self = (JSONObject*)type->tp_alloc(type, 0);
-    if (self != NULL) {
-        self->doc = nullptr;
-        self->root = nullptr;
-        self->parsed = false;
-        self->json_length = 0;
-        self->owns_doc = false;
-    }
-    return (PyObject*)self;
+static const ArgsMap &get_map(JsonDictValueObject *self) {
+    if (self->is_args) return self->batch->events[self->event_index].args;
+    return self->batch->events[self->event_index].top;  // top-level view
 }
 
-static int JSON_init(JSONObject* self, PyObject* args, PyObject* kwds) {
-    const char* json_str;
-    if (!PyArg_ParseTuple(args, "s", &json_str)) {
-        return -1;
-    }
-
-    self->json_length = strlen(json_str);
-    if (self->json_length > 0) {
-        std::memcpy(self->json_data, json_str, self->json_length);
-    }
-    self->doc = nullptr;
-    self->root = nullptr;
-    self->parsed = false;
-    self->owns_doc = true;
-    return 0;
+static void JsonDictValue_dealloc(JsonDictValueObject *self) {
+    self->batch.reset();  // drop this wrapper's share of the batch
+    Py_TYPE(self)->tp_free((PyObject *)self);  // then free the object itself
 }
 
-static bool JSON_ensure_parsed(JSONObject* self) {
-    if (self->root != nullptr) {
-        return true;
-    }
-
-    if (self->parsed && self->doc != nullptr) {
-        return true;
-    }
-
-    if (!self->parsed && self->json_length > 0) {
-        // Use YYJSON_READ_INSITU for large-scale processing
-        // (zero-copy, in-place modification)
-        yyjson_read_err err;
-        self->doc = yyjson_read_opts(self->json_data, self->json_length,
-                                     YYJSON_READ_INSITU, NULL, &err);
-        if (!self->doc) {
-            char err_msg[256];
-            std::snprintf(err_msg, sizeof(err_msg),
-                          "Failed to parse JSON at position %zu: %s (code %u, "
-                          "string: %.*s)",
-                          err.pos, err.msg, err.code, (int)self->json_length,
-                          self->json_data);
-            PyErr_SetString(PyExc_ValueError, err_msg);
-            return false;
-        }
-        self->parsed = true;
-        return true;
+// Length of the view: member count, plus "args" on top-level views (as keys()).
+static Py_ssize_t JsonDictValue_length(JsonDictValueObject *self) {
+    Py_ssize_t count = 0;
+    const auto &map = get_map(self);
+    map.for_each_member([&](std::string_view, ArgsValueProxy) { ++count; });
+    if (!self->is_args) {
+        if (self->batch->events[self->event_index].args.exists()) ++count;
     }
-
-    // If we get here, there's no data to parse
-    return false;
+    return count;
 }
 
-// Get the root yyjson_val for this JSON object (handles both top-level and
-// subtrees)
-static yyjson_val* JSON_get_root(JSONObject* self) {
-    if (self->root != nullptr) {
-        // This is a subtree wrapper - return the wrapped value directly
-        return self->root;
-    }
-    // This is a top-level document - get the root from the doc
-    return yyjson_doc_get_root(self->doc);
-}
+// Subscript lookup: member value, or a nested view for "args"; else KeyError.
+static PyObject *JsonDictValue_subscript(JsonDictValueObject *self,
+                                         PyObject *key) {
+    const char *key_str = PyUnicode_AsUTF8(key);
+    if (!key_str) return NULL;
 
-static PyObject* JSON_contains(JSONObject* self, PyObject* key) {
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
+    std::string_view k(key_str);
 
-    if (!PyUnicode_Check(key)) {
-        PyErr_SetString(PyExc_TypeError, "Key must be a string");
+    if (!self->is_args && k == "args") {
+        auto &ev = self->batch->events[self->event_index];
+        if (!ev.args.exists()) {
+            // Missing "args" is a KeyError, consistent with `in` and keys().
+            PyErr_SetObject(PyExc_KeyError, key);
+            return NULL;
+        }
+        auto *obj = (JsonDictValueObject *)JsonDictValueType.tp_alloc(
+            &JsonDictValueType, 0);
+        if (!obj) return NULL;
+        new (&obj->batch) std::shared_ptr<JsonDictBatch>(self->batch);
+        obj->event_index = self->event_index;
+        obj->is_args = true;
+        return (PyObject *)obj;
+    }
+    const auto &map = get_map(self);
+    if (!map[k].exists()) {
+        PyErr_SetObject(PyExc_KeyError, key);
         return NULL;
     }
 
-    const char* key_str = PyUnicode_AsUTF8(key);
-    if (!key_str) {
+    const auto &raw = map.raw();
+    auto it = raw.find(k);
+    if (it == raw.end()) {
+        PyErr_SetObject(PyExc_KeyError, key);
         return NULL;
     }
-
-    yyjson_val* root = JSON_get_root(self);
-    if (!yyjson_is_obj(root)) {
-        Py_RETURN_FALSE;
-    }
-
-    yyjson_val* val = yyjson_obj_get(root, key_str);
-    if (val) {
-        Py_RETURN_TRUE;
-    } else {
-        Py_RETURN_FALSE;
-    }
+    return args_value_to_pyobject(it->second);
 }
 
-static int JSON_contains_sq(PyObject* self_obj, PyObject* key) {
-    JSONObject* self = (JSONObject*)self_obj;
-    PyObject* result = JSON_contains(self, key);
-    if (!result) {
-        return -1;
-    }
-
-    int is_true = PyObject_IsTrue(result);
-    Py_DECREF(result);
-    return is_true;
-}
-
-static PyObject* yyjson_val_to_python(yyjson_val* val) {
-    if (yyjson_is_null(val)) {
-        Py_RETURN_NONE;
-    } else if (yyjson_is_bool(val)) {
-        if (yyjson_get_bool(val)) {
-            Py_RETURN_TRUE;
-        } else {
-            Py_RETURN_FALSE;
+static PyObject *JsonDictValue_keys(JsonDictValueObject *self,
+                                    PyObject *Py_UNUSED(ignored)) {
+    PyObject *list = PyList_New(0);
+    if (!list) return NULL;
+    // Any failure clears `list`, so NULL is returned with the error still set.
+    const auto &map = get_map(self);
+    map.for_each_member([&](std::string_view k, ArgsValueProxy) {
+        if (!list) return;  // an earlier member already failed
+        PyObject *key = PyUnicode_FromStringAndSize(k.data(), k.size());
+        if (!key || PyList_Append(list, key) < 0) Py_CLEAR(list);
+        Py_XDECREF(key);
+    });
-    } else if (yyjson_is_uint(val)) {
-        return PyLong_FromUnsignedLongLong(yyjson_get_uint(val));
-    } else if (yyjson_is_int(val)) {
-        return PyLong_FromLongLong(yyjson_get_int(val));
-    } else if (yyjson_is_real(val)) {
-        return PyFloat_FromDouble(yyjson_get_real(val));
-    } else if (yyjson_is_str(val)) {
-        return PyUnicode_FromString(yyjson_get_str(val));
-    } else if (yyjson_is_arr(val)) {
-        std::size_t idx, max;
-        yyjson_val* item;
-        PyObject* list = PyList_New(0);
-        if (!list) return NULL;
-
-        yyjson_arr_foreach(val, idx, max, item) {
-            PyObject* py_item = yyjson_val_to_python(item);
-            if (!py_item) {
-                Py_DECREF(list);
-                return NULL;
+
+    if (list && !self->is_args) {
+        auto &ev = self->batch->events[self->event_index];
+        if (ev.args.exists()) {
+            // "args" comes from ev.args, not the iterated map; list it last.
+            PyObject *args_key = PyUnicode_InternFromString("args");
+            if (!args_key || PyList_Append(list, args_key) < 0) {
+                Py_CLEAR(list);
+            }
+            Py_XDECREF(args_key);
-            if (PyList_Append(list, py_item) < 0) {
-                Py_DECREF(py_item);
-                Py_DECREF(list);
-                return NULL;
-            }
-            Py_DECREF(py_item);
         }
-        return list;
-    } else if (yyjson_is_obj(val)) {
-        std::size_t idx, max;
-        yyjson_val *key_val, *val_val;
-        PyObject* dict = PyDict_New();
-        if (!dict) return NULL;
-
-        yyjson_obj_foreach(val, idx, max, key_val, val_val) {
-            const char* key_str = yyjson_get_str(key_val);
-            PyObject* py_key = PyUnicode_FromString(key_str);
-            PyObject* py_val = yyjson_val_to_python(val_val);
-
-            if (!py_key || !py_val) {
-                Py_XDECREF(py_key);
-                Py_XDECREF(py_val);
-                Py_DECREF(dict);
-                return NULL;
-            }
-
-            if (PyDict_SetItem(dict, py_key, py_val) < 0) {
-                Py_DECREF(py_key);
-                Py_DECREF(py_val);
-                Py_DECREF(dict);
-                return NULL;
-            }
-
-            Py_DECREF(py_key);
-            Py_DECREF(py_val);
-        }
-        return dict;
-    }
-
-    Py_RETURN_NONE;
-}
-
-static PyObject* JSON_getitem(JSONObject* self, PyObject* key) {
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
-
-    if (!PyUnicode_Check(key)) {
-        PyErr_SetString(PyExc_TypeError, "Key must be a string");
-        return NULL;
-    }
-
-    const char* key_str = PyUnicode_AsUTF8(key);
-    if (!key_str) {
-        return NULL;
-    }
-
-    yyjson_val* root = JSON_get_root(self);
-    if (!yyjson_is_obj(root)) {
-        PyErr_SetString(PyExc_TypeError, "JSON root is not an object");
-        return NULL;
-    }
-
-    yyjson_val* val = yyjson_obj_get(root, key_str);
-    if (!val) {
-        PyErr_SetString(PyExc_KeyError, key_str);
-        return NULL;
-    }
-
-    // If the value is an object or array, return a lazy wrapper
-    if (yyjson_is_obj(val) || yyjson_is_arr(val)) {
-        return JSON_from_yyjson_val(self->doc, val);
     }
 
-    return yyjson_val_to_python(val);
+    return list;
 }
 
-static PyObject* JSON_keys(JSONObject* self, PyObject* Py_UNUSED(ignored)) {
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
-
-    yyjson_val* root = JSON_get_root(self);
-    if (!yyjson_is_obj(root)) {
-        return PyList_New(0);
+static PyObject *JsonDictValue_values(JsonDictValueObject *self,
+                                      PyObject *Py_UNUSED(ignored)) {
+    PyObject *list = PyList_New(0);
+    if (!list) return NULL;
+
+    // Any failure clears `list`, so NULL is returned with the error still set.
+    const auto &map = get_map(self);
+    for (const auto &[k, v] : map.raw()) {
+        PyObject *val = args_value_to_pyobject(v);
+        if (!val || PyList_Append(list, val) < 0) Py_CLEAR(list);
+        Py_XDECREF(val);
+        if (!list) break;
     }
 
-    PyObject* keys = PyList_New(0);
-    if (!keys) return NULL;
-
-    std::size_t idx, max;
-    yyjson_val *key_val, *val_val;
-    yyjson_obj_foreach(root, idx, max, key_val, val_val) {
-        const char* key_str = yyjson_get_str(key_val);
-        PyObject* py_key = PyUnicode_FromString(key_str);
-        if (!py_key) {
-            Py_DECREF(keys);
-            return NULL;
-        }
-        if (PyList_Append(keys, py_key) < 0) {
-            Py_DECREF(py_key);
-            Py_DECREF(keys);
-            return NULL;
+    if (list && !self->is_args) {
+        auto &ev = self->batch->events[self->event_index];
+        if (ev.args.exists()) {
+            auto *args_obj = (JsonDictValueObject *)JsonDictValueType.tp_alloc(
+                &JsonDictValueType, 0);
+            if (args_obj) {
+                new (&args_obj->batch)
+                    std::shared_ptr<JsonDictBatch>(self->batch);
+                args_obj->event_index = self->event_index;
+                args_obj->is_args = true;
+            }
+            PyObject *item = (PyObject *)args_obj;
+            if (!item || PyList_Append(list, item) < 0) Py_CLEAR(list);
+            Py_XDECREF(item);
         }
-        Py_DECREF(py_key);
     }
 
-    return keys;
+    return list;
 }
 
-static PyObject* JSON_values(JSONObject* self, PyObject* Py_UNUSED(ignored)) {
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
-
-    yyjson_val* root = JSON_get_root(self);
-    if (!yyjson_is_obj(root)) {
-        return PyList_New(0);
-    }
-
-    PyObject* values = PyList_New(0);
-    if (!values) return NULL;
-
-    std::size_t idx, max;
-    yyjson_val *key_val, *val_val;
-    yyjson_obj_foreach(root, idx, max, key_val, val_val) {
-        PyObject* py_val;
-        // If the value is an object or array, return a lazy wrapper
-        if (yyjson_is_obj(val_val) || yyjson_is_arr(val_val)) {
-            py_val = JSON_from_yyjson_val(self->doc, val_val);
-        } else {
-            py_val = yyjson_val_to_python(val_val);
-        }
-
-        if (!py_val) {
-            Py_DECREF(values);
-            return NULL;
+static PyObject *JsonDictValue_items(JsonDictValueObject *self,
+                                     PyObject *Py_UNUSED(ignored)) {
+    PyObject *list = PyList_New(0);
+    if (!list) return NULL;
+
+    // Any failure clears `list`, so NULL is returned with the error still set.
+    const auto &map = get_map(self);
+    for (const auto &[k, v] : map.raw()) {
+        PyObject *key = PyUnicode_FromStringAndSize(k.data(), k.size());
+        PyObject *val = key ? args_value_to_pyobject(v) : NULL;
+        PyObject *tuple = val ? PyTuple_Pack(2, key, val) : NULL;
+        if (!tuple || PyList_Append(list, tuple) < 0) Py_CLEAR(list);
+        Py_XDECREF(tuple);
+        Py_XDECREF(key);
+        Py_XDECREF(val);
+        if (!list) break;
-
-        if (PyList_Append(values, py_val) < 0) {
-            Py_DECREF(py_val);
-            Py_DECREF(values);
-            return NULL;
+    }
+
+    // Top-level views also expose the nested args mapping as ("args", view).
+    if (list && !self->is_args) {
+        auto &ev = self->batch->events[self->event_index];
+        if (ev.args.exists()) {
+            PyObject *args_key = PyUnicode_InternFromString("args");
+            auto *args_obj = (JsonDictValueObject *)JsonDictValueType.tp_alloc(
+                &JsonDictValueType, 0);
+            if (args_obj) {
+                // Construct at once: dealloc resets this member.
+                new (&args_obj->batch)
+                    std::shared_ptr<JsonDictBatch>(self->batch);
+                args_obj->event_index = self->event_index;
+                args_obj->is_args = true;
+            }
+            // Build the pair only when both halves were created.
+            PyObject *tuple = NULL;
+            if (args_key && args_obj) {
+                tuple = PyTuple_Pack(2, args_key, (PyObject *)args_obj);
+            }
+            if (!tuple || PyList_Append(list, tuple) < 0) Py_CLEAR(list);
+            Py_XDECREF(tuple);
+            Py_XDECREF(args_key);
+            Py_XDECREF((PyObject *)args_obj);
         }
-        Py_DECREF(py_val);
     }
 
-    return values;
+    return list;
 }
 
-static PyObject* JSON_items(JSONObject* self, PyObject* Py_UNUSED(ignored)) {
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
-
-    yyjson_val* root = JSON_get_root(self);
-    if (!yyjson_is_obj(root)) {
-        return PyList_New(0);
-    }
-
-    PyObject* items = PyList_New(0);
-    if (!items) return NULL;
-
-    std::size_t idx, max;
-    yyjson_val *key_val, *val_val;
-    yyjson_obj_foreach(root, idx, max, key_val, val_val) {
-        const char* key_str = yyjson_get_str(key_val);
-        PyObject* py_key = PyUnicode_FromString(key_str);
-        if (!py_key) {
-            Py_DECREF(items);
-            return NULL;
-        }
-
-        PyObject* py_val;
-        // If the value is an object or array, return a lazy wrapper
-        if (yyjson_is_obj(val_val) || yyjson_is_arr(val_val)) {
-            py_val = JSON_from_yyjson_val(self->doc, val_val);
-        } else {
-            py_val = yyjson_val_to_python(val_val);
-        }
-
-        if (!py_val) {
-            Py_DECREF(py_key);
-            Py_DECREF(items);
-            return NULL;
-        }
+static PyObject *JsonDictValue_get(JsonDictValueObject *self, PyObject *args) {
+    PyObject *key = NULL;
+    PyObject *fallback = Py_None;  // returned (new reference) when key is absent
+    if (!PyArg_ParseTuple(args, "O|O", &key, &fallback)) return NULL;
 
-        PyObject* tuple = PyTuple_Pack(2, py_key, py_val);
-        Py_DECREF(py_key);
-        Py_DECREF(py_val);
+    const char *utf8 = PyUnicode_AsUTF8(key);
+    if (utf8 == NULL) return NULL;
 
-        if (!tuple) {
-            Py_DECREF(items);
-            return NULL;
-        }
+    const std::string_view name(utf8);
 
-        if (PyList_Append(items, tuple) < 0) {
-            Py_DECREF(tuple);
-            Py_DECREF(items);
-            return NULL;
+    if (name == "args" && !self->is_args) {
+        auto &event = self->batch->events[self->event_index];
+        if (!event.args.exists()) {
+            Py_INCREF(fallback);
+            return fallback;
         }
-        Py_DECREF(tuple);
-    }
-
-    return items;
+        auto *view = (JsonDictValueObject *)JsonDictValueType.tp_alloc(
+            &JsonDictValueType, 0);
+        if (view == NULL) return NULL;
+        // Share ownership of the batch with the new args view.
+        new (&view->batch) std::shared_ptr<JsonDictBatch>(self->batch);
+        view->event_index = self->event_index;
+        view->is_args = true;
+        return (PyObject *)view;
+    }
+
+    const auto &members = get_map(self).raw();
+    const auto found = members.find(name);
+    if (found != members.end()) {
+        return args_value_to_pyobject(found->second);
+    }
+    Py_INCREF(fallback);
+    return fallback;
 }
 
-static PyObject* JSON_get(JSONObject* self, PyObject* args) {
-    PyObject* key;
-    PyObject* default_value = Py_None;
-
-    if (!PyArg_ParseTuple(args, "O|O", &key, &default_value)) {
-        return NULL;
-    }
-
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
-
-    if (!PyUnicode_Check(key)) {
-        PyErr_SetString(PyExc_TypeError, "Key must be a string");
-        return NULL;
-    }
-
-    const char* key_str = PyUnicode_AsUTF8(key);
-    if (!key_str) {
-        return NULL;
-    }
-
-    yyjson_val* root = JSON_get_root(self);
-    if (!yyjson_is_obj(root)) {
-        Py_INCREF(default_value);
-        return default_value;
-    }
+static int JsonDictValue_contains(JsonDictValueObject *self, PyObject *key) {
+    const char *utf8 = PyUnicode_AsUTF8(key);
+    if (utf8 == NULL) return -1;  // -1 reports the pending error to the caller
 
-    yyjson_val* val = yyjson_obj_get(root, key_str);
-    if (!val) {
-        Py_INCREF(default_value);
-        return default_value;
-    }
+    const std::string_view name(utf8);
 
-    // If the value is an object or array, return a lazy wrapper
-    if (yyjson_is_obj(val) || yyjson_is_arr(val)) {
-        return JSON_from_yyjson_val(self->doc, val);
+    if (name == "args" && !self->is_args) {
+        // On a top-level view "args" is answered from the event's args map.
+        return self->batch->events[self->event_index].args.exists() ? 1 : 0;
     }
 
-    return yyjson_val_to_python(val);
+    if (get_map(self)[name].exists()) return 1;
+    return 0;
 }
 
-// Helper function to recursively convert yyjson_val to Python dict/list
-static PyObject* yyjson_val_to_python_deep(yyjson_val* val) {
-    if (yyjson_is_null(val)) {
-        Py_RETURN_NONE;
-    } else if (yyjson_is_bool(val)) {
-        if (yyjson_get_bool(val)) {
-            Py_RETURN_TRUE;
-        } else {
-            Py_RETURN_FALSE;
-        }
-    } else if (yyjson_is_uint(val)) {
-        return PyLong_FromUnsignedLongLong(yyjson_get_uint(val));
-    } else if (yyjson_is_int(val)) {
-        return PyLong_FromLongLong(yyjson_get_int(val));
-    } else if (yyjson_is_real(val)) {
-        return PyFloat_FromDouble(yyjson_get_real(val));
-    } else if (yyjson_is_str(val)) {
-        return PyUnicode_FromString(yyjson_get_str(val));
-    } else if (yyjson_is_arr(val)) {
-        std::size_t idx, max;
-        yyjson_val* item;
-        PyObject* list = PyList_New(0);
-        if (!list) return NULL;
-
-        yyjson_arr_foreach(val, idx, max, item) {
-            PyObject* py_item = yyjson_val_to_python_deep(item);
-            if (!py_item) {
-                Py_DECREF(list);
-                return NULL;
-            }
-            if (PyList_Append(list, py_item) < 0) {
-                Py_DECREF(py_item);
-                Py_DECREF(list);
-                return NULL;
-            }
-            Py_DECREF(py_item);
+static PyObject *JsonDictValue_to_dict(JsonDictValueObject *self,
+                                       PyObject *Py_UNUSED(ignored)) {
+    PyObject *dict = PyDict_New();
+    if (!dict) return NULL;
+
+    const auto &map = get_map(self);
+    for (const auto &[k, v] : map.raw()) {
+        PyObject *key = PyUnicode_FromStringAndSize(k.data(), k.size());
+        PyObject *val = args_value_to_pyobject(v);
+        if (!key || !val) {
+            Py_XDECREF(key);
+            Py_XDECREF(val);
+            Py_DECREF(dict);
+            return NULL;
         }
-        return list;
-    } else if (yyjson_is_obj(val)) {
-        std::size_t idx, max;
-        yyjson_val *key_val, *val_val;
-        PyObject* dict = PyDict_New();
-        if (!dict) return NULL;
-
-        yyjson_obj_foreach(val, idx, max, key_val, val_val) {
-            const char* key_str = yyjson_get_str(key_val);
-            PyObject* py_key = PyUnicode_FromString(key_str);
-            PyObject* py_val = yyjson_val_to_python_deep(val_val);
-
-            if (!py_key || !py_val) {
-                Py_XDECREF(py_key);
-                Py_XDECREF(py_val);
-                Py_DECREF(dict);
-                return NULL;
-            }
+        PyDict_SetItem(dict, key, val);
+        Py_DECREF(key);
+        Py_DECREF(val);
+    }
 
-            if (PyDict_SetItem(dict, py_key, py_val) < 0) {
-                Py_DECREF(py_key);
-                Py_DECREF(py_val);
+    if (!self->is_args) {
+        auto &ev = self->batch->events[self->event_index];
+        if (ev.args.exists()) {
+            PyObject *args_dict = PyDict_New();
+            if (!args_dict) {
                 Py_DECREF(dict);
                 return NULL;
             }
-
-            Py_DECREF(py_key);
-            Py_DECREF(py_val);
-        }
-        return dict;
-    }
-
-    Py_RETURN_NONE;
-}
-
-static PyObject* JSON_unwrap(JSONObject* self, PyObject* Py_UNUSED(ignored)) {
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
-
-    yyjson_val* root = JSON_get_root(self);
-    return yyjson_val_to_python_deep(root);
-}
-
-static PyObject* JSON_copy(JSONObject* self, PyObject* Py_UNUSED(ignored)) {
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
-
-    // If this is a subtree wrapper, create a new wrapper pointing to the same
-    // subtree
-    if (self->root != nullptr) {
-        return JSON_from_yyjson_val(self->doc, self->root);
-    }
-
-    // For top-level documents, we need to serialize and re-parse since
-    // the original json_data was modified in-place by YYJSON_READ_INSITU
-    yyjson_val* root = JSON_get_root(self);
-    if (root) {
-        char* json_str = yyjson_val_write(root, 0, NULL);
-        if (!json_str) {
-            PyErr_SetString(PyExc_RuntimeError,
-                            "Failed to serialize JSON for copy");
-            return NULL;
-        }
-
-        size_t len = strlen(json_str);
-        PyObject* result = JSON_from_data(json_str, len);
-        free(json_str);
-        return result;
-    }
-
-    // Empty object
-    return JSON_from_data("{}", 2);
-}
-
-static PyObject* JSON_iter(JSONObject* self) {
-    if (!JSON_ensure_parsed(self)) {
-        return NULL;
-    }
-
-    yyjson_val* root = yyjson_doc_get_root(self->doc);
-    if (!yyjson_is_obj(root)) {
-        return PyObject_GetIter(PyList_New(0));
-    }
-
-    return PyObject_GetIter(JSON_keys(self, NULL));
-}
-
-static PyObject* JSON_str(JSONObject* self) {
-    if (self->root != nullptr) {
-        char* json_str = yyjson_val_write(self->root, 0, NULL);
-        if (!json_str) {
-            return PyUnicode_FromString("{}");
+            for (const auto &[k, v] : ev.args.raw()) {
+                PyObject *key = PyUnicode_FromStringAndSize(k.data(), k.size());
+                PyObject *val = args_value_to_pyobject(v);
+                if (!key || !val) {
+                    Py_XDECREF(key);
+                    Py_XDECREF(val);
+                    Py_DECREF(args_dict);
+                    Py_DECREF(dict);
+                    return NULL;
+                }
+                PyDict_SetItem(args_dict, key, val);
+                Py_DECREF(key);
+                Py_DECREF(val);
+            }
+            PyDict_SetItemString(dict, "args", args_dict);
+            Py_DECREF(args_dict);
         }
-        PyObject* result = PyUnicode_FromString(json_str);
-        free(json_str);
-        return result;
-    }
-    if (self->json_length > 0) {
-        return PyUnicode_FromStringAndSize(self->json_data, self->json_length);
-    }
-    return PyUnicode_FromString("{}");
-}
-
-static PyObject* JSON_repr(JSONObject* self) {
-    PyObject* str_obj = JSON_str(self);
-    if (!str_obj) return NULL;
-    PyObject* result = PyUnicode_FromFormat("JSON(%U)", str_obj);
-    Py_DECREF(str_obj);
-    return result;
-}
-
-static Py_ssize_t JSON_length(JSONObject* self) {
-    if (!JSON_ensure_parsed(self)) {
-        return -1;
     }
 
-    yyjson_val* root = JSON_get_root(self);
-    if (!yyjson_is_obj(root)) {
-        return 0;
-    }
-
-    return (Py_ssize_t)yyjson_obj_size(root);
+    return dict;
 }
 
-static int JSON_bool(JSONObject* self) {
-    if (!JSON_ensure_parsed(self)) {
-        return -1;
-    }
-
-    yyjson_val* root = JSON_get_root(self);
-    if (!yyjson_is_obj(root)) {
-        return 0;  // Non-objects are falsy
-    }
-
-    // Return true if object has at least one key
-    return yyjson_obj_size(root) > 0 ? 1 : 0;
-}
-
-PyMethodDef JSON_methods[] = {{"__contains__", (PyCFunction)JSON_contains,
-                               METH_O, "Check if key exists in JSON object"},
-                              {"keys", (PyCFunction)JSON_keys, METH_NOARGS,
-                               "Get all keys from JSON object"},
-                              {"values", (PyCFunction)JSON_values, METH_NOARGS,
-                               "Get all values from JSON object"},
-                              {"items", (PyCFunction)JSON_items, METH_NOARGS,
-                               "Get all key-value pairs from JSON object"},
-                              {"get", (PyCFunction)JSON_get, METH_VARARGS,
-                               "Get value by key with optional default"},
-                              {"unwrap", (PyCFunction)JSON_unwrap, METH_NOARGS,
-                               "Unwrap lazy JSON to native Python dict/list"},
-                              {"copy", (PyCFunction)JSON_copy, METH_NOARGS,
-                               "Return a shallow copy of the JSON object"},
-                              {NULL}};
-
-// gcc11_bandaid: Use positional initializers instead of designated
-PySequenceMethods JSON_as_sequence = {
-    NULL,            /* sq_length */
-    NULL,            /* sq_concat */
-    NULL,            /* sq_repeat */
-    NULL,            /* sq_item */
-    NULL,            /* was_sq_slice */
-    NULL,            /* sq_ass_item */
-    NULL,            /* was_sq_ass_slice */
-    JSON_contains_sq /* sq_contains */
+static PyMappingMethods JsonDictValue_as_mapping = {
+    (lenfunc)JsonDictValue_length,       /* mp_length */
+    (binaryfunc)JsonDictValue_subscript, /* mp_subscript */
+    NULL,                                /* mp_ass_subscript: no assignment */
 };
 
-PyMappingMethods JSON_as_mapping = {
-    (lenfunc)JSON_length,     /* mp_length */
-    (binaryfunc)JSON_getitem, /* mp_subscript */
-    NULL                      /* mp_ass_subscript */
+static PySequenceMethods JsonDictValue_as_sequence = {
+    NULL, NULL, NULL, NULL, /* sq_length, sq_concat, sq_repeat, sq_item */
+    NULL, NULL, NULL, (objobjproc)JsonDictValue_contains, /* sq_contains */
+    NULL, NULL, /* sq_inplace_concat, sq_inplace_repeat */
 };
 
-PyNumberMethods JSON_as_number = {
-    NULL,               /* nb_add */
-    NULL,               /* nb_subtract */
-    NULL,               /* nb_multiply */
-    NULL,               /* nb_remainder */
-    NULL,               /* nb_divmod */
-    NULL,               /* nb_power */
-    NULL,               /* nb_negative */
-    NULL,               /* nb_positive */
-    NULL,               /* nb_absolute */
-    (inquiry)JSON_bool, /* nb_bool */
+static PyMethodDef JsonDictValue_methods[] = {  // Python-visible methods
+    {"keys", (PyCFunction)JsonDictValue_keys, METH_NOARGS,
+     "Return list of keys."},
+    {"values", (PyCFunction)JsonDictValue_values, METH_NOARGS,
+     "Return list of values."},
+    {"items", (PyCFunction)JsonDictValue_items, METH_NOARGS,
+     "Return list of (key, value) pairs."},
+    {"get", (PyCFunction)JsonDictValue_get, METH_VARARGS,
+     "Get value by key with optional default."},
+    {"to_dict", (PyCFunction)JsonDictValue_to_dict, METH_NOARGS,
+     "Convert to a regular Python dict."},
+    {NULL}};  // sentinel terminating the table
+
+PyTypeObject JsonDictValueType = {  // slots are given positionally
+    PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext.JsonDictValue",
+    sizeof(JsonDictValueObject),       /* tp_basicsize */
+    0,                                 /* tp_itemsize */
+    (destructor)JsonDictValue_dealloc, /* tp_dealloc */
+    0,                                 /* tp_vectorcall_offset */
+    0,                                 /* tp_getattr */
+    0,                                 /* tp_setattr */
+    0,                                 /* tp_as_async */
+    0,                                 /* tp_repr */
+    0,                                 /* tp_as_number */
+    &JsonDictValue_as_sequence,        /* tp_as_sequence */
+    &JsonDictValue_as_mapping,         /* tp_as_mapping */
+    0,                                 /* tp_hash */
+    0,                                 /* tp_call */
+    0,                                 /* tp_str */
+    0,                                 /* tp_getattro */
+    0,                                 /* tp_setattro */
+    0,                                 /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                /* tp_flags */
+    "Zero-copy wrapper over a parsed DFTracer JSON event.\n"
+    "Supports dict-like access: event['name'], event['args']['ret'].\n"
+    "Call .to_dict() to materialize a regular Python dict.", /* tp_doc */
+    0,                     /* tp_traverse */
+    0,                     /* tp_clear */
+    0,                     /* tp_richcompare */
+    0,                     /* tp_weaklistoffset */
+    0,                     /* tp_iter */
+    0,                     /* tp_iternext */
+    JsonDictValue_methods, /* tp_methods */
 };
 
-PyTypeObject JSONType = {
-    PyVarObject_HEAD_INIT(NULL, 0) "json.JSON", /* tp_name */
-    sizeof(JSONObject),                         /* tp_basicsize */
-    0,                                          /* tp_itemsize */
-    (destructor)JSON_dealloc,                   /* tp_dealloc */
-    0,                                          /* tp_vectorcall_offset */
-    0,                                          /* tp_getattr */
-    0,                                          /* tp_setattr */
-    0,                                          /* tp_as_async */
-    (reprfunc)JSON_repr,                        /* tp_repr */
-    &JSON_as_number,                            /* tp_as_number */
-    &JSON_as_sequence,                          /* tp_as_sequence */
-    &JSON_as_mapping,                           /* tp_as_mapping */
-    0,                                          /* tp_hash */
-    0,                                          /* tp_call */
-    (reprfunc)JSON_str,                         /* tp_str */
-    0,                                          /* tp_getattro */
-    0,                                          /* tp_setattro */
-    0,                                          /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,   /* tp_flags */
-    "JSON(json_str: str)\n"
-    "--\n"
-    "\n"
-    "Lazy JSON object that parses on demand using yyjson.\n"
-    "\n"
-    "Args:\n"
-    "    json_str (str): A JSON string to wrap. Parsing is deferred\n"
-    "        until first attribute access.\n", /* tp_doc */
-    0,                                         /* tp_traverse */
-    0,                                         /* tp_clear */
-    0,                                         /* tp_richcompare */
-    0,                                         /* tp_weaklistoffset */
-    (getiterfunc)JSON_iter,                    /* tp_iter */
-    0,                                         /* tp_iternext */
-    JSON_methods,                              /* tp_methods */
-    0,                                         /* tp_members */
-    0,                                         /* tp_getset */
-    0,                                         /* tp_base */
-    0,                                         /* tp_dict */
-    0,                                         /* tp_descr_get */
-    0,                                         /* tp_descr_set */
-    0,                                         /* tp_dictoffset */
-    (initproc)JSON_init,                       /* tp_init */
-    0,                                         /* tp_alloc */
-    JSON_new,                                  /* tp_new */
-};
-
-int init_json(PyObject* m) {
-    if (PyType_Ready(&JSONType) < 0) return -1;
-
-    Py_INCREF(&JSONType);
-    if (PyModule_AddObject(m, "JSON", (PyObject*)&JSONType) < 0) {
-        Py_DECREF(&JSONType);
-        Py_DECREF(m);
+int init_json_dict_value(PyObject *m) {  // registers JsonDictValue on m
+    if (PyType_Ready(&JsonDictValueType) < 0) return -1;
+    Py_INCREF(&JsonDictValueType);  // PyModule_AddObject steals it on success
+    if (PyModule_AddObject(m, "JsonDictValue", (PyObject *)&JsonDictValueType) <
+        0) {
+        Py_DECREF(&JsonDictValueType);  // not stolen on failure: drop it here
         return -1;
     }
-
     return 0;
 }
-
-PyObject* JSON_from_data(const char* data, size_t length) {
-    JSONObject* self =
-        (JSONObject*)PyObject_MALLOC(sizeof(JSONObject) + length + 1);
-    if (!self) {
-        return PyErr_NoMemory();
-    }
-
-    PyObject_INIT(self, &JSONType);
-
-    self->doc = nullptr;
-    self->root = nullptr;
-    self->parsed = false;
-    self->json_length = length;
-    self->owns_doc = true;
-
-    std::memcpy(self->json_data, data, length);
-    self->json_data[length] = '\0';
-
-    return (PyObject*)self;
-}
-
-// Create a JSON object wrapping a yyjson_val
-// subtree (lazy wrapper for nested objects/arrays)
-PyObject* JSON_from_yyjson_val(yyjson_doc* doc, yyjson_val* root) {
-    JSONObject* self = (JSONObject*)PyObject_MALLOC(sizeof(JSONObject));
-    if (!self) {
-        return PyErr_NoMemory();
-    }
-
-    PyObject_INIT(self, &JSONType);
-
-    self->doc = doc;         // Share the document (don't copy)
-    self->root = root;       // Point to the subtree
-    self->parsed = true;     // Already parsed (just wrapping a subtree)
-    self->json_length = 0;   // No raw JSON data
-    self->owns_doc = false;  // Don't free the doc (it's owned by parent)
-
-    return (PyObject*)self;
-}
diff --git a/src/dftracer/utils/python/json.h b/src/dftracer/utils/python/json.h
index 0806e326..d87e50b1 100644
--- a/src/dftracer/utils/python/json.h
+++ b/src/dftracer/utils/python/json.h
@@ -2,33 +2,19 @@
 #define DFTRACER_UTILS_PYTHON_JSON_H
 
 #include <Python.h>
-#include <yyjson.h>
+#include <dftracer/utils/python/trace_reader_iterator.h>
 
-#include <cstddef>
-#include <cstdint>
 #include <memory>
-#include <string>
 
 typedef struct {
-    PyObject_HEAD mutable yyjson_doc* doc;
-    yyjson_val* root;
-    mutable bool parsed;
-    std::size_t json_length;
-    bool owns_doc;
-    char json_data[];
-} JSONObject;
+    PyObject_HEAD std::shared_ptr<JsonDictBatch> batch;  // shared event batch
+    std::size_t event_index;  // index into batch->events
+    bool is_args;  // presumably set for an event's "args" sub-view; confirm
+} JsonDictValueObject;
 
-extern PyTypeObject JSONType;
+extern PyTypeObject JsonDictValueType;  // defined in json.cpp
+int init_json_dict_value(PyObject *m);  // 0 on success, -1 on failure
 
-extern PyMethodDef JSON_methods[];
-extern PySequenceMethods JSON_as_sequence;
-extern PyMappingMethods JSON_as_mapping;
-
-int init_json(PyObject* m);
-
-PyObject* JSON_from_data(const char* data, size_t length);
-
-// Create a JSON object wrapping a yyjson_val subtree
-PyObject* JSON_from_yyjson_val(yyjson_doc* doc, yyjson_val* root);
+PyObject *args_value_to_pyobject(const ArgsValue &v);  // new ref, or NULL
 
 #endif  // DFTRACER_UTILS_PYTHON_JSON_H
diff --git a/src/dftracer/utils/python/memoryview_batch.cpp b/src/dftracer/utils/python/memoryview_batch.cpp
new file mode 100644
index 00000000..460fa058
--- /dev/null
+++ b/src/dftracer/utils/python/memoryview_batch.cpp
@@ -0,0 +1,150 @@
+#define PY_SSIZE_T_CLEAN
+#include <dftracer/utils/python/memoryview_batch.h>
+
+#include <cstring>
+
+namespace dftracer::utils::python {
+
+// tp_dealloc: frees the C++ payload owned by the object, then the object.
+static void MemoryViewBatch_dealloc(MemoryViewBatchObject *self) {
+    delete self->data;
+    Py_TYPE(self)->tp_free((PyObject *)self);
+}
+
+// bf_getbuffer: exports the whole contiguous byte buffer as a read-only view
+// of one-byte items. Raises BufferError when there is no payload or the
+// payload buffer is empty.
+static int MemoryViewBatch_getbuffer(MemoryViewBatchObject *self,
+                                     Py_buffer *view, int flags) {
+    if (!self->data || self->data->buffer.empty()) {
+        // The buffer protocol requires view->obj to be NULL on failure.
+        if (view) view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "MemoryViewBatch has no data");
+        return -1;
+    }
+    return PyBuffer_FillInfo(view, (PyObject *)self, self->data->buffer.data(),
+                             static_cast<Py_ssize_t>(self->data->buffer.size()),
+                             1, flags);
+}
+
+// sq_length: number of entries (not bytes); 0 when there is no payload.
+static Py_ssize_t MemoryViewBatch_length(MemoryViewBatchObject *self) {
+    if (!self->data) return 0;
+    return static_cast<Py_ssize_t>(self->data->num_entries());
+}
+
+// sq_item: returns entry `i` as a read-only memoryview over its bytes, or
+// NULL with IndexError (bad index / no payload) or ValueError (entry does not
+// fit inside the buffer).
+//
+// The result is a slice of a memoryview obtained from `self` through the
+// buffer protocol, so the view owns one reference to the batch and drops it
+// when the last view is released. PyMemoryView_FromBuffer() is avoided on
+// purpose: it treats Py_buffer.obj as borrowed and clears it, so pairing it
+// with a manual Py_INCREF leaks a reference on every call.
+PyObject *MemoryViewBatch_item(MemoryViewBatchObject *self, Py_ssize_t i) {
+    if (!self->data) {
+        PyErr_SetString(PyExc_IndexError, "MemoryViewBatch has no data");
+        return NULL;
+    }
+    const MemoryViewBatchData &data = *self->data;
+    const Py_ssize_t n = static_cast<Py_ssize_t>(data.num_entries());
+    if (i < 0 || i >= n) {
+        PyErr_SetString(PyExc_IndexError, "MemoryViewBatch index out of range");
+        return NULL;
+    }
+
+    // offsets and lengths are parallel arrays describing byte ranges inside
+    // buffer; reject a malformed batch instead of reading out of bounds.
+    const std::size_t idx = static_cast<std::size_t>(i);
+    const Py_ssize_t total = static_cast<Py_ssize_t>(data.buffer.size());
+    const Py_ssize_t begin = data.offsets[idx];
+    const Py_ssize_t len = idx < data.lengths.size() ? data.lengths[idx] : -1;
+    if (begin < 0 || len < 0 || begin > total || len > total - begin) {
+        PyErr_SetString(PyExc_ValueError,
+                        "MemoryViewBatch entry exceeds buffer bounds");
+        return NULL;
+    }
+    if (len == 0) {
+        // An empty entry needs no backing storage, and an empty batch buffer
+        // could not be exported through getbuffer anyway.
+        static char empty_byte = 0;
+        return PyMemoryView_FromMemory(&empty_byte, 0, PyBUF_READ);
+    }
+
+    PyObject *whole = PyMemoryView_FromObject((PyObject *)self);
+    if (!whole) return NULL;
+    PyObject *entry = PySequence_GetSlice(whole, begin, begin + len);
+    Py_DECREF(whole);
+    return entry;
+}
+
+// Buffer protocol slots: no bf_releasebuffer is needed because getbuffer
+// allocates nothing per export.
+static PyBufferProcs MemoryViewBatch_as_buffer = {
+    (getbufferproc)MemoryViewBatch_getbuffer,
+    NULL,
+};
+
+// Sequence protocol slots, positionally: sq_length, sq_concat, sq_repeat,
+// sq_item. The remaining slots are zero-initialized.
+static PySequenceMethods MemoryViewBatch_as_sequence = {
+    (lenfunc)MemoryViewBatch_length,
+    NULL,
+    NULL,
+    (ssizeargfunc)MemoryViewBatch_item,
+};
+
+// Getter for `num_entries`: entry count, 0 when there is no payload.
+static PyObject *MemoryViewBatch_get_num_entries(MemoryViewBatchObject *self,
+                                                 void *) {
+    if (!self->data) return PyLong_FromLong(0);
+    return PyLong_FromSsize_t(
+        static_cast<Py_ssize_t>(self->data->num_entries()));
+}
+
+// Getter for `num_bytes`: size of the backing buffer, 0 without a payload.
+static PyObject *MemoryViewBatch_get_num_bytes(MemoryViewBatchObject *self,
+                                               void *) {
+    if (!self->data) return PyLong_FromLong(0);
+    return PyLong_FromSsize_t(
+        static_cast<Py_ssize_t>(self->data->buffer.size()));
+}
+
+// Read-only attributes (setter slots are NULL).
+static PyGetSetDef MemoryViewBatch_getsetters[] = {
+    {"num_entries", (getter)MemoryViewBatch_get_num_entries, NULL,
+     "Number of entries", NULL},
+    {"num_bytes", (getter)MemoryViewBatch_get_num_bytes, NULL,
+     "Total buffer size in bytes", NULL},
+    {NULL}};
+
+// Type object. PyVarObject_HEAD_INIT expands to a braced initializer with a
+// trailing comma, which is why `.tp_name` follows it without a separator.
+PyTypeObject MemoryViewBatchType = {
+    .ob_base = PyVarObject_HEAD_INIT(NULL, 0).tp_name =
+        "dftracer_utils_ext._MemoryViewBatch",
+    .tp_basicsize = sizeof(MemoryViewBatchObject),
+    .tp_itemsize = 0,
+    .tp_dealloc = (destructor)MemoryViewBatch_dealloc,
+    .tp_as_sequence = &MemoryViewBatch_as_sequence,
+    .tp_as_buffer = &MemoryViewBatch_as_buffer,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_doc = "Zero-copy batch of byte entries backed by a contiguous buffer",
+    .tp_getset = MemoryViewBatch_getsetters,
+};
+
+// Finalizes the type and registers it on module `m` as `_MemoryViewBatch`.
+// Returns 0 on success, -1 with a Python error set on failure.
+int init_memoryview_batch(PyObject *m) {
+    if (PyType_Ready(&MemoryViewBatchType) < 0) return -1;
+    Py_INCREF(&MemoryViewBatchType);
+    if (PyModule_AddObject(m, "_MemoryViewBatch",
+                           (PyObject *)&MemoryViewBatchType) < 0) {
+        Py_DECREF(&MemoryViewBatchType);
+        return -1;
+    }
+    return 0;
+}
+
+}  // namespace dftracer::utils::python
diff --git a/src/dftracer/utils/python/memoryview_batch.h b/src/dftracer/utils/python/memoryview_batch.h
new file mode 100644
index 00000000..67fef45c
--- /dev/null
+++ b/src/dftracer/utils/python/memoryview_batch.h
@@ -0,0 +1,54 @@
+#ifndef DFTRACER_UTILS_PYTHON_MEMORYVIEW_BATCH_H
+#define DFTRACER_UTILS_PYTHON_MEMORYVIEW_BATCH_H
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dftracer/utils/core/coro/channel.h>
+
+#include <atomic>
+#include <cstddef>
+#include <exception>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+namespace dftracer::utils::python {
+
+struct MemoryViewBatchData {
+    std::vector<char> buffer;         // contiguous bytes of all entries
+    std::vector<Py_ssize_t> offsets;  // offsets[i]: start of entry i in buffer
+    std::vector<Py_ssize_t> lengths;  // lengths[i]: byte length of entry i
+
+    std::size_t num_entries() const { return offsets.size(); }  // entry count
+};
+
+struct MemoryViewBatchObject {
+    PyObject_HEAD MemoryViewBatchData *data;  // owned; deleted in tp_dealloc
+};
+
+extern PyTypeObject MemoryViewBatchType;  // exposed as _MemoryViewBatch
+
+PyObject *MemoryViewBatch_item(MemoryViewBatchObject *self, Py_ssize_t i);  // entry i as memoryview; NULL on error
+
+struct MemoryViewBatchIteratorState {  // state shared with a batch producer
+    std::shared_ptr<dftracer::utils::coro::Channel<MemoryViewBatchData>>
+        channel;  // channel carrying MemoryViewBatchData values
+    std::mutex error_mtx;      // guards `error` in set_error()
+    std::exception_ptr error;  // first error recorded via set_error()
+    std::atomic<bool> cancelled{false};   // presumably a stop request; confirm
+    std::size_t memory_budget_bytes = 0;  // presumably 0 = no budget; confirm
+    std::atomic<std::size_t> bytes_in_queue{0};  // presumably queued bytes
+    std::shared_future<void> task_future;  // presumably the producer task
+
+    void set_error(std::exception_ptr e) {  // keeps only the first error
+        std::lock_guard<std::mutex> lock(error_mtx);
+        if (!error) error = e;
+    }
+};
+
+int init_memoryview_batch(PyObject *m);
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_PYTHON_MEMORYVIEW_BATCH_H
diff --git a/src/dftracer/utils/python/runtime.cpp b/src/dftracer/utils/python/runtime.cpp
index 6e6272f4..e5c63993 100644
--- a/src/dftracer/utils/python/runtime.cpp
+++ b/src/dftracer/utils/python/runtime.cpp
@@ -30,11 +30,12 @@ static PyObject *Runtime_new(PyTypeObject *type, PyObject *args,
 }
 
 static int Runtime_init(RuntimeObject *self, PyObject *args, PyObject *kwds) {
-    static const char *kwlist[] = {"threads", NULL};
+    static const char *kwlist[] = {"threads", "io_threads", NULL};
     Py_ssize_t threads = 0;
+    Py_ssize_t io_threads = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|n", (char **)kwlist,
-                                     &threads)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nn", (char **)kwlist,
+                                     &threads, &io_threads)) {
         return -1;
     }
 
@@ -42,10 +43,17 @@ static int Runtime_init(RuntimeObject *self, PyObject *args, PyObject *kwds) {
         PyErr_SetString(PyExc_ValueError, "threads must be >= 0");
         return -1;
     }
+    if (io_threads < 0) {
+        PyErr_SetString(PyExc_ValueError, "io_threads must be >= 0");
+        return -1;
+    }
 
     try {
-        self->runtime = std::make_shared<dftracer::utils::Runtime>(
-            static_cast<std::size_t>(threads));
+        dftracer::utils::ExecutorConfig config;
+        config.num_threads = static_cast<std::size_t>(threads);
+        config.io_pool_size = static_cast<std::size_t>(io_threads);
+        self->runtime =
+            std::make_shared<dftracer::utils::Runtime>(config, true);
     } catch (const std::exception &e) {
         PyErr_SetString(PyExc_RuntimeError, e.what());
         return -1;
@@ -393,9 +401,19 @@ static PyMethodDef Runtime_methods[] = {
      "Exit context manager (calls shutdown)."},
     {NULL}};
 
+static PyObject *Runtime_get_io_threads(RuntimeObject *self, void *closure) {
+    if (!self->runtime) {  // `io_threads` getter: runtime pointer not set
+        PyErr_SetString(PyExc_RuntimeError, "Runtime not initialized");
+        return NULL;
+    }
+    return PyLong_FromSize_t(self->runtime->io_threads());  // new reference
+}
+
 static PyGetSetDef Runtime_getsetters[] = {
     {"threads", (getter)Runtime_get_threads, NULL, "Number of worker threads",
      NULL},
+    {"io_threads", (getter)Runtime_get_io_threads, NULL,  // no setter
+     "Number of I/O threads", NULL},
     {NULL}};
 
 PyTypeObject RuntimeType = {
@@ -418,13 +436,15 @@ PyTypeObject RuntimeType = {
     0,                                        /* tp_setattro */
     0,                                        /* tp_as_buffer */
     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
-    "Runtime(threads: int = 0)\n"
+    "Runtime(threads: int = 0, io_threads: int = 0)\n"
     "--\n"
     "\n"
     "Coroutine runtime backed by a thread pool.\n"
     "\n"
     "Args:\n"
     "    threads (int): Number of worker threads. 0 (default) uses\n"
+    "        the hardware concurrency.\n"
+    "    io_threads (int): Number of I/O threads. 0 (default) uses\n"
     "        the hardware concurrency.\n", /* tp_doc */
     0,                                     /* tp_traverse */
     0,                                     /* tp_clear */
diff --git a/src/dftracer/utils/python/schema_reconcile.cpp b/src/dftracer/utils/python/schema_reconcile.cpp
new file mode 100644
index 00000000..21b8e783
--- /dev/null
+++ b/src/dftracer/utils/python/schema_reconcile.cpp
@@ -0,0 +1,415 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+#include <dftracer/utils/python/schema_reconcile.h>
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <string_view>
+
+namespace dftracer::utils::python {
+
+namespace {
+
+// Null-safe C-string equality: two null pointers compare equal.
+bool cstr_eq(const char *a, const char *b) {
+    if (a == b) return true;
+    if (!a || !b) return false;
+    return std::strcmp(a, b) == 0;
+}
+
+// Maps an Arrow C data interface format string to the nanoarrow type used to
+// read scalars when JSON-encoding surprise columns. Unrecognised formats (and
+// a null schema or format) map to NA, which the encoder renders as null.
+ArrowType type_from_format(const ArrowSchema *s) {
+    if (!s || !s->format) return NANOARROW_TYPE_NA;
+    const char *f = s->format;
+    if (cstr_eq(f, "n")) return NANOARROW_TYPE_NA;
+    if (cstr_eq(f, "b")) return NANOARROW_TYPE_BOOL;
+    if (cstr_eq(f, "c")) return NANOARROW_TYPE_INT8;
+    if (cstr_eq(f, "s")) return NANOARROW_TYPE_INT16;
+    if (cstr_eq(f, "i")) return NANOARROW_TYPE_INT32;
+    if (cstr_eq(f, "l")) return NANOARROW_TYPE_INT64;
+    if (cstr_eq(f, "C")) return NANOARROW_TYPE_UINT8;
+    if (cstr_eq(f, "S")) return NANOARROW_TYPE_UINT16;
+    if (cstr_eq(f, "I")) return NANOARROW_TYPE_UINT32;
+    if (cstr_eq(f, "L")) return NANOARROW_TYPE_UINT64;
+    if (cstr_eq(f, "f")) return NANOARROW_TYPE_FLOAT;
+    if (cstr_eq(f, "g")) return NANOARROW_TYPE_DOUBLE;
+    if (cstr_eq(f, "u")) return NANOARROW_TYPE_STRING;
+    if (cstr_eq(f, "z")) return NANOARROW_TYPE_BINARY;
+    if (cstr_eq(f, "U")) return NANOARROW_TYPE_LARGE_STRING;
+    if (cstr_eq(f, "Z")) return NANOARROW_TYPE_LARGE_BINARY;
+    return NANOARROW_TYPE_NA;
+}
+
+// Builds an all-null array of `length` rows whose type comes from
+// `child_schema` itself, so the column always agrees with the locked schema,
+// including formats type_from_format() does not recognise. Returns 0 on
+// success and -1 on failure.
+int build_null_array(const ArrowSchema *child_schema, int64_t length,
+                     ArrowArray *out) {
+    if (!child_schema) return -1;
+    ArrowError err;
+    ArrowErrorInit(&err);
+    if (ArrowArrayInitFromSchema(out, child_schema, &err) != NANOARROW_OK) {
+        return -1;
+    }
+    if (ArrowArrayStartAppending(out) != NANOARROW_OK) return -1;
+    if (ArrowArrayAppendNull(out, length) != NANOARROW_OK) return -1;
+    if (ArrowArrayFinishBuildingDefault(out, &err) != NANOARROW_OK) return -1;
+    return 0;
+}
+
+// Appends `in` to `out`, escaped for use inside a JSON string literal:
+// quotes, backslashes and control characters below 0x20 are escaped; every
+// other byte is copied through unchanged.
+void json_escape(std::string_view in, std::string &out) {
+    for (char c : in) {
+        switch (c) {
+            case '"':
+                out.append("\\\"");
+                break;
+            case '\\':
+                out.append("\\\\");
+                break;
+            case '\n':
+                out.append("\\n");
+                break;
+            case '\r':
+                out.append("\\r");
+                break;
+            case '\t':
+                out.append("\\t");
+                break;
+            default:
+                if (static_cast<unsigned char>(c) < 0x20) {
+                    char buf[8];
+                    std::snprintf(
+                        buf, sizeof(buf), "\\u%04x",
+                        static_cast<int>(static_cast<unsigned char>(c)));
+                    out.append(buf);
+                } else {
+                    out.push_back(c);
+                }
+        }
+    }
+}
+
+// One surprise input column prepared for JSON encoding. Its array view is
+// bound once per batch so the per-row encoder does not re-initialise and
+// re-validate a view for every value.
+struct ExtraColumn {
+    const char *name = nullptr;
+    ArrowType type = NANOARROW_TYPE_NA;
+    // False when the view could not be bound to the array; every value of
+    // such a column is rendered as null.
+    bool readable = false;
+    nanoarrow::UniqueArrayView view;
+};
+
+// Appends the JSON encoding of row `row` of `col` to `out`. Nulls,
+// unreadable columns, unsupported types and non-finite floats (which JSON
+// cannot represent) are all written as null.
+void append_json_scalar(const ExtraColumn &col, int64_t row,
+                        std::string &out) {
+    const ArrowArrayView *view = col.view.get();
+    if (!col.readable || ArrowArrayViewIsNull(view, row)) {
+        out.append("null");
+        return;
+    }
+    switch (col.type) {
+        case NANOARROW_TYPE_BOOL:
+            out.append(ArrowArrayViewGetIntUnsafe(view, row) ? "true"
+                                                             : "false");
+            break;
+        case NANOARROW_TYPE_INT8:
+        case NANOARROW_TYPE_INT16:
+        case NANOARROW_TYPE_INT32:
+        case NANOARROW_TYPE_INT64: {
+            char buf[32];
+            std::snprintf(
+                buf, sizeof(buf), "%lld",
+                static_cast<long long>(ArrowArrayViewGetIntUnsafe(view, row)));
+            out.append(buf);
+            break;
+        }
+        case NANOARROW_TYPE_UINT8:
+        case NANOARROW_TYPE_UINT16:
+        case NANOARROW_TYPE_UINT32:
+        case NANOARROW_TYPE_UINT64: {
+            char buf[32];
+            std::snprintf(buf, sizeof(buf), "%llu",
+                          static_cast<unsigned long long>(
+                              ArrowArrayViewGetUIntUnsafe(view, row)));
+            out.append(buf);
+            break;
+        }
+        case NANOARROW_TYPE_FLOAT:
+        case NANOARROW_TYPE_DOUBLE: {
+            const double value = ArrowArrayViewGetDoubleUnsafe(view, row);
+            if (!std::isfinite(value)) {
+                out.append("null");
+                break;
+            }
+            // 9 / 17 significant digits round-trip a float / double exactly.
+            const int digits = col.type == NANOARROW_TYPE_FLOAT ? 9 : 17;
+            char buf[32];
+            std::snprintf(buf, sizeof(buf), "%.*g", digits, value);
+            out.append(buf);
+            break;
+        }
+        case NANOARROW_TYPE_STRING:
+        case NANOARROW_TYPE_LARGE_STRING: {
+            auto sv = ArrowArrayViewGetStringUnsafe(view, row);
+            out.push_back('"');
+            json_escape(std::string_view(sv.data, sv.size_bytes), out);
+            out.push_back('"');
+            break;
+        }
+        default:
+            out.append("null");
+    }
+}
+
+}  // namespace
+
+SchemaReconciler::SchemaReconciler() = default;
+
+// Adds each named child of `incoming` that is not yet known to the union,
+// keeping first-seen order; the reserved _extra name is skipped. Returns
+// true when at least one column was added. Returns false without changing
+// anything once finalized or when `incoming` is null. A failed deep copy
+// stops the merge early and records last_error().
+bool SchemaReconciler::merge(const ArrowSchema *incoming) {
+    if (finalized_ || !incoming) return false;
+    bool added = false;
+    for (int64_t i = 0; i < incoming->n_children; ++i) {
+        const ArrowSchema *child = incoming->children[i];
+        if (!child || !child->name) continue;
+        std::string name(child->name);
+        if (name == EXTRA_COLUMN_NAME) continue;  // reserved
+        if (name_to_idx_.count(name)) continue;
+        nanoarrow::UniqueSchema copy;
+        if (ArrowSchemaDeepCopy(child, copy.get()) != NANOARROW_OK) {
+            last_error_ = "schema deep-copy failed while merging";
+            return added;
+        }
+        int64_t idx = static_cast<int64_t>(names_.size());
+        names_.push_back(name);
+        child_schemas_.push_back(std::move(copy));
+        name_to_idx_.emplace(std::move(name), idx);
+        added = true;
+    }
+    return added;
+}
+
+// Locks the union schema: a struct holding every merged column followed by
+// the string-typed _extra column. A no-op returning 0 once it has succeeded.
+// Returns -1 with last_error() set on failure.
+int SchemaReconciler::finalize() {
+    if (finalized_) return 0;
+    const size_t n_known = child_schemas_.size();
+    // Assemble into a scratch schema so a failure part-way leaves
+    // locked_schema_ released and finalize() can simply be called again.
+    nanoarrow::UniqueSchema scratch;
+    ArrowSchemaInit(scratch.get());
+    if (ArrowSchemaSetTypeStruct(scratch.get(),
+                                 static_cast<int64_t>(n_known) + 1) !=
+        NANOARROW_OK) {
+        last_error_ = "failed to initialize union struct schema";
+        return -1;
+    }
+    for (size_t i = 0; i < n_known; ++i) {
+        nanoarrow::UniqueSchema tmp;
+        if (ArrowSchemaDeepCopy(child_schemas_[i].get(), tmp.get()) !=
+            NANOARROW_OK) {
+            last_error_ = "failed to deep-copy union child";
+            return -1;
+        }
+        ArrowSchemaMove(tmp.get(), scratch->children[i]);
+    }
+    ArrowSchema *extra = scratch->children[n_known];
+    if (ArrowSchemaSetType(extra, NANOARROW_TYPE_STRING) != NANOARROW_OK) {
+        last_error_ = "failed to set _extra column type";
+        return -1;
+    }
+    if (ArrowSchemaSetName(extra, EXTRA_COLUMN_NAME) != NANOARROW_OK) {
+        last_error_ = "failed to name _extra column";
+        return -1;
+    }
+    ArrowSchemaMove(scratch.get(), locked_schema_.get());
+    finalized_ = true;
+    return 0;
+}
+
+// Deep-copies the locked schema into `out`. Requires finalize(); returns 0 on
+// success and -1 with last_error() set otherwise.
+int SchemaReconciler::copy_schema(ArrowSchema *out) const {
+    if (!finalized_) {
+        last_error_ = "copy_schema called before finalize";
+        return -1;
+    }
+    nanoarrow::UniqueSchema tmp;
+    if (ArrowSchemaDeepCopy(locked_schema_.get(), tmp.get()) != NANOARROW_OK) {
+        last_error_ = "failed to deep-copy locked schema";
+        return -1;
+    }
+    ArrowSchemaMove(tmp.get(), out);
+    return 0;
+}
+
+// Rewrites one input batch to the locked schema. Known columns present in the
+// input are moved (not copied) into `out`, leaving the matching children of
+// `in_array` released; known columns missing from the input become all-null
+// columns; columns unknown to the union are JSON-encoded per row into _extra.
+// Returns 0 on success and -1 on failure (last_error() is set except for
+// null arguments).
+// NOTE(review): a known column is moved without comparing its type against
+// the locked schema, and the input struct's own validity bitmap is not
+// carried over (null_count is forced to 0) - confirm producers rely on
+// neither.
+int SchemaReconciler::reconcile(const ArrowSchema *in_schema,
+                                ArrowArray *in_array, ArrowArray *out) const {
+    if (!finalized_) {
+        last_error_ = "reconcile called before finalize";
+        return -1;
+    }
+    if (!in_schema || !in_array || !out) return -1;
+    // The schema drives every index into in_array->children below.
+    if (in_array->n_children != in_schema->n_children) {
+        last_error_ = "input array and schema disagree on child count";
+        return -1;
+    }
+    // Children are re-parented under a struct with offset 0, so a sliced
+    // input would silently expose the wrong rows.
+    if (in_array->offset != 0) {
+        last_error_ = "input array with non-zero offset is not supported";
+        return -1;
+    }
+
+    int64_t num_rows = in_array->length;
+
+    // Initialize out as a struct matching the locked schema. This allocates
+    // children of the right types; we'll populate them below.
+    ArrowError err;
+    ArrowErrorInit(&err);
+    if (ArrowArrayInitFromSchema(out, locked_schema_.get(), &err) !=
+        NANOARROW_OK) {
+        last_error_ = "ArrowArrayInitFromSchema failed for reconciled array";
+        return -1;
+    }
+
+    // Build: input-name -> input-child-index
+    std::unordered_map<std::string, int64_t> in_idx;
+    in_idx.reserve(static_cast<size_t>(in_schema->n_children));
+    for (int64_t i = 0; i < in_schema->n_children; ++i) {
+        const ArrowSchema *c = in_schema->children[i];
+        if (c && c->name) in_idx.emplace(c->name, i);
+    }
+
+    // For each known union column (all except the final _extra), take it from
+    // the input batch when present; otherwise null-pad.
+    int64_t n_known = num_known_columns();
+    for (int64_t i = 0; i < n_known; ++i) {
+        const std::string &name = names_[static_cast<size_t>(i)];
+        ArrowArray *slot = out->children[i];
+        // The placeholder child is replaced on both paths.
+        if (slot->release) slot->release(slot);
+        auto it = in_idx.find(name);
+        ArrowArray *src =
+            it != in_idx.end() ? in_array->children[it->second] : nullptr;
+        if (src && src->release) {
+            // Zero copy: the input child is left released by the move.
+            ArrowArrayMove(src, slot);
+        } else if (build_null_array(locked_schema_->children[i], num_rows,
+                                    slot) != 0) {
+            last_error_ = "failed to build null column for missing field";
+            return -1;
+        }
+    }
+
+    // Find input children whose names aren't in the union: these feed _extra.
+    std::vector<int64_t> unknown_in;
+    for (int64_t i = 0; i < in_schema->n_children; ++i) {
+        const ArrowSchema *c = in_schema->children[i];
+        if (!c || !c->name) continue;
+        if (!name_to_idx_.count(c->name)) unknown_in.push_back(i);
+    }
+
+    // Build the _extra column. Fast path: no unknowns -> all nulls.
+    ArrowArray *extra_slot = out->children[n_known];
+    if (extra_slot->release) extra_slot->release(extra_slot);
+    if (unknown_in.empty()) {
+        if (ArrowArrayInitFromType(extra_slot, NANOARROW_TYPE_STRING) !=
+            NANOARROW_OK) {
+            last_error_ = "failed to init null _extra column";
+            return -1;
+        }
+        if (ArrowArrayStartAppending(extra_slot) != NANOARROW_OK ||
+            ArrowArrayAppendNull(extra_slot, num_rows) != NANOARROW_OK ||
+            ArrowArrayFinishBuildingDefault(extra_slot, &err) != NANOARROW_OK) {
+            last_error_ = "failed to append nulls to _extra";
+            return -1;
+        }
+    } else {
+        // Slow path: JSON-encode unknown fields per row.
+        if (ArrowArrayInitFromType(extra_slot, NANOARROW_TYPE_STRING) !=
+            NANOARROW_OK) {
+            last_error_ = "failed to init string _extra column";
+            return -1;
+        }
+        if (ArrowArrayStartAppending(extra_slot) != NANOARROW_OK) {
+            last_error_ = "failed to start appending to _extra";
+            return -1;
+        }
+        // Bind each unknown column to an array view once, not once per row.
+        std::vector<ExtraColumn> extras;
+        extras.reserve(unknown_in.size());
+        for (int64_t u : unknown_in) {
+            const ArrowSchema *cs = in_schema->children[u];
+            const ArrowArray *ca = in_array->children[u];
+            if (!cs || !ca || !cs->name) continue;
+            ExtraColumn &col = extras.emplace_back();
+            col.name = cs->name;
+            col.type = type_from_format(cs);
+            ArrowArrayViewInitFromType(col.view.get(), col.type);
+            col.readable = ArrowArrayViewSetArray(col.view.get(), ca, &err) ==
+                           NANOARROW_OK;
+        }
+        std::string buf;
+        for (int64_t row = 0; row < num_rows; ++row) {
+            buf.clear();
+            buf.push_back('{');
+            bool first = true;
+            for (const ExtraColumn &col : extras) {
+                if (!first) buf.push_back(',');
+                first = false;
+                buf.push_back('"');
+                json_escape(col.name, buf);
+                buf.append("\":");
+                append_json_scalar(col, row, buf);
+            }
+            buf.push_back('}');
+            ArrowStringView sv{buf.data(), static_cast<int64_t>(buf.size())};
+            if (ArrowArrayAppendString(extra_slot, sv) != NANOARROW_OK) {
+                last_error_ = "failed to append _extra row";
+                return -1;
+            }
+        }
+        if (ArrowArrayFinishBuildingDefault(extra_slot, &err) != NANOARROW_OK) {
+            last_error_ = "failed to finish _extra column";
+            return -1;
+        }
+    }
+
+    out->length = num_rows;
+    out->null_count = 0;
+    return 0;
+}
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
diff --git a/src/dftracer/utils/python/schema_reconcile.h b/src/dftracer/utils/python/schema_reconcile.h
new file mode 100644
index 00000000..452ac507
--- /dev/null
+++ b/src/dftracer/utils/python/schema_reconcile.h
@@ -0,0 +1,49 @@
+#ifndef DFTRACER_UTILS_PYTHON_SCHEMA_RECONCILE_H
+#define DFTRACER_UTILS_PYTHON_SCHEMA_RECONCILE_H
+
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+#include <nanoarrow/nanoarrow.h>
+
+#include <nanoarrow/nanoarrow.hpp>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace dftracer::utils::python {
+
+// Build a union schema over batches from producers that each emit a subset
+// of columns; after finalize(), surprise columns are JSON-encoded into
+// _extra so no data is lost and the stream schema stays stable.
+class SchemaReconciler {
+   public:
+    // Name of the overflow column that carries JSON-encoded unknown fields.
+    static constexpr const char *EXTRA_COLUMN_NAME = "_extra";
+    SchemaReconciler();
+    // NOTE(review): ints look like 0 = ok, -1 = see last_error(); confirm.
+    bool merge(const ArrowSchema *incoming);
+    int finalize();
+    int copy_schema(ArrowSchema *out) const;
+    int reconcile(const ArrowSchema *in_schema, ArrowArray *in_array,
+                  ArrowArray *out) const;
+
+    bool finalized() const { return finalized_; }
+    int64_t num_known_columns() const {
+        return static_cast<int64_t>(child_schemas_.size());
+    }
+    const std::string &last_error() const { return last_error_; }
+
+   private:
+    std::vector<std::string> names_;
+    std::vector<nanoarrow::UniqueSchema> child_schemas_;
+    std::unordered_map<std::string, int64_t> name_to_idx_;
+    nanoarrow::UniqueSchema locked_schema_;
+    bool finalized_ = false;
+    mutable std::string last_error_;  // mutable so const methods can set it
+};
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
+#endif  // DFTRACER_UTILS_PYTHON_SCHEMA_RECONCILE_H
diff --git a/src/dftracer/utils/python/sst_distribution.cpp b/src/dftracer/utils/python/sst_distribution.cpp
new file mode 100644
index 00000000..dc80b43c
--- /dev/null
+++ b/src/dftracer/utils/python/sst_distribution.cpp
@@ -0,0 +1,1182 @@
+#include <dftracer/utils/core/runtime.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/python/runtime.h>
+#include <dftracer/utils/python/sst_distribution.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_key.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h>
+#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
+#include <dftracer/utils/utilities/indexer/file_partition.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+#include <dftracer/utils/utilities/indexer/index_builder_utility.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <new>
+#include <optional>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+using dftracer::utils::Runtime;
+using dftracer::utils::utilities::filesystem::FileEntry;
+using dftracer::utils::utilities::filesystem::PatternDirectoryScannerUtility;
+using dftracer::utils::utilities::filesystem::
+    PatternDirectoryScannerUtilityInput;
+using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBatchSink;
+using dftracer::utils::utilities::indexer::IndexBuildBatchConfig;
+using dftracer::utils::utilities::indexer::IndexBuildBatchResult;
+using dftracer::utils::utilities::indexer::IndexDatabaseSstWriterContext;
+using dftracer::utils::utilities::indexer::plan_lpt_partition;
+using dftracer::utils::utilities::indexer::SstArtifactRegistry;
+using dftracer::utils::utilities::indexer::internal::
+    enumerate_gzip_member_candidates;
+using dftracer::utils::utilities::indexer::internal::GzipMember;
+
+// ---------------------------------------------------------------------------
+// SstArtifactRegistry type
+// ---------------------------------------------------------------------------
+
+typedef struct {  // layout; `registry` is built in SstArtifactRegistry_new
+    PyObject_HEAD std::shared_ptr<SstArtifactRegistry> registry;
+} SstArtifactRegistryObject;  // torn down in SstArtifactRegistry_dealloc
+
+static void SstArtifactRegistry_dealloc(SstArtifactRegistryObject *self) {
+    std::destroy_at(&self->registry);  // pairs with placement new in tp_new
+    Py_TYPE(self)->tp_free(reinterpret_cast<PyObject *>(self));
+}
+
+static PyObject *SstArtifactRegistry_new(PyTypeObject *type,
+                                         PyObject * /*args*/,
+                                         PyObject * /*kwds*/) {
+    PyObject *raw = type->tp_alloc(type, 0);  // storage; member built below
+    if (raw == nullptr) return nullptr;
+    auto *obj = reinterpret_cast<SstArtifactRegistryObject *>(raw);
+    ::new (&obj->registry) auto(std::make_shared<SstArtifactRegistry>());
+    return raw;
+}
+
+namespace {
+
+// Keys of the artifacts dict exchanged with Python: produced by
+// build_sst_batch and consumed by SstArtifactRegistry.append. Must match
+// the field names on IndexDatabaseSstWriterContext::Artifacts.
+constexpr const char *ARTIFACT_FIELDS[] = {
+    "metadata_sst",        "checkpoints_sst",         "manifest_sst",
+    "chunk_bloom_sst",     "file_bloom_sst",          "chunk_stats_sst",
+    "chunk_dim_stats_sst", "dimensions_sst",          "file_scalar_stats_sst",
+    "file_cat_counts_sst", "file_pid_tid_counts_sst", "file_name_counts_sst",
+    "name_dictionary_sst", "name_file_postings_sst",  "name_chunk_postings_sst",
+    "hash_tables_sst",     "aggregation_sst",         "system_metrics_sst",
+};
+
+/// Map a slot name to the matching Artifacts member, or nullptr when the
+/// name is not recognized. Kept in one place so that adding a new CF
+/// requires updating only `ARTIFACT_FIELDS` plus `dispatch_*` below.
+std::optional<std::string> *artifacts_slot(
+    IndexDatabaseSstWriterContext::Artifacts &a, std::string_view name) {
+    if (name == "metadata_sst") return &a.metadata_sst;
+    if (name == "checkpoints_sst") return &a.checkpoints_sst;
+    if (name == "manifest_sst") return &a.manifest_sst;
+    if (name == "chunk_bloom_sst") return &a.chunk_bloom_sst;
+    if (name == "file_bloom_sst") return &a.file_bloom_sst;
+    if (name == "chunk_stats_sst") return &a.chunk_stats_sst;
+    if (name == "chunk_dim_stats_sst") return &a.chunk_dim_stats_sst;
+    if (name == "dimensions_sst") return &a.dimensions_sst;
+    if (name == "file_scalar_stats_sst") return &a.file_scalar_stats_sst;
+    if (name == "file_cat_counts_sst") return &a.file_cat_counts_sst;
+    if (name == "file_pid_tid_counts_sst") return &a.file_pid_tid_counts_sst;
+    if (name == "file_name_counts_sst") return &a.file_name_counts_sst;
+    if (name == "name_dictionary_sst") return &a.name_dictionary_sst;
+    if (name == "name_file_postings_sst") return &a.name_file_postings_sst;
+    if (name == "name_chunk_postings_sst") return &a.name_chunk_postings_sst;
+    if (name == "hash_tables_sst") return &a.hash_tables_sst;
+    if (name == "aggregation_sst") return &a.aggregation_sst;
+    if (name == "system_metrics_sst") return &a.system_metrics_sst;
+    return nullptr;  // no Artifacts member carries this name
+}
+
+/// Populate `out` from a Python artifacts dict. Keys that are absent, None,
+/// or "" leave the slot untouched; a non-dict argument or a non-str value
+/// sets TypeError and returns false.
+bool artifacts_from_dict(PyObject *dict,
+                         IndexDatabaseSstWriterContext::Artifacts *out) {
+    if (!PyDict_Check(dict)) {
+        PyErr_SetString(PyExc_TypeError, "artifacts must be a dict");
+        return false;
+    }
+    for (const char *key : ARTIFACT_FIELDS) {
+        PyObject *item = PyDict_GetItemString(dict, key);  // borrowed ref
+        const bool absent = (item == nullptr) || (item == Py_None);
+        if (absent) continue;
+        if (!PyUnicode_Check(item)) {
+            PyErr_Format(PyExc_TypeError, "artifacts['%s'] must be str or None",
+                         key);
+            return false;
+        }
+        const char *utf8 = PyUnicode_AsUTF8(item);
+        if (utf8 == nullptr) return false;  // conversion failed, error is set
+        if (*utf8 == '\0') continue;
+        if (auto *slot = artifacts_slot(*out, key)) slot->emplace(utf8);
+    }
+    return true;
+}
+
+/// Export `a` as a new Python dict keyed by `ARTIFACT_FIELDS`, in table
+/// order. Populated slots become `str`, empty slots become `None`, so every
+/// key is always present. Returns NULL with a Python error set on failure.
+///
+/// The keys are driven by `ARTIFACT_FIELDS` / `artifacts_slot` rather than
+/// by a third hand-written copy of the field list, so this export cannot
+/// drift from what `artifacts_from_dict` accepts.
+PyObject *artifacts_to_dict(const IndexDatabaseSstWriterContext::Artifacts &a) {
+    // artifacts_slot only accepts a mutable Artifacts; the pointers it
+    // returns are used strictly for reading, so dropping const is safe.
+    auto &lookup = const_cast<IndexDatabaseSstWriterContext::Artifacts &>(a);
+
+    PyObject *dict = PyDict_New();
+    if (!dict) return NULL;
+
+    for (const char *field : ARTIFACT_FIELDS) {
+        const std::optional<std::string> *slot = artifacts_slot(lookup, field);
+        PyObject *value = nullptr;
+        if (slot && slot->has_value()) {
+            value = PyUnicode_FromString((*slot)->c_str());
+        } else {
+            Py_INCREF(Py_None);
+            value = Py_None;
+        }
+        // PyDict_SetItemString does not steal `value`, so release our
+        // reference whether or not the insertion worked.
+        const int rc = value ? PyDict_SetItemString(dict, field, value) : -1;
+        Py_XDECREF(value);
+        if (rc != 0) {
+            Py_DECREF(dict);
+            return NULL;
+        }
+    }
+    return dict;
+}
+
+}  // namespace
+
+static PyObject *SstArtifactRegistry_append(SstArtifactRegistryObject *self,
+                                            PyObject *args) {
+    PyObject *payload = nullptr;
+    if (PyArg_ParseTuple(args, "O", &payload) == 0) return nullptr;
+    IndexDatabaseSstWriterContext::Artifacts parsed;  // filled from payload
+    if (!artifacts_from_dict(payload, &parsed)) return nullptr;
+    self->registry->append(std::move(parsed));
+    Py_RETURN_NONE;
+}
+
+static PyMethodDef SstArtifactRegistry_methods[] = {
+    {"append", (PyCFunction)SstArtifactRegistry_append, METH_VARARGS,
+     "append(artifacts_dict) -> None\n"
+     "Add a per-batch Artifacts dict (as returned by build_sst_batch or "
+     "IndexDatabaseSstWriterContext.commit) to the registry."},
+    {NULL}};  // sentinel: marks the end of the method table
+
+static PyTypeObject SstArtifactRegistryType = {  // slots in struct order
+    PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext.SstArtifactRegistry",
+    sizeof(SstArtifactRegistryObject),        // tp_basicsize
+    0,                                        // tp_itemsize
+    (destructor)SstArtifactRegistry_dealloc,  // tp_dealloc
+    0,  // tp_vectorcall_offset (tp_print before Python 3.8)
+    0,  // tp_getattr
+    0,  // tp_setattr
+    0,  // tp_as_async
+    0,  // tp_repr
+    0,  // tp_as_number
+    0,  // tp_as_sequence
+    0,  // tp_as_mapping
+    0,  // tp_hash
+    0,  // tp_call
+    0,  // tp_str
+    0,  // tp_getattro
+    0,  // tp_setattro
+    0,  // tp_as_buffer
+    Py_TPFLAGS_DEFAULT,  // tp_flags
+    "Thread-safe collector for SST artifact paths.",  // tp_doc
+    0,  // tp_traverse
+    0,  // tp_clear
+    0,  // tp_richcompare
+    0,  // tp_weaklistoffset
+    0,  // tp_iter
+    0,  // tp_iternext
+    SstArtifactRegistry_methods,  // tp_methods
+    0,  // tp_members
+    0,  // tp_getset
+    0,  // tp_base
+    0,  // tp_dict
+    0,  // tp_descr_get
+    0,  // tp_descr_set
+    0,  // tp_dictoffset
+    0,  // tp_init
+    0,  // tp_alloc
+    SstArtifactRegistry_new,  // tp_new
+};
+
+SstArtifactRegistry *sst_artifact_registry_get(PyObject *obj) {  // non-owning
+    if (PyObject_TypeCheck(obj, &SstArtifactRegistryType) == 0) return nullptr;
+    return reinterpret_cast<SstArtifactRegistryObject *>(obj)->registry.get();
+}
+
+// ---------------------------------------------------------------------------
+// scan_files: parallel directory scan -> list of (path, size) tuples
+// ---------------------------------------------------------------------------
+static PyObject *scan_files_fn(PyObject * /*self*/, PyObject *args,
+                               PyObject *kwds) {
+    static const char *kwlist[] = {"directory", "patterns", "recursive",
+                                   "runtime", NULL};
+    const char *directory;
+    PyObject *patterns_obj = NULL;
+    int recursive = 0;
+    PyObject *runtime_arg = NULL;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|OpO", (char **)kwlist,
+                                     &directory, &patterns_obj, &recursive,
+                                     &runtime_arg)) {
+        return NULL;
+    }
+
+    std::vector<std::string> patterns;
+    if (patterns_obj && patterns_obj != Py_None) {
+        PyObject *seq =
+            PySequence_Fast(patterns_obj, "patterns must be a sequence");
+        if (!seq) return NULL;
+        Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+        patterns.reserve(n);
+        for (Py_ssize_t i = 0; i < n; ++i) {
+            const char *s = PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i));
+            if (!s) {
+                Py_DECREF(seq);
+                return NULL;
+            }
+            patterns.emplace_back(s);
+        }
+        Py_DECREF(seq);
+    }
+
+    // runtime: native Runtime, wrapper exposing `_native`, or None (default).
+    Runtime *rt = nullptr;
+    if (!runtime_arg || runtime_arg == Py_None) {
+        rt = get_default_runtime();
+    } else if (PyObject_TypeCheck(runtime_arg, &RuntimeType)) {
+        rt = ((RuntimeObject *)runtime_arg)->runtime.get();
+    } else {
+        PyObject *native = PyObject_GetAttrString(runtime_arg, "_native");
+        if (!native || !PyObject_TypeCheck(native, &RuntimeType)) {
+            Py_XDECREF(native);
+            PyErr_SetString(PyExc_TypeError,
+                            "runtime must be a Runtime instance or None");
+            return NULL;
+        }
+        rt = ((RuntimeObject *)native)->runtime.get();
+        Py_DECREF(native);
+    }
+    PatternDirectoryScannerUtilityInput input(directory, patterns,
+                                              recursive != 0, true);
+    std::vector<FileEntry> entries;
+    std::optional<std::string> scan_error;
+    // Catch inside the ALLOW_THREADS pair: a throw that skipped
+    // Py_END_ALLOW_THREADS would reach PyErr_SetString without the GIL.
+    Py_BEGIN_ALLOW_THREADS try {
+        rt->submit(dftracer::utils::run_coro_scope(
+                       rt->executor(),
+                       [](dftracer::utils::CoroScope &scope,
+                          PatternDirectoryScannerUtilityInput in,
+                          std::vector<FileEntry> *out)
+                           -> dftracer::utils::coro::CoroTask<void> {
+                           PatternDirectoryScannerUtility scanner;
+                           *out = co_await scope.spawn(scanner, in);
+                       },
+                       std::move(input), &entries),
+                   "scan-files")
+            .get();
+    } catch (const std::exception &e) {
+        scan_error.emplace(e.what());
+    }
+    Py_END_ALLOW_THREADS if (scan_error) {
+        PyErr_SetString(PyExc_RuntimeError, scan_error->c_str());
+        return NULL;
+    }
+    PyObject *out = PyList_New(static_cast<Py_ssize_t>(entries.size()));
+    if (!out) return NULL;
+    for (std::size_t i = 0; i < entries.size(); ++i) {
+        PyObject *t = Py_BuildValue("(sn)", entries[i].path.c_str(),
+                                    (Py_ssize_t)entries[i].size);
+        if (!t) {
+            Py_DECREF(out);
+            return NULL;
+        }
+        PyList_SET_ITEM(out, i, t);
+    }
+    return out;
+}
+
+// ---------------------------------------------------------------------------
+// plan_lpt_partition: LPT bin-packing of (path, size) pairs
+// ---------------------------------------------------------------------------
+static PyObject *plan_lpt_partition_fn(PyObject * /*self*/, PyObject *args) {
+    PyObject *entries_obj;
+    Py_ssize_t num_workers;
+    if (!PyArg_ParseTuple(args, "On", &entries_obj, &num_workers)) return NULL;
+    if (num_workers <= 0) num_workers = 1;
+    const char *const entry_shape_msg =
+        "entries must be a sequence of (path, size) tuples";
+    std::vector<FileEntry> entries;
+    PyObject *seq = PySequence_Fast(entries_obj, entry_shape_msg);
+    if (!seq) return NULL;
+    Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+    entries.reserve(n);
+    for (Py_ssize_t i = 0; i < n; ++i) {
+        PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
+        const char *path = nullptr;
+        Py_ssize_t size = 0;
+        if (!PyTuple_Check(item)) {  // ParseTuple would raise SystemError
+            PyErr_SetString(PyExc_TypeError, entry_shape_msg);
+        } else if (PyArg_ParseTuple(item, "sn", &path, &size) && size < 0) {
+            PyErr_SetString(PyExc_ValueError, "entry size must be >= 0");
+        }
+        if (PyErr_Occurred()) {  // set by either branch or by ParseTuple
+            Py_DECREF(seq);
+            return NULL;
+        }
+        FileEntry &fe = entries.emplace_back();
+        fe.path = path;
+        fe.size = static_cast<std::size_t>(size);
+        fe.is_regular_file = true;
+    }
+    Py_DECREF(seq);
+
+    auto buckets = plan_lpt_partition(std::move(entries),
+                                      static_cast<std::size_t>(num_workers));
+    PyObject *out = PyList_New(static_cast<Py_ssize_t>(buckets.size()));
+    if (!out) return NULL;
+    for (std::size_t i = 0; i < buckets.size(); ++i) {
+        PyObject *lst = PyList_New(static_cast<Py_ssize_t>(buckets[i].size()));
+        if (!lst) {
+            Py_DECREF(out);
+            return NULL;
+        }
+        PyList_SET_ITEM(out, i, lst);  // `out` now owns `lst`
+        for (std::size_t j = 0; j < buckets[i].size(); ++j) {
+            PyObject *t = Py_BuildValue("(sn)", buckets[i][j].path.c_str(),
+                                        (Py_ssize_t)buckets[i][j].size);
+            if (!t) {
+                Py_DECREF(out);
+                return NULL;
+            }
+            PyList_SET_ITEM(lst, j, t);
+        }
+    }
+    return out;
+}
+
+// ---------------------------------------------------------------------------
+// build_sst_batch: run the indexer pipeline with an SST sink and return
+// the merged Artifacts dict.
+// ---------------------------------------------------------------------------
+
+static PyObject *build_sst_batch_fn(PyObject * /*self*/, PyObject *args,
+                                    PyObject *kwds) {
+    static const char *kwlist[] = {"files",
+                                   "file_ids",
+                                   "staging_dir",
+                                   "batch_id",
+                                   "index_dir",
+                                   "checkpoint_size",
+                                   "build_manifest",
+                                   "force_rebuild",
+                                   "bloom_dimensions",
+                                   "parallelism",
+                                   "flush_every_files",
+                                   "runtime",
+                                   "aggregation_config",
+                                   "file_slices",
+                                   NULL};
+    PyObject *files_obj;
+    PyObject *file_ids_obj;
+    const char *staging_dir;
+    const char *batch_id;
+    const char *index_dir = "";
+    Py_ssize_t checkpoint_size = 32 * 1024 * 1024;
+    int build_manifest = 0;
+    int force_rebuild = 0;
+    PyObject *bloom_dims_obj = NULL;
+    Py_ssize_t parallelism = 0;
+    Py_ssize_t flush_every_files = 0;
+    PyObject *runtime_arg = NULL;
+    PyObject *aggregation_config_obj = NULL;
+    PyObject *file_slices_obj = NULL;
+
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwds, "OOss|snppOnnOOO", (char **)kwlist, &files_obj,
+            &file_ids_obj, &staging_dir, &batch_id, &index_dir,
+            &checkpoint_size, &build_manifest, &force_rebuild, &bloom_dims_obj,
+            &parallelism, &flush_every_files, &runtime_arg,
+            &aggregation_config_obj, &file_slices_obj)) {
+        return NULL;
+    }
+
+    // Unpack files.
+    std::vector<std::string> files;
+    {
+        PyObject *seq = PySequence_Fast(files_obj, "files must be a sequence");
+        if (!seq) return NULL;
+        Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+        files.reserve(n);
+        for (Py_ssize_t i = 0; i < n; ++i) {
+            const char *s = PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i));
+            if (!s) {
+                Py_DECREF(seq);
+                return NULL;
+            }
+            files.emplace_back(s);
+        }
+        Py_DECREF(seq);
+    }
+    if (files.empty()) {
+        return PyDict_New();
+    }
+
+    // Unpack file_ids, parallel to files.
+    std::vector<int> file_ids;
+    {
+        PyObject *seq =
+            PySequence_Fast(file_ids_obj, "file_ids must be a sequence");
+        if (!seq) return NULL;
+        Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+        if (static_cast<std::size_t>(n) != files.size()) {
+            Py_DECREF(seq);
+            PyErr_SetString(PyExc_ValueError,
+                            "file_ids must have the same length as files");
+            return NULL;
+        }
+        file_ids.reserve(n);
+        for (Py_ssize_t i = 0; i < n; ++i) {
+            long v = PyLong_AsLong(PySequence_Fast_GET_ITEM(seq, i));
+            if (v == -1 && PyErr_Occurred()) {
+                Py_DECREF(seq);
+                return NULL;
+            }
+            file_ids.push_back(static_cast<int>(v));
+        }
+        Py_DECREF(seq);
+    }
+
+    // Optional bloom dimensions override.
+    std::vector<std::string> bloom_dims;
+    if (bloom_dims_obj && bloom_dims_obj != Py_None) {
+        PyObject *seq = PySequence_Fast(bloom_dims_obj,
+                                        "bloom_dimensions must be a sequence");
+        if (!seq) return NULL;
+        Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+        bloom_dims.reserve(n);
+        for (Py_ssize_t i = 0; i < n; ++i) {
+            const char *s = PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i));
+            if (!s) {
+                Py_DECREF(seq);
+                return NULL;
+            }
+            bloom_dims.emplace_back(s);
+        }
+        Py_DECREF(seq);
+    }
+
+    // Resolve Runtime (matching CheckpointIndexer pattern).
+    Runtime *rt = nullptr;
+    if (runtime_arg && runtime_arg != Py_None) {
+        if (PyObject_TypeCheck(runtime_arg, &RuntimeType)) {
+            rt = ((RuntimeObject *)runtime_arg)->runtime.get();
+        } else {
+            PyObject *native = PyObject_GetAttrString(runtime_arg, "_native");
+            if (!native || !PyObject_TypeCheck(native, &RuntimeType)) {
+                Py_XDECREF(native);
+                PyErr_SetString(PyExc_TypeError,
+                                "runtime must be a Runtime instance or None");
+                return NULL;
+            }
+            rt = ((RuntimeObject *)native)->runtime.get();
+            Py_DECREF(native);
+        }
+    } else {
+        rt = get_default_runtime();
+    }
+
+    // Build config + sink factory shared state.
+    struct SharedArtifacts {
+        std::mutex mu;
+        std::vector<IndexDatabaseSstWriterContext::Artifacts> list;
+    };
+    auto artifacts = std::make_shared<SharedArtifacts>();
+    auto staging = std::string(staging_dir);
+    auto batch = std::string(batch_id);
+
+    // Optional aggregation config, extracted from the Python dataclass.
+    std::shared_ptr<dftracer::utils::utilities::composites::dft::aggregators::
+                        AggregationConfig>
+        agg_config_ptr;
+    if (aggregation_config_obj && aggregation_config_obj != Py_None) {
+        using dftracer::utils::utilities::composites::dft::aggregators::
+            AggregationConfig;
+        auto cfg = std::make_shared<AggregationConfig>();
+        auto pull_double = [&](const char *name, double fallback) -> double {
+            PyObject *v = PyObject_GetAttrString(aggregation_config_obj, name);
+            if (!v || v == Py_None) {
+                Py_XDECREF(v);
+                PyErr_Clear();
+                return fallback;
+            }
+            double out = PyFloat_AsDouble(v);
+            Py_DECREF(v);
+            if (out == -1.0 && PyErr_Occurred()) return fallback;
+            return out;
+        };
+        auto pull_bool = [&](const char *name, bool fallback) -> bool {
+            PyObject *v = PyObject_GetAttrString(aggregation_config_obj, name);
+            if (!v || v == Py_None) {
+                Py_XDECREF(v);
+                PyErr_Clear();
+                return fallback;
+            }
+            int out = PyObject_IsTrue(v);
+            Py_DECREF(v);
+            return out > 0 ? true : fallback;
+        };
+        auto pull_string_list =
+            [&](const char *name) -> std::vector<std::string> {
+            std::vector<std::string> out;
+            PyObject *v = PyObject_GetAttrString(aggregation_config_obj, name);
+            if (!v || v == Py_None) {
+                Py_XDECREF(v);
+                PyErr_Clear();
+                return out;
+            }
+            PyObject *seq = PySequence_Fast(v, "expected list of str");
+            Py_DECREF(v);
+            if (!seq) {
+                PyErr_Clear();
+                return out;
+            }
+            Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+            out.reserve(n);
+            for (Py_ssize_t i = 0; i < n; ++i) {
+                const char *s =
+                    PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i));
+                if (s) out.emplace_back(s);
+            }
+            Py_DECREF(seq);
+            return out;
+        };
+        double time_interval_ms = pull_double("time_interval_ms", 5000.0);
+        cfg->time_interval_us =
+            static_cast<std::uint64_t>(time_interval_ms * 1000.0);
+        cfg->compute_percentiles = pull_bool("compute_percentiles", false);
+        cfg->extra_group_keys = pull_string_list("group_keys");
+        cfg->custom_metric_fields = pull_string_list("custom_metric_fields");
+        agg_config_ptr = std::move(cfg);
+    }
+
+    // owned_member_maps must outlive rt->submit: FileSlice::members is raw.
+    std::vector<std::vector<GzipMember>> owned_member_maps;
+    std::vector<IndexBuildBatchConfig::FileSlice> parsed_slices;
+    if (file_slices_obj && file_slices_obj != Py_None) {
+        PyObject *seq =
+            PySequence_Fast(file_slices_obj, "file_slices must be a sequence");
+        if (!seq) return NULL;
+        Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+        if (static_cast<std::size_t>(n) != files.size()) {
+            Py_DECREF(seq);
+            PyErr_SetString(PyExc_ValueError,
+                            "file_slices must match files length");
+            return NULL;
+        }
+        owned_member_maps.resize(n);
+        parsed_slices.resize(n);
+        for (Py_ssize_t i = 0; i < n; ++i) {
+            PyObject *entry = PySequence_Fast_GET_ITEM(seq, i);
+            if (entry == Py_None) {
+                continue;  // leave slice default-constructed (members=null)
+            }
+            Py_ssize_t mb = 0, me = 0, ckpt_base = 0;
+            int skip_scoped = 0;
+            PyObject *members_obj = nullptr;
+            if (!PyArg_ParseTuple(entry, "nnnpO", &mb, &me, &ckpt_base,
+                                  &skip_scoped, &members_obj)) {
+                Py_DECREF(seq);
+                return NULL;
+            }
+            PyObject *mseq = PySequence_Fast(
+                members_obj, "file_slices[i].members must be a sequence");
+            if (!mseq) {
+                Py_DECREF(seq);
+                return NULL;
+            }
+            Py_ssize_t mn = PySequence_Fast_GET_SIZE(mseq);
+            auto &mv = owned_member_maps[i];
+            mv.resize(mn);
+            for (Py_ssize_t j = 0; j < mn; ++j) {
+                PyObject *m = PySequence_Fast_GET_ITEM(mseq, j);
+                unsigned long long c_offset = 0, c_size = 0;
+                if (!PyArg_ParseTuple(m, "KK", &c_offset, &c_size)) {
+                    Py_DECREF(mseq);
+                    Py_DECREF(seq);
+                    return NULL;
+                }
+                mv[j].c_offset = static_cast<std::uint64_t>(c_offset);
+                mv[j].c_size = static_cast<std::uint64_t>(c_size);
+            }
+            Py_DECREF(mseq);
+            parsed_slices[i].members = &mv;
+            parsed_slices[i].member_begin = static_cast<std::size_t>(mb);
+            parsed_slices[i].member_end = static_cast<std::size_t>(me);
+            parsed_slices[i].checkpoint_idx_base =
+                static_cast<std::uint64_t>(ckpt_base);
+            parsed_slices[i].skip_file_scoped_writes = skip_scoped != 0;
+        }
+        Py_DECREF(seq);
+    }
+
+    auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+    batch_config->file_paths = std::move(files);
+    batch_config->preassigned_file_ids = std::move(file_ids);
+    if (!parsed_slices.empty()) {
+        batch_config->file_slices = parsed_slices;
+    }
+    batch_config->index_dir = index_dir;
+    batch_config->checkpoint_size = static_cast<std::size_t>(checkpoint_size);
+    batch_config->build_manifest = build_manifest != 0;
+    batch_config->force_rebuild = force_rebuild != 0;
+    batch_config->bloom_dimensions = std::move(bloom_dims);
+    batch_config->parallelism =
+        parallelism > 0 ? static_cast<std::size_t>(parallelism)
+                        : (rt ? std::max<std::size_t>(rt->threads(), 1) : 1);
+    batch_config->flush_every_files =
+        static_cast<std::size_t>(flush_every_files);
+    batch_config->rebuild_root_summaries = false;
+
+    if (agg_config_ptr) {
+        auto agg_staging = staging;
+        auto agg_prefix = batch + "_agg";
+        // Counter keeps per-file SST dirs unique across duplicate file_paths
+        // when one worker owns multiple slices of the same file.
+        auto visitor_counter = std::make_shared<std::atomic<std::size_t>>(0);
+        batch_config->dft_visitor_factory =
+            [agg_staging, agg_prefix, agg_config_ptr,
+             visitor_counter](const std::string &file_path)
+            -> std::vector<std::unique_ptr<
+                dftracer::utils::utilities::composites::dft::DftEventVisitor>> {
+            using dftracer::utils::utilities::composites::dft::DftEventVisitor;
+            using dftracer::utils::utilities::composites::dft::aggregators::
+                AggregationVisitor;
+            const std::size_t idx =
+                visitor_counter->fetch_add(1, std::memory_order_relaxed);
+            std::string prefix = agg_prefix + "_" + std::to_string(idx);
+            std::vector<std::unique_ptr<DftEventVisitor>> visitors;
+            visitors.push_back(std::make_unique<AggregationVisitor>(
+                agg_staging, prefix, /*config_hash=*/0, *agg_config_ptr,
+                file_path));
+            return visitors;
+        };
+    }
+
+    // Atomic: write phase calls sink_factory from N coroutines concurrently.
+    auto batch_counter = std::make_shared<std::atomic<std::size_t>>(0);
+    batch_config->sink_factory =
+        [staging, batch, batch_counter]() -> std::unique_ptr<IndexBatchSink> {
+        const std::size_t idx =
+            batch_counter->fetch_add(1, std::memory_order_relaxed);
+        std::string sub_batch = batch + "_" + std::to_string(idx);
+        return std::make_unique<IndexDatabaseSstWriterContext>(staging,
+                                                               sub_batch);
+    };
+    batch_config->sink_commit = [artifacts](IndexBatchSink &sink) {
+        auto &sst = static_cast<IndexDatabaseSstWriterContext &>(sink);
+        auto batch_artifacts = sst.commit();
+        std::lock_guard<std::mutex> lock(artifacts->mu);
+        if (!batch_artifacts.empty()) {
+            artifacts->list.push_back(std::move(batch_artifacts));
+        }
+    };
+
+    IndexBuildBatchResult result;
+    std::string submit_error;
+    Py_BEGIN_ALLOW_THREADS try {
+        rt->submit(dftracer::utils::run_coro_scope(
+                       rt->executor(),
+                       [](dftracer::utils::CoroScope &scope,
+                          std::shared_ptr<IndexBuildBatchConfig> cfg,
+                          IndexBuildBatchResult *out)
+                           -> dftracer::utils::coro::CoroTask<void> {
+                           *out = co_await IndexBatchBuilderUtility::process(
+                               &scope, std::move(cfg));
+                       },
+                       batch_config, &result),
+                   "build-sst-batch")
+            .get();
+    } catch (const std::exception &e) {
+        submit_error = e.what();
+    }
+    Py_END_ALLOW_THREADS if (!submit_error.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, submit_error.c_str());
+        return NULL;
+    }
+
+    // If any file failed, surface the first error.
+    if (result.failed > 0) {
+        for (const auto &r : result.results) {
+            if (!r.success) {
+                PyErr_SetString(PyExc_RuntimeError, r.error_message.c_str());
+                return NULL;
+            }
+        }
+    }
+
+    // One dict per committed sink + per-file aggregation below.
+    PyObject *out_list = PyList_New(0);
+    if (!out_list) return NULL;
+    {
+        std::lock_guard<std::mutex> lock(artifacts->mu);
+        for (const auto &a : artifacts->list) {
+            PyObject *main_dict = artifacts_to_dict(a);
+            if (!main_dict || PyList_Append(out_list, main_dict) < 0) {
+                Py_XDECREF(main_dict);
+                Py_DECREF(out_list);
+                return NULL;
+            }
+            Py_DECREF(main_dict);
+        }
+    }
+    // Harvest per-file aggregation SSTs from extra visitors. Each visitor
+    // holds a vector of Artifacts (one per FLUSH_THRESHOLD flush + the
+    // file-complete flush) because SstFileWriter requires strictly
+    // ascending keys per SST and cross-flush merge operands would collide.
+    //
+    // `extra_visitors` is indexed per input file, but a single
+    // AggregationVisitor instance is typically shared across every file in
+    // the batch (one flush at end-of-batch). Without dedup we would emit
+    // that visitor's artifact dict N_files times, producing a manifest with
+    // N copies of the same SST path. Dedup by visitor pointer so each
+    // unique flush-sequence is emitted exactly once.
+    using dftracer::utils::utilities::composites::dft::aggregators::
+        AggregationVisitor;
+    using dftracer::utils::utilities::composites::dft::aggregators::
+        AssociationTracker;
+    std::unordered_set<AggregationVisitor *> seen_visitors;
+    for (auto &file_visitors : result.extra_visitors) {
+        for (auto &visitor : file_visitors) {
+            auto *agg = dynamic_cast<AggregationVisitor *>(visitor.get());
+            if (!agg) continue;
+            if (!seen_visitors.insert(agg).second) continue;
+            for (auto &a : agg->aggregation_artifacts()) {
+                if (a.empty()) continue;
+                PyObject *agg_dict = artifacts_to_dict(a);
+                if (!agg_dict || PyList_Append(out_list, agg_dict) < 0) {
+                    Py_XDECREF(agg_dict);
+                    Py_DECREF(out_list);
+                    return NULL;
+                }
+                Py_DECREF(agg_dict);
+            }
+        }
+    }
+
+    AssociationTracker combined;
+    bool any_tracker = false;
+    for (auto *agg : seen_visitors) {
+        auto out = agg->take_output();
+        if (out.local_tracker) {
+            combined.merge(*out.local_tracker);
+            any_tracker = true;
+        }
+    }
+    PyObject *tracker_bytes = nullptr;
+    if (any_tracker) {
+        combined.finalize();
+        std::string blob = combined.serialize();
+        tracker_bytes = PyBytes_FromStringAndSize(
+            blob.data(), static_cast<Py_ssize_t>(blob.size()));
+    } else {
+        tracker_bytes = PyBytes_FromStringAndSize(nullptr, 0);
+    }
+    if (!tracker_bytes) {
+        Py_DECREF(out_list);
+        return NULL;
+    }
+    PyObject *ret = PyTuple_Pack(2, out_list, tracker_bytes);
+    Py_DECREF(out_list);
+    Py_DECREF(tracker_bytes);
+    return ret;
+}
+
+// Python entry point: enable_aggregation_deterministic_ids() -> None.
+static PyObject *enable_aggregation_deterministic_ids_fn(PyObject * /*self*/,
+                                                         PyObject * /*args*/) {
+    namespace agg = dftracer::utils::utilities::composites::dft::aggregators;
+    agg::aggregation_intern().enable_deterministic_ids();
+    Py_RETURN_NONE;
+}
+// Python: move_artifacts(artifacts, dest_dir) -> dict with the new paths.
+static PyObject *move_artifacts_fn(PyObject * /*self*/, PyObject *args,
+                                   PyObject *kwds) {
+    static const char *kwlist[] = {"artifacts", "dest_dir", NULL};
+    PyObject *dict = NULL;
+    const char *dest_dir = NULL;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "Os", (char **)kwlist, &dict,
+                                     &dest_dir))
+        return NULL;
+    IndexDatabaseSstWriterContext::Artifacts a, moved;
+    if (!artifacts_from_dict(dict, &a)) return NULL;
+    PyThreadState *saved = PyEval_SaveThread();  // drop the GIL for the I/O
+    try {
+        moved = std::move(a).move_to(dest_dir);
+    } catch (const std::exception &e) {
+        PyEval_RestoreThread(saved);  // GIL must be held again for PyErr_*
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        return NULL;
+    }
+    PyEval_RestoreThread(saved);
+    return artifacts_to_dict(moved);
+}
+
+namespace {
+// Scans `path` for gzip members into *out; the fd is closed on every exit.
+dftracer::utils::coro::CoroTask<void> scan_one_gzip_file(
+    std::string path, std::vector<GzipMember> *out) {
+    out->clear();
+    const int fd = ::open(path.c_str(), O_RDONLY);
+    if (fd < 0) co_return;
+    struct FdCloser { int fd; ~FdCloser() { ::close(fd); } } closer{fd};
+    struct stat st;  // < 18 bytes cannot hold a gzip header plus trailer
+    if (::fstat(fd, &st) == 0 && st.st_size >= 18) {
+        co_await enumerate_gzip_member_candidates(
+            fd, static_cast<std::uint64_t>(st.st_size), *out);
+    }
+}
+
+}  // namespace
+// Python binding: scans each path in `files` for gzip member candidates.
+static PyObject *enumerate_gzip_members_fn(PyObject * /*self*/, PyObject *args,
+                                           PyObject *kwds) {
+    static const char *kwlist[] = {"files", "runtime", NULL};
+    PyObject *files_obj = NULL;
+    PyObject *runtime_arg = NULL;  // optional; None -> default runtime
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", (char **)kwlist,
+                                     &files_obj, &runtime_arg)) {
+        return NULL;
+    }
+
+    std::vector<std::string> files;  // owned copies of the path strings
+    {
+        PyObject *seq = PySequence_Fast(files_obj, "files must be a sequence");
+        if (!seq) return NULL;
+        Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+        files.reserve(n);
+        for (Py_ssize_t i = 0; i < n; ++i) {
+            const char *s = PyUnicode_AsUTF8(PySequence_Fast_GET_ITEM(seq, i));
+            if (!s) {  // conversion failed; the Python error is already set
+                Py_DECREF(seq);
+                return NULL;
+            }
+            files.emplace_back(s);
+        }
+        Py_DECREF(seq);
+    }
+    // Resolve the runtime: a Runtime, an object with `_native`, or the default.
+    Runtime *rt = nullptr;
+    if (runtime_arg && runtime_arg != Py_None) {
+        if (PyObject_TypeCheck(runtime_arg, &RuntimeType)) {
+            rt = ((RuntimeObject *)runtime_arg)->runtime.get();
+        } else {
+            PyObject *native = PyObject_GetAttrString(runtime_arg, "_native");
+            if (!native || !PyObject_TypeCheck(native, &RuntimeType)) {
+                Py_XDECREF(native);
+                PyErr_SetString(PyExc_TypeError,
+                                "runtime must be a Runtime instance or None");
+                return NULL;
+            }
+            rt = ((RuntimeObject *)native)->runtime.get();
+            Py_DECREF(native);  // NOTE(review): assumes runtime_arg keeps it alive
+        }
+    } else {
+        rt = get_default_runtime();
+    }
+    // One pre-sized result slot per file; each spawned task fills only its own.
+    std::vector<std::vector<GzipMember>> results(files.size());
+    std::string submit_error;
+    Py_BEGIN_ALLOW_THREADS try {  // GIL released while the scan runs
+        rt->submit(
+              dftracer::utils::run_coro_scope(
+                  rt->executor(),
+                  [](dftracer::utils::CoroScope &scope,
+                     const std::vector<std::string> *paths,
+                     std::vector<std::vector<GzipMember>> *out)
+                      -> dftracer::utils::coro::CoroTask<void> {
+                      co_await scope.scope(
+                          [paths, out](dftracer::utils::CoroScope &child)
+                              -> dftracer::utils::coro::CoroTask<void> {
+                              for (std::size_t i = 0; i < paths->size(); ++i) {
+                                  const std::string &path = (*paths)[i];
+                                  auto *slot = &(*out)[i];  // task i -> results[i]
+                                  child.spawn(
+                                      [path, slot](dftracer::utils::CoroScope &)
+                                          -> dftracer::utils::coro::CoroTask<
+                                              void> {
+                                          co_await scan_one_gzip_file(path,
+                                                                      slot);
+                                      });  // NOTE(review): confirm spawn() owns the closure
+                              }
+                              co_return;
+                          });
+                      co_return;
+                  },
+                  &files, &results),
+              "enumerate-gzip-members")
+            .get();
+    } catch (const std::exception &e) {  // only std::exception is caught here
+        submit_error = e.what();
+    }
+    Py_END_ALLOW_THREADS if (!submit_error.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, submit_error.c_str());
+        return NULL;
+    }
+    // Build list[list[(c_offset, c_size)]], one inner list per input file.
+    PyObject *out_list = PyList_New(static_cast<Py_ssize_t>(results.size()));
+    if (!out_list) return NULL;
+    for (std::size_t i = 0; i < results.size(); ++i) {
+        const auto &mv = results[i];
+        PyObject *inner = PyList_New(static_cast<Py_ssize_t>(mv.size()));
+        if (!inner) {
+            Py_DECREF(out_list);
+            return NULL;
+        }
+        for (std::size_t j = 0; j < mv.size(); ++j) {
+            PyObject *t =
+                Py_BuildValue("(KK)", (unsigned long long)mv[j].c_offset,
+                              (unsigned long long)mv[j].c_size);
+            if (!t) {
+                Py_DECREF(inner);
+                Py_DECREF(out_list);
+                return NULL;
+            }
+            PyList_SET_ITEM(inner, j, t);  // steals the reference to t
+        }
+        PyList_SET_ITEM(out_list, i, inner);  // steals the reference to inner
+    }
+    return out_list;
+}
+// Splits each file's gzip members into work units and LPT-assigns them.
+// Mirrors build_work_units + lpt_assign_units in dftracer_aggregator_mpi.cpp
+// so the Dask backend produces identical work distribution to MPI.
+static PyObject *plan_work_units_fn(PyObject * /*self*/, PyObject *args,
+                                    PyObject *kwds) {
+    static const char *kwlist[] = {"member_map", "num_workers", "target_c_size",
+                                   NULL};
+    PyObject *map_obj = NULL;
+    Py_ssize_t num_workers = 0;
+    unsigned long long target_c_size = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "On|K", (char **)kwlist,
+                                     &map_obj, &num_workers, &target_c_size)) {
+        return NULL;
+    }
+    if (num_workers <= 0) num_workers = 1;  // clamped rather than rejected
+    // Decode member_map: one sequence of (c_offset, c_size) pairs per file.
+    std::vector<std::vector<GzipMember>> member_map;
+    {
+        PyObject *seq =
+            PySequence_Fast(map_obj, "member_map must be a sequence");
+        if (!seq) return NULL;
+        Py_ssize_t n = PySequence_Fast_GET_SIZE(seq);
+        member_map.resize(n);
+        for (Py_ssize_t i = 0; i < n; ++i) {
+            PyObject *inner = PySequence_Fast_GET_ITEM(seq, i);
+            PyObject *iseq =
+                PySequence_Fast(inner, "member_map[i] must be a sequence");
+            if (!iseq) {
+                Py_DECREF(seq);
+                return NULL;
+            }
+            Py_ssize_t ni = PySequence_Fast_GET_SIZE(iseq);
+            member_map[i].resize(ni);
+            for (Py_ssize_t j = 0; j < ni; ++j) {
+                PyObject *t = PySequence_Fast_GET_ITEM(iseq, j);
+                unsigned long long c_offset = 0, c_size = 0;
+                if (!PyArg_ParseTuple(t, "KK", &c_offset, &c_size)) {
+                    Py_DECREF(iseq);
+                    Py_DECREF(seq);
+                    return NULL;
+                }
+                member_map[i][j].c_offset =
+                    static_cast<std::uint64_t>(c_offset);
+                member_map[i][j].c_size = static_cast<std::uint64_t>(c_size);
+            }
+            Py_DECREF(iseq);
+        }
+        Py_DECREF(seq);
+    }
+
+    // Fallback: treat empty/non-gzip files as a single whole-file member.
+    std::uint64_t total_c = 0;
+    for (auto &mv : member_map) {
+        if (mv.empty()) mv.push_back({0, 0});
+        for (const auto &m : mv) total_c += m.c_size;
+    }
+    // Default target: ceil(total_c / num_workers) compressed bytes per unit.
+    if (target_c_size == 0) {
+        target_c_size =
+            (total_c + static_cast<std::uint64_t>(num_workers) - 1) /
+            std::max<std::uint64_t>(static_cast<std::uint64_t>(num_workers), 1);
+    }
+    // Unit = half-open member range [member_begin, member_end) of one file.
+    struct Unit {
+        std::size_t file_idx;
+        std::size_t member_begin;
+        std::size_t member_end;
+        std::uint64_t c_size;
+    };
+    std::vector<Unit> units;
+    for (std::size_t fi = 0; fi < member_map.size(); ++fi) {
+        const auto &members = member_map[fi];
+        if (members.empty()) continue;  // defensive: fallback leaves none empty
+        std::size_t begin = 0;
+        std::uint64_t accum = 0;
+        for (std::size_t i = 0; i < members.size(); ++i) {
+            accum += members[i].c_size;
+            const bool is_last = (i + 1 == members.size());
+            if ((target_c_size > 0 && accum >= target_c_size) || is_last) {
+                units.push_back({fi, begin, i + 1, accum});
+                begin = i + 1;
+                accum = 0;
+            }
+        }
+    }
+    // LPT: largest unit first, each to the currently least-loaded worker.
+    std::vector<std::size_t> order(units.size());
+    for (std::size_t i = 0; i < order.size(); ++i) order[i] = i;
+    std::sort(order.begin(), order.end(), [&](std::size_t a, std::size_t b) {
+        if (units[a].c_size != units[b].c_size)
+            return units[a].c_size > units[b].c_size;
+        if (units[a].file_idx != units[b].file_idx)
+            return units[a].file_idx < units[b].file_idx;
+        return units[a].member_begin < units[b].member_begin;
+    });
+    const std::size_t nw = static_cast<std::size_t>(num_workers);
+    std::vector<std::uint64_t> loads(nw, 0);
+    std::vector<std::vector<std::size_t>> per_worker(nw);
+    for (std::size_t ord : order) {
+        std::size_t best = 0;
+        for (std::size_t r = 1; r < nw; ++r)
+            if (loads[r] < loads[best]) best = r;  // ties keep the lowest index
+        per_worker[best].push_back(ord);
+        loads[best] += std::max<std::uint64_t>(units[ord].c_size, 1);  // min 1
+    }
+    // Emit list[list[(file_idx, member_begin, member_end, c_size)]].
+    PyObject *out = PyList_New(static_cast<Py_ssize_t>(nw));
+    if (!out) return NULL;
+    for (std::size_t w = 0; w < nw; ++w) {
+        // Keep per-worker slices sorted by (file_idx, member_begin) for
+        // deterministic, file-group-friendly iteration downstream.
+        auto &lst = per_worker[w];
+        std::sort(lst.begin(), lst.end(), [&](std::size_t a, std::size_t b) {
+            if (units[a].file_idx != units[b].file_idx)
+                return units[a].file_idx < units[b].file_idx;
+            return units[a].member_begin < units[b].member_begin;
+        });
+        PyObject *inner = PyList_New(static_cast<Py_ssize_t>(lst.size()));
+        if (!inner) {
+            Py_DECREF(out);
+            return NULL;
+        }
+        for (std::size_t k = 0; k < lst.size(); ++k) {
+            const auto &u = units[lst[k]];
+            PyObject *t = Py_BuildValue(
+                "(nnnK)", (Py_ssize_t)u.file_idx, (Py_ssize_t)u.member_begin,
+                (Py_ssize_t)u.member_end, (unsigned long long)u.c_size);
+            if (!t) {
+                Py_DECREF(inner);
+                Py_DECREF(out);
+                return NULL;
+            }
+            PyList_SET_ITEM(inner, k, t);  // steals the reference to t
+        }
+        PyList_SET_ITEM(out, w, inner);  // steals the reference to inner
+    }
+    return out;
+}
+
+// ---------------------------------------------------------------------------
+// Module registration
+// ---------------------------------------------------------------------------
+
+static PyMethodDef SstDistributionMethods[] = {  // added by init_sst_distribution
+    {"build_sst_batch", (PyCFunction)build_sst_batch_fn,
+     METH_VARARGS | METH_KEYWORDS,
+     "build_sst_batch(files, file_ids, staging_dir, batch_id, ...) "
+     "-> (list[dict], bytes)\n"
+     "Run the indexer pipeline with an SST sink and return "
+     "(artifact_dicts, tracker_blob). The tracker blob is the serialized "
+     "merged AssociationTracker from this batch's aggregation visitors "
+     "(empty bytes when no aggregation_config was passed)."},
+    {"plan_lpt_partition", (PyCFunction)plan_lpt_partition_fn, METH_VARARGS,
+     "plan_lpt_partition(entries, num_workers) -> list[list[(path, size)]]\n"
+     "Greedy Longest-Processing-Time-first bin-packing of (path, size) "
+     "tuples across num_workers buckets. Minimises the maximum per-worker "
+     "total size."},
+    {"scan_files", (PyCFunction)scan_files_fn, METH_VARARGS | METH_KEYWORDS,
+     "scan_files(directory, patterns=None, recursive=False, runtime=None) "
+     "-> list[(path, size)]\n"
+     "Parallel directory scan returning (path, size) tuples for regular "
+     "files matching the patterns."},
+    {"enable_aggregation_deterministic_ids",
+     (PyCFunction)enable_aggregation_deterministic_ids_fn, METH_NOARGS,
+     "enable_aggregation_deterministic_ids() -> None\n"
+     "Flip the global aggregation StringIntern into deterministic-id mode "
+     "so the same string maps to the same 32-bit id in every worker "
+     "process. Call once at worker startup BEFORE any aggregation work."},
+    {"move_artifacts", (PyCFunction)move_artifacts_fn,
+     METH_VARARGS | METH_KEYWORDS,
+     "move_artifacts(artifacts, dest_dir) -> dict\n"
+     "Move every populated SST in `artifacts` (as returned by "
+     "`build_sst_batch`) into `dest_dir` via the C++ rename/copy helper, "
+     "returning a fresh dict with the new paths. Single GIL release, no "
+     "per-file Python shutil.move overhead."},
+    {"enumerate_gzip_members", (PyCFunction)enumerate_gzip_members_fn,
+     METH_VARARGS | METH_KEYWORDS,
+     "enumerate_gzip_members(files, runtime=None) -> list[list[(c_offset, "
+     "c_size)]]\n"
+     "Cooperative async scan of gzip member offsets across `files`. "
+     "Returns lists of (c_offset, c_size) parallel to `files`; empty for "
+     "non-gzip / unreadable files."},
+    {"plan_work_units", (PyCFunction)plan_work_units_fn,
+     METH_VARARGS | METH_KEYWORDS,
+     "plan_work_units(member_map, num_workers, target_c_size=0) "
+     "-> list[list[(file_idx, member_begin, member_end, c_size)]]\n"
+     "Deterministic LPT assignment of intra-file gzip-member slices "
+     "across workers. Each worker's list contains (file_idx, "
+     "member_begin, member_end, c_size) tuples; a file sliced across "
+     "multiple workers appears in each owner's list with disjoint "
+     "[member_begin, member_end) ranges."},
+    {NULL, NULL, 0, NULL}};  // sentinel terminating the method table
+
+// Registers the SstArtifactRegistry type and this file's functions on `m`.
+int init_sst_distribution(PyObject *m) {
+    PyObject *registry_type = (PyObject *)&SstArtifactRegistryType;
+    if (PyType_Ready(&SstArtifactRegistryType) < 0) return -1;
+    Py_INCREF(registry_type);  // reference handed to PyModule_AddObject
+    if (PyModule_AddObject(m, "SstArtifactRegistry", registry_type) < 0) {
+        Py_DECREF(registry_type);
+        return -1;
+    }
+    return PyModule_AddFunctions(m, SstDistributionMethods) < 0 ? -1 : 0;
+}
diff --git a/src/dftracer/utils/python/sst_distribution.h b/src/dftracer/utils/python/sst_distribution.h
new file mode 100644
index 00000000..05e9e099
--- /dev/null
+++ b/src/dftracer/utils/python/sst_distribution.h
@@ -0,0 +1,18 @@
+#ifndef DFTRACER_UTILS_PYTHON_SST_DISTRIBUTION_H
+#define DFTRACER_UTILS_PYTHON_SST_DISTRIBUTION_H
+
+#include <Python.h>
+
+namespace dftracer::utils::utilities::indexer {
+class SstArtifactRegistry;
+}
+
+/// Extract the owned C++ SstArtifactRegistry from a Python
+/// SstArtifactRegistry instance. Returns NULL (without setting an error)
+/// if `obj` is not an SstArtifactRegistry.
+dftracer::utils::utilities::indexer::SstArtifactRegistry *
+sst_artifact_registry_get(PyObject *obj);
+
+int init_sst_distribution(PyObject *m);  // returns 0 on success, -1 on error
+
+#endif  // DFTRACER_UTILS_PYTHON_SST_DISTRIBUTION_H
diff --git a/src/dftracer/utils/python/streaming_iterator.cpp b/src/dftracer/utils/python/streaming_iterator.cpp
new file mode 100644
index 00000000..be0d6571
--- /dev/null
+++ b/src/dftracer/utils/python/streaming_iterator.cpp
@@ -0,0 +1,168 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dftracer/utils/python/streaming_iterator.h>
+#include <dftracer/utils/python/trace_reader_iterator.h>
+
+namespace dftracer::utils::python {
+
+// tp_new: allocates the Python object plus its heap-held C++ state.
+static PyObject* ArrowStreamingIterator_new(PyTypeObject* type,
+                                            PyObject* /*args*/,
+                                            PyObject* /*kwds*/) {
+    PyObject* raw = type->tp_alloc(type, 0);
+    if (raw == nullptr) return nullptr;
+    // The C++ state is allocated separately to avoid layout issues.
+    auto* iter = (ArrowStreamingIteratorObject*)raw;
+    iter->cpp_state = new ArrowStreamingIteratorState();
+    return raw;
+}
+
+// tp_dealloc: cancels a still-running stream, then frees state and object.
+static void ArrowStreamingIterator_dealloc(ArrowStreamingIteratorObject* self) {
+    ArrowStreamingIteratorState* state = self->cpp_state;
+    if (state != nullptr) {
+        // Run the cancel hook, when one is installed, before tearing down.
+        if (state->cancel) state->cancel();
+        delete state;
+        self->cpp_state = nullptr;
+    }
+    Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+static PyObject* ArrowStreamingIterator_iter(PyObject* self) {  // tp_iter
+    Py_INCREF(self);  // the object is its own iterator: return a new reference
+    return self;
+}
+
+// tp_iternext: waits (GIL released) for the next batch and wraps it in an
+// ArrowBatchCapsule. NULL with no exception set signals StopIteration.
+static PyObject* ArrowStreamingIterator_next(
+    ArrowStreamingIteratorObject* self) {
+    ArrowStreamingIteratorState* state = self->cpp_state;
+    if (state == nullptr || !state->pull_next) {
+        PyErr_SetString(PyExc_RuntimeError, "Iterator not initialized");
+        return NULL;
+    }
+
+    std::optional<ArrowExportResult> batch;
+    std::string pull_error;
+    bool pull_threw = false;
+
+    // Errors are only recorded here; PyErr_* needs the GIL back first.
+    Py_BEGIN_ALLOW_THREADS try {
+        batch = state->pull_next();
+    } catch (const std::exception& e) {
+        pull_error = e.what();
+        pull_threw = true;
+    } catch (...) {
+        pull_error = "Unknown error in streaming iterator";
+        pull_threw = true;
+    }
+    Py_END_ALLOW_THREADS
+
+    if (pull_threw) {
+        PyErr_SetString(PyExc_RuntimeError, pull_error.c_str());
+        return NULL;
+    }
+
+    if (batch.has_value()) {
+        // Move the result into a freshly allocated ArrowBatchCapsule.
+        PyObject* raw =
+            ArrowBatchCapsuleType.tp_alloc(&ArrowBatchCapsuleType, 0);
+        if (raw == nullptr) return NULL;
+        auto* capsule = (ArrowBatchCapsuleObject*)raw;
+        capsule->result = new ArrowExportResult(std::move(*batch));
+        return raw;
+    }
+
+    // Stream ended: report a recorded producer failure, if there is one.
+    std::exception_ptr failure;
+    if (state->get_error) failure = state->get_error();
+    if (failure) {
+        try {
+            std::rethrow_exception(failure);
+        } catch (const std::exception& e) {
+            PyErr_SetString(PyExc_RuntimeError, e.what());
+        } catch (...) {
+            PyErr_SetString(PyExc_RuntimeError,
+                            "Unknown error in streaming iterator");
+        }
+    }
+    return NULL;
+}
+
+// cancel(): forwards to the state's cancel hook when present; returns None.
+static PyObject* ArrowStreamingIterator_cancel(
+    ArrowStreamingIteratorObject* self, PyObject* Py_UNUSED(args)) {
+    ArrowStreamingIteratorState* state = self->cpp_state;
+    if (state != nullptr && state->cancel) state->cancel();
+    Py_RETURN_NONE;
+}
+
+static PyMethodDef ArrowStreamingIterator_methods[] = {  // tp_methods table
+    {"cancel", (PyCFunction)ArrowStreamingIterator_cancel, METH_NOARGS,
+     "Cancel the streaming iterator."},
+    {NULL}};  // sentinel entry
+
+PyTypeObject ArrowStreamingIteratorType = {  // readied in init_arrow_streaming_iterator
+    PyVarObject_HEAD_INIT(NULL, 0) "dftracer_utils_ext._ArrowStreamingIterator",
+    sizeof(ArrowStreamingIteratorObject),       /* tp_basicsize */
+    0,                                          /* tp_itemsize */
+    (destructor)ArrowStreamingIterator_dealloc, /* tp_dealloc */
+    0,                                          /* tp_vectorcall_offset */
+    0,                                          /* tp_getattr */
+    0,                                          /* tp_setattr */
+    0,                                          /* tp_as_async */
+    0,                                          /* tp_repr */
+    0,                                          /* tp_as_number */
+    0,                                          /* tp_as_sequence */
+    0,                                          /* tp_as_mapping */
+    0,                                          /* tp_hash */
+    0,                                          /* tp_call */
+    0,                                          /* tp_str */
+    0,                                          /* tp_getattro */
+    0,                                          /* tp_setattro */
+    0,                                          /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                         /* tp_flags */
+    "Streaming Arrow batch iterator.\n\n"
+    "Yields ArrowBatch objects as they become available from the C++ "
+    "pipeline.\n"
+    "Call cancel() to stop the stream early.", /* tp_doc */
+    0,                                         /* tp_traverse */
+    0,                                         /* tp_clear */
+    0,                                         /* tp_richcompare */
+    0,                                         /* tp_weaklistoffset */
+    ArrowStreamingIterator_iter,               /* tp_iter */
+    (iternextfunc)ArrowStreamingIterator_next, /* tp_iternext */
+    ArrowStreamingIterator_methods,            /* tp_methods */
+    0,                                         /* tp_members */
+    0,                                         /* tp_getset */
+    0,                                         /* tp_base */
+    0,                                         /* tp_dict */
+    0,                                         /* tp_descr_get */
+    0,                                         /* tp_descr_set */
+    0,                                         /* tp_dictoffset */
+    0,                                         /* tp_init */
+    0,                                         /* tp_alloc */
+    ArrowStreamingIterator_new,                /* tp_new */
+};
+
+// Finalizes the iterator type and exposes it on `m`; 0 on success, else -1.
+int init_arrow_streaming_iterator(PyObject* m) {
+    PyObject* type_obj = (PyObject*)&ArrowStreamingIteratorType;
+    if (PyType_Ready(&ArrowStreamingIteratorType) < 0) return -1;
+    // This extra reference is the one handed to PyModule_AddObject.
+    Py_INCREF(type_obj);
+    if (PyModule_AddObject(m, "_ArrowStreamingIterator", type_obj) < 0) {
+        Py_DECREF(type_obj);
+        return -1;
+    }
+    return 0;
+}
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
diff --git a/src/dftracer/utils/python/streaming_iterator.h b/src/dftracer/utils/python/streaming_iterator.h
new file mode 100644
index 00000000..cca32b13
--- /dev/null
+++ b/src/dftracer/utils/python/streaming_iterator.h
@@ -0,0 +1,166 @@
+#ifndef DFTRACER_UTILS_PYTHON_STREAMING_ITERATOR_H
+#define DFTRACER_UTILS_PYTHON_STREAMING_ITERATOR_H
+
+#include <Python.h>
+#include <dftracer/utils/core/common/config.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <exception>
+#include <future>
+#include <mutex>
+#include <optional>
+#include <queue>
+#include <utility>
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/utilities/common/arrow/arrow_export.h>
+#endif
+
+namespace dftracer::utils::python {
+
+/// Generic streaming state for bridging C++ async producers to Python sync
+/// consumers. State changes are made under one mutex to avoid lost wakeups.
+///
+/// Producer (C++ coroutine on Runtime executor):
+///   - Calls push() to enqueue items
+///   - Calls complete() when done
+///   - Calls fail() on error
+///
+/// Consumer (Python tp_iternext):
+///   - Calls pull() which blocks (with GIL released) until item available
+///   - Returns std::nullopt on completion, error, or cancellation
+template <typename ItemT>
+class StreamingState {
+   public:
+    explicit StreamingState(std::size_t memory_budget_bytes)
+        : memory_budget_bytes_(memory_budget_bytes) {}
+
+    /// Enqueues `item` (accounted as `item_bytes`), blocking while the queue
+    /// holds at least the budget. An empty queue always admits one item, so a
+    /// zero budget cannot block forever. Returns false once cancelled.
+    bool push(ItemT item, std::size_t item_bytes) {
+        std::unique_lock<std::mutex> lock(mtx_);
+        cv_producer_.wait(lock, [this] {
+            return queue_.empty() ||
+                   bytes_in_queue_.load(std::memory_order_acquire) <
+                       memory_budget_bytes_ ||
+                   cancelled_.load(std::memory_order_acquire);
+        });
+        if (cancelled_.load(std::memory_order_acquire)) return false;
+        bytes_in_queue_.fetch_add(item_bytes, std::memory_order_acq_rel);
+        queue_.push({std::move(item), item_bytes});
+        lock.unlock();
+        cv_consumer_.notify_one();
+        return true;
+    }
+
+    /// Signals normal end of stream; already queued items can still be pulled.
+    void complete() {
+        std::lock_guard<std::mutex> lock(mtx_);
+        done_.store(true, std::memory_order_release);
+        cv_consumer_.notify_all();
+    }
+
+    /// Records `ex` and ends the stream; the consumer fetches it via error().
+    void fail(std::exception_ptr ex) {
+        std::lock_guard<std::mutex> lock(mtx_);
+        error_ = std::move(ex);
+        done_.store(true, std::memory_order_release);
+        cv_consumer_.notify_all();
+    }
+
+    /// Cancels and wakes all waiters. The flag is set under the mutex so a
+    /// waiter between its predicate check and its sleep cannot miss it.
+    void cancel() {
+        std::lock_guard<std::mutex> lock(mtx_);
+        cancelled_.store(true, std::memory_order_release);
+        cv_producer_.notify_all();
+        cv_consumer_.notify_all();
+    }
+
+    /// Next item; std::nullopt when the queue is empty and the stream is over.
+    std::optional<ItemT> pull() {
+        std::unique_lock<std::mutex> lock(mtx_);
+        cv_consumer_.wait(lock, [this] {
+            return !queue_.empty() ||
+                   cancelled_.load(std::memory_order_acquire) ||
+                   done_.load(std::memory_order_acquire);
+        });
+        if (queue_.empty()) return std::nullopt;
+
+        QueueEntry entry = std::move(queue_.front());
+        queue_.pop();
+        bytes_in_queue_.fetch_sub(entry.size, std::memory_order_acq_rel);
+        lock.unlock();
+        cv_producer_.notify_one();
+        return std::move(entry.item);
+    }
+
+    std::exception_ptr error() const {  // locked: may run alongside fail()
+        std::lock_guard<std::mutex> lock(mtx_);
+        return error_;
+    }
+
+    bool cancelled() const {
+        return cancelled_.load(std::memory_order_acquire);
+    }
+
+    bool done() const { return done_.load(std::memory_order_acquire); }
+
+    void set_task_future(std::shared_future<void> future) {
+        task_future_ = std::move(future);
+    }
+
+   private:
+    struct QueueEntry {
+        ItemT item;
+        std::size_t size;
+    };
+    std::queue<QueueEntry> queue_;
+    mutable std::mutex mtx_;
+    std::condition_variable cv_producer_;
+    std::condition_variable cv_consumer_;
+    std::exception_ptr error_;
+    std::atomic<bool> cancelled_{false};
+    std::atomic<bool> done_{false};
+    std::size_t memory_budget_bytes_;
+    std::atomic<std::size_t> bytes_in_queue_{0};
+    std::shared_future<void> task_future_;
+};
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+using utilities::common::arrow::ArrowExportResult;
+/// Internal C++ state for ArrowStreamingIterator, held by pointer to avoid
+/// C++ object layout issues with Python. NOTE(review): this header does not
+/// include <functional> / <memory> directly for the members below.
+struct ArrowStreamingIteratorState {
+    std::shared_ptr<void> state;  // type-erased; presumably the producer state
+    std::function<std::optional<ArrowExportResult>()> pull_next;
+    std::function<std::exception_ptr()> get_error;  // read after end of stream
+    std::function<void()> cancel;  // run by cancel() and tp_dealloc
+};
+
+/// Type-erased Arrow streaming iterator for Python.
+///
+/// This allows different producer types (AggregationBatch, ArrowExportResult,
+/// etc.) to share the same Python iterator mechanics.
+struct ArrowStreamingIteratorObject {
+    PyObject_HEAD
+
+        /// Owned C++ state: allocated with new in tp_new, freed in tp_dealloc.
+        ArrowStreamingIteratorState* cpp_state;
+};
+
+extern PyTypeObject ArrowStreamingIteratorType;
+
+/// Initialize the ArrowStreamingIteratorType.
+int init_arrow_streaming_iterator(PyObject* m);
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
+
+}  // namespace dftracer::utils::python
+
+#endif  // DFTRACER_UTILS_PYTHON_STREAMING_ITERATOR_H
diff --git a/src/dftracer/utils/python/trace_reader.cpp b/src/dftracer/utils/python/trace_reader.cpp
index f50c0e33..dce2288f 100644
--- a/src/dftracer/utils/python/trace_reader.cpp
+++ b/src/dftracer/utils/python/trace_reader.cpp
@@ -1,116 +1,566 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/memory_budget.h>
+#include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/coro/when_all.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/utils/string.h>
 #include <dftracer/utils/python/arrow_helpers.h>
+#include <dftracer/utils/python/batch_byte_size.h>
+#include <dftracer/utils/python/json.h>
 #include <dftracer/utils/python/runtime.h>
 #include <dftracer/utils/python/trace_reader.h>
 #include <dftracer/utils/python/trace_reader_iterator.h>
+#include <dftracer/utils/utilities/common/query/query.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <dftracer/utils/utilities/reader/trace_reader.h>
 
 #include <algorithm>
 #include <cctype>
-#include <cinttypes>
 #include <cstddef>
 #include <cstdio>
 #include <cstring>
 #include <exception>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
-
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/python/arrow_stream_capsule.h>
 #include <dftracer/utils/utilities/common/arrow/column_builder.h>
-#include <yyjson.h>
+#include <dftracer/utils/utilities/common/json/parser.h>
+#endif
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+#include <dftracer/utils/utilities/common/arrow/ipc_writer.h>
+#include <dftracer/utils/utilities/common/arrow/partition_writer.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
+#include <dftracer/utils/utilities/composites/dft/views/view_builder_utility.h>
+#include <dftracer/utils/utilities/composites/dft/views/view_definition.h>
+#include <dftracer/utils/utilities/composites/dft/views/view_reader_utility.h>
 #endif
 
 namespace {
 
+using dftracer::utils::CoroScope;
 using dftracer::utils::Runtime;
 using dftracer::utils::coro::CoroTask;
+using dftracer::utils::coro::when_all;
+using dftracer::utils::utilities::filesystem::PatternDirectoryScannerUtility;
+using dftracer::utils::utilities::filesystem::
+    PatternDirectoryScannerUtilityInput;
 using dftracer::utils::utilities::reader::ReadConfig;
 using dftracer::utils::utilities::reader::TraceReader;
 using dftracer::utils::utilities::reader::TraceReaderConfig;
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+using dftracer::utils::utilities::common::arrow::ColumnType;
+using dftracer::utils::utilities::common::arrow::RecordBatchBuilder;
+using dftracer::utils::utilities::common::json::JsonParser;
+using dftracer::utils::utilities::common::json::JsonValueHelper;
+#endif
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+using dftracer::utils::utilities::common::arrow::IpcCompression;
+using dftracer::utils::utilities::common::arrow::PartitionWriter;
+using dftracer::utils::utilities::common::arrow::PartitionWriteStats;
+using dftracer::utils::utilities::composites::dft::MetadataCollectorUtility;
+using dftracer::utils::utilities::composites::dft::
+    MetadataCollectorUtilityInput;
+using dftracer::utils::utilities::composites::dft::views::ViewBuilderInput;
+using dftracer::utils::utilities::composites::dft::views::ViewBuilderUtility;
+using dftracer::utils::utilities::composites::dft::views::ViewDefinition;
+using dftracer::utils::utilities::composites::dft::views::ViewReaderInput;
+using dftracer::utils::utilities::composites::dft::views::ViewReaderUtility;
+#endif
 
-int64_t json_to_int64(yyjson_val *value) {
-    if (yyjson_is_int(value)) return yyjson_get_sint(value);
-    return static_cast<int64_t>(yyjson_get_uint(value));
-}
+using dftracer::utils::python::MemoryViewBatchData;
+using dftracer::utils::python::MemoryViewBatchIteratorState;
 
-CoroTask<void> produce_lines(std::shared_ptr<IteratorState> state,
-                             TraceReaderConfig cfg, ReadConfig rc) {
-    auto *sp = state.get();
+CoroTask<void> produce_lines_batched(
+    std::shared_ptr<MemoryViewBatchIteratorState> state,
+    dftracer::utils::coro::ChannelProducer<MemoryViewBatchData> producer,
+    TraceReaderConfig cfg, ReadConfig rc, std::size_t batch_size) {
+    auto guard = producer.guard();  // held until the coroutine exits
     try {
         TraceReader reader(std::move(cfg));
         auto gen = reader.read_lines(rc);
+        MemoryViewBatchData batch;
+        std::size_t count = 0;
+        // Pack lines into one shared buffer; offsets/lengths index each line.
         while (auto opt = co_await gen.next()) {
-            if (sp->cancelled.load(std::memory_order_acquire)) break;
-            std::string item(opt->content);
-            {
-                std::unique_lock<std::mutex> lock(sp->mtx);
-                sp->cv_producer.wait(lock, [sp] {
-                    return sp->queue.size() < sp->max_queue_size ||
-                           sp->cancelled.load(std::memory_order_acquire);
-                });
-                if (sp->cancelled.load(std::memory_order_acquire)) break;
-                sp->queue.push(std::move(item));
+            if (state->cancelled.load(std::memory_order_acquire)) break;
+            auto sv = opt->content;
+            Py_ssize_t offset = static_cast<Py_ssize_t>(batch.buffer.size());
+            batch.buffer.insert(batch.buffer.end(), sv.begin(), sv.end());
+            batch.offsets.push_back(offset);
+            batch.lengths.push_back(static_cast<Py_ssize_t>(sv.size()));
+            ++count;
+            // Full batch: account its bytes, then hand it to the channel.
+            if (count >= batch_size) {
+                auto batch_bytes = dftracer::utils::python::byte_size(batch);
+                state->bytes_in_queue.fetch_add(batch_bytes,
+                                                std::memory_order_acq_rel);
+                if (!co_await producer.send(std::move(batch))) break;
+                batch = MemoryViewBatchData{};
+                count = 0;
             }
-            sp->cv_consumer.notify_one();
+        }
+        if (count > 0 && !state->cancelled.load(std::memory_order_acquire)) {
+            auto batch_bytes = dftracer::utils::python::byte_size(batch);
+            state->bytes_in_queue.fetch_add(batch_bytes,
+                                            std::memory_order_acq_rel);
+            co_await producer.send(std::move(batch));
         }
     } catch (...) {
-        std::lock_guard<std::mutex> lock(sp->mtx);
-        sp->error = std::current_exception();
-        sp->queue.push(std::nullopt);
-        sp->done.store(true, std::memory_order_release);
-        sp->cv_consumer.notify_one();
-        co_return;
-    }
-    {
-        std::lock_guard<std::mutex> lock(sp->mtx);
-        sp->queue.push(std::nullopt);
-        sp->done.store(true, std::memory_order_release);
+        state->set_error(std::current_exception());
     }
-    sp->cv_consumer.notify_one();
 }
 
-CoroTask<void> produce_raw(std::shared_ptr<IteratorState> state,
-                           TraceReaderConfig cfg, ReadConfig rc) {
-    auto *sp = state.get();
+CoroTask<void> produce_raw_batched(
+    std::shared_ptr<MemoryViewBatchIteratorState> state,
+    dftracer::utils::coro::ChannelProducer<MemoryViewBatchData> producer,
+    TraceReaderConfig cfg, ReadConfig rc) {
+    auto guard = producer.guard();  // held until the coroutine exits
     try {
         TraceReader reader(std::move(cfg));
         auto gen = reader.read_raw(rc);
         while (auto opt = co_await gen.next()) {
-            if (sp->cancelled.load(std::memory_order_acquire)) break;
-            std::string item(opt->data(), opt->size());
-            {
-                std::unique_lock<std::mutex> lock(sp->mtx);
-                sp->cv_producer.wait(lock, [sp] {
-                    return sp->queue.size() < sp->max_queue_size ||
-                           sp->cancelled.load(std::memory_order_acquire);
-                });
-                if (sp->cancelled.load(std::memory_order_acquire)) break;
-                sp->queue.push(std::move(item));
+            if (state->cancelled.load(std::memory_order_acquire)) break;
+            MemoryViewBatchData batch;  // one raw chunk per batch
+            batch.buffer.assign(opt->data(), opt->data() + opt->size());
+            batch.offsets.push_back(0);  // the chunk starts at offset 0
+            batch.lengths.push_back(static_cast<Py_ssize_t>(opt->size()));
+            auto batch_bytes = dftracer::utils::python::byte_size(batch);
+            state->bytes_in_queue.fetch_add(batch_bytes,
+                                            std::memory_order_acq_rel);
+            if (!co_await producer.send(std::move(batch))) break;
+        }
+    } catch (...) {
+        state->set_error(std::current_exception());
+    }
+}
+
+using dftracer::utils::utilities::common::json::JsonParser;
+using dftracer::utils::utilities::common::json::JsonValueHelper;
+
+static constexpr std::size_t ESTIMATED_BYTES_PER_LINE = 256;
+static constexpr std::size_t ESTIMATED_BYTES_PER_RAW_CHUNK = 4 * 1024 * 1024;
+static constexpr std::size_t ESTIMATED_BYTES_PER_JSON_EVENT = 512;
+static constexpr std::size_t ESTIMATED_BYTES_PER_ARROW_ROW = 1024;
+
+// Stores a scalar JSON value in `map` under `key`. Strings and booleans are
+// stored as-is; non-negative integers as uint64, negative ones as int64 and
+// anything else numeric as double. Integers above INT64_MAX are retried as
+// uint64 to keep full precision. Other types and parse errors are skipped.
+static void insert_simdjson_value(ArgsMap &map, std::string_view key,
+                                  simdjson::ondemand::value val) {
+    auto type = val.type();
+    if (type.error()) return;
+    switch (type.value_unsafe()) {
+        case simdjson::ondemand::json_type::string:
+            if (auto r = val.get_string(); !r.error())
+                map.insert(key, std::string(r.value_unsafe()));
+            break;
+        case simdjson::ondemand::json_type::number:
+            if (auto ri = val.get_int64(); !ri.error()) {
+                if (auto v = ri.value_unsafe(); v >= 0)
+                    map.insert(key, static_cast<std::uint64_t>(v));
+                else
+                    map.insert(key, v);
+            } else if (auto ru = val.get_uint64(); !ru.error()) {
+                map.insert(key, static_cast<std::uint64_t>(ru.value_unsafe()));
+            } else if (auto rd = val.get_double(); !rd.error()) {
+                map.insert(key, rd.value_unsafe());
+            }
+            break;
+        case simdjson::ondemand::json_type::boolean:
+            if (auto r = val.get_bool(); !r.error())
+                map.insert(key, r.value_unsafe());
+            break;
+        default:
+            break;
+    }
+}
+
+static void parse_json_to_event(JsonParser &parser, JsonDictEvent &ev) {
+    ev.top.set_valid(true);
+    parser.for_each_field(
+        [&](std::string_view key, simdjson::ondemand::value val) {
+            if (key == "args") {  // nested fields go to ev.args
+                auto obj = val.get_object();
+                if (!obj.error()) {  // a non-object "args" is dropped
+                    ev.args.set_valid(true);
+                    for (auto field : obj.value_unsafe()) {
+                        if (field.error()) continue;  // skip bad entries
+                        auto fkey = field.unescaped_key();
+                        if (fkey.error()) continue;
+                        auto fval = field.value();
+                        if (fval.error()) continue;
+                        insert_simdjson_value(ev.args, fkey.value_unsafe(),
+                                              fval.value_unsafe());
+                    }
+                }
+            } else {  // other top-level fields go to ev.top
+                insert_simdjson_value(ev.top, key, val);
             }
-            sp->cv_consumer.notify_one();
+        });
+}
+
+CoroTask<void> produce_json_dicts(
+    std::shared_ptr<JsonDictIteratorState> state,
+    dftracer::utils::coro::ChannelProducer<JsonDictBatch> producer,
+    TraceReaderConfig cfg, ReadConfig rc, std::size_t batch_size) {
+    auto guard = producer.guard();  // held until the coroutine exits
+    try {
+        TraceReader reader(std::move(cfg));
+        auto gen = reader.read_json(rc);
+        JsonDictBatch batch;
+        batch.events.reserve(batch_size);
+
+        while (auto opt = co_await gen.next()) {
+            if (state->cancelled.load(std::memory_order_acquire)) break;
+            // Copy the parsed fields into an event and queue it in the batch.
+            JsonDictEvent ev;
+            parse_json_to_event(*opt->parser, ev);
+            batch.events.push_back(std::move(ev));
+            // Full batch: account its bytes, then hand it to the channel.
+            if (batch.events.size() >= batch_size) {
+                auto batch_bytes = dftracer::utils::python::byte_size(batch);
+                state->bytes_in_queue.fetch_add(batch_bytes,
+                                                std::memory_order_acq_rel);
+                if (!co_await producer.send(std::move(batch))) break;
+                batch = JsonDictBatch{};
+                batch.events.reserve(batch_size);
+            }
+        }
+        if (!batch.events.empty() &&
+            !state->cancelled.load(std::memory_order_acquire)) {
+            auto batch_bytes = dftracer::utils::python::byte_size(batch);
+            state->bytes_in_queue.fetch_add(batch_bytes,
+                                            std::memory_order_acq_rel);
+            co_await producer.send(std::move(batch));
         }
     } catch (...) {
-        std::lock_guard<std::mutex> lock(sp->mtx);
-        sp->error = std::current_exception();
-        sp->queue.push(std::nullopt);
-        sp->done.store(true, std::memory_order_release);
-        sp->cv_consumer.notify_one();
-        co_return;
+        state->set_error(std::current_exception());
+    }
+}
+
+// Feeds `files` into `file_chan` until cancelled or rejected, then closes it.
+static CoroTask<void> send_files_to_channel(
+    std::shared_ptr<dftracer::utils::coro::Channel<std::string>> file_chan,
+    const std::vector<std::string> *files, std::atomic<bool> *cancelled) {
+    for (const std::string &path : *files) {
+        const bool halted = cancelled->load(std::memory_order_acquire);
+        if (halted || !co_await file_chan->send(path)) break;
+    }
+    file_chan->close();
+}
+
+// Worker coroutine: takes file paths from `file_chan` while receive() yields
+// one, reads each file as JSON events and forwards them to `out_chan` in
+// batches of `batch_size`. Returns early once `cancelled` is set or a send fails.
+static CoroTask<void> json_dict_file_worker(
+    std::shared_ptr<dftracer::utils::coro::Channel<std::string>> file_chan,
+    dftracer::utils::coro::Channel<JsonDictBatch> *out_chan,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t batch_size, std::atomic<bool> *cancelled) {
+    dftracer::utils::coro::ChannelProducer<JsonDictBatch> producer(out_chan);
+    auto guard = producer.guard();
+
+    while (auto next_path = co_await file_chan->receive()) {
+        if (cancelled->load(std::memory_order_acquire)) co_return;
+        TraceReaderConfig reader_cfg;
+        reader_cfg.file_path = std::move(*next_path);
+        reader_cfg.index_dir = index_dir;
+        reader_cfg.checkpoint_size = checkpoint_size;
+        reader_cfg.auto_build_index = auto_build_index;
+
+        TraceReader reader(std::move(reader_cfg));
+        auto events = reader.read_json(rc);
+        JsonDictBatch pending;
+        pending.events.reserve(batch_size);
+
+        while (auto record = co_await events.next()) {
+            if (cancelled->load(std::memory_order_acquire)) co_return;
+            JsonDictEvent parsed;
+            parse_json_to_event(*record->parser, parsed);
+            pending.events.push_back(std::move(parsed));
+            if (pending.events.size() < batch_size) continue;
+            if (!co_await producer.send(std::move(pending))) co_return;
+            pending = JsonDictBatch{};
+            pending.events.reserve(batch_size);
+        }
+        const bool has_tail = !pending.events.empty();
+        if (has_tail && !co_await producer.send(std::move(pending))) co_return;
+    }
+}
+
+// Spawns min(#files, max_workers) JSON workers (at least one) plus a feeder.
+static CoroTask<void> spawn_json_dict_producers(
+    CoroScope &child, dftracer::utils::coro::Channel<JsonDictBatch> *out_chan,
+    const std::vector<std::string> *files, const std::string *index_dir,
+    std::size_t checkpoint_size, bool auto_build_index, const ReadConfig *rc,
+    std::size_t batch_size, std::atomic<bool> *cancelled_ptr,
+    std::size_t max_workers) {
+    std::size_t num_workers = std::min(files->size(), max_workers);
+    if (num_workers == 0) num_workers = 1;  // else nobody drains file_chan
+    auto file_chan =
+        dftracer::utils::coro::make_channel<std::string>(num_workers);
+    for (std::size_t i = 0; i < num_workers; ++i) {
+        child.spawn([out_chan, fc = file_chan, idx = *index_dir,
+                     checkpoint_size, auto_build_index, r = *rc, batch_size,
+                     cancelled_ptr](CoroScope &) {
+            return json_dict_file_worker(fc, out_chan, idx, checkpoint_size,
+                                         auto_build_index, r, batch_size,
+                                         cancelled_ptr);
+        });
+    }
+    child.spawn([fc = file_chan, files, cancelled_ptr](CoroScope &) {
+        return send_files_to_channel(fc, files, cancelled_ptr);
+    });
+    co_return;
+}
+
+// Scans `dir_path` for .pfw/.pfw.gz traces, sorts the paths and streams their
+// JSON events into `sp->channel` through a nested scope of file workers. An
+// empty scan closes the channel; exceptions are recorded via `sp->set_error`.
+static CoroTask<void> produce_json_dicts_parallel(
+    CoroScope &scope, JsonDictIteratorState *sp, std::string dir_path,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t batch_size, std::size_t max_workers) {
+    try {
+        PatternDirectoryScannerUtility dir_scanner;
+        PatternDirectoryScannerUtilityInput scan_request(
+            dir_path, {".pfw", ".pfw.gz"}, true, false);
+        auto found = co_await scope.spawn(dir_scanner, scan_request);
+
+        std::vector<std::string> files;
+        files.reserve(found.size());
+        for (auto &entry : found) files.push_back(entry.path.string());
+        std::sort(files.begin(), files.end());
+        if (files.empty()) {
+            sp->channel->close();
+            co_return;
+        }
+        auto *out_chan = sp->channel.get();
+        auto *cancel_flag = &sp->cancelled;
+        co_await scope.scope([out_chan, &files, &index_dir, checkpoint_size,
+                              auto_build_index, &rc, batch_size, cancel_flag,
+                              max_workers](CoroScope &child) -> CoroTask<void> {
+            co_await spawn_json_dict_producers(
+                child, out_chan, &files, &index_dir, checkpoint_size,
+                auto_build_index, &rc, batch_size, cancel_flag, max_workers);
+        });
+    } catch (...) {
+        sp->set_error(std::current_exception());
+    }
+}
+
+// Worker coroutine: receives file paths from `file_chan`, reads each file
+// line by line and packs the lines into MemoryViewBatchData buffers that are
+// sent to `out_chan` every `batch_size` lines. Returns early once `cancelled`
+// is set or a send fails.
+static CoroTask<void> lines_file_worker(
+    std::shared_ptr<dftracer::utils::coro::Channel<std::string>> file_chan,
+    dftracer::utils::coro::Channel<MemoryViewBatchData> *out_chan,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t batch_size, std::atomic<bool> *cancelled) {
+    dftracer::utils::coro::ChannelProducer<MemoryViewBatchData> producer(
+        out_chan);
+    auto guard = producer.guard();
+
+    while (auto next_path = co_await file_chan->receive()) {
+        if (cancelled->load(std::memory_order_acquire)) co_return;
+        TraceReaderConfig reader_cfg;
+        reader_cfg.file_path = std::move(*next_path);
+        reader_cfg.index_dir = index_dir;
+        reader_cfg.checkpoint_size = checkpoint_size;
+        reader_cfg.auto_build_index = auto_build_index;
+
+        TraceReader reader(std::move(reader_cfg));
+        auto lines = reader.read_lines(rc);
+        MemoryViewBatchData chunk;
+        std::size_t buffered = 0;
+
+        while (auto line = co_await lines.next()) {
+            if (cancelled->load(std::memory_order_acquire)) co_return;
+            auto text = line->content;
+            const auto start = static_cast<Py_ssize_t>(chunk.buffer.size());
+            chunk.buffer.insert(chunk.buffer.end(), text.begin(), text.end());
+            chunk.offsets.push_back(start);
+            chunk.lengths.push_back(static_cast<Py_ssize_t>(text.size()));
+            if (++buffered < batch_size) continue;
+            if (!co_await producer.send(std::move(chunk))) co_return;
+            chunk = MemoryViewBatchData{};
+            buffered = 0;
+        }
+        const bool has_tail = buffered > 0;
+        if (has_tail && !co_await producer.send(std::move(chunk))) co_return;
+    }
+}
+
+// Spawns min(#files, max_workers) line workers (at least one) plus a feeder.
+static CoroTask<void> spawn_lines_producers(
+    CoroScope &child,
+    dftracer::utils::coro::Channel<MemoryViewBatchData> *out_chan,
+    const std::vector<std::string> *files, const std::string *index_dir,
+    std::size_t checkpoint_size, bool auto_build_index, const ReadConfig *rc,
+    std::size_t batch_size, std::atomic<bool> *cancelled_ptr,
+    std::size_t max_workers) {
+    std::size_t num_workers = std::min(files->size(), max_workers);
+    if (num_workers == 0) num_workers = 1;  // else nobody drains file_chan
+    auto file_chan =
+        dftracer::utils::coro::make_channel<std::string>(num_workers);
+    for (std::size_t i = 0; i < num_workers; ++i) {
+        child.spawn([out_chan, fc = file_chan, idx = *index_dir,
+                     checkpoint_size, auto_build_index, r = *rc, batch_size,
+                     cancelled_ptr](CoroScope &) {
+            return lines_file_worker(fc, out_chan, idx, checkpoint_size,
+                                     auto_build_index, r, batch_size,
+                                     cancelled_ptr);
+        });
+    }
+    child.spawn([fc = file_chan, files, cancelled_ptr](CoroScope &) {
+        return send_files_to_channel(fc, files, cancelled_ptr);
+    });
+    co_return;
+}
+
+// Scans `dir_path` for .pfw/.pfw.gz traces, sorts the paths and streams their
+// lines into `sp->channel` through a nested scope of file workers. An empty
+// scan closes the channel; exceptions are recorded via `sp->set_error`.
+static CoroTask<void> produce_lines_parallel(
+    CoroScope &scope, MemoryViewBatchIteratorState *sp, std::string dir_path,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t batch_size, std::size_t max_workers) {
+    try {
+        PatternDirectoryScannerUtility dir_scanner;
+        PatternDirectoryScannerUtilityInput scan_request(
+            dir_path, {".pfw", ".pfw.gz"}, true, false);
+        auto found = co_await scope.spawn(dir_scanner, scan_request);
+
+        std::vector<std::string> files;
+        files.reserve(found.size());
+        for (auto &entry : found) files.push_back(entry.path.string());
+        std::sort(files.begin(), files.end());
+        if (files.empty()) {
+            sp->channel->close();
+            co_return;
+        }
+        auto *out_chan = sp->channel.get();
+        auto *cancel_flag = &sp->cancelled;
+        co_await scope.scope([out_chan, &files, &index_dir, checkpoint_size,
+                              auto_build_index, &rc, batch_size, cancel_flag,
+                              max_workers](CoroScope &child) -> CoroTask<void> {
+            co_await spawn_lines_producers(
+                child, out_chan, &files, &index_dir, checkpoint_size,
+                auto_build_index, &rc, batch_size, cancel_flag, max_workers);
+        });
+    } catch (...) {
+        sp->set_error(std::current_exception());
     }
-    {
-        std::lock_guard<std::mutex> lock(sp->mtx);
-        sp->queue.push(std::nullopt);
-        sp->done.store(true, std::memory_order_release);
+}
+
+// Worker coroutine: receives file paths from `file_chan` and forwards every
+// raw chunk of each file to `out_chan` as its own single-entry batch.
+static CoroTask<void> raw_file_worker(
+    std::shared_ptr<dftracer::utils::coro::Channel<std::string>> file_chan,
+    dftracer::utils::coro::Channel<MemoryViewBatchData> *out_chan,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::atomic<bool> *cancelled) {
+    dftracer::utils::coro::ChannelProducer<MemoryViewBatchData> producer(
+        out_chan);
+    auto guard = producer.guard();
+
+    while (auto next_path = co_await file_chan->receive()) {
+        if (cancelled->load(std::memory_order_acquire)) co_return;
+        TraceReaderConfig reader_cfg;
+        reader_cfg.file_path = std::move(*next_path);
+        reader_cfg.index_dir = index_dir;
+        reader_cfg.checkpoint_size = checkpoint_size;
+        reader_cfg.auto_build_index = auto_build_index;
+        TraceReader reader(std::move(reader_cfg));
+        auto chunks = reader.read_raw(rc);
+        while (auto chunk = co_await chunks.next()) {
+            if (cancelled->load(std::memory_order_acquire)) co_return;
+            MemoryViewBatchData batch;
+            batch.buffer.assign(chunk->data(), chunk->data() + chunk->size());
+            batch.offsets.push_back(0);
+            batch.lengths.push_back(static_cast<Py_ssize_t>(chunk->size()));
+            if (!co_await producer.send(std::move(batch))) co_return;
+        }
+    }
+}
+
+// Spawns min(#files, max_workers) raw workers (at least one) plus a feeder.
+static CoroTask<void> spawn_raw_producers(
+    CoroScope &child,
+    dftracer::utils::coro::Channel<MemoryViewBatchData> *out_chan,
+    const std::vector<std::string> *files, const std::string *index_dir,
+    std::size_t checkpoint_size, bool auto_build_index, const ReadConfig *rc,
+    std::atomic<bool> *cancelled_ptr, std::size_t max_workers) {
+    std::size_t num_workers = std::min(files->size(), max_workers);
+    if (num_workers == 0) num_workers = 1;  // else nobody drains file_chan
+    auto file_chan =
+        dftracer::utils::coro::make_channel<std::string>(num_workers);
+    for (std::size_t i = 0; i < num_workers; ++i) {
+        child.spawn([out_chan, fc = file_chan, idx = *index_dir,
+                     checkpoint_size, auto_build_index, r = *rc,
+                     cancelled_ptr](CoroScope &) {
+            return raw_file_worker(fc, out_chan, idx, checkpoint_size,
+                                   auto_build_index, r, cancelled_ptr);
+        });
+    }
+    child.spawn([fc = file_chan, files, cancelled_ptr](CoroScope &) {
+        return send_files_to_channel(fc, files, cancelled_ptr);
+    });
+    co_return;
+}
+
+// Scans `dir_path` for .pfw/.pfw.gz traces, sorts the paths and streams their
+// raw chunks into `sp->channel` through a nested scope of file workers. An
+// empty scan closes the channel; exceptions are recorded via `sp->set_error`.
+static CoroTask<void> produce_raw_parallel(
+    CoroScope &scope, MemoryViewBatchIteratorState *sp, std::string dir_path,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t max_workers) {
+    try {
+        PatternDirectoryScannerUtility dir_scanner;
+        PatternDirectoryScannerUtilityInput scan_request(
+            dir_path, {".pfw", ".pfw.gz"}, true, false);
+        auto found = co_await scope.spawn(dir_scanner, scan_request);
+
+        std::vector<std::string> files;
+        files.reserve(found.size());
+        for (auto &entry : found) files.push_back(entry.path.string());
+        std::sort(files.begin(), files.end());
+        if (files.empty()) {
+            sp->channel->close();
+            co_return;
+        }
+        auto *out_chan = sp->channel.get();
+        auto *cancel_flag = &sp->cancelled;
+        co_await scope.scope([out_chan, &files, &index_dir, checkpoint_size,
+                              auto_build_index, &rc, cancel_flag,
+                              max_workers](CoroScope &child) -> CoroTask<void> {
+            co_await spawn_raw_producers(child, out_chan, &files, &index_dir,
+                                         checkpoint_size, auto_build_index, &rc,
+                                         cancel_flag, max_workers);
+        });
+    } catch (...) {
+        sp->set_error(std::current_exception());
     }
-    sp->cv_consumer.notify_one();
 }
 
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 
+using dftracer::utils::utilities::common::arrow::ArrowExportResult;
 using dftracer::utils::utilities::common::arrow::ColumnType;
 using dftracer::utils::utilities::common::arrow::RecordBatchBuilder;
 
@@ -229,64 +679,65 @@ static bool str_contains_lower(std::string_view s, const char *needle) {
     return false;
 }
 
-// Normalize a raw JSON row (already parsed into yyjson) into the semantic
+// Normalize a raw JSON row (parsed with simdjson) into the semantic
 // output schema.  Appends one row to `builder` with the full set of output
 // columns.  Returns false if the row should be skipped (no valid name).
 static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
-                          yyjson_val *root) {
+                          JsonParser &parser) {
+    using SVH = JsonValueHelper;
+
     // --- Extract top-level fields ---
-    yyjson_val *v_ph = yyjson_obj_get(root, "ph");
-    yyjson_val *v_name = yyjson_obj_get(root, "name");
-    yyjson_val *v_cat = yyjson_obj_get(root, "cat");
-    yyjson_val *v_pid = yyjson_obj_get(root, "pid");
-    yyjson_val *v_tid = yyjson_obj_get(root, "tid");
-    yyjson_val *v_ts = yyjson_obj_get(root, "ts");
-    yyjson_val *v_dur = yyjson_obj_get(root, "dur");
-    yyjson_val *v_args = yyjson_obj_get(root, "args");
-
-    std::string_view ph =
-        v_ph && yyjson_is_str(v_ph)
-            ? std::string_view(yyjson_get_str(v_ph), yyjson_get_len(v_ph))
-            : std::string_view();
-    std::string_view name_sv =
-        v_name && yyjson_is_str(v_name)
-            ? std::string_view(yyjson_get_str(v_name), yyjson_get_len(v_name))
-            : std::string_view();
-    std::string_view cat_sv =
-        v_cat && yyjson_is_str(v_cat)
-            ? std::string_view(yyjson_get_str(v_cat), yyjson_get_len(v_cat))
-            : std::string_view();
-
-    // Helper to get args fields
-    auto args_str = [&](const char *key) -> std::string_view {
-        if (!v_args) return {};
-        yyjson_val *v = yyjson_obj_get(v_args, key);
-        if (!v) return {};
-        if (yyjson_is_str(v)) return {yyjson_get_str(v), yyjson_get_len(v)};
-        return {};
-    };
-    auto args_int = [&](const char *key) -> std::pair<bool, int64_t> {
-        if (!v_args) return {false, 0};
-        yyjson_val *v = yyjson_obj_get(v_args, key);
-        if (!v) return {false, 0};
-        if (yyjson_is_int(v)) return {true, yyjson_get_sint(v)};
-        if (yyjson_is_uint(v))
-            return {true, static_cast<int64_t>(yyjson_get_uint(v))};
-        if (yyjson_is_real(v))
-            return {true, static_cast<int64_t>(yyjson_get_real(v))};
-        return {false, 0};
-    };
-    auto args_float = [&](const char *key) -> std::pair<bool, double> {
-        if (!v_args) return {false, 0.0};
-        yyjson_val *v = yyjson_obj_get(v_args, key);
-        if (!v) return {false, 0.0};
-        if (yyjson_is_real(v)) return {true, yyjson_get_real(v)};
-        if (yyjson_is_int(v))
-            return {true, static_cast<double>(yyjson_get_sint(v))};
-        if (yyjson_is_uint(v))
-            return {true, static_cast<double>(yyjson_get_uint(v))};
-        return {false, 0.0};
-    };
+    auto ph = parser.get_string("ph").value_or(std::string_view{});
+    auto name_sv = parser.get_string("name").value_or(std::string_view{});
+    auto cat_sv = parser.get_string("cat").value_or(std::string_view{});
+    auto pid_opt = parser.get_int64("pid");
+    auto tid_opt = parser.get_int64("tid");
+    auto ts_opt = parser.get_int64("ts");
+    auto dur_opt = parser.get_int64("dur");
+
+    // Helper lambdas to access args fields (need to rewind after each access)
+    // We'll do a single pass over args instead
+    std::optional<std::string_view> args_name, args_value, args_hhash,
+        args_fhash;
+    std::optional<int64_t> args_epoch, args_step, args_size_sum, args_ret;
+    std::optional<int64_t> args_offset, args_image_idx, args_image_size;
+    std::unordered_map<std::string, int64_t> args_int_map;
+    std::unordered_map<std::string, double> args_float_map;
+
+    parser.rewind();
+    parser.for_each_field(
+        "args", [&](std::string_view key, simdjson::ondemand::value val) {
+            if (key == "name") {
+                if (auto s = SVH::get_string(val)) args_name = s;
+            } else if (key == "value") {
+                if (auto s = SVH::get_string(val)) args_value = s;
+            } else if (key == "hhash") {
+                if (auto s = SVH::get_string(val)) args_hhash = s;
+            } else if (key == "fhash") {
+                if (auto s = SVH::get_string(val)) args_fhash = s;
+            } else if (key == "epoch") {
+                if (auto i = SVH::get_int64(val)) args_epoch = i;
+            } else if (key == "step") {
+                if (auto i = SVH::get_int64(val)) args_step = i;
+            } else if (key == "size_sum") {
+                if (auto i = SVH::get_int64(val)) args_size_sum = i;
+            } else if (key == "ret") {
+                if (auto i = SVH::get_int64(val)) args_ret = i;
+            } else if (key == "offset") {
+                if (auto i = SVH::get_int64(val)) args_offset = i;
+            } else if (key == "image_idx") {
+                if (auto i = SVH::get_int64(val)) args_image_idx = i;
+            } else if (key == "image_size") {
+                if (auto i = SVH::get_int64(val)) args_image_size = i;
+            } else {
+                // Store other int/float args for profile/sys columns
+                if (auto i = SVH::get_int64(val)) {
+                    args_int_map[std::string(key)] = *i;
+                } else if (auto d = SVH::get_double(val)) {
+                    args_float_map[std::string(key)] = *d;
+                }
+            }
+        });
 
     // --- Type classification ---
     bool is_M = (ph == "M");
@@ -315,17 +766,12 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
 
     // Name: metadata rows use args.name if available
     std::string_view out_name = name_sv;
-    if (is_M) {
-        auto an = args_str("name");
-        if (!an.empty()) out_name = an;
+    if (is_M && args_name && !args_name->empty()) {
+        out_name = *args_name;
     }
     if (out_name.empty()) return false;  // skip rows without name
 
-    // --- Declare all output columns (lazy — add_or_get_column handles
-    // first-time creation) --- We use a fixed schema so column indices are
-    // stable across rows. The builder backfills nulls for columns not touched
-    // via end_row().
-
+    // --- Declare all output columns ---
     auto ci_type = builder.add_or_get_column("type", ColumnType::INT64);
     auto ci_cat = builder.add_or_get_column("cat", ColumnType::STRING);
     auto ci_name = builder.add_or_get_column("name", ColumnType::STRING);
@@ -342,7 +788,8 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
     auto ci_ts = builder.add_or_get_column("ts", ColumnType::INT64);
     auto ci_dur = builder.add_or_get_column("dur", ColumnType::INT64);
     auto ci_te = builder.add_or_get_column("te", ColumnType::INT64);
-    auto ci_trange = builder.add_or_get_column("trange", ColumnType::INT64);
+    [[maybe_unused]] auto ci_trange =
+        builder.add_or_get_column("trange", ColumnType::INT64);
     auto ci_io_cat = builder.add_or_get_column("io_cat", ColumnType::INT64);
     auto ci_size = builder.add_or_get_column("size", ColumnType::INT64);
     auto ci_offset = builder.add_or_get_column("offset", ColumnType::INT64);
@@ -351,7 +798,7 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
     // --- Populate core columns ---
     builder.append_int64(ci_type, row_type);
 
-    // cat (lowercased) — write into arena
+    // cat (lowercased) - write into arena
     if (!cat_sv.empty()) {
         char lbuf[256];
         std::size_t clen = std::min(cat_sv.size(), sizeof(lbuf));
@@ -365,42 +812,36 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
 
     builder.append_string(ci_name, out_name);
 
-    if (v_pid && (yyjson_is_int(v_pid) || yyjson_is_uint(v_pid)))
-        builder.append_int64(ci_pid, json_to_int64(v_pid));
-    // else: null via end_row backfill
-
-    if (v_tid && (yyjson_is_int(v_tid) || yyjson_is_uint(v_tid)))
-        builder.append_int64(ci_tid, json_to_int64(v_tid));
+    if (pid_opt) builder.append_int64(ci_pid, *pid_opt);
+    if (tid_opt) builder.append_int64(ci_tid, *tid_opt);
 
     // hash / value
-    auto a_value = args_str("value");
-    if (is_hash && !a_value.empty()) builder.append_string(ci_hash, a_value);
-    if (row_type == ROW_METADATA && !a_value.empty())
-        builder.append_string(ci_value, a_value);
+    if (is_hash && args_value && !args_value->empty())
+        builder.append_string(ci_hash, *args_value);
+    if (row_type == ROW_METADATA && args_value && !args_value->empty())
+        builder.append_string(ci_value, *args_value);
 
     // host_hash / file_hash
-    auto a_hhash = args_str("hhash");
-    if (!a_hhash.empty()) builder.append_string(ci_host_hash, a_hhash);
-    auto a_fhash = args_str("fhash");
-    if (!a_fhash.empty()) builder.append_string(ci_file_hash, a_fhash);
+    if (args_hhash && !args_hhash->empty())
+        builder.append_string(ci_host_hash, *args_hhash);
+    if (args_fhash && !args_fhash->empty())
+        builder.append_string(ci_file_hash, *args_fhash);
 
     // epoch / step
-    auto [has_epoch, epoch_v] = args_int("epoch");
-    if (has_epoch && epoch_v >= 0) builder.append_int64(ci_epoch, epoch_v);
-    auto [has_step, step_v] = args_int("step");
-    if (has_step && step_v >= 0) builder.append_int64(ci_step, step_v);
+    if (args_epoch && *args_epoch >= 0)
+        builder.append_int64(ci_epoch, *args_epoch);
+    if (args_step && *args_step >= 0) builder.append_int64(ci_step, *args_step);
 
     // --- Temporal ---
-    bool has_ts = (is_event || is_C) && v_ts &&
-                  (yyjson_is_int(v_ts) || yyjson_is_uint(v_ts));
-    bool has_dur = v_dur && (yyjson_is_int(v_dur) || yyjson_is_uint(v_dur));
+    bool has_ts = (is_event || is_C) && ts_opt.has_value();
+    bool has_dur = dur_opt.has_value();
     int64_t ts_val = 0, dur_val = 0;
     if (has_ts) {
-        ts_val = json_to_int64(v_ts);
+        ts_val = *ts_opt;
         builder.append_int64(ci_ts, ts_val);
     }
     if (is_event && has_ts && has_dur) {
-        dur_val = json_to_int64(v_dur);
+        dur_val = *dur_opt;
         builder.append_int64(ci_dur, dur_val);
         builder.append_int64(ci_te, ts_val + dur_val);
     }
@@ -412,26 +853,22 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
         int8_t io_cat = IO_OTHER;
 
         // size priority: size_sum > POSIX ret > image_size
-        auto [has_ss, ss_val] = args_int("size_sum");
-        if (has_ss) {
-            builder.append_int64(ci_size, ss_val);
-            if (is_posix_stdio) io_cat = get_io_cat(name_sv);
+        if (args_size_sum) {
+            builder.append_int64(ci_size, *args_size_sum);
+            if (is_posix_stdio) io_cat = get_io_cat(out_name);
         } else if (is_posix_stdio) {
-            io_cat = get_io_cat(name_sv);
-            auto [has_ret, ret_val] = args_int("ret");
-            if (has_ret && ret_val > 0 &&
+            io_cat = get_io_cat(out_name);
+            if (args_ret && *args_ret > 0 &&
                 (io_cat == IO_READ || io_cat == IO_WRITE))
-                builder.append_int64(ci_size, ret_val);
-            auto [has_ofs, ofs_val] = args_int("offset");
-            if (has_ofs && ofs_val >= 0)
-                builder.append_int64(ci_offset, ofs_val);
+                builder.append_int64(ci_size, *args_ret);
+            if (args_offset && *args_offset >= 0)
+                builder.append_int64(ci_offset, *args_offset);
         } else {
-            auto [has_img, img_val] = args_int("image_idx");
-            if (has_img && img_val > 0)
-                builder.append_int64(ci_image_id, img_val);
-            auto [has_ims, ims_val] = args_int("image_size");
-            if (has_ims && ims_val > 0 && !str_contains_lower(name_sv, "open"))
-                builder.append_int64(ci_size, ims_val);
+            if (args_image_idx && *args_image_idx > 0)
+                builder.append_int64(ci_image_id, *args_image_idx);
+            if (args_image_size && *args_image_size > 0 &&
+                !str_contains_lower(out_name, "open"))
+                builder.append_int64(ci_size, *args_image_size);
         }
         builder.append_int64(ci_io_cat, io_cat);
     }
@@ -440,7 +877,7 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
     if (is_profile) {
         bool is_posix_stdio =
             str_iequal(cat_sv, "posix") || str_iequal(cat_sv, "stdio");
-        int8_t io_cat = is_posix_stdio ? get_io_cat(name_sv) : IO_OTHER;
+        int8_t io_cat = is_posix_stdio ? get_io_cat(out_name) : IO_OTHER;
         builder.append_int64(ci_io_cat, io_cat);
 
         static const char *profile_keys[] = {
@@ -451,10 +888,10 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
             "ret_max",    "ret_min",    "ret_sum",    "whence",
             "whence_max", "whence_min", "whence_sum", nullptr};
         for (const char **pk = profile_keys; *pk; ++pk) {
-            auto [has_v, val] = args_int(*pk);
-            if (has_v) {
+            auto it = args_int_map.find(*pk);
+            if (it != args_int_map.end()) {
                 auto idx = builder.add_or_get_column(*pk, ColumnType::INT64);
-                builder.append_int64(idx, val);
+                builder.append_int64(idx, it->second);
             }
         }
     }
@@ -466,10 +903,10 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
             "irq_pct",  "softirq_pct", "MemAvailable", "MemFree",
             "Cached",   "Dirty",       "Active",       nullptr};
         for (const char **sk = sys_keys; *sk; ++sk) {
-            auto [has_v, val] = args_float(*sk);
-            if (has_v) {
+            auto it = args_float_map.find(*sk);
+            if (it != args_float_map.end()) {
                 auto idx = builder.add_or_get_column(*sk, ColumnType::DOUBLE);
-                builder.append_double(idx, val);
+                builder.append_double(idx, it->second);
             }
         }
     }
@@ -478,276 +915,1321 @@ static bool normalize_row(RecordBatchBuilder &builder, StringArena &arena,
     return true;
 }
 
-// Flatten a yyjson object into "prefix.key" columns using native types.
+// Flatten a simdjson object into "prefix.key" columns using native types.
 // On type mismatch (same key, different type across rows), appends null.
 static void flatten_object_into(RecordBatchBuilder &builder, StringArena &arena,
-                                std::string_view prefix, yyjson_val *obj) {
+                                std::string_view prefix,
+                                simdjson::ondemand::object obj) {
+    using SVH = JsonValueHelper;
     char key_buf[512];
 
-    yyjson_obj_iter sub_iter;
-    yyjson_obj_iter_init(obj, &sub_iter);
-    yyjson_val *sub_key;
-    while ((sub_key = yyjson_obj_iter_next(&sub_iter))) {
-        yyjson_val *sub_val = yyjson_obj_iter_get_val(sub_key);
-        const char *sk_str = yyjson_get_str(sub_key);
-        std::size_t sk_len = yyjson_get_len(sub_key);
+    for (auto field : obj) {
+        if (field.error()) continue;  // unreadable field: skip
 
-        std::size_t needed = prefix.size() + 1 + sk_len;
+        auto key_result = field.unescaped_key();
+        if (key_result.error()) continue;
+        std::string_view sk = key_result.value_unsafe();
+
+        auto val_result = field.value();
+        if (val_result.error()) continue;
+        auto sub_val = val_result.value_unsafe();
+        // Assemble "prefix.key" in key_buf; names that do not fit are skipped.
+        std::size_t needed = prefix.size() + 1 + sk.size();
         if (needed >= sizeof(key_buf)) continue;
         std::memcpy(key_buf, prefix.data(), prefix.size());
         key_buf[prefix.size()] = '.';
-        std::memcpy(key_buf + prefix.size() + 1, sk_str, sk_len);
+        std::memcpy(key_buf + prefix.size() + 1, sk.data(), sk.size());
         std::string_view full_key(key_buf, needed);
 
-        if (yyjson_is_int(sub_val)) {
-            auto idx = builder.add_or_get_column(full_key, ColumnType::INT64);
-            if (builder.column_type(idx) == ColumnType::INT64)
-                builder.append_int64(idx, yyjson_get_sint(sub_val));
-            else
-                builder.append_null(idx);
-        } else if (yyjson_is_uint(sub_val)) {
-            auto idx = builder.add_or_get_column(full_key, ColumnType::UINT64);
-            if (builder.column_type(idx) == ColumnType::UINT64)
-                builder.append_uint64(idx, yyjson_get_uint(sub_val));
-            else
-                builder.append_null(idx);
-        } else if (yyjson_is_real(sub_val)) {
-            auto idx = builder.add_or_get_column(full_key, ColumnType::DOUBLE);
-            if (builder.column_type(idx) == ColumnType::DOUBLE)
-                builder.append_double(idx, yyjson_get_real(sub_val));
-            else
-                builder.append_null(idx);
-        } else if (yyjson_is_bool(sub_val)) {
-            auto idx = builder.add_or_get_column(full_key, ColumnType::BOOL);
-            if (builder.column_type(idx) == ColumnType::BOOL)
-                builder.append_bool(idx, yyjson_get_bool(sub_val));
-            else
-                builder.append_null(idx);
-        } else if (yyjson_is_str(sub_val)) {
-            auto idx = builder.add_or_get_column(full_key, ColumnType::STRING);
-            if (builder.column_type(idx) == ColumnType::STRING)
-                builder.append_string(
-                    idx, std::string_view(yyjson_get_str(sub_val),
-                                          yyjson_get_len(sub_val)));
-            else
-                builder.append_null(idx);
-        } else if (yyjson_is_null(sub_val)) {
-            auto existing = builder.find_column(full_key);
-            if (existing) builder.append_null(*existing);
-        } else {
-            // nested object/array: serialize
-            std::size_t json_len;
-            char *json_str = yyjson_val_write(sub_val, 0, &json_len);
-            auto idx = builder.add_or_get_column(full_key, ColumnType::STRING);
-            if (json_str) {
-                builder.append_string(idx, arena.push(json_str, json_len));
-                free(json_str);
-            } else {
-                builder.append_null(idx);
+        auto type_result = sub_val.type();
+        if (type_result.error()) continue;
+        auto json_type = type_result.value_unsafe();
+        // Dispatch on the JSON type; a scalar column of another type gets null.
+        switch (json_type) {
+            case simdjson::ondemand::json_type::number: {
+                auto num_result = sub_val.get_number();
+                if (num_result.error()) break;
+                auto num = num_result.value_unsafe();
+                if (num.is_int64()) {  // int64, then uint64, else double
+                    auto idx =
+                        builder.add_or_get_column(full_key, ColumnType::INT64);
+                    if (builder.column_type(idx) == ColumnType::INT64)
+                        builder.append_int64(idx, num.get_int64());
+                    else
+                        builder.append_null(idx);
+                } else if (num.is_uint64()) {
+                    auto idx =
+                        builder.add_or_get_column(full_key, ColumnType::UINT64);
+                    if (builder.column_type(idx) == ColumnType::UINT64)
+                        builder.append_uint64(idx, num.get_uint64());
+                    else
+                        builder.append_null(idx);
+                } else {
+                    auto idx =
+                        builder.add_or_get_column(full_key, ColumnType::DOUBLE);
+                    if (builder.column_type(idx) == ColumnType::DOUBLE)
+                        builder.append_double(idx, num.get_double());
+                    else
+                        builder.append_null(idx);
+                }
+                break;
+            }
+            case simdjson::ondemand::json_type::string: {
+                auto str_result = sub_val.get_string();
+                if (str_result.error()) break;
+                auto str = str_result.value_unsafe();
+                auto idx =
+                    builder.add_or_get_column(full_key, ColumnType::STRING);
+                if (builder.column_type(idx) == ColumnType::STRING)
+                    builder.append_string(idx, str);
+                else
+                    builder.append_null(idx);
+                break;
             }
+            case simdjson::ondemand::json_type::boolean: {
+                auto bool_result = sub_val.get_bool();
+                if (bool_result.error()) break;
+                auto b = bool_result.value_unsafe();
+                auto idx =
+                    builder.add_or_get_column(full_key, ColumnType::BOOL);
+                if (builder.column_type(idx) == ColumnType::BOOL)
+                    builder.append_bool(idx, b);
+                else
+                    builder.append_null(idx);
+                break;
+            }
+            case simdjson::ondemand::json_type::null: {  // existing col only
+                auto existing = builder.find_column(full_key);
+                if (existing) builder.append_null(*existing);
+                break;
+            }
+            case simdjson::ondemand::json_type::object:
+            case simdjson::ondemand::json_type::array: {
+                // Nested object/array: stored as serialized JSON text
+                auto json_str = SVH::to_json_string(sub_val);
+                auto idx =
+                    builder.add_or_get_column(full_key, ColumnType::STRING);
+                if (json_str) {
+                    builder.append_string(
+                        idx, arena.push(json_str->data(), json_str->size()));
+                } else {
+                    builder.append_null(idx);
+                }
+                break;
+            }
+            default:  // any other json_type is ignored
+                break;
         }
     }
 }
 
-CoroTask<void> produce_arrow_batches(std::shared_ptr<ArrowIteratorState> state,
-                                     TraceReaderConfig cfg, ReadConfig rc,
-                                     std::size_t batch_size,
-                                     bool flatten_objects = false,
-                                     bool normalize = false) {
-    auto *sp = state.get();
-    try {
-        TraceReader reader(std::move(cfg));
-        auto gen = reader.read_lines(rc);
-        RecordBatchBuilder builder;
-        builder.reserve(batch_size);
+// Appends the JSON record exposed by `parser` to `builder` as one row.
+// normalize: delegate to normalize_row() and return its result. Otherwise each
+// top-level field goes to a column typed after its JSON value; returns true.
+static bool build_arrow_row(RecordBatchBuilder &builder, JsonParser &parser,
+                            StringArena &arena, bool normalize) {
+    if (normalize) return normalize_row(builder, arena, parser);
+
+    using SVH = JsonValueHelper;
+    namespace od = simdjson::ondemand;
+    parser.for_each_field([&](std::string_view column, od::value field) {
+        auto kind = field.type();
+        if (kind.error()) return;  // unreadable field: nothing is appended
+        switch (kind.value_unsafe()) {
+            case od::json_type::number: {
+                auto parsed = field.get_number();
+                if (parsed.error()) break;
+                auto number = parsed.value_unsafe();
+                if (number.is_int64()) {
+                    builder.append_int64(
+                        builder.add_or_get_column(column, ColumnType::INT64),
+                        number.get_int64());
+                } else if (number.is_uint64()) {
+                    builder.append_uint64(
+                        builder.add_or_get_column(column, ColumnType::UINT64),
+                        number.get_uint64());
+                } else {
+                    builder.append_double(
+                        builder.add_or_get_column(column, ColumnType::DOUBLE),
+                        number.get_double());
+                }
+                break;
+            }
+            case od::json_type::string: {
+                auto text = field.get_string();
+                if (text.error()) break;
+                // NOTE(review): confirm builder copies; view may alias parser
+                builder.append_string(
+                    builder.add_or_get_column(column, ColumnType::STRING),
+                    text.value_unsafe());
+                break;
+            }
+            case od::json_type::boolean: {
+                auto flag = field.get_bool();
+                if (flag.error()) break;
+                builder.append_bool(
+                    builder.add_or_get_column(column, ColumnType::BOOL),
+                    flag.value_unsafe());
+                break;
+            }
+            case od::json_type::null:
+                // A null never creates a column; it only fills an existing one.
+                if (auto known = builder.find_column(column))
+                    builder.append_null(*known);
+                break;
+            case od::json_type::object:
+            case od::json_type::array: {
+                auto dumped = SVH::to_json_string(field);
+                const std::size_t slot =
+                    builder.add_or_get_column(column, ColumnType::STRING);
+                if (dumped)
+                    builder.append_string(
+                        slot, arena.push(dumped->data(), dumped->size()));
+                else
+                    builder.append_null(slot);
+                break;
+            }
+            default:
+                break;
+        }
+    });
+    builder.end_row();
+    return true;
+}
 
-        std::vector<yyjson_doc *> held_docs;
-        StringArena arena;
-        held_docs.reserve(batch_size);
+// Trims, parses and appends `content` as one row; false if any step rejects.
+static bool process_json_line(RecordBatchBuilder &builder, JsonParser &parser,
+                              StringArena &arena, std::string_view content,
+                              bool normalize) {
+    const char *begin = nullptr;
+    std::size_t length = 0;
+    return dftracer::utils::json_trim_and_validate_with_comma(
+               content.data(), content.size(), begin, length) &&
+           parser.parse(std::string_view(begin, length)) &&
+           build_arrow_row(builder, parser, arena, normalize);
+}
 
-        while (auto opt = co_await gen.next()) {
-            if (sp->cancelled.load(std::memory_order_acquire)) break;
+// Streams Arrow batches for one trace file into `chan`; polls `cancelled`
+// between items and returns early when a send yields false.
+static CoroTask<void> produce_arrow_for_file(
+    dftracer::utils::coro::Channel<ArrowExportResult> *chan,
+    std::string file_path, std::string index_dir, std::size_t checkpoint_size,
+    bool auto_build_index, ReadConfig rc, std::size_t batch_size,
+    bool normalize, std::atomic<bool> *cancelled) {
+    dftracer::utils::coro::ChannelProducer<ArrowExportResult> producer(chan);
+    auto guard = producer.guard();  // presumably marks producer done on exit
-            const char *trimmed;
-            std::size_t trimmed_length;
-            if (!dftracer::utils::json_trim_and_validate(
-                    opt->content.data(), opt->content.size(), trimmed,
-                    trimmed_length)) {
-                continue;
-            }
+    TraceReaderConfig cfg;
+    cfg.file_path = std::move(file_path);
+    cfg.index_dir = std::move(index_dir);
+    cfg.checkpoint_size = checkpoint_size;
+    cfg.auto_build_index = auto_build_index;
+
+    TraceReader reader(std::move(cfg));
+    // Fast path: non-normalized Arrow build happens inside TraceReader.
+    // Normalize still goes through read_json + build_arrow_row for the
+    // richer schema derivation.
+    if (!normalize) {
+        auto batch_gen = reader.read_arrow(rc, batch_size);
+        while (auto batch_opt = co_await batch_gen.next()) {
+            if (cancelled->load(std::memory_order_acquire)) co_return;
+            if (!co_await producer.send(std::move(*batch_opt))) co_return;
+        }
+        co_return;
+    }
 
-            yyjson_doc *doc = yyjson_read(trimmed, trimmed_length, 0);
-            if (!doc) continue;
+    auto gen = reader.read_json(rc);  // normalize path: row-at-a-time
+    RecordBatchBuilder builder;
+    builder.reserve(batch_size);
+    StringArena arena;
 
-            yyjson_val *root = yyjson_doc_get_root(doc);
-            if (!root || !yyjson_is_obj(root)) {
-                yyjson_doc_free(doc);
-                continue;
+    while (auto opt = co_await gen.next()) {
+        if (cancelled->load(std::memory_order_acquire)) co_return;
+        if (!build_arrow_row(builder, *opt->parser, arena, normalize)) continue;
+        if (builder.num_rows() >= batch_size) {
+            auto result = builder.finish();
+            arena.clear();  // assumes finish() copied arena strings - confirm
+            if (!co_await producer.send(std::move(result))) co_return;
+            if (!builder.is_schema_locked()) builder.lock_schema();
+            builder.reset(true);  // true: presumably keep locked schema
+            builder.reserve(batch_size);
+        }
+    }
+    if (builder.num_rows() > 0) {
+        co_await producer.send(builder.finish());  // flush partial batch
+    }
+    co_return;
+}
+
+// Worker: pulls paths from `file_chan` and sends each file's Arrow batches to
+// `out_chan`; returns on cancellation or when a send yields false.
+static CoroTask<void> file_worker(
+    std::shared_ptr<dftracer::utils::coro::Channel<std::string>> file_chan,
+    dftracer::utils::coro::Channel<ArrowExportResult> *out_chan,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t batch_size, bool normalize,
+    std::atomic<bool> *cancelled) {
+    dftracer::utils::coro::ChannelProducer<ArrowExportResult> producer(
+        out_chan);
+    auto guard = producer.guard();  // presumably marks producer done on exit
+    // NOTE(review): the per-file body below mirrors produce_arrow_for_file().
+    while (auto file_path = co_await file_chan->receive()) {
+        if (cancelled->load(std::memory_order_acquire)) co_return;
+        TraceReaderConfig cfg;
+        cfg.file_path = std::move(*file_path);
+        cfg.index_dir = index_dir;  // copied: reused for every file
+        cfg.checkpoint_size = checkpoint_size;
+        cfg.auto_build_index = auto_build_index;
+        TraceReader reader(std::move(cfg));
+        if (!normalize) {
+            auto batch_gen = reader.read_arrow(rc, batch_size);
+            while (auto batch_opt = co_await batch_gen.next()) {
+                if (cancelled->load(std::memory_order_acquire)) co_return;
+                if (!co_await producer.send(std::move(*batch_opt))) co_return;
             }
+            continue;  // fast path finished; next file
+        }
 
-            if (normalize) {
-                // Produce the semantic output schema directly.
-                // normalize_row calls end_row() internally.
-                if (!normalize_row(builder, arena, root)) {
-                    yyjson_doc_free(doc);
-                    continue;
-                }
-                held_docs.push_back(doc);
-            } else {
-                yyjson_obj_iter iter;
-                yyjson_obj_iter_init(root, &iter);
-                yyjson_val *key;
-                while ((key = yyjson_obj_iter_next(&iter))) {
-                    yyjson_val *val = yyjson_obj_iter_get_val(key);
-                    const char *key_str = yyjson_get_str(key);
-                    std::size_t key_len = yyjson_get_len(key);
-                    std::string_view key_sv(key_str, key_len);
-
-                    if (yyjson_is_int(val)) {
-                        std::size_t idx = builder.add_or_get_column(
-                            key_sv, ColumnType::INT64);
-                        builder.append_int64(idx, yyjson_get_sint(val));
-                    } else if (yyjson_is_uint(val)) {
-                        std::size_t idx = builder.add_or_get_column(
-                            key_sv, ColumnType::UINT64);
-                        builder.append_uint64(idx, yyjson_get_uint(val));
-                    } else if (yyjson_is_real(val)) {
-                        std::size_t idx = builder.add_or_get_column(
-                            key_sv, ColumnType::DOUBLE);
-                        builder.append_double(idx, yyjson_get_real(val));
-                    } else if (yyjson_is_bool(val)) {
-                        std::size_t idx =
-                            builder.add_or_get_column(key_sv, ColumnType::BOOL);
-                        builder.append_bool(idx, yyjson_get_bool(val));
-                    } else if (yyjson_is_str(val)) {
-                        std::size_t idx = builder.add_or_get_column(
-                            key_sv, ColumnType::STRING);
-                        builder.append_string(
-                            idx, std::string_view(yyjson_get_str(val),
-                                                  yyjson_get_len(val)));
-                    } else if (yyjson_is_null(val)) {
-                        auto existing = builder.find_column(key_sv);
-                        if (existing) builder.append_null(*existing);
-                    } else {
-                        std::size_t json_len;
-                        char *json_str = yyjson_val_write(val, 0, &json_len);
-                        std::size_t idx = builder.add_or_get_column(
-                            key_sv, ColumnType::STRING);
-                        if (json_str) {
-                            builder.append_string(
-                                idx, arena.push(json_str, json_len));
-                            free(json_str);
-                        } else {
-                            builder.append_null(idx);
-                        }
-                    }
-                }
-                builder.end_row();
-                held_docs.push_back(doc);
-            }  // end else (raw path)
+        auto gen = reader.read_json(rc);  // normalize path: row-at-a-time
+        RecordBatchBuilder builder;
+        builder.reserve(batch_size);
+        StringArena arena;
 
+        while (auto opt = co_await gen.next()) {
+            if (cancelled->load(std::memory_order_acquire)) co_return;
+            if (!build_arrow_row(builder, *opt->parser, arena, normalize))
+                continue;
             if (builder.num_rows() >= batch_size) {
                 auto result = builder.finish();
-                for (auto *d : held_docs) yyjson_doc_free(d);
-                held_docs.clear();
                 arena.clear();
-
-                {
-                    std::unique_lock<std::mutex> lock(sp->mtx);
-                    sp->cv_producer.wait(lock, [sp] {
-                        return sp->queue.size() < sp->max_queue_size ||
-                               sp->cancelled.load(std::memory_order_acquire);
-                    });
-                    if (sp->cancelled.load(std::memory_order_acquire)) break;
-                    sp->queue.push(std::move(result));
-                }
-                sp->cv_consumer.notify_one();
-                builder.reset(false);
+                if (!co_await producer.send(std::move(result))) co_return;
+                if (!builder.is_schema_locked()) builder.lock_schema();
+                builder.reset(true);  // true: presumably keep locked schema
                 builder.reserve(batch_size);
             }
         }
-
         if (builder.num_rows() > 0) {
-            auto result = builder.finish();
-            for (auto *d : held_docs) yyjson_doc_free(d);
-            held_docs.clear();
-            arena.clear();
-            {
-                std::lock_guard<std::mutex> lock(sp->mtx);
-                sp->queue.push(std::move(result));
-            }
-            sp->cv_consumer.notify_one();
-        } else {
-            for (auto *d : held_docs) yyjson_doc_free(d);
+            if (!co_await producer.send(builder.finish())) co_return;
         }
-    } catch (...) {
-        std::lock_guard<std::mutex> lock(sp->mtx);
-        sp->error = std::current_exception();
-        sp->queue.push(std::nullopt);
-        sp->done.store(true, std::memory_order_release);
-        sp->cv_consumer.notify_one();
-        co_return;
     }
-    {
-        std::lock_guard<std::mutex> lock(sp->mtx);
-        sp->queue.push(std::nullopt);
-        sp->done.store(true, std::memory_order_release);
-    }
-    sp->cv_consumer.notify_one();
+    co_return;
 }
 
-#endif  // DFTRACER_UTILS_ENABLE_ARROW
+// Collects the (field path, literal text) pairs of a pure AND-of-EQ predicate.
+// Any other node kind (NE, range ops, IN, NOT, OR) yields nullopt, which tells
+// the caller that the uniform-match shortcut cannot be used.
+static std::optional<std::vector<std::pair<std::string, std::string>>>
+extract_eq_leaves(
+    const dftracer::utils::utilities::common::query::QueryNode &node) {
+    namespace qn = dftracer::utils::utilities::common::query;
+    using Leaves = std::vector<std::pair<std::string, std::string>>;
+
+    // Literal as text: string verbatim, bool true/false, number to_string.
+    const auto stringify = [](const qn::LiteralNode &literal) -> std::string {
+        return std::visit(
+            [](const auto &held) -> std::string {
+                using Held = std::decay_t<decltype(held)>;
+                if constexpr (std::is_same_v<Held, std::string>) {
+                    return held;
+                } else if constexpr (std::is_same_v<Held, bool>) {
+                    return held ? "true" : "false";
+                } else if constexpr (std::is_same_v<Held, int64_t> ||
+                                     std::is_same_v<Held, uint64_t> ||
+                                     std::is_same_v<Held, double>) {
+                    return std::to_string(held);
+                } else {
+                    return {};
+                }
+            },
+            literal.value);
+    };
 
-TraceReaderConfig build_config(TraceReaderObject *self) {
-    TraceReaderConfig cfg;
-    cfg.file_path = PyUnicode_AsUTF8(self->file_path);
-    const char *idx = PyUnicode_AsUTF8(self->index_dir);
-    if (idx) cfg.index_dir = idx;
-    cfg.checkpoint_size = self->checkpoint_size;
-    cfg.auto_build_index = self->auto_build_index != 0;
-    cfg.index_threshold = self->index_threshold;
-    return cfg;
+    return std::visit(
+        [&stringify](const auto &alt) -> std::optional<Leaves> {
+            using Alt = std::decay_t<decltype(alt)>;
+            if constexpr (std::is_same_v<Alt, qn::CompareNode>) {
+                if (alt.op == qn::CompareOp::EQ)
+                    return Leaves{{alt.field.path, stringify(alt.value)}};
+                return std::nullopt;
+            } else if constexpr (std::is_same_v<Alt, qn::AndNode>) {
+                auto lhs = extract_eq_leaves(*alt.left);
+                auto rhs = lhs ? extract_eq_leaves(*alt.right) : std::nullopt;
+                if (!lhs || !rhs) return std::nullopt;
+                lhs->insert(lhs->end(), rhs->begin(), rhs->end());
+                return lhs;
+            } else {
+                return std::nullopt;
+            }
+        },
+        node.data);
 }
 
-static Runtime *get_runtime(TraceReaderObject *self) {
-    if (self->runtime_obj) {
-        return ((RuntimeObject *)self->runtime_obj)->runtime.get();
+// True iff, for every leaf, each index in `chunk_idxs` has a dim_stats row with
+// min_value == max_value == the leaf literal. Empty `leaves` or `chunk_idxs`
+// -> false (no shortcut); a missing row for any (chunk, leaf) -> false too.
+static bool all_chunks_uniform_match(
+    const dftracer::utils::utilities::indexer::IndexDatabase &db, int fid,
+    const std::vector<std::pair<std::string, std::string>> &leaves,
+    const std::vector<std::uint64_t> &chunk_idxs) {
+    if (leaves.empty() || chunk_idxs.empty()) return false;
+    namespace indexing = dftracer::utils::utilities::composites::dft::indexing;
+
+    for (const auto &[dim, val] : leaves) {
+        auto rows = db.query_chunk_dimension_stats_for_dimension(fid, dim);
+        if (rows.empty()) return false;
+        std::unordered_map<std::uint64_t,
+                           const indexing::ChunkDimensionStatsResult *>
+            by_ckpt;  // borrows from `rows`; valid for this leaf only
+        by_ckpt.reserve(rows.size());
+        for (const auto &r : rows) by_ckpt.emplace(r.checkpoint_idx, &r);
+        for (auto cidx : chunk_idxs) {
+            auto it = by_ckpt.find(cidx);
+            if (it == by_ckpt.end()) return false;
+            const auto &ds = *it->second;
+            if (ds.min_value != val || ds.max_value != val) return false;
+        }
     }
-    return get_default_runtime();
+    return true;
 }
 
-static TraceReaderIteratorObject *make_iterator(
-    std::shared_ptr<IteratorState> state, IteratorMode mode) {
-    TraceReaderIteratorObject *it =
-        (TraceReaderIteratorObject *)TraceReaderIteratorType.tp_alloc(
-            &TraceReaderIteratorType, 0);
-    if (!it) return NULL;
-    new (&it->state) std::shared_ptr<IteratorState>(std::move(state));
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
-    new (&it->arrow_state) std::shared_ptr<ArrowIteratorState>();
-#endif
-    it->mode = mode;
-    return it;
-}
+// Work unit for checkpoint-level parallelism: a byte range (or a line range
+// when start_line/end_line are set) covering one or more consecutive
+// checkpoints of a single file. Decompression of a gz file is sequential
+// per gzip stream, so splitting at checkpoint-aligned byte offsets is what
+// lets multiple workers share the decode work for one file.
+struct ArrowWorkItem {
+    std::string file_path;
+    std::size_t start_byte = 0;  // 0/0 on line-range and unsplit items
+    std::size_t end_byte = 0;
+    bool start_at_checkpoint = false;
+    bool end_at_checkpoint = false;
+    // When true, every kept chunk for this byte range is uniform-matching
+    // (dim_stats min == max == predicate literal for every AND-of-EQ leaf),
+    // so per-event predicate eval is skippable.
+    bool chunk_prune_only = false;
+    // Line-range work items override byte ranges: the worker passes these
+    // down as LINE_RANGE on the read, and the gzip stream resolves them to
+    // byte offsets via the checkpoint index. 0 = no line constraint.
+    std::size_t start_line = 0;
+    std::size_t end_line = 0;
+};
 
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
-static TraceReaderIteratorObject *make_arrow_iterator(
-    std::shared_ptr<ArrowIteratorState> state) {
-    TraceReaderIteratorObject *it =
-        (TraceReaderIteratorObject *)TraceReaderIteratorType.tp_alloc(
-            &TraceReaderIteratorType, 0);
-    if (!it) return NULL;
-    new (&it->state) std::shared_ptr<IteratorState>();
-    new (&it->arrow_state)
-        std::shared_ptr<ArrowIteratorState>(std::move(state));
-    it->mode = IteratorMode::ARROW;
-    return it;
-}
-#endif
+static std::vector<ArrowWorkItem> enumerate_work_items(
+    const std::vector<std::string> &files, const std::string &index_dir,
+    const std::string &query_str, std::size_t max_workers,
+    std::size_t clip_start_byte = 0, std::size_t clip_end_byte = 0,
+    std::size_t clip_start_line = 0, std::size_t clip_end_line = 0) {
+    namespace dft_internal =
+        dftracer::utils::utilities::composites::dft::internal;
+    namespace indexer_ns = dftracer::utils::utilities::indexer;
+    namespace indexing = dftracer::utils::utilities::composites::dft::indexing;
+
+    std::vector<ArrowWorkItem> items;
+    items.reserve(files.size() * 4);  // initial guess: ~4 items per file
+
+    const bool has_line_clip = (clip_start_line > 0 || clip_end_line > 0);
+    auto push_unsplit = [&](const std::string &fp) {
+        ArrowWorkItem item;  // whole-file item: byte fields stay 0
+        item.file_path = fp;  // NOTE(review): clip_*_byte not carried over
+        item.start_line = clip_start_line;
+        item.end_line = clip_end_line;
+        items.push_back(std::move(item));
+    };
 
-}  // namespace
+    // Parse the query once. Pruner input copies a Query, so we keep the
+    // parsed form around to feed each ChunkPrunerInput without re-parsing.
+    std::optional<dftracer::utils::utilities::common::query::Query> parsed;
+    if (!query_str.empty()) {
+        auto r = dftracer::utils::utilities::common::query::Query::from_string(
+            query_str);
+        if (r) parsed = std::move(*r);  // on parse failure: no pruning here
+    }
 
-using dftracer::utils::python::wrap_arrow_table;
+    // All files in a directory-mode scan share the same `.dftindex` root.
+    // Group files by their resolved index path so we can open the RocksDB
+    // once per index and reuse it to prune every file against that handle.
+    std::unordered_map<std::string, std::vector<std::size_t>> by_index;
+    for (std::size_t i = 0; i < files.size(); ++i) {
+        std::string index_path =
+            dft_internal::determine_index_path(files[i], index_dir);
+        by_index[index_path].push_back(i);
+    }
+
+    for (auto &entry : by_index) {
+        const auto &index_path = entry.first;
+        const auto &file_idxs = entry.second;
+        if (!fs::exists(index_path)) {
+            for (auto i : file_idxs) push_unsplit(files[i]);  // index missing
+            continue;
+        }
+        std::unique_ptr<indexer_ns::IndexDatabase> idx_db;
+        try {
+            idx_db = std::make_unique<indexer_ns::IndexDatabase>(
+                index_path,
+                dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        } catch (...) {
+            for (auto i : file_idxs) push_unsplit(files[i]);  // open failed
+            continue;
+        }
+
+        // Resolve fid + checkpoints per file (cheap queries).
+        struct FileCtx {
+            std::size_t file_idx;
+            int fid;
+            std::vector<indexer_ns::IndexerCheckpoint> ckpts;
+        };
+        std::vector<FileCtx> file_ctxs;
+        file_ctxs.reserve(file_idxs.size());
+        for (auto i : file_idxs) {
+            FileCtx fc;
+            fc.file_idx = i;
+            fc.fid = idx_db->get_file_info_id(
+                indexer_ns::internal::get_logical_path(files[i]));
+            if (fc.fid < 0) {  // no file id in this index -> unsplit
+                push_unsplit(files[i]);
+                continue;
+            }
+            fc.ckpts = idx_db->query_checkpoints(fc.fid);
+            if (fc.ckpts.empty()) {  // no checkpoints -> unsplit
+                push_unsplit(files[i]);
+                continue;
+            }
+            std::sort(fc.ckpts.begin(), fc.ckpts.end(),
+                      [](const auto &a, const auto &b) {
+                          return a.first_line_num < b.first_line_num;
+                      });
+            file_ctxs.push_back(std::move(fc));
+        }
+
+        // Batch-prune all files against the shared index: dim_stats and
+        // chunk_statistics are loaded in one RocksDB scan each instead of
+        // one scan per file.
+        std::vector<indexing::ChunkPrunerOutput> pruner_outs(file_ctxs.size());
+        if (parsed && !file_ctxs.empty()) {
+            indexing::ChunkPrunerBatchInput batch_in;
+            batch_in.index_path = index_path;
+            batch_in.external_db = idx_db.get();
+            batch_in.items.reserve(file_ctxs.size());
+            for (auto &fc : file_ctxs) {
+                batch_in.items.push_back({files[fc.file_idx], *parsed});
+            }
+            indexing::ChunkPrunerUtility pruner;
+            auto batch_out = pruner.process_batch(batch_in);
+            if (batch_out.success) {  // NOTE(review): outputs.size() unverified
+                pruner_outs = std::move(batch_out.outputs);
+            }
+        }
+
+        // For AND-of-EQ predicates, extract the uniform-match leaves (done
+        // per index group; depends only on `parsed`). file_pure_match below
+        // lets workers skip per-event predicate eval when every kept chunk
+        // has dim_stats min == max == literal for every leaf.
+        std::optional<std::vector<std::pair<std::string, std::string>>>
+            eq_leaves;
+        if (parsed) eq_leaves = extract_eq_leaves(parsed->root());
+
+        for (std::size_t fc_idx = 0; fc_idx < file_ctxs.size(); ++fc_idx) {
+            auto &fc = file_ctxs[fc_idx];
+            const auto &fp = files[fc.file_idx];
+
+            // Pruner chunk_idx semantics: 0-indexed over uncompressed
+            // slices. fc.ckpts holds gzip recovery points; recovery point
+            // fc.ckpts[k] sits at the START of pruner chunk (k+1). Pruner
+            // chunk 0 has no recovery point at its start (decoded from
+            // gzip stream start). Total pruner chunks = fc.ckpts.size()+1.
+            const std::size_t total_chunks = fc.ckpts.size() + 1;
+            auto chunk_start_byte = [&](std::uint64_t cidx) -> std::size_t {
+                if (cidx == 0) return 0;
+                return fc.ckpts[cidx - 1].uc_offset;
+            };
+            auto chunk_end_byte = [&](std::uint64_t cidx) -> std::size_t {
+                if (cidx == 0)
+                    return fc.ckpts.empty() ? 0 : fc.ckpts[0].uc_offset;
+                std::size_t k = cidx - 1;
+                return fc.ckpts[k].uc_offset + fc.ckpts[k].uc_size;
+            };
+            // Line ranges for a chunk. Chunk 0 covers everything before the
+            // first recovery point; chunk k>=1 spans recovery point (k-1).
+            auto chunk_first_line = [&](std::uint64_t cidx) -> std::size_t {
+                if (cidx == 0) return 1;
+                return fc.ckpts[cidx - 1].first_line_num;
+            };
+            auto chunk_last_line = [&](std::uint64_t cidx) -> std::size_t {
+                if (cidx == 0) {
+                    if (fc.ckpts.empty()) return SIZE_MAX;
+                    return fc.ckpts[0].first_line_num > 0
+                               ? fc.ckpts[0].first_line_num - 1
+                               : 0;
+                }
+                return fc.ckpts[cidx - 1].last_line_num;
+            };
+
+            std::vector<std::uint64_t> keep_chunks;
+            keep_chunks.reserve(total_chunks);
+            if (parsed) {
+                const auto &pr = pruner_outs[fc_idx];
+                if (pr.success && !pr.file_may_match) {
+                    continue;  // whole file pruned
+                }
+                if (pr.success && !pr.candidate_checkpoints.empty() &&
+                    pr.candidate_checkpoints.size() < pr.total_checkpoints) {
+                    for (auto cidx : pr.candidate_checkpoints) {
+                        if (cidx < total_chunks) keep_chunks.push_back(cidx);
+                    }
+                    std::sort(keep_chunks.begin(), keep_chunks.end());
+                    keep_chunks.erase(
+                        std::unique(keep_chunks.begin(), keep_chunks.end()),
+                        keep_chunks.end());
+                } else {  // no narrowing from the pruner: keep all chunks
+                    for (std::uint64_t c = 0; c < total_chunks; ++c)
+                        keep_chunks.push_back(c);
+                }
+            } else {
+                for (std::uint64_t c = 0; c < total_chunks; ++c)
+                    keep_chunks.push_back(c);
+            }
+
+            // Intersect with the user's line range so workers only touch
+            // chunks that actually overlap it. Each work item carries the
+            // sub-line-range; LINE_RANGE on the read maps it back to bytes
+            // via the same checkpoint table the gzip stream uses.
+            if (has_line_clip) {
+                std::size_t lo = clip_start_line > 0 ? clip_start_line : 1;
+                std::size_t hi = clip_end_line > 0 ? clip_end_line : SIZE_MAX;
+                std::vector<std::uint64_t> filtered;
+                filtered.reserve(keep_chunks.size());
+                for (auto c : keep_chunks) {
+                    std::size_t cf = chunk_first_line(c);
+                    std::size_t cl = chunk_last_line(c);
+                    if (cl < lo || cf > hi) continue;
+                    filtered.push_back(c);
+                }
+                keep_chunks = std::move(filtered);
+            }
+
+            if (keep_chunks.empty()) continue;
+
+            // All-or-nothing per file: if every kept chunk is uniform-matching
+            // for every leaf, every work item from this file gets the
+            // chunk_prune_only fast path. Mixed files fall back to per-event
+            // eval to stay safe.
+            bool file_pure_match = false;
+            if (eq_leaves && !eq_leaves->empty() && idx_db) {
+                file_pure_match = all_chunks_uniform_match(
+                    *idx_db, fc.fid, *eq_leaves, keep_chunks);
+            }
+
+            std::size_t target_ranges = std::max<std::size_t>(1, max_workers);
+            std::size_t per_range = std::max<std::size_t>(
+                1, (keep_chunks.size() + target_ranges - 1) / target_ranges);
+
+            std::size_t group_start = 0;
+            while (group_start < keep_chunks.size()) {
+                std::size_t group_end = group_start;
+                std::size_t emitted = 0;
+                while (group_end < keep_chunks.size() && emitted < per_range) {
+                    if (group_end > group_start &&
+                        keep_chunks[group_end] !=
+                            keep_chunks[group_end - 1] + 1) {
+                        break;  // non-consecutive chunk ends the run
+                    }
+                    ++group_end;
+                    ++emitted;
+                }
+                std::uint64_t scidx = keep_chunks[group_start];
+                std::uint64_t ecidx = keep_chunks[group_end - 1];
+                std::size_t start_byte = chunk_start_byte(scidx);
+                std::size_t end_byte = chunk_end_byte(ecidx);
+                // start_at_checkpoint: a gzip recovery point sits at
+                // start_byte (any cidx>=1; chunk 0 decodes from stream
+                // start). end_at_checkpoint: more kept chunks follow.
+                bool start_at_checkpoint = (scidx >= 1);
+                bool end_at_checkpoint = (group_end < keep_chunks.size());
+                if (has_line_clip) {
+                    std::size_t lo = clip_start_line > 0 ? clip_start_line : 1;
+                    std::size_t hi =
+                        clip_end_line > 0 ? clip_end_line : SIZE_MAX;
+                    std::size_t cluster_first = chunk_first_line(scidx);
+                    std::size_t cluster_last = chunk_last_line(ecidx);
+                    std::size_t item_start =
+                        std::max<std::size_t>(lo, cluster_first);
+                    std::size_t item_end =
+                        std::min<std::size_t>(hi, cluster_last);
+                    if (item_start > item_end) {  // empty intersection
+                        group_start = group_end;
+                        continue;
+                    }
+                    ArrowWorkItem item;
+                    item.file_path = fp;
+                    item.chunk_prune_only = file_pure_match;
+                    item.start_line = item_start;
+                    item.end_line = item_end;
+                    items.push_back(std::move(item));
+                    group_start = group_end;
+                    continue;
+                }
+                if (clip_end_byte > clip_start_byte) {  // byte clip active
+                    if (start_byte < clip_start_byte) {
+                        start_byte = clip_start_byte;
+                        start_at_checkpoint = false;
+                    }
+                    if (end_byte > clip_end_byte) {
+                        end_byte = clip_end_byte;
+                        end_at_checkpoint = false;
+                    }
+                    if (start_byte >= end_byte) {
+                        group_start = group_end;
+                        continue;
+                    }
+                }
+                items.push_back({fp, start_byte, end_byte, start_at_checkpoint,
+                                 end_at_checkpoint, file_pure_match});
+                group_start = group_end;
+            }
+        }
+    }
+    return items;
+}
+
+static CoroTask<void> send_work_items_to_channel(
+    std::shared_ptr<dftracer::utils::coro::Channel<ArrowWorkItem>> chan,
+    const std::vector<ArrowWorkItem> *items, std::atomic<bool> *cancelled) {
+    for (const auto &it : *items) {
+        if (cancelled->load(std::memory_order_acquire)) break;  // stop early
+        if (!co_await chan->send(it)) break;  // send refused: stop
+    }
+    chan->close();  // reached on every exit path above
+    co_return;
+}
+
+static CoroTask<void> checkpoint_worker(
+    std::shared_ptr<dftracer::utils::coro::Channel<ArrowWorkItem>> work_chan,
+    dftracer::utils::coro::Channel<ArrowExportResult> *out_chan,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t batch_size, bool normalize,
+    std::atomic<bool> *cancelled) {
+    dftracer::utils::coro::ChannelProducer<ArrowExportResult> producer(
+        out_chan);
+    auto guard = producer.guard();
+
+    // Cache readers keyed by file path so we don't re-probe the same file
+    // when successive work items land on it.
+    std::unordered_map<std::string, std::shared_ptr<TraceReader>> readers;
+
+    while (auto item = co_await work_chan->receive()) {
+        if (cancelled->load(std::memory_order_acquire)) co_return;
+
+        auto &reader_ptr = readers[item->file_path];  // null on first use
+        if (!reader_ptr) {
+            TraceReaderConfig cfg;
+            cfg.file_path = item->file_path;
+            cfg.index_dir = index_dir;
+            cfg.checkpoint_size = checkpoint_size;
+            cfg.auto_build_index = auto_build_index;
+            reader_ptr = std::make_shared<TraceReader>(std::move(cfg));
+        }
+
+        ReadConfig local_rc = rc;
+        if (item->start_line > 0 || item->end_line > 0) {
+            // Line-range work items: the read drives off LINE_RANGE; the
+            // gzip stream resolves it back to byte offsets via checkpoints.
+            local_rc.start_line = item->start_line;
+            local_rc.end_line = item->end_line;
+            local_rc.start_byte = 0;
+            local_rc.end_byte = 0;
+            local_rc.start_at_checkpoint = false;
+            local_rc.end_at_checkpoint = false;
+        } else {
+            local_rc.start_byte = item->start_byte;
+            local_rc.end_byte = item->end_byte;
+            local_rc.start_at_checkpoint = item->start_at_checkpoint;
+            local_rc.end_at_checkpoint = item->end_at_checkpoint;
+        }
+        // Pruning already happened at enumeration time; avoid the per-
+        // work-item RocksDB opens that would otherwise dwarf the actual
+        // read cost at directory scale (256 files * N ranges).
+        local_rc.skip_pruning = true;
+        // chunks pre-classified as uniform-matching skip per-event eval.
+        if (item->chunk_prune_only) local_rc.chunk_prune_only = true;
+
+        if (!normalize) {
+            auto batch_gen = reader_ptr->read_arrow(local_rc, batch_size);
+            while (auto batch_opt = co_await batch_gen.next()) {
+                if (cancelled->load(std::memory_order_acquire)) co_return;
+                if (!co_await producer.send(std::move(*batch_opt))) co_return;
+            }
+            continue;  // arrow path done; next work item
+        }
+
+        auto gen = reader_ptr->read_json(local_rc);
+        RecordBatchBuilder builder;
+        builder.reserve(batch_size);
+        StringArena arena;
+
+        while (auto opt = co_await gen.next()) {
+            if (cancelled->load(std::memory_order_acquire)) co_return;
+            if (!build_arrow_row(builder, *opt->parser, arena, normalize))
+                continue;  // row rejected by build_arrow_row
+            if (builder.num_rows() >= batch_size) {
+                auto result = builder.finish();
+                arena.clear();
+                if (!co_await producer.send(std::move(result))) co_return;
+                if (!builder.is_schema_locked()) builder.lock_schema();
+                builder.reset(true);
+                builder.reserve(batch_size);
+            }
+        }
+        if (builder.num_rows() > 0) {  // flush the partial last batch
+            if (!co_await producer.send(builder.finish())) co_return;
+        }
+    }
+    co_return;
+}
+
+static CoroTask<void> spawn_arrow_producers(
+    CoroScope &child,
+    dftracer::utils::coro::Channel<ArrowExportResult> *out_chan,
+    const std::vector<ArrowWorkItem> *work_items, const std::string *index_dir,
+    std::size_t checkpoint_size, bool auto_build_index, const ReadConfig *rc,
+    std::size_t batch_size, bool normalize, std::atomic<bool> *cancelled_ptr,
+    std::size_t max_workers) {
+    std::size_t num_workers = std::min(work_items->size(), max_workers);
+    if (num_workers == 0) num_workers = 1;  // keep at least one worker
+    auto work_chan =
+        dftracer::utils::coro::make_channel<ArrowWorkItem>(num_workers);
+
+    for (std::size_t i = 0; i < num_workers; ++i) {
+        child.spawn([out_chan, wc = work_chan, idx = *index_dir,
+                     checkpoint_size, auto_build_index, r = *rc, batch_size,
+                     normalize, cancelled_ptr](CoroScope &) {
+            return checkpoint_worker(wc, out_chan, idx, checkpoint_size,
+                                     auto_build_index, r, batch_size, normalize,
+                                     cancelled_ptr);
+        });
+    }
+
+    child.spawn([wc = work_chan, work_items, cancelled_ptr](CoroScope &) {
+        return send_work_items_to_channel(wc, work_items, cancelled_ptr);
+    });
+    co_return;  // tasks were spawned on `child`; none awaited here
+}
+
+static CoroTask<void> produce_arrow_batches_for_files(
+    CoroScope &scope, ArrowIteratorState *sp, std::vector<std::string> files,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t batch_size, bool normalize,
+    std::size_t max_workers) {
+    try {
+        if (files.empty()) {
+            sp->channel->close();  // no input files
+            co_return;
+        }
+
+        auto work_items = enumerate_work_items(
+            files, index_dir, rc.query, max_workers, rc.start_byte, rc.end_byte,
+            rc.start_line, rc.end_line);
+        if (work_items.empty()) {
+            sp->channel->close();  // everything pruned or clipped away
+            co_return;
+        }
+
+        auto *chan_ptr = sp->channel.get();
+        auto *cancelled_ptr = &sp->cancelled;
+
+        co_await scope.scope([chan_ptr, &work_items, &index_dir,
+                              checkpoint_size, auto_build_index, &rc,
+                              batch_size, normalize, cancelled_ptr,
+                              max_workers](CoroScope &child) -> CoroTask<void> {
+            co_await spawn_arrow_producers(
+                child, chan_ptr, &work_items, &index_dir, checkpoint_size,
+                auto_build_index, &rc, batch_size, normalize, cancelled_ptr,
+                max_workers);
+        });
+    } catch (...) {
+        sp->set_error(std::current_exception());
+    }
+}
+
+static CoroTask<void> produce_arrow_batches_parallel(
+    CoroScope &scope, ArrowIteratorState *sp, std::string dir_path,
+    std::string index_dir, std::size_t checkpoint_size, bool auto_build_index,
+    ReadConfig rc, std::size_t batch_size, bool normalize,
+    std::size_t max_workers) {
+    try {
+        PatternDirectoryScannerUtility scanner;
+        auto scan_input = PatternDirectoryScannerUtilityInput(
+            dir_path, {".pfw", ".pfw.gz"}, true, false);
+        auto entries = co_await scope.spawn(scanner, scan_input);
+
+        std::vector<std::string> files;
+        files.reserve(entries.size());
+        for (auto &e : entries) files.push_back(e.path.string());
+        std::sort(files.begin(), files.end());  // lexicographic path order
+
+        co_await produce_arrow_batches_for_files(
+            scope, sp, std::move(files), std::move(index_dir), checkpoint_size,
+            auto_build_index, std::move(rc), batch_size, normalize,
+            max_workers);
+    } catch (...) {  // covers the scan; callee catches its own
+        sp->set_error(std::current_exception());
+    }
+}
+
+CoroTask<void> produce_arrow_batches(
+    std::shared_ptr<ArrowIteratorState> state,
+    dftracer::utils::coro::ChannelProducer<ArrowExportResult> producer,
+    TraceReaderConfig cfg, ReadConfig rc, std::size_t batch_size,
+    bool flatten_objects = false, bool normalize = false) {
+    (void)flatten_objects;  // parameter currently unused
+
+    auto guard = producer.guard();
+    try {
+        TraceReader reader(std::move(cfg));
+
+        if (!normalize) {
+            auto batch_gen = reader.read_arrow(rc, batch_size);
+            while (auto batch_opt = co_await batch_gen.next()) {
+                if (state->cancelled.load(std::memory_order_acquire)) break;
+                auto result_bytes =  // counted before the send below
+                    dftracer::utils::python::byte_size(*batch_opt);
+                state->bytes_in_queue.fetch_add(result_bytes,
+                                                std::memory_order_acq_rel);
+                if (!co_await producer.send(std::move(*batch_opt))) break;
+            }
+            co_return;
+        }
+
+        auto gen = reader.read_json(rc);
+        RecordBatchBuilder builder;
+        builder.reserve(batch_size);
+
+        StringArena arena;
+
+        while (auto opt = co_await gen.next()) {
+            if (state->cancelled.load(std::memory_order_acquire)) break;
+            if (!build_arrow_row(builder, *opt->parser, arena, normalize))
+                continue;
+
+            if (builder.num_rows() >= batch_size) {
+                auto result = builder.finish();
+                arena.clear();
+                auto result_bytes = dftracer::utils::python::byte_size(result);
+                state->bytes_in_queue.fetch_add(result_bytes,
+                                                std::memory_order_acq_rel);
+                if (!co_await producer.send(std::move(result))) break;
+                if (!builder.is_schema_locked()) {
+                    builder.lock_schema();
+                }
+                builder.reset(true);
+                builder.reserve(batch_size);
+            }
+        }
+
+        if (builder.num_rows() > 0 &&
+            !state->cancelled.load(std::memory_order_acquire)) {
+            auto result = builder.finish();
+            auto result_bytes = dftracer::utils::python::byte_size(result);
+            state->bytes_in_queue.fetch_add(result_bytes,
+                                            std::memory_order_acq_rel);
+            co_await producer.send(std::move(result));  // result ignored
+        }
+    } catch (...) {
+        state->set_error(std::current_exception());
+    }
+}
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+struct WriteArrowStats {
+    std::unordered_map<std::string, PartitionWriteStats> partitions;  // by view
+    int64_t total_rows = 0;  // summed over views by write_arrow_pipeline
+    int64_t total_uncompressed_bytes = 0;
+};
+
+struct WriteArrowResult {
+    WriteArrowStats stats;
+    std::string error;  // stays empty unless write_arrow_pipeline fails
+    std::uint64_t chunks_scanned = 0;  // candidates fully read
+    std::uint64_t chunks_skipped = 0;  // sum of skipped_checkpoints per view
+};
+
+CoroTask<WriteArrowResult> write_arrow_pipeline(
+    std::string file_path, std::string index_path, std::size_t checkpoint_size,
+    std::vector<ViewDefinition> views, std::string output_path,
+    int64_t chunk_size_bytes, IpcCompression compression,
+    std::size_t event_batch_size) {
+    namespace dft_internal =
+        dftracer::utils::utilities::composites::dft::internal;
+    WriteArrowResult result;
+
+    try {
+        if (views.empty()) {
+            views.push_back(ViewDefinition().with_name("all"));  // default
+        }
+
+        std::string resolved_index =
+            index_path.empty()
+                ? dft_internal::determine_index_path(file_path, "")
+                : index_path;
+
+        auto meta_input = MetadataCollectorUtilityInput::from_file(file_path)
+                              .with_checkpoint_size(checkpoint_size)
+                              .with_index(resolved_index);
+        auto metadata = co_await MetadataCollectorUtility{}.process(meta_input);
+        if (!metadata.success) {
+            result.error =
+                "Failed to collect metadata: " + metadata.error_message;
+            co_return result;
+        }
+
+        for (const auto &view : views) {
+            std::string view_output = output_path;
+            if (views.size() > 1 || view.name != "all") {  // per-view subdir
+                view_output = output_path + "/" + view.name;
+            }
+
+            PartitionWriter writer;
+            int rc_open = co_await writer.open(view_output, chunk_size_bytes,
+                                               compression);
+            if (rc_open != 0) {
+                result.error =
+                    "Failed to open partition writer for view: " + view.name;
+                co_return result;
+            }
+
+            ViewBuilderInput builder_input;
+            builder_input.with_view(view)
+                .with_file_path(file_path)
+                .with_index_path(resolved_index)
+                .with_uncompressed_size(metadata.uncompressed_size)
+                .with_num_checkpoints(metadata.num_checkpoints);
+
+            auto build_output =
+                co_await ViewBuilderUtility{}.process(builder_input);
+            if (!build_output.success) {
+                result.error = "ViewBuilder failed for view: " + view.name;
+                co_return result;
+            }
+
+            result.chunks_skipped += build_output.skipped_checkpoints;
+
+            if (!build_output.file_may_match) {  // no candidate data
+                auto stats = co_await writer.close();
+                result.stats.partitions[view.name] = std::move(stats);
+                continue;
+            }
+
+            RecordBatchBuilder builder;
+            bool schema_locked = false;
+
+            for (const auto &candidate : build_output.candidates) {
+                ViewReaderInput reader_input;
+                reader_input.with_file_path(file_path)
+                    .with_index_path(resolved_index)
+                    .with_checkpoint_size(checkpoint_size)
+                    .with_byte_range(candidate.start_byte, candidate.end_byte)
+                    .with_checkpoint_idx(candidate.checkpoint_idx)
+                    .with_event_batch_size(event_batch_size)
+                    .with_view(view);
+                reader_input.query = view.query;
+
+                ViewReaderUtility reader;
+                auto gen = reader.process(reader_input);
+                while (auto opt = co_await gen.next()) {
+                    auto arrow_batch = opt->to_arrow(builder);
+                    int rc_write = co_await writer.write_batch(arrow_batch);
+                    if (rc_write != 0) {
+                        result.error =
+                            "Failed to write batch for view: " + view.name;
+                        co_return result;  // NOTE(review): no writer.close()
+                    }
+                    if (!schema_locked) {
+                        builder.lock_schema();
+                        schema_locked = true;
+                    }
+                    builder.reset(true);
+                }
+                result.chunks_scanned++;  // one per candidate read
+            }
+
+            auto stats = co_await writer.close();
+            result.stats.partitions[view.name] = std::move(stats);
+            result.stats.total_rows +=
+                result.stats.partitions[view.name].total_rows;
+            result.stats.total_uncompressed_bytes +=
+                result.stats.partitions[view.name].total_uncompressed_bytes;
+        }
+    } catch (const std::exception &e) {  // reported via result.error
+        result.error = e.what();
+    }
+    co_return result;
+}
+
+struct ViewChunkInfo {  // one ViewBuilder candidate, as stored in `chunks`
+    std::uint64_t checkpoint_idx = 0;  // candidate's checkpoint index
+    std::size_t start_byte = 0;        // candidate's start_byte, copied as-is
+    std::size_t end_byte = 0;          // candidate's end_byte, copied as-is
+};
+
+struct GetViewChunksResult {  // outcome of get_view_chunks_pipeline
+    std::vector<ViewChunkInfo> chunks;  // one entry per ViewBuilder candidate
+    std::uint64_t total_checkpoints = 0;    // copied from the ViewBuilder output
+    std::uint64_t skipped_checkpoints = 0;  // copied from the ViewBuilder output
+    bool file_may_match = false;  // copied from the ViewBuilder output
+    std::string error;  // set when a step fails; left empty otherwise
+};
+
+CoroTask<GetViewChunksResult> get_view_chunks_pipeline(
+    std::string file_path, std::string index_path, std::size_t checkpoint_size,
+    ViewDefinition view) {
+    // Collects file metadata, runs ViewBuilderUtility for `view`, and returns
+    // the candidate chunks it selected plus its checkpoint counters.
+    namespace dft_internal =
+        dftracer::utils::utilities::composites::dft::internal;
+    GetViewChunksResult out;
+    try {
+        // An empty index_path means the index location is derived from the file.
+        std::string index_file = index_path;
+        if (index_file.empty()) {
+            index_file = dft_internal::determine_index_path(file_path, "");
+        }
+
+        auto meta_request = MetadataCollectorUtilityInput::from_file(file_path)
+                                .with_checkpoint_size(checkpoint_size)
+                                .with_index(index_file);
+        auto meta = co_await MetadataCollectorUtility{}.process(meta_request);
+        if (!meta.success) {
+            out.error = "Failed to collect metadata: " + meta.error_message;
+            co_return out;
+        }
+
+        ViewBuilderInput plan_request;
+        plan_request.with_view(view)
+            .with_file_path(file_path)
+            .with_index_path(index_file)
+            .with_uncompressed_size(meta.uncompressed_size)
+            .with_num_checkpoints(meta.num_checkpoints);
+        auto plan = co_await ViewBuilderUtility{}.process(plan_request);
+        if (!plan.success) {
+            out.error = "ViewBuilder failed";
+            co_return out;
+        }
+
+        // Copy the builder's summary, then one ViewChunkInfo per candidate.
+        out.file_may_match = plan.file_may_match;
+        out.total_checkpoints = plan.total_checkpoints;
+        out.skipped_checkpoints = plan.skipped_checkpoints;
+        for (const auto &chunk : plan.candidates) {
+            out.chunks.push_back(ViewChunkInfo{
+                chunk.checkpoint_idx, chunk.start_byte, chunk.end_byte});
+        }
+    } catch (const std::exception &ex) {
+        // Report thrown failures through the result instead of propagating.
+        out.error = ex.what();
+    }
+    co_return out;
+}
+
+struct WriteViewChunkResult {  // outcome of write_view_chunk_pipeline
+    std::string output_file;  // copy of the requested output path
+    std::uint64_t events_matched = 0;  // summed over the reader's batches
+    std::uint64_t events_scanned = 0;  // summed over the reader's batches
+    int64_t rows_written = 0;  // summed num_rows() of valid Arrow batches
+    int64_t bytes_written = 0;  // NOTE(review): write_view_chunk_pipeline never sets this
+    std::string error;  // set when a step fails; left empty otherwise
+};
+
+CoroTask<WriteViewChunkResult> write_view_chunk_pipeline(
+    std::string file_path, std::string index_path, std::size_t checkpoint_size,
+    ViewDefinition view, std::uint64_t checkpoint_idx, std::size_t start_byte,
+    std::size_t end_byte, std::string output_file, IpcCompression compression,
+    std::size_t event_batch_size) {
+    // Reads one chunk (checkpoint_idx + byte range) of file_path through
+    // ViewReaderUtility into output_file; failures land in result.error.
+    namespace dft_internal =
+        dftracer::utils::utilities::composites::dft::internal;
+    WriteViewChunkResult result;
+    result.output_file = output_file;
+    try {
+        std::string resolved_index =
+            index_path.empty()
+                ? dft_internal::determine_index_path(file_path, "")
+                : index_path;
+        dftracer::utils::utilities::common::arrow::IpcWriter writer;
+        int rc_open = co_await writer.open(output_file, compression);
+        if (rc_open != 0) {
+            result.error = "Failed to open output file";
+            co_return result;
+        }
+
+        ViewReaderInput reader_input;
+        reader_input.with_file_path(file_path)
+            .with_index_path(resolved_index)
+            .with_checkpoint_size(checkpoint_size)
+            .with_byte_range(start_byte, end_byte)
+            .with_checkpoint_idx(checkpoint_idx)
+            .with_event_batch_size(event_batch_size)
+            .with_view(view);
+        reader_input.query = view.query;
+        RecordBatchBuilder builder;
+        bool schema_locked = false;  // schema is locked after the first write
+        ViewReaderUtility reader;
+        auto gen = reader.process(reader_input);
+        while (auto opt = co_await gen.next()) {
+            result.events_matched += opt->events_matched;
+            result.events_scanned += opt->events_scanned;
+            auto batch = opt->to_arrow(builder);
+            if (batch.valid()) {
+                // Credit rows only once the write has succeeded.
+                const auto batch_rows = batch.num_rows();
+                int rc = co_await writer.write_batch(batch);
+                if (rc != 0) {
+                    result.error = "Failed to write batch";
+                    co_return result;
+                }
+                result.rows_written += batch_rows;
+                if (!schema_locked) {
+                    builder.lock_schema();
+                    schema_locked = true;
+                }
+                builder.reset(true);
+            }
+        }
+
+        int rc = co_await writer.close();
+        if (rc != 0) {
+            result.error = "Failed to close output file";
+        }
+    } catch (const std::exception &e) {
+        result.error = e.what();
+    }
+    co_return result;
+}
+
+struct ChunkDescriptor {  // one unit of work for write_view_chunks_pipeline
+    std::uint64_t checkpoint_idx = 0;  // forwarded to the per-chunk writer
+    std::size_t start_byte = 0;        // byte range forwarded to the reader
+    std::size_t end_byte = 0;
+    std::string output_file;  // output path handed to the per-chunk writer
+};
+
+struct WriteViewChunksResult {  // outcome of write_view_chunks_pipeline
+    std::vector<WriteViewChunkResult> results;  // results of the per-chunk tasks
+    int64_t total_rows = 0;            // sum of rows_written over `results`
+    int64_t total_events_matched = 0;  // sum of events_matched over `results`
+};
+
+CoroTask<WriteViewChunksResult> write_view_chunks_pipeline(
+    std::string file_path, std::string index_path, std::size_t checkpoint_size,
+    ViewDefinition view, std::vector<ChunkDescriptor> chunks,
+    IpcCompression compression, std::size_t event_batch_size) {
+    // Starts one write_view_chunk_pipeline task per descriptor, awaits them
+    // together via when_all, and totals the per-chunk counters.
+    WriteViewChunksResult summary;
+    if (!chunks.empty()) {
+        std::vector<CoroTask<WriteViewChunkResult>> pending;
+        pending.reserve(chunks.size());
+        for (const ChunkDescriptor &desc : chunks) {
+            pending.push_back(write_view_chunk_pipeline(
+                file_path, index_path, checkpoint_size, view,
+                desc.checkpoint_idx, desc.start_byte, desc.end_byte,
+                desc.output_file, compression, event_batch_size));
+        }
+
+        summary.results = co_await when_all(std::move(pending));
+
+        // Every result contributes to the totals, whether or not its `error`
+        // field is set.
+        for (const WriteViewChunkResult &entry : summary.results) {
+            summary.total_rows += entry.rows_written;
+            summary.total_events_matched += entry.events_matched;
+        }
+    }
+
+    co_return summary;
+}
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+TraceReaderConfig build_config(TraceReaderObject *self) {
+    TraceReaderConfig cfg;  // filled from the Python object's stored settings
+    // PyUnicode_AsUTF8 returns NULL on failure; never hand NULL to std::string.
+    if (const char *p = PyUnicode_AsUTF8(self->file_path)) cfg.file_path = p;
+    if (const char *d = PyUnicode_AsUTF8(self->index_dir)) cfg.index_dir = d;
+    cfg.checkpoint_size = self->checkpoint_size;
+    cfg.auto_build_index = self->auto_build_index != 0;
+    return cfg;
+}
+
+static Runtime *get_runtime(TraceReaderObject *self) {
+    // Use the Runtime attached to this reader, else fall back to the default.
+    if (!self->runtime_obj) return get_default_runtime();
+    auto *wrapper = reinterpret_cast<RuntimeObject *>(self->runtime_obj);
+    return wrapper->runtime.get();
+}
+
+static TraceReaderIteratorObject *make_memoryview_iterator(
+    std::shared_ptr<MemoryViewBatchIteratorState> state) {
+    // New iterator owning `state`; the other shared_ptr slots start empty.
+    auto *iter = reinterpret_cast<TraceReaderIteratorObject *>(
+        TraceReaderIteratorType.tp_alloc(&TraceReaderIteratorType, 0));
+    if (iter == nullptr) return nullptr;
+    iter->mode = IteratorMode::MEMORYVIEW;
+    iter->current_batch = NULL;
+    iter->batch_index = 0;
+    iter->json_dict_index = 0;
+    new (&iter->batch_state)
+        std::shared_ptr<MemoryViewBatchIteratorState>(std::move(state));
+    new (&iter->json_dict_state) std::shared_ptr<JsonDictIteratorState>();
+    new (&iter->json_dict_current_batch) std::shared_ptr<JsonDictBatch>();
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    new (&iter->arrow_state) std::shared_ptr<ArrowIteratorState>();
+#endif
+    return iter;
+}
+
+static TraceReaderIteratorObject *make_json_dict_iterator(
+    std::shared_ptr<JsonDictIteratorState> state) {
+    // New iterator owning `state`; the other shared_ptr slots start empty.
+    auto *iter = reinterpret_cast<TraceReaderIteratorObject *>(
+        TraceReaderIteratorType.tp_alloc(&TraceReaderIteratorType, 0));
+    if (iter == nullptr) return nullptr;
+    iter->mode = IteratorMode::JSON_DICT;
+    iter->current_batch = NULL;
+    iter->batch_index = 0;
+    iter->json_dict_index = 0;
+    new (&iter->batch_state) std::shared_ptr<MemoryViewBatchIteratorState>();
+    new (&iter->json_dict_state)
+        std::shared_ptr<JsonDictIteratorState>(std::move(state));
+    new (&iter->json_dict_current_batch) std::shared_ptr<JsonDictBatch>();
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    new (&iter->arrow_state) std::shared_ptr<ArrowIteratorState>();
+#endif
+    return iter;
+}
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+static TraceReaderIteratorObject *make_arrow_iterator(
+    std::shared_ptr<ArrowIteratorState> state) {
+    // New iterator owning `state`; the other shared_ptr slots start empty.
+    auto *iter = reinterpret_cast<TraceReaderIteratorObject *>(
+        TraceReaderIteratorType.tp_alloc(&TraceReaderIteratorType, 0));
+    if (iter == nullptr) return nullptr;
+    iter->mode = IteratorMode::ARROW;
+    iter->current_batch = NULL;
+    iter->batch_index = 0;
+    iter->json_dict_index = 0;
+    new (&iter->batch_state) std::shared_ptr<MemoryViewBatchIteratorState>();
+    new (&iter->json_dict_state) std::shared_ptr<JsonDictIteratorState>();
+    new (&iter->json_dict_current_batch) std::shared_ptr<JsonDictBatch>();
+    new (&iter->arrow_state)
+        std::shared_ptr<ArrowIteratorState>(std::move(state));
+    return iter;
+}
+#endif
+
+}  // namespace
 
 static void TraceReader_dealloc(TraceReaderObject *self) {
     Py_XDECREF(self->file_path);
@@ -764,8 +2246,6 @@ static PyObject *TraceReader_new(PyTypeObject *type, PyObject *args,
         self->index_dir = NULL;
         self->checkpoint_size = 32 * 1024 * 1024;
         self->auto_build_index = 0;
-        self->index_threshold =
-            dftracer::utils::constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;
         self->has_index = 0;
         self->runtime_obj = NULL;
     }
@@ -774,26 +2254,19 @@ static PyObject *TraceReader_new(PyTypeObject *type, PyObject *args,
 
 static int TraceReader_init(TraceReaderObject *self, PyObject *args,
                             PyObject *kwds) {
-    static const char *kwlist[] = {"file_path",
-                                   "index_dir",
-                                   "checkpoint_size",
-                                   "auto_build_index",
-                                   "index_threshold",
-                                   "runtime",
-                                   NULL};
+    static const char *kwlist[] = {
+        "path",    "index_dir", "checkpoint_size", "auto_build_index",
+        "runtime", NULL};
 
     const char *file_path;
     const char *index_dir = "";
     std::size_t checkpoint_size = 32 * 1024 * 1024;
     int auto_build_index = 0;
-    std::size_t index_threshold =
-        dftracer::utils::constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD;
     PyObject *runtime_arg = NULL;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|snpnO", (char **)kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|snpO", (char **)kwlist,
                                      &file_path, &index_dir, &checkpoint_size,
-                                     &auto_build_index, &index_threshold,
-                                     &runtime_arg)) {
+                                     &auto_build_index, &runtime_arg)) {
         return -1;
     }
 
@@ -828,7 +2301,6 @@ static int TraceReader_init(TraceReaderObject *self, PyObject *args,
 
     self->checkpoint_size = checkpoint_size;
     self->auto_build_index = auto_build_index;
-    self->index_threshold = index_threshold;
 
     try {
         TraceReaderConfig cfg;
@@ -836,7 +2308,6 @@ static int TraceReader_init(TraceReaderObject *self, PyObject *args,
         cfg.index_dir = index_dir;
         cfg.checkpoint_size = checkpoint_size;
         cfg.auto_build_index = auto_build_index != 0;
-        cfg.index_threshold = index_threshold;
         TraceReader probe(std::move(cfg));
         self->has_index = probe.has_index() ? 1 : 0;
     } catch (const std::exception &e) {
@@ -853,17 +2324,18 @@ static int TraceReader_init(TraceReaderObject *self, PyObject *args,
 
 static PyObject *TraceReader_iter_lines(TraceReaderObject *self, PyObject *args,
                                         PyObject *kwds) {
-    static const char *kwlist[] = {"start_line", "end_line",    "start_byte",
-                                   "end_byte",   "buffer_size", "query",
-                                   NULL};
+    static const char *kwlist[] = {"start_line",    "end_line",    "start_byte",
+                                   "end_byte",      "buffer_size", "query",
+                                   "memory_budget", NULL};
     Py_ssize_t start_line = 0, end_line = 0;
     Py_ssize_t start_byte = 0, end_byte = 0;
     Py_ssize_t buffer_size = 4 * 1024 * 1024;
     const char *query_str = NULL;
+    Py_ssize_t memory_budget = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnz", (char **)kwlist,
-                                     &start_line, &end_line, &start_byte,
-                                     &end_byte, &buffer_size, &query_str)) {
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwds, "|nnnnnzn", (char **)kwlist, &start_line, &end_line,
+            &start_byte, &end_byte, &buffer_size, &query_str, &memory_budget)) {
         return NULL;
     }
 
@@ -891,18 +2363,47 @@ static PyObject *TraceReader_iter_lines(TraceReaderObject *self, PyObject *args,
     rc.buffer_size = static_cast<std::size_t>(buffer_size);
     if (query_str) rc.query = query_str;
 
-    auto state = std::make_shared<IteratorState>();
+    auto state = std::make_shared<MemoryViewBatchIteratorState>();
+    state->memory_budget_bytes = dftracer::utils::compute_memory_budget(
+        static_cast<std::size_t>(memory_budget));
 
     Runtime *rt = get_runtime(self);
+    std::size_t max_workers = rt->threads();
+    constexpr std::size_t LINE_BATCH_SIZE = 1024;
+    std::size_t capacity = dftracer::utils::compute_channel_capacity(
+        state->memory_budget_bytes, LINE_BATCH_SIZE * ESTIMATED_BYTES_PER_LINE,
+        max_workers);
+    state->channel =
+        dftracer::utils::coro::make_channel<MemoryViewBatchData>(capacity);
+    auto *sp = state.get();
+
     try {
-        auto handle = rt->submit(produce_lines(state, cfg, rc), "iter_lines");
-        state->task_future = handle.future;
+        bool is_dir = fs::is_directory(cfg.file_path);
+        if (is_dir) {
+            auto handle = rt->scope(
+                "iter_lines_parallel",
+                [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir,
+                 checkpoint_size = cfg.checkpoint_size,
+                 auto_build_index = cfg.auto_build_index, rc,
+                 max_workers](CoroScope &scope) -> CoroTask<void> {
+                    co_await produce_lines_parallel(
+                        scope, sp, dir_path, index_dir, checkpoint_size,
+                        auto_build_index, rc, LINE_BATCH_SIZE, max_workers);
+                });
+            state->task_future = handle.future;
+        } else {
+            auto handle = rt->submit(
+                produce_lines_batched(state, state->channel->producer(), cfg,
+                                      rc, LINE_BATCH_SIZE),
+                "iter_lines");
+            state->task_future = handle.future;
+        }
     } catch (const std::exception &e) {
         PyErr_SetString(PyExc_RuntimeError, e.what());
         return NULL;
     }
 
-    TraceReaderIteratorObject *it = make_iterator(state, IteratorMode::LINES);
+    TraceReaderIteratorObject *it = make_memoryview_iterator(std::move(state));
     return (PyObject *)it;
 }
 
@@ -910,18 +2411,20 @@ static PyObject *TraceReader_iter_raw(TraceReaderObject *self, PyObject *args,
                                       PyObject *kwds) {
     static const char *kwlist[] = {"start_line", "end_line",    "start_byte",
                                    "end_byte",   "buffer_size", "line_aligned",
-                                   "multi_line", "query",       NULL};
+                                   "multi_line", "query",       "memory_budget",
+                                   NULL};
     Py_ssize_t start_line = 0, end_line = 0;
     Py_ssize_t start_byte = 0, end_byte = 0;
     Py_ssize_t buffer_size = 4 * 1024 * 1024;
     int line_aligned = 1;
     int multi_line = 1;
     const char *query_str = NULL;
+    Py_ssize_t memory_budget = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnppz", (char **)kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnppzn", (char **)kwlist,
                                      &start_line, &end_line, &start_byte,
                                      &end_byte, &buffer_size, &line_aligned,
-                                     &multi_line, &query_str)) {
+                                     &multi_line, &query_str, &memory_budget)) {
         return NULL;
     }
 
@@ -951,18 +2454,44 @@ static PyObject *TraceReader_iter_raw(TraceReaderObject *self, PyObject *args,
     rc.multi_line = multi_line != 0;
     if (query_str) rc.query = query_str;
 
-    auto state = std::make_shared<IteratorState>();
+    auto state = std::make_shared<MemoryViewBatchIteratorState>();
+    state->memory_budget_bytes = dftracer::utils::compute_memory_budget(
+        static_cast<std::size_t>(memory_budget));
 
     Runtime *rt = get_runtime(self);
+    std::size_t max_workers = rt->threads();
+    std::size_t capacity = dftracer::utils::compute_channel_capacity(
+        state->memory_budget_bytes, ESTIMATED_BYTES_PER_RAW_CHUNK, max_workers);
+    state->channel =
+        dftracer::utils::coro::make_channel<MemoryViewBatchData>(capacity);
+    auto *sp = state.get();
+
     try {
-        auto handle = rt->submit(produce_raw(state, cfg, rc), "iter_raw");
-        state->task_future = handle.future;
+        bool is_dir = fs::is_directory(cfg.file_path);
+        if (is_dir) {
+            auto handle = rt->scope(
+                "iter_raw_parallel",
+                [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir,
+                 checkpoint_size = cfg.checkpoint_size,
+                 auto_build_index = cfg.auto_build_index, rc,
+                 max_workers](CoroScope &scope) -> CoroTask<void> {
+                    co_await produce_raw_parallel(
+                        scope, sp, dir_path, index_dir, checkpoint_size,
+                        auto_build_index, rc, max_workers);
+                });
+            state->task_future = handle.future;
+        } else {
+            auto handle = rt->submit(
+                produce_raw_batched(state, state->channel->producer(), cfg, rc),
+                "iter_raw");
+            state->task_future = handle.future;
+        }
     } catch (const std::exception &e) {
         PyErr_SetString(PyExc_RuntimeError, e.what());
         return NULL;
     }
 
-    TraceReaderIteratorObject *it = make_iterator(state, IteratorMode::RAW);
+    TraceReaderIteratorObject *it = make_memoryview_iterator(std::move(state));
     return (PyObject *)it;
 }
 
@@ -975,6 +2504,102 @@ static PyObject *TraceReader_read_lines(TraceReaderObject *self, PyObject *args,
     return list;
 }
 
+static PyObject *TraceReader_iter_json(TraceReaderObject *self, PyObject *args,
+                                       PyObject *kwds) {
+    // Parses the keyword arguments, starts a producer task on the runtime and
+    // returns an iterator fed by a JsonDictBatch channel (NULL on error).
+    static const char *kwlist[] = {"start_line", "end_line",      "start_byte",
+                                   "end_byte",   "buffer_size",   "query",
+                                   "batch_size", "memory_budget", NULL};
+    Py_ssize_t start_line = 0, end_line = 0;
+    Py_ssize_t start_byte = 0, end_byte = 0;
+    Py_ssize_t buffer_size = 4 * 1024 * 1024;
+    const char *query_str = NULL;
+    Py_ssize_t batch_size = 1024;
+    Py_ssize_t memory_budget = 0;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnznn", (char **)kwlist,
+                                     &start_line, &end_line, &start_byte,
+                                     &end_byte, &buffer_size, &query_str,
+                                     &batch_size, &memory_budget)) {
+        return NULL;
+    }
+
+    if (start_line < 0 || end_line < 0 || start_byte < 0 || end_byte < 0 ||
+        buffer_size <= 0 || batch_size <= 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "range arguments must be >= 0; buffer_size and "
+                        "batch_size must be > 0");
+        return NULL;
+    }
+
+    TraceReaderConfig cfg;
+    try {
+        cfg = build_config(self);
+    } catch (const std::exception &e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        return NULL;
+    }
+    ReadConfig rc;
+    rc.start_line = static_cast<std::size_t>(start_line);
+    rc.end_line = static_cast<std::size_t>(end_line);
+    rc.start_byte = static_cast<std::size_t>(start_byte);
+    rc.end_byte = static_cast<std::size_t>(end_byte);
+    rc.buffer_size = static_cast<std::size_t>(buffer_size);
+    if (query_str) rc.query = query_str;  // query stays unset when NULL
+    // NOTE(review): memory_budget is not range-checked; negatives wrap below.
+    auto state = std::make_shared<JsonDictIteratorState>();
+    state->memory_budget_bytes = dftracer::utils::compute_memory_budget(
+        static_cast<std::size_t>(memory_budget));
+    // Channel capacity: budget, estimated bytes per batch, runtime threads.
+    Runtime *rt = get_runtime(self);
+    std::size_t max_workers = rt->threads();
+    auto bs = static_cast<std::size_t>(batch_size);
+    std::size_t capacity = dftracer::utils::compute_channel_capacity(
+        state->memory_budget_bytes, bs * ESTIMATED_BYTES_PER_JSON_EVENT,
+        max_workers);
+    state->channel =
+        dftracer::utils::coro::make_channel<JsonDictBatch>(capacity);
+    auto *sp = state.get();  // non-owning; NOTE(review): confirm state outlives task
+    // Directory => parallel scope (uses raw `sp`); file => single producer.
+    try {
+        bool is_dir = fs::is_directory(cfg.file_path);
+        if (is_dir) {
+            auto handle = rt->scope(
+                "iter_json_parallel",
+                [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir,
+                 checkpoint_size = cfg.checkpoint_size,
+                 auto_build_index = cfg.auto_build_index, rc, bs,
+                 max_workers](CoroScope &scope) -> CoroTask<void> {
+                    co_await produce_json_dicts_parallel(
+                        scope, sp, dir_path, index_dir, checkpoint_size,
+                        auto_build_index, rc, bs, max_workers);
+                });
+            state->task_future = handle.future;
+        } else {
+            auto handle =
+                rt->submit(produce_json_dicts(state, state->channel->producer(),
+                                              cfg, rc, bs),
+                           "iter_json");
+            state->task_future = handle.future;
+        }
+    } catch (const std::exception &e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        return NULL;
+    }
+    // The iterator takes over `state`; NULL here means allocation failed.
+    TraceReaderIteratorObject *it = make_json_dict_iterator(std::move(state));
+    return (PyObject *)it;
+}
+
+static PyObject *TraceReader_read_json_py(TraceReaderObject *self,
+                                          PyObject *args, PyObject *kwds) {
+    // Eager form of iter_json: drains the whole iterator into a Python list.
+    PyObject *iter_obj = TraceReader_iter_json(self, args, kwds);
+    PyObject *result = iter_obj ? PySequence_List(iter_obj) : nullptr;
+    Py_XDECREF(iter_obj);
+    return result;
+}
+
 static PyObject *TraceReader_read_raw(TraceReaderObject *self, PyObject *args,
                                       PyObject *kwds) {
     PyObject *iter = TraceReader_iter_raw(self, args, kwds);
@@ -984,22 +2609,34 @@ static PyObject *TraceReader_read_raw(TraceReaderObject *self, PyObject *args,
     return list;
 }
 
-static PyObject *TraceReader_iter_lines_json(TraceReaderObject *self,
-                                             PyObject *args, PyObject *kwds) {
-    static const char *kwlist[] = {"start_line", "end_line",    "start_byte",
-                                   "end_byte",   "buffer_size", "query",
-                                   NULL};
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args,
+                                        PyObject *kwds) {
+    static const char *kwlist[] = {
+        "batch_size", "start_line",    "end_line", "start_byte",
+        "end_byte",   "buffer_size",   "query",    "flatten_objects",
+        "normalize",  "memory_budget", NULL};
+    Py_ssize_t batch_size = 10000;
     Py_ssize_t start_line = 0, end_line = 0;
     Py_ssize_t start_byte = 0, end_byte = 0;
     Py_ssize_t buffer_size = 4 * 1024 * 1024;
     const char *query_str = NULL;
+    int flatten_objects = 1;  // default: expand top-level objects
+    int normalize = 0;
+    Py_ssize_t memory_budget = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|nnnnnz", (char **)kwlist,
-                                     &start_line, &end_line, &start_byte,
-                                     &end_byte, &buffer_size, &query_str)) {
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwds, "|nnnnnnzppn", (char **)kwlist, &batch_size,
+            &start_line, &end_line, &start_byte, &end_byte, &buffer_size,
+            &query_str, &flatten_objects, &normalize, &memory_budget)) {
         return NULL;
     }
 
+    if (batch_size <= 0) {
+        PyErr_SetString(PyExc_ValueError, "batch_size must be > 0");
+        return NULL;
+    }
     if (start_line < 0 || end_line < 0 || start_byte < 0 || end_byte < 0 ||
         buffer_size <= 0) {
         PyErr_SetString(
@@ -1022,66 +2659,105 @@ static PyObject *TraceReader_iter_lines_json(TraceReaderObject *self,
     rc.start_byte = static_cast<std::size_t>(start_byte);
     rc.end_byte = static_cast<std::size_t>(end_byte);
     rc.buffer_size = static_cast<std::size_t>(buffer_size);
+    rc.flatten_objects = flatten_objects != 0;
     if (query_str) rc.query = query_str;
 
-    auto state = std::make_shared<IteratorState>();
+    auto state = std::make_shared<ArrowIteratorState>();
+    state->memory_budget_bytes = dftracer::utils::compute_memory_budget(
+        static_cast<std::size_t>(memory_budget));
 
     Runtime *rt = get_runtime(self);
+    std::size_t max_workers = rt->threads();
+    auto bs = static_cast<std::size_t>(batch_size);
+    std::size_t capacity = dftracer::utils::compute_channel_capacity(
+        state->memory_budget_bytes, bs * ESTIMATED_BYTES_PER_ARROW_ROW,
+        max_workers);
+    state->channel =
+        dftracer::utils::coro::make_channel<ArrowIteratorState::BatchType>(
+            capacity);
+    auto *sp = state.get();
+
     try {
-        auto handle =
-            rt->submit(produce_lines(state, cfg, rc), "iter_lines_json");
-        state->task_future = handle.future;
+        bool is_dir = fs::is_directory(cfg.file_path);
+        if (is_dir) {
+            auto handle = rt->scope(
+                "iter_arrow_parallel",
+                [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir,
+                 checkpoint_size = cfg.checkpoint_size,
+                 auto_build_index = cfg.auto_build_index, rc, bs,
+                 norm = normalize != 0,
+                 max_workers](CoroScope &scope) -> CoroTask<void> {
+                    co_await produce_arrow_batches_parallel(
+                        scope, sp, dir_path, index_dir, checkpoint_size,
+                        auto_build_index, rc, bs, norm, max_workers);
+                });
+            state->task_future = handle.future;
+        } else if (normalize) {
+            auto handle = rt->submit(
+                produce_arrow_batches(state, state->channel->producer(), cfg,
+                                      rc, static_cast<std::size_t>(batch_size),
+                                      flatten_objects != 0, normalize != 0),
+                "iter_arrow");
+            state->task_future = handle.future;
+        } else {
+            std::vector<std::string> files_vec{cfg.file_path};
+            auto handle = rt->scope(
+                "iter_arrow_parallel",
+                [sp, files = std::move(files_vec), index_dir = cfg.index_dir,
+                 checkpoint_size = cfg.checkpoint_size,
+                 auto_build_index = cfg.auto_build_index, rc, bs,
+                 norm = normalize != 0,
+                 max_workers](CoroScope &scope) mutable -> CoroTask<void> {
+                    co_await produce_arrow_batches_for_files(
+                        scope, sp, std::move(files), index_dir, checkpoint_size,
+                        auto_build_index, rc, bs, norm, max_workers);
+                });
+            state->task_future = handle.future;
+        }
     } catch (const std::exception &e) {
         PyErr_SetString(PyExc_RuntimeError, e.what());
         return NULL;
     }
 
-    TraceReaderIteratorObject *it = make_iterator(state, IteratorMode::JSON);
+    TraceReaderIteratorObject *it = make_arrow_iterator(std::move(state));
     return (PyObject *)it;
 }
 
-static PyObject *TraceReader_read_lines_json(TraceReaderObject *self,
-                                             PyObject *args, PyObject *kwds) {
-    PyObject *iter = TraceReader_iter_lines_json(self, args, kwds);
-    if (!iter) return NULL;
-    PyObject *list = PySequence_List(iter);
-    Py_DECREF(iter);
-    return list;
-}
-
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
-
-static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args,
-                                        PyObject *kwds) {
+// Build ArrowIteratorState + spawn the producer task. Same plumbing as
+// TraceReader_iter_arrow but returns the state so callers can wrap it as
+// either a per-batch iterator or an ArrowArrayStream.
+static std::shared_ptr<ArrowIteratorState> spawn_arrow_producer(
+    TraceReaderObject *self, PyObject *args, PyObject *kwds) {
     static const char *kwlist[] = {
-        "batch_size", "start_line",  "end_line", "start_byte",
-        "end_byte",   "buffer_size", "query",    "flatten_objects",
-        "normalize",  NULL};
+        "batch_size", "start_line",    "end_line", "start_byte",
+        "end_byte",   "buffer_size",   "query",    "flatten_objects",
+        "normalize",  "memory_budget", NULL};
     Py_ssize_t batch_size = 10000;
     Py_ssize_t start_line = 0, end_line = 0;
     Py_ssize_t start_byte = 0, end_byte = 0;
     Py_ssize_t buffer_size = 4 * 1024 * 1024;
     const char *query_str = NULL;
-    int flatten_objects = 0;
+    int flatten_objects = 1;  // default: expand top-level objects
     int normalize = 0;
+    Py_ssize_t memory_budget = 0;  // NOTE(review): not range-checked; a negative value wraps in the size_t cast below
 
     if (!PyArg_ParseTupleAndKeywords(
-            args, kwds, "|nnnnnnzpp", (char **)kwlist, &batch_size, &start_line,
-            &end_line, &start_byte, &end_byte, &buffer_size, &query_str,
-            &flatten_objects, &normalize)) {
-        return NULL;
+            args, kwds, "|nnnnnnzppn", (char **)kwlist, &batch_size,
+            &start_line, &end_line, &start_byte, &end_byte, &buffer_size,
+            &query_str, &flatten_objects, &normalize, &memory_budget)) {
+        return nullptr;
     }
 
     if (batch_size <= 0) {
         PyErr_SetString(PyExc_ValueError, "batch_size must be > 0");
-        return NULL;
+        return nullptr;
     }
     if (start_line < 0 || end_line < 0 || start_byte < 0 || end_byte < 0 ||
         buffer_size <= 0) {
         PyErr_SetString(
             PyExc_ValueError,
             "range arguments must be >= 0; buffer_size must be > 0");
-        return NULL;
+        return nullptr;
     }
 
     TraceReaderConfig cfg;
@@ -1089,7 +2765,7 @@ static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args,
         cfg = build_config(self);
     } catch (const std::exception &e) {
         PyErr_SetString(PyExc_RuntimeError, e.what());
-        return NULL;
+        return nullptr;
     }
 
     ReadConfig rc;
@@ -1098,40 +2774,702 @@ static PyObject *TraceReader_iter_arrow(TraceReaderObject *self, PyObject *args,
     rc.start_byte = static_cast<std::size_t>(start_byte);
     rc.end_byte = static_cast<std::size_t>(end_byte);
     rc.buffer_size = static_cast<std::size_t>(buffer_size);
+    rc.flatten_objects = flatten_objects != 0;
     if (query_str) rc.query = query_str;
 
     auto state = std::make_shared<ArrowIteratorState>();
+    state->memory_budget_bytes = dftracer::utils::compute_memory_budget(
+        static_cast<std::size_t>(memory_budget));
 
     Runtime *rt = get_runtime(self);
+    std::size_t max_workers = rt->threads();  // feeds the channel capacity and the parallel producer
+    auto bs = static_cast<std::size_t>(batch_size);
+    std::size_t capacity = dftracer::utils::compute_channel_capacity(
+        state->memory_budget_bytes, bs * ESTIMATED_BYTES_PER_ARROW_ROW,
+        max_workers);
+    state->channel =
+        dftracer::utils::coro::make_channel<ArrowIteratorState::BatchType>(
+            capacity);
+    auto *sp = state.get();  // non-owning pointer captured by the scope lambda below
+
     try {
-        auto handle =
-            rt->submit(produce_arrow_batches(
-                           state, cfg, rc, static_cast<std::size_t>(batch_size),
-                           flatten_objects != 0, normalize != 0),
-                       "iter_arrow");
-        state->task_future = handle.future;
+        bool is_dir = fs::is_directory(cfg.file_path);  // a directory selects the parallel producer
+        if (is_dir) {
+            auto handle = rt->scope(
+                "iter_arrow_parallel",
+                [sp, dir_path = cfg.file_path, index_dir = cfg.index_dir,
+                 checkpoint_size = cfg.checkpoint_size,
+                 auto_build_index = cfg.auto_build_index, rc, bs,
+                 norm = normalize != 0,
+                 max_workers](CoroScope &scope) -> CoroTask<void> {
+                    co_await produce_arrow_batches_parallel(
+                        scope, sp, dir_path, index_dir, checkpoint_size,
+                        auto_build_index, rc, bs, norm, max_workers);
+                });
+            state->task_future = handle.future;
+        } else {
+            auto handle = rt->submit(
+                produce_arrow_batches(state, state->channel->producer(), cfg,
+                                      rc, static_cast<std::size_t>(batch_size),
+                                      flatten_objects != 0, normalize != 0),
+                "iter_arrow");
+            state->task_future = handle.future;
+        }
     } catch (const std::exception &e) {
         PyErr_SetString(PyExc_RuntimeError, e.what());
-        return NULL;
+        return nullptr;
     }
 
-    TraceReaderIteratorObject *it = make_arrow_iterator(std::move(state));
-    return (PyObject *)it;
+    return state;  // every nullptr return above leaves a Python exception set
+}
+
+static PyObject *TraceReader_iter_arrow_stream(TraceReaderObject *self,
+                                               PyObject *args, PyObject *kwds) {
+    auto state = spawn_arrow_producer(self, args, kwds);  // parses args and submits the producer
+    if (!state) return NULL;  // Python exception already set by spawn_arrow_producer
+    return make_arrow_batch_stream(std::move(state));  // hands the state to the stream wrapper
 }
 
 static PyObject *TraceReader_read_arrow(TraceReaderObject *self, PyObject *args,
                                         PyObject *kwds) {
-    PyObject *iter = TraceReader_iter_arrow(self, args, kwds);
-    if (!iter) return NULL;
-    PyObject *list = PySequence_List(iter);
-    Py_DECREF(iter);
-    if (!list) return NULL;
-
-    return wrap_arrow_table(list);
+    auto state = spawn_arrow_producer(self, args, kwds);  // same argument handling as iter_arrow_stream
+    if (!state) return NULL;  // Python exception already set by spawn_arrow_producer
+    PyObject *stream = make_arrow_batch_stream(std::move(state));  // NOTE(review): confirm who releases this reference
+    if (!stream) return NULL;
+    return dftracer::utils::python::wrap_arrow_stream_table(stream);
 }
 
 #endif  // DFTRACER_UTILS_ENABLE_ARROW
 
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+// Appends each str in list `obj` to `out`; returns 0, or -1 with an error set.
+static int parse_str_list_trace(PyObject *obj, std::vector<std::string> &out,
+                                const char *param_name) {
+    if (!obj || obj == Py_None) return 0;  // nothing supplied: `out` untouched
+    if (!PyList_Check(obj)) {
+        PyErr_Format(PyExc_TypeError, "%s must be a list of str", param_name);
+        return -1;
+    }
+    for (Py_ssize_t i = 0, count = PyList_Size(obj); i < count; ++i) {
+        const char *utf8 = PyUnicode_AsUTF8(PyList_GetItem(obj, i));
+        if (!utf8) return -1;  // exception already raised by CPython
+        out.emplace_back(utf8);
+    }
+    return 0;
+}
+
+// Python: write_arrow(path, views=None, chunk_size_mb=32, compression="zstd",
+// batch_size=10000). Runs write_arrow_pipeline on the reader's runtime with
+// the GIL released and returns {"partitions": {name: {"files", "rows",
+// "bytes"}}, "total_rows", "total_bytes", "chunks_scanned", "chunks_skipped"}.
+// Each `views` item is a str ("io", "compute" and "dlio" pick the built-in
+// views; any other name is applied with with_name()) or a dict with a str
+// "name" and optional "query" / "include_metadata". Returns NULL with
+// ValueError, TypeError or RuntimeError set on failure.
+static PyObject *TraceReader_write_arrow(TraceReaderObject *self,
+                                         PyObject *args, PyObject *kwds) {
+    static const char *kwlist[] = {"path",        "views",      "chunk_size_mb",
+                                   "compression", "batch_size", NULL};
+    const char *path = NULL;
+    PyObject *views_obj = Py_None;
+    int chunk_size_mb = 32;
+    const char *compression_str = "zstd";
+    Py_ssize_t batch_size = 10000;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|Oisn", (char **)kwlist,
+                                     &path, &views_obj, &chunk_size_mb,
+                                     &compression_str, &batch_size)) {
+        return NULL;
+    }
+
+    if (chunk_size_mb < 0) {
+        PyErr_SetString(PyExc_ValueError, "chunk_size_mb must be >= 0");
+        return NULL;
+    }
+    if (batch_size < 0) {
+        // A negative value would wrap to a huge std::size_t in the cast below.
+        PyErr_SetString(PyExc_ValueError, "batch_size must be >= 0");
+        return NULL;
+    }
+
+    std::vector<ViewDefinition> views;
+    if (views_obj && views_obj != Py_None) {
+        if (!PyList_Check(views_obj)) {
+            PyErr_SetString(PyExc_TypeError, "views must be a list or None");
+            return NULL;
+        }
+        Py_ssize_t n = PyList_Size(views_obj);
+        for (Py_ssize_t i = 0; i < n; i++) {
+            PyObject *item = PyList_GetItem(views_obj, i);  // borrowed
+            ViewDefinition vd;
+
+            if (PyUnicode_Check(item)) {
+                const char *name = PyUnicode_AsUTF8(item);
+                if (!name) return NULL;
+                std::string name_str(name);
+                if (name_str == "io") {
+                    vd = ViewDefinition::io_view();
+                } else if (name_str == "compute") {
+                    vd = ViewDefinition::compute_view();
+                } else if (name_str == "dlio") {
+                    vd = ViewDefinition::dlio_view();
+                } else {
+                    vd.with_name(name_str);
+                }
+            } else if (PyDict_Check(item)) {
+                PyObject *name_obj = PyDict_GetItemString(item, "name");
+                if (!name_obj || !PyUnicode_Check(name_obj)) {
+                    PyErr_SetString(PyExc_ValueError,
+                                    "view dict must have 'name' string");
+                    return NULL;
+                }
+                // PyUnicode_AsUTF8 returns NULL (exception set) on failure.
+                const char *name = PyUnicode_AsUTF8(name_obj);
+                if (!name) return NULL;
+                vd.with_name(name);
+
+                PyObject *query_obj = PyDict_GetItemString(item, "query");
+                if (query_obj && query_obj != Py_None) {
+                    if (!PyUnicode_Check(query_obj)) {
+                        PyErr_SetString(PyExc_ValueError,
+                                        "view 'query' must be a string");
+                        return NULL;
+                    }
+                    const char *query = PyUnicode_AsUTF8(query_obj);
+                    if (!query) return NULL;
+                    vd.with_query(query);
+                }
+
+                PyObject *meta_obj =
+                    PyDict_GetItemString(item, "include_metadata");
+                if (meta_obj && meta_obj != Py_None) {
+                    const int truthy = PyObject_IsTrue(meta_obj);
+                    if (truthy < 0) return NULL;  // truth test raised
+                    vd.with_include_metadata(truthy != 0);
+                }
+            } else {
+                PyErr_SetString(PyExc_TypeError,
+                                "views list must contain strings or dicts");
+                return NULL;
+            }
+            views.push_back(std::move(vd));
+        }
+    }
+
+    std::string comp_lower(compression_str);
+    for (auto &c : comp_lower) {  // std::tolower needs an unsigned char value
+        c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+    }
+    IpcCompression compression = IpcCompression::ZSTD;
+    if (comp_lower == "none") {
+        compression = IpcCompression::NONE;
+    } else if (comp_lower == "zstd") {
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+        compression = IpcCompression::ZSTD;
+#else
+        PyErr_SetString(PyExc_ValueError,
+                        "ZSTD compression not available (built without ZSTD)");
+        return NULL;
+#endif
+    } else {
+        PyErr_Format(PyExc_ValueError,
+                     "Unknown compression: %s (use 'none' or 'zstd')",
+                     compression_str);
+        return NULL;
+    }
+
+    int64_t chunk_size_bytes =
+        static_cast<int64_t>(chunk_size_mb) * 1024 * 1024;
+
+    // Copy what the pipeline needs into C++ values while the GIL is held.
+    const char *file_path_utf8 = PyUnicode_AsUTF8(self->file_path);
+    if (!file_path_utf8) return NULL;
+    const char *idx = PyUnicode_AsUTF8(self->index_dir);
+    if (!idx) return NULL;
+    std::string file_path(file_path_utf8);
+    std::string index_path(idx);
+    std::size_t checkpoint_size = self->checkpoint_size;
+
+    std::string output_path(path);
+    WriteArrowResult result;
+    std::string error_msg;
+
+    Py_BEGIN_ALLOW_THREADS try {
+        Runtime *rt = get_runtime(self);
+        result =
+            rt->submit(write_arrow_pipeline(
+                           file_path, index_path, checkpoint_size,
+                           std::move(views), output_path, chunk_size_bytes,
+                           compression, static_cast<std::size_t>(batch_size)),
+                       "write_arrow")
+                .get();
+    } catch (const std::exception &e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+        if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return NULL;
+    }
+
+    if (!result.error.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, result.error.c_str());
+        return NULL;
+    }
+
+    // Map each partition name to its {"files", "rows", "bytes"} summary.
+    PyObject *partitions = PyDict_New();
+    if (!partitions) return NULL;
+
+    for (const auto &[partition_name, partition_stats] :
+         result.stats.partitions) {
+        PyObject *files_list = PyList_New(0);
+        if (!files_list) {
+            Py_DECREF(partitions);
+            return NULL;
+        }
+        for (const auto &f : partition_stats.files) {
+            PyObject *file_str = PyUnicode_FromString(f.c_str());
+            const int appended =
+                file_str ? PyList_Append(files_list, file_str) : -1;
+            Py_XDECREF(file_str);  // the list keeps its own reference
+            if (appended < 0) {
+                Py_DECREF(files_list);
+                Py_DECREF(partitions);
+                return NULL;
+            }
+        }
+
+        // Py_BuildValue creates the ints itself and "O" takes its own reference;
+        // PyDict_SetItemString does not steal, so fresh objects must not go to it.
+        PyObject *entry = Py_BuildValue(
+            "{s:O,s:L,s:L}", "files", files_list, "rows",
+            static_cast<long long>(partition_stats.total_rows), "bytes",
+            static_cast<long long>(partition_stats.total_uncompressed_bytes));
+        Py_DECREF(files_list);
+        const char *key =
+            partition_name.empty() ? "_default" : partition_name.c_str();
+        const int stored =
+            entry ? PyDict_SetItemString(partitions, key, entry) : -1;
+        Py_XDECREF(entry);
+        if (stored < 0) {
+            Py_DECREF(partitions);
+            return NULL;
+        }
+    }
+
+    const auto scanned = static_cast<unsigned long long>(result.chunks_scanned);
+    const auto skipped = static_cast<unsigned long long>(result.chunks_skipped);
+    PyObject *dict = Py_BuildValue(
+        "{s:O,s:L,s:L,s:K,s:K}", "partitions", partitions, "total_rows",
+        static_cast<long long>(result.stats.total_rows), "total_bytes",
+        static_cast<long long>(result.stats.total_uncompressed_bytes),
+        "chunks_scanned", scanned, "chunks_skipped", skipped);
+    Py_DECREF(partitions);
+    return dict;
+}
+
+// Python: get_view_chunks(view=None). Runs get_view_chunks_pipeline on the
+// reader's runtime with the GIL released and returns {"chunks":
+// [{"checkpoint_idx", "start_byte", "end_byte"}, ...], "total_checkpoints",
+// "skipped_checkpoints", "file_may_match"}. `view` is a str ("io", "compute"
+// and "dlio" pick the built-in views; any other name is applied with
+// with_name()) or a dict whose str "name" / "query" entries are applied when
+// present. Returns NULL with TypeError or RuntimeError set on failure.
+static PyObject *TraceReader_get_view_chunks(TraceReaderObject *self,
+                                             PyObject *args, PyObject *kwds) {
+    static const char *kwlist[] = {"view", NULL};
+    PyObject *view_obj = Py_None;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", (char **)kwlist,
+                                     &view_obj)) {
+        return NULL;
+    }
+
+    ViewDefinition view;
+    if (view_obj && view_obj != Py_None) {
+        if (PyUnicode_Check(view_obj)) {
+            const char *name = PyUnicode_AsUTF8(view_obj);
+            if (!name) return NULL;
+            std::string name_str(name);
+            if (name_str == "io") {
+                view = ViewDefinition::io_view();
+            } else if (name_str == "compute") {
+                view = ViewDefinition::compute_view();
+            } else if (name_str == "dlio") {
+                view = ViewDefinition::dlio_view();
+            } else {
+                view.with_name(name_str);
+            }
+        } else if (PyDict_Check(view_obj)) {
+            PyObject *name_obj = PyDict_GetItemString(view_obj, "name");
+            if (name_obj && PyUnicode_Check(name_obj)) {
+                const char *name = PyUnicode_AsUTF8(name_obj);
+                if (!name) return NULL;  // encoding error already set
+                view.with_name(name);
+            }
+            PyObject *query_obj = PyDict_GetItemString(view_obj, "query");
+            if (query_obj && PyUnicode_Check(query_obj)) {
+                const char *query = PyUnicode_AsUTF8(query_obj);
+                if (!query) return NULL;
+                view.with_query(query);
+            }
+        } else {
+            PyErr_SetString(PyExc_TypeError, "view must be a string or dict");
+            return NULL;
+        }
+    }
+
+    // Copy what the pipeline needs into C++ values while the GIL is held.
+    const char *file_path_utf8 = PyUnicode_AsUTF8(self->file_path);
+    if (!file_path_utf8) return NULL;
+    const char *idx = PyUnicode_AsUTF8(self->index_dir);
+    if (!idx) return NULL;
+    std::string file_path(file_path_utf8);
+    std::string index_path(idx);
+    std::size_t checkpoint_size = self->checkpoint_size;
+
+    GetViewChunksResult result;
+    std::string error_msg;
+
+    Py_BEGIN_ALLOW_THREADS try {
+        Runtime *rt = get_runtime(self);
+        result = rt->submit(get_view_chunks_pipeline(file_path, index_path,
+                                                     checkpoint_size, view),
+                            "get_view_chunks")
+                     .get();
+    } catch (const std::exception &e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+        if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return NULL;
+    }
+
+    if (!result.error.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, result.error.c_str());
+        return NULL;
+    }
+
+    const Py_ssize_t count = static_cast<Py_ssize_t>(result.chunks.size());
+    PyObject *chunks_list = PyList_New(count);
+    if (!chunks_list) return NULL;
+    for (Py_ssize_t i = 0; i < count; ++i) {
+        const auto &chunk = result.chunks[static_cast<std::size_t>(i)];
+        // Py_BuildValue creates the ints itself, so no temporary is leaked.
+        PyObject *chunk_dict = Py_BuildValue(
+            "{s:K,s:K,s:K}", "checkpoint_idx",
+            static_cast<unsigned long long>(chunk.checkpoint_idx), "start_byte",
+            static_cast<unsigned long long>(chunk.start_byte), "end_byte",
+            static_cast<unsigned long long>(chunk.end_byte));
+        if (!chunk_dict) {
+            Py_DECREF(chunks_list);
+            return NULL;
+        }
+        PyList_SetItem(chunks_list, i, chunk_dict);  // steals chunk_dict
+    }
+
+    PyObject *dict = Py_BuildValue(
+        "{s:O,s:K,s:K,s:O}", "chunks", chunks_list, "total_checkpoints",
+        static_cast<unsigned long long>(result.total_checkpoints),
+        "skipped_checkpoints",
+        static_cast<unsigned long long>(result.skipped_checkpoints),
+        "file_may_match", result.file_may_match ? Py_True : Py_False);
+    Py_DECREF(chunks_list);
+    return dict;
+}
+
+// Python: write_view_chunk(output_file, checkpoint_idx, start_byte, end_byte,
+// view=None, compression="zstd", batch_size=10000) -> dict of write counters.
+static PyObject *TraceReader_write_view_chunk(TraceReaderObject *self,
+                                              PyObject *args, PyObject *kwds) {
+    static const char *kwlist[] = {
+        "output_file", "checkpoint_idx", "start_byte", "end_byte",
+        "view",        "compression",    "batch_size", NULL};
+    const char *output_file = NULL;
+    unsigned long long checkpoint_idx = 0;
+    Py_ssize_t start_byte = 0;
+    Py_ssize_t end_byte = 0;
+    PyObject *view_obj = Py_None;
+    const char *compression_str = "zstd";
+    Py_ssize_t batch_size = 10000;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "sKnn|Osn", (char **)kwlist,
+                                     &output_file, &checkpoint_idx, &start_byte,
+                                     &end_byte, &view_obj, &compression_str,
+                                     &batch_size)) {
+        return NULL;
+    }
+    if (start_byte < 0 || end_byte < 0 || batch_size < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "start_byte, end_byte and batch_size must be >= 0");
+        return NULL;
+    }
+
+    std::string comp_lower(compression_str);
+    for (auto &c : comp_lower)  // cast first: tolower(negative char) is UB
+        c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+    IpcCompression compression = IpcCompression::ZSTD;
+    if (comp_lower == "none") {
+        compression = IpcCompression::NONE;
+    } else if (comp_lower != "zstd") {  // was silently treated as ZSTD
+        PyErr_Format(PyExc_ValueError,
+                     "Unknown compression: %s (use 'none' or 'zstd')",
+                     compression_str);
+        return NULL;
+    }
+#ifndef DFTRACER_UTILS_ENABLE_ZSTD
+    if (compression == IpcCompression::ZSTD) {
+        PyErr_SetString(PyExc_ValueError, "ZSTD compression not available");
+        return NULL;
+    }
+#endif
+
+    ViewDefinition view;
+    if (view_obj && view_obj != Py_None) {
+        if (PyUnicode_Check(view_obj)) {
+            const char *name = PyUnicode_AsUTF8(view_obj);
+            if (!name) return NULL;
+            std::string name_str(name);
+            if (name_str == "io") {
+                view = ViewDefinition::io_view();
+            } else if (name_str == "compute") {
+                view = ViewDefinition::compute_view();
+            } else if (name_str == "dlio") {
+                view = ViewDefinition::dlio_view();
+            } else {
+                view.with_name(name_str);
+            }
+        } else if (PyDict_Check(view_obj)) {
+            PyObject *name_obj = PyDict_GetItemString(view_obj, "name");
+            if (name_obj && PyUnicode_Check(name_obj)) {
+                const char *name = PyUnicode_AsUTF8(name_obj);
+                if (!name) return NULL;  // encoding error already set
+                view.with_name(name);
+            }
+            PyObject *query_obj = PyDict_GetItemString(view_obj, "query");
+            if (query_obj && PyUnicode_Check(query_obj)) {
+                const char *query = PyUnicode_AsUTF8(query_obj);
+                if (!query) return NULL;
+                view.with_query(query);
+            }
+        }
+    }
+
+    const char *file_path_utf8 = PyUnicode_AsUTF8(self->file_path);
+    if (!file_path_utf8) return NULL;
+    const char *idx = PyUnicode_AsUTF8(self->index_dir);
+    if (!idx) return NULL;
+    std::string file_path(file_path_utf8);
+    std::string index_path(idx);
+    std::size_t checkpoint_size = self->checkpoint_size;
+    WriteViewChunkResult result;
+    std::string error_msg;
+    Py_BEGIN_ALLOW_THREADS try {
+        Runtime *rt = get_runtime(self);
+        result =
+            rt->submit(write_view_chunk_pipeline(
+                           file_path, index_path, checkpoint_size, view,
+                           checkpoint_idx, static_cast<std::size_t>(start_byte),
+                           static_cast<std::size_t>(end_byte),
+                           std::string(output_file), compression,
+                           static_cast<std::size_t>(batch_size)),
+                       "write_view_chunk")
+                .get();
+    } catch (const std::exception &e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+        if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return NULL;
+    }
+    if (!result.error.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, result.error.c_str());
+        return NULL;
+    }
+    return Py_BuildValue(  // builds its own values: no leaked temporaries
+        "{s:s,s:K,s:K,s:L,s:L}", "output_file", result.output_file.c_str(),
+        "events_matched", (unsigned long long)result.events_matched,
+        "events_scanned", (unsigned long long)result.events_scanned,
+        "rows_written", (long long)result.rows_written, "bytes_written",
+        (long long)result.bytes_written);
+}
+
+// Python: write_view_chunks(chunks, output_dir, view=None, compression="zstd",
+// batch_size=10000). `chunks` is a list of dicts with int "checkpoint_idx",
+// "start_byte", "end_byte"; each gets the output file "<output_dir>/chunk-
+// <checkpoint_idx, zero-padded to 5 digits>.arrow". Returns a dict or NULL.
+static PyObject *TraceReader_write_view_chunks(TraceReaderObject *self,
+                                               PyObject *args, PyObject *kwds) {
+    static const char *kwlist[] = {"chunks",      "output_dir", "view",
+                                   "compression", "batch_size", NULL};
+    PyObject *chunks_list = NULL;
+    const char *output_dir = NULL;
+    PyObject *view_obj = Py_None;
+    const char *compression_str = "zstd";
+    Py_ssize_t batch_size = 10000;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "Os|Osn", (char **)kwlist,
+                                     &chunks_list, &output_dir, &view_obj,
+                                     &compression_str, &batch_size)) {
+        return NULL;
+    }
+
+    if (!PyList_Check(chunks_list)) {
+        PyErr_SetString(PyExc_TypeError, "chunks must be a list");
+        return NULL;
+    }
+    if (batch_size < 0) {  // would wrap to a huge std::size_t when cast below
+        PyErr_SetString(PyExc_ValueError, "batch_size must be >= 0");
+        return NULL;
+    }
+
+    IpcCompression compression = IpcCompression::ZSTD;
+    if (strcmp(compression_str, "none") == 0) {
+        compression = IpcCompression::NONE;
+    } else if (strcmp(compression_str, "zstd") != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "compression must be 'zstd' or 'none'");
+        return NULL;
+    }
+
+    ViewDefinition view;
+    if (view_obj && view_obj != Py_None) {
+        if (PyUnicode_Check(view_obj)) {
+            const char *name = PyUnicode_AsUTF8(view_obj);
+            if (!name) return NULL;
+            std::string name_str(name);
+            if (name_str == "io") {
+                view = ViewDefinition::io_view();
+            } else if (name_str == "compute") {
+                view = ViewDefinition::compute_view();
+            } else if (name_str == "dlio") {
+                view = ViewDefinition::dlio_view();
+            } else {
+                view.with_name(name_str);
+            }
+        } else if (PyDict_Check(view_obj)) {
+            PyObject *name_obj = PyDict_GetItemString(view_obj, "name");
+            if (name_obj && PyUnicode_Check(name_obj)) {
+                const char *name = PyUnicode_AsUTF8(name_obj);
+                if (!name) return NULL;  // encoding error already set
+                view.with_name(name);
+            }
+            PyObject *query_obj = PyDict_GetItemString(view_obj, "query");
+            if (query_obj && PyUnicode_Check(query_obj)) {
+                const char *query = PyUnicode_AsUTF8(query_obj);
+                if (!query) return NULL;
+                view.with_query(query);
+            }
+        }
+    }
+
+    std::vector<ChunkDescriptor> chunks;
+    Py_ssize_t num_chunks = PyList_Size(chunks_list);
+    chunks.reserve(static_cast<std::size_t>(num_chunks));
+
+    for (Py_ssize_t i = 0; i < num_chunks; i++) {
+        PyObject *chunk_dict = PyList_GetItem(chunks_list, i);
+        if (!PyDict_Check(chunk_dict)) {
+            PyErr_SetString(PyExc_TypeError, "each chunk must be a dict");
+            return NULL;
+        }
+
+        PyObject *cp_idx = PyDict_GetItemString(chunk_dict, "checkpoint_idx");
+        PyObject *start = PyDict_GetItemString(chunk_dict, "start_byte");
+        PyObject *end = PyDict_GetItemString(chunk_dict, "end_byte");
+        if (!cp_idx || !start || !end) {
+            PyErr_SetString(
+                PyExc_KeyError,
+                "chunk must have checkpoint_idx, start_byte, end_byte");
+            return NULL;
+        }
+
+        // A failed conversion sets an exception; stop instead of keeping -1.
+        ChunkDescriptor desc;
+        desc.checkpoint_idx =
+            static_cast<std::uint64_t>(PyLong_AsUnsignedLongLong(cp_idx));
+        if (PyErr_Occurred()) return NULL;
+        desc.start_byte =
+            static_cast<std::size_t>(PyLong_AsUnsignedLongLong(start));
+        if (PyErr_Occurred()) return NULL;
+        desc.end_byte =
+            static_cast<std::size_t>(PyLong_AsUnsignedLongLong(end));
+        if (PyErr_Occurred()) return NULL;
+        char filename[64];
+        snprintf(filename, sizeof(filename), "chunk-%05llu.arrow",
+                 (unsigned long long)desc.checkpoint_idx);
+        desc.output_file = std::string(output_dir) + "/" + filename;
+        chunks.push_back(std::move(desc));
+    }
+
+    const char *file_path_utf8 = PyUnicode_AsUTF8(self->file_path);
+    if (!file_path_utf8) return NULL;
+    const char *idx = PyUnicode_AsUTF8(self->index_dir);
+    if (!idx) return NULL;
+    std::string file_path(file_path_utf8);
+    std::string index_path(idx);
+    std::size_t checkpoint_size = self->checkpoint_size;
+    WriteViewChunksResult result;
+    std::string error_msg;
+
+    Py_BEGIN_ALLOW_THREADS try {
+        Runtime *rt = get_runtime(self);
+        result = rt->submit(write_view_chunks_pipeline(
+                                file_path, index_path, checkpoint_size, view,
+                                std::move(chunks), compression,
+                                static_cast<std::size_t>(batch_size)),
+                            "write_view_chunks")
+                     .get();
+    } catch (const std::exception &e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+        if (!error_msg.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
+        return NULL;
+    }
+
+    const Py_ssize_t count = static_cast<Py_ssize_t>(result.results.size());
+    PyObject *results_list = PyList_New(count);
+    if (!results_list) return NULL;
+    for (Py_ssize_t i = 0; i < count; i++) {
+        const auto &r = result.results[static_cast<std::size_t>(i)];
+        PyObject *item = Py_BuildValue(  // builds its own values: no leaks
+            "{s:s,s:L,s:K}", "output_file", r.output_file.c_str(),
+            "rows_written", static_cast<long long>(r.rows_written),
+            "events_matched",
+            static_cast<unsigned long long>(r.events_matched));
+        if (item && !r.error.empty()) {
+            PyObject *err = PyUnicode_FromString(r.error.c_str());
+            const int rc = err ? PyDict_SetItemString(item, "error", err) : -1;
+            Py_XDECREF(err);
+            if (rc < 0) Py_CLEAR(item);
+        }
+        if (!item) {
+            Py_DECREF(results_list);
+            return NULL;
+        }
+        PyList_SetItem(results_list, i, item);  // steals item
+    }
+
+    PyObject *dict = Py_BuildValue(
+        "{s:O,s:L,s:L}", "results", results_list, "total_rows",
+        static_cast<long long>(result.total_rows), "total_events_matched",
+        static_cast<long long>(result.total_events_matched));
+    Py_DECREF(results_list);
+    return dict;
+}
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+
 static PyObject *TraceReader_enter(TraceReaderObject *self,
                                    PyObject *Py_UNUSED(ignored)) {
     Py_INCREF(self);
@@ -1235,9 +3573,12 @@ static PyMethodDef TraceReader_methods[] = {
      "    start_byte (int): First byte offset (0 = beginning).\n"
      "    end_byte (int): Last byte offset (0 = end of file).\n"
      "    buffer_size (int): Internal read buffer size in bytes.\n"},
-    {"read_raw", (PyCFunction)TraceReader_read_raw,
+    {"iter_json", (PyCFunction)TraceReader_iter_json,
      METH_VARARGS | METH_KEYWORDS,
-     "Read all raw chunks and return as list.\n"
+     "Return an iterator over parsed JSON events as Python dicts.\n"
+     "\n"
+     "Each event is parsed once in C++ (single-pass simdjson ondemand)\n"
+     "and yielded as a Python dict. No double-parsing overhead.\n"
      "\n"
      "Args:\n"
      "    start_line (int): First line (0 = beginning).\n"
@@ -1245,28 +3586,25 @@ static PyMethodDef TraceReader_methods[] = {
      "    start_byte (int): First byte offset (0 = beginning).\n"
      "    end_byte (int): Last byte offset (0 = end of file).\n"
      "    buffer_size (int): Internal read buffer size in bytes.\n"
-     "    line_aligned (bool): Align chunks to line boundaries.\n"
-     "    multi_line (bool): Allow multiple lines per chunk.\n"},
-    {"iter_lines_json", (PyCFunction)TraceReader_iter_lines_json,
+     "    query (str): Optional query filter.\n"
+     "    batch_size (int): Events per internal batch (default 1024).\n"},
+    {"read_json", (PyCFunction)TraceReader_read_json_py,
      METH_VARARGS | METH_KEYWORDS,
-     "Return an iterator over parsed JSON objects.\n"
+     "Read all events as parsed Python dicts (list).\n"
      "\n"
-     "Args:\n"
-     "    start_line (int): First line (0 = beginning).\n"
-     "    end_line (int): Last line (0 = end of file).\n"
-     "    start_byte (int): First byte offset (0 = beginning).\n"
-     "    end_byte (int): Last byte offset (0 = end of file).\n"
-     "    buffer_size (int): Internal read buffer size in bytes.\n"},
-    {"read_lines_json", (PyCFunction)TraceReader_read_lines_json,
+     "Equivalent to list(iter_json(...)).\n"},
+    {"read_raw", (PyCFunction)TraceReader_read_raw,
      METH_VARARGS | METH_KEYWORDS,
-     "Read all lines as parsed JSON objects.\n"
+     "Read all raw chunks and return as list.\n"
      "\n"
      "Args:\n"
      "    start_line (int): First line (0 = beginning).\n"
      "    end_line (int): Last line (0 = end of file).\n"
      "    start_byte (int): First byte offset (0 = beginning).\n"
      "    end_byte (int): Last byte offset (0 = end of file).\n"
-     "    buffer_size (int): Internal read buffer size in bytes.\n"},
+     "    buffer_size (int): Internal read buffer size in bytes.\n"
+     "    line_aligned (bool): Align chunks to line boundaries.\n"
+     "    multi_line (bool): Allow multiple lines per chunk.\n"},
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
     {"iter_arrow", (PyCFunction)TraceReader_iter_arrow,
      METH_VARARGS | METH_KEYWORDS,
@@ -1279,6 +3617,12 @@ static PyMethodDef TraceReader_methods[] = {
      "    start_byte (int): First byte offset (0 = beginning).\n"
      "    end_byte (int): Last byte offset (0 = end of file).\n"
      "    buffer_size (int): Internal read buffer size in bytes.\n"},
+    {"iter_arrow_stream", (PyCFunction)TraceReader_iter_arrow_stream,
+     METH_VARARGS | METH_KEYWORDS,
+     "Return an _ArrowBatchStream that exposes Arrow record batches\n"
+     "via the Arrow C Data Interface stream protocol\n"
+     "(__arrow_c_stream__). PyArrow can drain the producer channel\n"
+     "with a single call, without per-batch Python iteration.\n"},
     {"read_arrow", (PyCFunction)TraceReader_read_arrow,
      METH_VARARGS | METH_KEYWORDS,
      "Read all events as a materialized ArrowTable.\n"
@@ -1290,6 +3634,64 @@ static PyMethodDef TraceReader_methods[] = {
      "    start_byte (int): First byte offset (0 = beginning).\n"
      "    end_byte (int): Last byte offset (0 = end of file).\n"
      "    buffer_size (int): Internal read buffer size in bytes.\n"},
+#endif
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+    {"write_arrow", (PyCFunction)TraceReader_write_arrow,
+     METH_VARARGS | METH_KEYWORDS,
+     "Write trace data to partitioned Arrow IPC files.\n"
+     "\n"
+     "Args:\n"
+     "    path (str): Output directory path.\n"
+     "    partition_by (list[str] or None): Column names to partition by.\n"
+     "    num_buckets (int): Number of hash buckets (0 = no bucketing).\n"
+     "    chunk_size_mb (int): Max uncompressed MB per file (default 32).\n"
+     "    compression (str): 'zstd' or 'none' (default 'zstd').\n"
+     "    batch_size (int): Rows per internal batch (default 10000).\n"
+     "    normalize (bool): Use normalized schema (default False).\n"
+     "\n"
+     "Returns:\n"
+     "    dict: Statistics including partitions, total_rows, total_bytes.\n"},
+    {"get_view_chunks", (PyCFunction)TraceReader_get_view_chunks,
+     METH_VARARGS | METH_KEYWORDS,
+     "Get candidate chunks for a view after bloom filter pruning.\n"
+     "\n"
+     "Args:\n"
+     "    view (str or dict): View name ('io', 'compute', 'dlio') or\n"
+     "                        dict with 'name' and optional 'query'.\n"
+     "\n"
+     "Returns:\n"
+     "    dict: chunks list, total_checkpoints, skipped_checkpoints.\n"},
+    {"write_view_chunk", (PyCFunction)TraceReader_write_view_chunk,
+     METH_VARARGS | METH_KEYWORDS,
+     "Write a single chunk to an Arrow IPC file.\n"
+     "\n"
+     "Args:\n"
+     "    output_file (str): Path to output Arrow IPC file.\n"
+     "    checkpoint_idx (int): Checkpoint index.\n"
+     "    start_byte (int): Start byte offset.\n"
+     "    end_byte (int): End byte offset.\n"
+     "    view (str or dict): View definition.\n"
+     "    compression (str): 'zstd' or 'none' (default 'zstd').\n"
+     "    batch_size (int): Events per batch (default 10000).\n"
+     "\n"
+     "Returns:\n"
+     "    dict: output_file, events_matched, rows_written, bytes_written.\n"},
+    {"write_view_chunks", (PyCFunction)TraceReader_write_view_chunks,
+     METH_VARARGS | METH_KEYWORDS,
+     "Write multiple chunks to Arrow IPC files in parallel.\n"
+     "\n"
+     "All chunks are processed concurrently on the Runtime thread pool.\n"
+     "\n"
+     "Args:\n"
+     "    chunks (list): List of dicts with checkpoint_idx, start_byte, "
+     "end_byte.\n"
+     "    output_dir (str): Directory for output Arrow IPC files.\n"
+     "    view (str or dict): View definition.\n"
+     "    compression (str): 'zstd' or 'none' (default 'zstd').\n"
+     "    batch_size (int): Events per batch (default 10000).\n"
+     "\n"
+     "Returns:\n"
+     "    dict: results list, total_rows, total_events_matched.\n"},
 #endif
     {"get_max_bytes", (PyCFunction)TraceReader_get_max_bytes, METH_NOARGS,
      "Get the maximum byte position (0 if unknown for compressed\n"
@@ -1307,8 +3709,8 @@ static PyMethodDef TraceReader_methods[] = {
     {NULL}};
 
 static PyGetSetDef TraceReader_getsetters[] = {
-    {"file_path", (getter)TraceReader_get_file_path, NULL,
-     "Path to the trace file", NULL},
+    {"path", (getter)TraceReader_get_file_path, NULL,
+     "Path to the trace file or directory", NULL},
     {"index_dir", (getter)TraceReader_get_index_dir, NULL,
      "Directory for index files", NULL},
     {"has_index", (getter)TraceReader_get_has_index, NULL,
@@ -1340,7 +3742,6 @@ PyTypeObject TraceReaderType = {
     "TraceReader(file_path: str, index_dir: str = '',\n"
     "            checkpoint_size: int = 33554432,\n"
     "            auto_build_index: bool = False,\n"
-    "            index_threshold: int = 1048576,\n"
     "            runtime: Runtime | None = None)\n"
     "--\n"
     "\n"
@@ -1357,9 +3758,7 @@ PyTypeObject TraceReaderType = {
     "        building (default 32 MB).\n"
     "    auto_build_index (bool): If True, automatically build an "
     "index\n"
-    "        when none exists and the file exceeds *index_threshold*.\n"
-    "    index_threshold (int): Minimum file size in bytes before\n"
-    "        auto-indexing is triggered (default 8 MB).\n"
+    "        when none exists.\n"
     "    runtime (Runtime or None): Runtime instance for thread pool "
     "control.\n"
     "        If None, uses the default global Runtime.\n"
diff --git a/src/dftracer/utils/python/trace_reader.h b/src/dftracer/utils/python/trace_reader.h
index f1dcddcb..ca2d3fbb 100644
--- a/src/dftracer/utils/python/trace_reader.h
+++ b/src/dftracer/utils/python/trace_reader.h
@@ -10,7 +10,6 @@ typedef struct {
     PyObject *index_dir;
     std::size_t checkpoint_size;
     int auto_build_index;
-    std::size_t index_threshold;
     int has_index;
     PyObject *runtime_obj;  // RuntimeObject* or NULL (uses default)
 } TraceReaderObject;
diff --git a/src/dftracer/utils/python/trace_reader_iterator.cpp b/src/dftracer/utils/python/trace_reader_iterator.cpp
index 87bf54a5..36e3fba9 100644
--- a/src/dftracer/utils/python/trace_reader_iterator.cpp
+++ b/src/dftracer/utils/python/trace_reader_iterator.cpp
@@ -1,9 +1,9 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
-#include <dftracer/utils/core/utils/string.h>
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/python/batch_byte_size.h>
 #include <dftracer/utils/python/json.h>
 #include <dftracer/utils/python/trace_reader_iterator.h>
-
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 #include <nanoarrow/nanoarrow.h>
 
@@ -135,41 +135,41 @@ PyTypeObject ArrowBatchCapsuleType = {
 
 #endif                                     // DFTRACER_UTILS_ENABLE_ARROW
 
+// Flags cancellation, closes the channel (if set) and waits on task_future.
+static void cancel_and_wait_batch_state(MemoryViewBatchIteratorState *bs) {
+    bs->cancelled.store(true, std::memory_order_release);
+    if (auto &chan = bs->channel) chan->close();
+    if (auto &fut = bs->task_future; fut.valid()) fut.wait();
+}
+// JSON-dict variant: set the cancel flag, close the channel, await the task.
+static void cancel_and_wait_json_dict_state(JsonDictIteratorState *js) {
+    js->cancelled.store(true, std::memory_order_release);
+    if (auto &pipe = js->channel) pipe->close();
+    if (auto &pending = js->task_future; pending.valid()) pending.wait();
+}
 static void TraceReaderIterator_dealloc(TraceReaderIteratorObject *self) {
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
     if (self->arrow_state) {
-        auto task_future = self->arrow_state->task_future;
         self->arrow_state->cancelled.store(true, std::memory_order_release);
-        self->arrow_state->cv_producer.notify_all();
-        self->arrow_state->cv_consumer.notify_all();  // wake blocked __next__
-        Py_BEGIN_ALLOW_THREADS {
-            std::unique_lock<std::mutex> lock(self->arrow_state->mtx);
-            self->arrow_state->cv_consumer.wait(lock, [self] {
-                return self->arrow_state->done.load(std::memory_order_acquire);
-            });
-        }
-        if (task_future.valid()) {
-            task_future.wait();
+        if (self->arrow_state->channel) self->arrow_state->channel->close();
+        Py_BEGIN_ALLOW_THREADS if (self->arrow_state->task_future.valid()) {
+            self->arrow_state->task_future.wait();
         }
         Py_END_ALLOW_THREADS self->arrow_state.reset();
     }
 #endif
-    if (self->state) {
-        auto task_future = self->state->task_future;
-        self->state->cancelled.store(true, std::memory_order_release);
-        self->state->cv_producer.notify_all();
-        self->state->cv_consumer.notify_all();  // wake blocked __next__
-        Py_BEGIN_ALLOW_THREADS {
-            std::unique_lock<std::mutex> lock(self->state->mtx);
-            self->state->cv_consumer.wait(lock, [self] {
-                return self->state->done.load(std::memory_order_acquire);
-            });
-        }
-        if (task_future.valid()) {
-            task_future.wait();
-        }
-        Py_END_ALLOW_THREADS self->state.reset();
+    if (self->json_dict_state) {
+        Py_BEGIN_ALLOW_THREADS cancel_and_wait_json_dict_state(
+            self->json_dict_state.get());
+        Py_END_ALLOW_THREADS self->json_dict_state.reset();
+    }
+    if (self->batch_state) {
+        Py_BEGIN_ALLOW_THREADS cancel_and_wait_batch_state(
+            self->batch_state.get());
+        Py_END_ALLOW_THREADS self->batch_state.reset();
     }
+    Py_XDECREF(self->current_batch);
+    self->current_batch = NULL;
     Py_TYPE(self)->tp_free((PyObject *)self);
 }
 
@@ -179,31 +179,68 @@ static PyObject *TraceReaderIterator_iter(TraceReaderIteratorObject *self) {
 }
 
 static PyObject *TraceReaderIterator_next(TraceReaderIteratorObject *self) {
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
-    if (self->mode == IteratorMode::ARROW) {
-        auto *astate = self->arrow_state.get();
-        ArrowIteratorState::BatchItem batch;
-        bool cancelled = false;
-        {
-            Py_BEGIN_ALLOW_THREADS std::unique_lock<std::mutex> lock(
-                astate->mtx);
-            astate->cv_consumer.wait(lock, [astate] {
-                return !astate->queue.empty() ||
-                       astate->cancelled.load(std::memory_order_acquire) ||
-                       astate->done.load(std::memory_order_acquire);
-            });
-            cancelled = astate->cancelled.load(std::memory_order_acquire) &&
-                        astate->queue.empty();
-            if (!cancelled) {
-                batch = std::move(astate->queue.front());
-                astate->queue.pop();
+    if (self->mode == IteratorMode::JSON_DICT) {
+        while (true) {
+            if (self->json_dict_current_batch) {
+                auto &events = self->json_dict_current_batch->events;
+                Py_ssize_t n = static_cast<Py_ssize_t>(events.size());
+                if (self->json_dict_index < n) {
+                    JsonDictValueObject *obj =
+                        (JsonDictValueObject *)JsonDictValueType.tp_alloc(
+                            &JsonDictValueType, 0);
+                    if (!obj) return NULL;
+                    new (&obj->batch) std::shared_ptr<JsonDictBatch>(
+                        self->json_dict_current_batch);
+                    obj->event_index =
+                        static_cast<std::size_t>(self->json_dict_index);
+                    obj->is_args = false;
+                    self->json_dict_index++;
+                    return (PyObject *)obj;
+                }
+                self->json_dict_current_batch.reset();
+                self->json_dict_index = 0;
             }
+
+            auto *js = self->json_dict_state.get();
+            std::optional<JsonDictBatch> batch;
+            Py_BEGIN_ALLOW_THREADS batch = js->channel->blocking_receive();
             Py_END_ALLOW_THREADS
+
+                if (!batch.has_value()) {
+                std::lock_guard<std::mutex> lock(js->error_mtx);
+                if (js->error) {
+                    try {
+                        std::rethrow_exception(js->error);
+                    } catch (const std::exception &e) {
+                        PyErr_SetString(PyExc_RuntimeError, e.what());
+                        return NULL;
+                    } catch (...) {
+                        PyErr_SetString(PyExc_RuntimeError,
+                                        "Unknown error in json dict iterator");
+                        return NULL;
+                    }
+                }
+                return NULL;
+            }
+
+            auto dequeued_bytes = dftracer::utils::python::byte_size(*batch);
+            js->bytes_in_queue.fetch_sub(dequeued_bytes,
+                                         std::memory_order_acq_rel);
+            self->json_dict_current_batch =
+                std::make_shared<JsonDictBatch>(std::move(*batch));
+            self->json_dict_index = 0;
         }
-        if (cancelled) return NULL;  // StopIteration
-        astate->cv_producer.notify_one();
+    }
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    if (self->mode == IteratorMode::ARROW) {
+        auto *astate = self->arrow_state.get();
+        std::optional<ArrowExportResult> batch;
+        Py_BEGIN_ALLOW_THREADS batch = astate->channel->blocking_receive();
+        Py_END_ALLOW_THREADS
 
-        if (!batch.has_value()) {
+            if (!batch.has_value()) {
+            std::lock_guard<std::mutex> lock(astate->error_mtx);
             if (astate->error) {
                 try {
                     std::rethrow_exception(astate->error);
@@ -216,9 +253,13 @@ static PyObject *TraceReaderIterator_next(TraceReaderIteratorObject *self) {
                     return NULL;
                 }
             }
-            return NULL;  // StopIteration
+            return NULL;
         }
 
+        auto dequeued_bytes = dftracer::utils::python::byte_size(*batch);
+        astate->bytes_in_queue.fetch_sub(dequeued_bytes,
+                                         std::memory_order_acq_rel);
+
         ArrowBatchCapsuleObject *obj =
             (ArrowBatchCapsuleObject *)ArrowBatchCapsuleType.tp_alloc(
                 &ArrowBatchCapsuleType, 0);
@@ -228,72 +269,54 @@ static PyObject *TraceReaderIterator_next(TraceReaderIteratorObject *self) {
     }
 #endif
 
-    auto *state = self->state.get();
-
-    // Loop to skip non-JSON lines without recursion (avoids stack overflow
-    // on files with many delimiter lines like "[" and "]").
+    using namespace dftracer::utils::python;
     while (true) {
-        std::optional<std::string> item;
-        bool cancelled = false;
-
-        {
-            Py_BEGIN_ALLOW_THREADS std::unique_lock<std::mutex> lock(
-                state->mtx);
-            state->cv_consumer.wait(lock, [state] {
-                return !state->queue.empty() ||
-                       state->cancelled.load(std::memory_order_acquire) ||
-                       state->done.load(std::memory_order_acquire);
-            });
-            cancelled = state->cancelled.load(std::memory_order_acquire) &&
-                        state->queue.empty();
-            if (!cancelled) {
-                item = std::move(state->queue.front());
-                state->queue.pop();
+        if (self->current_batch) {
+            auto *batch_obj = (MemoryViewBatchObject *)self->current_batch;
+            Py_ssize_t n =
+                static_cast<Py_ssize_t>(batch_obj->data->num_entries());
+            if (self->batch_index < n) {
+                PyObject *mv =
+                    MemoryViewBatch_item(batch_obj, self->batch_index);
+                self->batch_index++;
+                return mv;
             }
-            Py_END_ALLOW_THREADS
+            Py_DECREF(self->current_batch);
+            self->current_batch = NULL;
+            self->batch_index = 0;
         }
-        if (cancelled) return NULL;  // StopIteration
-        state->cv_producer.notify_one();
 
-        if (!item.has_value()) {
-            if (state->error) {
+        auto *bs = self->batch_state.get();
+        std::optional<MemoryViewBatchData> batch_data;
+        Py_BEGIN_ALLOW_THREADS batch_data = bs->channel->blocking_receive();
+        Py_END_ALLOW_THREADS
+
+            if (!batch_data.has_value()) {
+            std::lock_guard<std::mutex> lock(bs->error_mtx);
+            if (bs->error) {
                 try {
-                    std::rethrow_exception(state->error);
+                    std::rethrow_exception(bs->error);
                 } catch (const std::exception &e) {
                     PyErr_SetString(PyExc_RuntimeError, e.what());
                     return NULL;
                 } catch (...) {
                     PyErr_SetString(PyExc_RuntimeError,
-                                    "Unknown error in TraceReaderIterator");
+                                    "Unknown error in batch iterator");
                     return NULL;
                 }
             }
-            return NULL;  // StopIteration
+            return NULL;
         }
 
-        switch (self->mode) {
-            case IteratorMode::LINES:
-                return PyUnicode_FromStringAndSize(
-                    item->data(), static_cast<Py_ssize_t>(item->size()));
-            case IteratorMode::JSON: {
-                const char *trimmed;
-                std::size_t trimmed_length;
-                if (!dftracer::utils::json_trim_and_validate(
-                        item->data(), item->size(), trimmed, trimmed_length)) {
-                    continue;  // skip non-JSON delimiter lines
-                }
-                PyObject *json_obj = JSON_from_data(trimmed, trimmed_length);
-                if (!json_obj) {
-                    PyErr_Clear();
-                    continue;  // skip unparseable lines
-                }
-                return json_obj;
-            }
-            case IteratorMode::RAW:
-            default:
-                return PyBytes_FromStringAndSize(
-                    item->data(), static_cast<Py_ssize_t>(item->size()));
-        }
+        auto dequeued_bytes = dftracer::utils::python::byte_size(*batch_data);
+        bs->bytes_in_queue.fetch_sub(dequeued_bytes, std::memory_order_acq_rel);
+
+        auto *obj = (MemoryViewBatchObject *)MemoryViewBatchType.tp_alloc(
+            &MemoryViewBatchType, 0);
+        if (!obj) return NULL;
+        obj->data = new MemoryViewBatchData(std::move(*batch_data));
+        self->current_batch = (PyObject *)obj;
+        self->batch_index = 0;
     }
 }
 
@@ -317,24 +340,24 @@ PyTypeObject TraceReaderIteratorType = {
     0,                                       /* tp_setattro */
     0,                                       /* tp_as_buffer */
     Py_TPFLAGS_DEFAULT,                      /* tp_flags */
-    "Lazy iterator over TraceReader lines or raw chunks", /* tp_doc */
-    0,                                                    /* tp_traverse */
-    0,                                                    /* tp_clear */
-    0,                                                    /* tp_richcompare */
-    0,                                      /* tp_weaklistoffset */
-    (getiterfunc)TraceReaderIterator_iter,  /* tp_iter */
-    (iternextfunc)TraceReaderIterator_next, /* tp_iternext */
-    0,                                      /* tp_methods */
-    0,                                      /* tp_members */
-    0,                                      /* tp_getset */
-    0,                                      /* tp_base */
-    0,                                      /* tp_dict */
-    0,                                      /* tp_descr_get */
-    0,                                      /* tp_descr_set */
-    0,                                      /* tp_dictoffset */
-    0,                                      /* tp_init */
-    0,                                      /* tp_alloc */
-    0,                                      /* tp_new */
+    "Lazy iterator over TraceReader lines or raw chunks",
+    0,                                       /* tp_traverse */
+    0,                                       /* tp_clear */
+    0,                                       /* tp_richcompare */
+    0,                                       /* tp_weaklistoffset */
+    (getiterfunc)TraceReaderIterator_iter,   /* tp_iter */
+    (iternextfunc)TraceReaderIterator_next,  /* tp_iternext */
+    0,                                       /* tp_methods */
+    0,                                       /* tp_members */
+    0,                                       /* tp_getset */
+    0,                                       /* tp_base */
+    0,                                       /* tp_dict */
+    0,                                       /* tp_descr_get */
+    0,                                       /* tp_descr_set */
+    0,                                       /* tp_dictoffset */
+    0,                                       /* tp_init */
+    0,                                       /* tp_alloc */
+    0,                                       /* tp_new */
 };
 
 int init_trace_reader_iterator(PyObject *m) {
diff --git a/src/dftracer/utils/python/trace_reader_iterator.h b/src/dftracer/utils/python/trace_reader_iterator.h
index 11941fd8..6f9c4654 100644
--- a/src/dftracer/utils/python/trace_reader_iterator.h
+++ b/src/dftracer/utils/python/trace_reader_iterator.h
@@ -2,16 +2,14 @@
 #define DFTRACER_UTILS_PYTHON_TRACE_READER_ITERATOR_H
 
 #include <Python.h>
+#include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/core/task_handle.h>
+#include <dftracer/utils/python/memoryview_batch.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
 
-#include <atomic>
-#include <condition_variable>
 #include <memory>
-#include <mutex>
-#include <optional>
-#include <queue>
 #include <string>
-
+#include <vector>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 #include <dftracer/utils/utilities/common/arrow/arrow_export.h>
 
@@ -24,44 +22,70 @@ extern PyTypeObject ArrowBatchCapsuleType;
 #endif
 
 enum class IteratorMode {
-    LINES,
-    RAW,
-    JSON,
+    MEMORYVIEW,  // items are served from MemoryViewBatch objects
+    JSON_DICT,   // items are JsonDictValueObject views into a JsonDictBatch
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
     ARROW,
 #endif
 };
 
-struct IteratorState {
-    std::queue<std::optional<std::string>> queue;
-    std::mutex mtx;
-    std::condition_variable cv_producer;
-    std::condition_variable cv_consumer;
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+struct ArrowIteratorState {  // state behind IteratorMode::ARROW iteration
+    using BatchType =
+        dftracer::utils::utilities::common::arrow::ArrowExportResult;
+    std::shared_ptr<dftracer::utils::coro::Channel<BatchType>> channel;
+    std::mutex error_mtx;  // guards `error`
     std::exception_ptr error;
     std::atomic<bool> cancelled{false};
-    std::atomic<bool> done{false};
-    std::size_t max_queue_size = 64;
+    std::size_t memory_budget_bytes = 0;  // presumably queue cap - confirm
+    std::atomic<std::size_t> bytes_in_queue{0};  // reduced on each dequeue
     std::shared_future<void> task_future;
+    // Records only the first error; later ones are discarded.
+    void set_error(std::exception_ptr e) {
+        std::lock_guard<std::mutex> lock(error_mtx);
+        if (!error) error = e;
+    }
 };
+#endif
 
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
-struct ArrowIteratorState {
-    using BatchItem = std::optional<
-        dftracer::utils::utilities::common::arrow::ArrowExportResult>;
-    std::queue<BatchItem> queue;
-    std::mutex mtx;
-    std::condition_variable cv_producer;
-    std::condition_variable cv_consumer;
+using ArgsValue = dftracer::utils::utilities::composites::dft::ArgsValue;
+using ArgsMap = dftracer::utils::utilities::composites::dft::ArgsMap;
+
+struct JsonDictEvent {  // element type of JsonDictBatch::events
+    ArgsMap top;   // presumably top-level event fields - TODO confirm
+    ArgsMap args;  // presumably the nested "args" fields - TODO confirm
+};
+
+struct JsonDictBatch {  // unit sent through JsonDictIteratorState::channel
+    std::vector<JsonDictEvent> events;  // indexed by event_index in __next__
+};
+
+struct JsonDictIteratorState {  // state behind IteratorMode::JSON_DICT
+    std::shared_ptr<dftracer::utils::coro::Channel<JsonDictBatch>> channel;
+    std::mutex error_mtx;  // guards `error`
     std::exception_ptr error;
     std::atomic<bool> cancelled{false};
-    std::atomic<bool> done{false};
-    std::size_t max_queue_size = 8;
+    std::size_t memory_budget_bytes = 0;  // presumably queue cap - confirm
+    std::atomic<std::size_t> bytes_in_queue{0};  // reduced on each dequeue
     std::shared_future<void> task_future;
+    // Records only the first error; later ones are discarded.
+    void set_error(std::exception_ptr e) {
+        std::lock_guard<std::mutex> lock(error_mtx);
+        if (!error) error = e;
+    }
 };
-#endif
+
+using dftracer::utils::python::MemoryViewBatchIteratorState;
+using dftracer::utils::python::MemoryViewBatchObject;
+using dftracer::utils::python::MemoryViewBatchType;
 
 typedef struct {
-    PyObject_HEAD std::shared_ptr<IteratorState> state;
+    PyObject_HEAD std::shared_ptr<MemoryViewBatchIteratorState> batch_state;
+    PyObject *current_batch;
+    Py_ssize_t batch_index;
+    std::shared_ptr<JsonDictIteratorState> json_dict_state;
+    std::shared_ptr<JsonDictBatch> json_dict_current_batch;
+    Py_ssize_t json_dict_index;
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
     std::shared_ptr<ArrowIteratorState> arrow_state;
 #endif
diff --git a/src/dftracer/utils/python/utilities/aggregator.cpp b/src/dftracer/utils/python/utilities/aggregator.cpp
index 6f7799d4..204f9e49 100644
--- a/src/dftracer/utils/python/utilities/aggregator.cpp
+++ b/src/dftracer/utils/python/utilities/aggregator.cpp
@@ -1,4 +1,6 @@
 #define PY_SSIZE_T_CLEAN
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/memory_budget.h>
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/core/runtime.h>
 #include <dftracer/utils/python/arrow_helpers.h>
@@ -7,9 +9,23 @@
 #include <dftracer/utils/python/utilities/aggregator.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h>
 
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/python/batch_byte_size.h>
+#include <dftracer/utils/python/streaming_iterator.h>
+#endif
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+#include <dftracer/utils/utilities/common/arrow/partition_router.h>
+#include <dftracer/utils/utilities/common/arrow/partition_writer.h>
+#include <dftracer/utils/utilities/common/query/query.h>
+#endif
+
+#include <cctype>
+#include <memory>
+#include <optional>
 #include <string>
 #include <vector>
 
+using dftracer::utils::CoroScope;
 using dftracer::utils::Runtime;
 using dftracer::utils::coro::CoroTask;
 using namespace dftracer::utils::utilities::composites::dft::aggregators;
@@ -18,8 +34,17 @@ using dftracer::utils::python::wrap_arrow_result;
 using dftracer::utils::python::wrap_arrow_table;
 
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
+using dftracer::utils::python::ArrowStreamingIteratorObject;
+using dftracer::utils::python::ArrowStreamingIteratorType;
+using dftracer::utils::python::StreamingState;
 using dftracer::utils::utilities::common::arrow::ArrowExportResult;
 #endif
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+using dftracer::utils::utilities::common::arrow::IpcCompression;
+using dftracer::utils::utilities::common::arrow::PartitionWriter;
+using dftracer::utils::utilities::common::arrow::PartitionWriteStats;
+using dftracer::utils::utilities::common::query::Query;
+#endif
 
 static Runtime *get_runtime(AggregatorObject *self) {
     if (self->runtime_obj)
@@ -91,8 +116,30 @@ static int parse_str_list(PyObject *obj, std::vector<std::string> &out,
     return 0;
 }
 
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+// Parse `query_obj` into `out` (None/NULL clears it); 0 = ok, -1 = error set.
+static int parse_view_query(PyObject *query_obj, std::optional<Query> &out) {
+    if (query_obj == nullptr || query_obj == Py_None) {
+        out.reset();
+        return 0;
+    }
+    const char *text = PyUnicode_AsUTF8(query_obj);
+    if (text == nullptr) return -1;  // CPython has already set the exception
+    auto result = Query::from_string(text);
+    if (result) {
+        out = std::move(*result);
+        return 0;
+    }
+    const auto reason = result.error().format();
+    PyErr_Format(PyExc_ValueError, "Invalid query: %s", reason.c_str());
+    return -1;
+}
+#endif
+
 static int parse_aggregator_args(PyObject *args, PyObject *kwds,
-                                 AggregatorInput &input) {
+                                 AggregatorInput &input,
+                                 std::size_t *buffer_size_out = nullptr,
+                                 std::optional<Query> *query_out = nullptr) {
     static const char *kwlist[] = {"directory",
                                    "time_interval_ms",
                                    "group_keys",
@@ -101,11 +148,12 @@ static int parse_aggregator_args(PyObject *args, PyObject *kwds,
                                    "index_dir",
                                    "checkpoint_size",
                                    "force_rebuild",
-                                   "chunk_size_mb",
-                                   "batch_size_mb",
+                                   "parallelism",
                                    "event_batch_size",
                                    "custom_metric_fields",
                                    "compute_percentiles",
+                                   "buffer_size",
+                                   "query",
                                    NULL};
 
     const char *directory = NULL;
@@ -116,28 +164,40 @@ static int parse_aggregator_args(PyObject *args, PyObject *kwds,
     const char *index_dir = "";
     Py_ssize_t checkpoint_size = 32 * 1024 * 1024;
     int force_rebuild = 0;
-    Py_ssize_t chunk_size_mb = 64;
-    Py_ssize_t batch_size_mb = 4;
+    Py_ssize_t parallelism = 0;
     Py_ssize_t event_batch_size = 10000;
     PyObject *custom_metrics_obj = Py_None;
     int compute_percentiles = 0;
+    Py_ssize_t buffer_size = 8;
+    PyObject *query_obj = Py_None;
 
     if (!PyArg_ParseTupleAndKeywords(
-            args, kwds, "s|dOOOsnpnnnOp", (char **)kwlist, &directory,
+            args, kwds, "s|dOOOsnpnnOpnO", (char **)kwlist, &directory,
             &time_interval_ms, &group_keys_obj, &categories_obj, &names_obj,
-            &index_dir, &checkpoint_size, &force_rebuild, &chunk_size_mb,
-            &batch_size_mb, &event_batch_size, &custom_metrics_obj,
-            &compute_percentiles))
+            &index_dir, &checkpoint_size, &force_rebuild, &parallelism,
+            &event_batch_size, &custom_metrics_obj, &compute_percentiles,
+            &buffer_size, &query_obj))
         return -1;
 
+    if (buffer_size_out) {
+        *buffer_size_out = static_cast<std::size_t>(buffer_size);
+    }
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+    if (query_out) {
+        if (parse_view_query(query_obj, *query_out) < 0) return -1;
+    }
+#else
+    (void)query_obj;
+#endif
+
     input.directory = directory;
     input.config.time_interval_us =
         static_cast<std::uint64_t>(time_interval_ms * 1000.0);
     input.index_dir = index_dir;
     input.checkpoint_size = static_cast<std::size_t>(checkpoint_size);
     input.force_rebuild = force_rebuild != 0;
-    input.chunk_size_mb = static_cast<std::size_t>(chunk_size_mb);
-    input.batch_size_mb = static_cast<std::size_t>(batch_size_mb);
+    input.parallelism = static_cast<std::size_t>(parallelism);
     input.event_batch_size = static_cast<std::size_t>(event_batch_size);
     input.config.compute_percentiles = compute_percentiles != 0;
 
@@ -151,23 +211,49 @@ static int parse_aggregator_args(PyObject *args, PyObject *kwds,
     return 0;
 }
 
-static int run_aggregator_pipeline(AggregatorObject *self,
-                                   const AggregatorInput &input,
-                                   std::vector<AggregationBatch> &batches,
-                                   std::string &error_msg) {
-    auto *bp = &batches;
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+static int run_aggregator_pipeline(
+    AggregatorObject *self, const AggregatorInput &input,
+    std::vector<ArrowExportResult> &results, std::string &error_msg,
+    const std::optional<Query> *query = nullptr) {
+    auto *rp = &results;
     AggregatorInput input_copy = input;
+    std::optional<Query> query_copy;
+    if (query) query_copy = *query;
 
     Py_BEGIN_ALLOW_THREADS try {
         Runtime *rt = get_runtime(self);
-        auto task = [bp, input_copy]() -> CoroTask<void> {
-            AggregatorUtility util;
-            auto gen = util.process(input_copy);
-            while (auto batch = co_await gen.next()) {
-                bp->push_back(std::move(*batch));
-            }
-        };
-        rt->submit(task(), "aggregator").get();
+        rt->submit(run_coro_scope(
+                       rt->executor(),
+                       [](CoroScope &scope, std::vector<ArrowExportResult> *out,
+                          AggregatorInput input,
+                          std::optional<Query> query) -> CoroTask<void> {
+                           AggregatorUtility util;
+                           util.bind_context(scope);
+                           try {
+                               auto gen = util.process(input);
+                               while (auto batch = co_await gen.next()) {
+                                   if (batch->entries.empty()) continue;
+                                   AggregationBatch filtered;
+                                   if (query) {
+                                       filtered = batch->filter(*query);
+                                       if (filtered.entries.empty()) continue;
+                                   } else {
+                                       filtered = std::move(*batch);
+                                   }
+                                   auto arrow_result = filtered.to_arrow();
+                                   if (!arrow_result.valid()) continue;
+                                   out->push_back(std::move(arrow_result));
+                               }
+                               util.unbind_context();
+                           } catch (...) {
+                               util.unbind_context();
+                               throw;
+                           }
+                       },
+                       rp, std::move(input_copy), std::move(query_copy)),
+                   "aggregator")
+            .get();
     } catch (const std::exception &e) {
         error_msg = e.what();
     }
@@ -177,38 +263,88 @@ static int run_aggregator_pipeline(AggregatorObject *self,
         ? 0
         : -1;
 }
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
 
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 
+// Producer coroutine behind iter_arrow(): runs the aggregation and pushes each
+// valid Arrow batch into `state`, applying `query` as a filter when set.
+// Every exit path ends in state->complete() or state->fail(); a cancel request
+// or a false return from push() just stops production early.
+static CoroTask<void> run_aggregator_stream(
+    CoroScope &scope, std::shared_ptr<StreamingState<ArrowExportResult>> state,
+    AggregatorInput input, std::optional<Query> query) {
+    if (state->cancelled()) {
+        state->complete();
+        co_return;
+    }
+
+    try {
+        AggregatorUtility util;
+        util.bind_context(scope);
+        auto gen = util.process(input);
+        while (auto batch = co_await gen.next()) {
+            if (state->cancelled()) break;
+            if (batch->entries.empty()) continue;
+
+            AggregationBatch filtered;
+            if (query) {
+                filtered = batch->filter(*query);
+                if (filtered.entries.empty()) continue;
+            } else {
+                filtered = std::move(*batch);
+            }
+            auto arrow_result = filtered.to_arrow();
+            if (!arrow_result.valid()) continue;
+
+            // Measure before the move: arrow_result is consumed by push().
+            auto result_bytes =
+                dftracer::utils::python::byte_size(arrow_result);
+            if (!state->push(std::move(arrow_result), result_bytes)) break;
+        }
+
+        util.unbind_context();
+        state->complete();
+    } catch (...) {
+        // current_exception() captures any type, so one handler is enough.
+        state->fail(std::current_exception());
+    }
+}
+
 #endif  // DFTRACER_UTILS_ENABLE_ARROW
 
 // ---------------------------------------------------------------------------
-// process() — returns ArrowTable (materialized)
+// process() - returns ArrowTable (materialized)
 // ---------------------------------------------------------------------------
 
 static PyObject *Aggregator_process(AggregatorObject *self, PyObject *args,
                                     PyObject *kwds) {
     AggregatorInput input;
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+    std::optional<Query> query;
+    if (parse_aggregator_args(args, kwds, input, nullptr, &query) < 0)
+        return NULL;
+#else
     if (parse_aggregator_args(args, kwds, input) < 0) return NULL;
+#endif
 
-    std::vector<AggregationBatch> batches;
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    std::vector<ArrowExportResult> results;
     std::string error_msg;
-    if (run_aggregator_pipeline(self, input, batches, error_msg) < 0) {
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+    if (run_aggregator_pipeline(self, input, results, error_msg, &query) < 0) {
+#else
+    if (run_aggregator_pipeline(self, input, results, error_msg) < 0) {
+#endif
         PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
         return NULL;
     }
 
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
     PyObject *batch_list = PyList_New(0);
     if (!batch_list) return NULL;
 
-    for (const auto &batch : batches) {
-        if (batch.entries.empty()) continue;
-
-        auto arrow_result = batch.to_arrow();
-        if (!arrow_result.valid()) continue;
-
-        PyObject *cap = wrap_arrow_result(std::move(arrow_result));
+    for (auto &result : results) {
+        PyObject *cap = wrap_arrow_result(std::move(result));
         if (!cap) {
             Py_DECREF(batch_list);
             return NULL;
@@ -230,55 +366,402 @@ static PyObject *Aggregator_process(AggregatorObject *self, PyObject *args,
 }
 
 // ---------------------------------------------------------------------------
-// iter_arrow() — returns list iterator of ArrowBatch capsules
+// iter_arrow() - returns true streaming iterator
 // ---------------------------------------------------------------------------
 
 static PyObject *Aggregator_iter_arrow(AggregatorObject *self, PyObject *args,
                                        PyObject *kwds) {
     AggregatorInput input;
-    if (parse_aggregator_args(args, kwds, input) < 0) return NULL;
+    // NOTE(review): buffer_size is parsed but not forwarded anywhere below;
+    // the state is built from compute_memory_budget(0). Confirm the intent.
+    std::size_t buffer_size = 8;
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+    std::optional<Query> query;
+    if (parse_aggregator_args(args, kwds, input, &buffer_size, &query) < 0)
+        return NULL;
+#else
+    if (parse_aggregator_args(args, kwds, input, &buffer_size) < 0) return NULL;
+#endif
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+    auto state = std::make_shared<StreamingState<ArrowExportResult>>(
+        dftracer::utils::compute_memory_budget(0));
+
+    ArrowStreamingIteratorObject *iter_obj =
+        (ArrowStreamingIteratorObject *)ArrowStreamingIteratorType.tp_new(
+            &ArrowStreamingIteratorType, NULL, NULL);
+    if (!iter_obj) return NULL;
+
+    iter_obj->cpp_state->state = state;
+    iter_obj->cpp_state->pull_next =
+        [state]() -> std::optional<ArrowExportResult> { return state->pull(); };
+    iter_obj->cpp_state->get_error = [state]() -> std::exception_ptr {
+        return state->error();
+    };
+    iter_obj->cpp_state->cancel = [state]() { state->cancel(); };
+
+    // Move input/query into the coroutine; neither is used again afterwards.
+    // NOTE(review): a throw from submit() would skip Py_END_ALLOW_THREADS.
+    Runtime *rt = get_runtime(self);
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+    Py_BEGIN_ALLOW_THREADS rt->submit(
+        run_coro_scope(rt->executor(), run_aggregator_stream, state,
+                       std::move(input), std::move(query)),
+        "aggregator_stream");
+#else
+    Py_BEGIN_ALLOW_THREADS rt->submit(
+        run_coro_scope(rt->executor(), run_aggregator_stream, state,
+                       std::move(input), std::nullopt),
+        "aggregator_stream");
+#endif
+    Py_END_ALLOW_THREADS
+
+        return (PyObject *)iter_obj;
+#else
+    PyErr_SetString(PyExc_RuntimeError,
+                    "dftracer-utils was built without Arrow support");
+    return NULL;
+#endif
+}
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+struct AggregatorViewDef {
+    std::string name;            // view label; also keys the per-view stats
+    std::optional<Query> query;  // row filter; nullopt writes every entry
+};
+
+struct AggregatorWriteArrowResult {
+    std::unordered_map<std::string, PartitionWriteStats> view_stats;  // by view
+    int64_t total_rows = 0;   // sum of total_rows across views
+    int64_t total_bytes = 0;  // sum of total_uncompressed_bytes across views
+    std::string error;        // non-empty when the write failed
+};
+
+// Coroutine body of write_arrow(): streams aggregated batches into one
+// PartitionWriter per view and reports stats or an error through `out`.
+static CoroTask<void> run_aggregator_write_arrow(
+    CoroScope &scope, AggregatorWriteArrowResult *out, AggregatorInput input,
+    std::string output_path, std::vector<AggregatorViewDef> views,
+    int64_t chunk_size_bytes, IpcCompression compression) {
+    try {
+        // If no views specified, create a default "all" view
+        if (views.empty()) {
+            views.push_back({"all", std::nullopt});
+        }
+
+        // Open a writer for each view
+        std::vector<PartitionWriter> writers(views.size());
+        for (std::size_t i = 0; i < views.size(); ++i) {
+            std::string view_path = output_path;
+            if (views.size() > 1 || views[i].name != "all") {
+                view_path = output_path + "/" + views[i].name;
+            }
+            int rc = co_await writers[i].open(view_path, chunk_size_bytes,
+                                              compression);
+            if (rc != 0) {
+                out->error = "Failed to open writer for view: " + views[i].name;
+                co_return;
+            }
+        }
+
+        AggregatorUtility util;
+        util.bind_context(scope);
+        auto gen = util.process(input);
+        while (auto batch = co_await gen.next()) {
+            if (batch->entries.empty()) continue;
+            for (std::size_t i = 0; i < views.size(); ++i) {
+                // Only filtered views need a batch of their own; unfiltered
+                // views export the shared batch instead of deep-copying it.
+                std::optional<AggregationBatch> filtered;
+                if (views[i].query) {
+                    filtered = batch->filter(*views[i].query);
+                    if (filtered->entries.empty()) continue;
+                }
+                auto arrow_result =
+                    filtered ? filtered->to_arrow() : batch->to_arrow();
+                if (!arrow_result.valid()) continue;
+
+                int rc = co_await writers[i].write_batch(arrow_result);
+                if (rc != 0) {
+                    util.unbind_context();
+                    out->error =
+                        "Failed to write batch for view: " + views[i].name;
+                    co_return;
+                }
+            }
+        }
+        util.unbind_context();
+
+        // Close writers and collect stats
+        for (std::size_t i = 0; i < views.size(); ++i) {
+            auto stats = co_await writers[i].close();
+            out->total_rows += stats.total_rows;
+            out->total_bytes += stats.total_uncompressed_bytes;
+            out->view_stats[views[i].name] = std::move(stats);
+        }
+    } catch (const std::exception &e) {
+        out->error = e.what();
+    } catch (...) {
+        // The caller only catches std::exception, so anything else stops here.
+        out->error = "Unknown error while writing Arrow output";
+    }
+}
+
+// Python entry point for Aggregator.write_arrow().
+//
+// Parses the keyword arguments (including the optional per-view queries and
+// the compression mode), runs run_aggregator_write_arrow() on the runtime
+// with the GIL released, and converts the collected statistics into
+// {"views": {name: {"files", "rows", "bytes"}}, "total_rows", "total_bytes"}.
+//
+// Returns a new dict, or NULL with a Python exception set when an argument is
+// invalid or the pipeline reports an error.
+static PyObject *Aggregator_write_arrow(AggregatorObject *self, PyObject *args,
+                                        PyObject *kwds) {
+    static const char *kwlist[] = {"directory",
+                                   "path",
+                                   "time_interval_ms",
+                                   "group_keys",
+                                   "categories",
+                                   "names",
+                                   "index_dir",
+                                   "checkpoint_size",
+                                   "force_rebuild",
+                                   "parallelism",
+                                   "event_batch_size",
+                                   "custom_metric_fields",
+                                   "compute_percentiles",
+                                   "views",
+                                   "chunk_size_mb",
+                                   "compression",
+                                   NULL};
+
+    const char *directory = NULL;
+    const char *output_path = NULL;
+    double time_interval_ms = 5000.0;
+    PyObject *group_keys_obj = Py_None;
+    PyObject *categories_obj = Py_None;
+    PyObject *names_obj = Py_None;
+    const char *index_dir = "";
+    Py_ssize_t checkpoint_size = 32 * 1024 * 1024;
+    int force_rebuild = 0;
+    Py_ssize_t parallelism = 0;
+    Py_ssize_t event_batch_size = 10000;
+    PyObject *custom_metrics_obj = Py_None;
+    int compute_percentiles = 0;
+    PyObject *views_obj = Py_None;
+    int chunk_size_mb = 32;
+    const char *compression_str = "zstd";
+
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwds, "ss|dOOOsnpnnOpOis", (char **)kwlist, &directory,
+            &output_path, &time_interval_ms, &group_keys_obj, &categories_obj,
+            &names_obj, &index_dir, &checkpoint_size, &force_rebuild,
+            &parallelism, &event_batch_size, &custom_metrics_obj,
+            &compute_percentiles, &views_obj, &chunk_size_mb, &compression_str))
+        return NULL;
+
+    // Parse views
+    std::vector<AggregatorViewDef> views;
+    if (views_obj && views_obj != Py_None) {
+        if (!PyList_Check(views_obj)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "views must be a list of dicts with 'name' and "
+                            "optional 'query' keys");
+            return NULL;
+        }
+        Py_ssize_t n = PyList_Size(views_obj);
+        for (Py_ssize_t i = 0; i < n; i++) {
+            // PyList_GetItem / PyDict_GetItemString return borrowed references.
+            PyObject *item = PyList_GetItem(views_obj, i);
+            if (!PyDict_Check(item)) {
+                PyErr_SetString(PyExc_TypeError,
+                                "each view must be a dict with 'name' key");
+                return NULL;
+            }
+            AggregatorViewDef view;
+            PyObject *name_obj = PyDict_GetItemString(item, "name");
+            if (!name_obj) {
+                PyErr_SetString(PyExc_ValueError,
+                                "each view must have a 'name' key");
+                return NULL;
+            }
+            const char *name_str = PyUnicode_AsUTF8(name_obj);
+            if (!name_str) return NULL;
+            view.name = name_str;
+
+            PyObject *query_obj = PyDict_GetItemString(item, "query");
+            if (query_obj && query_obj != Py_None) {
+                const char *query_str = PyUnicode_AsUTF8(query_obj);
+                if (!query_str) return NULL;
+                auto parsed = Query::from_string(query_str);
+                if (!parsed) {
+                    PyErr_Format(PyExc_ValueError,
+                                 "Invalid query for view '%s': %s", name_str,
+                                 parsed.error().format().c_str());
+                    return NULL;
+                }
+                view.query = std::move(*parsed);
+            }
+            views.push_back(std::move(view));
+        }
+    }
+
+    // Parse compression
+    IpcCompression compression = IpcCompression::ZSTD;
+    if (compression_str) {
+        std::string comp_lower(compression_str);
+        for (auto &c : comp_lower)
+            c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+        if (comp_lower == "none") {
+            compression = IpcCompression::NONE;
+        } else if (comp_lower == "zstd") {
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+            compression = IpcCompression::ZSTD;
+#else
+            PyErr_SetString(PyExc_ValueError, "ZSTD compression not available");
+            return NULL;
+#endif
+        } else {
+            PyErr_Format(PyExc_ValueError,
+                         "Unknown compression: %s (use 'none' or 'zstd')",
+                         compression_str);
+            return NULL;
+        }
+    }
+
+    int64_t chunk_size_bytes =
+        static_cast<int64_t>(chunk_size_mb) * 1024 * 1024;
+
+    // Parse group_keys
+    std::vector<std::string> group_keys;
+    if (group_keys_obj && group_keys_obj != Py_None) {
+        if (!PyList_Check(group_keys_obj)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "group_keys must be a list of str");
+            return NULL;
+        }
+        Py_ssize_t n = PyList_Size(group_keys_obj);
+        for (Py_ssize_t i = 0; i < n; i++) {
+            const char *s = PyUnicode_AsUTF8(PyList_GetItem(group_keys_obj, i));
+            if (!s) return NULL;
+            group_keys.emplace_back(s);
+        }
+    }
+
+    // Parse custom_metric_fields
+    std::vector<std::string> custom_metrics;
+    if (custom_metrics_obj && custom_metrics_obj != Py_None) {
+        if (!PyList_Check(custom_metrics_obj)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "custom_metric_fields must be a list of str");
+            return NULL;
+        }
+        Py_ssize_t n = PyList_Size(custom_metrics_obj);
+        for (Py_ssize_t i = 0; i < n; i++) {
+            const char *s =
+                PyUnicode_AsUTF8(PyList_GetItem(custom_metrics_obj, i));
+            if (!s) return NULL;
+            custom_metrics.emplace_back(s);
+        }
+    }
+
+    // NOTE(review): categories/names are parsed above but never applied to
+    // `input` in this function - confirm whether the filters belong here.
+    AggregatorInput input;
+    input.directory = directory;
+    input.config.time_interval_us =
+        static_cast<std::uint64_t>(time_interval_ms * 1000.0);
+    input.config.extra_group_keys = std::move(group_keys);
+    input.config.custom_metric_fields = std::move(custom_metrics);
+    input.config.compute_percentiles = compute_percentiles != 0;
+    input.index_dir = index_dir;
+    input.checkpoint_size = static_cast<std::size_t>(checkpoint_size);
+    input.force_rebuild = force_rebuild != 0;
+    input.parallelism = static_cast<std::size_t>(parallelism);
+    input.event_batch_size = static_cast<std::size_t>(event_batch_size);
 
-    std::vector<AggregationBatch> batches;
+    std::string output_path_str(output_path);
+    AggregatorWriteArrowResult result;
+    auto *rp = &result;
     std::string error_msg;
-    if (run_aggregator_pipeline(self, input, batches, error_msg) < 0) {
+
+    // Run the pipeline with the GIL released; exceptions are captured into
+    // error_msg so they cannot unwind while the GIL is not held.
+    Py_BEGIN_ALLOW_THREADS try {
+        Runtime *rt = get_runtime(self);
+        rt->submit(
+              run_coro_scope(rt->executor(), run_aggregator_write_arrow, rp,
+                             std::move(input), output_path_str,
+                             std::move(views), chunk_size_bytes, compression),
+              "aggregator_write_arrow")
+            .get();
+    } catch (const std::exception &e) {
+        error_msg = e.what();
+    }
+    Py_END_ALLOW_THREADS
+
+        if (!error_msg.empty()) {
         PyErr_SetString(PyExc_RuntimeError, error_msg.c_str());
         return NULL;
     }
 
-#ifdef DFTRACER_UTILS_ENABLE_ARROW
-    PyObject *batch_list = PyList_New(0);
-    if (!batch_list) return NULL;
+    if (!result.error.empty()) {
+        PyErr_SetString(PyExc_RuntimeError, result.error.c_str());
+        return NULL;
+    }
 
-    for (const auto &batch : batches) {
-        if (batch.entries.empty()) continue;
+    // Build result dict. Py_BuildValue is used for the leaf values because
+    // PyDict_SetItemString does not steal references: "O" takes its own
+    // reference and "L" creates the int internally, so nothing is leaked.
 
-        auto arrow_result = batch.to_arrow();
-        if (!arrow_result.valid()) continue;
+    PyObject *views_dict = PyDict_New();
+    if (!views_dict) return NULL;
 
-        PyObject *cap = wrap_arrow_result(std::move(arrow_result));
-        if (!cap) {
-            Py_DECREF(batch_list);
+    for (const auto &[view_name, view_stats] : result.view_stats) {
+        PyObject *files_list = PyList_New(0);
+        if (!files_list) {
+            Py_DECREF(views_dict);
             return NULL;
         }
 
-        int rc = PyList_Append(batch_list, cap);
-        Py_DECREF(cap);
-        if (rc < 0) {
-            Py_DECREF(batch_list);
+        for (const auto &f : view_stats.files) {
+            PyObject *file_str = PyUnicode_FromString(f.c_str());
+            if (!file_str || PyList_Append(files_list, file_str) < 0) {
+                Py_XDECREF(file_str);
+                Py_DECREF(files_list);
+                Py_DECREF(views_dict);
+                return NULL;
+            }
+            Py_DECREF(file_str);
+        }
+
+        // view_dict holds its own reference to files_list after this call.
+        PyObject *view_dict = Py_BuildValue(
+            "{s:O,s:L,s:L}", "files", files_list, "rows",
+            static_cast<long long>(view_stats.total_rows), "bytes",
+            static_cast<long long>(view_stats.total_uncompressed_bytes));
+        Py_DECREF(files_list);
+        if (!view_dict || PyDict_SetItemString(views_dict, view_name.c_str(),
+                                               view_dict) < 0) {
+            Py_XDECREF(view_dict);
+            Py_DECREF(views_dict);
             return NULL;
         }
+        Py_DECREF(view_dict);
     }
 
-    PyObject *it = PyObject_GetIter(batch_list);
-    Py_DECREF(batch_list);
-    return it;
-#else
-    PyErr_SetString(PyExc_RuntimeError,
-                    "dftracer-utils was built without Arrow support");
-    return NULL;
-#endif
+    PyObject *summary = Py_BuildValue(
+        "{s:O,s:L,s:L}", "views", views_dict, "total_rows",
+        static_cast<long long>(result.total_rows), "total_bytes",
+        static_cast<long long>(result.total_bytes));
+    Py_DECREF(views_dict);
+    return summary;  // NULL with a Python error set if construction failed
 }
 
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+
 static PyObject *Aggregator_call(PyObject *self, PyObject *args,
                                  PyObject *kwds) {
     return Aggregator_process((AggregatorObject *)self, args, kwds);
@@ -289,12 +772,14 @@ static PyMethodDef Aggregator_methods[] = {
      "process(directory, time_interval_ms=5000.0, group_keys=None,\n"
      "        categories=None, names=None, index_dir='',\n"
      "        checkpoint_size=33554432, force_rebuild=False,\n"
-     "        chunk_size_mb=64, batch_size_mb=4, event_batch_size=10000,\n"
+     "        parallelism=0, event_batch_size=10000,\n"
      "        custom_metric_fields=None, compute_percentiles=False)\n"
      "--\n"
      "\n"
      "Run aggregation pipeline, return materialized ArrowTable.\n"
      "\n"
+     "Uses parallel, RocksDB-backed, fused indexing and aggregation.\n"
+     "\n"
      "Args:\n"
      "    directory (str): Directory containing .pfw/.pfw.gz files.\n"
      "    time_interval_ms (float): Time bucket in milliseconds (default "
@@ -305,12 +790,11 @@ static PyMethodDef Aggregator_methods[] = {
      "    index_dir (str): Directory for .dftindex stores (default '').\n"
      "    checkpoint_size (int): Checkpoint size (default 33554432).\n"
      "    force_rebuild (bool): Force index rebuild (default False).\n"
-     "    chunk_size_mb (int): Target chunk size in MB (default 64).\n"
-     "    batch_size_mb (int): Batch read size in MB (default 4).\n"
+     "    parallelism (int): Number of parallel workers (0 = all cores).\n"
      "    event_batch_size (int): Entries per batch (default 10000).\n"
      "    custom_metric_fields (list[str] or None): Extra numeric args\n"
-     "        fields to aggregate into *_total/*_min/*_max/*_mean/*_std\n"
-     "        columns (default None).\n"
+     "        fields to aggregate into ``*_total``/``*_min``/``*_max``/\n"
+     "        ``*_mean``/``*_std`` columns (default None).\n"
      "    compute_percentiles (bool): Enable percentile sketch collection\n"
      "        during aggregation (default False).\n"
      "\n"
@@ -321,12 +805,19 @@ static PyMethodDef Aggregator_methods[] = {
      "iter_arrow(directory, time_interval_ms=5000.0, group_keys=None,\n"
      "           categories=None, names=None, index_dir='',\n"
      "           checkpoint_size=33554432, force_rebuild=False,\n"
-     "           chunk_size_mb=64, batch_size_mb=4, event_batch_size=10000,\n"
-     "           custom_metric_fields=None, compute_percentiles=False)\n"
+     "           parallelism=0, event_batch_size=10000,\n"
+     "           custom_metric_fields=None, compute_percentiles=False,\n"
+     "           buffer_size=8)\n"
      "--\n"
      "\n"
      "Run aggregation pipeline, stream Arrow batches.\n"
      "\n"
+     "Returns immediately with a streaming iterator. Batches are produced\n"
+     "in the background with a bounded buffer. GIL is released while waiting\n"
+     "for the next batch, allowing other Python threads to run.\n"
+     "\n"
+     "Uses parallel, RocksDB-backed, fused indexing and aggregation.\n"
+     "\n"
      "Args:\n"
      "    directory (str): Directory containing .pfw/.pfw.gz files.\n"
      "    time_interval_ms (float): Time bucket in milliseconds (default "
@@ -337,17 +828,56 @@ static PyMethodDef Aggregator_methods[] = {
      "    index_dir (str): Directory for .dftindex stores (default '').\n"
      "    checkpoint_size (int): Checkpoint size (default 33554432).\n"
      "    force_rebuild (bool): Force index rebuild (default False).\n"
-     "    chunk_size_mb (int): Target chunk size in MB (default 64).\n"
-     "    batch_size_mb (int): Batch read size in MB (default 4).\n"
+     "    parallelism (int): Number of parallel workers (0 = all cores).\n"
      "    event_batch_size (int): Entries per batch (default 10000).\n"
      "    custom_metric_fields (list[str] or None): Extra numeric args\n"
-     "        fields to aggregate into *_total/*_min/*_max/*_mean/*_std\n"
-     "        columns (default None).\n"
+     "        fields to aggregate into ``*_total``/``*_min``/``*_max``/\n"
+     "        ``*_mean``/``*_std`` columns (default None).\n"
      "    compute_percentiles (bool): Enable percentile sketch collection\n"
      "        during aggregation (default False).\n"
+     "    buffer_size (int): Max batches to buffer (default 8).\n"
+     "\n"
+     "Returns:\n"
+     "    _ArrowStreamingIterator: Streaming iterator yielding Arrow record\n"
+     "        batches. Supports cancel() to stop early.\n"},
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+    {"write_arrow", (PyCFunction)Aggregator_write_arrow,
+     METH_VARARGS | METH_KEYWORDS,
+     "write_arrow(directory, path, time_interval_ms=5000.0, ..., views=None)\n"
+     "--\n"
+     "\n"
+     "Run aggregation and write results to Arrow IPC files with optional "
+     "views.\n"
+     "\n"
+     "Views allow filtering aggregated entries before writing. Each view\n"
+     "writes to a separate subdirectory. Query syntax supports: cat, name,\n"
+     "pid, tid, hhash, fhash, time_bucket, extra group keys, and aggregation\n"
+     "metrics (count, dur_total, dur_min, dur_max, size_total, etc.).\n"
+     "\n"
+     "Args:\n"
+     "    directory (str): Directory containing .pfw/.pfw.gz files.\n"
+     "    path (str): Output directory for Arrow files.\n"
+     "    time_interval_ms (float): Time bucket in milliseconds.\n"
+     "    group_keys (list[str] or None): Extra grouping dims.\n"
+     "    categories (list[str] or None): Category filter.\n"
+     "    names (list[str] or None): Name filter.\n"
+     "    index_dir (str): Directory for .dftindex stores.\n"
+     "    checkpoint_size (int): Checkpoint size.\n"
+     "    force_rebuild (bool): Force index rebuild.\n"
+     "    parallelism (int): Number of parallel workers.\n"
+     "    event_batch_size (int): Entries per batch.\n"
+     "    custom_metric_fields (list[str] or None): Extra numeric fields.\n"
+     "    compute_percentiles (bool): Enable percentile collection.\n"
+     "    views (list[dict] or None): View definitions, each with 'name' and\n"
+     "        optional 'query' keys. If None, writes all entries to path.\n"
+     "        Example: [{'name': 'io', 'query': 'cat == \"POSIX\"'}]\n"
+     "    chunk_size_mb (int): Max uncompressed MB per file (default 32).\n"
+     "    compression (str): 'zstd' or 'none' (default 'zstd').\n"
      "\n"
      "Returns:\n"
-     "    Iterator[ArrowBatch]: Arrow record batches.\n"},
+     "    dict: Statistics with 'views' (per-view stats), 'total_rows',\n"
+     "        'total_bytes'. Each view has 'files', 'rows', 'bytes'.\n"},
+#endif
     {NULL}};
 
 PyTypeObject AggregatorType = {
diff --git a/src/dftracer/utils/python/utilities/comparator.cpp b/src/dftracer/utils/python/utilities/comparator.cpp
index c377fac7..27ead4bb 100644
--- a/src/dftracer/utils/python/utilities/comparator.cpp
+++ b/src/dftracer/utils/python/utilities/comparator.cpp
@@ -18,6 +18,7 @@
 #include <dftracer/utils/utilities/composites/dft/comparator/comparison_result.h>
 #include <dftracer/utils/utilities/composites/dft/comparator/comparison_utility.h>
 #include <dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
 #include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
@@ -29,8 +30,6 @@
 #include <cstdio>
 #include <ctime>
 #include <string>
-#include <thread>
-#include <unordered_set>
 #include <vector>
 
 using dftracer::utils::Runtime;
@@ -40,6 +39,7 @@ using namespace dftracer::utils::utilities;
 using namespace dftracer::utils::utilities::composites::dft::aggregators;
 using namespace dftracer::utils::utilities::composites::dft::comparator;
 
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 using dftracer::utils::python::arrow_result_to_table;
 using dftracer::utils::utilities::common::arrow::ArrowExportResult;
@@ -108,17 +108,27 @@ struct ComparatorArgs {
     double time_interval_ms = 5000.0;
     double threshold = 0.0;
     std::size_t executor_threads = 0;
-    std::string index_dir;
+    std::string baseline_index_dir;
+    std::string variant_index_dir;
     bool force_rebuild = false;
     std::string config_path;
 };
 
 static int parse_comparator_args(PyObject *args, PyObject *kwds,
                                  ComparatorArgs &out) {
-    static const char *kwlist[] = {
-        "baseline",  "variant",          "query",     "group_by",
-        "format",    "time_interval_ms", "threshold", "executor_threads",
-        "index_dir", "force_rebuild",    "config",    NULL};
+    static const char *kwlist[] = {"baseline",
+                                   "variant",
+                                   "query",
+                                   "group_by",
+                                   "format",
+                                   "time_interval_ms",
+                                   "threshold",
+                                   "executor_threads",
+                                   "baseline_index_dir",
+                                   "variant_index_dir",
+                                   "force_rebuild",
+                                   "config",
+                                   NULL};
 
     const char *baseline = NULL;
     const char *variant = NULL;
@@ -128,14 +138,16 @@ static int parse_comparator_args(PyObject *args, PyObject *kwds,
     double time_interval_ms = 5000.0;
     double threshold = 0.0;
     Py_ssize_t executor_threads = 0;
-    const char *index_dir = "";
+    const char *baseline_index_dir = "";
+    const char *variant_index_dir = "";
     int force_rebuild = 0;
     const char *config = "";
 
     if (!PyArg_ParseTupleAndKeywords(
-            args, kwds, "ss|sssddnsps", (char **)kwlist, &baseline, &variant,
+            args, kwds, "ss|sssddnssps", (char **)kwlist, &baseline, &variant,
             &query, &group_by, &format, &time_interval_ms, &threshold,
-            &executor_threads, &index_dir, &force_rebuild, &config))
+            &executor_threads, &baseline_index_dir, &variant_index_dir,
+            &force_rebuild, &config))
         return -1;
 
     out.baseline = baseline;
@@ -146,7 +158,8 @@ static int parse_comparator_args(PyObject *args, PyObject *kwds,
     out.time_interval_ms = time_interval_ms;
     out.threshold = threshold;
     out.executor_threads = static_cast<std::size_t>(executor_threads);
-    out.index_dir = index_dir;
+    out.baseline_index_dir = baseline_index_dir;
+    out.variant_index_dir = variant_index_dir;
     out.force_rebuild = force_rebuild != 0;
     out.config_path = config;
 
@@ -163,7 +176,7 @@ void flatten_nodes(const ComparisonNode &node,
     }
 }
 
-CoroTask<EventAggregatorUtilityOutput> run_aggregation(
+CoroTask<EventAggregatorOutput> run_aggregation(
     std::vector<std::string> input_files, AggregationConfig agg_config,
     std::optional<common::query::Query> query, std::string index_dir,
     std::size_t checkpoint_size, bool force_rebuild,
@@ -177,7 +190,7 @@ CoroTask<EventAggregatorUtilityOutput> run_aggregation(
                                .with_watchdog(false);
     Pipeline pipeline(pipeline_config);
 
-    EventAggregatorUtility merger;
+    EventAggregator merger;
     std::atomic<int> global_chunk_idx{0};
 
     auto streaming_task = make_task(
@@ -274,7 +287,7 @@ CoroTask<EventAggregatorUtilityOutput> run_aggregation(
         },
         "StreamingAggregate");
 
-    EventAggregatorUtilityOutput result;
+    EventAggregatorOutput result;
     auto post_task = make_task(
         [&](CoroScope & /*ctx*/) -> CoroTask<bool> {
             result = merger.finalize();
@@ -320,8 +333,10 @@ static int run_comparison_pipeline(ComparatorObject *self,
         config.no_color = true;
         if (args_copy.executor_threads > 0)
             config.executor_threads = args_copy.executor_threads;
-        if (!args_copy.index_dir.empty())
-            config.index_dir = args_copy.index_dir;
+        if (!args_copy.baseline_index_dir.empty())
+            config.baseline_index_dir = args_copy.baseline_index_dir;
+        if (!args_copy.variant_index_dir.empty())
+            config.variant_index_dir = args_copy.variant_index_dir;
         if (args_copy.force_rebuild)
             config.force_rebuild = args_copy.force_rebuild;
         if (args_copy.threshold > 0.0)
@@ -339,45 +354,87 @@ static int run_comparison_pipeline(ComparatorObject *self,
                 indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE;
         }
 
-        // Create temp index dir if needed
-        std::string temp_index_dir;
-        if (config.index_dir.empty()) {
-            auto temp_path = fs::temp_directory_path();
-            temp_path /=
-                "dftracer_cmp_py_" + std::to_string(std::time(nullptr)) + "_" +
-                std::to_string(static_cast<int>(
-                    std::hash<std::thread::id>{}(std::this_thread::get_id())));
-            temp_index_dir = temp_path.string();
-            fs::create_directories(temp_index_dir);
-            config.index_dir = temp_index_dir;
-        }
-
-        // Enumerate files
-        auto enumerate_files =
-            [](std::string path) -> CoroTask<std::vector<std::string>> {
-            std::vector<std::string> files;
-            if (fs::is_regular_file(path)) {
-                files.push_back(path);
-                co_return files;
-            }
-            filesystem::PatternDirectoryScannerUtility scanner;
-            filesystem::PatternDirectoryScannerUtilityInput scan_input{
-                path, {".pfw", ".pfw.gz"}, false};
-            auto entries = co_await scanner.process(scan_input);
-            files.reserve(entries.size());
-            for (const auto &e : entries) {
-                files.push_back(e.path.string());
-            }
-            co_return files;
-        };
+        using composites::dft::indexing::IndexResolverUtility;
+        using composites::dft::indexing::ResolverInput;
+        using indexer::IndexBatchBuilderUtility;
+        using indexer::IndexBuildBatchConfig;
 
         Runtime *rt = get_runtime(self);
 
         auto *error_msg_ptr = &error_msg;
-        auto task = [config, output_ptr, enumerate_files,
-                     error_msg_ptr]() -> CoroTask<void> {
-            auto baseline_files = co_await enumerate_files(config.baseline);
-            auto variant_files = co_await enumerate_files(config.variant);
+        auto task = [config, output_ptr, error_msg_ptr,
+                     rt]() -> CoroTask<void> {
+            auto resolve_and_build =
+                [&config](
+                    CoroScope &scope, const std::string &path,
+                    const std::string &index_dir,
+                    std::vector<std::string> &out_files) -> CoroTask<void> {
+                IndexResolverUtility resolver;
+                ResolverInput resolve_input;
+                resolve_input.index_dir = index_dir;
+                resolve_input.require_checkpoints = !config.force_rebuild;
+                if (fs::is_regular_file(path)) {
+                    resolve_input.files = {path};
+                } else {
+                    resolve_input.directory = path;
+                }
+
+                auto result = co_await resolver.process(resolve_input);
+                out_files = std::move(result.all_files);
+
+                if (out_files.empty() || result.needs_checkpoint.empty()) {
+                    co_return;
+                }
+
+                auto batch_cfg = std::make_shared<IndexBuildBatchConfig>();
+                batch_cfg->file_paths.reserve(result.needs_checkpoint.size());
+                for (const auto &item : result.needs_checkpoint) {
+                    batch_cfg->file_paths.push_back(item.file_path);
+                }
+                batch_cfg->index_dir = index_dir;
+                batch_cfg->checkpoint_size = config.checkpoint_size;
+                batch_cfg->parallelism = config.executor_threads;
+                batch_cfg->force_rebuild = config.force_rebuild;
+                batch_cfg->use_batch_write = true;
+                batch_cfg->rebuild_root_summaries = true;
+
+                co_await IndexBatchBuilderUtility::process(
+                    &scope, std::move(batch_cfg));
+            };
+
+            std::vector<std::string> baseline_files;
+            std::vector<std::string> variant_files;
+
+            bool shared_index =
+                composites::dft::internal::determine_index_path(
+                    config.baseline, config.baseline_index_dir) ==
+                composites::dft::internal::determine_index_path(
+                    config.variant, config.variant_index_dir);
+
+            co_await run_coro_scope(
+                rt->executor(), [&](CoroScope &scope) -> CoroTask<void> {
+                    if (shared_index) {
+                        co_await resolve_and_build(scope, config.baseline,
+                                                   config.baseline_index_dir,
+                                                   baseline_files);
+                        if (config.baseline == config.variant) {
+                            variant_files = baseline_files;
+                        } else {
+                            co_await resolve_and_build(scope, config.variant,
+                                                       config.variant_index_dir,
+                                                       variant_files);
+                        }
+                    } else {
+                        scope.spawn([&](CoroScope &s) -> CoroTask<void> {
+                            co_await resolve_and_build(
+                                s, config.baseline, config.baseline_index_dir,
+                                baseline_files);
+                        });
+                        co_await resolve_and_build(scope, config.variant,
+                                                   config.variant_index_dir,
+                                                   variant_files);
+                    }
+                });
 
             if (baseline_files.empty()) {
                 *error_msg_ptr =
@@ -390,42 +447,6 @@ static int run_comparison_pipeline(ComparatorObject *self,
                 co_return;
             }
 
-            // Build indexes upfront
-            {
-                if (config.force_rebuild && !baseline_files.empty()) {
-                    const std::string shared_index_path =
-                        composites::dft::internal::determine_index_path(
-                            baseline_files.front(), config.index_dir);
-                    if (fs::exists(shared_index_path)) {
-                        fs::remove_all(shared_index_path);
-                    }
-                }
-                std::unordered_set<std::string> seen;
-                std::vector<std::string> all_files;
-                for (const auto &f : baseline_files) {
-                    if (seen.insert(f).second) all_files.push_back(f);
-                }
-                for (const auto &f : variant_files) {
-                    if (seen.insert(f).second) all_files.push_back(f);
-                }
-                std::vector<indexer::IndexBuildConfig> idx_configs;
-                idx_configs.reserve(all_files.size());
-                for (const auto &file_path : all_files) {
-                    idx_configs.push_back(
-                        indexer::IndexBuildConfig::for_file(file_path)
-                            .with_checkpoint_size(config.checkpoint_size)
-                            .with_force_rebuild(false)
-                            .with_index_dir(config.index_dir));
-                }
-                std::vector<CoroTask<indexer::IndexBuildResult>> idx_tasks;
-                idx_tasks.reserve(idx_configs.size());
-                for (const auto &cfg : idx_configs) {
-                    idx_tasks.push_back(
-                        indexer::IndexBuilderUtility{}.process(cfg));
-                }
-                co_await coro::when_all(std::move(idx_tasks));
-            }
-
             output_ptr->baseline_path = config.baseline;
             output_ptr->variant_path = config.variant;
             output_ptr->baseline_file_count = baseline_files.size();
@@ -466,13 +487,13 @@ static int run_comparison_pipeline(ComparatorObject *self,
 
                     auto [base_result, var_result] = co_await coro::when_all(
                         run_aggregation(
-                            baseline_files, agg_cfg, query, config.index_dir,
-                            config.checkpoint_size, config.force_rebuild,
-                            config.executor_threads),
+                            baseline_files, agg_cfg, query,
+                            config.baseline_index_dir, config.checkpoint_size,
+                            config.force_rebuild, config.executor_threads),
                         run_aggregation(
-                            variant_files, agg_cfg, query, config.index_dir,
-                            config.checkpoint_size, config.force_rebuild,
-                            config.executor_threads));
+                            variant_files, agg_cfg, query,
+                            config.variant_index_dir, config.checkpoint_size,
+                            config.force_rebuild, config.executor_threads));
 
                     if (pairs.empty()) {
                         output_ptr->baseline_meta = extract_metadata(
@@ -514,11 +535,6 @@ static int run_comparison_pipeline(ComparatorObject *self,
         };
 
         rt->submit(task(), "comparator").get();
-
-        // Clean up temp index dir
-        if (!temp_index_dir.empty() && fs::exists(temp_index_dir)) {
-            fs::remove_all(temp_index_dir);
-        }
     } catch (const std::exception &e) {
         error_msg = e.what();
     }
diff --git a/src/dftracer/utils/python/utilities/reorganization_planner.cpp b/src/dftracer/utils/python/utilities/reorganization_planner.cpp
index 929bfe79..5178087d 100644
--- a/src/dftracer/utils/python/utilities/reorganization_planner.cpp
+++ b/src/dftracer/utils/python/utilities/reorganization_planner.cpp
@@ -1,4 +1,7 @@
 #include <dftracer/utils/core/runtime.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/utilities/behaviors/behavior_chain.h>
+#include <dftracer/utils/core/utilities/utility_executor.h>
 #include <dftracer/utils/python/runtime.h>
 #include <dftracer/utils/python/utilities/reorganization_planner.h>
 #include <dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h>
@@ -6,7 +9,11 @@
 #include <string>
 #include <vector>
 
+using dftracer::utils::CoroScope;
 using dftracer::utils::Runtime;
+using dftracer::utils::utilities::behaviors::BehaviorChain;
+using dftracer::utils::utilities::behaviors::UtilityExecutor;
+namespace tags = dftracer::utils::utilities::tags;
 using namespace dftracer::utils::utilities::composites::dft::reorganize;
 
 static Runtime *get_runtime(ReorganizationPlannerObject *self) {
@@ -129,11 +136,19 @@ static PyObject *ReorganizationPlanner_plan(ReorganizationPlannerObject *self,
 
     Py_BEGIN_ALLOW_THREADS try {
         Runtime *rt = get_runtime(self);
-        auto task = [plan_p, input_copy]() -> CoroTask<void> {
-            ReorganizationPlannerUtility util;
-            *plan_p = co_await util.process(input_copy);
-        };
-        rt->submit(task(), "reorganization-planner").get();
+        auto task = run_coro_scope(
+            rt->executor(),
+            [plan_p, input_copy](CoroScope &scope) -> CoroTask<void> {
+                auto planner = std::make_shared<ReorganizationPlannerUtility>();
+                UtilityExecutor<ReorganizationPlannerInput, ExtractionPlan,
+                                tags::NeedsContext>
+                    exec(planner, BehaviorChain<ReorganizationPlannerInput,
+                                                ExtractionPlan>{});
+                *plan_p = co_await exec.execute_with_context(scope, input_copy);
+            });
+        // get() as before this change, so a task failure is rethrown here and
+        // reaches the catch below (assumes std::future-like submit() handle).
+        rt->submit(std::move(task), "reorganization-planner").get();
     } catch (const std::exception &e) {
         error_msg = e.what();
     }
diff --git a/src/dftracer/utils/server/cursor.cpp b/src/dftracer/utils/server/cursor.cpp
index cfcefd42..990a9aa5 100644
--- a/src/dftracer/utils/server/cursor.cpp
+++ b/src/dftracer/utils/server/cursor.cpp
@@ -8,7 +8,7 @@ namespace dftracer::utils::server {
 namespace {
 
 // Minimal base64 encode/decode for cursor serialization.
-static constexpr char kBase64Chars[] =
+static constexpr char BASE64_CHARS[] =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
 std::string base64_encode(const void* data, std::size_t len) {
@@ -19,10 +19,10 @@ std::string base64_encode(const void* data, std::size_t len) {
         unsigned val = static_cast<unsigned>(bytes[i]) << 16;
         if (i + 1 < len) val |= static_cast<unsigned>(bytes[i + 1]) << 8;
         if (i + 2 < len) val |= static_cast<unsigned>(bytes[i + 2]);
-        out.push_back(kBase64Chars[(val >> 18) & 0x3F]);
-        out.push_back(kBase64Chars[(val >> 12) & 0x3F]);
-        out.push_back((i + 1 < len) ? kBase64Chars[(val >> 6) & 0x3F] : '=');
-        out.push_back((i + 2 < len) ? kBase64Chars[val & 0x3F] : '=');
+        out.push_back(BASE64_CHARS[(val >> 18) & 0x3F]);
+        out.push_back(BASE64_CHARS[(val >> 12) & 0x3F]);
+        out.push_back((i + 1 < len) ? BASE64_CHARS[(val >> 6) & 0x3F] : '=');
+        out.push_back((i + 2 < len) ? BASE64_CHARS[val & 0x3F] : '=');
     }
     return out;
 }
diff --git a/src/dftracer/utils/server/trace_api.cpp b/src/dftracer/utils/server/trace_api.cpp
index 1b8f3ca0..668d83d5 100644
--- a/src/dftracer/utils/server/trace_api.cpp
+++ b/src/dftracer/utils/server/trace_api.cpp
@@ -13,14 +13,15 @@
 #include <dftracer/utils/utilities/common/json/json_doc_guard.h>
 #include <dftracer/utils/utilities/common/json/json_value.h>
 #include <dftracer/utils/utilities/common/query/query.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.h>
 #include <dftracer/utils/utilities/composites/dft/views/view_builder_utility.h>
 #include <dftracer/utils/utilities/composites/dft/views/view_definition.h>
 #include <dftracer/utils/utilities/composites/dft/views/view_reader_utility.h>
 #include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
-#include <yyjson.h>
 
 #include <atomic>
 #include <cstddef>
@@ -76,105 +77,6 @@ static const std::unordered_set<std::string> HASH_METADATA_NAMES = {"FH", "HH",
 using dftracer::utils::utilities::common::json::JsonDocGuard;
 using dftracer::utils::utilities::common::query::Query;
 
-/// Direct-scan a small file without any `.dftindex` store.
-/// Streams via async_streaming_gz_lines(), parses JSON, applies
-/// predicate filters, collects matching events as raw JSON strings.
-static coro::CoroTask<void> direct_scan_events(
-    const TraceIndex::FileInfo* file_info, const Query* query,
-    bool include_metadata, std::vector<std::string>* collected_events,
-    std::uint64_t* total_scanned, std::uint64_t* total_matched, int limit) {
-    using dftracer::utils::utilities::fileio::lines::sources::
-        async_streaming_gz_lines;
-
-    try {
-        auto gen = async_streaming_gz_lines(file_info->path);
-
-        std::unordered_map<std::string, std::string> pending_metadata;
-        std::unordered_set<std::string> emitted_hashes;
-
-        while (auto line = co_await gen.next()) {
-            if (limit > 0 &&
-                collected_events->size() >= static_cast<std::size_t>(limit)) {
-                co_return;
-            }
-            if (line->content.empty()) continue;
-
-            JsonDocGuard guard{yyjson_read_opts(
-                const_cast<char*>(line->content.data()), line->content.size(),
-                YYJSON_READ_NOFLAG, nullptr, nullptr)};
-            if (!guard.doc) continue;
-
-            yyjson_val* root = yyjson_doc_get_root(guard.doc);
-            if (root && yyjson_is_obj(root)) {
-                JsonValue json(root);
-                // line->content is a string_view valid only for this
-                // iteration.  All storage into collected_events and
-                // pending_metadata must copy to owning std::string.
-                std::string_view ph = json["ph"].get<std::string_view>();
-
-                if (ph == "M" && include_metadata) {
-                    std::string name_str = json["name"].get<std::string>();
-
-                    if (HASH_METADATA_NAMES.count(name_str)) {
-                        auto args = json["args"];
-                        if (args.exists()) {
-                            auto val = args["value"];
-                            if (val.exists()) {
-                                std::string hash_val = val.get<std::string>();
-                                if (!emitted_hashes.count(hash_val)) {
-                                    pending_metadata[hash_val] =
-                                        std::string(line->content.data(),
-                                                    line->content.size());
-                                }
-                            }
-                        }
-                    } else {
-                        collected_events->emplace_back(line->content.data(),
-                                                       line->content.size());
-                        (*total_matched)++;
-                    }
-                } else if (ph != "M") {
-                    (*total_scanned)++;
-                    if (!query || query->evaluate(json)) {
-                        // Flush referenced hash metadata first
-                        if (include_metadata) {
-                            auto args = json["args"];
-                            if (args.exists()) {
-                                static const char* hash_fields[] = {
-                                    "hhash", "fhash", "shash"};
-                                for (const char* field : hash_fields) {
-                                    auto val = args[field];
-                                    if (!val.exists()) continue;
-                                    std::string hash_val =
-                                        val.get<std::string>();
-                                    if (emitted_hashes.count(hash_val))
-                                        continue;
-                                    auto it = pending_metadata.find(hash_val);
-                                    if (it != pending_metadata.end()) {
-                                        collected_events->push_back(
-                                            std::move(it->second));
-                                        (*total_matched)++;
-                                        emitted_hashes.insert(hash_val);
-                                        pending_metadata.erase(it);
-                                    }
-                                }
-                            }
-                        }
-                        collected_events->emplace_back(line->content.data(),
-                                                       line->content.size());
-                        (*total_matched)++;
-                    }
-                }
-            }
-        }
-    } catch (const std::exception& e) {
-        DFTRACER_UTILS_LOG_WARN("Direct scan failed for %s: %s",
-                                file_info->path.c_str(), e.what());
-    }
-
-    co_return;
-}
-
 // --- GET /api/v1/files ---
 static coro::CoroTask<HttpResponse> handle_files(const HttpRequest& /*req*/,
                                                  const QueryParams& /*params*/,
@@ -192,8 +94,6 @@ static coro::CoroTask<HttpResponse> handle_files(const HttpRequest& /*req*/,
         body += f.has_bloom_data ? "true" : "false";
         body += ",\"has_checkpoint_index\":";
         body += f.has_checkpoint_index ? "true" : "false";
-        body += ",\"is_small\":";
-        body += f.is_small ? "true" : "false";
         body += '}';
     }
     body += "],\"count\":";
@@ -225,21 +125,16 @@ static coro::CoroTask<HttpResponse> handle_file_info(const HttpRequest& /*req*/,
     body += info->has_bloom_data ? "true" : "false";
     body += ",\"has_checkpoint_index\":";
     body += info->has_checkpoint_index ? "true" : "false";
-    body += ",\"is_small\":";
-    body += info->is_small ? "true" : "false";
-
     body += ",\"size_mb\":";
     body += std::to_string(info->size_mb);
     body += ",\"compressed_size\":";
     body += std::to_string(info->compressed_size);
-    if (!info->is_small) {
-        body += ",\"num_lines\":";
-        body += std::to_string(info->num_lines);
-        body += ",\"num_checkpoints\":";
-        body += std::to_string(info->num_checkpoints);
-        body += ",\"uncompressed_size\":";
-        body += std::to_string(info->uncompressed_size);
-    }
+    body += ",\"num_lines\":";
+    body += std::to_string(info->num_lines);
+    body += ",\"num_checkpoints\":";
+    body += std::to_string(info->num_checkpoints);
+    body += ",\"uncompressed_size\":";
+    body += std::to_string(info->uncompressed_size);
 
     body += '}';
     co_return HttpResponse::ok(body);
@@ -356,10 +251,6 @@ static std::vector<const TraceIndex::FileInfo*> resolve_target_files(
         std::vector<const TraceIndex::FileInfo*> filtered;
         filtered.reserve(files.size());
         for (auto* fi : files) {
-            if (fi->is_small) {
-                filtered.push_back(fi);
-                continue;
-            }
             if (fi->min_timestamp_us == 0 && fi->max_timestamp_us == 0) {
                 filtered.push_back(fi);
                 continue;
@@ -379,33 +270,13 @@ using StreamChunk = HttpResponse::StreamChunk;
 
 static coro::AsyncGenerator<StreamChunk> stream_events(
     std::vector<const TraceIndex::FileInfo*> files, ViewDefinition ev_view,
-    std::optional<Query> query_opt, double ts_min, double ts_max,
+    std::optional<Query> /*query_opt*/, double ts_min, double ts_max,
     BloomFilterCache* bloom_cache, int limit) {
     int emitted = 0;
-    const Query* query_ptr = query_opt ? &*query_opt : nullptr;
 
     for (auto* file_info : files) {
         if (limit > 0 && emitted >= limit) break;
 
-        if (file_info->is_small) {
-            std::vector<std::string> events;
-            std::uint64_t scanned = 0;
-            std::uint64_t matched = 0;
-            co_await direct_scan_events(
-                file_info, query_ptr, ev_view.include_metadata, &events,
-                &scanned, &matched, limit > 0 ? limit - emitted : 0);
-            std::vector<std::string_view> views;
-            for (const auto& event : events) {
-                if (limit > 0 && emitted >= limit) break;
-                views.push_back(event);
-                emitted++;
-            }
-            if (!views.empty()) {
-                co_yield StreamChunk{views};
-            }
-            continue;
-        }
-
         if (file_info->uncompressed_size == 0 &&
             file_info->num_checkpoints == 0)
             continue;
@@ -514,78 +385,54 @@ static coro::CoroTask<HttpResponse> handle_stats(const HttpRequest& req,
     }
 
     std::vector<TraceStatistics> all_stats;
-    std::size_t skipped_small = 0;
 
-    std::vector<const TraceIndex::FileInfo*> stat_files;
+    // Group files by index_path
+    std::unordered_map<std::string,
+                       std::vector<std::pair<std::size_t, std::string>>>
+        files_by_index;
+    std::size_t file_idx = 0;
     for (const auto& file_info : index.files()) {
-        if (file_info.is_small) {
-            skipped_small++;
-            continue;
-        }
         if (!file_info.has_bloom_data) continue;
-        stat_files.push_back(&file_info);
+        files_by_index[file_info.index_path].emplace_back(file_idx++,
+                                                          file_info.path);
     }
 
-    if (stat_files.size() <= 1) {
-        for (auto* file_info : stat_files) {
-            StatisticsAggregatorInput agg_input;
-            agg_input.file_path = file_info->path;
-            agg_input.index_path = file_info->index_path;
-            agg_input.index_dir = index.index_dir();
-
-            StatisticsAggregatorUtility aggregator;
-            auto stats = co_await aggregator.process(agg_input);
-            if (stats.success) {
-                all_stats.push_back(std::move(stats));
-            }
+    // Resolve each group and read statistics
+    for (auto& [idx_path, files] : files_by_index) {
+        std::vector<std::string> file_paths;
+        file_paths.reserve(files.size());
+        for (const auto& [_, path] : files) {
+            file_paths.push_back(path);
         }
-    } else {
-        std::size_t num_workers =
-            std::min(index.max_concurrent(), stat_files.size());
-        auto* executor = Executor::current();
-
-        auto file_chan = coro::make_channel<std::size_t>(num_workers * 2);
-        auto stats_mutex = std::make_shared<std::mutex>();
-        auto* all_stats_ptr = &all_stats;
-        auto* stat_files_ptr = &stat_files;
-        std::string index_dir = index.index_dir();
-        const auto* index_dir_ptr = &index_dir;
-
-        CoroScope scope(executor);
-
-        scope.spawn([ch = file_chan->producer(), stat_files_ptr](
-                        CoroScope&) mutable -> coro::CoroTask<void> {
-            auto guard = ch.guard();
-            for (std::size_t i = 0; i < stat_files_ptr->size(); ++i) {
-                if (!co_await ch.send(i)) co_return;
-            }
-            co_return;
-        });
 
-        for (std::size_t w = 0; w < num_workers; ++w) {
-            scope.spawn([file_chan, stat_files_ptr, stats_mutex, all_stats_ptr,
-                         index_dir_ptr](CoroScope&) -> coro::CoroTask<void> {
-                while (auto fi_opt = co_await file_chan->receive()) {
-                    auto* file_info = (*stat_files_ptr)[*fi_opt];
+        IndexResolverUtility resolver;
+        ResolverInput input;
+        input.files = std::move(file_paths);
+        // Honor the server's index dir, as the old per-file path did.
+        input.index_dir = index.index_dir();
+        input.require_checkpoints = false;
 
-                    StatisticsAggregatorInput agg_input;
-                    agg_input.file_path = file_info->path;
-                    agg_input.index_path = file_info->index_path;
-                    agg_input.index_dir = *index_dir_ptr;
+        auto result = co_await resolver.process(input);
 
-                    StatisticsAggregatorUtility aggregator;
-                    auto stats = co_await aggregator.process(agg_input);
+        if (result.cached.empty()) continue;
 
-                    if (stats.success) {
-                        std::lock_guard<std::mutex> lock(*stats_mutex);
-                        all_stats_ptr->push_back(std::move(stats));
-                    }
+        try {
+            SharedIndexStatisticsReader reader;
+            auto batch_rows = co_await reader.query(
+                result.index_path, std::move(result.cached),
+                StatisticsQueryType::SUMMARY);
+            auto callback = [&all_stats](std::size_t /*file_index*/,
+                                         TraceStatistics&& stats) {
+                if (stats.success) {
+                    all_stats.push_back(std::move(stats));
                 }
-                co_return;
-            });
+            };
+            SharedIndexStatisticsReader::process_batch_results(batch_rows,
+                                                               callback);
+        } catch (const std::exception& e) {
+            DFTRACER_UTILS_LOG_WARN("Server stats batch read failed for %s: %s",
+                                    idx_path.c_str(), e.what());
         }
-
-        co_await scope.join();
     }
 
     std::uint64_t total_events = 0;
@@ -600,8 +447,6 @@ static coro::CoroTask<HttpResponse> handle_stats(const HttpRequest& req,
     body += std::to_string(file_count);
     body += ",\"total_events\":";
     body += std::to_string(total_events);
-    body += ",\"skipped_small_files\":";
-    body += std::to_string(skipped_small);
     body += ",\"files\":[";
     for (std::size_t i = 0; i < all_stats.size(); ++i) {
         if (i > 0) body += ',';
@@ -650,8 +495,6 @@ static coro::CoroTask<HttpResponse> handle_info(const HttpRequest& /*req*/,
         body += f.has_bloom_data ? "true" : "false";
         body += ",\"has_checkpoint_index\":";
         body += f.has_checkpoint_index ? "true" : "false";
-        body += ",\"is_small\":";
-        body += f.is_small ? "true" : "false";
         if (f.min_timestamp_us > 0 || f.max_timestamp_us > 0) {
             body += ",\"min_timestamp_us\":";
             body += std::to_string(f.min_timestamp_us);
diff --git a/src/dftracer/utils/server/trace_index.cpp b/src/dftracer/utils/server/trace_index.cpp
index 9ccde5d3..6744995e 100644
--- a/src/dftracer/utils/server/trace_index.cpp
+++ b/src/dftracer/utils/server/trace_index.cpp
@@ -4,7 +4,6 @@
 #include <dftracer/utils/core/io/io_backend.h>
 #include <dftracer/utils/core/pipeline/pipeline.h>
 #include <dftracer/utils/core/pipeline/pipeline_config.h>
-#include <dftracer/utils/core/rocksdb/async.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/server/trace_index.h>
@@ -46,7 +45,6 @@ coro::CoroTask<void> TraceIndex::initialize() {
 
     std::vector<std::size_t> needs_build;
     std::vector<std::size_t> large_files;
-    std::size_t small_count = 0;
 
     for (const auto& entry : entries) {
         FileInfo info;
@@ -56,38 +54,21 @@ coro::CoroTask<void> TraceIndex::initialize() {
         std::error_code ec;
         auto fsize = fs::file_size(info.path, ec);
         info.compressed_size = (!ec && fsize > 0) ? fsize : 0;
-        info.is_small = info.compressed_size > 0 &&
-                        info.compressed_size < INDEX_SIZE_THRESHOLD;
 
         std::size_t idx = files_.size();
         path_to_index_[info.path] = idx;
 
-        if (info.is_small) {
-            info.has_bloom_data = false;
-            info.has_checkpoint_index = false;
-            info.size_mb =
-                static_cast<double>(info.compressed_size) / (1024.0 * 1024.0);
-            small_count++;
+        info.has_bloom_data = fs::exists(info.index_path);
+        info.has_checkpoint_index = info.has_bloom_data;  // same index_path
+        if (!info.has_bloom_data) {
+            needs_build.push_back(idx);
         } else {
-            info.has_bloom_data = fs::exists(info.index_path);
-            info.has_checkpoint_index = fs::exists(info.index_path);
-            if (!info.has_bloom_data) {
-                needs_build.push_back(idx);
-            } else {
-                large_files.push_back(idx);
-            }
+            large_files.push_back(idx);
         }
 
         files_.push_back(std::move(info));
     }
 
-    if (small_count > 0) {
-        DFTRACER_UTILS_LOG_INFO(
-            "TraceIndex: %zu small file(s) (< %zu bytes) will be "
-            "streamed directly (no .dftindex database)",
-            small_count, INDEX_SIZE_THRESHOLD);
-    }
-
     if (!needs_build.empty() || !large_files.empty()) {
         auto pipeline_config =
             PipelineConfig()
@@ -122,7 +103,7 @@ coro::CoroTask<void> TraceIndex::initialize() {
                         coro::make_channel<std::size_t>(max_concurrent * 2);
 
                     const auto* index_dir_ptr = &index_dir;
-                    co_await ctx.scope([file_chan, files_ptr, needs_build_ptr,
+                    co_await ctx.scope([&file_chan, files_ptr, needs_build_ptr,
                                         index_dir_ptr,
                                         max_concurrent](CoroScope& scope)
                                            -> coro::CoroTask<void> {
@@ -137,41 +118,38 @@ coro::CoroTask<void> TraceIndex::initialize() {
                             });
 
                         for (std::size_t w = 0; w < max_concurrent; ++w) {
-                            scope.spawn(
-                                [file_chan, files_ptr, index_dir_ptr](
-                                    CoroScope&) -> coro::CoroTask<void> {
-                                    while (auto fi_opt =
-                                               co_await file_chan->receive()) {
-                                        std::size_t fi = *fi_opt;
-                                        auto* info = &(*files_ptr)[fi];
-
-                                        indexer::IndexBuilderUtility builder;
-                                        auto config =
-                                            indexer::IndexBuildConfig::for_file(
-                                                info->path)
-                                                .with_index_dir(*index_dir_ptr)
-                                                .with_bloom(true)
-                                                .with_index_threshold(0);
-                                        auto result =
-                                            co_await builder.process(config);
-
-                                        if (result.success) {
-                                            info->index_path =
-                                                internal::determine_index_path(
-                                                    info->path, *index_dir_ptr);
-                                            info->has_bloom_data = true;
-                                            info->has_checkpoint_index =
-                                                fs::exists(info->index_path);
-                                        } else {
-                                            DFTRACER_UTILS_LOG_WARN(
-                                                "TraceIndex: failed to "
-                                                "index %s: %s",
-                                                info->path.c_str(),
-                                                result.error_message.c_str());
-                                        }
+                            scope.spawn([ch = file_chan->consumer(), files_ptr,
+                                         index_dir_ptr](CoroScope&)
+                                            -> coro::CoroTask<void> {
+                                while (auto fi_opt = co_await ch.receive()) {
+                                    std::size_t fi = *fi_opt;
+                                    auto* info = &(*files_ptr)[fi];
+
+                                    indexer::IndexBuilderUtility builder;
+                                    auto config =
+                                        indexer::IndexBuildConfig::for_file(
+                                            info->path)
+                                            .with_index_dir(*index_dir_ptr);
+                                    auto result =
+                                        co_await builder.process(config);
+
+                                    if (result.success) {
+                                        info->index_path =
+                                            internal::determine_index_path(
+                                                info->path, *index_dir_ptr);
+                                        info->has_bloom_data = true;
+                                        info->has_checkpoint_index =
+                                            fs::exists(info->index_path);
+                                    } else {
+                                        DFTRACER_UTILS_LOG_WARN(
+                                            "TraceIndex: failed to "
+                                            "index %s: %s",
+                                            info->path.c_str(),
+                                            result.error_message.c_str());
                                     }
-                                    co_return;
-                                });
+                                }
+                                co_return;
+                            });
                         }
                         co_return;
                     });
@@ -187,7 +165,7 @@ coro::CoroTask<void> TraceIndex::initialize() {
                     auto meta_chan =
                         coro::make_channel<std::size_t>(max_concurrent * 2);
 
-                    co_await ctx.scope([meta_chan, files_ptr, large_files_ptr,
+                    co_await ctx.scope([&meta_chan, files_ptr, large_files_ptr,
                                         max_concurrent](CoroScope& scope)
                                            -> coro::CoroTask<void> {
                         scope.spawn(
@@ -201,45 +179,31 @@ coro::CoroTask<void> TraceIndex::initialize() {
                             });
 
                         for (std::size_t w = 0; w < max_concurrent; ++w) {
-                            scope.spawn([meta_chan, files_ptr](CoroScope&)
+                            scope.spawn([ch = meta_chan->consumer(),
+                                         files_ptr](CoroScope&)
                                             -> coro::CoroTask<void> {
-                                while (auto fi_opt =
-                                           co_await meta_chan->receive()) {
+                                while (auto fi_opt = co_await ch.receive()) {
                                     std::size_t fi = *fi_opt;
                                     auto* info = &(*files_ptr)[fi];
 
                                     if (info->has_bloom_data) {
                                         try {
-                                            const std::string path = info->path;
-                                            const std::string index_path =
-                                                info->index_path;
-                                            const auto* path_ptr = &path;
-                                            const auto* index_path_ptr =
-                                                &index_path;
-                                            auto bounds = co_await rocksdb::run(
-                                                [path_ptr, index_path_ptr] {
-                                                    indexer::IndexDatabase
-                                                        idx_db(*index_path_ptr);
-                                                    auto logical =
-                                                        indexer::internal::
-                                                            get_logical_path(
-                                                                *path_ptr);
-                                                    int fid =
-                                                        idx_db.get_file_info_id(
-                                                            logical);
-                                                    if (fid < 0) {
-                                                        return indexer::
-                                                            IndexDatabase::
-                                                                TimeBounds{};
-                                                    }
-                                                    return idx_db
-                                                        .query_time_bounds(fid);
-                                                });
-                                            if (bounds.valid) {
-                                                info->min_timestamp_us =
-                                                    bounds.min_timestamp_us;
-                                                info->max_timestamp_us =
-                                                    bounds.max_timestamp_us;
+                                            indexer::IndexDatabase idx_db(
+                                                info->index_path);
+                                            auto logical = indexer::internal::
+                                                get_logical_path(info->path);
+                                            int fid = idx_db.get_file_info_id(
+                                                logical);
+                                            if (fid >= 0) {
+                                                auto bounds =
+                                                    idx_db.query_time_bounds(
+                                                        fid);
+                                                if (bounds.valid) {
+                                                    info->min_timestamp_us =
+                                                        bounds.min_timestamp_us;
+                                                    info->max_timestamp_us =
+                                                        bounds.max_timestamp_us;
+                                                }
                                             }
                                         } catch (const std::exception& e) {
                                             DFTRACER_UTILS_LOG_WARN(
diff --git a/src/dftracer/utils/server/viz_api.cpp b/src/dftracer/utils/server/viz_api.cpp
index 9917c765..99bf5329 100644
--- a/src/dftracer/utils/server/viz_api.cpp
+++ b/src/dftracer/utils/server/viz_api.cpp
@@ -14,7 +14,7 @@
 #include <dftracer/utils/utilities/composites/dft/views/view_definition.h>
 #include <dftracer/utils/utilities/composites/dft/views/view_reader_utility.h>
 #include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <atomic>
 #include <cstddef>
@@ -44,38 +44,87 @@ static const std::unordered_set<std::string> HASH_METADATA_NAMES = {"FH", "HH",
 /// the original string on parse failure.
 static std::string normalize_event_ts(const std::string& event_json,
                                       std::uint64_t offset) {
-    auto* doc = yyjson_read(event_json.c_str(), event_json.size(), 0);
-    if (!doc) return event_json;
-
-    auto* mdoc = yyjson_doc_mut_copy(doc, nullptr);
-    yyjson_doc_free(doc);
-    if (!mdoc) return event_json;
-
-    auto* root = yyjson_mut_doc_get_root(mdoc);
-    if (root) {
-        auto* ts_val = yyjson_mut_obj_get(root, "ts");
-        if (ts_val && yyjson_mut_is_uint(ts_val)) {
-            std::uint64_t old_ts = yyjson_mut_get_uint(ts_val);
-            std::uint64_t new_ts = old_ts >= offset ? old_ts - offset : 0;
-            yyjson_mut_set_uint(ts_val, new_ts);
-        } else if (ts_val && yyjson_mut_is_int(ts_val)) {
-            auto old_ts =
-                static_cast<std::uint64_t>(yyjson_mut_get_int(ts_val));
-            std::uint64_t new_ts = old_ts >= offset ? old_ts - offset : 0;
-            yyjson_mut_set_uint(ts_val, new_ts);
-        }
+    // Validate the document and read the root-level "ts" via simdjson.
+    thread_local simdjson::dom::parser tl_parser;
+    auto result = tl_parser.parse(event_json);
+    if (result.error()) return event_json;
+
+    auto root = result.value_unsafe();
+    if (!root.is_object()) return event_json;
+
+    auto ts_result = root["ts"];
+    if (ts_result.error()) return event_json;
+
+    std::uint64_t old_ts = 0;
+    if (ts_result.is_uint64()) {
+        old_ts = ts_result.get_uint64().value_unsafe();
+    } else if (ts_result.is_int64()) {
+        // Negative timestamps are clamped to zero before rebasing.
+        auto val = ts_result.get_int64().value_unsafe();
+        old_ts = val >= 0 ? static_cast<std::uint64_t>(val) : 0;
+    } else {
+        return event_json;
     }
 
-    std::size_t len = 0;
-    char* json_str = yyjson_mut_write(mdoc, YYJSON_WRITE_NOFLAG, &len);
-    yyjson_mut_doc_free(mdoc);
-
-    if (json_str) {
-        std::string result(json_str, len);
-        free(json_str);
-        return result;
-    }
-    return event_json;
+    const std::uint64_t new_ts = old_ts >= offset ? old_ts - offset : 0;
+
+    // simdjson's DOM is read-only, so the number is patched in the original
+    // text. The document was validated above, so a small scanner can locate
+    // the root-level "ts" key; a plain substring search could instead hit a
+    // nested "ts" key or an escaped quote inside a string value, and would
+    // miss input written as "ts" : 1.
+    const auto is_ws = [](char c) {
+        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
+    };
+    const std::size_t size = event_json.size();
+    std::size_t value_pos = std::string::npos;
+    int depth = 0;
+    for (std::size_t i = 0; i < size && value_pos == std::string::npos; ++i) {
+        const char c = event_json[i];
+        if (c == '{' || c == '[') {
+            ++depth;
+            continue;
+        }
+        if (c == '}' || c == ']') {
+            --depth;
+            continue;
+        }
+        if (c != '"') continue;
+
+        // Consume the string token; i ends on its closing quote.
+        const std::size_t begin = i + 1;
+        for (++i; i < size && event_json[i] != '"'; ++i) {
+            if (event_json[i] == '\\') ++i;  // skip the escaped character
+        }
+        if (depth != 1 || event_json.compare(begin, i - begin, "ts") != 0) {
+            continue;
+        }
+
+        // Only a key is followed by ':'; a string value "ts" is not.
+        std::size_t j = i + 1;
+        while (j < size && is_ws(event_json[j])) ++j;
+        if (j >= size || event_json[j] != ':') continue;
+        ++j;
+        while (j < size && is_ws(event_json[j])) ++j;
+        value_pos = j;
+    }
+    if (value_pos == std::string::npos) return event_json;
+
+    // The value is an integer literal: optional '-' followed by digits.
+    std::size_t value_end = value_pos;
+    if (value_end < size && event_json[value_end] == '-') ++value_end;
+    while (value_end < size && event_json[value_end] >= '0' &&
+           event_json[value_end] <= '9') {
+        ++value_end;
+    }
+    if (value_end == value_pos) return event_json;
+
+    std::string modified;
+    modified.reserve(size + 20);  // a uint64 has at most 20 decimal digits
+    modified.append(event_json, 0, value_pos);
+    modified += std::to_string(new_ts);
+    modified.append(event_json, value_end, std::string::npos);
+    return modified;
 }
 
 /// Compute the minimum event duration threshold for a given summary level.
@@ -88,10 +95,16 @@ static double duration_threshold(double begin, double end, unsigned level,
            (static_cast<double>(viewport_width) * static_cast<double>(level));
 }
 
-static std::string extract_json_value(yyjson_val* val) {
-    if (yyjson_is_str(val)) return yyjson_get_str(val);
-    if (yyjson_is_int(val)) return std::to_string(yyjson_get_int(val));
-    if (yyjson_is_uint(val)) return std::to_string(yyjson_get_uint(val));
+static std::string extract_json_value(simdjson::dom::element val) {
+    if (val.is_string()) {
+        return std::string(val.get_string().value_unsafe());
+    }
+    if (val.is_int64()) {
+        return std::to_string(val.get_int64().value_unsafe());
+    }
+    if (val.is_uint64()) {
+        return std::to_string(val.get_uint64().value_unsafe());
+    }
     return {};
 }
 
@@ -111,38 +124,41 @@ static void append_lane_clause(std::string& dsl, const char* field,
 static void apply_lanes(std::string& dsl, std::string_view lanes_str) {
     if (lanes_str.empty()) return;
 
-    std::string buf(lanes_str);
-    auto* doc = yyjson_read(buf.c_str(), buf.size(), 0);
-    if (!doc) return;
-    auto doc_guard = std::unique_ptr<yyjson_doc, decltype(&yyjson_doc_free)>(
-        doc, yyjson_doc_free);
-
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root) return;
-
-    if (yyjson_is_arr(root)) {
-        yyjson_val* item;
-        yyjson_arr_iter iter;
-        yyjson_arr_iter_init(root, &iter);
-        while ((item = yyjson_arr_iter_next(&iter)) != nullptr) {
-            if (!yyjson_is_obj(item)) continue;
-            auto* field_val = yyjson_obj_get(item, "field");
-            if (!field_val) field_val = yyjson_obj_get(item, "fields");
-            auto* value_val = yyjson_obj_get(item, "value");
-            if (!field_val || !value_val) continue;
-            const char* field = yyjson_get_str(field_val);
-            if (!field) continue;
-            auto val = extract_json_value(value_val);
+    thread_local simdjson::dom::parser tl_parser;
+    auto result = tl_parser.parse(lanes_str.data(), lanes_str.size());
+    if (result.error()) return;
+
+    auto root = result.value_unsafe();
+
+    if (root.is_array()) {
+        auto arr = root.get_array().value_unsafe();
+        for (auto item : arr) {
+            if (!item.is_object()) continue;
+            auto obj = item.get_object().value_unsafe();
+
+            auto field_result = obj["field"];
+            if (field_result.error()) field_result = obj["fields"];
+            auto value_result = obj["value"];
+            if (field_result.error() || value_result.error()) continue;
+
+            if (!field_result.value_unsafe().is_string()) continue;
+            const char* field =
+                field_result.value_unsafe().get_c_str().value_unsafe();
+            auto val = extract_json_value(value_result.value_unsafe());
             if (!val.empty()) append_lane_clause(dsl, field, val);
         }
-    } else if (yyjson_is_obj(root)) {
-        auto* field_val = yyjson_obj_get(root, "field");
-        if (!field_val) field_val = yyjson_obj_get(root, "fields");
-        auto* value_val = yyjson_obj_get(root, "value");
-        if (field_val && value_val) {
-            const char* field = yyjson_get_str(field_val);
-            if (field) {
-                auto val = extract_json_value(value_val);
+    } else if (root.is_object()) {
+        auto obj = root.get_object().value_unsafe();
+
+        auto field_result = obj["field"];
+        if (field_result.error()) field_result = obj["fields"];
+        auto value_result = obj["value"];
+
+        if (!field_result.error() && !value_result.error()) {
+            if (field_result.value_unsafe().is_string()) {
+                const char* field =
+                    field_result.value_unsafe().get_c_str().value_unsafe();
+                auto val = extract_json_value(value_result.value_unsafe());
                 if (!val.empty()) append_lane_clause(dsl, field, val);
             }
         }
@@ -152,31 +168,33 @@ static void apply_lanes(std::string& dsl, std::string_view lanes_str) {
 static void apply_filters(std::string& dsl, std::string_view filters_str) {
     if (filters_str.empty()) return;
 
-    std::string buf(filters_str);
-    auto* doc = yyjson_read(buf.c_str(), buf.size(), 0);
-    if (!doc) return;
-    auto doc_guard = std::unique_ptr<yyjson_doc, decltype(&yyjson_doc_free)>(
-        doc, yyjson_doc_free);
+    thread_local simdjson::dom::parser tl_parser;
+    auto result = tl_parser.parse(filters_str.data(), filters_str.size());
+    if (result.error()) return;
 
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root || !yyjson_is_arr(root)) return;
+    auto root = result.value_unsafe();
+    if (!root.is_array()) return;
 
-    yyjson_val* item;
-    yyjson_arr_iter iter;
-    yyjson_arr_iter_init(root, &iter);
-    while ((item = yyjson_arr_iter_next(&iter)) != nullptr) {
-        if (!yyjson_is_obj(item)) continue;
+    auto arr = root.get_array().value_unsafe();
+    for (auto item : arr) {
+        if (!item.is_object()) continue;
+        auto obj = item.get_object().value_unsafe();
 
-        auto* field_val = yyjson_obj_get(item, "field");
-        auto* op_val = yyjson_obj_get(item, "op");
-        auto* value_val = yyjson_obj_get(item, "value");
-        if (!field_val || !op_val || !value_val) continue;
+        auto field_result = obj["field"];
+        auto op_result = obj["op"];
+        auto value_result = obj["value"];
+        if (field_result.error() || op_result.error() || value_result.error())
+            continue;
+
+        if (!field_result.value_unsafe().is_string() ||
+            !op_result.value_unsafe().is_string())
+            continue;
 
-        const char* field = yyjson_get_str(field_val);
-        const char* op = yyjson_get_str(op_val);
-        if (!field || !op) continue;
+        const char* field =
+            field_result.value_unsafe().get_c_str().value_unsafe();
+        const char* op = op_result.value_unsafe().get_c_str().value_unsafe();
 
-        std::string val = extract_json_value(value_val);
+        std::string val = extract_json_value(value_result.value_unsafe());
         if (val.empty()) continue;
 
         std::string op_str(op);
@@ -209,105 +227,6 @@ static void apply_filters(std::string& dsl, std::string_view filters_str) {
     }
 }
 
-/// Direct-scan a small file without any `.dftindex` store.
-/// Streams via async_streaming_gz_lines(), parses JSON, applies
-/// predicate filters, collects matching events as raw JSON strings.
-static coro::CoroTask<void> direct_scan_events(
-    const TraceIndex::FileInfo* file_info, const Query* query,
-    bool include_metadata, std::vector<std::string>* collected_events,
-    std::uint64_t* total_scanned, std::uint64_t* total_matched, int limit) {
-    using dftracer::utils::utilities::fileio::lines::sources::
-        async_streaming_gz_lines;
-
-    try {
-        auto gen = async_streaming_gz_lines(file_info->path);
-
-        std::unordered_map<std::string, std::string> pending_metadata;
-        std::unordered_set<std::string> emitted_hashes;
-
-        while (auto line = co_await gen.next()) {
-            if (limit > 0 &&
-                collected_events->size() >= static_cast<std::size_t>(limit)) {
-                co_return;
-            }
-            if (line->content.empty()) continue;
-
-            JsonDocGuard guard{yyjson_read_opts(
-                const_cast<char*>(line->content.data()), line->content.size(),
-                YYJSON_READ_NOFLAG, nullptr, nullptr)};
-            if (!guard.doc) continue;
-
-            yyjson_val* root = yyjson_doc_get_root(guard.doc);
-            if (root && yyjson_is_obj(root)) {
-                JsonValue json(root);
-                // line->content is a string_view valid only for this
-                // iteration.  All storage into collected_events and
-                // pending_metadata must copy to owning std::string.
-                std::string_view ph = json["ph"].get<std::string_view>();
-
-                if (ph == "M" && include_metadata) {
-                    std::string name_str = json["name"].get<std::string>();
-
-                    if (HASH_METADATA_NAMES.count(name_str)) {
-                        auto args = json["args"];
-                        if (args.exists()) {
-                            auto val = args["value"];
-                            if (val.exists()) {
-                                std::string hash_val = val.get<std::string>();
-                                if (!emitted_hashes.count(hash_val)) {
-                                    pending_metadata[hash_val] =
-                                        std::string(line->content.data(),
-                                                    line->content.size());
-                                }
-                            }
-                        }
-                    } else {
-                        collected_events->emplace_back(line->content.data(),
-                                                       line->content.size());
-                        (*total_matched)++;
-                    }
-                } else if (ph != "M") {
-                    (*total_scanned)++;
-                    if (!query || query->evaluate(json)) {
-                        // Flush referenced hash metadata first
-                        if (include_metadata) {
-                            auto args = json["args"];
-                            if (args.exists()) {
-                                static const char* hash_fields[] = {
-                                    "hhash", "fhash", "shash"};
-                                for (const char* field : hash_fields) {
-                                    auto val = args[field];
-                                    if (!val.exists()) continue;
-                                    std::string hash_val =
-                                        val.get<std::string>();
-                                    if (emitted_hashes.count(hash_val))
-                                        continue;
-                                    auto it = pending_metadata.find(hash_val);
-                                    if (it != pending_metadata.end()) {
-                                        collected_events->push_back(
-                                            std::move(it->second));
-                                        (*total_matched)++;
-                                        emitted_hashes.insert(hash_val);
-                                        pending_metadata.erase(it);
-                                    }
-                                }
-                            }
-                        }
-                        collected_events->emplace_back(line->content.data(),
-                                                       line->content.size());
-                        (*total_matched)++;
-                    }
-                }
-            }
-        }
-    } catch (const std::exception& e) {
-        DFTRACER_UTILS_LOG_WARN("Direct scan failed for %s: %s",
-                                file_info->path.c_str(), e.what());
-    }
-
-    co_return;
-}
-
 // --- GET /api/v1/viz/events ---
 static coro::CoroTask<HttpResponse> handle_viz_events(
     const HttpRequest& /*req*/, const QueryParams& params, TraceIndex& index) {
@@ -401,10 +320,6 @@ static coro::CoroTask<HttpResponse> handle_viz_events(
         std::vector<const TraceIndex::FileInfo*> filtered;
         filtered.reserve(target_files.size());
         for (auto* fi : target_files) {
-            if (fi->is_small) {
-                filtered.push_back(fi);
-                continue;
-            }
             if (fi->min_timestamp_us == 0 && fi->max_timestamp_us == 0) {
                 filtered.push_back(fi);
                 continue;
@@ -417,8 +332,6 @@ static coro::CoroTask<HttpResponse> handle_viz_events(
         target_files = std::move(filtered);
     }
 
-    const Query* viz_query_ptr = view.query ? &*view.query : nullptr;
-
     std::vector<std::string> collected_events;
 
     bool truncated = false;
@@ -430,63 +343,50 @@ static coro::CoroTask<HttpResponse> handle_viz_events(
                 truncated = true;
                 break;
             }
-            if (file_info->is_small) {
-                std::uint64_t scanned = 0;
-                std::uint64_t matched = 0;
-                co_await direct_scan_events(
-                    file_info, viz_query_ptr, view.include_metadata,
-                    &collected_events, &scanned, &matched, limit);
+            if (file_info->uncompressed_size == 0 &&
+                file_info->num_checkpoints == 0)
+                continue;
+
+            ViewBuilderInput builder_input;
+            builder_input.with_view(view)
+                .with_file_path(file_info->path)
+                .with_index_path(
+                    file_info->has_bloom_data ? file_info->index_path : "")
+                .with_uncompressed_size(file_info->uncompressed_size)
+                .with_num_checkpoints(file_info->num_checkpoints)
+                .with_bloom_cache(&index.bloom_cache())
+                .with_time_range(begin, end);
+
+            ViewBuilderUtility builder;
+            auto build_output = co_await builder.process(builder_input);
+            if (!build_output.success || !build_output.file_may_match) continue;
+
+            for (const auto& candidate : build_output.candidates) {
                 if (limit > 0 &&
-                    static_cast<int>(collected_events.size()) >= limit)
+                    static_cast<int>(collected_events.size()) >= limit) {
                     truncated = true;
-            } else {
-                if (file_info->uncompressed_size == 0 &&
-                    file_info->num_checkpoints == 0)
-                    continue;
-
-                ViewBuilderInput builder_input;
-                builder_input.with_view(view)
-                    .with_file_path(file_info->path)
-                    .with_index_path(
-                        file_info->has_bloom_data ? file_info->index_path : "")
-                    .with_uncompressed_size(file_info->uncompressed_size)
-                    .with_num_checkpoints(file_info->num_checkpoints)
-                    .with_bloom_cache(&index.bloom_cache())
-                    .with_time_range(begin, end);
-
-                ViewBuilderUtility builder;
-                auto build_output = co_await builder.process(builder_input);
-                if (!build_output.success || !build_output.file_may_match)
-                    continue;
-
-                for (const auto& candidate : build_output.candidates) {
-                    if (limit > 0 &&
-                        static_cast<int>(collected_events.size()) >= limit) {
-                        truncated = true;
-                        break;
-                    }
-                    ViewReaderInput reader_input;
-                    reader_input.with_file_path(file_info->path)
-                        .with_index_path(file_info->index_path)
-                        .with_byte_range(candidate.start_byte,
-                                         candidate.end_byte)
-                        .with_checkpoint_idx(candidate.checkpoint_idx)
-                        .with_view(view);
-
-                    ViewReaderUtility reader;
-                    auto gen = reader.process(reader_input);
-                    while (auto batch = co_await gen.next()) {
-                        for (auto& event : batch->events) {
-                            if (limit > 0 &&
-                                static_cast<int>(collected_events.size()) >=
-                                    limit) {
-                                truncated = true;
-                                break;
-                            }
-                            collected_events.emplace_back(event);
+                    break;
+                }
+                ViewReaderInput reader_input;
+                reader_input.with_file_path(file_info->path)
+                    .with_index_path(file_info->index_path)
+                    .with_byte_range(candidate.start_byte, candidate.end_byte)
+                    .with_checkpoint_idx(candidate.checkpoint_idx)
+                    .with_view(view);
+
+                ViewReaderUtility reader;
+                auto gen = reader.process(reader_input);
+                while (auto batch = co_await gen.next()) {
+                    for (auto& event : batch->events) {
+                        if (limit > 0 &&
+                            static_cast<int>(collected_events.size()) >=
+                                limit) {
+                            truncated = true;
+                            break;
                         }
-                        if (truncated) break;
+                        collected_events.emplace_back(event);
                     }
+                    if (truncated) break;
                 }
             }
         }
@@ -520,81 +420,64 @@ static coro::CoroTask<HttpResponse> handle_viz_events(
 
         for (std::size_t w = 0; w < num_workers; ++w) {
             scope.spawn([file_chan, target_files_ptr, collected_mutex,
-                         collected_ptr, viz_query_ptr, view_ptr,
-                         bloom_cache_ptr, remaining, t_begin,
-                         t_end](CoroScope&) -> coro::CoroTask<void> {
+                         collected_ptr, view_ptr, bloom_cache_ptr, remaining,
+                         t_begin, t_end](CoroScope&) -> coro::CoroTask<void> {
                 while (auto fi_opt = co_await file_chan->receive()) {
                     if (remaining->load(std::memory_order_relaxed) <= 0)
                         co_return;
                     auto* file_info = (*target_files_ptr)[*fi_opt];
 
-                    if (file_info->is_small) {
-                        std::vector<std::string> local_events;
-                        std::uint64_t local_scanned = 0;
-                        std::uint64_t local_matched = 0;
-                        int local_limit =
-                            remaining->load(std::memory_order_relaxed);
-                        if (local_limit <= 0) co_return;
-                        co_await direct_scan_events(
-                            file_info, viz_query_ptr,
-                            view_ptr->include_metadata, &local_events,
-                            &local_scanned, &local_matched, local_limit);
-                        if (!local_events.empty()) {
-                            std::lock_guard<std::mutex> lock(*collected_mutex);
-                            for (auto& ev : local_events) {
-                                collected_ptr->push_back(std::move(ev));
-                            }
-                            remaining->fetch_sub(
-                                static_cast<int>(local_events.size()));
-                        }
-                    } else {
-                        if (file_info->uncompressed_size == 0 &&
-                            file_info->num_checkpoints == 0)
-                            continue;
-
-                        ViewBuilderInput builder_input;
-                        builder_input.with_view(*view_ptr)
-                            .with_file_path(file_info->path)
-                            .with_index_path(file_info->has_bloom_data
-                                                 ? file_info->index_path
-                                                 : "")
-                            .with_uncompressed_size(
-                                file_info->uncompressed_size)
-                            .with_num_checkpoints(file_info->num_checkpoints)
-                            .with_bloom_cache(bloom_cache_ptr)
-                            .with_time_range(t_begin, t_end);
-
-                        ViewBuilderUtility builder;
-                        auto build_output =
-                            co_await builder.process(builder_input);
-                        if (!build_output.success ||
-                            !build_output.file_may_match)
-                            continue;
-
-                        for (const auto& candidate : build_output.candidates) {
-                            if (remaining->load(std::memory_order_relaxed) <= 0)
-                                break;
-
-                            ViewReaderInput reader_input;
-                            reader_input.with_file_path(file_info->path)
-                                .with_index_path(file_info->index_path)
-                                .with_byte_range(candidate.start_byte,
-                                                 candidate.end_byte)
-                                .with_checkpoint_idx(candidate.checkpoint_idx)
-                                .with_view(*view_ptr);
-
-                            ViewReaderUtility reader;
-                            auto gen = reader.process(reader_input);
-                            while (auto batch = co_await gen.next()) {
-                                if (!batch->events.empty()) {
-                                    std::lock_guard<std::mutex> lock(
-                                        *collected_mutex);
-                                    for (auto& event : batch->events) {
-                                        collected_ptr->emplace_back(event);
-                                    }
-                                    remaining->fetch_sub(
-                                        static_cast<int>(batch->events.size()));
+                    // Skip files that report neither an uncompressed size nor
+                    // any checkpoints.
+                    if (file_info->uncompressed_size == 0 &&
+                        file_info->num_checkpoints == 0)
+                        continue;
+
+                    // Pass the index path only when the file has bloom data;
+                    // otherwise the builder receives an empty path.
+                    ViewBuilderInput builder_input;
+                    builder_input.with_view(*view_ptr)
+                        .with_file_path(file_info->path)
+                        .with_index_path(file_info->has_bloom_data
+                                             ? file_info->index_path
+                                             : "")
+                        .with_uncompressed_size(file_info->uncompressed_size)
+                        .with_num_checkpoints(file_info->num_checkpoints)
+                        .with_bloom_cache(bloom_cache_ptr)
+                        .with_time_range(t_begin, t_end);
+
+                    ViewBuilderUtility builder;
+                    auto build_output = co_await builder.process(builder_input);
+                    if (!build_output.success || !build_output.file_may_match)
+                        continue;
+
+                    for (const auto& candidate : build_output.candidates) {
+                        if (remaining->load(std::memory_order_relaxed) <= 0)
+                            break;
+
+                        ViewReaderInput reader_input;
+                        reader_input.with_file_path(file_info->path)
+                            .with_index_path(file_info->index_path)
+                            .with_byte_range(candidate.start_byte,
+                                             candidate.end_byte)
+                            .with_checkpoint_idx(candidate.checkpoint_idx)
+                            .with_view(*view_ptr);
+
+                        ViewReaderUtility reader;
+                        auto gen = reader.process(reader_input);
+                        while (auto batch = co_await gen.next()) {
+                            if (!batch->events.empty()) {
+                                std::lock_guard<std::mutex> lock(
+                                    *collected_mutex);
+                                for (auto& event : batch->events) {
+                                    collected_ptr->emplace_back(event);
                                 }
+                                remaining->fetch_sub(
+                                    static_cast<int>(batch->events.size()));
                             }
+                            // Limit reached: stop draining this candidate rather
+                            // than reading the rest of its byte range.
+                            if (remaining->load(std::memory_order_relaxed) <= 0)
+                                break;
                         }
                     }
diff --git a/src/dftracer/utils/utilities/call_tree/call_tree.cpp b/src/dftracer/utils/utilities/call_tree/call_tree.cpp
index 767af2fb..78b8620d 100644
--- a/src/dftracer/utils/utilities/call_tree/call_tree.cpp
+++ b/src/dftracer/utils/utilities/call_tree/call_tree.cpp
@@ -2,35 +2,79 @@
 #include <dftracer/utils/call_tree/internal/call_tree.h>
 #include <dftracer/utils/call_tree/internal/process_key.h>
 #include <dftracer/utils/call_tree/internal/trace_reader.h>
-#include <dftracer/utils/call_tree/json_serializer.h>
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
-#include <unistd.h>
 
 #include <algorithm>
+#include <cstdint>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
 #include <set>
-#include <sstream>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
 
 namespace dftracer::utils::call_tree {
 
 namespace internal {
 
-/**
- * Internal implementation class (PIMPL pattern)
- * Hides complex CallTree internals from public API
- */
+namespace {
+
+/**
+ * Convert every member of `args` to a string-keyed, string-valued map.
+ *
+ * Strings are copied as-is, integers and floating-point numbers go through
+ * std::to_string, and booleans become "true"/"false". A member of any other
+ * kind is still inserted, with an empty string as its value.
+ */
+std::unordered_map<std::string, std::string> args_to_string_map(
+    const ArgsMap& args) {
+    std::unordered_map<std::string, std::string> out;
+    args.for_each_member(
+        [&](std::string_view k,
+            dftracer::utils::utilities::composites::dft::ArgsValueProxy v) {
+            std::string val;
+            if (v.is_string())
+                val = v.get<std::string>();
+            else if (v.is_int())
+                val = std::to_string(v.get<std::int64_t>());
+            else if (v.is_uint())
+                val = std::to_string(v.get<std::uint64_t>());
+            else if (v.is_number())
+                val = std::to_string(v.get<double>());
+            else if (v.is_bool())
+                val = v.get<bool>() ? "true" : "false";
+            out.emplace(std::string(k), std::move(val));
+        });
+    return out;
+}
+
+/**
+ * Populate the public `info` snapshot from an internal call tree node.
+ *
+ * Scalar fields and child ids are copied; the node's args are flattened to
+ * strings via args_to_string_map().
+ */
+void fill_node_info(const CallTreeNode& node, CallTreeNodeInfo& info) {
+    info.id = node.get_id();
+    info.name = std::string(node.get_name());
+    info.category = std::string(node.get_category());
+    info.start_time_us = node.get_start_time();
+    info.duration_us = node.get_duration();
+    info.level = node.get_level();
+    info.parent_id = node.get_parent_id();
+    info.num_children = node.get_children().size();
+    info.children_ids = node.get_children();
+    info.args = args_to_string_map(node.get_args());
+}
+
+}  // namespace
+
 class CallTreeImpl {
    public:
     CallTree graph;
     std::vector<std::string> trace_files;
     std::string trace_directory;
-    std::string output_path;
     bool is_generated;
 
     CallTreeImpl() : is_generated(false) { graph.initialize(); }
@@ -99,58 +125,12 @@ class CallTreeImpl {
         }
 
         const auto& node = it->second;
-
-        // Add current node
         CallTreeNodeInfo info;
-        info.id = node->get_id();
-        info.name = node->get_name();
-        info.category = node->get_category();
-        info.start_time_us = node->get_start_time();
-        info.duration_us = node->get_duration();
-        info.level = node->get_level();
-        info.parent_id = node->get_parent_id();
-        info.num_children = node->get_children().size();
-        info.children_ids = node->get_children();
-        info.args = node->get_args();
-
-        nodes.push_back(info);
-
-        // Recursively traverse children
-        for (std::uint64_t child_id : node->get_children()) {
-            traverse_depth_first(process_graph, child_id, nodes);
-        }
-    }
-
-    void print_node_recursive(const ProcessCallTree& process_graph,
-                              std::uint64_t node_id, int indent, int max_depth,
-                              std::ostream& out) const {
-        if (max_depth > 0 && indent >= max_depth) {
-            return;
-        }
-
-        auto it = process_graph.calls.find(node_id);
-        if (it == process_graph.calls.end()) {
-            return;
-        }
-
-        const auto& node = it->second;
-
-        // Print indentation
-        for (int i = 0; i < indent; i++) {
-            out << "  ";
-        }
-
-        // Print node info
-        out << node->get_name() << " [" << node->get_category() << "] "
-            << "level=" << node->get_level() << " "
-            << "dur=" << (static_cast<double>(node->get_duration()) / 1000.0)
-            << "ms "
-            << "children=" << node->get_children().size() << "\n";
+        fill_node_info(*node, info);
+        nodes.push_back(std::move(info));
 
-        // Print children
         for (std::uint64_t child_id : node->get_children()) {
-            print_node_recursive(process_graph, child_id, indent + 1, max_depth,
-                                 out);
+            traverse_depth_first(process_graph, child_id, nodes);
         }
     }
 
@@ -174,9 +154,11 @@ class CallTreeImpl {
         }
 
         // Print node info
-        printf("%s [%s] level=%d dur=%.3fms children=%zu\n",
-               node->get_name().c_str(), node->get_category().c_str(),
-               node->get_level(),
+        auto nm = node->get_name();
+        auto ct = node->get_category();
+        printf("%.*s [%.*s] level=%d dur=%.3fms children=%zu\n",
+               static_cast<int>(nm.size()), nm.data(),
+               static_cast<int>(ct.size()), ct.data(), node->get_level(),
                static_cast<double>(node->get_duration()) / 1000.0,
                node->get_children().size());
 
@@ -243,11 +225,6 @@ bool CallTree::load_from_directory(const std::string& trace_dir,
     if (found) {
         DFTRACER_UTILS_LOG_INFO("Found %zu trace files in %s",
                                 impl_->trace_files.size(), trace_dir.c_str());
-
-        // Set default output path
-        fs::path dir_path(trace_dir);
-        std::string dir_name = dir_path.filename().string();
-        impl_->output_path = dir_name + ".calltree";
     }
 
     return found;
@@ -305,44 +282,6 @@ void CallTree::print_depth_first(int max_depth) const {
     }
 }
 
-bool CallTree::print_depth_first_to_file(const std::string& filename,
-                                         int max_depth) const {
-    if (!impl_->is_generated) {
-        DFTRACER_UTILS_LOG_ERROR(
-            "%s", "Call tree not generated. Call generate() first.");
-        return false;
-    }
-
-    std::ofstream file(filename);
-    if (!file.is_open()) {
-        DFTRACER_UTILS_LOG_ERROR("Cannot open file for writing: %s",
-                                 filename.c_str());
-        return false;
-    }
-
-    auto keys = impl_->graph.keys();
-
-    for (const auto& key : keys) {
-        auto* process_graph = impl_->graph.get(key);
-        if (!process_graph) continue;
-
-        file << "\n=== Process/Thread: PID=" << key.pid << ", TID=" << key.tid
-             << ", Node=" << key.node_id << " ===" << std::endl;
-        file << "Total nodes: " << process_graph->calls.size() << std::endl;
-        file << "Root calls: " << process_graph->root_calls.size() << std::endl;
-        file << std::endl;
-
-        for (std::uint64_t root_id : process_graph->root_calls) {
-            impl_->print_node_recursive(*process_graph, root_id, 0, max_depth,
-                                        file);
-        }
-    }
-
-    file.close();
-    DFTRACER_UTILS_LOG_INFO("Call tree printed to: %s", filename.c_str());
-    return true;
-}
-
 std::vector<CallTreeNodeInfo> CallTree::get_nodes_depth_first() const {
     std::vector<CallTreeNodeInfo> all_nodes;
 
@@ -366,241 +305,6 @@ std::vector<CallTreeNodeInfo> CallTree::get_nodes_depth_first() const {
     return all_nodes;
 }
 
-std::string CallTree::get_output_path() const { return impl_->output_path; }
-
-void CallTree::set_output_path(const std::string& path) {
-    impl_->output_path = path;
-}
-
-bool CallTree::save_to_file(const std::string& filename) const {
-    if (!impl_->is_generated) {
-        DFTRACER_UTILS_LOG_ERROR(
-            "%s", "Call tree not generated. Call generate() first.");
-        return false;
-    }
-
-    std::string output_file = filename.empty() ? impl_->output_path : filename;
-
-    std::ofstream file(output_file, std::ios::binary);
-    if (!file.is_open()) {
-        DFTRACER_UTILS_LOG_ERROR("Cannot open file for writing: %s",
-                                 output_file.c_str());
-        return false;
-    }
-
-    // Write header
-    const char magic[8] = {'C', 'A', 'L', 'L', 'T', 'R', 'E', 'E'};
-    file.write(magic, 8);
-
-    std::uint32_t version = 1;
-    file.write(reinterpret_cast<const char*>(&version), sizeof(version));
-
-    // Get nodes in depth-first order
-    auto nodes = get_nodes_depth_first();
-
-    std::uint64_t num_nodes = nodes.size();
-    file.write(reinterpret_cast<const char*>(&num_nodes), sizeof(num_nodes));
-
-    // Write each node
-    for (const auto& node : nodes) {
-        file.write(reinterpret_cast<const char*>(&node.id), sizeof(node.id));
-
-        std::uint32_t name_len = static_cast<std::uint32_t>(node.name.size());
-        file.write(reinterpret_cast<const char*>(&name_len), sizeof(name_len));
-        file.write(node.name.data(), name_len);
-
-        std::uint32_t cat_len =
-            static_cast<std::uint32_t>(node.category.size());
-        file.write(reinterpret_cast<const char*>(&cat_len), sizeof(cat_len));
-        file.write(node.category.data(), cat_len);
-
-        file.write(reinterpret_cast<const char*>(&node.start_time_us),
-                   sizeof(node.start_time_us));
-        file.write(reinterpret_cast<const char*>(&node.duration_us),
-                   sizeof(node.duration_us));
-        file.write(reinterpret_cast<const char*>(&node.level),
-                   sizeof(node.level));
-        file.write(reinterpret_cast<const char*>(&node.parent_id),
-                   sizeof(node.parent_id));
-
-        std::uint64_t num_children = node.num_children;
-        file.write(reinterpret_cast<const char*>(&num_children),
-                   sizeof(num_children));
-    }
-
-    file.close();
-    DFTRACER_UTILS_LOG_INFO("Call tree saved to: %s", output_file.c_str());
-    DFTRACER_UTILS_LOG_INFO("  Nodes written: %zu", nodes.size());
-
-    return true;
-}
-
-bool CallTree::save_to_json(const std::string& filename) const {
-    if (!impl_->is_generated) {
-        DFTRACER_UTILS_LOG_ERROR(
-            "%s", "Call tree not generated. Call generate() first.");
-        return false;
-    }
-
-    // Determine output file - use .pfw extension for compatibility with
-    // DFTracer tools
-    std::string output_file = filename;
-    if (output_file.empty()) {
-        // Replace .calltree extension with .pfw if present, otherwise append
-        std::string base = impl_->output_path;
-        if (base.size() >= 9 && base.substr(base.size() - 9) == ".calltree") {
-            base = base.substr(0, base.size() - 9);
-        }
-        output_file = base + ".pfw";
-    }
-
-    std::ofstream file(output_file);
-    if (!file.is_open()) {
-        DFTRACER_UTILS_LOG_ERROR("Cannot open file for writing: %s",
-                                 output_file.c_str());
-        return false;
-    }
-
-    DFTRACER_UTILS_LOG_INFO(
-        "%s", "Serializing call tree to JSON (Chrome Tracing format)...");
-
-    // Create JSON serializer
-    internal::JsonSerializer serializer;
-
-    // Buffer for serialization (16KB should be enough for most events)
-    const size_t BUFFER_SIZE = 16384;
-    char buffer[BUFFER_SIZE];
-
-    // Get hostname for identification
-    char hostname[256];
-    gethostname(hostname, sizeof(hostname));
-    std::string hostname_hash = std::string(hostname);
-
-    // Write opening bracket
-    size_t written = serializer.initialize(buffer, hostname_hash);
-    file.write(buffer, written);
-
-    // Write metadata events for file header
-    std::time_t now = std::time(nullptr);
-    char timestamp[256];
-    std::strftime(timestamp, sizeof(timestamp), "%Y-%m-%d %H:%M:%S",
-                  std::localtime(&now));
-
-    written = serializer.serialize_metadata(buffer, "timestamp", timestamp, "M",
-                                            0, 0, true);
-    file.write(buffer, written - 1);  // Don't write the newline yet
-    file.write(",\n", 2);             // Write comma separator
-
-    written = serializer.serialize_metadata(buffer, "format", "call_tree", "M",
-                                            0, 0, true);
-    file.write(buffer, written - 1);
-    file.write(",\n", 2);
-
-    // Get all process keys
-    auto keys = impl_->graph.keys();
-
-    // Track event index (similar to DFTracer)
-    int event_index = 0;
-    size_t total_events = 0;
-
-    // Iterate over all processes/threads
-    for (const auto& key : keys) {
-        auto* process_graph = impl_->graph.get(key);
-        if (!process_graph) continue;
-
-        // Traverse and serialize nodes in depth-first order
-        for (std::uint64_t root_id : process_graph->root_calls) {
-            std::vector<std::uint64_t> stack;
-            stack.push_back(root_id);
-
-            while (!stack.empty()) {
-                std::uint64_t node_id = stack.back();
-                stack.pop_back();
-
-                auto it = process_graph->calls.find(node_id);
-                if (it == process_graph->calls.end()) continue;
-
-                const auto& node = it->second;
-
-                // Serialize this node
-                written = serializer.serialize_node(buffer, event_index++,
-                                                    *node, key.pid, key.tid);
-
-                // Write to file with comma separator (except last event)
-                file.write(buffer, written - 1);  // Don't write newline
-
-                // Add children to stack in reverse order for depth-first
-                const auto& children = node->get_children();
-                for (auto child_it = children.rbegin();
-                     child_it != children.rend(); ++child_it) {
-                    stack.push_back(*child_it);
-                }
-
-                // Write comma separator for next event
-                file.write(",\n", 2);
-                total_events++;
-            }
-        }
-    }
-
-    // Write closing bracket (overwrites the last comma)
-    file.seekp(-2, std::ios::cur);  // Back up over ",\n"
-    file.write("\n", 1);            // Just write newline
-
-    written = serializer.finalize(buffer, true);
-    file.write(buffer, written);
-
-    file.close();
-
-    DFTRACER_UTILS_LOG_INFO("Call tree saved to JSON: %s", output_file.c_str());
-    DFTRACER_UTILS_LOG_INFO("  Total events: %zu", total_events);
-    DFTRACER_UTILS_LOG_INFO("  Unique processes: %zu", keys.size());
-    DFTRACER_UTILS_LOG_INFO(
-        "%s", "  Format: Chrome Tracing (compatible with Perfetto)");
-
-    return true;
-}
-
-bool CallTree::load_from_file(const std::string& filename) {
-    std::ifstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        DFTRACER_UTILS_LOG_ERROR("Cannot open file for reading: %s",
-                                 filename.c_str());
-        return false;
-    }
-
-    // Read and verify header
-    char magic[8];
-    file.read(magic, 8);
-
-    if (std::memcmp(magic, "CALLTREE", 8) != 0) {
-        DFTRACER_UTILS_LOG_ERROR("%s", "Invalid file format");
-        return false;
-    }
-
-    std::uint32_t version;
-    file.read(reinterpret_cast<char*>(&version), sizeof(version));
-
-    if (version != 1) {
-        DFTRACER_UTILS_LOG_ERROR("Unsupported version: %u", version);
-        return false;
-    }
-
-    std::uint64_t num_nodes;
-    file.read(reinterpret_cast<char*>(&num_nodes), sizeof(num_nodes));
-
-    DFTRACER_UTILS_LOG_INFO("Loading %lu nodes from %s",
-                            (unsigned long)num_nodes, filename.c_str());
-
-    // Note: This is a simplified load that just verifies the file
-    // Full reconstruction would require rebuilding the CallTree structure
-
-    file.close();
-    DFTRACER_UTILS_LOG_INFO("%s", "Call tree file validated successfully");
-
-    return true;
-}
-
 CallTreeStats CallTree::get_statistics() const {
     CallTreeStats stats;
 
@@ -687,6 +391,13 @@ void CallTree::print_statistics() const {
 
 bool CallTree::is_generated() const { return impl_->is_generated; }
 
+// Direct access to the underlying internal::CallTree held by the impl.
+// The returned reference aliases impl_->graph; no copy is made.
+internal::CallTree& CallTree::internal_tree() { return impl_->graph; }
+const internal::CallTree& CallTree::internal_tree() const {
+    return impl_->graph;
+}
+
 size_t CallTree::get_num_trace_files() const {
     return impl_->trace_files.size();
 }
@@ -696,7 +405,6 @@ void CallTree::clear() {
     impl_->graph.initialize();
     impl_->trace_files.clear();
     impl_->trace_directory.clear();
-    impl_->output_path.clear();
     impl_->is_generated = false;
 }
 
@@ -758,18 +466,8 @@ std::vector<CallTreeNodeInfo> CallTree::get_root_nodes(
             const auto& node = it->second;
 
             CallTreeNodeInfo info;
-            info.id = node->get_id();
-            info.name = node->get_name();
-            info.category = node->get_category();
-            info.start_time_us = node->get_start_time();
-            info.duration_us = node->get_duration();
-            info.level = node->get_level();
-            info.parent_id = node->get_parent_id();
-            info.num_children = node->get_children().size();
-            info.children_ids = node->get_children();
-            info.args = node->get_args();
-
-            root_nodes.push_back(info);
+            internal::fill_node_info(*node, info);
+            root_nodes.push_back(std::move(info));
         }
     }
 
@@ -798,17 +496,7 @@ CallTreeNodeInfo CallTree::get_node_by_id(std::uint64_t id) const {
             const auto& node = it->second;
 
             CallTreeNodeInfo info;
-            info.id = node->get_id();
-            info.name = node->get_name();
-            info.category = node->get_category();
-            info.start_time_us = node->get_start_time();
-            info.duration_us = node->get_duration();
-            info.level = node->get_level();
-            info.parent_id = node->get_parent_id();
-            info.num_children = node->get_children().size();
-            info.children_ids = node->get_children();
-            info.args = node->get_args();
-
+            internal::fill_node_info(*node, info);
             return info;
         }
     }
diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp
index a81d2c23..b8299b19 100644
--- a/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp
+++ b/src/dftracer/utils/utilities/call_tree/call_tree_internal.cpp
@@ -5,19 +5,17 @@
 #include <dftracer/utils/call_tree/internal/process_key.h>
 #include <dftracer/utils/call_tree/internal/trace_reader.h>
 #include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/format_detector.h>
 #include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/string_intern.h>
 #include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
-#include <dftracer/utils/utilities/reader/internal/line_processor.h>
-#include <dftracer/utils/utilities/reader/internal/reader_factory.h>
-#include <yyjson.h>
-#include <zlib.h>
+#include <dftracer/utils/utilities/common/json/parser.h>
+#include <dftracer/utils/utilities/composites/dft/event.h>
+#include <dftracer/utils/utilities/reader/trace_reader.h>
+#include <simdjson.h>
 
 #include <algorithm>
 #include <cstdio>
-#include <fstream>
-#include <iostream>
+#include <string_view>
 
 namespace dftracer::utils::call_tree {
 namespace internal {
@@ -39,8 +37,8 @@ CallTreeNode::CallTreeNode()
       initialized_(false),
       cleaned_up_(false) {}
 
-CallTreeNode::CallTreeNode(std::uint64_t id, const std::string& name,
-                           const std::string& category)
+CallTreeNode::CallTreeNode(std::uint64_t id, std::string_view name,
+                           std::string_view category)
     : id_(id),
       name_(name),
       category_(category),
@@ -57,10 +55,9 @@ CallTreeNode::~CallTreeNode() {
     if (!cleaned_up_) {
         cleanup();
     }
-    // Clear all state
     id_ = 0;
-    name_.clear();
-    category_.clear();
+    name_ = {};
+    category_ = {};
     start_time_ = 0;
     duration_ = 0;
     level_ = 0;
@@ -125,8 +122,8 @@ CallTreeNode& CallTreeNode::operator=(CallTreeNode&& other) noexcept {
     return *this;
 }
 
-void CallTreeNode::initialize(std::uint64_t id, const std::string& name,
-                              const std::string& category,
+void CallTreeNode::initialize(std::uint64_t id, std::string_view name,
+                              std::string_view category,
                               std::uint64_t start_time, std::uint64_t duration,
                               int level) {
     id_ = id;
@@ -146,13 +143,10 @@ void CallTreeNode::cleanup() {
     if (cleaned_up_) {
         return;
     }
-
-    // Clear containers to free memory
     args_.clear();
     children_.clear();
-    name_.clear();
-    category_.clear();
-
+    name_ = {};
+    category_ = {};
     cleaned_up_ = true;
 }
 
@@ -200,309 +194,126 @@ void CallTreeFactory::cleanup() {
 }
 
 std::shared_ptr<CallTreeNode> CallTreeFactory::create_node(
-    std::uint64_t id, const std::string& name, const std::string& category,
-    std::uint64_t start_time, std::uint64_t duration, int level,
-    const std::unordered_map<std::string, std::string>& args) {
+    std::uint64_t id, std::string_view name, std::string_view category,
+    std::uint64_t start_time, std::uint64_t duration, int level, ArgsMap args) {
     auto node = std::make_shared<CallTreeNode>(id, name, category);
     node->initialize(id, name, category, start_time, duration, level);
-    node->set_args(args);
-
-    // Track the node for cleanup
+    node->set_args(std::move(args));
     managed_nodes_.push_back(node);
     node_count_++;
-
     return node;
 }
 
 // ============================================================================
-// TraceLineProcessor - LineProcessor for parsing trace events
+// TraceReader Implementation (delegates to utilities::reader::TraceReader)
 // ============================================================================
 
-class TraceLineProcessor
-    : public dftracer::utils::utilities::reader::internal::LineProcessor {
-   public:
-    TraceLineProcessor(TraceReader& reader, CallTree& graph)
-        : reader_(reader),
-          graph_(graph),
-          line_count_(0),
-          processed_(0),
-          report_interval_(10000) {}
-
-    coro::CoroTask<bool> process(const char* data,
-                                 std::size_t length) override {
-        line_count_++;
-
-        // Progress indicator
-        if (line_count_ % report_interval_ == 0) {
-            DFTRACER_UTILS_LOG_DEBUG("  processed %zu lines, %zu traces...",
-                                     line_count_, processed_);
-        }
-
-        // Skip empty lines, brackets
-        if (length == 0) {
-            co_return true;
-        }
+namespace {
 
-        std::string line(data, length);
+using dftracer::utils::utilities::common::json::JsonParser;
+using dftracer::utils::utilities::composites::dft::DFTracerEvent;
 
-        // Skip brackets
-        if (line == "[" || line == "]") {
-            co_return true;
-        }
-
-        // Remove trailing comma
-        if (!line.empty() && line.back() == ',') {
-            line.pop_back();
-        }
-
-        if (reader_.process_trace_line(line, graph_)) {
-            processed_++;
-        }
-
-        co_return true;  // Continue processing
-    }
-
-    void end() override {
-        DFTRACER_UTILS_LOG_INFO(
-            "processed %zu trace entries from %zu total lines", processed_,
-            line_count_);
-    }
-
-    std::size_t get_processed_count() const { return processed_; }
-
-   private:
-    TraceReader& reader_;
-    CallTree& graph_;
-    std::size_t line_count_;
-    std::size_t processed_;
-    std::size_t report_interval_;
+struct ParsedEvent {
+    bool parsed = false;
+    bool filtered = false;
 };
 
-// ============================================================================
-// TraceReader Implementation
-// ============================================================================
-
-bool TraceReader::read(const std::string& trace_file, CallTree& graph) {
-    DFTRACER_UTILS_LOG_INFO("reading trace file: %s", trace_file.c_str());
-
-    // Try to use Reader API first (for compressed files, tar.gz, etc.)
-    if (read_with_reader(trace_file, graph)) {
-        return true;
-    }
-
-    // Fallback to direct reading for plain text files
-    return read_direct(trace_file, graph);
+dftracer::utils::StringIntern& name_intern() {
+    static dftracer::utils::StringIntern instance;
+    return instance;
 }
 
-bool TraceReader::read_with_reader(const std::string& trace_file,
-                                   CallTree& graph) {
-    try {
-        // Detect file format
-        auto format = dftracer::utils::FormatDetector::detect(trace_file);
-
-        // For GZIP files, skip Reader API and use direct zlib decompression
-        // since this path expects a prebuilt `.dftindex` store.
-        if (format == dftracer::utils::ArchiveFormat::GZIP) {
-            return false;  // Will trigger fallback to read_direct which handles
-                           // gzip
-        }
-
-        // Check if format is supported by Reader
-        if (!dftracer::utils::utilities::reader::internal::ReaderFactory::
-                is_format_supported(format)) {
-            // Not supported, will use fallback
-            return false;
-        }
-
-        std::string index_path = dftracer::utils::utilities::composites::dft::
-            internal::determine_index_path(trace_file, "");
-
-        // Create reader (this will auto-build index if needed)
-        auto reader =
-            dftracer::utils::utilities::reader::internal::ReaderFactory::create(
-                trace_file, index_path);
-        if (!reader || !reader->is_valid()) {
-            DFTRACER_UTILS_LOG_ERROR("Failed to create reader for %s",
-                                     trace_file.c_str());
-            return false;
-        }
-
-        DFTRACER_UTILS_LOG_INFO("Using Reader API for %s (format: %s)",
-                                trace_file.c_str(),
-                                reader->get_format_name().c_str());
-
-        // Create line processor
-        TraceLineProcessor processor(*this, graph);
+ParsedEvent ingest_event(JsonParser& parser, CallTree& graph,
+                         const std::set<std::uint32_t>* allowed_pids) {
+    ParsedEvent out;
 
-        // Read all lines using line processor
-        std::size_t num_lines = reader->get_num_lines();
-        if (num_lines > 0) {
-            reader->read_lines_with_processor(1, num_lines, processor);
-        }
+    DFTracerEvent ev;
+    if (!DFTracerEvent::parse_ondemand(parser, ev)) return out;
+    out.parsed = true;
 
-        return true;
-
-    } catch (const std::exception& e) {
-        DFTRACER_UTILS_LOG_ERROR("Reader API failed for %s: %s",
-                                 trace_file.c_str(), e.what());
-        return false;
+    if (allowed_pids && allowed_pids->find(static_cast<std::uint32_t>(
+                            ev.pid)) == allowed_pids->end()) {
+        out.filtered = true;
+        return out;
     }
-}
-
-bool TraceReader::read_direct(const std::string& trace_file, CallTree& graph) {
-    // Detect file format to see if we need decompression
-    ArchiveFormat format = FormatDetector::detect(trace_file);
 
-    // Handle gzip files with zlib
-    if (format == ArchiveFormat::GZIP) {
-        DFTRACER_UTILS_LOG_INFO("Using zlib decompression for %s",
-                                trace_file.c_str());
-
-        gzFile gz = gzopen(trace_file.c_str(), "rb");
-        if (!gz) {
-            DFTRACER_UTILS_LOG_ERROR("Cannot open gzip file: %s",
-                                     trace_file.c_str());
-            return false;
-        }
+    if (!ev.is_complete()) return out;
 
-        char buffer[65536];
-        std::string current_line;
-        size_t line_count = 0;
-        size_t processed = 0;
-        size_t report_interval = 10000;
-
-        while (true) {
-            int bytes_read = gzread(gz, buffer, sizeof(buffer) - 1);
-            if (bytes_read <= 0) {
-                // Process any remaining line
-                if (!current_line.empty()) {
-                    line_count++;
-                    if (!current_line.empty() && current_line != "[" &&
-                        current_line != "]") {
-                        if (current_line.back() == ',') current_line.pop_back();
-                        if (process_trace_line(current_line, graph)) {
-                            processed++;
-                        }
-                    }
-                }
-                break;
-            }
-
-            buffer[bytes_read] = '\0';
-            current_line += buffer;
-
-            // Process complete lines
-            size_t pos;
-            while ((pos = current_line.find('\n')) != std::string::npos) {
-                std::string line = current_line.substr(0, pos);
-                current_line = current_line.substr(pos + 1);
-                line_count++;
+    int level = 0;
+    std::uint32_t tid = 0;
+    std::uint32_t node_id = 0;
+    if (auto p = ev.args["level"])
+        level = static_cast<int>(p.get<std::int64_t>());
+    if (auto p = ev.args["tid"])
+        tid = static_cast<std::uint32_t>(p.get<std::uint64_t>());
+    if (auto p = ev.args["node_id"])
+        node_id = static_cast<std::uint32_t>(p.get<std::uint64_t>());
 
-                if (line_count % report_interval == 0) {
-                    DFTRACER_UTILS_LOG_DEBUG(
-                        "  processed %zu lines, %zu traces...", line_count,
-                        processed);
-                }
+    auto name_sv = name_intern().intern(ev.name);
+    auto cat_sv = name_intern().intern(ev.cat);
 
-                if (line.empty() || line == "[" || line == "]") continue;
-                if (!line.empty() && line.back() == ',') line.pop_back();
+    ProcessKey key(static_cast<std::uint32_t>(ev.pid), tid, node_id);
+    auto call = graph.get_factory().create_node(
+        ev.id, name_sv, cat_sv, ev.ts, ev.dur, level, std::move(ev.args));
+    graph.add_call(key, call);
+    return out;
+}
 
-                if (process_trace_line(line, graph)) {
-                    processed++;
-                }
-            }
-        }
+}  // namespace
 
-        gzclose(gz);
-        DFTRACER_UTILS_LOG_INFO("processed %zu trace entries from %zu lines",
-                                processed, line_count);
-        return true;
-    }
+coro::CoroTask<ReadCounts> read_trace_file_async(
+    std::string trace_file, CallTree* graph,
+    const std::set<std::uint32_t>* allowed_pids) {
+    using dftracer::utils::utilities::reader::ReadConfig;
+    using dftracer::utils::utilities::reader::TraceReader;
+    using dftracer::utils::utilities::reader::TraceReaderConfig;
 
-    // Handle tar.gz - not supported without indexer
-    if (format == ArchiveFormat::TAR_GZ) {
-        DFTRACER_UTILS_LOG_ERROR("Cannot read tar.gz file without index: %s",
-                                 trace_file.c_str());
-        DFTRACER_UTILS_LOG_ERROR("%s",
-                                 "Please create an index using dftracer_map");
-        return false;
-    }
+    ReadCounts counts;
 
-    // Plain text file
-    DFTRACER_UTILS_LOG_INFO("Using direct file reading for %s",
-                            trace_file.c_str());
+    TraceReaderConfig cfg;
+    cfg.file_path = trace_file;
+    cfg.auto_build_index = true;
+    TraceReader reader(std::move(cfg));
 
-    std::ifstream file(trace_file);
-    if (!file.is_open()) {
-        DFTRACER_UTILS_LOG_ERROR("cant open trace file: %s",
-                                 trace_file.c_str());
-        return false;
+    auto gen = reader.read_json(ReadConfig{});
+    while (auto opt = co_await gen.next()) {
+        auto res = ingest_event(*opt->parser, *graph, allowed_pids);
+        if (res.filtered)
+            counts.filtered++;
+        else if (res.parsed)
+            counts.processed++;
     }
 
-    std::string line;
-    size_t line_count = 0;
-    size_t processed = 0;
-    size_t report_interval = 10000;
-
-    while (std::getline(file, line)) {
-        line_count++;
-
-        // progress indicator
-        if (line_count % report_interval == 0) {
-            DFTRACER_UTILS_LOG_DEBUG("  processed %zu lines, %zu traces...",
-                                     line_count, processed);
-        }
-
-        // skip brackets and empty lines
-        if (line.empty() || line == "[" || line == "]") {
-            continue;
-        }
-
-        // remove trailing comma
-        if (!line.empty() && line.back() == ',') {
-            line.pop_back();
-        }
-
-        if (process_trace_line(line, graph)) {
-            processed++;
-        } else {
-            // Don't spam errors for metadata entries
-            if (line_count < 10) {
-                DFTRACER_UTILS_LOG_ERROR("failed to parse line %zu in %s",
-                                         line_count, trace_file.c_str());
-            }
-        }
-    }
+    co_return counts;
+}
 
-    DFTRACER_UTILS_LOG_INFO("processed %zu trace entries from %s", processed,
-                            trace_file.c_str());
+ReadCounts read_trace_file(const std::string& trace_file, CallTree& graph,
+                           const std::set<std::uint32_t>* allowed_pids) {
+    return read_trace_file_async(trace_file, &graph, allowed_pids).get();
+}
 
+bool TraceReader::read(const std::string& trace_file, CallTree& graph) {
+    DFTRACER_UTILS_LOG_INFO("reading trace file: %s", trace_file.c_str());
+    auto counts = read_trace_file(trace_file, graph, nullptr);
+    DFTRACER_UTILS_LOG_INFO("processed %zu trace entries from %s",
+                            counts.processed, trace_file.c_str());
     return true;
 }
 
 bool TraceReader::read_multiple(const std::vector<std::string>& trace_files,
                                 CallTree& graph) {
-    bool all_success = true;
-
     DFTRACER_UTILS_LOG_INFO("reading %zu trace files...", trace_files.size());
-
-    size_t file_num = 0;
-    (void)file_num;
+    bool all_success = true;
     for (const auto& file : trace_files) {
-        file_num++;
-        DFTRACER_UTILS_LOG_DEBUG("[%zu/%zu] ", file_num, trace_files.size());
         if (!read(file, graph)) {
             DFTRACER_UTILS_LOG_ERROR("failed to read: %s", file.c_str());
             all_success = false;
         }
     }
-
-    // build parent child relationships after all traces loaded
     DFTRACER_UTILS_LOG_INFO(
         "building call hierarchy for %zu process/thread/node combinations...",
         graph.size());
     graph.build_hierarchy();
-
     return all_success;
 }
 
@@ -515,17 +326,12 @@ bool TraceReader::read_directory(const std::string& directory,
     }
 
     std::vector<std::string> trace_files;
-
-    // collect all matching files
     for (const auto& entry : fs::directory_iterator(directory)) {
-        if (entry.is_regular_file()) {
-            std::string filename = entry.path().filename().string();
-
-            // simple pattern matching (for now, just check file extension)
-            if (pattern == "*" ||
-                filename.find(pattern.substr(1)) != std::string::npos) {
-                trace_files.push_back(entry.path().string());
-            }
+        if (!entry.is_regular_file()) continue;
+        std::string filename = entry.path().filename().string();
+        if (pattern == "*" ||
+            filename.find(pattern.substr(1)) != std::string::npos) {
+            trace_files.push_back(entry.path().string());
         }
     }
 
@@ -535,108 +341,21 @@ bool TraceReader::read_directory(const std::string& directory,
         return false;
     }
 
-    // sort files for consistent processing order
     std::sort(trace_files.begin(), trace_files.end());
-
     DFTRACER_UTILS_LOG_INFO("found %zu trace files in %s", trace_files.size(),
                             directory.c_str());
-
     return read_multiple(trace_files, graph);
 }
 
-bool TraceReader::process_trace_line(const std::string& line, CallTree& graph) {
-    yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0);
-    if (!doc) {
-        return false;
-    }
-
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root) {
-        yyjson_doc_free(doc);
-        return false;
-    }
-
-    // get basic fields
-    yyjson_val* id_val = yyjson_obj_get(root, "id");
-    yyjson_val* name_val = yyjson_obj_get(root, "name");
-    yyjson_val* cat_val = yyjson_obj_get(root, "cat");
-    yyjson_val* pid_val = yyjson_obj_get(root, "pid");
-    yyjson_val* ph_val = yyjson_obj_get(root, "ph");
-    yyjson_val* ts_val = yyjson_obj_get(root, "ts");
-    yyjson_val* dur_val = yyjson_obj_get(root, "dur");
-    yyjson_val* args_val = yyjson_obj_get(root, "args");
-
-    // skip metadata entries
-    if (!ph_val || !yyjson_is_str(ph_val) ||
-        strcmp(yyjson_get_str(ph_val), "X") != 0) {
-        yyjson_doc_free(doc);
-        return true;  // not an error just skip
-    }
-
-    if (!id_val || !name_val || !pid_val || !ts_val) {
-        yyjson_doc_free(doc);
-        return false;
-    }
-
-    std::uint64_t call_id = yyjson_get_uint(id_val);
-    std::uint64_t pid = yyjson_get_uint(pid_val);
-    std::string name = yyjson_get_str(name_val);
-    std::string category = cat_val ? yyjson_get_str(cat_val) : "";
-    std::uint64_t start_time = yyjson_get_uint(ts_val);
-    std::uint64_t duration = dur_val ? yyjson_get_uint(dur_val) : 0;
-
-    // get level, tid, and node_id from args
-    int level = 0;
-    std::uint32_t tid = 0;
-    std::uint32_t node_id = 0;
-
-    // Collect all args
-    std::unordered_map<std::string, std::string> args;
-
-    if (args_val && yyjson_is_obj(args_val)) {
-        yyjson_val* level_val = yyjson_obj_get(args_val, "level");
-        if (level_val) {
-            level = yyjson_get_int(level_val);
-        }
-
-        yyjson_val* tid_val = yyjson_obj_get(args_val, "tid");
-        if (tid_val) {
-            tid = static_cast<std::uint32_t>(yyjson_get_uint(tid_val));
-        }
-
-        yyjson_val* node_val = yyjson_obj_get(args_val, "node_id");
-        if (node_val) {
-            node_id = static_cast<std::uint32_t>(yyjson_get_uint(node_val));
-        }
-
-        // Store all args
-        yyjson_obj_iter iter;
-        yyjson_obj_iter_init(args_val, &iter);
-        yyjson_val *arg_key, *arg_val;
-        while ((arg_key = yyjson_obj_iter_next(&iter))) {
-            arg_val = yyjson_obj_iter_get_val(arg_key);
-            if (yyjson_is_str(arg_val)) {
-                args[yyjson_get_str(arg_key)] = yyjson_get_str(arg_val);
-            } else if (yyjson_is_int(arg_val)) {
-                args[yyjson_get_str(arg_key)] =
-                    std::to_string(yyjson_get_int(arg_val));
-            } else if (yyjson_is_uint(arg_val)) {
-                args[yyjson_get_str(arg_key)] =
-                    std::to_string(yyjson_get_uint(arg_val));
-            }
-        }
-    }
-
-    // Create function call using factory
-    ProcessKey key(static_cast<std::uint32_t>(pid), tid, node_id);
-    auto call = graph.get_factory().create_node(
-        call_id, name, category, start_time, duration, level, args);
-
-    // Add call to graph
-    graph.add_call(key, call);
+bool TraceReader::process_trace_line(JsonParser& parser, CallTree& graph) {
+    auto res = ingest_event(parser, graph, nullptr);
+    return res.parsed;
+}
 
-    yyjson_doc_free(doc);
-    return true;
+bool TraceReader::process_trace_line(const std::string& line, CallTree& graph) {
+    JsonParser parser;
+    if (!parser.parse(line)) return false;
+    return process_trace_line(parser, graph);
 }
 
 // ============================================================================
@@ -705,6 +424,25 @@ bool CallTree::load(const std::string& trace_file) {
     return reader.read(trace_file, *this);
 }
 
+void CallTree::merge_from(CallTree&& other) {
+    for (auto& [key, src_graph] : other.process_graphs_) {
+        if (!src_graph) continue;
+        auto it = process_graphs_.find(key);
+        if (it == process_graphs_.end()) {
+            process_graphs_.emplace(key, std::move(src_graph));
+        } else {
+            auto& dst = *it->second;
+            for (auto& [id, node] : src_graph->calls) {
+                dst.calls[id] = std::move(node);
+            }
+            dst.call_sequence.insert(dst.call_sequence.end(),
+                                     src_graph->call_sequence.begin(),
+                                     src_graph->call_sequence.end());
+        }
+    }
+    other.process_graphs_.clear();
+}
+
 void CallTree::add_call(const ProcessKey& key,
                         std::shared_ptr<CallTreeNode> call) {
     // make sure process graph exists
@@ -755,43 +493,57 @@ void CallTree::build_hierarchy_internal(ProcessCallTree* graph) {
         sorted_calls.push_back(call);
     }
 
-    // sort by start time to build hierarchy
     std::sort(sorted_calls.begin(), sorted_calls.end(),
               [](const auto& a, const auto& b) {
-                  return a->get_start_time() < b->get_start_time();
+                  std::uint64_t a_start = a->get_start_time();
+                  std::uint64_t b_start = b->get_start_time();
+                  if (a_start != b_start) return a_start < b_start;
+                  std::uint64_t a_end = a_start + a->get_duration();
+                  std::uint64_t b_end = b_start + b->get_duration();
+                  if (a_end != b_end) return a_end > b_end;
+                  return a->get_level() < b->get_level();
               });
 
-    // find parents for each call
+    struct OpenEntry {
+        std::uint64_t end_time;
+        std::uint64_t id;
+    };
+    std::vector<std::vector<OpenEntry>> open_by_level;
+
     for (auto& call : sorted_calls) {
-        bool found_parent = false;
-
-        // look for parent that contains this call
-        for (auto& potential_parent : sorted_calls) {
-            if (potential_parent->get_id() == call->get_id()) continue;
-
-            std::uint64_t parent_end = potential_parent->get_start_time() +
-                                       potential_parent->get_duration();
-
-            // check if call is inside parent timespan and level is correct
-            if (call->get_start_time() >= potential_parent->get_start_time() &&
-                (call->get_start_time() + call->get_duration()) <= parent_end &&
-                call->get_level() > potential_parent->get_level()) {
-                // find closest parent by level
-                if (!found_parent ||
-                    potential_parent->get_level() >
-                        graph->calls[call->get_parent_id()]->get_level()) {
-                    call->set_parent_id(potential_parent->get_id());
-                    found_parent = true;
+        const std::uint64_t call_start = call->get_start_time();
+        const std::uint64_t call_end = call_start + call->get_duration();
+        const int call_level = call->get_level();
+
+        std::uint64_t parent_id = 0;
+        int probe_max =
+            std::min<int>(call_level, static_cast<int>(open_by_level.size())) -
+            1;
+        for (int lvl = probe_max; lvl >= 0; --lvl) {
+            auto& stack = open_by_level[lvl];
+            while (!stack.empty() && stack.back().end_time < call_start) {
+                stack.pop_back();
+            }
+            for (auto sit = stack.rbegin(); sit != stack.rend(); ++sit) {
+                if (sit->end_time >= call_end) {
+                    parent_id = sit->id;
+                    break;
                 }
             }
+            if (parent_id != 0) break;
         }
 
-        // add to parent children or root
-        if (found_parent) {
-            graph->calls[call->get_parent_id()]->add_child(call->get_id());
+        if (parent_id != 0) {
+            call->set_parent_id(parent_id);
+            graph->calls[parent_id]->add_child(call->get_id());
         } else {
             graph->root_calls.push_back(call->get_id());
         }
+
+        if (call_level >= static_cast<int>(open_by_level.size())) {
+            open_by_level.resize(call_level + 1);
+        }
+        open_by_level[call_level].push_back({call_end, call->get_id()});
     }
 }
 
@@ -870,9 +622,11 @@ void CallTree::print_calls_recursive(const ProcessCallTree& graph,
     }
 
     // print call info
-    printf("%s [%s] level=%d dur=%luus ts=%lu\n", call->get_name().c_str(),
-           call->get_category().c_str(), call->get_level(),
-           (unsigned long)call->get_duration(),
+    auto nm = call->get_name();
+    auto ct = call->get_category();
+    printf("%.*s [%.*s] level=%d dur=%luus ts=%lu\n",
+           static_cast<int>(nm.size()), nm.data(), static_cast<int>(ct.size()),
+           ct.data(), call->get_level(), (unsigned long)call->get_duration(),
            (unsigned long)call->get_start_time());
 
     // print children
diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp
index 8a6fe6de..dd9fe8cc 100644
--- a/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp
+++ b/src/dftracer/utils/utilities/call_tree/call_tree_mpi.cpp
@@ -2,1188 +2,493 @@
 #include <dftracer/utils/call_tree/internal/process_call_tree.h>
 #include <dftracer/utils/call_tree/internal/process_key.h>
 #include <dftracer/utils/call_tree/internal/trace_reader.h>
-#include <dftracer/utils/call_tree/mpi/build_task.h>
+#include <dftracer/utils/call_tree/json_serializer.h>
 #include <dftracer/utils/call_tree/mpi/builder.h>
-#include <dftracer/utils/call_tree/mpi/config.h>
-#include <dftracer/utils/call_tree/mpi/file_header.h>
-#include <dftracer/utils/call_tree/mpi/filtered_reader.h>
-#include <dftracer/utils/call_tree/mpi/pid_index_info.h>
-#include <dftracer/utils/call_tree/mpi/serializable.h>
-#include <dftracer/utils/call_tree/mpi/serialization.h>
+#include <dftracer/utils/core/common/byte_view.h>
 #include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/common/format_detector.h>
 #include <dftracer/utils/core/common/logging.h>
 #include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/core/pipeline/executor.h>
-#include <dftracer/utils/core/pipeline/pipeline.h>
-#include <dftracer/utils/utilities/indexer/internal/indexer.h>
-#include <dftracer/utils/utilities/indexer/internal/indexer_factory.h>
-#include <dftracer/utils/utilities/reader/internal/line_processor.h>
-#include <dftracer/utils/utilities/reader/internal/reader.h>
-#include <dftracer/utils/utilities/reader/internal/reader_factory.h>
-#include <yyjson.h>
-#include <zlib.h>
+#include <dftracer/utils/utilities/fileio/parallel/merge.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <dftracer/utils/utilities/reader/trace_reader.h>
+#include <mpi.h>
+#include <unistd.h>
 
 #include <algorithm>
-#include <chrono>
+#include <atomic>
+#include <cstdint>
 #include <cstdio>
-#include <fstream>
-#include <iostream>
-#include <sstream>
+#include <cstring>
+#include <ctime>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
 
 namespace dftracer::utils::call_tree {
 
-// ============================================================================
-// Serialization Utilities
-// ============================================================================
-
-namespace serialization {
-
-void write_uint32(std::vector<char>& buffer, std::uint32_t value) {
-    buffer.insert(buffer.end(), reinterpret_cast<const char*>(&value),
-                  reinterpret_cast<const char*>(&value) + sizeof(value));
-}
-
-void write_uint64(std::vector<char>& buffer, std::uint64_t value) {
-    buffer.insert(buffer.end(), reinterpret_cast<const char*>(&value),
-                  reinterpret_cast<const char*>(&value) + sizeof(value));
-}
-
-void write_int(std::vector<char>& buffer, int value) {
-    buffer.insert(buffer.end(), reinterpret_cast<const char*>(&value),
-                  reinterpret_cast<const char*>(&value) + sizeof(value));
-}
-
-void write_string(std::vector<char>& buffer, const std::string& str) {
-    std::uint32_t len = static_cast<std::uint32_t>(str.size());
-    write_uint32(buffer, len);
-    buffer.insert(buffer.end(), str.begin(), str.end());
-}
-
-std::uint32_t read_uint32(const char* data, size_t& offset) {
-    std::uint32_t value;
-    std::memcpy(&value, data + offset, sizeof(value));
-    offset += sizeof(value);
-    return value;
-}
-
-std::uint64_t read_uint64(const char* data, size_t& offset) {
-    std::uint64_t value;
-    std::memcpy(&value, data + offset, sizeof(value));
-    offset += sizeof(value);
-    return value;
-}
-
-int read_int(const char* data, size_t& offset) {
-    int value;
-    std::memcpy(&value, data + offset, sizeof(value));
-    offset += sizeof(value);
-    return value;
-}
-
-std::string read_string(const char* data, size_t& offset) {
-    std::uint32_t len = read_uint32(data, offset);
-    std::string str(data + offset, len);
-    offset += len;
-    return str;
-}
-
-}  // namespace serialization
-
-// ============================================================================
-// SerializableCallNode Implementation
-// ============================================================================
-
-std::vector<char> SerializableCallNode::serialize() const {
-    std::vector<char> buffer;
-
-    serialization::write_uint64(buffer, id);
-    serialization::write_string(buffer, name);
-    serialization::write_string(buffer, category);
-    serialization::write_uint64(buffer, start_time);
-    serialization::write_uint64(buffer, duration);
-    serialization::write_int(buffer, level);
-    serialization::write_uint64(buffer, parent_id);
-
-    // Children
-    serialization::write_uint32(buffer,
-                                static_cast<std::uint32_t>(children.size()));
-    for (auto child_id : children) {
-        serialization::write_uint64(buffer, child_id);
-    }
-
-    // Args
-    serialization::write_uint32(buffer,
-                                static_cast<std::uint32_t>(args.size()));
-    for (const auto& [key, value] : args) {
-        serialization::write_string(buffer, key);
-        serialization::write_string(buffer, value);
-    }
-
-    return buffer;
-}
-
-SerializableCallNode SerializableCallNode::deserialize(const char* data,
-                                                       size_t& offset) {
-    SerializableCallNode node;
-
-    node.id = serialization::read_uint64(data, offset);
-    node.name = serialization::read_string(data, offset);
-    node.category = serialization::read_string(data, offset);
-    node.start_time = serialization::read_uint64(data, offset);
-    node.duration = serialization::read_uint64(data, offset);
-    node.level = serialization::read_int(data, offset);
-    node.parent_id = serialization::read_uint64(data, offset);
-
-    // Children
-    std::uint32_t num_children = serialization::read_uint32(data, offset);
-    node.children.reserve(num_children);
-    for (std::uint32_t i = 0; i < num_children; i++) {
-        node.children.push_back(serialization::read_uint64(data, offset));
-    }
-
-    // Args
-    std::uint32_t num_args = serialization::read_uint32(data, offset);
-    for (std::uint32_t i = 0; i < num_args; i++) {
-        std::string key = serialization::read_string(data, offset);
-        std::string value = serialization::read_string(data, offset);
-        node.args[key] = value;
-    }
-
-    return node;
-}
-
-// ============================================================================
-// SerializableProcessGraph Implementation
-// ============================================================================
-
-std::vector<char> SerializableProcessGraph::serialize() const {
-    std::vector<char> buffer;
-
-    // Key
-    serialization::write_uint32(buffer, key.pid);
-    serialization::write_uint32(buffer, key.tid);
-    serialization::write_uint32(buffer, key.node_id);
-
-    // Nodes
-    serialization::write_uint32(buffer,
-                                static_cast<std::uint32_t>(nodes.size()));
-    for (const auto& node : nodes) {
-        auto node_data = node.serialize();
-        serialization::write_uint32(
-            buffer, static_cast<std::uint32_t>(node_data.size()));
-        buffer.insert(buffer.end(), node_data.begin(), node_data.end());
-    }
-
-    // Root calls
-    serialization::write_uint32(buffer,
-                                static_cast<std::uint32_t>(root_calls.size()));
-    for (auto id : root_calls) {
-        serialization::write_uint64(buffer, id);
-    }
-
-    // Call sequence
-    serialization::write_uint32(
-        buffer, static_cast<std::uint32_t>(call_sequence.size()));
-    for (auto id : call_sequence) {
-        serialization::write_uint64(buffer, id);
-    }
-
-    return buffer;
-}
-
-SerializableProcessGraph SerializableProcessGraph::deserialize(const char* data,
-                                                               size_t& offset) {
-    SerializableProcessGraph graph;
-
-    // Key
-    graph.key.pid = serialization::read_uint32(data, offset);
-    graph.key.tid = serialization::read_uint32(data, offset);
-    graph.key.node_id = serialization::read_uint32(data, offset);
-
-    // Nodes
-    std::uint32_t num_nodes = serialization::read_uint32(data, offset);
-    graph.nodes.reserve(num_nodes);
-    for (std::uint32_t i = 0; i < num_nodes; i++) {
-        std::uint32_t node_size = serialization::read_uint32(data, offset);
-        (void)node_size;  // Not needed for deserialization
-        graph.nodes.push_back(SerializableCallNode::deserialize(data, offset));
-    }
-
-    // Root calls
-    std::uint32_t num_roots = serialization::read_uint32(data, offset);
-    graph.root_calls.reserve(num_roots);
-    for (std::uint32_t i = 0; i < num_roots; i++) {
-        graph.root_calls.push_back(serialization::read_uint64(data, offset));
-    }
-
-    // Call sequence
-    std::uint32_t num_seq = serialization::read_uint32(data, offset);
-    graph.call_sequence.reserve(num_seq);
-    for (std::uint32_t i = 0; i < num_seq; i++) {
-        graph.call_sequence.push_back(serialization::read_uint64(data, offset));
-    }
-
-    return graph;
-}
-
-// ============================================================================
-// MPIFilteredTraceReader Implementation
-// ============================================================================
-
-MPIFilteredTraceReader::MPIFilteredTraceReader(
-    const std::set<std::uint32_t>& allowed_pids)
-    : allowed_pids_(allowed_pids), processed_count_(0), filtered_count_(0) {}
-
-bool MPIFilteredTraceReader::read(const std::string& trace_file,
-                                  internal::CallTree& graph) {
-    // Check if it's a gzip file
-    ArchiveFormat format = FormatDetector::detect(trace_file);
-
-    if (format == ArchiveFormat::GZIP) {
-        std::string index_path =
-            utilities::composites::dft::internal::determine_index_path(
-                trace_file, "");
-        if (fs::exists(index_path)) {
-            return read_with_indexer(trace_file, index_path, graph);
-        }
-    }
-
-    // Fall back to direct reading for plain text files
-    std::ifstream file(trace_file);
-    if (!file.is_open()) {
-        DFTRACER_UTILS_LOG_ERROR("Cannot open trace file: %s",
-                                 trace_file.c_str());
-        return false;
-    }
-
-    std::string line;
-    size_t line_count = 0;
-
-    while (std::getline(file, line)) {
-        line_count++;
-
-        // Skip brackets and empty lines
-        if (line.empty() || line == "[" || line == "]") {
-            continue;
-        }
-
-        // Remove trailing comma
-        if (!line.empty() && line.back() == ',') {
-            line.pop_back();
-        }
-
-        yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0);
-        if (!doc) {
-            continue;
-        }
-
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        if (!root) {
-            yyjson_doc_free(doc);
-            continue;
-        }
-
-        // Check PID filter
-        yyjson_val* pid_val = yyjson_obj_get(root, "pid");
-        if (pid_val) {
-            std::uint32_t pid =
-                static_cast<std::uint32_t>(yyjson_get_uint(pid_val));
-
-            // Only process if PID is in our allowed set
-            if (allowed_pids_.find(pid) != allowed_pids_.end()) {
-                // Use the standard internal::TraceReader processing
-                internal::TraceReader reader;
-                if (reader.process_trace_line(line, graph)) {
-                    processed_count_++;
-                }
-            } else {
-                filtered_count_++;
-            }
-        }
-
-        yyjson_doc_free(doc);
-    }
-
-    return true;
-}
-
-/**
- * Line processor for filtered reading with indexer
- */
-class FilteredLineProcessor
-    : public utilities::reader::internal::LineProcessor {
-   public:
-    FilteredLineProcessor(const std::set<std::uint32_t>& allowed_pids,
-                          internal::CallTree& graph,
-                          std::size_t& processed_count,
-                          std::size_t& filtered_count)
-        : allowed_pids_(allowed_pids),
-          graph_(graph),
-          processed_count_(processed_count),
-          filtered_count_(filtered_count),
-          reader_() {}
-
-    coro::CoroTask<bool> process(const char* data,
-                                 std::size_t length) override {
-        if (length == 0) {
-            co_return true;
-        }
-
-        std::string line(data, length);
-
-        // Skip brackets
-        if (line == "[" || line == "]") {
-            co_return true;
-        }
-
-        // Remove trailing comma
-        if (!line.empty() && line.back() == ',') {
-            line.pop_back();
-        }
-
-        // Quick PID check
-        yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0);
-        if (!doc) {
-            co_return true;
-        }
-
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        if (!root) {
-            yyjson_doc_free(doc);
-            co_return true;
-        }
-
-        yyjson_val* pid_val = yyjson_obj_get(root, "pid");
-        if (pid_val) {
-            std::uint32_t pid =
-                static_cast<std::uint32_t>(yyjson_get_uint(pid_val));
-
-            if (allowed_pids_.find(pid) != allowed_pids_.end()) {
-                if (reader_.process_trace_line(line, graph_)) {
-                    processed_count_++;
-                }
-            } else {
-                filtered_count_++;
+namespace {
+
+bool is_trace_file(const std::string& path) {  // true for *.pfw / *.pfw.gz
+    const auto ends_with = [&path](const char* sfx, std::size_t n) {
+        return path.size() >= n && path.compare(path.size() - n, n, sfx) == 0;
+    };
+    return ends_with(".pfw", 4) || ends_with(".pfw.gz", 7);
+}
+
+coro::CoroTask<void> scan_file_pids(std::string path,
+                                    std::set<std::uint32_t>* out);
+coro::CoroTask<void> ingest_file(std::string path, internal::CallTree* tree,
+                                 const std::set<std::uint32_t>* pids,
+                                 std::atomic<std::size_t>* total);
+coro::CoroTask<void> build_hierarchy_one(internal::CallTree* tree,
+                                         internal::ProcessKey key);
+coro::CoroTask<void> serialize_one(const internal::CallTree* tree,
+                                   internal::ProcessKey key,
+                                   const std::string* hostname_hash,
+                                   std::vector<std::string>* slice_buffers,
+                                   std::size_t index, std::uint64_t start_idx);
+
+coro::CoroTask<void> scan_files_into(
+    CoroScope* child, const std::vector<std::string>* paths,
+    std::vector<std::set<std::uint32_t>>* per_file);
+coro::CoroTask<void> ingest_files_into(
+    CoroScope* child, const std::vector<std::string>* paths,
+    const std::vector<std::unique_ptr<internal::CallTree>>* per_file,
+    const std::set<std::uint32_t>* pids, std::atomic<std::size_t>* total);
+coro::CoroTask<void> hierarchy_all(
+    CoroScope* child, internal::CallTree* tree,
+    const std::vector<internal::ProcessKey>* keys);
+coro::CoroTask<void> serialize_all(
+    CoroScope* child, const internal::CallTree* tree,
+    const std::vector<internal::ProcessKey>* keys,
+    const std::string* hostname_hash, std::vector<std::string>* slice_buffers,
+    std::uint64_t rank_base, std::uint64_t stride);
+
+coro::CoroTask<void> scan_file_pids(std::string path,
+                                    std::set<std::uint32_t>* out) {
+    // Inserts the "pid" of every record yielded by read_json() into *out.
+    namespace rd = utilities::reader;
+    rd::TraceReaderConfig reader_cfg;
+    reader_cfg.file_path = std::move(path);
+    reader_cfg.auto_build_index = true;
+    rd::TraceReader trace_reader(std::move(reader_cfg));
+    auto events = trace_reader.read_json(rd::ReadConfig{});
+    while (auto event = co_await events.next()) {
+        // Records without a "pid" value contribute nothing.
+        if (auto pid = event->parser->get_uint64("pid"))
+            out->insert(static_cast<std::uint32_t>(*pid));
+    }
+}
+
+coro::CoroTask<void> scan_files_into(
+    CoroScope* child, const std::vector<std::string>* paths,
+    std::vector<std::set<std::uint32_t>>* per_file) {
+    // Spawns one scan task per path; task n only touches (*per_file)[n], so
+    // *per_file must already hold at least paths->size() sets.
+    for (std::size_t n = 0; n < paths->size(); ++n) {
+        child->spawn([file = (*paths)[n], pid_set = &(*per_file)[n]](
+                         CoroScope&) mutable -> coro::CoroTask<void> {
+            co_await scan_file_pids(std::move(file), pid_set);
+        });
+    }
+    co_return;
+}
+
+coro::CoroTask<void> ingest_files_into(
+    CoroScope* child, const std::vector<std::string>* paths,
+    const std::vector<std::unique_ptr<internal::CallTree>>* per_file,
+    const std::set<std::uint32_t>* pids, std::atomic<std::size_t>* total) {
+    // Fans out one ingest task per file; file n feeds the tree in slot n.
+    for (std::size_t n = 0; n < paths->size(); ++n) {
+        internal::CallTree* dst = (*per_file)[n].get();
+        child->spawn([file = (*paths)[n], dst, pids, total](
+                         CoroScope&) mutable -> coro::CoroTask<void> {
+            co_await ingest_file(std::move(file), dst, pids, total);
+        });
+    }
+    co_return;
+}
+
+coro::CoroTask<void> hierarchy_all(
+    CoroScope* child, internal::CallTree* tree,
+    const std::vector<internal::ProcessKey>* keys) {
+    // Spawns one hierarchy-building task per process key on `child`.
+    for (const internal::ProcessKey& pk : *keys)
+        child->spawn([tree, pk](CoroScope&) mutable -> coro::CoroTask<void> {
+            co_await build_hierarchy_one(tree, pk);
+        });
+    co_return;
+}
+
+coro::CoroTask<void> serialize_all(
+    CoroScope* child, const internal::CallTree* tree,
+    const std::vector<internal::ProcessKey>* keys,
+    const std::string* hostname_hash, std::vector<std::string>* slice_buffers,
+    std::uint64_t rank_base, std::uint64_t stride) {
+    // Key n fills (*slice_buffers)[n], numbering from rank_base + n * stride.
+    for (std::size_t n = 0; n < keys->size(); ++n) {
+        child->spawn([tree, hostname_hash, slice_buffers, n, pk = (*keys)[n],
+                      first_idx = rank_base + n * stride](
+                         CoroScope&) mutable -> coro::CoroTask<void> {
+            co_await serialize_one(tree, pk, hostname_hash, slice_buffers, n,
+                                   first_idx);
+        });
+    }
+    co_return;
+}
+
+coro::CoroTask<void> ingest_file(std::string path, internal::CallTree* tree,
+                                 const std::set<std::uint32_t>* pids,
+                                 std::atomic<std::size_t>* total) {
+    const auto file_stats =
+        co_await internal::read_trace_file_async(std::move(path), tree, pids);
+    total->fetch_add(file_stats.processed, std::memory_order_relaxed);
+}
+
+coro::CoroTask<void> build_hierarchy_one(internal::CallTree* tree,
+                                         internal::ProcessKey key) {
+    tree->build_hierarchy_for_process(key);
+    co_return;  // no co_await above: the call itself is synchronous
+}
+
+void serialize_process_dfs(const internal::ProcessCallTree& pgraph,
+                           const internal::ProcessKey& key,
+                           internal::JsonSerializer& serializer,
+                           std::uint64_t start_idx, std::string& out) {
+    char buffer[16384];  // scratch handed to serialize_node for each node
+    std::uint64_t idx = start_idx;
+    for (std::uint64_t root_id : pgraph.root_calls) {
+        std::vector<std::uint64_t> stack;
+        stack.push_back(root_id);
+        while (!stack.empty()) {  // iterative pre-order walk of this root
+            std::uint64_t nid = stack.back();
+            stack.pop_back();
+            auto it = pgraph.calls.find(nid);
+            if (it == pgraph.calls.end()) continue;  // id not in calls: skip
+            const auto& node = it->second;
+            std::size_t w = serializer.serialize_node(
+                buffer, static_cast<int>(idx++), *node, key.pid, key.tid);
+            if (w > 0) {
+                out.append(buffer, w - 1);  // keeps the first w - 1 bytes
+                out.append(",\n", 2);
             }
+            const auto& children = node->get_children();
+            for (auto cit = children.rbegin(); cit != children.rend(); ++cit)
+                stack.push_back(*cit);  // reversed so first child pops first
         }
-
-        yyjson_doc_free(doc);
-        co_return true;
-    }
-
-   private:
-    const std::set<std::uint32_t>& allowed_pids_;
-    internal::CallTree& graph_;
-    std::size_t& processed_count_;
-    std::size_t& filtered_count_;
-    internal::TraceReader reader_;
-};
-
-bool MPIFilteredTraceReader::read_with_indexer(const std::string& trace_file,
-                                               const std::string& index_file,
-                                               internal::CallTree& graph) {
-    try {
-        auto reader = utilities::reader::internal::ReaderFactory::create(
-            trace_file, index_file);
-        if (!reader || !reader->is_valid()) {
-            DFTRACER_UTILS_LOG_ERROR("Failed to create reader for %s",
-                                     trace_file.c_str());
-            return read(trace_file, graph);  // Fallback
-        }
-
-        FilteredLineProcessor processor(allowed_pids_, graph, processed_count_,
-                                        filtered_count_);
-
-        std::size_t num_lines = reader->get_num_lines();
-        if (num_lines > 0) {
-            reader->read_lines_with_processor(1, num_lines, processor);
-        }
-
-        return true;
-    } catch (const std::exception& e) {
-        DFTRACER_UTILS_LOG_ERROR("Error reading with indexer: %s", e.what());
-        return false;
     }
 }
 
-bool MPIFilteredTraceReader::read_multiple(
-    const std::vector<std::string>& trace_files, internal::CallTree& graph) {
-    for (const auto& file : trace_files) {
-        if (!read(file, graph)) {
-            return false;
-        }
+coro::CoroTask<void> serialize_one(const internal::CallTree* tree,
+                                   internal::ProcessKey key,
+                                   const std::string* hostname_hash,
+                                   std::vector<std::string>* slice_buffers,
+                                   std::size_t index, std::uint64_t start_idx) {
+    auto* pgraph = const_cast<internal::CallTree*>(tree)->get(key);
+    if (pgraph) {  // no graph for key: (*slice_buffers)[index] is untouched
+        internal::JsonSerializer serializer;
+        char init[8];  // NOTE(review): confirm initialize() fits in 8 bytes
+        serializer.initialize(init, *hostname_hash);
+        (void)init;
+        serialize_process_dfs(*pgraph, key, serializer, start_idx,
+                              (*slice_buffers)[index]);
     }
-    return true;
+    co_return;
 }
 
-// ============================================================================
-// MPICallTreeBuilder Implementation
-// ============================================================================
+}  // namespace
 
 MPICallTreeBuilder::MPICallTreeBuilder(const MPICallTreeConfig& config)
-    : config_(config),
-      call_tree_(std::make_unique<internal::CallTree>()),
-      trace_files_(),
-      indexers_(),
-      pid_index_map_(),
-      assigned_pids_(),
-      all_pids_(),
-      initialized_(false),
-      pids_discovered_(false),
-      graphs_built_(false),
-      graphs_gathered_(false) {}
-
-MPICallTreeBuilder::~MPICallTreeBuilder() {
-    if (initialized_) {
-        cleanup();
-    }
-}
-
-MPICallTreeBuilder::MPICallTreeBuilder(MPICallTreeBuilder&& other) noexcept
-    : config_(std::move(other.config_)),
-      call_tree_(std::move(other.call_tree_)),
-      trace_files_(std::move(other.trace_files_)),
-      indexers_(std::move(other.indexers_)),
-      pid_index_map_(std::move(other.pid_index_map_)),
-      assigned_pids_(std::move(other.assigned_pids_)),
-      all_pids_(std::move(other.all_pids_)),
-      initialized_(other.initialized_),
-      pids_discovered_(other.pids_discovered_),
-      graphs_built_(other.graphs_built_),
-      graphs_gathered_(other.graphs_gathered_) {
-    other.initialized_ = false;
-}
-
-MPICallTreeBuilder& MPICallTreeBuilder::operator=(
-    MPICallTreeBuilder&& other) noexcept {
-    if (this != &other) {
-        if (initialized_) {
-            cleanup();
-        }
-        config_ = std::move(other.config_);
-        call_tree_ = std::move(other.call_tree_);
-        trace_files_ = std::move(other.trace_files_);
-        indexers_ = std::move(other.indexers_);
-        pid_index_map_ = std::move(other.pid_index_map_);
-        assigned_pids_ = std::move(other.assigned_pids_);
-        all_pids_ = std::move(other.all_pids_);
-        initialized_ = other.initialized_;
-        pids_discovered_ = other.pids_discovered_;
-        graphs_built_ = other.graphs_built_;
-        graphs_gathered_ = other.graphs_gathered_;
-        other.initialized_ = false;
-    }
-    return *this;
-}
-
-void MPICallTreeBuilder::initialize() {
-    if (initialized_) {
-        return;
-    }
-
-    // Initialize MPI utilities singleton
-    mpi::MPIUtils::instance().initialize();
-
+    : config_(config), call_tree_(std::make_unique<internal::CallTree>()) {
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank_);  // NOTE: requires prior MPI_Init
+    MPI_Comm_size(MPI_COMM_WORLD, &world_size_);
     call_tree_->initialize();
-    initialized_ = true;
-
-    if (mpi::MPIUtils::instance().is_root() && config_.verbose) {
-        DFTRACER_UTILS_LOG_INFO(
-            "MPICallTreeBuilder initialized with %d MPI ranks",
-            mpi::MPIUtils::instance().get_world_size());
-    }
 }
 
-void MPICallTreeBuilder::cleanup() {
-    if (!initialized_) {
-        return;
-    }
-
-    call_tree_->cleanup();
-    indexers_.clear();
-    trace_files_.clear();
-    pid_index_map_.clear();
-    assigned_pids_.clear();
-    all_pids_.clear();
+MPICallTreeBuilder::~MPICallTreeBuilder() = default;
 
-    initialized_ = false;
-    pids_discovered_ = false;
-    graphs_built_ = false;
-    graphs_gathered_ = false;
-}
+MPICallTreeBuilder::MPICallTreeBuilder(MPICallTreeBuilder&&) noexcept = default;
+MPICallTreeBuilder& MPICallTreeBuilder::operator=(
+    MPICallTreeBuilder&&) noexcept = default;
 
 void MPICallTreeBuilder::add_trace_files(
     const std::vector<std::string>& files) {
-    for (const auto& file : files) {
-        if (fs::exists(file) && fs::is_regular_file(file)) {
-            trace_files_.push_back(file);
-        } else if (mpi::MPIUtils::instance().is_root()) {
-            DFTRACER_UTILS_LOG_WARN("File not found: %s", file.c_str());
-        }
-    }
-}
-
-void MPICallTreeBuilder::add_trace_directory(const std::string& directory,
-                                             const std::string& pattern) {
-    if (!fs::exists(directory) || !fs::is_directory(directory)) {
-        if (mpi::MPIUtils::instance().is_root()) {
-            DFTRACER_UTILS_LOG_ERROR("Directory not found: %s",
-                                     directory.c_str());
-        }
-        return;
-    }
-
-    // Recursively find all matching files
-    for (const auto& entry : fs::recursive_directory_iterator(directory)) {
-        if (entry.is_regular_file()) {
-            std::string filename = entry.path().filename().string();
-
-            // Simple pattern matching for *.ext or *.part1.part2 patterns
-            bool matches = false;
-            if (pattern == "*") {
-                matches = true;
-            } else if (pattern.front() == '*') {
-                // *.ext or *.pfw.gz pattern
-                std::string suffix = pattern.substr(1);  // Remove the leading *
-                matches = (filename.size() >= suffix.size() &&
-                           filename.substr(filename.size() - suffix.size()) ==
-                               suffix);
-            } else {
-                matches = (filename.find(pattern) != std::string::npos);
-            }
-
-            if (matches) {
-                trace_files_.push_back(entry.path().string());
-            }
-        }
-    }
-
+    trace_files_.insert(trace_files_.end(), files.begin(), files.end());
     std::sort(trace_files_.begin(), trace_files_.end());
-
-    if (mpi::MPIUtils::instance().is_root() && config_.verbose) {
-        DFTRACER_UTILS_LOG_INFO("Found %zu trace files in %s",
-                                trace_files_.size(), directory.c_str());
-    }
 }
 
-void MPICallTreeBuilder::create_indexer(const std::string& trace_file) {
-    if (indexers_.find(trace_file) != indexers_.end()) {
-        return;
-    }
-
-    ArchiveFormat format = FormatDetector::detect(trace_file);
-    if (format != ArchiveFormat::GZIP) {
-        return;  // Only create indexers for gzip files
-    }
-
-    std::string idx_file = trace_file + ".zindex";
-    std::uint64_t ckpt_size =
-        config_.checkpoint_size > 0
-            ? config_.checkpoint_size
-            : utilities::indexer::internal::Indexer::DEFAULT_CHECKPOINT_SIZE;
-
-    try {
-        auto indexer = utilities::indexer::internal::IndexerFactory::create(
-            trace_file, idx_file, ckpt_size, false);
-        if (indexer) {
-            // Build index if needed
-            if (indexer->need_rebuild()) {
-                if (mpi::MPIUtils::instance().is_root() && config_.verbose) {
-                    DFTRACER_UTILS_LOG_INFO("Building index for %s",
-                                            trace_file.c_str());
-                }
-                indexer->build();
-            }
-            indexers_[trace_file] = std::move(indexer);
-        }
-    } catch (const std::exception& e) {
-        if (config_.verbose) {
-            DFTRACER_UTILS_LOG_WARN("Could not create indexer for %s: %s",
-                                    trace_file.c_str(), e.what());
-        }
+void MPICallTreeBuilder::add_trace_directory(const std::string& directory,
+                                             const std::string& /*pattern*/) {
+    std::vector<std::string> files;
+    std::error_code ec;  // only reports a failure to open `directory`
+    for (const auto& entry : fs::directory_iterator(directory, ec)) {
+        std::error_code entry_ec;  // a failed stat skips just this entry
+        if (!entry.is_regular_file(entry_ec)) continue;
+        if (is_trace_file(entry.path().string()))
+            files.push_back(entry.path().string());
     }
+    add_trace_files(files);
 }
 
-std::set<std::uint32_t> MPICallTreeBuilder::scan_file_for_pids(
-    const std::string& trace_file) {
-    std::set<std::uint32_t> pids;
-
-    // Check if it's a gzip file with an index
-    ArchiveFormat format = FormatDetector::detect(trace_file);
-    std::string index_path =
-        utilities::composites::dft::internal::determine_index_path(trace_file,
-                                                                   "");
-
-    if (format == ArchiveFormat::GZIP && fs::exists(index_path)) {
-        try {
-            auto reader = utilities::reader::internal::ReaderFactory::create(
-                trace_file, index_path);
-            if (reader && reader->is_valid()) {
-                // Read first N lines to discover PIDs
-                std::size_t num_lines = reader->get_num_lines();
-                std::string content = reader->read_lines(
-                    1, std::min(num_lines, (std::size_t)100000));
-
-                std::istringstream iss(content);
-                std::string line;
-                while (std::getline(iss, line)) {
-                    if (line.empty() || line == "[" || line == "]") continue;
-                    if (!line.empty() && line.back() == ',') line.pop_back();
-
-                    yyjson_doc* doc =
-                        yyjson_read(line.c_str(), line.length(), 0);
-                    if (doc) {
-                        yyjson_val* root = yyjson_doc_get_root(doc);
-                        if (root) {
-                            yyjson_val* pid_val = yyjson_obj_get(root, "pid");
-                            if (pid_val) {
-                                pids.insert(static_cast<std::uint32_t>(
-                                    yyjson_get_uint(pid_val)));
-                            }
-                        }
-                        yyjson_doc_free(doc);
-                    }
-                }
-
-                return pids;
-            }
-        } catch (const std::exception& e) {
-            // Fall through to direct reading
-        }
-    }
-
-    // For gzip files without index, use gzopen
-    if (format == ArchiveFormat::GZIP) {
-        gzFile gz = gzopen(trace_file.c_str(), "rb");
-        if (!gz) {
-            return pids;
-        }
-
-        char buffer[65536];
-        std::string current_line;
-        int line_count = 0;
-
-        while (line_count < 100000) {
-            int bytes_read = gzread(gz, buffer, sizeof(buffer) - 1);
-            if (bytes_read <= 0) break;
-            buffer[bytes_read] = '\0';
-
-            current_line += buffer;
+namespace {
 
-            // Process complete lines
-            size_t pos;
-            while ((pos = current_line.find('\n')) != std::string::npos) {
-                std::string line = current_line.substr(0, pos);
-                current_line = current_line.substr(pos + 1);
-                line_count++;
-
-                if (line.empty() || line == "[" || line == "]") continue;
-                if (!line.empty() && line.back() == ',') line.pop_back();
-
-                yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0);
-                if (doc) {
-                    yyjson_val* root = yyjson_doc_get_root(doc);
-                    if (root) {
-                        yyjson_val* pid_val = yyjson_obj_get(root, "pid");
-                        if (pid_val) {
-                            pids.insert(static_cast<std::uint32_t>(
-                                yyjson_get_uint(pid_val)));
-                        }
-                    }
-                    yyjson_doc_free(doc);
-                }
-
-                if (line_count >= 100000) break;
-            }
-        }
-
-        gzclose(gz);
-        return pids;
-    }
-
-    // Fall back to direct file reading for plain text
-    std::ifstream file(trace_file);
-    if (!file.is_open()) {
-        return pids;
-    }
-
-    std::string line;
-    int line_count = 0;
-    while (std::getline(file, line) && line_count < 100000) {
-        line_count++;
-        if (line.empty() || line == "[" || line == "]") continue;
-        if (!line.empty() && line.back() == ',') line.pop_back();
-
-        yyjson_doc* doc = yyjson_read(line.c_str(), line.length(), 0);
-        if (doc) {
-            yyjson_val* root = yyjson_doc_get_root(doc);
-            if (root) {
-                yyjson_val* pid_val = yyjson_obj_get(root, "pid");
-                if (pid_val) {
-                    pids.insert(
-                        static_cast<std::uint32_t>(yyjson_get_uint(pid_val)));
-                }
-            }
-            yyjson_doc_free(doc);
-        }
-    }
-
-    return pids;
-}
-
-void MPICallTreeBuilder::distribute_pids() {
-    // Round-robin distribution using MPIUtils singleton
-    auto& mpi = mpi::MPIUtils::instance();
-    assigned_pids_.clear();
-    for (size_t i = static_cast<size_t>(mpi.get_rank()); i < all_pids_.size();
-         i += static_cast<size_t>(mpi.get_world_size())) {
-        assigned_pids_.insert(all_pids_[i]);
-    }
+struct DiscoverState {
+    std::vector<std::string> my_paths;              // files of this rank
+    std::vector<std::set<std::uint32_t>> per_file;  // pids per my_paths entry
+};
 
-    if (config_.verbose) {
-        DFTRACER_UTILS_LOG_DEBUG("[Rank %d] Assigned %zu PIDs", mpi.get_rank(),
-                                 assigned_pids_.size());
-    }
+coro::CoroTask<void> discover_scan_phase(CoroScope* scope, DiscoverState* st) {
+    // Scans st->my_paths into st->per_file inside a nested child scope.
+    const auto* files = &st->my_paths;
+    auto* pid_sets = &st->per_file;
+    co_await scope->scope(
+        [files, pid_sets](CoroScope& child) mutable -> coro::CoroTask<void> {
+            co_await scan_files_into(&child, files, pid_sets);
+        });
 }
 
-std::map<std::uint32_t, PIDIndexInfo> MPICallTreeBuilder::discover_pids() {
-    if (!initialized_) {
-        initialize();
-    }
-
-    auto& mpi = mpi::MPIUtils::instance();
-
-    if (mpi.is_root() && config_.verbose) {
-        DFTRACER_UTILS_LOG_INFO(
-            "Phase 1: Discovering PIDs from %zu trace files...",
-            trace_files_.size());
-    }
-
-    // Broadcast file list from rank 0 using MPIUtils
-    int num_files = static_cast<int>(trace_files_.size());
-    mpi.broadcast_int(num_files, 0);
+}  // namespace
 
-    if (!mpi.is_root()) {
-        trace_files_.resize(num_files);
+coro::CoroTask<bool> MPICallTreeBuilder::discover_pids(CoroScope* scope) {
+    auto state = std::make_unique<DiscoverState>();
+    for (std::size_t i = 0; i < trace_files_.size(); ++i) {
+        if (static_cast<int>(i % static_cast<std::size_t>(world_size_)) ==
+            rank_)
+            state->my_paths.push_back(trace_files_[i]);
     }
+    state->per_file.resize(state->my_paths.size());
 
-    for (int i = 0; i < num_files; i++) {
-        mpi.broadcast_string(trace_files_[i], 0);
-    }
+    co_await discover_scan_phase(scope, state.get());
 
-    // Each rank scans files to discover PIDs
     std::set<std::uint32_t> local_pids;
-
-    for (const auto& trace_file : trace_files_) {
-        // Create indexer if needed
-        create_indexer(trace_file);
-
-        // Scan for PIDs
-        auto file_pids = scan_file_for_pids(trace_file);
-        local_pids.insert(file_pids.begin(), file_pids.end());
-
-        // Store PID index info
-        for (auto pid : file_pids) {
-            if (pid_index_map_.find(pid) == pid_index_map_.end()) {
-                pid_index_map_[pid] = PIDIndexInfo(pid, 0, 0, 0, trace_file);
-            }
-        }
-    }
-
-    // Gather all PIDs to rank 0 using MPIUtils
-    std::vector<std::uint32_t> local_pid_vec(local_pids.begin(),
-                                             local_pids.end());
-    std::vector<std::uint32_t> all_pids_gathered;
-    std::vector<int> recv_counts;
-    std::vector<int> displacements;
-
-    mpi.gatherv_uint32(local_pid_vec, all_pids_gathered, recv_counts,
-                       displacements, 0);
-
-    // Remove duplicates and sort on rank 0
-    if (mpi.is_root()) {
-        std::set<std::uint32_t> unique_pids(all_pids_gathered.begin(),
-                                            all_pids_gathered.end());
-        all_pids_.assign(unique_pids.begin(), unique_pids.end());
-        std::sort(all_pids_.begin(), all_pids_.end());
-
-        if (config_.verbose) {
-            DFTRACER_UTILS_LOG_INFO("Discovered %zu unique PIDs",
-                                    all_pids_.size());
-        }
-    }
-
-    // Broadcast unique PIDs to all ranks
-    mpi.broadcast_uint32_vector(all_pids_, 0);
-
-    // Distribute PIDs across ranks
-    distribute_pids();
-
-    mpi.barrier();
-
-    all_pids_.assign(local_pids.begin(), local_pids.end());
-    assigned_pids_ = local_pids;
-
-    pids_discovered_ = true;
-    return pid_index_map_;
-}
-
-bool MPICallTreeBuilder::read_traces_for_pids(
-    const std::vector<std::string>& files,
-    const std::set<std::uint32_t>& pids) {
-    MPIFilteredTraceReader reader(pids);
-    return reader.read_multiple(files, *call_tree_);
-}
-
-MPICallTreeResult MPICallTreeBuilder::build() {
-    MPICallTreeResult result;
-    auto& mpi = mpi::MPIUtils::instance();
-
-    if (!pids_discovered_) {
-        discover_pids();
-    }
-
-    if (assigned_pids_.empty()) {
-        if (config_.verbose) {
-            DFTRACER_UTILS_LOG_DEBUG(
-                "[Rank %d] No PIDs assigned, skipping build", mpi.get_rank());
-        }
-        result.success = true;
-        graphs_built_ = true;
-        return result;
-    }
-
-    if (mpi.is_root() && config_.verbose) {
-        DFTRACER_UTILS_LOG_INFO("%s", "Phase 2: Building call graphs...");
-    }
-
-    mpi.barrier();
-
-    auto start_time = std::chrono::high_resolution_clock::now();
-
-    // Use pipeline for parallel trace reading
-    if (config_.num_threads > 0) {
-        // For now, use simple sequential processing
-        // Pipeline can be expanded for more complex workflows
-        read_traces_for_pids(trace_files_, assigned_pids_);
-    } else {
-        // Sequential processing
-        read_traces_for_pids(trace_files_, assigned_pids_);
-    }
-
-    // Build hierarchy
-    call_tree_->build_hierarchy();
-
-    auto end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed = end_time - start_time;
-
-    result.elapsed_time_s = elapsed.count();
-    result.local_pids = assigned_pids_.size();
-    result.local_events = 0;
-
-    // Count events
-    for (const auto& key : call_tree_->keys()) {
-        auto* graph = call_tree_->get(key);
-        if (graph) {
-            result.local_events += graph->calls.size();
-        }
-    }
-
-    // Gather statistics using MPIUtils
-    mpi.reduce_sum_size_t(result.local_pids, result.total_pids, 0);
-    mpi.reduce_sum_size_t(result.local_events, result.total_events, 0);
-
-    double max_time = 0;
-    mpi.reduce_max_double(result.elapsed_time_s, max_time, 0);
-    result.elapsed_time_s = max_time;
-
-    result.success = true;
-    graphs_built_ = true;
-
-    if (mpi.is_root() && config_.verbose) {
-        DFTRACER_UTILS_LOG_INFO("Build completed in %.2f seconds",
-                                result.elapsed_time_s);
-        DFTRACER_UTILS_LOG_INFO("Total PIDs: %zu", result.total_pids);
-        DFTRACER_UTILS_LOG_INFO("Total events: %zu", result.total_events);
+    for (auto& s : state->per_file) local_pids.insert(s.begin(), s.end());
+    state.reset();
+
+    std::vector<std::uint32_t> local_vec(local_pids.begin(), local_pids.end());
+    int my_bytes = static_cast<int>(local_vec.size() * sizeof(std::uint32_t));
+    std::vector<int> rank_bytes(world_size_, 0);
+    MPI_Allgather(&my_bytes, 1, MPI_INT, rank_bytes.data(), 1, MPI_INT,
+                  MPI_COMM_WORLD);
+    std::vector<int> displs(world_size_, 0);
+    int total = 0;
+    for (int r = 0; r < world_size_; ++r) {
+        displs[r] = total;
+        total += rank_bytes[r];
+    }
+    std::vector<char> gathered(total);
+    MPI_Allgatherv(local_vec.data(), my_bytes, MPI_CHAR, gathered.data(),
+                   rank_bytes.data(), displs.data(), MPI_CHAR, MPI_COMM_WORLD);
+    for (int r = 0; r < world_size_; ++r) {
+        const auto* p =
+            reinterpret_cast<const std::uint32_t*>(gathered.data() + displs[r]);
+        const std::size_t n = rank_bytes[r] / sizeof(std::uint32_t);
+        for (std::size_t i = 0; i < n; ++i) all_pids_.insert(p[i]);
+    }
+
+    std::vector<std::uint32_t> sorted_pids(all_pids_.begin(), all_pids_.end());
+    for (std::size_t i = static_cast<std::size_t>(rank_);
+         i < sorted_pids.size(); i += static_cast<std::size_t>(world_size_)) {
+        assigned_pids_.insert(sorted_pids[i]);
+    }
+
+    if (config_.verbose && rank_ == 0) {
+        DFTRACER_UTILS_LOG_INFO(
+            "[rank 0] discovered %zu unique pids across %zu "
+            "files",
+            all_pids_.size(), trace_files_.size());
     }
-
-    return result;
+    co_return true;
 }
 
-SerializableProcessGraph MPICallTreeBuilder::convert_to_serializable(
-    const internal::ProcessCallTree& graph) const {
-    SerializableProcessGraph result;
-    result.key = graph.key;
-    result.root_calls = graph.root_calls;
-    result.call_sequence = graph.call_sequence;
+coro::CoroTask<bool> MPICallTreeBuilder::build(CoroScope* scope) {
+    if (assigned_pids_.empty()) co_return true;  // nothing assigned: success
 
-    for (const auto& [id, node] : graph.calls) {
-        SerializableCallNode snode;
-        snode.id = node->get_id();
-        snode.name = node->get_name();
-        snode.category = node->get_category();
-        snode.start_time = node->get_start_time();
-        snode.duration = node->get_duration();
-        snode.level = node->get_level();
-        snode.parent_id = node->get_parent_id();
-        snode.children = node->get_children();
-        snode.args = node->get_args();
-        result.nodes.push_back(std::move(snode));
+    const std::size_t n = trace_files_.size();
+    std::vector<std::unique_ptr<internal::CallTree>> per_file;  // one per file
+    per_file.reserve(n);
+    for (std::size_t i = 0; i < n; ++i) {
+        per_file.push_back(std::make_unique<internal::CallTree>());
+        per_file.back()->initialize();
     }
 
-    return result;
-}
+    const std::set<std::uint32_t>* pids_ptr = &assigned_pids_;
+    std::atomic<std::size_t> total_events{0};  // only read for the log below
+    std::atomic<std::size_t>* total_ptr = &total_events;
 
-void MPICallTreeBuilder::merge_from_serializable(
-    const SerializableProcessGraph& serializable) {
-    internal::ProcessCallTree& graph = (*call_tree_)[serializable.key];
-    graph.key = serializable.key;
-    graph.root_calls = serializable.root_calls;
-    graph.call_sequence = serializable.call_sequence;
+    const std::vector<std::string>* paths_ptr = &trace_files_;
+    const std::vector<std::unique_ptr<internal::CallTree>>* per_file_ptr =
+        &per_file;
 
-    for (const auto& snode : serializable.nodes) {
-        auto node = call_tree_->get_factory().create_node(
-            snode.id, snode.name, snode.category, snode.start_time,
-            snode.duration, snode.level, snode.args);
-        node->set_parent_id(snode.parent_id);
-        for (auto child_id : snode.children) {
-            node->add_child(child_id);
-        }
-        graph.calls[snode.id] = node;
-    }
-}
+    co_await scope->scope(  // locals are handed over as raw pointers
+        [paths_ptr, per_file_ptr, pids_ptr,
+         total_ptr](CoroScope& child) mutable -> coro::CoroTask<void> {
+            co_await ingest_files_into(&child, paths_ptr, per_file_ptr,
+                                       pids_ptr, total_ptr);
+        });
 
-bool MPICallTreeBuilder::alltoall_graphs() {
-    auto& mpi = mpi::MPIUtils::instance();
+    for (auto& t : per_file)  // fold per-file trees in trace_files_ order
+        if (t) call_tree_->merge_from(std::move(*t));
+    my_process_keys_ = call_tree_->keys();  // snapshot taken after the merge
 
-    // Serialize local graphs
-    std::vector<SerializableProcessGraph> local_graphs;
-    for (const auto& key : call_tree_->keys()) {
-        auto* graph = call_tree_->get(key);
-        if (graph) {
-            local_graphs.push_back(convert_to_serializable(*graph));
-        }
+    if (config_.verbose) {  // every rank prints, straight to stdout
+        std::printf("[rank %d/%d] build done: %zu events, %zu processes\n",
+                    rank_, world_size_, total_events.load(),
+                    my_process_keys_.size());
+        std::fflush(stdout);
+    }
+    co_return true;  // no failure is reported from this stage
+}
+
+coro::CoroTask<bool> MPICallTreeBuilder::hierarchy(CoroScope* scope) {
+    internal::CallTree* tree = call_tree_.get();  // raw, non-owning pointer
+    const std::vector<internal::ProcessKey>* keys_ptr = &my_process_keys_;
+    co_await scope->scope(  // runs hierarchy_all inside a child scope
+        [tree, keys_ptr](CoroScope& child) mutable -> coro::CoroTask<void> {
+            co_await hierarchy_all(&child, tree, keys_ptr);
+        });
+    co_return true;  // unconditional: this stage has no failure path
+}
+
+coro::CoroTask<bool> MPICallTreeBuilder::write(CoroScope* scope,
+                                               std::string /*output_path*/,
+                                               std::string staging_dir,
+                                               bool gzip) {
+    char suffix[64];
+    std::snprintf(suffix, sizeof(suffix), "/rank_%05d.pfw%s", rank_,
+                  gzip ? ".gz" : "");
+    my_shard_path_ = staging_dir + suffix;  // this rank's shard file
+    if (rank_ == 0) {
+        std::error_code ec;
+        fs::create_directories(staging_dir, ec);  // ec is not inspected
+    }
+    MPI_Barrier(MPI_COMM_WORLD);  // rank 0 has attempted the mkdir by now
+
+    const std::size_t n = my_process_keys_.size();
+    std::vector<std::string> slice_buffers(n);  // one text slice per key
+    static constexpr std::uint64_t IDX_STRIDE = 1ull << 20;
+
+    char hostname[256] = {};
+    gethostname(hostname, sizeof(hostname) - 1);  // return value unchecked
+    std::string hostname_hash(hostname);  // the raw hostname; nothing is hashed
+
+    std::vector<std::string>* slice_buffers_ptr = &slice_buffers;
+    const std::string* hostname_hash_ptr = &hostname_hash;
+    const internal::CallTree* tree = call_tree_.get();
+    const std::uint64_t rank_base = static_cast<std::uint64_t>(rank_) << 40;
+    const std::vector<internal::ProcessKey>* keys_ptr = &my_process_keys_;
+
+    co_await scope->scope(  // serialize_all receives slice_buffers by pointer
+        [tree, keys_ptr, hostname_hash_ptr, slice_buffers_ptr,
+         rank_base](CoroScope& child) mutable -> coro::CoroTask<void> {
+            co_await serialize_all(&child, tree, keys_ptr, hostname_hash_ptr,
+                                   slice_buffers_ptr, rank_base, IDX_STRIDE);
+        });
+
+    std::string header;
+    if (rank_ == 0) {  // only rank 0 emits "[" and the two metadata records
+        header.append("[\n", 2);
+        internal::JsonSerializer serializer;
+        char init[8];
+        serializer.initialize(init, hostname_hash);
+        (void)init;
+        char buf[8192];
+        std::time_t now = std::time(nullptr);
+        char ts[64];
+        std::strftime(ts, sizeof(ts), "%Y-%m-%d %H:%M:%S",
+                      std::localtime(&now));
+        std::size_t w = serializer.serialize_metadata(buf, "timestamp", ts, "M",
+                                                      0, 0, true);
+        if (w > 0) header.append(buf, w - 1);  // last written byte is dropped
+        header.append(",\n", 2);
+        w = serializer.serialize_metadata(buf, "format", "call_tree", "M", 0, 0,
+                                          true);
+        if (w > 0) header.append(buf, w - 1);  // last written byte is dropped
+        header.append(",\n", 2);
+    }
+
+    utilities::fileio::parallel::WriterConfig wc;
+    wc.layout = utilities::fileio::parallel::FileLayout::SHARDED;
+    wc.gzip = gzip;
+    auto writer = utilities::fileio::parallel::make_writer(wc);
+
+    const std::size_t total_workers = (rank_ == 0 ? 1 : 0) + n;  // header + n
+    if (total_workers == 0) {  // NOTE(review): the last rank gets no "]" here
+        FILE* f = std::fopen(my_shard_path_.c_str(), "wb");
+        if (f) std::fclose(f);  // NOTE(review): fopen failure is not reported
+        co_return true;
     }
 
-    // Serialize to bytes
-    std::vector<char> send_buffer;
-    serialization::write_uint32(
-        send_buffer, static_cast<std::uint32_t>(local_graphs.size()));
-    for (const auto& graph : local_graphs) {
-        auto data = graph.serialize();
-        serialization::write_uint32(send_buffer,
-                                    static_cast<std::uint32_t>(data.size()));
-        send_buffer.insert(send_buffer.end(), data.begin(), data.end());
+    if (co_await writer->open(my_shard_path_, total_workers, gzip, scope) !=
+        0) {
+        DFTRACER_UTILS_LOG_ERROR("[rank %d] failed to open writer: %s", rank_,
+                                 my_shard_path_.c_str());
+        co_return false;
     }
 
-    // Use MPIUtils for allgatherv
-    std::vector<char> recv_buffer;
-    std::vector<int> recv_sizes;
-    std::vector<int> displacements;
-
-    mpi.allgatherv_char(send_buffer, recv_buffer, recv_sizes, displacements);
-
-    // Deserialize graphs from other ranks
-    int world_size = mpi.get_world_size();
-    int rank = mpi.get_rank();
-    for (int r = 0; r < world_size; r++) {
-        if (r == rank) continue;  // Skip our own data
-
-        size_t offset = static_cast<size_t>(displacements[r]);
-        std::uint32_t num_graphs =
-            serialization::read_uint32(recv_buffer.data(), offset);
-
-        for (std::uint32_t i = 0; i < num_graphs; i++) {
-            std::uint32_t graph_size =
-                serialization::read_uint32(recv_buffer.data(), offset);
-            (void)graph_size;
-            auto graph = SerializableProcessGraph::deserialize(
-                recv_buffer.data(), offset);
-            merge_from_serializable(graph);
+    std::size_t widx = 0;  // chunk index; the header takes 0 on rank 0
+    if (rank_ == 0) {
+        if (co_await writer->write_chunk(
+                widx++, ByteView(header.data(), header.size())) != 0) {
+            co_return false;
         }
     }
 
-    return true;
-}
-
-bool MPICallTreeBuilder::gather() {
-    if (!graphs_built_) {
-        return false;
-    }
-
-    auto& mpi = mpi::MPIUtils::instance();
-
-    if (mpi.is_root() && config_.verbose) {
-        DFTRACER_UTILS_LOG_INFO(
-            "%s", "Phase 3: Gathering call graphs (all-to-all)...");
-    }
-
-    mpi.barrier();
-
-    bool success = alltoall_graphs();
-
-    mpi.barrier();
-
-    graphs_gathered_ = success;
-
-    if (mpi.is_root() && config_.verbose) {
-        DFTRACER_UTILS_LOG_INFO("Gather completed. Total graphs: %zu",
-                                call_tree_->size());
-    }
-
-    return success;
-}
-
-bool MPICallTreeBuilder::save(const std::string& filename) const {
-    // Only rank 0 saves (all ranks have same data after gather)
-    if (!mpi::MPIUtils::instance().is_root()) {
-        return true;
-    }
-
-    std::ofstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        DFTRACER_UTILS_LOG_ERROR("Cannot open output file: %s",
-                                 filename.c_str());
-        return false;
-    }
-
-    // Write header
-    CallGraphFileHeader header;
-    header.num_process_graphs = static_cast<std::uint32_t>(call_tree_->size());
-
-    // Count total events
-    std::uint64_t total_events = 0;
-    for (const auto& key : call_tree_->keys()) {
-        auto* graph = call_tree_->get(key);
-        if (graph) {
-            total_events += graph->calls.size();
+    for (std::size_t i = 0; i < n; ++i) {
+        std::string& b = slice_buffers[i];
+        const bool last_overall = (i + 1 == n) && (rank_ == world_size_ - 1);
+        if (last_overall) {  // final slice of the final rank closes the array
+            if (b.size() >= 2 && b[b.size() - 2] == ',' &&
+                b[b.size() - 1] == '\n') {
+                b.resize(b.size() - 2);  // strip the trailing ",\n"
+                b.append("\n]\n", 3);
+            } else {
+                b.append("]\n", 2);
+            }
         }
-    }
-    header.total_events = total_events;
-    header.data_offset = sizeof(CallGraphFileHeader);
-
-    file.write(reinterpret_cast<const char*>(&header), sizeof(header));
-
-    // Write each process graph
-    for (const auto& key : call_tree_->keys()) {
-        auto* graph = call_tree_->get(key);
-        if (graph) {
-            auto serializable =
-                const_cast<MPICallTreeBuilder*>(this)->convert_to_serializable(
-                    *graph);
-            auto data = serializable.serialize();
-            std::uint32_t size = static_cast<std::uint32_t>(data.size());
-            file.write(reinterpret_cast<const char*>(&size), sizeof(size));
-            file.write(data.data(), data.size());
+        if (co_await writer->write_chunk(widx++,
+                                         ByteView(b.data(), b.size())) != 0) {
+            co_return false;
         }
     }
 
-    if (config_.verbose) {
-        DFTRACER_UTILS_LOG_INFO("Saved call graph to %s", filename.c_str());
-    }
-
-    return true;
-}
-
-std::unique_ptr<internal::CallTree> MPICallTreeBuilder::load(
-    const std::string& filename) {
-    std::ifstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        DFTRACER_UTILS_LOG_ERROR("Cannot open file: %s", filename.c_str());
-        return nullptr;
-    }
-
-    // Read header
-    CallGraphFileHeader header;
-    file.read(reinterpret_cast<char*>(&header), sizeof(header));
-
-    if (!header.is_valid()) {
-        DFTRACER_UTILS_LOG_ERROR("%s", "Invalid call graph file format");
-        return nullptr;
-    }
-
-    auto call_graph = std::make_unique<internal::CallTree>();
-    call_graph->initialize();
+    if (co_await writer->close() != 0) co_return false;
 
-    // Read each process graph
-    for (std::uint32_t i = 0; i < header.num_process_graphs; i++) {
-        std::uint32_t size;
-        file.read(reinterpret_cast<char*>(&size), sizeof(size));
-
-        std::vector<char> data(size);
-        file.read(data.data(), size);
-
-        size_t offset = 0;
-        auto serializable =
-            SerializableProcessGraph::deserialize(data.data(), offset);
-
-        // Merge into call graph
-        internal::ProcessCallTree& graph = (*call_graph)[serializable.key];
-        graph.key = serializable.key;
-        graph.root_calls = serializable.root_calls;
-        graph.call_sequence = serializable.call_sequence;
-
-        for (const auto& snode : serializable.nodes) {
-            auto node = call_graph->get_factory().create_node(
-                snode.id, snode.name, snode.category, snode.start_time,
-                snode.duration, snode.level, snode.args);
-            node->set_parent_id(snode.parent_id);
-            for (auto child_id : snode.children) {
-                node->add_child(child_id);
-            }
-            graph.calls[snode.id] = node;
+    auto shards = writer->output_paths();
+    if (shards.size() > 1) {  // several local pieces: fold into the rank shard
+        if (co_await utilities::fileio::parallel::merge_shards(my_shard_path_,
+                                                               shards) != 0) {
+            DFTRACER_UTILS_LOG_ERROR("[rank %d] local merge failed", rank_);
+            co_return false;
         }
     }
-
-    return call_graph;
+    co_return true;
 }
 
-void MPICallTreeBuilder::print_summary() const {
-    auto& mpi = mpi::MPIUtils::instance();
-    std::size_t local_graphs = call_tree_->size();
-    std::size_t local_events = 0;
+coro::CoroTask<bool> MPICallTreeBuilder::merge(std::string output_path,
+                                               std::string staging_dir,
+                                               bool gzip, bool keep_staging) {
+    MPI_Barrier(MPI_COMM_WORLD);  // every rank enters before rank 0 merges
+    if (rank_ != 0) co_return true;  // the rest of the work is rank 0 only
 
-    for (const auto& key : call_tree_->keys()) {
-        auto* graph = call_tree_->get(key);
-        if (graph) {
-            local_events += graph->calls.size();
-        }
+    std::vector<std::string> shards;  // rank-ordered shard paths
+    shards.reserve(world_size_);
+    for (int r = 0; r < world_size_; ++r) {
+        char rs[64];  // same "/rank_%05d.pfw[.gz]" pattern that write() uses
+        std::snprintf(rs, sizeof(rs), "/rank_%05d.pfw%s", r, gzip ? ".gz" : "");
+        shards.emplace_back(staging_dir + rs);
     }
-
-    std::size_t total_graphs = 0;
-    std::size_t total_events = 0;
-
-    // Use MPIUtils for reduce operations
-    const_cast<mpi::MPIUtils&>(mpi).reduce_sum_size_t(local_graphs,
-                                                      total_graphs, 0);
-    const_cast<mpi::MPIUtils&>(mpi).reduce_sum_size_t(local_events,
-                                                      total_events, 0);
-
-    if (mpi.is_root()) {
-        DFTRACER_UTILS_LOG_INFO(
-            "%s", "\n============ MPI Call Graph Summary ============");
-        DFTRACER_UTILS_LOG_INFO("MPI Ranks: %d", mpi.get_world_size());
-        DFTRACER_UTILS_LOG_INFO("Total PIDs: %zu", all_pids_.size());
-        DFTRACER_UTILS_LOG_INFO("Total process graphs: %zu", total_graphs);
-        DFTRACER_UTILS_LOG_INFO("Total events: %zu", total_events);
-        DFTRACER_UTILS_LOG_INFO(
-            "%s", "================================================\n");
+    if (co_await utilities::fileio::parallel::merge_shards(output_path,
+                                                           shards) != 0) {
+        DFTRACER_UTILS_LOG_ERROR("merge_shards failed for %s",
+                                 output_path.c_str());
+        co_return false;  // staging_dir is left in place on failure
     }
-
-    // Each rank prints its summary
-    int world_size = mpi.get_world_size();
-    int rank = mpi.get_rank();
-    for (int r = 0; r < world_size; r++) {
-        if (r == rank) {
-            DFTRACER_UTILS_LOG_INFO("[Rank %d] Local Summary:", rank);
-            DFTRACER_UTILS_LOG_INFO("  Assigned PIDs: %zu",
-                                    assigned_pids_.size());
-            DFTRACER_UTILS_LOG_INFO("  Process graphs: %zu", local_graphs);
-            DFTRACER_UTILS_LOG_INFO("  Events: %zu", local_events);
-        }
-        const_cast<mpi::MPIUtils&>(mpi).barrier();
+    if (!keep_staging) {
+        std::error_code ec;
+        fs::remove_all(staging_dir, ec);  // removal errors in ec are ignored
     }
+    co_return true;
 }
 
 }  // namespace dftracer::utils::call_tree
diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp
new file mode 100644
index 00000000..f3a4be00
--- /dev/null
+++ b/src/dftracer/utils/utilities/call_tree/call_tree_save_arrow.cpp
@@ -0,0 +1,391 @@
+// Arrow IPC save/load for in-memory CallTree. Produces a .arrow file with a
+// single record batch (zstd buffer-level compression by default) consumable
+// by pyarrow / polars / nanoarrow / dfanalyzer.
+//
+// Schema (one row per CallTreeNode, rows grouped by ProcessKey and ordered
+// by call_sequence within each group):
+//
+//   pid          uint64
+//   tid          uint64
+//   node_pkid    uint64           // ProcessKey.node_id
+//   id           uint64           // node id
+//   name         utf8             // ZSTD compresses repeated values well
+//   category     utf8
+//   start_time   uint64
+//   duration     uint64
+//   level        int64
+//   parent_id    uint64
+//   is_root      bool             // node is in ProcessCallTree::root_calls
+//   seq_idx      int64            // position in ProcessCallTree::call_sequence
+//   children     utf8             // ',' joined child ids
+//   arg_keys     utf8             // '\x1f' (US sep) joined keys
+//   arg_values   utf8             // '\x1f' joined stringified values
+
+#include <dftracer/utils/call_tree/internal/call_tree.h>
+#include <dftracer/utils/call_tree/internal/process_call_tree.h>
+#include <dftracer/utils/call_tree/internal/process_key.h>
+#include <dftracer/utils/call_tree/mpi/serializable.h>
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/string_intern.h>
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+#include <dftracer/utils/utilities/common/arrow/column_builder.h>
+#include <dftracer/utils/utilities/common/arrow/ipc_reader.h>
+#include <dftracer/utils/utilities/common/arrow/ipc_writer.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
+#include <nanoarrow/nanoarrow.h>
+
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+#endif
+
+namespace dftracer::utils::call_tree {
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+namespace {
+
+using utilities::common::arrow::ArrowExportResult;
+using utilities::common::arrow::ColumnSpec;
+using utilities::common::arrow::ColumnType;
+using utilities::common::arrow::IpcCompression;
+using utilities::common::arrow::IpcReader;
+using utilities::common::arrow::IpcWriter;
+using utilities::common::arrow::RecordBatchBuilder;
+using utilities::composites::dft::ArgsValueProxy;
+
+constexpr char ARG_SEP = '\x1f';
+
+// Overwrite `out` with the decimal elements of `v` separated by ','.
+// An empty `v` leaves `out` empty; no trailing separator is emitted.
+void join_uint64(std::string& out, const std::vector<std::uint64_t>& v) {
+    out.clear();
+    for (std::size_t i = 0; i < v.size(); ++i) {
+        if (i != 0) out += ',';
+        out += std::to_string(v[i]);
+    }
+}
+
+std::string args_value_to_string(ArgsValueProxy v) {  // stringify one arg
+    if (v.is_string()) return v.get<std::string>();
+    if (v.is_uint()) return std::to_string(v.get<std::uint64_t>());
+    if (v.is_int()) return std::to_string(v.get<std::int64_t>());
+    if (v.is_number()) return std::to_string(v.get<double>());  // "%f" style
+    if (v.is_bool()) return v.get<bool>() ? "true" : "false";
+    return {};  // none of the checks above matched: empty string
+}
+
+dftracer::utils::StringIntern& arrow_load_intern() {
+    static dftracer::utils::StringIntern instance;  // built on first call
+    return instance;  // every caller shares this one object
+}
+
+// Tokenise `s` at every `delim`, keeping empty tokens; "" yields no tokens.
+std::vector<std::string_view> split_view(std::string_view s, char delim) {
+    std::vector<std::string_view> tokens;
+    if (s.empty()) return tokens;
+    std::size_t begin = 0;
+    for (std::size_t hit = s.find(delim); hit != std::string_view::npos;
+         hit = s.find(delim, begin)) {
+        tokens.push_back(s.substr(begin, hit - begin));
+        begin = hit + 1;
+    }
+    // Whatever follows the last delimiter (possibly nothing) is a token too.
+    tokens.push_back(s.substr(begin));
+    return tokens;
+}
+
+// Parse the leading run of ASCII digits in `s` as base 10; 0 if there is none.
+std::uint64_t parse_u64(std::string_view s) {
+    std::uint64_t value = 0;
+    for (std::size_t i = 0; i < s.size() && s[i] >= '0' && s[i] <= '9'; ++i) {
+        value = value * 10 + static_cast<std::uint64_t>(s[i] - '0');
+    }
+    return value;
+}
+
+// Return the index of the top-level column called `name`, or -1 if absent.
+// Compares through std::string_view so no <cstring> include is required.
+int find_column(ArrowSchema* schema, const char* name) {
+    if (!schema || !schema->children || !name) return -1;
+    const std::string_view wanted(name);
+    for (int i = 0; i < schema->n_children; ++i) {
+        const ArrowSchema* child = schema->children[i];
+        if (child && child->name && wanted == child->name) return i;
+    }
+    return -1;
+}
+
+// Fetch row `row` of a string column as a non-owning view. For a
+// dictionary-encoded column the row holds an index into view->dictionary.
+std::string_view get_string(const ArrowArrayView* view, std::int64_t row) {
+    const bool dict = view->dictionary != nullptr;
+    const ArrowArrayView* source = dict ? view->dictionary : view;
+    // Plain columns are read at `row`; dictionary columns at the stored index.
+    const std::int64_t slot =
+        dict ? ArrowArrayViewGetIntUnsafe(view, row) : row;
+    const auto raw = ArrowArrayViewGetStringUnsafe(source, slot);
+    return std::string_view(raw.data, static_cast<std::size_t>(raw.size_bytes));
+}
+
+}  // namespace
+
+coro::CoroTask<bool> save_arrow(CoroScope* /*scope*/,
+                                const internal::CallTree& tree,
+                                std::string output_path) {
+    RecordBatchBuilder builder;
+    builder.declare_schema({  // order here fixes the column indices used below
+        {"pid", ColumnType::UINT64},
+        {"tid", ColumnType::UINT64},
+        {"node_pkid", ColumnType::UINT64},
+        {"id", ColumnType::UINT64},
+        {"name", ColumnType::STRING},
+        {"category", ColumnType::STRING},
+        {"start_time", ColumnType::UINT64},
+        {"duration", ColumnType::UINT64},
+        {"level", ColumnType::INT64},
+        {"parent_id", ColumnType::UINT64},
+        {"is_root", ColumnType::BOOL},
+        {"seq_idx", ColumnType::INT64},
+        {"children", ColumnType::STRING},
+        {"arg_keys", ColumnType::STRING},
+        {"arg_values", ColumnType::STRING},
+    });
+
+    auto keys = const_cast<internal::CallTree&>(tree).keys();  // const dropped
+    std::string children_join;  // scratch strings, reused for every row
+    std::string arg_keys_join;
+    std::string arg_values_join;
+
+    for (const auto& key : keys) {
+        auto* graph = const_cast<internal::CallTree&>(tree).get(key);
+        if (!graph) continue;
+
+        std::unordered_set<std::uint64_t> root_set(graph->root_calls.begin(),
+                                                   graph->root_calls.end());
+
+        std::int64_t seq_idx = 0;  // restarts at 0 for each process key
+        for (std::uint64_t nid : graph->call_sequence) {
+            auto it = graph->calls.find(nid);
+            if (it == graph->calls.end()) continue;  // id without a node: no row
+            const auto& node = it->second;
+            if (!node) continue;
+
+            builder.append_uint64(0, key.pid);
+            builder.append_uint64(1, key.tid);
+            builder.append_uint64(2, key.node_id);
+            builder.append_uint64(3, node->get_id());
+            builder.append_string(4, node->get_name());
+            builder.append_string(5, node->get_category());
+            builder.append_uint64(6, node->get_start_time());
+            builder.append_uint64(7, node->get_duration());
+            builder.append_int64(8,
+                                 static_cast<std::int64_t>(node->get_level()));
+            builder.append_uint64(9, node->get_parent_id());
+            builder.append_bool(10, root_set.count(nid) > 0);
+            builder.append_int64(11, seq_idx++);  // counts emitted rows only
+
+            join_uint64(children_join, node->get_children());
+            builder.append_string(12, children_join);
+
+            arg_keys_join.clear();
+            arg_values_join.clear();
+            bool first = true;
+            node->get_args().for_each_member(  // keys and values stay aligned
+                [&](std::string_view k, ArgsValueProxy v) {
+                    if (!first) {
+                        arg_keys_join.push_back(ARG_SEP);
+                        arg_values_join.push_back(ARG_SEP);
+                    }
+                    arg_keys_join.append(k);  // ARG_SEP inside k is not escaped
+                    arg_values_join.append(args_value_to_string(v));
+                    first = false;
+                });
+            builder.append_string(13, arg_keys_join);
+            builder.append_string(14, arg_values_join);
+
+            builder.end_row();
+        }
+    }
+
+    if (builder.num_rows() == 0) {  // warn only; the file is still written
+        DFTRACER_UTILS_LOG_WARN("save_arrow: tree is empty, writing empty %s",
+                                output_path.c_str());
+    }
+
+    auto batch = builder.finish();  // all rows go out as one batch
+    IpcWriter writer;
+    if (co_await writer.open(output_path, IpcCompression::ZSTD) != 0) {
+        DFTRACER_UTILS_LOG_ERROR("save_arrow: open failed: %s",
+                                 output_path.c_str());
+        co_return false;
+    }
+    if (co_await writer.write_batch(batch) != 0) {
+        DFTRACER_UTILS_LOG_ERROR("%s", "save_arrow: write_batch failed");
+        co_return false;  // close() is not called on this path
+    }
+    if (co_await writer.close() != 0) {
+        DFTRACER_UTILS_LOG_ERROR("%s", "save_arrow: close failed");
+        co_return false;
+    }
+    co_return true;
+}
+
+coro::CoroTask<std::unique_ptr<internal::CallTree>> load_arrow(
+    CoroScope* /*scope*/, std::string input_path) {
+    IpcReader reader;
+    if (reader.open(input_path) != 0) {
+        DFTRACER_UTILS_LOG_ERROR("load_arrow: open failed: %s",
+                                 input_path.c_str());
+        co_return nullptr;
+    }
+
+    auto tree = std::make_unique<internal::CallTree>();
+    tree->initialize();
+
+    using utilities::composites::dft::ArgsMap;
+
+    auto process_batch = [&](ArrowExportResult& batch) -> int {
+        ArrowSchema* schema = batch.get_schema();
+        ArrowArray* array = batch.get_array();
+        if (!schema || !array) return -1;
+
+        const int c_pid = find_column(schema, "pid");
+        const int c_tid = find_column(schema, "tid");
+        const int c_node_pkid = find_column(schema, "node_pkid");
+        const int c_id = find_column(schema, "id");
+        const int c_name = find_column(schema, "name");
+        const int c_cat = find_column(schema, "category");
+        const int c_start = find_column(schema, "start_time");
+        const int c_dur = find_column(schema, "duration");
+        const int c_level = find_column(schema, "level");
+        const int c_parent = find_column(schema, "parent_id");
+        const int c_isroot = find_column(schema, "is_root");
+        const int c_seq = find_column(schema, "seq_idx");
+        const int c_children = find_column(schema, "children");
+        const int c_argk = find_column(schema, "arg_keys");
+        const int c_argv = find_column(schema, "arg_values");
+        if (c_pid < 0 || c_tid < 0 || c_node_pkid < 0 || c_id < 0 ||
+            c_name < 0 || c_cat < 0 || c_start < 0 || c_dur < 0 ||
+            c_level < 0 || c_parent < 0 || c_isroot < 0 || c_seq < 0 ||
+            c_children < 0 || c_argk < 0 || c_argv < 0) {
+            DFTRACER_UTILS_LOG_ERROR("%s",
+                                     "load_arrow: schema missing required "
+                                     "columns");
+            return -1;
+        }
+
+        ArrowArrayView view;
+        ArrowError err;
+        if (ArrowArrayViewInitFromSchema(&view, schema, &err) != NANOARROW_OK) {
+            DFTRACER_UTILS_LOG_ERROR("load_arrow: InitFromSchema: %s",
+                                     err.message);
+            return -1;
+        }
+        struct ViewGuard {
+            ArrowArrayView* v;
+            ~ViewGuard() { ArrowArrayViewReset(v); }
+        } guard{&view};
+        if (ArrowArrayViewSetArray(&view, array, &err) != NANOARROW_OK) {
+            DFTRACER_UTILS_LOG_ERROR("load_arrow: SetArray: %s", err.message);
+            return -1;
+        }
+
+        const std::int64_t n = array->length;
+        for (std::int64_t i = 0; i < n; ++i) {
+            const std::uint64_t pid =
+                ArrowArrayViewGetUIntUnsafe(view.children[c_pid], i);
+            const std::uint64_t tid =
+                ArrowArrayViewGetUIntUnsafe(view.children[c_tid], i);
+            const std::uint64_t node_pkid =
+                ArrowArrayViewGetUIntUnsafe(view.children[c_node_pkid], i);
+            const std::uint64_t id =
+                ArrowArrayViewGetUIntUnsafe(view.children[c_id], i);
+            const std::uint64_t start =
+                ArrowArrayViewGetUIntUnsafe(view.children[c_start], i);
+            const std::uint64_t dur =
+                ArrowArrayViewGetUIntUnsafe(view.children[c_dur], i);
+            const std::int64_t level =
+                ArrowArrayViewGetIntUnsafe(view.children[c_level], i);
+            const std::uint64_t parent =
+                ArrowArrayViewGetUIntUnsafe(view.children[c_parent], i);
+            const bool is_root =
+                ArrowArrayViewGetIntUnsafe(view.children[c_isroot], i) != 0;
+
+            auto name_sv = get_string(view.children[c_name], i);
+            auto cat_sv = get_string(view.children[c_cat], i);
+            auto children_sv = get_string(view.children[c_children], i);
+            auto argk_sv = get_string(view.children[c_argk], i);
+            auto argv_sv = get_string(view.children[c_argv], i);
+
+            // Args round-trip as strings; type info is lost vs the typed
+            // custom-binary format.
+            ArgsMap args;
+            auto keys_tok = split_view(argk_sv, ARG_SEP);
+            auto vals_tok = split_view(argv_sv, ARG_SEP);
+            const std::size_t n_args =
+                std::min(keys_tok.size(), vals_tok.size());
+            if (n_args > 0) args.set_valid(true);
+            for (std::size_t k = 0; k < n_args; ++k) {
+                args.insert(keys_tok[k], std::string(vals_tok[k]));
+            }
+
+            auto name_interned = arrow_load_intern().intern(name_sv);
+            auto cat_interned = arrow_load_intern().intern(cat_sv);
+
+            auto node = tree->get_factory().create_node(
+                id, name_interned, cat_interned, start, dur,
+                static_cast<int>(level), std::move(args));
+            node->set_parent_id(parent);
+            for (auto child_sv : split_view(children_sv, ',')) {
+                if (child_sv.empty()) continue;
+                node->add_child(parse_u64(child_sv));
+            }
+
+            internal::ProcessKey key(static_cast<std::uint32_t>(pid),
+                                     static_cast<std::uint32_t>(tid),
+                                     static_cast<std::uint32_t>(node_pkid));
+            tree->add_call(key, node);
+
+            // add_call already appended id to call_sequence (in row order,
+            // which is the saved call_sequence order). Only push the root
+            // flag here.
+            auto* pgraph = tree->get(key);
+            if (pgraph && is_root) pgraph->root_calls.push_back(id);
+        }
+        return 0;
+    };
+
+    if (reader.for_each_batch(process_batch) != 0) {
+        DFTRACER_UTILS_LOG_ERROR("%s", "load_arrow: batch iteration failed");
+        co_return nullptr;
+    }
+    co_return tree;
+}
+
+#else   // !DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+coro::CoroTask<bool> save_arrow(CoroScope* /*scope*/,
+                                const internal::CallTree& /*tree*/,
+                                std::string /*output_path*/) {
+    DFTRACER_UTILS_LOG_ERROR("%s",
+                             "save_arrow: build without DFTRACER_UTILS_ENABLE_"
+                             "ARROW_IPC, cannot write Arrow IPC");
+    co_return false;  // stub for the !DFTRACER_UTILS_ENABLE_ARROW_IPC build
+}
+
+coro::CoroTask<std::unique_ptr<internal::CallTree>> load_arrow(
+    CoroScope* /*scope*/, std::string /*input_path*/) {
+    DFTRACER_UTILS_LOG_ERROR("%s", "load_arrow: arrow IPC disabled");
+    co_return nullptr;  // stub: nothing is read, callers always get nullptr
+}
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+}  // namespace dftracer::utils::call_tree
diff --git a/src/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp b/src/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp
new file mode 100644
index 00000000..32eb3b2c
--- /dev/null
+++ b/src/dftracer/utils/utilities/call_tree/call_tree_save_binary.cpp
@@ -0,0 +1,429 @@
+// Compact custom-binary save/load for in-memory CallTree.
+//
+// On-disk layout (little-endian, all multi-byte fields native u32/u64/...):
+//
+//   magic[8]      = "DFTCGRP2"
+//   version u32   = 2
+//   flags   u32   (reserved, currently 0)
+//   string_table:
+//     count u32
+//     for each: u32 length + raw bytes (utf-8; embedded NULs OK)
+//   process_count u32
+//   for each ProcessCallTree:
+//     pid u32, tid u32, node_id u32
+//     call_count u32
+//     for each CallTreeNode:
+//       id u64
+//       name_str_id u32, cat_str_id u32
+//       start_time u64, duration u64
+//       level i32, parent_id u64
+//       child_count u32, then u64 ids
+//       arg_count u32, then per arg:
+//         key_str_id u32
+//         type u8   { 0:string-id-u32, 1:u64, 2:i64, 3:double, 4:bool-u8 }
+//         payload (type-dependent)
+//     root_count u32, then u64 ids
+//     seq_count  u32, then u64 ids
+
+#include <ankerl/unordered_dense.h>
+#include <dftracer/utils/call_tree/internal/call_tree.h>
+#include <dftracer/utils/call_tree/internal/process_call_tree.h>
+#include <dftracer/utils/call_tree/internal/process_key.h>
+#include <dftracer/utils/call_tree/mpi/serializable.h>
+#include <dftracer/utils/core/common/byte_view.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/string_intern.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cstdint>
+#include <cstring>
+#include <deque>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace dftracer::utils::call_tree {
+
+namespace {
+
+using utilities::composites::dft::ArgsMap;
+using utilities::composites::dft::ArgsValueProxy;
+
+enum ArgTypeTag : std::uint8_t {  // type byte stored before each arg payload
+    ARG_STRING = 0,  // payload: u32 string-table id
+    ARG_U64 = 1,     // payload: u64
+    ARG_I64 = 2,     // payload: i64
+    ARG_DOUBLE = 3,  // payload: double
+    ARG_BOOL = 4,    // payload: u8, written as 0 or 1
+};
+
+dftracer::utils::StringIntern& binary_load_intern() {
+    static dftracer::utils::StringIntern instance;
+    return instance;  // function-local static: same object on every call
+}
+
+void append_bytes(std::vector<char>& out, const void* p, std::size_t n) {
+    out.insert(out.end(), static_cast<const char*>(p),
+               static_cast<const char*>(p) + n);  // appends n raw bytes
+}
+
+template <typename T>
+void put_pod(std::vector<char>& out, T v) {
+    append_bytes(out, &v, sizeof(v));  // raw object bytes, host byte order
+}
+
+// Builder maintains the global string table and assigns dense ids; identical
+// strings get the same id. `strings_` is a deque (not vector) so push_back
+// keeps existing element addresses stable -- the index_ map stores
+// string_views into those elements, and a vector realloc would dangle them
+// (especially nasty for SSO strings whose storage lives inside the string
+// object).
+class StringTable {
+   public:
+    std::uint32_t intern(std::string_view s) {  // id of s, adding it if new
+        auto it = index_.find(s);
+        if (it != index_.end()) return it->second;  // seen before
+        const std::uint32_t id = static_cast<std::uint32_t>(strings_.size());
+        strings_.emplace_back(s);  // owning copy; index_ key views into it
+        index_.emplace(std::string_view(strings_.back()), id);
+        return id;
+    }
+
+    void write(std::vector<char>& out) const {  // u32 count, then len + bytes
+        put_pod<std::uint32_t>(out,
+                               static_cast<std::uint32_t>(strings_.size()));
+        for (const auto& s : strings_) {
+            put_pod<std::uint32_t>(out, static_cast<std::uint32_t>(s.size()));
+            append_bytes(out, s.data(), s.size());
+        }
+    }
+
+   private:
+    std::deque<std::string> strings_;  // id -> string, in first-seen order
+    ankerl::unordered_dense::map<std::string_view, std::uint32_t> index_;
+};
+
+// Cursor over an in-memory buffer; tracks bounds and an `ok` flag so callers
+// can early-exit on truncation without per-call error plumbing.
+struct Cursor {
+    const char* p;    // next unread byte
+    const char* end;  // one past the last readable byte
+    bool ok = true;   // set false by a failed read; never reset here
+
+    template <typename T>
+    bool get_pod(T& out) {
+        if (end - p < static_cast<std::ptrdiff_t>(sizeof(T))) {
+            ok = false;
+            return false;
+        }
+        std::memcpy(&out, p, sizeof(T));  // byte-wise copy of sizeof(T) bytes
+        p += sizeof(T);
+        return true;
+    }
+    bool get_string_view(std::string_view& out, std::uint32_t len) {
+        if (end - p < static_cast<std::ptrdiff_t>(len)) {
+            ok = false;
+            return false;
+        }
+        out = std::string_view(p, len);  // borrows the buffer, no copy
+        p += len;
+        return true;
+    }
+};
+
+void serialize_node(std::vector<char>& out, StringTable& strings,
+                    const internal::CallTreeNode& n) {
+    put_pod<std::uint64_t>(out, n.get_id());  // fields follow file layout
+    put_pod<std::uint32_t>(out, strings.intern(n.get_name()));
+    put_pod<std::uint32_t>(out, strings.intern(n.get_category()));
+    put_pod<std::uint64_t>(out, n.get_start_time());
+    put_pod<std::uint64_t>(out, n.get_duration());
+    put_pod<std::int32_t>(out, static_cast<std::int32_t>(n.get_level()));
+    put_pod<std::uint64_t>(out, n.get_parent_id());
+
+    const auto& children = n.get_children();
+    put_pod<std::uint32_t>(out, static_cast<std::uint32_t>(children.size()));
+    for (auto id : children) put_pod<std::uint64_t>(out, id);
+
+    // Pull args out as (key, ArgsValueProxy) so we can preserve typing.
+    std::vector<std::pair<std::string_view, ArgsValueProxy>> args;
+    n.get_args().for_each_member(
+        [&](std::string_view k, ArgsValueProxy v) { args.emplace_back(k, v); });
+
+    put_pod<std::uint32_t>(out, static_cast<std::uint32_t>(args.size()));
+    for (auto& [k, v] : args) {  // each arg: key id, type byte, payload
+        put_pod<std::uint32_t>(out, strings.intern(k));
+        if (v.is_string()) {
+            put_pod<std::uint8_t>(out, ARG_STRING);
+            const auto s = v.get<std::string>();
+            put_pod<std::uint32_t>(out, strings.intern(s));
+        } else if (v.is_uint()) {
+            put_pod<std::uint8_t>(out, ARG_U64);
+            put_pod<std::uint64_t>(out, v.get<std::uint64_t>());
+        } else if (v.is_int()) {
+            put_pod<std::uint8_t>(out, ARG_I64);
+            put_pod<std::int64_t>(out, v.get<std::int64_t>());
+        } else if (v.is_number()) {
+            put_pod<std::uint8_t>(out, ARG_DOUBLE);
+            put_pod<double>(out, v.get<double>());
+        } else if (v.is_bool()) {
+            put_pod<std::uint8_t>(out, ARG_BOOL);
+            put_pod<std::uint8_t>(out, v.get<bool>() ? 1 : 0);
+        } else {  // any other kind is stored as an empty string
+            put_pod<std::uint8_t>(out, ARG_STRING);
+            put_pod<std::uint32_t>(out, strings.intern(""));
+        }
+    }
+}
+
+}  // namespace
+
+coro::CoroTask<bool> save_binary(CoroScope* scope,
+                                 const internal::CallTree& tree,
+                                 std::string output_path) {
+    // Body is staged first so the preceding string table is complete.
+    std::vector<char> body;
+    body.reserve(1 << 20);
+    StringTable strings;
+    auto& src = const_cast<internal::CallTree&>(tree);  // only read below
+    auto keys = src.keys();
+    std::uint32_t nprocs = 0;  // only keys whose graph exists are written
+    for (const auto& key : keys) nprocs += src.get(key) ? 1u : 0u;
+    put_pod<std::uint32_t>(body, nprocs);
+    for (const auto& key : keys) {
+        auto* graph = src.get(key);
+        if (!graph) continue;  // excluded from nprocs above
+        put_pod<std::uint32_t>(body, key.pid);
+        put_pod<std::uint32_t>(body, key.tid);
+        put_pod<std::uint32_t>(body, key.node_id);
+        std::uint32_t ncalls = 0;  // null nodes are skipped, so not counted
+        for (const auto& [id, node] : graph->calls) ncalls += node ? 1u : 0u;
+        put_pod<std::uint32_t>(body, ncalls);
+        for (const auto& [id, node] : graph->calls) {
+            if (node) serialize_node(body, strings, *node);
+        }
+        put_pod<std::uint32_t>(
+            body, static_cast<std::uint32_t>(graph->root_calls.size()));
+        for (auto id : graph->root_calls) put_pod<std::uint64_t>(body, id);
+        put_pod<std::uint32_t>(
+            body, static_cast<std::uint32_t>(graph->call_sequence.size()));
+        for (auto id : graph->call_sequence) put_pod<std::uint64_t>(body, id);
+    }
+
+    std::vector<char> out;
+    out.reserve(8 + 4 + 4 + body.size() + 16);
+    append_bytes(out, CALLTREE_BINARY_MAGIC, sizeof(CALLTREE_BINARY_MAGIC));
+    put_pod<std::uint32_t>(out, CALLTREE_BINARY_VERSION);
+    put_pod<std::uint32_t>(out, 0u);  // flags
+    strings.write(out);
+    append_bytes(out, body.data(), body.size());
+
+    utilities::fileio::parallel::WriterConfig wc;
+    wc.layout = utilities::fileio::parallel::FileLayout::STRIPED;
+    wc.gzip = false;
+    auto writer = utilities::fileio::parallel::make_writer(wc);
+    if (co_await writer->open(output_path, 1, false, scope) != 0) {
+        DFTRACER_UTILS_LOG_ERROR("save_binary: open failed: %s",
+                                 output_path.c_str());
+        co_return false;
+    }
+    if (co_await writer->write_chunk(0, ByteView(out.data(), out.size())) !=
+        0) {
+        co_return false;
+    }
+    if (co_await writer->close() != 0) co_return false;
+    co_return true;
+}
+
+// Loads a CallTree written by save_binary (layout at the top of this file).
+// Returns nullptr on I/O failure, bad magic/version, or truncated/malformed
+// data. Counts that size an allocation are checked against the bytes left.
+coro::CoroTask<std::unique_ptr<internal::CallTree>> load_binary(
+    CoroScope* /*scope*/, std::string input_path) {
+    int fd = ::open(input_path.c_str(), O_RDONLY);
+    if (fd < 0) {
+        DFTRACER_UTILS_LOG_ERROR("load_binary: cannot open %s",
+                                 input_path.c_str());
+        co_return nullptr;
+    }
+    struct stat st;
+    if (::fstat(fd, &st) != 0 || st.st_size <= 0) {
+        ::close(fd);
+        co_return nullptr;
+    }
+    std::vector<char> buf(static_cast<std::size_t>(st.st_size));
+    std::size_t got = 0;
+    while (got < buf.size()) {
+        ssize_t n = ::read(fd, buf.data() + got, buf.size() - got);
+        if (n <= 0) break;
+        got += static_cast<std::size_t>(n);
+    }
+    ::close(fd);
+    if (got != buf.size()) co_return nullptr;
+
+    Cursor c{buf.data(), buf.data() + buf.size()};
+    const auto left = [&c] { return static_cast<std::size_t>(c.end - c.p); };
+    constexpr std::size_t magic_len = sizeof(CALLTREE_BINARY_MAGIC);
+    if (left() < magic_len ||
+        std::memcmp(c.p, CALLTREE_BINARY_MAGIC, magic_len) != 0) {
+        DFTRACER_UTILS_LOG_ERROR("load_binary: bad magic in %s",
+                                 input_path.c_str());
+        co_return nullptr;
+    }
+    c.p += magic_len;  // same length save_binary writes, not a literal 8
+    std::uint32_t version = 0, flags = 0;
+    if (!c.get_pod(version) || !c.get_pod(flags)) co_return nullptr;
+    if (version != CALLTREE_BINARY_VERSION) {
+        DFTRACER_UTILS_LOG_ERROR("load_binary: unsupported version %u",
+                                 version);
+        co_return nullptr;
+    }
+
+    std::uint32_t nstr = 0;
+    if (!c.get_pod(nstr)) co_return nullptr;
+    // Each entry starts with a u32 length, so a count that cannot fit in the
+    // remaining bytes is corrupt; reject it before reserving storage for it.
+    if (nstr > left() / sizeof(std::uint32_t)) co_return nullptr;
+    std::vector<std::string_view> table;
+    table.reserve(nstr);
+    for (std::uint32_t i = 0; i < nstr && c.ok; ++i) {
+        std::uint32_t len = 0;
+        std::string_view s;
+        if (!c.get_pod(len) || !c.get_string_view(s, len)) co_return nullptr;
+        table.push_back(s);
+    }
+    auto lookup_str = [&](std::uint32_t id) -> std::string_view {
+        return id < table.size() ? table[id] : std::string_view{};
+    };
+
+    auto tree = std::make_unique<internal::CallTree>();
+    tree->initialize();
+
+    std::uint32_t nprocs = 0;
+    if (!c.get_pod(nprocs)) co_return nullptr;
+
+    for (std::uint32_t pi = 0; pi < nprocs && c.ok; ++pi) {
+        std::uint32_t pid = 0, tid = 0, node_id = 0, ncalls = 0;
+        if (!c.get_pod(pid) || !c.get_pod(tid) || !c.get_pod(node_id) ||
+            !c.get_pod(ncalls))
+            break;
+        internal::ProcessKey key(pid, tid, node_id);
+
+        for (std::uint32_t ci = 0; ci < ncalls && c.ok; ++ci) {
+            std::uint64_t id = 0, start = 0, dur = 0, parent = 0;
+            std::uint32_t name_id = 0, cat_id = 0;
+            std::int32_t level = 0;
+            if (!c.get_pod(id) || !c.get_pod(name_id) || !c.get_pod(cat_id) ||
+                !c.get_pod(start) || !c.get_pod(dur) || !c.get_pod(level) ||
+                !c.get_pod(parent))
+                break;
+
+            std::uint32_t nchildren = 0;
+            if (!c.get_pod(nchildren)) break;
+            // Child ids are u64 each; refuse a count the file cannot hold.
+            if (nchildren > left() / sizeof(std::uint64_t)) {
+                c.ok = false;
+                break;
+            }
+            std::vector<std::uint64_t> children;
+            children.reserve(nchildren);
+            for (std::uint32_t k = 0; k < nchildren && c.ok; ++k) {
+                std::uint64_t cid = 0;
+                if (!c.get_pod(cid)) break;
+                children.push_back(cid);
+            }
+
+            std::uint32_t nargs = 0;
+            if (!c.get_pod(nargs)) break;
+            ArgsMap args;
+            if (nargs > 0) args.set_valid(true);
+            for (std::uint32_t k = 0; k < nargs && c.ok; ++k) {
+                std::uint32_t key_id = 0;
+                std::uint8_t type = 0;
+                if (!c.get_pod(key_id) || !c.get_pod(type)) break;
+                auto key_sv = lookup_str(key_id);
+                switch (type) {  // a failed get_pod clears c.ok itself
+                    case ARG_STRING: {
+                        std::uint32_t val_id = 0;
+                        if (!c.get_pod(val_id)) break;
+                        args.insert(key_sv, std::string(lookup_str(val_id)));
+                        break;
+                    }
+                    case ARG_U64: {
+                        std::uint64_t v = 0;
+                        if (!c.get_pod(v)) break;
+                        args.insert(key_sv, v);
+                        break;
+                    }
+                    case ARG_I64: {
+                        std::int64_t v = 0;
+                        if (!c.get_pod(v)) break;
+                        args.insert(key_sv, v);
+                        break;
+                    }
+                    case ARG_DOUBLE: {
+                        double v = 0;
+                        if (!c.get_pod(v)) break;
+                        args.insert(key_sv, v);
+                        break;
+                    }
+                    case ARG_BOOL: {
+                        std::uint8_t v = 0;
+                        if (!c.get_pod(v)) break;
+                        args.insert(key_sv, v != 0);
+                        break;
+                    }
+                    default:  // unknown tag: payload size cannot be known
+                        c.ok = false;
+                        break;
+                }
+            }
+            if (!c.ok) break;  // never build a node from a partial record
+            auto name = binary_load_intern().intern(lookup_str(name_id));
+            auto cat = binary_load_intern().intern(lookup_str(cat_id));
+            auto node = tree->get_factory().create_node(
+                id, name, cat, start, dur, static_cast<int>(level),
+                std::move(args));
+            node->set_parent_id(parent);
+            for (auto cid : children) node->add_child(cid);
+            tree->add_call(key, node);
+        }
+        if (!c.ok) break;
+        // The root and sequence lists are always present in the file, so they
+        // are consumed even if tree->get() has no graph for this key; skipping
+        // them would desynchronize every later record. add_call appended ids
+        // already, but the saved lists are authoritative: clear, then refill.
+        auto* pgraph = tree->get(key);
+        if (pgraph) {
+            pgraph->root_calls.clear();
+            pgraph->call_sequence.clear();
+        }
+        std::uint32_t nroots = 0;
+        if (!c.get_pod(nroots)) break;
+        for (std::uint32_t k = 0; k < nroots && c.ok; ++k) {
+            std::uint64_t id = 0;
+            if (!c.get_pod(id)) break;
+            if (pgraph) pgraph->root_calls.push_back(id);
+        }
+        std::uint32_t nseq = 0;
+        if (!c.get_pod(nseq)) break;
+        for (std::uint32_t k = 0; k < nseq && c.ok; ++k) {
+            std::uint64_t id = 0;
+            if (!c.get_pod(id)) break;
+            if (pgraph) pgraph->call_sequence.push_back(id);
+        }
+    }
+
+    if (!c.ok) {
+        DFTRACER_UTILS_LOG_ERROR("load_binary: truncated/malformed file %s",
+                                 input_path.c_str());
+        co_return nullptr;
+    }
+    co_return tree;
+}
+
+}  // namespace dftracer::utils::call_tree
diff --git a/src/dftracer/utils/utilities/call_tree/json_serializer.cpp b/src/dftracer/utils/utilities/call_tree/json_serializer.cpp
index 6b667e65..ee1bb394 100644
--- a/src/dftracer/utils/utilities/call_tree/json_serializer.cpp
+++ b/src/dftracer/utils/utilities/call_tree/json_serializer.cpp
@@ -1,4 +1,5 @@
 #include <dftracer/utils/call_tree/json_serializer.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
 
 #include <cstdio>
 #include <cstring>
@@ -7,167 +8,147 @@
 namespace dftracer::utils::call_tree {
 namespace internal {
 
+using dftracer::utils::utilities::composites::dft::ArgsValueProxy;
+
 JsonSerializer::JsonSerializer() : hostname_hash_("") {}
 
 size_t JsonSerializer::initialize(char* buffer,
                                   const std::string& hostname_hash) {
     hostname_hash_ = hostname_hash;
-    // Write opening bracket for JSON array (Chrome Tracing format requirement)
     buffer[0] = '[';
     buffer[1] = '\n';
     return 2;
 }
 
-bool JsonSerializer::convert_args_to_json(
-    const std::unordered_map<std::string, std::string>& args,
-    std::stringstream& stream) {
-    if (args.empty()) {
-        return false;
-    }
+bool JsonSerializer::convert_args_to_json(const ArgsMap& args,
+                                          std::stringstream& stream) {
+    if (!args) return false;
 
-    // Known fields that should always be strings (hash values, etc.)
-    const std::set<std::string> string_fields = {"hhash", "fhash", "exec_hash",
-                                                 "cmd_hash", "hostname_hash"};
+    static const std::set<std::string_view, std::less<>> string_fields = {
+        "hhash", "fhash", "exec_hash", "cmd_hash", "hostname_hash"};
 
     size_t count = 0;
-    for (const auto& [key, value] : args) {
-        // Add comma separator if not first element
-        if (count > 0) {
-            stream << ",";
-        }
+    bool any = false;
+    args.for_each_member([&](std::string_view key, ArgsValueProxy v) {
+        if (count > 0) stream << ",";
+        count++;
+        any = true;
 
-        // Check if this field should always be a string
-        bool force_string = (string_fields.find(key) != string_fields.end());
-
-        // Check if value looks like a pure number
-        // To be safe, only treat it as a number if:
-        // 1. Not a known string field
-        // 2. It doesn't contain any letters (handles hex strings like
-        // "df57e0a251b84b54")
-        // 3. It successfully parses as a number
-        // 4. The entire string was consumed during parsing
-        bool is_number = false;
-        if (!force_string && !value.empty()) {
-            // First check: no alphabetic characters
-            bool has_alpha = false;
-            for (char c : value) {
-                if (std::isalpha(c)) {
-                    has_alpha = true;
-                    break;
-                }
-            }
+        stream << "\"" << key << "\":";
 
-            // Only try to parse as number if no alphabetic chars
-            if (!has_alpha && (std::isdigit(value[0]) || value[0] == '-' ||
-                               value[0] == '+')) {
-                char* end;
-                // Try integer parse
-                std::strtoll(value.c_str(), &end, 10);
-                if (end && *end == '\0') {
-                    is_number = true;
-                } else {
-                    // Try float parse
-                    std::strtod(value.c_str(), &end);
-                    if (end && *end == '\0') {
+        if (v.is_string()) {
+            std::string sv = v.get<std::string>();
+            bool force_string = string_fields.find(key) != string_fields.end();
+            bool is_number = false;
+            if (!force_string && !sv.empty()) {
+                bool has_alpha = false;
+                for (char c : sv) {
+                    if (std::isalpha(static_cast<unsigned char>(c))) {
+                        has_alpha = true;
+                        break;
+                    }
+                }
+                if (!has_alpha &&
+                    (std::isdigit(static_cast<unsigned char>(sv[0])) ||
+                     sv[0] == '-' || sv[0] == '+')) {
+                    char* end;
+                    std::strtoll(sv.c_str(), &end, 10);
+                    if (end && *end == '\0')
                         is_number = true;
+                    else {
+                        std::strtod(sv.c_str(), &end);
+                        if (end && *end == '\0') is_number = true;
                     }
                 }
             }
-        }
-
-        // Format as JSON key-value pair
-        stream << "\"" << key << "\":";
-        if (is_number) {
-            stream << value;
-        } else {
-            // Escape special characters in string values
-            stream << "\"";
-            for (char c : value) {
-                switch (c) {
-                    case '"':
-                        stream << "\\\"";
-                        break;
-                    case '\\':
-                        stream << "\\\\";
-                        break;
-                    case '\n':
-                        stream << "\\n";
-                        break;
-                    case '\r':
-                        stream << "\\r";
-                        break;
-                    case '\t':
-                        stream << "\\t";
-                        break;
-                    default:
-                        stream << c;
-                        break;
+            if (is_number) {
+                stream << sv;
+            } else {
+                stream << "\"";
+                for (char c : sv) {
+                    switch (c) {
+                        case '"':
+                            stream << "\\\"";
+                            break;
+                        case '\\':
+                            stream << "\\\\";
+                            break;
+                        case '\n':
+                            stream << "\\n";
+                            break;
+                        case '\r':
+                            stream << "\\r";
+                            break;
+                        case '\t':
+                            stream << "\\t";
+                            break;
+                        default:
+                            stream << c;
+                            break;
+                    }
                 }
+                stream << "\"";
             }
-            stream << "\"";
+        } else if (v.is_uint()) {
+            stream << v.get<std::uint64_t>();
+        } else if (v.is_int()) {
+            stream << v.get<std::int64_t>();
+        } else if (v.is_number()) {
+            stream << v.get<double>();
+        } else if (v.is_bool()) {
+            stream << (v.get<bool>() ? "true" : "false");
+        } else {
+            stream << "null";
         }
+    });
 
-        count++;
-    }
-
-    return true;
+    return any;
 }
 
 size_t JsonSerializer::serialize_node(char* buffer, int index,
                                       const CallTreeNode& node,
                                       std::uint32_t process_id,
                                       std::uint32_t thread_id) {
-    size_t written_size = 0;
-
-    // Get node data
     const auto& args = node.get_args();
 
-    // Build args JSON string if present
     std::stringstream args_stream;
     bool has_args = convert_args_to_json(args, args_stream);
 
-    // Build complete args object including hostname hash and metadata
     std::stringstream all_args;
 
-    // Check if args already has hhash, if not add it
-    bool has_hhash = args.find("hhash") != args.end();
+    bool has_hhash = args["hhash"].exists();
     if (!has_hhash && !hostname_hash_.empty()) {
         all_args << "\"hhash\":\"" << hostname_hash_ << "\"";
     }
 
-    // Check if args already has level, if not add it
-    bool has_level = args.find("level") != args.end();
+    bool has_level = args["level"].exists();
     if (!has_level) {
         if (all_args.str().size() > 0) all_args << ",";
         all_args << "\"level\":" << node.get_level();
     }
 
-    // Add parent_id if not root and not already in args
-    bool has_parent = args.find("parent_id") != args.end();
+    bool has_parent = args["parent_id"].exists();
     if (node.get_parent_id() != 0 && !has_parent) {
         if (all_args.str().size() > 0) all_args << ",";
         all_args << "\"parent_id\":" << node.get_parent_id();
     }
 
-    // Add custom args if present
     if (has_args) {
         if (all_args.str().size() > 0) all_args << ",";
         all_args << args_stream.str();
     }
 
-    // Format as Chrome Tracing complete event (phase "X")
-    // Following DFTracer's format exactly:
-    // {"id":%d,"name":"%s","cat":"%s","pid":%d,"tid":%lu,"ts":%llu,"dur":%llu,"ph":"X","args":{...}}
-    written_size = std::snprintf(
-        buffer,
-        16384,  // Large buffer size to handle long strings
-        R"({"id":%d,"name":"%s","cat":"%s","pid":%u,"tid":%u,"ts":%llu,"dur":%llu,"ph":"X","args":{%s}})",
-        index, node.get_name().c_str(), node.get_category().c_str(), process_id,
-        thread_id, static_cast<unsigned long long>(node.get_start_time()),
+    auto nm = node.get_name();
+    auto ct = node.get_category();
+    size_t written_size = std::snprintf(
+        buffer, 16384,
+        R"({"id":%d,"name":"%.*s","cat":"%.*s","pid":%u,"tid":%u,"ts":%llu,"dur":%llu,"ph":"X","args":{%s}})",
+        index, static_cast<int>(nm.size()), nm.data(),
+        static_cast<int>(ct.size()), ct.data(), process_id, thread_id,
+        static_cast<unsigned long long>(node.get_start_time()),
         static_cast<unsigned long long>(node.get_duration()),
         all_args.str().c_str());
 
-    // Add newline terminator
     if (written_size > 0) {
         buffer[written_size++] = '\n';
         buffer[written_size] = '\0';
@@ -184,10 +165,6 @@ size_t JsonSerializer::serialize_metadata(char* buffer, const std::string& name,
                                           bool is_string) {
     size_t written_size = 0;
 
-    // Format metadata event (phase "M")
-    // Following DFTracer's format:
-    // {"name":"%s","cat":"dftracer","pid":%d,"tid":%lu,"ph":"M","args":{"hhash":"%s","name":"%s","value":"%s"}}
-
     if (is_string) {
         written_size = std::snprintf(
             buffer, 8192,
@@ -202,7 +179,6 @@ size_t JsonSerializer::serialize_metadata(char* buffer, const std::string& name,
             value.c_str());
     }
 
-    // Add newline terminator
     if (written_size > 0) {
         buffer[written_size++] = '\n';
         buffer[written_size] = '\0';
@@ -213,7 +189,6 @@ size_t JsonSerializer::serialize_metadata(char* buffer, const std::string& name,
 
 size_t JsonSerializer::finalize(char* buffer, bool write_bracket) {
     if (write_bracket) {
-        // Write closing bracket for JSON array
         buffer[0] = ']';
         buffer[1] = '\n';
         return 2;
diff --git a/src/dftracer/utils/utilities/common/arrow/column_builder.cpp b/src/dftracer/utils/utilities/common/arrow/column_builder.cpp
index 1f326820..5c1bd444 100644
--- a/src/dftracer/utils/utilities/common/arrow/column_builder.cpp
+++ b/src/dftracer/utils/utilities/common/arrow/column_builder.cpp
@@ -1,3 +1,4 @@
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 
 #include <dftracer/utils/utilities/common/arrow/column_builder.h>
@@ -23,6 +24,10 @@ ArrowType to_nanoarrow_type(ColumnType t) noexcept {
             return NANOARROW_TYPE_STRING;
         case ColumnType::BOOL:
             return NANOARROW_TYPE_BOOL;
+        case ColumnType::DICT_STRING:
+            // Dictionary uses INT32 indices; dictionary values handled
+            // separately
+            return NANOARROW_TYPE_INT32;
     }
     return NANOARROW_TYPE_UNINITIALIZED;
 }
@@ -37,11 +42,15 @@ void RecordBatchBuilder::init_column(ColumnData& col, ColumnType type,
     col.has_nulls = false;
 }
 
-void RecordBatchBuilder::backfill_nulls(ColumnData& col, size_t target_count) {
-    size_t n = target_count - col.count;
+void RecordBatchBuilder::backfill_nulls(ColumnData& col,
+                                        std::size_t target_count) {
+    std::size_t n = target_count - col.count;
     if (n == 0) return;
 
-    col.has_nulls = true;
+    if (!col.has_nulls) {
+        col.has_nulls = true;
+        col.validity.assign(col.count, 1);
+    }
     col.validity.resize(col.count + n, 0);
 
     switch (col.type) {
@@ -55,11 +64,16 @@ void RecordBatchBuilder::backfill_nulls(ColumnData& col, size_t target_count) {
             col.double_values.resize(col.count + n, 0.0);
             break;
         case ColumnType::STRING:
-            col.string_values.resize(col.count + n, std::string_view{});
+            col.string_offsets.resize(
+                col.count + n,
+                static_cast<std::int32_t>(col.string_data.size()));
             break;
         case ColumnType::BOOL:
             col.bool_values.resize(col.count + n, 0);
             break;
+        case ColumnType::DICT_STRING:
+            col.dict_indices.resize(col.count + n, -1);  // -1 = null
+            break;
     }
     col.count += n;
 }
@@ -76,7 +90,7 @@ void RecordBatchBuilder::declare_schema(const std::vector<ColumnSpec>& specs) {
     touched_.assign(specs.size(), false);
 
     for (const auto& spec : specs) {
-        size_t idx = columns_.size();
+        std::size_t idx = columns_.size();
         columns_.emplace_back();
         init_column(columns_.back(), spec.type, spec.name);
         name_to_index_[spec.name] = idx;
@@ -84,80 +98,125 @@ void RecordBatchBuilder::declare_schema(const std::vector<ColumnSpec>& specs) {
     schema_declared_ = true;
 }
 
-size_t RecordBatchBuilder::add_or_get_column(std::string_view name,
-                                             ColumnType type) {
-    auto it = name_to_index_.find(std::string(name));
+std::size_t RecordBatchBuilder::add_or_get_column(std::string_view name,
+                                                  ColumnType type) {
+    auto it = name_to_index_.find(name);
     if (it != name_to_index_.end()) {
         // Existing column: type is ignored. Callers that need type-safe
         // appends should use find_column() + column_type() first.
         return it->second;
     }
 
-    size_t idx = columns_.size();
+    std::size_t idx = columns_.size();
     columns_.emplace_back();
     init_column(columns_.back(), type, name);
     if (num_rows_ > 0) {
         backfill_nulls(columns_.back(), num_rows_);
     }
-    name_to_index_[std::string(name)] = idx;
+    name_to_index_.emplace(std::string(name), idx);
     touched_.push_back(false);
     return idx;
 }
 
-std::optional<size_t> RecordBatchBuilder::find_column(
+std::optional<std::size_t> RecordBatchBuilder::find_column(
     std::string_view name) const {
-    auto it = name_to_index_.find(std::string(name));
+    auto it = name_to_index_.find(name);
     if (it != name_to_index_.end()) return it->second;
     return std::nullopt;
 }
 
-ColumnType RecordBatchBuilder::column_type(size_t col_idx) const noexcept {
+ColumnType RecordBatchBuilder::column_type(std::size_t col_idx) const noexcept {
     return columns_[col_idx].type;
 }
 
-void RecordBatchBuilder::append_int64(size_t col_idx, int64_t value) {
+void RecordBatchBuilder::append_int64(std::size_t col_idx, std::int64_t value) {
     auto& col = columns_[col_idx];
     col.int64_values.push_back(value);
-    col.validity.push_back(1);
+    if (col.has_nulls) col.validity.push_back(1);
     ++col.count;
-    if (!schema_declared_) touched_[col_idx] = true;
+    if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) {
+        touched_[col_idx] = true;
+        ++row_touched_count_;
+    }
 }
 
-void RecordBatchBuilder::append_uint64(size_t col_idx, uint64_t value) {
+void RecordBatchBuilder::append_uint64(std::size_t col_idx,
+                                       std::uint64_t value) {
     auto& col = columns_[col_idx];
     col.uint64_values.push_back(value);
-    col.validity.push_back(1);
+    if (col.has_nulls) col.validity.push_back(1);
     ++col.count;
-    if (!schema_declared_) touched_[col_idx] = true;
+    if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) {
+        touched_[col_idx] = true;
+        ++row_touched_count_;
+    }
 }
 
-void RecordBatchBuilder::append_double(size_t col_idx, double value) {
+void RecordBatchBuilder::append_double(std::size_t col_idx, double value) {
     auto& col = columns_[col_idx];
     col.double_values.push_back(value);
-    col.validity.push_back(1);
+    if (col.has_nulls) col.validity.push_back(1);
     ++col.count;
-    if (!schema_declared_) touched_[col_idx] = true;
+    if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) {
+        touched_[col_idx] = true;
+        ++row_touched_count_;
+    }
 }
 
-void RecordBatchBuilder::append_string(size_t col_idx, std::string_view value) {
+void RecordBatchBuilder::append_string(std::size_t col_idx,
+                                       std::string_view value) {
     auto& col = columns_[col_idx];
-    col.string_values.push_back(value);
+    col.string_data.insert(col.string_data.end(), value.begin(), value.end());
+    col.string_offsets.push_back(
+        static_cast<std::int32_t>(col.string_data.size()));
+    if (col.has_nulls) col.validity.push_back(1);
+    ++col.count;
+    if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) {
+        touched_[col_idx] = true;
+        ++row_touched_count_;
+    }
+}
+
+void RecordBatchBuilder::append_dict_string(std::size_t col_idx,
+                                            std::string_view value) {
+    auto& col = columns_[col_idx];
+    // Reuse the index of a value seen before, else append a new entry.
+    auto it = col.dict_map.find(value);
+    std::int32_t idx;
+    if (it != col.dict_map.end()) {
+        idx = it->second;
+    } else {
+        idx = static_cast<std::int32_t>(col.dict_values.size());
+        col.dict_values.emplace_back(value);
+        // Key comes from dict_values; TODO confirm it survives its growth
+        col.dict_map[col.dict_values.back()] = idx;
+    }
+    col.dict_indices.push_back(idx);
     col.validity.push_back(1);
     ++col.count;
-    if (!schema_declared_) touched_[col_idx] = true;
+    if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) {
+        touched_[col_idx] = true;
+        ++row_touched_count_;  // counts distinct columns hit this row
+    }
 }
 
-void RecordBatchBuilder::append_bool(size_t col_idx, bool value) {
+void RecordBatchBuilder::append_bool(std::size_t col_idx, bool value) {
     auto& col = columns_[col_idx];
     col.bool_values.push_back(value ? 1 : 0);
-    col.validity.push_back(1);
+    if (col.has_nulls) col.validity.push_back(1);
     ++col.count;
-    if (!schema_declared_) touched_[col_idx] = true;
+    if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) {
+        touched_[col_idx] = true;
+        ++row_touched_count_;
+    }
 }
 
-void RecordBatchBuilder::append_null(size_t col_idx) {
+void RecordBatchBuilder::append_null(std::size_t col_idx) {
     auto& col = columns_[col_idx];
-    col.has_nulls = true;
+    if (!col.has_nulls) {
+        col.has_nulls = true;
+        col.validity.assign(col.count, 1);
+    }
     col.validity.push_back(0);
 
     switch (col.type) {
@@ -171,29 +230,44 @@ void RecordBatchBuilder::append_null(size_t col_idx) {
             col.double_values.push_back(0.0);
             break;
         case ColumnType::STRING:
-            col.string_values.push_back(std::string_view{});
+            col.string_offsets.push_back(
+                static_cast<std::int32_t>(col.string_data.size()));
             break;
         case ColumnType::BOOL:
             col.bool_values.push_back(0);
             break;
+        case ColumnType::DICT_STRING:
+            col.dict_indices.push_back(-1);  // -1 = null
+            break;
     }
     ++col.count;
-    if (!schema_declared_) touched_[col_idx] = true;
+    if (!schema_declared_ && !schema_locked_ && !touched_[col_idx]) {
+        touched_[col_idx] = true;
+        ++row_touched_count_;  // same gating as the other append_*()
+    }
 }
 
 void RecordBatchBuilder::end_row() {
-    // Backfill nulls for any column not appended to this row.
-    // In dynamic mode, use touched_ flags; in static mode, compare counts.
-    for (size_t i = 0; i < columns_.size(); ++i) {
-        if (columns_[i].count <= num_rows_) {
-            backfill_nulls(columns_[i], num_rows_ + 1);
+    if (!schema_declared_ && !schema_locked_ &&
+        row_touched_count_ == columns_.size()) {
+        std::fill(touched_.begin(), touched_.end(), false);
+        row_touched_count_ = 0;
+        ++num_rows_;
+        return;
+    }
+    const std::size_t target = num_rows_ + 1;
+    const bool reset_touched = !schema_declared_ && !schema_locked_;
+    for (std::size_t i = 0; i < columns_.size(); ++i) {
+        if (columns_[i].count < target) {
+            backfill_nulls(columns_[i], target);
         }
-        if (!schema_declared_) touched_[i] = false;
+        if (reset_touched) touched_[i] = false;
     }
+    row_touched_count_ = 0;
     ++num_rows_;
 }
 
-void RecordBatchBuilder::reserve(size_t num_rows) {
+void RecordBatchBuilder::reserve(std::size_t num_rows) {
     for (auto& col : columns_) {
         switch (col.type) {
             case ColumnType::INT64:
@@ -206,19 +280,27 @@ void RecordBatchBuilder::reserve(size_t num_rows) {
                 col.double_values.reserve(num_rows);
                 break;
             case ColumnType::STRING:
-                col.string_values.reserve(num_rows);
+                col.string_offsets.reserve(num_rows + 1);
+                // dftracer hash strings are 16 bytes; common strings
+                // (event names, categories) range 4-32. Bumping the
+                // initial reservation cuts geometric-growth memmove churn
+                // visible in perf for moderate batch sizes.
+                col.string_data.reserve(num_rows * 32);
                 break;
             case ColumnType::BOOL:
                 col.bool_values.reserve(num_rows);
                 break;
+            case ColumnType::DICT_STRING:
+                col.dict_indices.reserve(num_rows);
+                break;
         }
         col.validity.reserve(num_rows);
     }
 }
 
 ArrowExportResult RecordBatchBuilder::finish() {
-    const int64_t ncols = static_cast<int64_t>(columns_.size());
-    const int64_t nrows = static_cast<int64_t>(num_rows_);
+    const std::int64_t ncols = static_cast<std::int64_t>(columns_.size());
+    const std::int64_t nrows = static_cast<std::int64_t>(num_rows_);
 
     // Build schema: struct with one child per column.
     nanoarrow::UniqueSchema schema;
@@ -229,12 +311,34 @@ ArrowExportResult RecordBatchBuilder::finish() {
     if (ArrowSchemaAllocateChildren(schema.get(), ncols) != NANOARROW_OK) {
         throw std::runtime_error("ArrowSchemaAllocateChildren failed");
     }
-    for (int64_t i = 0; i < ncols; ++i) {
-        const auto& col = columns_[static_cast<size_t>(i)];
+    for (std::int64_t i = 0; i < ncols; ++i) {
+        const auto& col = columns_[static_cast<std::size_t>(i)];
         ArrowSchema* child_schema = schema->children[i];
-        if (ArrowSchemaInitFromType(
-                child_schema, to_nanoarrow_type(col.type)) != NANOARROW_OK) {
-            throw std::runtime_error("ArrowSchemaInitFromType(child) failed");
+
+        if (col.type == ColumnType::DICT_STRING) {
+            // Dictionary-encoded string: indices are INT32, values are STRING
+            if (ArrowSchemaInitFromType(child_schema, NANOARROW_TYPE_INT32) !=
+                NANOARROW_OK) {
+                throw std::runtime_error(
+                    "ArrowSchemaInitFromType(dict indices) failed");
+            }
+            if (ArrowSchemaAllocateDictionary(child_schema) != NANOARROW_OK) {
+                throw std::runtime_error(
+                    "ArrowSchemaAllocateDictionary failed");
+            }
+            if (ArrowSchemaInitFromType(child_schema->dictionary,
+                                        NANOARROW_TYPE_STRING) !=
+                NANOARROW_OK) {
+                throw std::runtime_error(
+                    "ArrowSchemaInitFromType(dict values) failed");
+            }
+        } else {
+            if (ArrowSchemaInitFromType(child_schema,
+                                        to_nanoarrow_type(col.type)) !=
+                NANOARROW_OK) {
+                throw std::runtime_error(
+                    "ArrowSchemaInitFromType(child) failed");
+            }
         }
         if (ArrowSchemaSetName(child_schema, col.name.c_str()) !=
             NANOARROW_OK) {
@@ -253,100 +357,158 @@ ArrowExportResult RecordBatchBuilder::finish() {
         throw std::runtime_error("ArrowArrayStartAppending failed");
     }
 
-    for (int64_t i = 0; i < ncols; ++i) {
-        const auto& col = columns_[static_cast<size_t>(i)];
+    for (std::int64_t i = 0; i < ncols; ++i) {
+        const auto& col = columns_[static_cast<std::size_t>(i)];
         ArrowArray* child = array->children[i];
 
         if (ArrowArrayReserve(child, nrows) != NANOARROW_OK) {
             throw std::runtime_error("ArrowArrayReserve failed");
         }
 
-        // AppendNull handles validity bits internally.
+        const std::size_t row_count =
+            std::min<std::size_t>(col.count, static_cast<std::size_t>(nrows));
+        std::int64_t null_count = 0;
+
+        auto fill_validity = [&]() {
+            if (!col.has_nulls) return;
+            ArrowBitmap* bm = ArrowArrayValidityBitmap(child);
+            ArrowBitmapReserve(bm, static_cast<std::int64_t>(row_count));
+            for (std::size_t r = 0; r < row_count; ++r) {
+                std::uint8_t v = col.validity[r];
+                if (!v) ++null_count;
+                ArrowBitmapAppendUnsafe(bm, v, 1);
+            }
+        };
+
         switch (col.type) {
-            case ColumnType::INT64:
-                for (size_t r = 0; r < col.count; ++r) {
-                    if (col.has_nulls && col.validity[r] == 0) {
-                        if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) {
-                            throw std::runtime_error(
-                                "ArrowArrayAppendNull failed");
-                        }
-                    } else {
-                        if (ArrowArrayAppendInt(child, col.int64_values[r]) !=
-                            NANOARROW_OK) {
-                            throw std::runtime_error(
-                                "ArrowArrayAppendInt failed");
-                        }
-                    }
+            case ColumnType::INT64: {
+                fill_validity();
+                ArrowBuffer* data_buf = ArrowArrayBuffer(child, 1);
+                if (ArrowBufferAppend(data_buf, col.int64_values.data(),
+                                      static_cast<std::int64_t>(
+                                          row_count * sizeof(std::int64_t))) !=
+                    NANOARROW_OK) {
+                    throw std::runtime_error("INT64 buffer append failed");
                 }
                 break;
-            case ColumnType::UINT64:
-                for (size_t r = 0; r < col.count; ++r) {
-                    if (col.has_nulls && col.validity[r] == 0) {
-                        if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) {
-                            throw std::runtime_error(
-                                "ArrowArrayAppendNull failed");
-                        }
-                    } else {
-                        if (ArrowArrayAppendUInt(child, col.uint64_values[r]) !=
-                            NANOARROW_OK) {
-                            throw std::runtime_error(
-                                "ArrowArrayAppendUInt failed");
-                        }
-                    }
+            }
+            case ColumnType::UINT64: {
+                fill_validity();
+                ArrowBuffer* data_buf = ArrowArrayBuffer(child, 1);
+                if (ArrowBufferAppend(data_buf, col.uint64_values.data(),
+                                      static_cast<std::int64_t>(
+                                          row_count * sizeof(std::uint64_t))) !=
+                    NANOARROW_OK) {
+                    throw std::runtime_error("UINT64 buffer append failed");
                 }
                 break;
-            case ColumnType::DOUBLE:
-                for (size_t r = 0; r < col.count; ++r) {
-                    if (col.has_nulls && col.validity[r] == 0) {
-                        if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) {
-                            throw std::runtime_error(
-                                "ArrowArrayAppendNull failed");
-                        }
-                    } else {
-                        if (ArrowArrayAppendDouble(
-                                child, col.double_values[r]) != NANOARROW_OK) {
-                            throw std::runtime_error(
-                                "ArrowArrayAppendDouble failed");
-                        }
-                    }
+            }
+            case ColumnType::DOUBLE: {
+                fill_validity();
+                ArrowBuffer* data_buf = ArrowArrayBuffer(child, 1);
+                if (ArrowBufferAppend(data_buf, col.double_values.data(),
+                                      static_cast<std::int64_t>(
+                                          row_count * sizeof(double))) !=
+                    NANOARROW_OK) {
+                    throw std::runtime_error("DOUBLE buffer append failed");
                 }
                 break;
+            }
             case ColumnType::STRING: {
-                for (size_t r = 0; r < col.count; ++r) {
+                fill_validity();
+                ArrowBuffer* offsets_buf = ArrowArrayBuffer(child, 1);
+                ArrowBuffer* data_buf = ArrowArrayBuffer(child, 2);
+                if (row_count > 0) {
+                    ArrowBufferReserve(offsets_buf,
+                                       row_count * sizeof(std::int32_t));
+                    ArrowBufferAppend(offsets_buf, col.string_offsets.data(),
+                                      static_cast<std::int64_t>(
+                                          row_count * sizeof(std::int32_t)));
+                }
+                if (!col.string_data.empty()) {
+                    ArrowBufferAppend(
+                        data_buf, col.string_data.data(),
+                        static_cast<std::int64_t>(col.string_data.size()));
+                }
+                break;
+            }
+            case ColumnType::BOOL: {
+                for (std::size_t r = 0; r < row_count; ++r) {
                     if (col.has_nulls && col.validity[r] == 0) {
                         if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) {
                             throw std::runtime_error(
-                                "ArrowArrayAppendNull failed");
+                                "ArrowArrayAppendNull(bool) failed");
                         }
+                        ++null_count;
                     } else {
-                        std::string_view sv = col.string_values[r];
-                        ArrowStringView asv{sv.data(),
-                                            static_cast<int64_t>(sv.size())};
-                        if (ArrowArrayAppendString(child, asv) !=
+                        if (ArrowArrayAppendInt(child, col.bool_values[r]) !=
                             NANOARROW_OK) {
                             throw std::runtime_error(
-                                "ArrowArrayAppendString failed");
+                                "ArrowArrayAppendInt(bool) failed");
                         }
                     }
                 }
                 break;
             }
-            case ColumnType::BOOL:
-                for (size_t r = 0; r < col.count; ++r) {
+            case ColumnType::DICT_STRING: {
+                // Build indices array (INT32)
+                for (std::size_t r = 0; r < col.count; ++r) {
                     if (col.has_nulls && col.validity[r] == 0) {
                         if (ArrowArrayAppendNull(child, 1) != NANOARROW_OK) {
                             throw std::runtime_error(
-                                "ArrowArrayAppendNull failed");
+                                "ArrowArrayAppendNull(dict) failed");
                         }
                     } else {
-                        if (ArrowArrayAppendInt(child, col.bool_values[r]) !=
+                        if (ArrowArrayAppendInt(child, col.dict_indices[r]) !=
                             NANOARROW_OK) {
                             throw std::runtime_error(
-                                "ArrowArrayAppendInt(bool) failed");
+                                "ArrowArrayAppendInt(dict index) failed");
                         }
                     }
                 }
+
+                // Build dictionary array (STRING)
+                // Allocate dictionary array
+                child->dictionary =
+                    static_cast<ArrowArray*>(ArrowMalloc(sizeof(ArrowArray)));
+                if (!child->dictionary) {
+                    throw std::runtime_error("Failed to allocate dictionary");
+                }
+                ArrowArrayInitFromType(child->dictionary,
+                                       NANOARROW_TYPE_STRING);
+                if (ArrowArrayStartAppending(child->dictionary) !=
+                    NANOARROW_OK) {
+                    throw std::runtime_error(
+                        "ArrowArrayStartAppending(dict) failed");
+                }
+                if (ArrowArrayReserve(
+                        child->dictionary,
+                        static_cast<std::int64_t>(col.dict_values.size())) !=
+                    NANOARROW_OK) {
+                    throw std::runtime_error("ArrowArrayReserve(dict) failed");
+                }
+                for (const auto& s : col.dict_values) {
+                    ArrowStringView asv{s.data(),
+                                        static_cast<std::int64_t>(s.size())};
+                    if (ArrowArrayAppendString(child->dictionary, asv) !=
+                        NANOARROW_OK) {
+                        throw std::runtime_error(
+                            "ArrowArrayAppendString(dict) failed");
+                    }
+                }
+                if (ArrowArrayFinishBuildingDefault(child->dictionary,
+                                                    nullptr) != NANOARROW_OK) {
+                    throw std::runtime_error(
+                        "ArrowArrayFinishBuildingDefault(dict) failed");
+                }
                 break;
+            }
+        }
+
+        if (col.type == ColumnType::INT64 || col.type == ColumnType::UINT64 ||
+            col.type == ColumnType::DOUBLE || col.type == ColumnType::STRING) {
+            child->length = static_cast<std::int64_t>(row_count);
+            child->null_count = col.has_nulls ? null_count : 0;
         }
 
         if (ArrowArrayFinishBuildingDefault(child, nullptr) != NANOARROW_OK) {
@@ -367,22 +529,30 @@ ArrowExportResult RecordBatchBuilder::finish() {
 }
 
 void RecordBatchBuilder::reset(bool keep_schema) {
-    if (keep_schema && schema_declared_) {
+    // Keep schema if explicitly declared OR if dynamically locked
+    if (keep_schema && (schema_declared_ || schema_locked_)) {
         for (auto& col : columns_) {
             col.int64_values.clear();
             col.uint64_values.clear();
             col.double_values.clear();
-            col.string_values.clear();
+            col.string_offsets.clear();
+            col.string_data.clear();
             col.bool_values.clear();
+            col.dict_indices.clear();
+            col.dict_values.clear();
+            col.dict_map.clear();
             col.validity.clear();
             col.count = 0;
             col.has_nulls = false;
         }
+        std::fill(touched_.begin(), touched_.end(), false);  // keep size
     } else {
         columns_.clear();
         name_to_index_.clear();
         touched_.clear();
         schema_declared_ = false;
+        schema_locked_ = false;
     }
     num_rows_ = 0;
+    row_touched_count_ = 0;  // forget appends made to an unfinished row
 }
diff --git a/src/dftracer/utils/utilities/common/arrow/ipc_reader.cpp b/src/dftracer/utils/utilities/common/arrow/ipc_reader.cpp
new file mode 100644
index 00000000..30b39673
--- /dev/null
+++ b/src/dftracer/utils/utilities/common/arrow/ipc_reader.cpp
@@ -0,0 +1,355 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <dftracer/utils/utilities/common/arrow/ipc_reader.h>
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow_ipc.h>
+
+#include <cstring>
+#include <new>
+
+// Platform-specific includes for mmap
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+
+namespace dftracer::utils::utilities::common::arrow {
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+static ArrowIpcDecoder* as_decoder(void* p) noexcept {
+    return static_cast<ArrowIpcDecoder*>(p);
+}
+
+// ---------------------------------------------------------------------------
+// Lifecycle
+// ---------------------------------------------------------------------------
+
+IpcReader::~IpcReader() { close(); }
+
+IpcReader::IpcReader(IpcReader&& other) noexcept
+    : mapped_data_(other.mapped_data_),
+      mapped_size_(other.mapped_size_),
+      fd_(other.fd_),
+      decoder_(other.decoder_),
+      shared_schema_(std::move(other.shared_schema_)),
+      blocks_(std::move(other.blocks_)),
+      num_batches_(other.num_batches_),
+      total_rows_(other.total_rows_) {
+    other.reset_state();
+}
+
+IpcReader& IpcReader::operator=(IpcReader&& other) noexcept {
+    if (this != &other) {
+        close();
+        mapped_data_ = other.mapped_data_;
+        mapped_size_ = other.mapped_size_;
+        fd_ = other.fd_;
+        decoder_ = other.decoder_;
+        shared_schema_ = std::move(other.shared_schema_);
+        blocks_ = std::move(other.blocks_);
+        num_batches_ = other.num_batches_;
+        total_rows_ = other.total_rows_;
+        other.reset_state();
+    }
+    return *this;
+}
+
+void IpcReader::reset_state() noexcept {
+    mapped_data_ = nullptr;
+    mapped_size_ = 0;
+    fd_ = -1;
+    decoder_ = nullptr;
+    shared_schema_.reset();
+    blocks_.clear();
+    num_batches_ = 0;
+    total_rows_ = 0;
+}
+
+void IpcReader::close() {
+    if (decoder_) {
+        ArrowIpcDecoderReset(as_decoder(decoder_));
+        delete as_decoder(decoder_);
+        decoder_ = nullptr;
+    }
+
+    shared_schema_.reset();
+
+#ifdef _WIN32
+    if (mapped_data_) {
+        UnmapViewOfFile(mapped_data_);
+    }
+    if (fd_ != -1) {
+        CloseHandle(reinterpret_cast<HANDLE>(fd_));
+    }
+#else
+    if (mapped_data_ && mapped_data_ != MAP_FAILED) {
+        munmap(mapped_data_, mapped_size_);
+    }
+    if (fd_ != -1) {
+        ::close(fd_);
+    }
+#endif
+
+    reset_state();
+}
+
+// ---------------------------------------------------------------------------
+// open / read_footer
+// ---------------------------------------------------------------------------
+
+int IpcReader::open(const std::string& path) {
+    if (is_open()) return -1;  // an already-open reader is not reopened
+
+#ifdef _WIN32
+    // Windows: map the file read-only (no 18-byte minimum check here)
+    HANDLE file =
+        CreateFileA(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr,
+                    OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
+    if (file == INVALID_HANDLE_VALUE) return -1;
+
+    LARGE_INTEGER size;
+    if (!GetFileSizeEx(file, &size)) {
+        CloseHandle(file);
+        return -1;
+    }
+    mapped_size_ = static_cast<std::size_t>(size.QuadPart);
+
+    HANDLE mapping =
+        CreateFileMappingA(file, nullptr, PAGE_READONLY, 0, 0, nullptr);
+    if (!mapping) {
+        CloseHandle(file);
+        return -1;
+    }
+
+    mapped_data_ = MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
+    CloseHandle(mapping);  // only the view is kept; close() unmaps it
+    if (!mapped_data_) {
+        CloseHandle(file);
+        return -1;
+    }
+    fd_ = reinterpret_cast<int>(file);  // NOTE(review): narrows on Win64
+#else
+    // POSIX: open read-only, take the size from fstat, mmap the whole file
+    fd_ = ::open(path.c_str(), O_RDONLY);
+    if (fd_ < 0) return -1;
+
+    struct stat st;
+    if (fstat(fd_, &st) < 0) {
+        ::close(fd_);
+        fd_ = -1;
+        return -1;
+    }
+    mapped_size_ = static_cast<std::size_t>(st.st_size);
+
+    // Minimum Arrow IPC file: magic(8) + footer_size(4) + magic(6) = 18 bytes
+    if (mapped_size_ < 18) {
+        ::close(fd_);
+        fd_ = -1;
+        return -1;
+    }
+
+    mapped_data_ = mmap(nullptr, mapped_size_, PROT_READ, MAP_PRIVATE, fd_, 0);
+    if (mapped_data_ == MAP_FAILED) {
+        ::close(fd_);
+        fd_ = -1;
+        mapped_data_ = nullptr;
+        return -1;
+    }
+
+    // Sequential-access hint; madvise's result is not checked
+    madvise(mapped_data_, mapped_size_, MADV_SEQUENTIAL);
+#endif
+
+    int rc = read_footer();
+    if (rc != NANOARROW_OK) {
+        close();  // releases the mapping, the file handle and the decoder
+        return rc;
+    }
+
+    return NANOARROW_OK;
+}
+
+int IpcReader::read_footer() {
+    auto* data = static_cast<const uint8_t*>(mapped_data_);
+
+    // Tail layout: ... | footer | footer_size (4 bytes) | "ARROW1" (6 bytes)
+    // Size guard first: only open()'s POSIX branch enforces the minimum.
+    if (mapped_size_ < 18 ||
+        std::memcmp(data + mapped_size_ - 6, "ARROW1", 6) != 0) {
+        return -1;
+    }
+
+    // Read footer size (4 bytes before magic)
+    std::int32_t footer_size;
+    std::memcpy(&footer_size, data + mapped_size_ - 10, sizeof(footer_size));
+
+    // Footer bounds = footer + size(4) + magic(6). Widen before adding so a
+    // corrupt size near INT32_MAX cannot overflow 32-bit arithmetic.
+    const std::int64_t footer_total_size =
+        static_cast<std::int64_t>(footer_size) + 10;
+    std::int64_t footer_offset = mapped_size_ - footer_total_size;
+    if (footer_size < 0 || footer_offset < 8) {  // Must be after file magic
+        return -1;
+    }
+
+    // Initialize decoder
+    auto* decoder = new (std::nothrow) ArrowIpcDecoder;
+    if (!decoder) return -1;
+    std::memset(decoder, 0, sizeof(ArrowIpcDecoder));
+
+    int rc = ArrowIpcDecoderInit(decoder);
+    if (rc != NANOARROW_OK) {
+        delete decoder;
+        return rc;
+    }
+    // From here on the decoder is held by decoder_ and released by close().
+    decoder_ = decoder;
+
+    // Decode footer directly from mmap'd memory (zero-copy)
+    ArrowBufferView footer_view;
+    footer_view.data.as_uint8 = data + footer_offset;
+    footer_view.size_bytes = footer_total_size;
+
+    ArrowError error;
+    rc = ArrowIpcDecoderVerifyFooter(decoder, footer_view, &error);
+    if (rc != NANOARROW_OK) return rc;
+
+    rc = ArrowIpcDecoderDecodeFooter(decoder, footer_view, &error);
+    if (rc != NANOARROW_OK) return rc;
+
+    // Footer is now available at decoder->footer
+    ArrowIpcFooter* footer = decoder->footer;
+
+    // Copy block info - decoder state may be modified by subsequent operations
+    num_batches_ =
+        footer->record_batch_blocks.size_bytes / sizeof(ArrowIpcFileBlock);
+    blocks_.resize(num_batches_);
+    auto* src_blocks = reinterpret_cast<const ArrowIpcFileBlock*>(
+        footer->record_batch_blocks.data);
+    for (std::size_t i = 0; i < num_batches_; ++i) {
+        const ArrowIpcFileBlock& fb = src_blocks[i];
+        // Blocks must end before the footer starts; read_batch() points
+        // views into the mapping at these offsets.
+        if (fb.offset < 0 || fb.metadata_length < 0 || fb.body_length < 0 ||
+            fb.offset > footer_offset - fb.metadata_length ||
+            fb.body_length > footer_offset - fb.metadata_length - fb.offset) {
+            return -1;
+        }
+        blocks_[i].offset = fb.offset;
+        blocks_[i].metadata_length = fb.metadata_length;
+        blocks_[i].body_length = fb.body_length;
+    }
+
+    // Create shared schema - deep copy once, share for all batches
+    auto* schema = new (std::nothrow) ArrowSchema;
+    if (!schema) return -1;
+    std::memset(schema, 0, sizeof(ArrowSchema));
+    rc = ArrowSchemaDeepCopy(&footer->schema, schema);
+    if (rc != NANOARROW_OK) {
+        delete schema;
+        return rc;
+    }
+
+    // Wrap in shared_ptr with custom deleter
+    shared_schema_ = std::shared_ptr<void>(schema, [](void* p) {
+        auto* s = static_cast<ArrowSchema*>(p);
+        if (s->release) s->release(s);
+        delete s;
+    });
+
+    // Set decoder's expected schema; its status is this function's result
+    return ArrowIpcDecoderSetSchema(decoder, &footer->schema, &error);
+}
+
+// ---------------------------------------------------------------------------
+// read_batch
+// ---------------------------------------------------------------------------
+
+/// Decodes record batch `index` straight out of the mapped file.
+///
+/// @param index Zero-based position in the footer's record-batch block table.
+/// @return A result holding a deep copy of the cached schema and the decoded
+///         array, or a default-constructed ArrowExportResult when the reader
+///         is closed, `index` is out of range, the block does not lie inside
+///         the mapping, or decoding fails.
+ArrowExportResult IpcReader::read_batch(std::size_t index) {
+    if (!is_open() || index >= num_batches_) {
+        return ArrowExportResult();
+    }
+
+    auto* decoder = as_decoder(decoder_);
+    auto* data = static_cast<const uint8_t*>(mapped_data_);
+    const auto& block = blocks_[index];
+
+    // The block table comes from the file itself, so check it against the
+    // mapping before building views; a corrupt footer must not cause reads
+    // outside the mapped region.
+    const auto file_size = static_cast<std::int64_t>(mapped_size_);
+    const auto offset = static_cast<std::int64_t>(block.offset);
+    const auto metadata_length =
+        static_cast<std::int64_t>(block.metadata_length);
+    const auto body_length = static_cast<std::int64_t>(block.body_length);
+    if (offset < 0 || metadata_length < 0 || body_length < 0 ||
+        offset > file_size || metadata_length > file_size - offset ||
+        body_length > file_size - offset - metadata_length) {
+        return ArrowExportResult();
+    }
+
+    // Zero-copy: point directly into mmap'd memory for header
+    ArrowBufferView header_view;
+    header_view.data.as_uint8 = data + offset;
+    header_view.size_bytes = metadata_length;
+
+    ArrowError error;
+    int rc = ArrowIpcDecoderDecodeHeader(decoder, header_view, &error);
+    if (rc != NANOARROW_OK) {
+        return ArrowExportResult();
+    }
+
+    // Zero-copy: point directly into mmap'd memory for body
+    ArrowBufferView body_view;
+    body_view.data.as_uint8 = data + offset + metadata_length;
+    body_view.size_bytes = body_length;
+
+    // Decode array
+    nanoarrow::UniqueArray array;
+    rc = ArrowIpcDecoderDecodeArray(decoder, body_view, -1, array.get(),
+                                    NANOARROW_VALIDATION_LEVEL_FULL, &error);
+    if (rc != NANOARROW_OK) {
+        return ArrowExportResult();
+    }
+
+    // Each result owns its schema, so hand out a deep copy of the one cached
+    // by read_footer().
+    auto* schema_ptr = static_cast<ArrowSchema*>(shared_schema_.get());
+    nanoarrow::UniqueSchema schema;
+    rc = ArrowSchemaDeepCopy(schema_ptr, schema.get());
+    if (rc != NANOARROW_OK) {
+        return ArrowExportResult();
+    }
+
+    return ArrowExportResult(std::move(schema), std::move(array));
+}
+
+// ---------------------------------------------------------------------------
+// read_all / for_each_batch
+// ---------------------------------------------------------------------------
+
+/// Decodes every record batch in file order and returns the valid ones.
+/// Batches for which read_batch() yields an invalid result are left out.
+std::vector<ArrowExportResult> IpcReader::read_all() {
+    std::vector<ArrowExportResult> decoded;
+    decoded.reserve(num_batches_);
+
+    std::size_t index = 0;
+    while (index < num_batches_) {
+        ArrowExportResult current = read_batch(index++);
+        if (!current.valid()) {
+            continue;
+        }
+        decoded.push_back(std::move(current));
+    }
+
+    return decoded;
+}
+
+/// Decodes batches one at a time, in file order, and passes each to
+/// `callback`.
+///
+/// @return 0 after all batches were visited, -1 as soon as a batch fails to
+///         decode, or the first non-zero value returned by `callback`.
+int IpcReader::for_each_batch(std::function<int(ArrowExportResult&)> callback) {
+    std::size_t index = 0;
+    while (index < num_batches_) {
+        ArrowExportResult current = read_batch(index);
+        if (!current.valid()) {
+            return -1;
+        }
+        if (const int status = callback(current); status != 0) {
+            return status;
+        }
+        ++index;
+    }
+    return 0;
+}
+
+}  // namespace dftracer::utils::utilities::common::arrow
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
diff --git a/src/dftracer/utils/utilities/common/arrow/ipc_writer.cpp b/src/dftracer/utils/utilities/common/arrow/ipc_writer.cpp
index 871d85f0..8ee4cb4e 100644
--- a/src/dftracer/utils/utilities/common/arrow/ipc_writer.cpp
+++ b/src/dftracer/utils/utilities/common/arrow/ipc_writer.cpp
@@ -1,199 +1,719 @@
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
 
+#include <dftracer/utils/core/coro/when_all.h>
+#include <dftracer/utils/core/io/ops.h>
 #include <dftracer/utils/utilities/common/arrow/ipc_writer.h>
+#include <fcntl.h>
+#include <flatcc/flatcc_builder.h>
+#include <nanoarrow/ipc/flatcc_generated.h>
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow_ipc.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+#include <zstd.h>
+#endif
 
-#include <cstdio>
 #include <cstring>
 #include <new>
+#include <vector>
+
+#define ns(x) FLATBUFFERS_WRAP_NAMESPACE(org_apache_arrow_flatbuf, x)
 
 namespace dftracer::utils::utilities::common::arrow {
 
+using FileBlock = ArrowIpcFileBlock;
+
+// ---------------------------------------------------------------------------
+// BufferPool
+// ---------------------------------------------------------------------------
+
+/// Creates `num_slots` buffers, each reserving `initial_capacity` bytes up
+/// front.
+BufferPool::BufferPool(std::size_t num_slots, std::size_t initial_capacity) {
+    slots_.reserve(num_slots);
+    std::size_t remaining = num_slots;
+    while (remaining-- > 0) {
+        auto fresh = std::make_unique<Slot>();
+        fresh->data.reserve(initial_capacity);
+        slots_.push_back(std::move(fresh));
+    }
+}
+
+/// Claims the first slot whose in_use flag can be flipped from false to true.
+///
+/// The claimed buffer is emptied and has capacity for at least
+/// `min_capacity` bytes.
+///
+/// @return The claimed slot, or nullptr when every slot is in use.
+BufferPool::Slot* BufferPool::acquire(std::size_t min_capacity) {
+    for (std::size_t i = 0; i < slots_.size(); ++i) {
+        Slot* candidate = slots_[i].get();
+
+        bool was_free = false;  // value the flag must hold for the claim
+        const bool claimed = candidate->in_use.compare_exchange_strong(
+            was_free, true, std::memory_order_acquire);
+        if (!claimed) {
+            continue;
+        }
+
+        if (candidate->data.capacity() < min_capacity) {
+            candidate->data.reserve(min_capacity);
+        }
+        candidate->data.clear();
+        return candidate;
+    }
+    return nullptr;  // All slots in use
+}
+
+/// Gives a slot back after use.
+///
+/// Slots owned by this pool are marked free again. Any other non-null slot
+/// is treated as an overflow slot that the caller allocated with `new`
+/// because acquire() returned nullptr (IpcWriter::compress_batch does this)
+/// and is deleted here; previously such slots were only flagged free and
+/// leaked.
+///
+/// @param slot Slot to give back; nullptr is ignored.
+void BufferPool::release(Slot* slot) {
+    if (!slot) {
+        return;
+    }
+    for (const auto& owned : slots_) {
+        if (owned.get() == slot) {
+            slot->in_use.store(false, std::memory_order_release);
+            return;
+        }
+    }
+    // Not one of ours: heap-allocated fallback, so this is its last use.
+    delete slot;
+}
+
 // ---------------------------------------------------------------------------
-// Helpers
+// Compression helpers
 // ---------------------------------------------------------------------------
 
-static ArrowIpcWriter* as_writer(void* p) noexcept {
-    return static_cast<ArrowIpcWriter*>(p);
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+
+// Location of one buffer inside the compressed record-batch body, as
+// recorded by build_compressed_body() and copied into the message header.
+struct BufferInfo {
+    std::int64_t offset;  // byte offset from the start of the body
+    std::int64_t length;  // bytes stored (8-byte size prefix + zstd data);
+                          // 0 for an empty or null buffer
+};
+
+// Appends the buffer views of `view` and all of its descendants to `out` in
+// depth-first, parent-before-children order. With `is_root` set, the buffers
+// of `view` itself are skipped and only its children contribute.
+static void collect_flat_buffers(ArrowArrayView* view,
+                                 std::vector<ArrowBufferView>& out,
+                                 bool is_root = false) {
+    if (!is_root) {
+        const std::int64_t own_buffers = ArrowArrayViewGetNumBuffers(view);
+        std::int64_t slot = 0;
+        while (slot < own_buffers) {
+            out.push_back(ArrowArrayViewGetBufferView(view, slot));
+            ++slot;
+        }
+    }
+
+    const std::int64_t child_count = view->n_children;
+    for (std::int64_t child = 0; child < child_count; ++child) {
+        collect_flat_buffers(view->children[child], out, false);
+    }
+}
+
+// Appends one FieldNode (length and null count) per array to `nodes`, in
+// depth-first, parent-before-children order. With `is_root` set, `view`
+// itself gets no node and only its children are recorded.
+static void collect_nodes(const ArrowArrayView* view,
+                          std::vector<ns(FieldNode_t)>& nodes,
+                          bool is_root = false) {
+    if (!is_root) {
+        ns(FieldNode_t) entry;
+        entry.length = view->length;
+        entry.null_count = ArrowArrayViewComputeNullCount(view);
+        nodes.push_back(entry);
+    }
+
+    std::int64_t child = 0;
+    while (child < view->n_children) {
+        collect_nodes(view->children[child], nodes, false);
+        ++child;
+    }
+}
+
+// Compresses every buffer below the root of `view` with zstd (level 3) and
+// concatenates the results into `out_body`.
+//
+// Each non-empty buffer is stored as [int64 uncompressed size][zstd frame],
+// followed by zero padding up to the next multiple of 8 bytes. `out_info`
+// receives one entry per buffer, in the order collect_flat_buffers() yields
+// them; its length excludes the padding.
+//
+// Both output vectors are cleared first. Returns 0 on success, -1 when zstd
+// reports an error (the outputs are then left partially filled).
+static int build_compressed_body(ArrowArrayView* view,
+                                 std::vector<uint8_t>& out_body,
+                                 std::vector<BufferInfo>& out_info) {
+    std::vector<ArrowBufferView> flat_buffers;
+    collect_flat_buffers(view, flat_buffers, true);
+
+    out_body.clear();
+    out_info.clear();
+    // Mirrors out_body.size(); kept as a signed value for the BufferInfo
+    // fields.
+    std::int64_t compressed_offset = 0;
+
+    for (const auto& buf : flat_buffers) {
+        std::int64_t uncompressed_size = buf.size_bytes;
+
+        // Empty or absent buffers occupy no bytes in the body but still get
+        // an entry.
+        if (uncompressed_size == 0 || buf.data.data == nullptr) {
+            out_info.push_back({compressed_offset, 0});
+            continue;
+        }
+
+        std::size_t max_compressed = ZSTD_compressBound(uncompressed_size);
+        std::size_t old_size = out_body.size();
+
+        // Reserve space: [int64 uncompressed_size][zstd data]
+        out_body.resize(old_size + 8 + max_compressed);
+
+        std::size_t compressed_size =
+            ZSTD_compress(out_body.data() + old_size + 8, max_compressed,
+                          buf.data.data, uncompressed_size, 3);
+
+        if (ZSTD_isError(compressed_size)) {
+            return -1;
+        }
+
+        // Shrink from the worst-case reservation to what was actually
+        // produced, then fill in the size prefix (host byte order).
+        std::int64_t total_size =
+            8 + static_cast<std::int64_t>(compressed_size);
+        out_body.resize(old_size + total_size);
+        std::memcpy(out_body.data() + old_size, &uncompressed_size, 8);
+
+        out_info.push_back({compressed_offset, total_size});
+
+        // Pad with zeros so the next buffer starts on an 8-byte boundary.
+        compressed_offset += total_size;
+        std::int64_t padded = (compressed_offset + 7) & ~7;
+        out_body.resize(padded, 0);
+        compressed_offset = padded;
+    }
+
+    return 0;
 }
 
-static ArrowIpcOutputStream* as_stream(void* p) noexcept {
-    return static_cast<ArrowIpcOutputStream*>(p);
+// Builds the encapsulated IPC message header for a zstd-compressed record
+// batch: continuation marker (-1), int32 metadata size, the flatbuffer
+// Message, then zero padding to an 8-byte boundary.
+//
+// The metadata size written into the prefix covers the flatbuffer plus its
+// padding, as the Arrow IPC format requires; readers compare it with the
+// metadata length stored in the file footer.
+//
+// `buffer_info` and `body_length` describe the body produced by
+// build_compressed_body(). Returns 0 on success, -1 when the flatbuffer
+// cannot be built or its size does not fit the int32 prefix; `out_header`
+// is only modified on success.
+static int build_message_header(ArrowArrayView* view,
+                                const std::vector<BufferInfo>& buffer_info,
+                                std::int64_t body_length,
+                                std::vector<uint8_t>& out_header) {
+    std::vector<ns(FieldNode_t)> nodes;
+    collect_nodes(view, nodes, true);
+
+    flatcc_builder_t builder;
+    if (flatcc_builder_init(&builder) == -1) {
+        return -1;
+    }
+    flatcc_builder_set_vtable_clustering(&builder, 0);
+
+    std::vector<ns(Buffer_t)> buffer_structs;
+    buffer_structs.reserve(buffer_info.size());
+    for (const auto& buf : buffer_info) {
+        ns(Buffer_t) b;
+        b.offset = buf.offset;
+        b.length = buf.length;
+        buffer_structs.push_back(b);
+    }
+
+    ns(BodyCompression_ref_t) compression_ref = ns(BodyCompression_create(
+        &builder, ns(CompressionType_ZSTD), ns(BodyCompressionMethod_BUFFER)));
+
+    ns(Message_start_as_root(&builder));
+    ns(Message_version_add(&builder, ns(MetadataVersion_V5)));
+    ns(Message_header_RecordBatch_start(&builder));
+    ns(RecordBatch_length_add(&builder, view->length));
+    ns(RecordBatch_nodes_create(
+        &builder, reinterpret_cast<ns(FieldNode_t)*>(nodes.data()),
+        nodes.size()));
+    ns(RecordBatch_buffers_create(&builder, buffer_structs.data(),
+                                  buffer_structs.size()));
+    ns(RecordBatch_compression_add(&builder, compression_ref));
+    ns(Message_header_RecordBatch_end(&builder));
+    ns(Message_bodyLength_add(&builder, body_length));
+    ns(Message_end_as_root(&builder));
+
+    std::size_t msg_size = 0;
+    void* msg_buf = flatcc_builder_get_direct_buffer(&builder, &msg_size);
+    void* allocated_buf = nullptr;
+
+    // Fall back to a finalized copy when no direct buffer is available; that
+    // copy has to be freed by us.
+    if (!msg_buf) {
+        msg_buf = flatcc_builder_finalize_buffer(&builder, &msg_size);
+        allocated_buf = msg_buf;
+    }
+
+    // Padding needed to bring the flatbuffer up to an 8-byte boundary.
+    const std::size_t msg_padding = (8 - (msg_size % 8)) % 8;
+    const std::size_t padded_size = msg_size + msg_padding;
+
+    // INT32_MAX: the length prefix is a signed 32-bit field.
+    if (!msg_buf || msg_size == 0 || padded_size > 0x7fffffffu) {
+        if (allocated_buf) flatcc_builder_free(allocated_buf);
+        flatcc_builder_clear(&builder);
+        return -1;
+    }
+
+    // Build IPC encapsulated message: continuation(-1) + size + metadata +
+    // padding. The size field includes the padding bytes.
+    std::int32_t continuation = -1;
+    std::int32_t msg_size_i32 = static_cast<std::int32_t>(padded_size);
+
+    out_header.clear();
+    out_header.resize(8 + padded_size);
+    std::memcpy(out_header.data(), &continuation, 4);
+    std::memcpy(out_header.data() + 4, &msg_size_i32, 4);
+    std::memcpy(out_header.data() + 8, msg_buf, msg_size);
+    // Padding bytes are already zero from resize
+
+    if (allocated_buf) flatcc_builder_free(allocated_buf);
+    flatcc_builder_clear(&builder);
+
+    return 0;
 }
 
+#endif  // DFTRACER_UTILS_ENABLE_ZSTD
+
 // ---------------------------------------------------------------------------
-// Lifecycle
+// IpcWriter lifecycle
 // ---------------------------------------------------------------------------
 
 IpcWriter::~IpcWriter() {
     if (is_open()) {
-        close();
+        // Sync close in destructor - not ideal but safe
+        if (fd_ >= 0) {
+            ::close(fd_);
+        }
+        reset_state();
     }
 }
 
-IpcWriter::IpcWriter(IpcWriter&& other) noexcept
-    : file_(other.file_),
-      schema_written_(other.schema_written_),
-      writer_(other.writer_),
-      stream_(other.stream_) {
+/// Move constructor: takes over the descriptor, write offset, buffer pool,
+/// compression setting, block table and schema copy of `other`, then calls
+/// other.reset_state() so `other` no longer refers to the descriptor or the
+/// heap objects.
+IpcWriter::IpcWriter(IpcWriter&& other) noexcept {
+    fd_ = other.fd_;
+    write_offset_ = other.write_offset_;
+    buffer_pool_ = std::move(other.buffer_pool_);
+    schema_written_ = other.schema_written_;
+    compression_ = other.compression_;
+    // Raw owning pointers: ownership transfers here and the source is
+    // nulled by reset_state() below.
+    batch_blocks_ = other.batch_blocks_;
+    schema_copy_ = other.schema_copy_;
     other.reset_state();
 }
 
 IpcWriter& IpcWriter::operator=(IpcWriter&& other) noexcept {
     if (this != &other) {
-        if (is_open()) close();
-        file_ = other.file_;
+        if (fd_ >= 0) ::close(fd_);
+        fd_ = other.fd_;
+        write_offset_ = other.write_offset_;
+        buffer_pool_ = std::move(other.buffer_pool_);
         schema_written_ = other.schema_written_;
-        writer_ = other.writer_;
-        stream_ = other.stream_;
+        compression_ = other.compression_;
+        batch_blocks_ = other.batch_blocks_;
+        schema_copy_ = other.schema_copy_;
         other.reset_state();
     }
     return *this;
 }
 
+/// Puts the writer back into its closed state. This only overwrites the
+/// members: the descriptor is not closed and the objects behind
+/// batch_blocks_ / schema_copy_ are not freed, so callers must release them
+/// first. buffer_pool_ and compression_ are left as they are.
 void IpcWriter::reset_state() noexcept {
-    file_ = nullptr;
-    writer_ = nullptr;
-    stream_ = nullptr;
+    fd_ = -1;
+    write_offset_ = 0;
     schema_written_ = false;
+    batch_blocks_ = nullptr;
+    schema_copy_ = nullptr;
 }
 
 // ---------------------------------------------------------------------------
 // open
 // ---------------------------------------------------------------------------
 
-int IpcWriter::open(const std::string& path) {
-    if (is_open()) return -1;
+/// Creates (or truncates) `path`, writes the 8-byte leading file magic and
+/// prepares the writer for batches.
+///
+/// @param path        File to create; opened O_WRONLY | O_CREAT | O_TRUNC
+///                    with mode 0644.
+/// @param compression Body compression used by later compress_batch calls.
+/// @param pool_slots  Number of slots in the freshly created buffer pool.
+/// @return 0 on success, -1 if the writer is already open, otherwise the
+///         negative result of io::open / io::pwrite.
+coro::CoroTask<int> IpcWriter::open(const std::string& path,
+                                    IpcCompression compression,
+                                    std::size_t pool_slots) {
+    if (is_open()) co_return -1;
 
-    file_ = std::fopen(path.c_str(), "wb");
-    if (!file_) return -1;
+    compression_ = compression;
+    buffer_pool_ = BufferPool(pool_slots);
 
-    // Allocate stream. ArrowIpcWriterInit takes ownership on success.
-    auto* os = new (std::nothrow) ArrowIpcOutputStream;
-    if (!os) {
-        std::fclose(file_);
-        file_ = nullptr;
-        return -1;
+    // Async open via io::open (auto-detects executor context)
+    auto result =
+        co_await io::open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+
+    if (result < 0) {
+        reset_state();
+        co_return static_cast<int>(result);
+    }
+
+    fd_ = static_cast<int>(result);
+    write_offset_ = 0;
+
+    // Write Arrow IPC file magic "ARROW1" + padding
+    constexpr char MAGIC[] = "ARROW1\0\0";
+    // NOTE(review): only a negative result is treated as failure; a short
+    // write of the magic would go unnoticed - confirm io::pwrite semantics.
+    auto write_result = co_await io::pwrite(fd_, MAGIC, 8, 0);
+    if (write_result < 0) {
+        co_await io::close(fd_);
+        reset_state();
+        co_return static_cast<int>(write_result);
+    }
+    write_offset_ = 8;
+
+    // Initialize block tracking
+    // Raw owning pointer; close() deletes it.
+    batch_blocks_ = new std::vector<FileBlock>();
+
+    co_return 0;
+}
+
+// ---------------------------------------------------------------------------
+// write_schema
+// ---------------------------------------------------------------------------
+
+/// Encodes the schema of `batch` as an IPC message, appends it to the file
+/// and keeps a deep copy of the schema for the footer.
+///
+/// @param batch Batch whose schema is written.
+/// @return 0 on success; a nanoarrow error code when copying or encoding
+///         fails; the negative io::pwrite result on a write error; -1 when
+///         fewer bytes than the whole message were written.
+coro::CoroTask<int> IpcWriter::write_schema(ArrowExportResult& batch) {
+    ArrowSchema* schema = batch.get_schema();
+
+    // Drop a copy left behind by an earlier failed attempt so that a retry
+    // does not leak it.
+    if (schema_copy_) {
+        auto* stale = static_cast<ArrowSchema*>(schema_copy_);
+        if (stale->release) stale->release(stale);
+        delete stale;
+        schema_copy_ = nullptr;
+    }
+
+    // Deep copy schema for footer. Value-initialised so `release` is null
+    // if the copy fails before setting it.
+    auto* schema_cp = new ArrowSchema();
+    int rc = ArrowSchemaDeepCopy(schema, schema_cp);
+    if (rc != NANOARROW_OK) {
+        if (schema_cp->release) schema_cp->release(schema_cp);
+        delete schema_cp;
+        co_return rc;
+    }
+    schema_copy_ = schema_cp;
+
+    // Encode schema message
+    ArrowIpcEncoder encoder;
+    rc = ArrowIpcEncoderInit(&encoder);
+    if (rc != NANOARROW_OK) {
+        co_return rc;
+    }
+
+    ArrowError error;
+    rc = ArrowIpcEncoderEncodeSchema(&encoder, schema, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowIpcEncoderReset(&encoder);
+        co_return rc;
+    }
+
+    ArrowBuffer msg_buf;
+    ArrowBufferInit(&msg_buf);
+    rc = ArrowIpcEncoderFinalizeBuffer(&encoder, 1, &msg_buf);
+    ArrowIpcEncoderReset(&encoder);
+    if (rc != NANOARROW_OK) {
+        ArrowBufferReset(&msg_buf);
+        co_return rc;
+    }
+
+    // Write schema message
+    auto result = co_await io::pwrite(fd_, msg_buf.data, msg_buf.size_bytes,
+                                      write_offset_);
+
+    if (result < 0) {
+        ArrowBufferReset(&msg_buf);
+        co_return static_cast<int>(result);
     }
-    std::memset(os, 0, sizeof(ArrowIpcOutputStream));
 
-    // close_on_release=0: we manage the FILE* ourselves.
-    int rc = ArrowIpcOutputStreamInitFile(os, file_, /*close_on_release=*/0);
+    // A short write would leave a truncated schema message while the offset
+    // advanced by the full size; report it instead.
+    if (static_cast<std::int64_t>(result) != msg_buf.size_bytes) {
+        ArrowBufferReset(&msg_buf);
+        co_return -1;
+    }
+
+    write_offset_ += msg_buf.size_bytes;
+    ArrowBufferReset(&msg_buf);
+    schema_written_ = true;
+
+    co_return 0;
+}
+
+// ---------------------------------------------------------------------------
+// encode_batch_uncompressed
+// ---------------------------------------------------------------------------
+
+// Encodes `batch` as an uncompressed IPC record-batch message using the
+// nanoarrow encoder.
+//
+// On success `out_header` holds the bytes produced by
+// ArrowIpcEncoderFinalizeBuffer and `out_body` the concatenated buffer
+// data; both are resized to fit. On failure the outputs are left untouched.
+// Returns 0 on success, otherwise the nanoarrow error code.
+static int encode_batch_uncompressed(ArrowExportResult& batch,
+                                     std::vector<uint8_t>& out_header,
+                                     std::vector<uint8_t>& out_body) {
+    ArrowSchema* schema = batch.get_schema();
+    ArrowArray* array = batch.get_array();
+
+    ArrowArrayView view;
+    ArrowError error;
+    int rc = ArrowArrayViewInitFromSchema(&view, schema, &error);
     if (rc != NANOARROW_OK) {
-        delete os;
-        std::fclose(file_);
-        file_ = nullptr;
         return rc;
     }
 
-    auto* w = new (std::nothrow) ArrowIpcWriter;
-    if (!w) {
-        // os not yet consumed — release it manually
-        if (os->release) os->release(os);
-        delete os;
-        std::fclose(file_);
-        file_ = nullptr;
-        return -1;
+    rc = ArrowArrayViewSetArray(&view, array, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowArrayViewReset(&view);
+        return rc;
     }
-    std::memset(w, 0, sizeof(ArrowIpcWriter));
 
-    // ArrowIpcWriterInit takes ownership of *os (moves it internally).
-    rc = ArrowIpcWriterInit(w, os);
+    // NOTE(review): the result of ArrowIpcEncoderInit is not checked.
+    ArrowIpcEncoder encoder;
+    ArrowIpcEncoderInit(&encoder);
+
+    ArrowBuffer body_buf;
+    ArrowBufferInit(&body_buf);
+
+    // Fills body_buf with the buffer data; the message metadata stays inside
+    // the encoder until it is finalized below.
+    rc = ArrowIpcEncoderEncodeSimpleRecordBatch(&encoder, &view, &body_buf,
+                                                &error);
     if (rc != NANOARROW_OK) {
-        // Init failed: writer did not take ownership, release stream ourselves.
-        if (os->release) os->release(os);
-        delete os;
-        delete w;
-        std::fclose(file_);
-        file_ = nullptr;
+        ArrowBufferReset(&body_buf);
+        ArrowIpcEncoderReset(&encoder);
+        ArrowArrayViewReset(&view);
         return rc;
     }
 
-    // Keep os pointer so we can delete the allocation in close().
-    // The writer owns the stream contents; we only own the heap allocation.
-    stream_ = os;
-    writer_ = w;
+    ArrowBuffer header_buf;
+    ArrowBufferInit(&header_buf);
 
-    // Write IPC file magic.
-    rc = ArrowIpcWriterStartFile(w, nullptr);
+    // Second argument 1: presumably requests the encapsulated form
+    // (continuation + size prefix) - confirm against the nanoarrow docs.
+    rc = ArrowIpcEncoderFinalizeBuffer(&encoder, 1, &header_buf);
     if (rc != NANOARROW_OK) {
-        ArrowIpcWriterReset(w);
-        delete w;
-        delete os;
-        std::fclose(file_);
-        reset_state();
+        ArrowBufferReset(&body_buf);
+        ArrowBufferReset(&header_buf);
+        ArrowIpcEncoderReset(&encoder);
+        ArrowArrayViewReset(&view);
         return rc;
     }
 
-    return NANOARROW_OK;
+    // Copy out of the nanoarrow buffers so they can be released here.
+    out_header.resize(header_buf.size_bytes);
+    std::memcpy(out_header.data(), header_buf.data, header_buf.size_bytes);
+
+    out_body.resize(body_buf.size_bytes);
+    std::memcpy(out_body.data(), body_buf.data, body_buf.size_bytes);
+
+    ArrowBufferReset(&body_buf);
+    ArrowBufferReset(&header_buf);
+    ArrowIpcEncoderReset(&encoder);
+    ArrowArrayViewReset(&view);
+
+    return 0;
 }
 
 // ---------------------------------------------------------------------------
-// write_batch
+// compress_batch
 // ---------------------------------------------------------------------------
 
-int IpcWriter::write_batch(ArrowExportResult& batch) {
-    if (!is_open() || !batch.valid()) return -1;
+/// Encodes one batch into a header plus body ready for write_compressed().
+///
+/// The body lives in a slot taken from buffer_pool_, or in a heap-allocated
+/// slot when the pool is exhausted. With IpcCompression::NONE the nanoarrow
+/// encoder is used; with ZSTD (when compiled in) each buffer is compressed
+/// individually.
+///
+/// @param batch Batch to encode.
+/// @return The encoded batch. Failure is signalled by an empty `header`; in
+///         that case the body slot has already been given back and
+///         `body_slot` is nullptr.
+coro::CoroTask<IpcWriter::CompressedBatch> IpcWriter::compress_batch(
+    ArrowExportResult& batch) {
+    CompressedBatch result{};
+
+    // Remembers where result.body_slot came from so that failure paths can
+    // hand it back instead of leaving a pool slot marked in-use (or leaking
+    // the heap fallback).
+    bool slot_from_pool = false;
+    auto discard_slot = [&]() {
+        if (result.body_slot) {
+            if (slot_from_pool) {
+                buffer_pool_.release(result.body_slot);
+            } else {
+                delete result.body_slot;
+            }
+            result.body_slot = nullptr;
+        }
+        // An empty header is the failure signal callers test for.
+        result.header.clear();
+    };
+
+    if (compression_ == IpcCompression::NONE) {
+        result.body_slot = buffer_pool_.acquire(64 * 1024);
+        slot_from_pool = result.body_slot != nullptr;
+        if (!slot_from_pool) {
+            result.body_slot = new BufferPool::Slot();
+        }
+
+        int rc = encode_batch_uncompressed(batch, result.header,
+                                           result.body_slot->data);
+        if (rc != 0) {
+            discard_slot();
+            co_return result;
+        }
+
+        result.body_size = result.body_slot->data.size();
+        result.body_length = static_cast<std::int64_t>(result.body_size);
+        result.metadata_length =
+            static_cast<std::int32_t>(result.header.size());
+        co_return result;
+    }
+
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+    if (compression_ != IpcCompression::ZSTD) {
+        co_return result;
+    }
 
-    ArrowIpcWriter* w = as_writer(writer_);
     ArrowSchema* schema = batch.get_schema();
     ArrowArray* array = batch.get_array();
 
-    // Write schema once, before the first record batch.
-    if (!schema_written_) {
-        int rc = ArrowIpcWriterWriteSchema(w, schema, nullptr);
-        if (rc != NANOARROW_OK) return rc;
-        schema_written_ = true;
-    }
-
-    // Build an ArrowArrayView from the schema + array.
     ArrowArrayView view;
     ArrowError error;
     int rc = ArrowArrayViewInitFromSchema(&view, schema, &error);
     if (rc != NANOARROW_OK) {
         ArrowArrayViewReset(&view);
-        return rc;
+        co_return result;
     }
 
     rc = ArrowArrayViewSetArray(&view, array, &error);
     if (rc != NANOARROW_OK) {
         ArrowArrayViewReset(&view);
-        return rc;
+        co_return result;
+    }
+
+    // Acquire pooled buffer for compressed body, sized for the worst case
+    // (zstd bound plus the 8-byte size prefix per buffer).
+    std::size_t estimated_size = 0;
+    std::vector<ArrowBufferView> flat_buffers;
+    collect_flat_buffers(&view, flat_buffers, true);
+    for (const auto& buf : flat_buffers) {
+        estimated_size += ZSTD_compressBound(buf.size_bytes) + 8;
+    }
+
+    result.body_slot = buffer_pool_.acquire(estimated_size);
+    slot_from_pool = result.body_slot != nullptr;
+    if (!slot_from_pool) {
+        result.body_slot = new BufferPool::Slot();
+        result.body_slot->data.reserve(estimated_size);
+    }
+
+    // Compress into pooled buffer
+    std::vector<BufferInfo> buffer_info;
+    rc = build_compressed_body(&view, result.body_slot->data, buffer_info);
+    if (rc != 0) {
+        discard_slot();
+        ArrowArrayViewReset(&view);
+        co_return result;
+    }
+
+    result.body_size = result.body_slot->data.size();
+    result.body_length = static_cast<std::int64_t>(result.body_size);
+
+    // Build message header
+    rc = build_message_header(&view, buffer_info, result.body_length,
+                              result.header);
+    if (rc != 0) {
+        discard_slot();
+        ArrowArrayViewReset(&view);
+        co_return result;
     }
 
-    rc = ArrowIpcWriterWriteArrayView(w, &view, nullptr);
+    result.metadata_length = static_cast<std::int32_t>(result.header.size());
+
     ArrowArrayViewReset(&view);
-    return rc;
+#endif
+
+    co_return result;
 }
 
 // ---------------------------------------------------------------------------
-// close
+// write_compressed
 // ---------------------------------------------------------------------------
 
-int IpcWriter::close() {
-    if (!is_open()) return 0;
+/// Appends one encoded batch (header followed by body) at write_offset_
+/// with a single vectored write and records its file block for the footer.
+///
+/// The body slot is given back to the pool whether or not the write
+/// succeeds, and `cb.body_slot` is set to nullptr.
+///
+/// @param cb Batch produced by compress_batch(); must have a body slot.
+/// @return 0 on success; the negative io::pwritev result on a write error;
+///         -1 when the writer has no block table, `cb` has no body slot, or
+///         fewer bytes than header + body were written.
+coro::CoroTask<int> IpcWriter::write_compressed(CompressedBatch& cb) {
+    auto* blocks = static_cast<std::vector<FileBlock>*>(batch_blocks_);
+    if (!blocks || !cb.body_slot) co_return -1;
 
-    int rc = NANOARROW_OK;
-    ArrowIpcWriter* w = as_writer(writer_);
+    // Record block info
+    FileBlock block;
+    block.offset = write_offset_;
+    block.metadata_length = cb.metadata_length;
+    block.body_length = cb.body_length;
 
-    if (w && schema_written_) {
-        rc = ArrowIpcWriterFinalizeFile(w, nullptr);
+    // Vectored write: header + body
+    struct iovec iov[2];
+    iov[0].iov_base = cb.header.data();
+    iov[0].iov_len = cb.header.size();
+    iov[1].iov_base = cb.body_slot->data.data();
+    iov[1].iov_len = cb.body_size;
+    const std::size_t expected_bytes = cb.header.size() + cb.body_size;
+
+    auto result = co_await io::pwritev(fd_, iov, 2, write_offset_);
+
+    // Release pooled buffer
+    buffer_pool_.release(cb.body_slot);
+    cb.body_slot = nullptr;
+
+    if (result < 0) {
+        co_return static_cast<int>(result);
     }
 
-    if (w) {
-        ArrowIpcWriterReset(w);
-        delete w;
+    // The recorded block describes the full header and body, so a short
+    // write must fail rather than leave a footer entry pointing at
+    // truncated data.
+    if (static_cast<std::size_t>(result) != expected_bytes) {
+        co_return -1;
+    }
+
+    write_offset_ += result;
+    blocks->push_back(block);
+
+    co_return 0;
+}
+
+// ---------------------------------------------------------------------------
+// write_batch
+// ---------------------------------------------------------------------------
+
+/// Writes a single batch, emitting the schema message first if it has not
+/// been written yet.
+///
+/// @param batch Batch to append; must be valid.
+/// @return 0 on success; -1 if the writer is closed, `batch` is invalid or
+///         encoding failed; otherwise the error from write_schema() or
+///         write_compressed().
+coro::CoroTask<int> IpcWriter::write_batch(ArrowExportResult& batch) {
+    if (!is_open() || !batch.valid()) co_return -1;
+
+    // Write schema on first batch
+    if (!schema_written_) {
+        int rc = co_await write_schema(batch);
+        if (rc != 0) co_return rc;
     }
 
-    // The stream allocation is ours; its contents were released by Reset.
-    if (stream_) {
-        delete as_stream(stream_);
+    // Compress and write
+    auto cb = co_await compress_batch(batch);
+    // compress_batch() reports failure through an empty header.
+    if (cb.header.empty()) co_return -1;
+
+    co_return co_await write_compressed(cb);
+}
+
+// ---------------------------------------------------------------------------
+// write_batches (parallel compression)
+// ---------------------------------------------------------------------------
+
+/// Encodes all valid batches via when_all() and then writes them one after
+/// another in input order. The schema message is taken from the first
+/// batch when it has not been written yet.
+///
+/// Writing stops at the first failure; body slots of batches that were
+/// encoded but not written are handed back to the pool so they cannot stay
+/// marked in-use.
+///
+/// @param batches Batches to append; invalid entries are skipped.
+/// @return 0 on success (also for an empty input); -1 if the writer is
+///         closed or a batch failed to encode; otherwise the first error
+///         from write_schema() or write_compressed().
+coro::CoroTask<int> IpcWriter::write_batches(
+    std::vector<ArrowExportResult>& batches) {
+    if (!is_open()) co_return -1;
+    if (batches.empty()) co_return 0;
+
+    // Write schema from first batch
+    if (!schema_written_) {
+        int rc = co_await write_schema(batches[0]);
+        if (rc != 0) co_return rc;
+    }
+
+    // Parallel compress all batches
+    std::vector<coro::CoroTask<CompressedBatch>> compress_tasks;
+    compress_tasks.reserve(batches.size());
+
+    for (auto& batch : batches) {
+        if (batch.valid()) {
+            compress_tasks.push_back(compress_batch(batch));
+        }
+    }
+
+    auto compressed = co_await coro::when_all(std::move(compress_tasks));
+
+    // Write in order (sequential to maintain file structure)
+    int status = 0;
+    for (auto& cb : compressed) {
+        if (status == 0) {
+            if (cb.header.empty()) {
+                status = -1;
+            } else {
+                status = co_await write_compressed(cb);
+            }
+        }
+        // After a failure nothing else is written; give back any slot that
+        // write_compressed() did not already release.
+        if (status != 0 && cb.body_slot) {
+            buffer_pool_.release(cb.body_slot);
+            cb.body_slot = nullptr;
+        }
+    }
+
+    co_return status;
+}
+
+// ---------------------------------------------------------------------------
+// write_footer
+// ---------------------------------------------------------------------------
+
+/// Writes the end of the IPC file: end-of-stream marker, footer flatbuffer,
+/// int32 footer size and the trailing "ARROW1" magic.
+///
+/// The schema kept by write_schema() is moved into the footer, so
+/// schema_copy_ points at a released ArrowSchema afterwards.
+///
+/// @return 0 on success; -1 when no block table or schema is available or
+///         fewer bytes than the whole footer were written; a nanoarrow
+///         error code when encoding fails; the negative io::pwrite result
+///         on a write error.
+coro::CoroTask<int> IpcWriter::write_footer() {
+    auto* blocks = static_cast<std::vector<FileBlock>*>(batch_blocks_);
+    auto* schema = static_cast<ArrowSchema*>(schema_copy_);
+
+    if (!blocks || !schema) {
+        co_return -1;
+    }
+
+    ArrowIpcFooter footer;
+    ArrowIpcFooterInit(&footer);
+    ArrowSchemaMove(schema, &footer.schema);
+
+    for (const auto& block : *blocks) {
+        int rc = ArrowBufferAppend(&footer.record_batch_blocks, &block,
+                                   sizeof(FileBlock));
+        if (rc != NANOARROW_OK) {
+            ArrowIpcFooterReset(&footer);
+            co_return rc;
+        }
     }
 
-    std::fclose(file_);
+    ArrowIpcEncoder encoder;
+    int rc = ArrowIpcEncoderInit(&encoder);
+    if (rc != NANOARROW_OK) {
+        ArrowIpcFooterReset(&footer);
+        co_return rc;
+    }
+
+    ArrowError error;
+    rc = ArrowIpcEncoderEncodeFooter(&encoder, &footer, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowIpcEncoderReset(&encoder);
+        ArrowIpcFooterReset(&footer);
+        co_return rc;
+    }
+
+    ArrowBuffer footer_buf;
+    ArrowBufferInit(&footer_buf);
+    rc = ArrowIpcEncoderFinalizeBuffer(&encoder, 0, &footer_buf);
+    ArrowIpcEncoderReset(&encoder);
+    if (rc != NANOARROW_OK) {
+        ArrowBufferReset(&footer_buf);
+        ArrowIpcFooterReset(&footer);
+        co_return rc;
+    }
+
+    // Build footer: EOS marker + footer + footer_size + magic
+    std::vector<uint8_t> footer_data;
+    footer_data.resize(8 + footer_buf.size_bytes + 4 + 6);
+
+    std::int32_t eos_continuation = -1;
+    std::int32_t eos_size = 0;
+    std::memcpy(footer_data.data(), &eos_continuation, 4);
+    std::memcpy(footer_data.data() + 4, &eos_size, 4);
+    std::memcpy(footer_data.data() + 8, footer_buf.data, footer_buf.size_bytes);
+
+    std::int32_t footer_size = static_cast<std::int32_t>(footer_buf.size_bytes);
+    std::memcpy(footer_data.data() + 8 + footer_buf.size_bytes, &footer_size,
+                4);
+    std::memcpy(footer_data.data() + 8 + footer_buf.size_bytes + 4, "ARROW1",
+                6);
+
+    ArrowBufferReset(&footer_buf);
+    ArrowIpcFooterReset(&footer);
+
+    // Write footer
+    auto result = co_await io::pwrite(fd_, footer_data.data(),
+                                      footer_data.size(), write_offset_);
+
+    if (result < 0) {
+        co_return static_cast<int>(result);
+    }
+
+    // Readers locate the footer from the last bytes of the file, so a
+    // partially written footer makes the file unreadable; report it.
+    if (static_cast<std::size_t>(result) != footer_data.size()) {
+        co_return -1;
+    }
+
+    write_offset_ += result;
+    co_return 0;
+}
+
+// ---------------------------------------------------------------------------
+// close
+// ---------------------------------------------------------------------------
+
+/// Finishes the file and releases everything the writer owns.
+///
+/// The footer is written only when a schema message was written and the
+/// block table exists. The schema copy and block table are freed, the
+/// descriptor is fsync'ed and closed, and the writer returns to the closed
+/// state even when writing the footer failed.
+///
+/// @return 0 if the writer was not open or everything succeeded; otherwise
+///         the error from write_footer().
+coro::CoroTask<int> IpcWriter::close() {
+    if (!is_open()) co_return 0;
+
+    int rc = 0;
+
+    // Write footer
+    if (schema_written_ && batch_blocks_) {
+        rc = co_await write_footer();
+    }
+
+    // Cleanup
+    if (schema_copy_) {
+        auto* schema = static_cast<ArrowSchema*>(schema_copy_);
+        // release is already null when write_footer() moved the schema out.
+        if (schema->release) schema->release(schema);
+        delete schema;
+    }
+    if (batch_blocks_) {
+        delete static_cast<std::vector<FileBlock>*>(batch_blocks_);
+    }
+
+    // Fsync and close
+    // NOTE(review): the results of io::fsync and io::close are discarded,
+    // so a failed flush is not reported to the caller.
+    co_await io::fsync(fd_);
+    co_await io::close(fd_);
+
     reset_state();
-    return rc;
+    co_return rc;
 }
 
 }  // namespace dftracer::utils::utilities::common::arrow
diff --git a/src/dftracer/utils/utilities/common/arrow/parallel_reader.cpp b/src/dftracer/utils/utilities/common/arrow/parallel_reader.cpp
new file mode 100644
index 00000000..fba3cdc3
--- /dev/null
+++ b/src/dftracer/utils/utilities/common/arrow/parallel_reader.cpp
@@ -0,0 +1,111 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <dftracer/utils/core/coro/when_all.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/common/arrow/ipc_reader.h>
+#include <dftracer/utils/utilities/common/arrow/parallel_reader.h>
+
+#include <exception>
+
+namespace dftracer::utils::utilities::common::arrow {
+
+using dftracer::utils::coro::CoroTask;
+using dftracer::utils::coro::when_all;
+
+CoroTask<ArrowFileReadResult> read_arrow_file_async(std::string path) {
+    // Per-file outcome; std::exception failures are reported through it.
+    ArrowFileReadResult outcome;
+    outcome.path = path;
+    try {
+        IpcReader ipc;
+        if (const int status = ipc.open(path); status != 0) {
+            outcome.success = false;
+            outcome.error = "Failed to open file: " + path;
+            co_return outcome;
+        }
+
+        // Pull in every batch, then tally rows across them.
+        *outcome.batches = ipc.read_all();
+        for (const auto& loaded : *outcome.batches) {
+            outcome.total_rows += loaded.num_rows();
+        }
+
+        outcome.success = true;
+    } catch (const std::exception& ex) {
+        outcome.success = false;
+        outcome.error = ex.what();
+    }
+
+    co_return outcome;
+}
+
+CoroTask<ParallelReadResult> read_arrow_files_parallel(
+    std::vector<std::string> paths) {
+    ParallelReadResult summary;
+    if (paths.empty()) co_return summary;  // nothing to read
+
+    // One reader coroutine per path; each path string is moved into its task.
+    std::vector<CoroTask<ArrowFileReadResult>> readers;
+    readers.reserve(paths.size());
+    for (std::string& file : paths) {
+        readers.push_back(read_arrow_file_async(std::move(file)));
+    }
+
+    // Wait for every reader, keeping the per-file results in the summary.
+    summary.file_results = co_await when_all(std::move(readers));
+
+    // Fold the per-file outcomes into the aggregate counters.
+    for (const auto& outcome : summary.file_results) {
+        if (!outcome.success) {
+            summary.files_failed++;
+            continue;
+        }
+
+        summary.files_read++;
+        summary.total_rows += outcome.total_rows;
+        summary.total_batches += outcome.batches->size();
+    }
+
+    co_return summary;
+}
+
+CoroTask<ParallelReadResult> read_arrow_files_streaming(
+    CoroScope& /*scope*/, std::vector<std::string> paths,
+    FileResultCallback callback) {
+    if (paths.empty()) co_return ParallelReadResult{};
+
+    // Start one reader per path; each path string is moved into its task.
+    std::vector<CoroTask<ArrowFileReadResult>> readers;
+    readers.reserve(paths.size());
+    for (std::string& file : paths) {
+        readers.push_back(read_arrow_file_async(std::move(file)));
+    }
+
+    // All results are awaited here, before any callback invocation below.
+    auto outcomes = co_await when_all(std::move(readers));
+
+    ParallelReadResult totals;
+    bool stop_delivery = false;
+
+    for (auto& outcome : outcomes) {
+        // Counters cover all files, including ones the callback never sees.
+        if (outcome.success) {
+            totals.files_read++;
+            totals.total_rows += outcome.total_rows;
+            totals.total_batches += outcome.batches->size();
+        } else {
+            totals.files_failed++;
+        }
+
+        // Deliver results until the callback first returns false.
+        if (stop_delivery) continue;
+        stop_delivery = !callback(std::move(outcome));
+    }
+
+    co_return totals;
+}
+
+}  // namespace dftracer::utils::utilities::common::arrow
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
diff --git a/src/dftracer/utils/utilities/common/arrow/partition_router.cpp b/src/dftracer/utils/utilities/common/arrow/partition_router.cpp
new file mode 100644
index 00000000..d7dad205
--- /dev/null
+++ b/src/dftracer/utils/utilities/common/arrow/partition_router.cpp
@@ -0,0 +1,623 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/utilities/common/arrow/column_builder.h>
+#include <dftracer/utils/utilities/common/arrow/partition_router.h>
+#include <nanoarrow/nanoarrow.h>
+
+#include <cstring>
+#include <functional>
+#include <iomanip>
+#include <sstream>
+
+namespace dftracer::utils::utilities::common::arrow {
+
+namespace {
+
+std::string extract_string(const ArrowArrayView* view, int64_t idx) {
+    const ArrowStringView text = ArrowArrayViewGetStringUnsafe(view, idx);
+    return std::string(text.data, text.size_bytes);  // copies the cell bytes
+}
+
+int64_t extract_int64(const ArrowArrayView* view, int64_t idx) {
+    return ArrowArrayViewGetIntUnsafe(view, idx);  // INT8..INT64 and BOOL cells
+}
+
+uint64_t extract_uint64(const ArrowArrayView* view, int64_t idx) {
+    // Callers in this file use it for UINT8..UINT64 cells.
+    return static_cast<uint64_t>(ArrowArrayViewGetUIntUnsafe(view, idx));
+}
+
+double extract_double(const ArrowArrayView* view, int64_t idx) {
+    return ArrowArrayViewGetDoubleUnsafe(view, idx);  // FLOAT and DOUBLE cells
+}
+
+bool is_null(const ArrowArrayView* view, int64_t idx) {
+    return ArrowArrayViewIsNull(view, idx);  // nanoarrow's result as a bool
+}
+
+std::string value_to_string(const ArrowArrayView* view, int64_t idx) {
+    // Null cells map to a fixed sentinel string.
+    if (is_null(view, idx)) return "__null__";
+
+    const ArrowType kind = view->storage_type;
+    switch (kind) {
+        case NANOARROW_TYPE_STRING:
+        case NANOARROW_TYPE_LARGE_STRING:
+            return extract_string(view, idx);
+        case NANOARROW_TYPE_INT64:
+        case NANOARROW_TYPE_INT32:
+        case NANOARROW_TYPE_INT16:
+        case NANOARROW_TYPE_INT8:
+            return std::to_string(extract_int64(view, idx));
+        case NANOARROW_TYPE_UINT64:
+        case NANOARROW_TYPE_UINT32:
+        case NANOARROW_TYPE_UINT16:
+        case NANOARROW_TYPE_UINT8:
+            return std::to_string(extract_uint64(view, idx));
+        case NANOARROW_TYPE_DOUBLE:
+        case NANOARROW_TYPE_FLOAT:
+            return std::to_string(extract_double(view, idx));  // "%f" format
+        case NANOARROW_TYPE_BOOL:
+            return extract_int64(view, idx) ? "true" : "false";
+        default:
+            return "__unsupported__";  // any storage type not listed above
+    }
+}
+
+ColumnType nanoarrow_to_column_type(ArrowType type) {
+    switch (type) {  // maps a nanoarrow storage type to a builder column type
+        case NANOARROW_TYPE_INT64:
+        case NANOARROW_TYPE_INT32:
+        case NANOARROW_TYPE_INT16:
+        case NANOARROW_TYPE_INT8:
+            return ColumnType::INT64;  // every signed width maps to INT64
+        case NANOARROW_TYPE_UINT64:
+        case NANOARROW_TYPE_UINT32:
+        case NANOARROW_TYPE_UINT16:
+        case NANOARROW_TYPE_UINT8:
+            return ColumnType::UINT64;  // every unsigned width maps to UINT64
+        case NANOARROW_TYPE_DOUBLE:
+        case NANOARROW_TYPE_FLOAT:
+            return ColumnType::DOUBLE;  // FLOAT is declared as DOUBLE
+        case NANOARROW_TYPE_STRING:
+        case NANOARROW_TYPE_LARGE_STRING:
+            return ColumnType::STRING;
+        case NANOARROW_TYPE_BOOL:
+            return ColumnType::BOOL;
+        default:
+            return ColumnType::STRING;  // unlisted types are declared STRING
+    }
+}
+
+uint64_t fnv1a_hash(const std::string& s) {
+    uint64_t hash = 14695981039346656037ULL;  // FNV-1a 64-bit offset basis
+    for (const char c : s) {
+        // Go through unsigned char so bytes >= 0x80 are not sign-extended
+        // where char is signed; the result is then the same on all platforms.
+        hash = (hash ^ static_cast<unsigned char>(c)) * 1099511628211ULL;
+    }
+    return hash;
+}
+
+}  // namespace
+
+PartitionRouter::~PartitionRouter() = default;  // members clean up themselves
+
+PartitionRouter::PartitionRouter(PartitionRouter&& other) noexcept
+    : output_dir_(std::move(other.output_dir_)),
+      config_(std::move(other.config_)),
+      chunk_size_bytes_(other.chunk_size_bytes_),
+      compression_(other.compression_),
+      is_open_(other.is_open_),  // the open/closed state travels with the move
+      writers_(std::move(other.writers_)),
+      predicates_(std::move(other.predicates_)) {
+    other.is_open_ = false;  // the moved-from router reports itself closed
+}
+
+PartitionRouter& PartitionRouter::operator=(PartitionRouter&& other) noexcept {
+    if (this == &other) return *this;  // self-move changes nothing
+
+    output_dir_ = std::move(other.output_dir_);
+    config_ = std::move(other.config_);
+    chunk_size_bytes_ = other.chunk_size_bytes_;
+    compression_ = other.compression_;
+    is_open_ = other.is_open_;
+    writers_ = std::move(other.writers_);
+    predicates_ = std::move(other.predicates_);
+    other.is_open_ = false;  // the source is left marked closed
+    return *this;
+}
+
+int PartitionRouter::open(const std::string& output_dir,
+                          const PartitionConfig& config,
+                          int64_t chunk_size_bytes,
+                          IpcCompression compression) {
+    // Opening an already open router is rejected with -1.
+    if (is_open_) return -1;
+    std::error_code mkdir_err;
+    fs::create_directories(output_dir, mkdir_err);
+    if (mkdir_err) return -1;
+
+    // Adopt the new settings; predicates registered earlier are dropped too.
+    output_dir_ = output_dir;
+    config_ = config;
+    chunk_size_bytes_ = chunk_size_bytes;
+    compression_ = compression;
+    writers_.clear();
+    predicates_.clear();
+    is_open_ = true;
+    return 0;
+}
+
+void PartitionRouter::register_predicate(const std::string& view_name,
+                                         PredicateEvaluator evaluator) {
+    predicates_[view_name] = std::move(evaluator);  // replaces any prior entry
+}
+
+std::string PartitionRouter::partition_path(
+    const std::string& partition_key) const {
+    // An empty key stands for the output directory itself.
+    if (partition_key.empty()) return output_dir_;
+    const fs::path joined = fs::path(output_dir_) / partition_key;
+    return joined.string();
+}
+
+coro::CoroTask<PartitionWriter*> PartitionRouter::get_or_create_writer(
+    const std::string& partition_key) {
+    // Reuse the writer if this partition key already has one.
+    if (auto hit = writers_.find(partition_key); hit != writers_.end()) {
+        co_return hit->second.get();
+    }
+
+    // Otherwise open one in the partition directory (nullptr on failure).
+    auto fresh = std::make_unique<PartitionWriter>();
+    std::string dir = partition_path(partition_key);
+    const int status =
+        co_await fresh->open(dir, chunk_size_bytes_, compression_);
+    if (status != 0) co_return nullptr;
+    PartitionWriter* raw = fresh.get();
+    writers_[partition_key] = std::move(fresh);
+    co_return raw;
+}
+
+int PartitionRouter::compute_bucket(
+    const std::vector<std::string>& values) const {
+    if (config_.num_buckets <= 0) return 0;  // no modulo by zero / negative
+    std::string combined;
+    for (const auto& v : values) {
+        combined.append(v).push_back('\0');  // '\0' separates values
+    }
+    return static_cast<int>(fnv1a_hash(combined) %
+                            static_cast<uint64_t>(config_.num_buckets));
+}
+
+coro::CoroTask<int> PartitionRouter::route_none(ArrowExportResult& batch) {
+    PartitionWriter* sink = co_await get_or_create_writer("");  // "" = root
+    if (sink == nullptr) co_return -1;
+    co_return co_await sink->write_batch(batch);
+}
+
+coro::CoroTask<int> PartitionRouter::route_column(ArrowExportResult& batch) {
+    ArrowSchema* schema = batch.get_schema();
+    ArrowArray* array = batch.get_array();
+    int64_t num_rows = batch.num_rows();
+    // COLUMN mode: rows are regrouped by their partition-column values.
+    if (num_rows == 0) co_return 0;
+
+    ArrowArrayView view;
+    ArrowError error;
+    int rc = ArrowArrayViewInitFromSchema(&view, schema, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowArrayViewReset(&view);
+        co_return rc;
+    }
+    rc = ArrowArrayViewSetArray(&view, array, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowArrayViewReset(&view);
+        co_return rc;
+    }
+    // Resolve every configured partition column to its child index.
+    std::vector<int64_t> partition_col_indices;
+    for (const auto& col_name : config_.partition_columns) {
+        for (int64_t i = 0; i < schema->n_children; i++) {
+            if (schema->children[i]->name == col_name) {
+                partition_col_indices.push_back(i);
+                break;
+            }
+        }
+    }
+    // A configured column that is absent from the schema is an error.
+    if (partition_col_indices.size() != config_.partition_columns.size()) {
+        ArrowArrayViewReset(&view);
+        co_return -1;
+    }
+    std::unordered_map<std::string, std::vector<int64_t>> partition_rows;
+    // Key is "col=value[/col=value...]"; path separators inside a value are
+    // replaced so a value cannot add directory levels or leave output_dir_.
+    for (int64_t row = 0; row < num_rows; row++) {
+        std::string partition_key;
+        for (size_t i = 0; i < partition_col_indices.size(); i++) {
+            int64_t col_idx = partition_col_indices[i];
+            const ArrowArrayView* col_view = view.children[col_idx];
+            std::string value = value_to_string(col_view, row);
+            for (char& ch : value) if (ch == '/' || ch == '\\') ch = '_';
+            if (i > 0) partition_key += "/";
+            partition_key += config_.partition_columns[i] + "=" + value;
+        }
+        partition_rows[partition_key].push_back(row);
+    }
+    // Output columns keep their names; types follow nanoarrow_to_column_type.
+    std::vector<ColumnSpec> col_specs;
+    col_specs.reserve(schema->n_children);
+    for (int64_t i = 0; i < schema->n_children; i++) {
+        ArrowType type = view.children[i]->storage_type;
+        col_specs.push_back(
+            {schema->children[i]->name, nanoarrow_to_column_type(type)});
+    }
+    // Rebuild one sub-batch per key and hand it to that key's writer.
+    for (auto& [partition_key, rows] : partition_rows) {
+        RecordBatchBuilder builder;
+        builder.declare_schema(col_specs);
+        builder.reserve(rows.size());
+
+        for (int64_t row : rows) {
+            for (int64_t col = 0; col < schema->n_children; col++) {
+                const ArrowArrayView* col_view = view.children[col];
+
+                if (is_null(col_view, row)) {
+                    builder.append_null(col);
+                } else {
+                    switch (col_view->storage_type) {
+                        case NANOARROW_TYPE_INT64:
+                        case NANOARROW_TYPE_INT32:
+                        case NANOARROW_TYPE_INT16:
+                        case NANOARROW_TYPE_INT8:
+                            builder.append_int64(col,
+                                                 extract_int64(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_UINT64:
+                        case NANOARROW_TYPE_UINT32:
+                        case NANOARROW_TYPE_UINT16:
+                        case NANOARROW_TYPE_UINT8:
+                            builder.append_uint64(
+                                col, extract_uint64(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_DOUBLE:
+                        case NANOARROW_TYPE_FLOAT:
+                            builder.append_double(
+                                col, extract_double(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_STRING:
+                        case NANOARROW_TYPE_LARGE_STRING:
+                            builder.append_string(
+                                col, extract_string(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_BOOL:
+                            builder.append_bool(
+                                col, extract_int64(col_view, row) != 0);
+                            break;
+                        default:  // unsupported storage type: written as null
+                            builder.append_null(col);
+                            break;
+                    }
+                }
+            }
+            builder.end_row();
+        }
+
+        auto sub_batch = builder.finish();
+        PartitionWriter* writer = co_await get_or_create_writer(partition_key);
+        if (!writer) {
+            ArrowArrayViewReset(&view);
+            co_return -1;
+        }
+        rc = co_await writer->write_batch(sub_batch);
+        if (rc != 0) {  // stop at the first failed write
+            ArrowArrayViewReset(&view);
+            co_return rc;
+        }
+    }
+
+    ArrowArrayViewReset(&view);
+    co_return 0;
+}
+
+coro::CoroTask<int> PartitionRouter::route_bucketed(ArrowExportResult& batch) {
+    ArrowSchema* schema = batch.get_schema();
+    ArrowArray* array = batch.get_array();
+    int64_t num_rows = batch.num_rows();
+    // BUCKETED mode: each row goes to the bucket chosen by compute_bucket().
+    if (num_rows == 0) co_return 0;
+
+    ArrowArrayView view;
+    ArrowError error;
+    int rc = ArrowArrayViewInitFromSchema(&view, schema, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowArrayViewReset(&view);
+        co_return rc;
+    }
+    rc = ArrowArrayViewSetArray(&view, array, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowArrayViewReset(&view);
+        co_return rc;
+    }
+    // Resolve every configured partition column to its child index.
+    std::vector<int64_t> partition_col_indices;
+    for (const auto& col_name : config_.partition_columns) {
+        for (int64_t i = 0; i < schema->n_children; i++) {
+            if (schema->children[i]->name == col_name) {
+                partition_col_indices.push_back(i);
+                break;
+            }
+        }
+    }
+    if (config_.partition_columns.empty() ||  // bucket_key() reads [0]
+        partition_col_indices.size() != config_.partition_columns.size()) {
+        ArrowArrayViewReset(&view);
+        co_return -1;
+    }
+
+    std::unordered_map<int, std::vector<int64_t>> bucket_rows;
+    // Hash each row's partition-column values (as text) into a bucket.
+    for (int64_t row = 0; row < num_rows; row++) {
+        std::vector<std::string> values;
+        values.reserve(partition_col_indices.size());
+        for (int64_t col_idx : partition_col_indices) {
+            values.push_back(value_to_string(view.children[col_idx], row));
+        }
+        int bucket = compute_bucket(values);
+        bucket_rows[bucket].push_back(row);
+    }
+    // Output columns keep their names; types follow nanoarrow_to_column_type.
+    std::vector<ColumnSpec> col_specs;
+    col_specs.reserve(schema->n_children);
+    for (int64_t i = 0; i < schema->n_children; i++) {
+        col_specs.push_back(
+            {schema->children[i]->name,
+             nanoarrow_to_column_type(view.children[i]->storage_type)});
+    }
+    // Directory name: "<first partition column>_bucket=NN" (zero-padded to 2).
+    auto bucket_key = [&](int bucket) {
+        std::ostringstream ss;
+        ss << config_.partition_columns[0] << "_bucket=" << std::setw(2)
+           << std::setfill('0') << bucket;
+        return ss.str();
+    };
+    // Rebuild one sub-batch per bucket and hand it to that bucket's writer.
+    for (auto& [bucket, rows] : bucket_rows) {
+        RecordBatchBuilder builder;
+        builder.declare_schema(col_specs);
+        builder.reserve(rows.size());
+
+        for (int64_t row : rows) {
+            for (int64_t col = 0; col < schema->n_children; col++) {
+                const ArrowArrayView* col_view = view.children[col];
+
+                if (is_null(col_view, row)) {
+                    builder.append_null(col);
+                } else {
+                    switch (col_view->storage_type) {
+                        case NANOARROW_TYPE_INT64:
+                        case NANOARROW_TYPE_INT32:
+                        case NANOARROW_TYPE_INT16:
+                        case NANOARROW_TYPE_INT8:
+                            builder.append_int64(col,
+                                                 extract_int64(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_UINT64:
+                        case NANOARROW_TYPE_UINT32:
+                        case NANOARROW_TYPE_UINT16:
+                        case NANOARROW_TYPE_UINT8:
+                            builder.append_uint64(
+                                col, extract_uint64(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_DOUBLE:
+                        case NANOARROW_TYPE_FLOAT:
+                            builder.append_double(
+                                col, extract_double(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_STRING:
+                        case NANOARROW_TYPE_LARGE_STRING:
+                            builder.append_string(
+                                col, extract_string(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_BOOL:
+                            builder.append_bool(
+                                col, extract_int64(col_view, row) != 0);
+                            break;
+                        default:  // unsupported storage type: written as null
+                            builder.append_null(col);
+                            break;
+                    }
+                }
+            }
+            builder.end_row();
+        }
+
+        auto sub_batch = builder.finish();
+        PartitionWriter* writer =
+            co_await get_or_create_writer(bucket_key(bucket));
+        if (!writer) {
+            ArrowArrayViewReset(&view);
+            co_return -1;
+        }
+        rc = co_await writer->write_batch(sub_batch);
+        if (rc != 0) {  // stop at the first failed write
+            ArrowArrayViewReset(&view);
+            co_return rc;
+        }
+    }
+
+    ArrowArrayViewReset(&view);
+    co_return 0;
+}
+
+coro::CoroTask<int> PartitionRouter::route_view(ArrowExportResult& batch) {
+    ArrowSchema* schema = batch.get_schema();
+    ArrowArray* array = batch.get_array();
+    int64_t num_rows = batch.num_rows();
+    // VIEW mode: each row is written to at most one configured view.
+    if (num_rows == 0) co_return 0;
+
+    ArrowArrayView view;
+    ArrowError error;
+    int rc = ArrowArrayViewInitFromSchema(&view, schema, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowArrayViewReset(&view);
+        co_return rc;
+    }
+    rc = ArrowArrayViewSetArray(&view, array, &error);
+    if (rc != NANOARROW_OK) {
+        ArrowArrayViewReset(&view);
+        co_return rc;
+    }
+
+    // Every cell of a row is rendered as text and keyed by column name,
+    // because that map is what the registered predicates are called with.
+    // A view without a predicate is a fallback: the first such view (in
+    // config_.views iteration order) is used only if no predicate matches.
+    // The first matching predicate wins; rows matching no view are dropped.
+    std::unordered_map<std::string, std::vector<int64_t>> view_rows;
+
+    for (int64_t row = 0; row < num_rows; row++) {
+        std::unordered_map<std::string, std::string> row_values;
+        for (int64_t col = 0; col < schema->n_children; col++) {
+            row_values[schema->children[col]->name] =
+                value_to_string(view.children[col], row);
+        }
+
+        std::string matched_view;
+        for (const auto& [view_name, predicate] : config_.views) {
+            if (!predicate.has_value()) {
+                if (matched_view.empty()) {
+                    matched_view = view_name;
+                }
+                continue;
+            }
+
+            auto it = predicates_.find(view_name);
+            if (it != predicates_.end() && it->second(row_values)) {
+                matched_view = view_name;
+                break;
+            }
+        }
+
+        if (!matched_view.empty()) {
+            view_rows[matched_view].push_back(row);
+        }
+    }
+    // Output columns keep their names; types follow nanoarrow_to_column_type.
+    std::vector<ColumnSpec> col_specs;
+    col_specs.reserve(schema->n_children);
+    for (int64_t i = 0; i < schema->n_children; i++) {
+        col_specs.push_back(
+            {schema->children[i]->name,
+             nanoarrow_to_column_type(view.children[i]->storage_type)});
+    }
+    // Rebuild one sub-batch per view and hand it to that view's writer.
+    for (auto& [view_name, rows] : view_rows) {
+        RecordBatchBuilder builder;
+        builder.declare_schema(col_specs);
+        builder.reserve(rows.size());
+
+        for (int64_t row : rows) {
+            for (int64_t col = 0; col < schema->n_children; col++) {
+                const ArrowArrayView* col_view = view.children[col];
+
+                if (is_null(col_view, row)) {
+                    builder.append_null(col);
+                } else {
+                    switch (col_view->storage_type) {
+                        case NANOARROW_TYPE_INT64:
+                        case NANOARROW_TYPE_INT32:
+                        case NANOARROW_TYPE_INT16:
+                        case NANOARROW_TYPE_INT8:
+                            builder.append_int64(col,
+                                                 extract_int64(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_UINT64:
+                        case NANOARROW_TYPE_UINT32:
+                        case NANOARROW_TYPE_UINT16:
+                        case NANOARROW_TYPE_UINT8:
+                            builder.append_uint64(
+                                col, extract_uint64(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_DOUBLE:
+                        case NANOARROW_TYPE_FLOAT:
+                            builder.append_double(
+                                col, extract_double(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_STRING:
+                        case NANOARROW_TYPE_LARGE_STRING:
+                            builder.append_string(
+                                col, extract_string(col_view, row));
+                            break;
+                        case NANOARROW_TYPE_BOOL:
+                            builder.append_bool(
+                                col, extract_int64(col_view, row) != 0);
+                            break;
+                        default:  // unsupported storage type: written as null
+                            builder.append_null(col);
+                            break;
+                    }
+                }
+            }
+            builder.end_row();
+        }
+
+        auto sub_batch = builder.finish();
+        PartitionWriter* writer = co_await get_or_create_writer(view_name);
+        if (!writer) {
+            ArrowArrayViewReset(&view);
+            co_return -1;
+        }
+        rc = co_await writer->write_batch(sub_batch);
+        if (rc != 0) {  // stop at the first failed write
+            ArrowArrayViewReset(&view);
+            co_return rc;
+        }
+    }
+
+    ArrowArrayViewReset(&view);
+    co_return 0;
+}
+
+coro::CoroTask<int> PartitionRouter::write_batch(ArrowExportResult& batch) {
+    // Closed router or invalid batch: fail before any routing happens.
+    if (!(is_open_ && batch.valid())) co_return -1;
+
+    using Mode = PartitionConfig::Mode;
+    const Mode mode = config_.mode;
+
+    if (mode == Mode::NONE) co_return co_await route_none(batch);
+    if (mode == Mode::COLUMN) co_return co_await route_column(batch);
+    if (mode == Mode::BUCKETED) co_return co_await route_bucketed(batch);
+    if (mode == Mode::VIEW) co_return co_await route_view(batch);
+
+    // A mode value outside the four enumerators is reported as an error.
+    co_return -1;
+}
+
+coro::CoroTask<RouterWriteStats> PartitionRouter::close() {
+    RouterWriteStats totals;
+    if (!is_open_) co_return totals;
+
+    // Close every partition writer, recording its stats and summing totals.
+    for (auto& [key, sink] : writers_) {
+        auto closed = co_await sink->close();
+        auto& slot = totals.partitions[key];
+        slot = std::move(closed);
+        totals.total_rows += slot.total_rows;
+        totals.total_uncompressed_bytes += slot.total_uncompressed_bytes;
+    }
+
+    writers_.clear();
+    predicates_.clear();
+    is_open_ = false;
+
+    co_return totals;
+}
+
+}  // namespace dftracer::utils::utilities::common::arrow
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
diff --git a/src/dftracer/utils/utilities/common/arrow/partition_writer.cpp b/src/dftracer/utils/utilities/common/arrow/partition_writer.cpp
new file mode 100644
index 00000000..d642925a
--- /dev/null
+++ b/src/dftracer/utils/utilities/common/arrow/partition_writer.cpp
@@ -0,0 +1,207 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/utilities/common/arrow/partition_writer.h>
+#include <nanoarrow/nanoarrow.h>
+
+#include <cstdio>
+#include <iomanip>
+#include <sstream>
+
+namespace dftracer::utils::utilities::common::arrow {
+
+PartitionWriter::~PartitionWriter() = default;  // members clean up themselves
+
+PartitionWriter::PartitionWriter(PartitionWriter&& other) noexcept
+    : output_dir_(std::move(other.output_dir_)),
+      chunk_size_bytes_(other.chunk_size_bytes_),
+      compression_(other.compression_),
+      writer_(std::move(other.writer_)),  // takes over the underlying writer
+      is_open_(other.is_open_),
+      file_index_(other.file_index_),
+      current_file_bytes_(other.current_file_bytes_),
+      current_file_rows_(other.current_file_rows_),
+      total_bytes_(other.total_bytes_),
+      total_rows_(other.total_rows_),
+      files_(std::move(other.files_)),
+      row_counts_(std::move(other.row_counts_)) {
+    other.is_open_ = false;  // source is left closed with zeroed counters
+    other.file_index_ = 0;
+    other.current_file_bytes_ = 0;
+    other.current_file_rows_ = 0;
+    other.total_bytes_ = 0;
+    other.total_rows_ = 0;
+}
+
+PartitionWriter& PartitionWriter::operator=(PartitionWriter&& other) noexcept {
+    // Self-move leaves the object untouched.
+    if (this == &other) return *this;
+    output_dir_ = std::move(other.output_dir_);
+    chunk_size_bytes_ = other.chunk_size_bytes_;
+    compression_ = other.compression_;
+    writer_ = std::move(other.writer_);
+    is_open_ = other.is_open_;
+    file_index_ = other.file_index_;
+    current_file_bytes_ = other.current_file_bytes_;
+    current_file_rows_ = other.current_file_rows_;
+    total_bytes_ = other.total_bytes_;
+    total_rows_ = other.total_rows_;
+    files_ = std::move(other.files_);
+    row_counts_ = std::move(other.row_counts_);
+    // The source ends up closed with all of its counters zeroed.
+    other.is_open_ = false;
+    other.file_index_ = 0;
+    other.current_file_bytes_ = 0;
+    other.current_file_rows_ = 0;
+    other.total_bytes_ = 0;
+    other.total_rows_ = 0;
+    return *this;
+}
+
+std::string PartitionWriter::generate_filename() const {
+    std::ostringstream name;  // "part-" + index padded to 5 digits + ".arrow"
+    name << "part-" << std::setfill('0') << std::setw(5) << file_index_;
+    name << ".arrow";
+    return (fs::path(output_dir_) / name.str()).string();
+}
+
+coro::CoroTask<int> PartitionWriter::open(const std::string& output_dir,
+                                          int64_t chunk_size_bytes,
+                                          IpcCompression compression) {
+    // Opening an already open writer is rejected with -1.
+    if (is_open_) co_return -1;
+    std::error_code mkdir_err;
+    fs::create_directories(output_dir, mkdir_err);
+    if (mkdir_err) co_return -1;
+    // Adopt the settings and reset all counters and file lists.
+    output_dir_ = output_dir;
+    chunk_size_bytes_ = chunk_size_bytes;
+    compression_ = compression;
+    file_index_ = 0;
+    current_file_bytes_ = 0;
+    current_file_rows_ = 0;
+    total_bytes_ = 0;
+    total_rows_ = 0;
+    files_.clear();
+    row_counts_.clear();
+
+    // Open the first output file; is_open_ is set only if that succeeds.
+    std::string first_file = generate_filename();
+    const int status = co_await writer_.open(first_file, compression_);
+    if (status != 0) co_return status;
+    is_open_ = true;
+    co_return 0;
+}
+
+int64_t PartitionWriter::calculate_uncompressed_size(ArrowExportResult& batch) {
+    ArrowSchema* layout = batch.get_schema();
+    ArrowArray* data = batch.get_array();
+
+    // Any nanoarrow failure while building the view yields a size of 0.
+    ArrowArrayView root;
+    ArrowError err;
+    if (ArrowArrayViewInitFromSchema(&root, layout, &err) != NANOARROW_OK) {
+        ArrowArrayViewReset(&root);
+        return 0;
+    }
+
+    if (ArrowArrayViewSetArray(&root, data, &err) != NANOARROW_OK) {
+        ArrowArrayViewReset(&root);
+        return 0;
+    }
+
+    // Sum the buffer bytes of every descendant of the root view; the root's
+    // own buffers are skipped.
+    const auto sum_buffers = [](const auto& self, ArrowArrayView* node,
+                                bool skip_own) -> int64_t {
+        int64_t bytes = 0;
+        if (!skip_own) {
+            const int64_t buffer_count = ArrowArrayViewGetNumBuffers(node);
+            for (int64_t b = 0; b < buffer_count; ++b) {
+                bytes += ArrowArrayViewGetBufferView(node, b).size_bytes;
+            }
+        }
+        for (int64_t c = 0; c < node->n_children; ++c) {
+            bytes += self(self, node->children[c], false);
+        }
+        return bytes;
+    };
+
+    const int64_t total = sum_buffers(sum_buffers, &root, true);
+
+    ArrowArrayViewReset(&root);
+    return total;
+}
+
+coro::CoroTask<int> PartitionWriter::rotate_file() {
+    // Finish the current file and start the next one. A failed close is
+    // reported to the caller instead of being dropped; the next file is
+    // still opened so the writer stays in a consistent state.
+    const int close_rc = co_await writer_.close();
+    files_.push_back(generate_filename());
+    row_counts_.push_back(current_file_rows_);
+    file_index_++;
+    current_file_bytes_ = 0;
+    current_file_rows_ = 0;
+    std::string path = generate_filename();
+    const int open_rc = co_await writer_.open(path, compression_);
+    co_return close_rc != 0 ? close_rc : open_rc;
+}
+
+coro::CoroTask<int> PartitionWriter::write_batch(ArrowExportResult& batch) {
+    if (!(is_open_ && batch.valid())) co_return -1;
+
+    const int64_t incoming_bytes = calculate_uncompressed_size(batch);
+    const int64_t incoming_rows = batch.num_rows();
+
+    // Roll over to a new file first when chunking is enabled, the current
+    // file is non-empty, and this batch would push it past the chunk limit.
+    const bool chunking = chunk_size_bytes_ > 0 && current_file_bytes_ > 0;
+    if (chunking && current_file_bytes_ + incoming_bytes > chunk_size_bytes_) {
+        if (const int rotated = co_await rotate_file(); rotated != 0) {
+            co_return rotated;
+        }
+    }
+
+    if (const int written = co_await writer_.write_batch(batch); written != 0) {
+        co_return written;
+    }
+
+    current_file_bytes_ += incoming_bytes;
+    current_file_rows_ += incoming_rows;
+    total_bytes_ += incoming_bytes;
+    total_rows_ += incoming_rows;
+    co_return 0;
+}
+
+coro::CoroTask<PartitionWriteStats> PartitionWriter::close() {
+    PartitionWriteStats summary;
+    if (!is_open_) co_return summary;  // empty stats when nothing is open
+
+    // Finish the file in progress; the status of that close is not inspected.
+    co_await writer_.close();
+
+    // The last file is listed only when it actually received rows.
+    if (current_file_rows_ > 0) {
+        files_.push_back(generate_filename());
+        row_counts_.push_back(current_file_rows_);
+    }
+
+    summary.files = std::move(files_);
+    summary.row_counts = std::move(row_counts_);
+    summary.total_rows = total_rows_;
+    summary.total_uncompressed_bytes = total_bytes_;
+
+    // Mark the writer closed and zero its counters.
+    is_open_ = false;
+    file_index_ = 0;
+    current_file_bytes_ = 0;
+    current_file_rows_ = 0;
+    total_bytes_ = 0;
+    total_rows_ = 0;
+    co_return summary;
+}
+
+}  // namespace dftracer::utils::utilities::common::arrow
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
diff --git a/src/dftracer/utils/utilities/common/json/json_value.cpp b/src/dftracer/utils/utilities/common/json/json_value.cpp
index 989884ea..acfccaf8 100644
--- a/src/dftracer/utils/utilities/common/json/json_value.cpp
+++ b/src/dftracer/utils/utilities/common/json/json_value.cpp
@@ -7,9 +7,9 @@
 namespace dftracer::utils::utilities::common::json {
 
 JsonValue JsonValue::at(const char* path) const {
-    if (!val_ || !path) return JsonValue(nullptr);
+    if (!valid_ || !path) return JsonValue();
 
-    JsonValue current(val_);
+    JsonValue current = *this;
     const char* start = path;
 
     while (*start) {
@@ -22,18 +22,11 @@ JsonValue JsonValue::at(const char* path) const {
             continue;
         }
 
-        char key_buf[256];
-        if (key_len >= sizeof(key_buf)) {
-            std::string key_str(start, key_len);
-            current = current[key_str.c_str()];
-        } else {
-            std::memcpy(key_buf, start, key_len);
-            key_buf[key_len] = '\0';
-            current = current[key_buf];
-        }
+        std::string_view key_sv(start, key_len);
+        current = current[key_sv];
 
         if (!current.exists()) {
-            return JsonValue(nullptr);
+            return JsonValue();
         }
 
         start = (*end == '.') ? end + 1 : end;
@@ -76,23 +69,14 @@ coro::CoroTask<JsonParserOutput> StringJsonParserUtility::process(
     const StringJsonParserInput& input) {
     content_ = input.content;
 
-    yyjson_doc* doc =
-        yyjson_read(content_.content.data(), content_.content.size(), 0);
-
-    yyjson_val* json_object = nullptr;
-    if (doc) {
-        json_object = yyjson_doc_get_root(doc);
-        owned_doc_ = std::shared_ptr<yyjson_doc>(doc, [](yyjson_doc* d) {
-            if (d) yyjson_doc_free(d);
-        });
+    auto result =
+        parser_.parse(content_.content.data(), content_.content.size());
+    if (result.error()) {
+        co_return JsonValue();
     }
-
-    co_return JsonValue(json_object);
+    co_return JsonValue(result.value_unsafe());
 }
 
-void StringJsonParserUtility::reset() {
-    owned_doc_.reset();
-    content_ = utilities::text::Text{};
-}
+void StringJsonParserUtility::reset() { content_ = utilities::text::Text{}; }
 
 }  // namespace dftracer::utils::utilities::common::json
diff --git a/src/dftracer/utils/utilities/common/json/parser.cpp b/src/dftracer/utils/utilities/common/json/parser.cpp
new file mode 100644
index 00000000..1f583f72
--- /dev/null
+++ b/src/dftracer/utils/utilities/common/json/parser.cpp
@@ -0,0 +1,73 @@
+#include <dftracer/utils/utilities/common/json/parser.h>
+
+namespace dftracer::utils::utilities::common::json {
+
+JsonParser::JsonParser(std::size_t capacity) : parser_(capacity) {}
+
+bool JsonParser::parse(std::string_view json_line) {
+    // Keep an owned, padded copy of the line for the parser to iterate.
+    padded_json_ = simdjson::padded_string(json_line);
+    // get() fills doc_ only on success and returns the error code.
+    if (parser_.iterate(padded_json_).get(doc_)) {
+        valid_ = false;
+    } else {
+        active_ = simdjson::ondemand::document_reference(doc_);
+        valid_ = true;
+    }
+    return valid_;
+}
+
+bool JsonParser::parse_padded(simdjson::padded_string_view json) {
+    // The input is already a padded view, so it is iterated directly.
+    if (parser_.iterate(json).get(doc_)) {
+        valid_ = false;
+        return false;
+    }
+    // Rebind the active reference to the freshly parsed document.
+    active_ = simdjson::ondemand::document_reference(doc_);
+    valid_ = true;
+    return true;
+}
+
+void JsonParser::rewind() {
+    // Nothing to rewind unless the last parse succeeded.
+    if (!valid_) return;
+    active_.rewind();
+}
+
+std::optional<std::int64_t> JsonParser::get_int64(std::string_view key) {
+    // No parsed document, or any lookup/conversion error, yields nullopt.
+    std::int64_t raw{};
+    if (!valid_ || active_[key].get_int64().get(raw)) return std::nullopt;
+    return raw;
+}
+
+std::optional<std::uint64_t> JsonParser::get_uint64(std::string_view key) {
+    // No parsed document, or any lookup/conversion error, yields nullopt.
+    std::uint64_t raw{};
+    if (!valid_ || active_[key].get_uint64().get(raw)) return std::nullopt;
+    return raw;
+}
+
+std::optional<double> JsonParser::get_double(std::string_view key) {
+    // No parsed document, or any lookup/conversion error, yields nullopt.
+    double raw{};
+    if (!valid_ || active_[key].get_double().get(raw)) return std::nullopt;
+    return raw;
+}
+
+std::optional<bool> JsonParser::get_bool(std::string_view key) {
+    // No parsed document, or any lookup/conversion error, yields nullopt.
+    bool raw{};
+    if (!valid_ || active_[key].get_bool().get(raw)) return std::nullopt;
+    return raw;
+}
+
+std::optional<std::string_view> JsonParser::get_string(std::string_view key) {
+    // No parsed document, or any lookup/conversion error, yields nullopt.
+    std::string_view raw{};
+    if (!valid_ || active_[key].get_string().get(raw)) return std::nullopt;
+    return raw;
+}
+
+}  // namespace dftracer::utils::utilities::common::json
diff --git a/src/dftracer/utils/utilities/common/query/ast.cpp b/src/dftracer/utils/utilities/common/query/ast.cpp
index 3ee47932..7c5914f1 100644
--- a/src/dftracer/utils/utilities/common/query/ast.cpp
+++ b/src/dftracer/utils/utilities/common/query/ast.cpp
@@ -87,6 +87,30 @@ void node_to_string(std::ostringstream& os, const QueryNode& node) {
         node.data);
 }
 
+void collect_fields_impl(const QueryNode& node,
+                         dftracer::utils::StringViewSet& out) {
+    // Leaf predicates contribute their field path; boolean combinators
+    // recurse into their children. Any other node kind adds nothing.
+    std::visit(
+        [&out](const auto& alt) {
+            using Alt = std::decay_t<decltype(alt)>;
+            constexpr bool is_leaf = std::is_same_v<Alt, CompareNode> ||
+                                     std::is_same_v<Alt, InNode> ||
+                                     std::is_same_v<Alt, NotInNode>;
+            constexpr bool is_binary = std::is_same_v<Alt, AndNode> ||
+                                       std::is_same_v<Alt, OrNode>;
+            if constexpr (is_leaf) {
+                out.insert(alt.field.path);
+            } else if constexpr (is_binary) {
+                collect_fields_impl(*alt.left, out);
+                collect_fields_impl(*alt.right, out);
+            } else if constexpr (std::is_same_v<Alt, NotNode>) {
+                collect_fields_impl(*alt.operand, out);
+            }
+        },
+        node.data);
+}
+
 }  // namespace
 
 std::string to_string(const QueryNode& node) {
@@ -95,4 +119,10 @@ std::string to_string(const QueryNode& node) {
     return os.str();
 }
 
+dftracer::utils::StringViewSet collect_fields(const QueryNode& node) {
+    dftracer::utils::StringViewSet fields;  // filled by the recursive walk
+    collect_fields_impl(node, fields);  // inserts each field.path it visits
+    return fields;  // NOTE(review): presumably views into `node` — confirm
+}
+
 }  // namespace dftracer::utils::utilities::common::query
diff --git a/src/dftracer/utils/utilities/common/query/query.cpp b/src/dftracer/utils/utilities/common/query/query.cpp
index b3fac100..3db5b005 100644
--- a/src/dftracer/utils/utilities/common/query/query.cpp
+++ b/src/dftracer/utils/utilities/common/query/query.cpp
@@ -2,7 +2,8 @@
 
 namespace dftracer::utils::utilities::common::query {
 
-Query::Query(const Query& other) : source_(other.source_) {
+Query::Query(const Query& other)
+    : source_(other.source_), fields_(other.fields_) {
     auto result = parse(source_);
     if (!result) throw QueryParseError(result.error());
     root_ = std::move(*result);
@@ -11,6 +12,7 @@ Query::Query(const Query& other) : source_(other.source_) {
 Query& Query::operator=(const Query& other) {
     if (this != &other) {
         source_ = other.source_;
+        fields_ = other.fields_;
         auto result = parse(source_);
         if (!result) throw QueryParseError(result.error());
         root_ = std::move(*result);
diff --git a/src/dftracer/utils/utilities/common/statistics/log2_histogram.cpp b/src/dftracer/utils/utilities/common/statistics/log2_histogram.cpp
index d1ae12f4..1a731e1e 100644
--- a/src/dftracer/utils/utilities/common/statistics/log2_histogram.cpp
+++ b/src/dftracer/utils/utilities/common/statistics/log2_histogram.cpp
@@ -1,5 +1,5 @@
 #include <dftracer/utils/utilities/common/statistics/log2_histogram.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <algorithm>
 #include <cstdio>
@@ -197,62 +197,51 @@ std::string Log2Histogram::render_blocks(std::size_t max_width,
     return out.str();
 }
 
-yyjson_mut_val* Log2Histogram::to_yyjson(yyjson_mut_doc* doc) const {
-    yyjson_mut_val* arr = yyjson_mut_arr(doc);
+std::string Log2Histogram::to_json() const {
+    std::ostringstream ss;
+    ss << '[';
+    bool first = true;
     for (std::size_t i = 0; i < NUM_BINS; ++i) {
         if (bins_[i] == 0) continue;
-        yyjson_mut_val* pair = yyjson_mut_arr(doc);
-        yyjson_mut_arr_add_uint(doc, pair, static_cast<std::uint64_t>(i));
-        yyjson_mut_arr_add_uint(doc, pair, bins_[i]);
-        yyjson_mut_arr_append(arr, pair);
+        if (!first) ss << ',';
+        first = false;
+        ss << '[' << i << ',' << bins_[i] << ']';
     }
-    return arr;
-}
-
-std::string Log2Histogram::to_json() const {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* arr = to_yyjson(doc);
-    yyjson_mut_doc_set_root(doc, arr);
-
-    char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_NOFLAG, nullptr);
-    std::string result(json_str ? json_str : "[]");
-    if (json_str) free(json_str);
-    yyjson_mut_doc_free(doc);
-    return result;
+    ss << ']';
+    return ss.str();
 }
 
 Log2Histogram Log2Histogram::from_json(const std::string& json) {
     Log2Histogram hist;
 
-    yyjson_doc* doc =
-        yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-    if (!doc) return hist;
+    simdjson::dom::parser parser;
+    auto result = parser.parse(json.data(), json.size());
+    if (result.error()) return hist;
 
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root || !yyjson_is_arr(root)) {
-        yyjson_doc_free(doc);
-        return hist;
-    }
+    auto root = result.value_unsafe();
+    if (!root.is_array()) return hist;
+
+    simdjson::dom::array root_arr;
+    if (root.get(root_arr)) return hist;
 
-    std::size_t idx, max;
-    yyjson_val* pair;
-    yyjson_arr_foreach(root, idx, max, pair) {
-        if (!yyjson_is_arr(pair) || yyjson_arr_size(pair) != 2) continue;
-        yyjson_val* bin_idx_val = yyjson_arr_get(pair, 0);
-        yyjson_val* count_val = yyjson_arr_get(pair, 1);
-        if (!yyjson_is_uint(bin_idx_val) || !yyjson_is_uint(count_val))
-            continue;
+    for (auto pair : root_arr) {
+        if (!pair.is_array()) continue;
+        simdjson::dom::array arr;
+        if (pair.get(arr)) continue;
+        if (arr.size() != 2) continue;
+
+        auto bin_idx_result = arr.at(0).get_uint64();
+        auto count_result = arr.at(1).get_uint64();
+        if (bin_idx_result.error() || count_result.error()) continue;
 
         std::size_t bin_idx =
-            static_cast<std::size_t>(yyjson_get_uint(bin_idx_val));
-        std::uint64_t count = yyjson_get_uint(count_val);
+            static_cast<std::size_t>(bin_idx_result.value_unsafe());
+        std::uint64_t count = count_result.value_unsafe();
         if (bin_idx < NUM_BINS) {
             hist.bins_[bin_idx] += count;
             hist.total_count_ += count;
         }
     }
-
-    yyjson_doc_free(doc);
     return hist;
 }
 
diff --git a/src/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp b/src/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp
new file mode 100644
index 00000000..487af91f
--- /dev/null
+++ b/src/dftracer/utils/utilities/common/statistics/timestamp_histogram.cpp
@@ -0,0 +1,173 @@
+#include <dftracer/utils/utilities/common/statistics/timestamp_histogram.h>
+
+#include <algorithm>
+#include <cstring>
+
+namespace dftracer::utils::utilities::common::statistics {
+
+void TimestampHistogram::add(std::uint64_t timestamp_us) {
+    const std::uint64_t bin = bin_index(timestamp_us);
+    ++total_count_;
+
+    // Locate the first stored bin whose index is >= bin.
+    auto pos = std::lower_bound(
+        bins_.begin(), bins_.end(), bin,
+        [](const auto& e, std::uint64_t k) { return e.first < k; });
+    if (pos == bins_.end() || pos->first != bin) {
+        bins_.insert(pos, {bin, 1});  // new bin at its sorted position
+        return;
+    }
+    ++pos->second;
+}
+
+void TimestampHistogram::merge(const TimestampHistogram& other) {
+    // An empty right-hand side leaves this histogram untouched.
+    if (other.bins_.empty()) return;
+
+    // Two-way merge of the sorted bin lists, summing counts of equal bins.
+    std::vector<std::pair<std::uint64_t, std::uint64_t>> combined;
+    combined.reserve(bins_.size() + other.bins_.size());
+    auto lhs = bins_.cbegin();
+    auto rhs = other.bins_.cbegin();
+    while (lhs != bins_.cend() && rhs != other.bins_.cend()) {
+        if (lhs->first == rhs->first) {
+            combined.emplace_back(lhs->first, lhs->second + rhs->second);
+            ++lhs;
+            ++rhs;
+        } else if (lhs->first < rhs->first) {
+            combined.push_back(*lhs++);
+        } else {
+            combined.push_back(*rhs++);
+        }
+    }
+    combined.insert(combined.end(), lhs, bins_.cend());
+    combined.insert(combined.end(), rhs, other.bins_.cend());
+
+    bins_ = std::move(combined);
+    total_count_ += other.total_count_;
+}
+
+std::uint64_t TimestampHistogram::count_in_range(
+    std::uint64_t ts_start_us, std::uint64_t ts_end_us) const {
+    // Empty histogram, or an empty/inverted half-open range [start, end).
+    if (ts_start_us >= ts_end_us || bins_.empty()) return 0;
+
+    const std::uint64_t first_bin = bin_index(ts_start_us);
+    const std::uint64_t last_bin = bin_index(ts_end_us - 1);  // end exclusive
+
+    auto pos = std::lower_bound(
+        bins_.begin(), bins_.end(), first_bin,
+        [](const auto& e, std::uint64_t k) { return e.first < k; });
+    std::uint64_t total = 0;
+    while (pos != bins_.end() && pos->first <= last_bin) {
+        total += (pos++)->second;
+    }
+    return total;
+}
+
+double TimestampHistogram::selectivity(std::uint64_t ts_start_us,
+                                       std::uint64_t ts_end_us) const {
+    if (total_count_ == 0) return 0.0;  // avoid 0/0 on an empty histogram
+    const std::uint64_t hits = count_in_range(ts_start_us, ts_end_us);
+    return static_cast<double>(hits) / static_cast<double>(total_count_);
+}
+
+std::vector<double> TimestampHistogram::expansion_weights(
+    std::uint64_t bucket_start_us, std::uint64_t bucket_end_us,
+    std::size_t num_sub_buckets) const {
+    std::vector<double> shares(num_sub_buckets, 0.0);
+    if (num_sub_buckets == 0 || bucket_end_us <= bucket_start_us) return shares;
+
+    // Integer sub-bucket width, at least 1. The last sub-bucket always ends
+    // at bucket_end_us, so it absorbs any division remainder.
+    const std::uint64_t step = std::max<std::uint64_t>(
+        (bucket_end_us - bucket_start_us) / num_sub_buckets, 1);
+
+    std::uint64_t observed = 0;
+    std::uint64_t lo = bucket_start_us;
+    for (std::size_t k = 0; k < num_sub_buckets; ++k, lo += step) {
+        const bool last = (k + 1 == num_sub_buckets);
+        const std::uint64_t hi = last ? bucket_end_us : lo + step;
+        const std::uint64_t hits = count_in_range(lo, hi);
+        shares[k] = static_cast<double>(hits);
+        observed += hits;
+    }
+
+    // Normalize to fractions; with no observations fall back to uniform.
+    if (observed == 0) {
+        std::fill(shares.begin(), shares.end(),
+                  1.0 / static_cast<double>(num_sub_buckets));
+    } else {
+        const double inv = 1.0 / static_cast<double>(observed);
+        for (double& s : shares) s *= inv;
+    }
+
+    return shares;
+}
+
+// Varint encoding helpers
+namespace {
+
+void encode_varint(std::vector<std::uint8_t>& out, std::uint64_t value) {
+    // 7 payload bits per byte, low bits first; high bit set = more follows.
+    for (; value > 0x7F; value >>= 7) {
+        out.push_back(static_cast<std::uint8_t>((value & 0x7F) | 0x80));
+    }
+    out.push_back(static_cast<std::uint8_t>(value));
+}
+
+std::uint64_t decode_varint(const std::uint8_t*& ptr, const std::uint8_t* end) {
+    std::uint64_t result = 0;
+    // Cap at 64 bits: an over-long (corrupt) varint would otherwise shift a
+    // uint64_t by >= 64, which is undefined behaviour.
+    for (unsigned shift = 0; ptr < end && shift < 64; shift += 7) {
+        const std::uint8_t byte = *ptr++;
+        result |= static_cast<std::uint64_t>(byte & 0x7F) << shift;
+        if ((byte & 0x80) == 0) break;
+    }
+    return result;
+}
+
+}  // namespace
+
+std::vector<std::uint8_t> TimestampHistogram::serialize() const {
+    // Layout (all varints): total count, bin count, then per bin the index
+    // delta from the previous bin followed by that bin's count.
+    std::vector<std::uint8_t> bytes;
+    bytes.reserve(16 + 6 * bins_.size());  // size hint only
+    encode_varint(bytes, total_count_);
+    encode_varint(bytes, bins_.size());
+
+    std::uint64_t last_index = 0;
+    for (const auto& bin : bins_) {
+        encode_varint(bytes, bin.first - last_index);
+        encode_varint(bytes, bin.second);
+        last_index = bin.first;
+    }
+    return bytes;
+}
+
+TimestampHistogram TimestampHistogram::deserialize(const std::uint8_t* data,
+                                                   std::size_t len) {
+    TimestampHistogram hist;
+    if (!data || len == 0) return hist;
+
+    const auto* ptr = data;
+    const auto* end = data + len;
+    hist.total_count_ = decode_varint(ptr, end);
+    std::uint64_t num_bins = decode_varint(ptr, end);
+
+    // A bin takes >= 2 bytes (delta + count): cap the reservation by what
+    // the remaining input can hold so a corrupt header cannot over-allocate.
+    const auto max_bins = static_cast<std::uint64_t>(end - ptr) / 2;
+    hist.bins_.reserve(static_cast<std::size_t>(std::min(num_bins, max_bins)));
+    std::uint64_t prev_idx = 0;
+    for (std::uint64_t i = 0; i < num_bins && ptr < end; ++i) {
+        std::uint64_t delta = decode_varint(ptr, end);
+        std::uint64_t count = decode_varint(ptr, end);
+        prev_idx += delta;  // indices are stored as deltas from the previous
+        hist.bins_.push_back({prev_idx, count});
+    }
+    return hist;
+}
+
+}  // namespace dftracer::utils::utilities::common::statistics
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp
new file mode 100644
index 00000000..56325907
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.cpp
@@ -0,0 +1,281 @@
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h>
+#include <dftracer/utils/utilities/hash/fnv1a_hasher_utility.h>
+
+#include <algorithm>
+#include <cmath>
+#include <unordered_map>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+namespace {
+
+using hash::Fnv1aHashBuilder;
+
+// Grouping key used by shrink_batch: entry identity plus target bucket.
+struct MergeKey {
+    std::uint32_t cat_id;
+    std::uint32_t name_id;
+    std::uint64_t pid;
+    std::uint64_t tid;
+    std::uint32_t hhash_id;
+    std::uint32_t fhash_id;
+    std::uint64_t target_bucket;  // computed from source bucket
+
+    bool operator==(const MergeKey& other) const {  // field-wise equality
+        return cat_id == other.cat_id && name_id == other.name_id &&
+               pid == other.pid && tid == other.tid &&
+               hhash_id == other.hhash_id && fhash_id == other.fhash_id &&
+               target_bucket == other.target_bucket;
+    }
+};
+
+struct MergeKeyHash {
+    std::size_t operator()(const MergeKey& k) const {
+        Fnv1aHashBuilder h;  // fed every field that operator== compares
+        h.update_value(k.cat_id);
+        h.update_value(k.name_id);
+        h.update_value(k.pid);
+        h.update_value(k.tid);
+        h.update_value(k.hhash_id);
+        h.update_value(k.fhash_id);
+        h.update_value(k.target_bucket);
+        return static_cast<std::size_t>(h.finish());
+    }
+};
+
+// Shrink: fold source buckets into coarser target buckets, merging metrics.
+AggregationBatch shrink_batch(const AggregationBatch& input,
+                              std::uint64_t source_interval_us,
+                              std::uint64_t target_interval_us) {
+    // Batch-level counters and global tables are carried over as-is.
+    AggregationBatch out;
+    out.batch_type = input.batch_type;
+    out.total_events_processed = input.total_events_processed;
+    out.total_files_processed = input.total_files_processed;
+    out.total_bytes_processed = input.total_bytes_processed;
+    out.has_approximated_entries = false;
+    out.global_extra_key_ids = input.global_extra_key_ids;
+    out.global_custom_metric_names = input.global_custom_metric_names;
+
+    // One accumulated entry per distinct MergeKey.
+    std::unordered_map<MergeKey, AggregationEntry, MergeKeyHash> groups;
+
+    for (const auto& src : input.entries) {
+        const auto& id = src.key;
+
+        // Source bucket -> time -> enclosing target bucket.
+        // NOTE(review): treats key.time_bucket as a bucket index — confirm.
+        const std::uint64_t start_us = id.time_bucket * source_interval_us;
+        const std::uint64_t coarse = start_us / target_interval_us;
+
+        const MergeKey group{id.cat_id,   id.name_id,  id.pid, id.tid,
+                             id.hhash_id, id.fhash_id, coarse};
+
+        auto [slot, is_new] = groups.try_emplace(group);
+        AggregationEntry& acc = slot->second;
+        if (!is_new) {
+            acc.metrics.merge_from(src.metrics);
+            continue;
+        }
+        // First entry of its group: seed the accumulator from the source.
+        acc.key = id;
+        acc.key.time_bucket = coarse;
+        acc.metrics = src.metrics;
+        acc.is_approximated = false;
+    }
+
+    // Flatten the groups; unordered_map iteration order is unspecified.
+    out.entries.reserve(groups.size());
+    for (auto& kv : groups) {
+        out.entries.push_back(std::move(kv.second));
+    }
+    return out;
+}
+
+// Expand: split each source bucket across the finer target buckets it spans.
+AggregationBatch expand_batch(const AggregationBatch& input,
+                              std::uint64_t source_interval_us,
+                              std::uint64_t target_interval_us) {
+    AggregationBatch result;
+    result.batch_type = input.batch_type;
+    result.total_events_processed = input.total_events_processed;
+    result.total_files_processed = input.total_files_processed;
+    result.total_bytes_processed = input.total_bytes_processed;
+    result.has_approximated_entries = true;
+    result.global_extra_key_ids = input.global_extra_key_ids;
+    result.global_custom_metric_names = input.global_custom_metric_names;
+
+    for (const auto& entry : input.entries) {
+        const auto& key = entry.key;
+        const auto& metrics = entry.metrics;
+
+        // Source bucket boundaries
+        std::uint64_t bucket_start = key.time_bucket * source_interval_us;
+        std::uint64_t bucket_end = bucket_start + source_interval_us;
+
+        // Actual event span from ts/te
+        std::uint64_t ts = metrics.ts;
+        std::uint64_t te = metrics.te;
+
+        // Clamp ts/te to bucket boundaries
+        ts = std::max(ts, bucket_start);
+        te = std::min(te, bucket_end);
+
+        // Handle edge case: ts >= te (all events at same instant or invalid)
+        if (ts >= te) {
+            // Use original ts to determine target bucket
+            std::uint64_t original_ts = metrics.ts;
+            if (original_ts < bucket_start) original_ts = bucket_start;
+            if (original_ts >= bucket_end) original_ts = bucket_end - 1;
+
+            std::uint64_t target_bucket = original_ts / target_interval_us;
+
+            AggregationEntry new_entry;
+            new_entry.key = key;
+            new_entry.key.time_bucket = target_bucket;
+            new_entry.is_approximated = true;
+            new_entry.metrics = metrics;
+            new_entry.count_ci =
+                compute_poisson_ci(static_cast<double>(metrics.count));
+
+            result.entries.push_back(std::move(new_entry));
+            continue;
+        }
+
+        std::uint64_t span = te - ts;
+
+        // Compute first and last target buckets that overlap with [ts, te]
+        std::uint64_t first_target = ts / target_interval_us;
+        std::uint64_t last_target = (te - 1) / target_interval_us;
+
+        // Distribute across overlapping sub-buckets
+        double total_weight = 0.0;
+        std::vector<std::pair<std::uint64_t, double>> bucket_weights;
+
+        for (std::uint64_t tb = first_target; tb <= last_target; ++tb) {
+            std::uint64_t tb_start = tb * target_interval_us;
+            std::uint64_t tb_end = tb_start + target_interval_us;
+
+            // Overlap with [ts, te]
+            std::uint64_t overlap_start = std::max(ts, tb_start);
+            std::uint64_t overlap_end = std::min(te, tb_end);
+
+            if (overlap_start < overlap_end) {
+                double weight =
+                    static_cast<double>(overlap_end - overlap_start) / span;
+                bucket_weights.emplace_back(tb, weight);
+                total_weight += weight;
+            }
+        }
+
+        // Normalize weights (should sum to ~1.0)
+        if (total_weight > 0.0) {
+            for (auto& [_, w] : bucket_weights) {
+                w /= total_weight;
+            }
+        }
+
+        // Create sub-bucket entries
+        double count = static_cast<double>(metrics.count);
+        std::uint64_t count_sum = 0;
+
+        for (std::size_t i = 0; i < bucket_weights.size(); ++i) {
+            auto& [tb, weight] = bucket_weights[i];
+
+            AggregationEntry new_entry;
+            new_entry.key = key;
+            new_entry.key.time_bucket = tb;
+            new_entry.is_approximated = true;
+
+            // Distribute count by weight
+            double sub_count = count * weight;
+
+            // Round, but cap at the still-unassigned remainder so repeated
+            // round-ups cannot underflow it; the last bucket takes the rest.
+            const std::uint64_t remaining = metrics.count - count_sum;
+            std::uint64_t sub_count_int = remaining;
+            if (i + 1 < bucket_weights.size()) {
+                sub_count_int = std::min(remaining, static_cast<std::uint64_t>(
+                                                        std::round(sub_count)));
+            }
+            count_sum += sub_count_int;
+
+            // Create metrics for sub-bucket
+            new_entry.metrics = metrics;  // Copy all fields
+            new_entry.metrics.count = sub_count_int;
+
+            // Scale duration total by weight
+            new_entry.metrics.duration.total = static_cast<std::uint64_t>(
+                std::round(metrics.duration.total * weight));
+            new_entry.metrics.duration.count = sub_count_int;
+
+            // Scale size total by weight
+            new_entry.metrics.size.total = static_cast<std::uint64_t>(
+                std::round(metrics.size.total * weight));
+            new_entry.metrics.size.count = sub_count_int;
+
+            // min/max and the other copied statistics are left unchanged
+            // (conservative: can't know which sub-bucket had the extremes).
+
+            // Compute confidence interval
+            new_entry.count_ci = compute_poisson_ci(sub_count);
+            // Scale custom metrics by weight.
+            // NOTE(review): if custom_metrics is a shared pointer, this scales
+            // the map shared with `input` in place — confirm the copy is deep.
+            if (new_entry.metrics.custom_metrics) {
+                for (auto& [name, stat] : *new_entry.metrics.custom_metrics) {
+                    stat.total = static_cast<std::uint64_t>(
+                        std::round(stat.total * weight));
+                    stat.count = sub_count_int;
+                }
+            }
+
+            // Only add if count > 0
+            if (sub_count_int > 0) {
+                result.entries.push_back(std::move(new_entry));
+            }
+        }
+    }
+
+    return result;
+}
+
+}  // namespace
+
+AggregationBatch augment_batch(const AggregationBatch& input,
+                               const AugmentationConfig& config) {
+    const auto source_us = config.source_interval_us;
+    const auto target_us = config.target_interval_us;
+
+    // Pass through when the intervals match. A zero target interval is also
+    // passed through: expanding divides by it, so it cannot be rescaled to.
+    if (source_us == target_us || target_us == 0) {
+        AggregationBatch result;
+        result.batch_type = input.batch_type;
+        result.total_events_processed = input.total_events_processed;
+        result.total_files_processed = input.total_files_processed;
+        result.total_bytes_processed = input.total_bytes_processed;
+        result.has_approximated_entries = false;
+        result.global_extra_key_ids = input.global_extra_key_ids;
+        result.global_custom_metric_names = input.global_custom_metric_names;
+        result.entries.reserve(input.entries.size());
+        for (const auto& entry : input.entries) {
+            AggregationEntry new_entry;
+            new_entry.key = entry.key;
+            new_entry.metrics = entry.metrics;
+            new_entry.is_approximated = false;
+            result.entries.push_back(std::move(new_entry));
+        }
+        return result;
+    }
+
+    // Shrink: target > source (fewer, larger buckets)
+    if (target_us > source_us) {
+        return shrink_batch(input, source_us, target_us);
+    }
+
+    // Expand: target < source (more, smaller buckets)
+    return expand_batch(input, source_us, target_us);
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp
new file mode 100644
index 00000000..66642760
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.cpp
@@ -0,0 +1,212 @@
+#include <dftracer/utils/core/common/string_intern.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+namespace {
+
+// Folds a pre-aggregated sample into `stats`; no-op without a sum arg.
+void apply_preaggregated_metric(MetricStats& stats, std::uint64_t ev_count,
+                                const ArgsValueProxy& sum_val,
+                                const ArgsValueProxy& min_val,
+                                const ArgsValueProxy& max_val) {
+    if (!sum_val.exists()) return;
+    const auto sum = sum_val.get<std::uint64_t>();
+    stats.count += ev_count;
+    stats.total += sum;
+    if (min_val.exists()) {
+        const auto lo = min_val.get<std::uint64_t>();
+        if (lo < stats.min) stats.min = lo;
+    }
+    if (max_val.exists()) {
+        const auto hi = max_val.get<std::uint64_t>();
+        if (hi > stats.max) stats.max = hi;
+    }
+    if (!(stats.count > 0)) return;
+    stats.mean =
+        static_cast<double>(stats.total) / static_cast<double>(stats.count);
+    stats.m2 = 0.0;
+}
+
+}  // namespace
+
+std::uint64_t compute_time_bucket(std::uint64_t timestamp,
+                                  std::uint64_t duration,
+                                  const AggregationConfig& config) {
+    std::uint64_t midpoint = timestamp + (duration / 2);
+    if (config.use_relative_time) {
+        const std::uint64_t ref = config.reference_timestamp;
+        midpoint = midpoint > ref ? midpoint - ref : 0;  // clamp, no wrap
+    }
+    if (config.time_interval_us == 0) return midpoint;
+    return (midpoint / config.time_interval_us) * config.time_interval_us;
+}
+
+// Builds the grouping key for `ev`: interned cat/name, pid/tid, the
+// optional "hhash"/"fhash" args, the time bucket, and every configured
+// extra group key that has a non-empty string value on the event.
+AggregationKey build_aggregation_key(const DFTracerEvent& ev,
+                                     const AggregationConfig& config) {
+    auto& intern = aggregation_intern();
+
+    AggregationKey key;
+    key.cat_id = intern.get_or_insert(ev.cat);
+    key.name_id = intern.get_or_insert(ev.name);
+    key.pid = ev.pid;
+    key.tid = ev.tid;
+
+    const auto hhash_sv = ev.args["hhash"].get<std::string_view>();
+    if (!hhash_sv.empty()) key.hhash_id = intern.get_or_insert(hhash_sv);
+    const auto fhash_sv = ev.args["fhash"].get<std::string_view>();
+    if (!fhash_sv.empty()) key.fhash_id = intern.get_or_insert(fhash_sv);
+
+    key.time_bucket = compute_time_bucket(ev.ts, ev.dur, config);
+
+    if (!config.extra_group_keys.empty()) {
+        key.extra_keys = std::make_unique<
+            std::vector<std::pair<std::uint32_t, std::uint32_t>>>();
+        for (const auto& extra_key : config.extra_group_keys) {
+            const auto value = ev.args[extra_key].get<std::string_view>();
+            if (value.empty()) continue;
+            // Two statements on purpose: argument evaluation order is
+            // unspecified and both calls may mutate the intern table.
+            const auto key_id = intern.get_or_insert(extra_key);
+            const auto value_id = intern.get_or_insert(value);
+            key.extra_keys->emplace_back(key_id, value_id);
+        }
+    }
+    return key;
+}
+
+void update_aggregation_entry(const DFTracerEvent& ev,
+                              const AggregationConfig& config,
+                              AggregationMap& aggregations,
+                              const AggregationKey& key) {
+    auto it = aggregations.find(key);
+    if (it == aggregations.end()) {
+        it = aggregations
+                 .emplace(key, AggregationMetrics(config.sketch_accuracy))
+                 .first;
+    }
+    auto& metrics = it->second;
+    // Counter events fold pre-aggregated args; other events add one sample.
+    std::uint64_t ev_count = 0;
+
+    if (ev.is_counter()) {
+        auto a_count = ev.args["dft_cnt"];
+        if (!a_count.exists()) a_count = ev.args["count"];
+        ev_count = a_count.exists() ? a_count.get<std::uint64_t>() : 1;
+        metrics.count += ev_count;
+        // Duration: dur_sum/dur_min/dur_max, each falling back to "dur".
+        auto a_dur = ev.args["dur_sum"];
+        if (!a_dur.exists()) a_dur = ev.args["dur"];
+        auto a_dur_min = ev.args["dur_min"];
+        if (!a_dur_min.exists()) a_dur_min = ev.args["dur"];
+        auto a_dur_max = ev.args["dur_max"];
+        if (!a_dur_max.exists()) a_dur_max = ev.args["dur"];
+        apply_preaggregated_metric(metrics.duration, ev_count, a_dur, a_dur_min,
+                                   a_dur_max);
+        // Size: ret_sum/ret_min/ret_max, each falling back to "ret".
+        auto a_size_sum = ev.args["ret_sum"];
+        if (!a_size_sum.exists()) a_size_sum = ev.args["ret"];
+        auto a_size_min = ev.args["ret_min"];
+        if (!a_size_min.exists()) a_size_min = ev.args["ret"];
+        auto a_size_max = ev.args["ret_max"];
+        if (!a_size_max.exists()) a_size_max = ev.args["ret"];
+        apply_preaggregated_metric(metrics.size, ev_count, a_size_sum,
+                                   a_size_min, a_size_max);
+
+        metrics.update_timestamp(ev.ts, config.time_interval_us);
+    } else {
+        metrics.update_duration(ev.dur, config.compute_percentiles);
+        metrics.update_timestamp(ev.ts, ev.dur);
+
+        auto ret = ev.args["ret"];
+        if (ret.exists() &&
+            internal::is_data_transfer_op(key.cat(), key.name())) {
+            std::uint64_t size = ret.get<std::uint64_t>();
+            metrics.update_size(size, config.compute_percentiles);
+        }
+    }
+    // Folds one args field into metrics.custom_metrics (created on demand).
+    auto track_metric_field = [&](std::string_view field) {
+        if (ev.is_counter()) {
+            std::string sum_key = std::string(field) + "_sum";
+            auto a_sum = ev.args[sum_key];
+            if (!a_sum.exists()) a_sum = ev.args[field];
+            std::string min_key = std::string(field) + "_min";
+            auto a_min = ev.args[min_key];
+            if (!a_min.exists()) a_min = ev.args[field];
+            std::string max_key = std::string(field) + "_max";
+            auto a_max = ev.args[max_key];
+            if (!a_max.exists()) a_max = ev.args[field];
+            if (a_sum.exists() || a_min.exists() || a_max.exists()) {
+                if (!metrics.custom_metrics) {
+                    metrics.custom_metrics =
+                        std::make_unique<CustomMetricsMap>();
+                }
+                auto& cm = *metrics.custom_metrics;
+                auto cm_it = cm.find(field);
+                if (cm_it == cm.end()) {
+                    cm_it = cm.emplace(std::string(field),
+                                       MetricStats(metrics.sketch_accuracy))
+                                .first;
+                }
+                apply_preaggregated_metric(cm_it->second, ev_count, a_sum,
+                                           a_min, a_max);
+            }
+        } else {
+            auto field_val = ev.args[field];
+            if (field_val.exists()) {
+                std::uint64_t value = field_val.get<std::uint64_t>();
+                metrics.update_custom_metric(field, value,
+                                             config.compute_percentiles);
+            }
+        }
+    };
+
+    for (const auto& field : config.custom_metric_fields) {
+        track_metric_field(field);
+    }
+    // Optionally track every remaining numeric arg as a custom metric.
+    if (config.track_default_args) {
+        auto is_reserved = [](std::string_view k) {
+            return k == "hhash" || k == "fhash" || k == "dft_cnt" ||
+                   k == "dur" || k == "dur_sum" || k == "dur_min" ||
+                   k == "dur_max" || k == "ret" || k == "ret_sum" ||
+                   k == "ret_min" || k == "ret_max";
+        };
+
+        auto is_preagg_suffix = [](std::string_view k) {
+            return k.size() > 4 && (k.substr(k.size() - 4) == "_sum" ||
+                                    k.substr(k.size() - 4) == "_min" ||
+                                    k.substr(k.size() - 4) == "_max");
+        };
+
+        auto is_extra_group_key = [&](std::string_view k) {
+            for (const auto& gk : config.extra_group_keys) {
+                if (gk == k) return true;
+            }
+            return false;
+        };
+
+        auto is_custom_field = [&](std::string_view k) {
+            for (const auto& cf : config.custom_metric_fields) {
+                if (cf == k) return true;
+            }
+            return false;
+        };
+        // Skip handled keys; on counters also skip *_sum/_min/_max args.
+        ev.args.for_each_member([&](std::string_view k, ArgsValueProxy v) {
+            if (is_reserved(k) || is_extra_group_key(k) || is_custom_field(k))
+                return;
+            if (ev.is_counter() && is_preagg_suffix(k)) return;
+            if (!v.is_number()) return;
+            track_metric_field(k);
+        });
+    }
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp
new file mode 100644
index 00000000..e318f48b
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.cpp
@@ -0,0 +1,54 @@
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+
+#include <string_view>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+// Full merge: starts from the existing value when there is one, folds in
+// every operand in list order and stores the re-serialized result in
+// merge_out->new_value. Any exception raised while decoding or merging
+// is reported by returning false.
+bool AggregationMergeOperator::FullMergeV2(
+    const MergeOperationInput& merge_in,
+    MergeOperationOutput* merge_out) const {
+    const auto as_view = [](const auto& slice) {
+        return std::string_view(slice.data(), slice.size());
+    };
+
+    AggregationMetrics acc;
+    try {
+        if (merge_in.existing_value) {
+            acc = deserialize_agg_value(as_view(*merge_in.existing_value));
+        }
+        for (const auto& operand : merge_in.operand_list) {
+            const auto delta = deserialize_agg_value(as_view(operand));
+            acc.merge_from(delta);
+        }
+    } catch (...) {
+        return false;
+    }
+
+    // Serialization is not covered by the handler above.
+    merge_out->new_value = serialize_agg_value(acc);
+    return true;
+}
+
+// Combines two operands into one; returns false if any step throws.
+bool AggregationMergeOperator::PartialMerge(
+    const ::rocksdb::Slice& /*key*/, const ::rocksdb::Slice& left_operand,
+    const ::rocksdb::Slice& right_operand, std::string* new_value,
+    ::rocksdb::Logger* /*logger*/) const {
+    try {
+        const std::string_view lhs(left_operand.data(), left_operand.size());
+        const std::string_view rhs(right_operand.data(), right_operand.size());
+        auto merged = deserialize_agg_value(lhs);
+        merged.merge_from(deserialize_agg_value(rhs));
+        *new_value = serialize_agg_value(merged);
+    } catch (...) {
+        return false;
+    }
+    return true;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp
index 3dc0295e..76759b18 100644
--- a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_metrics.cpp
@@ -5,65 +5,54 @@
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
-void MetricStats::update(std::uint64_t value, std::uint64_t count,
-                         bool compute_percentiles) {
+// Representation note:
+//   count, total -> plain integer running sums (bit-exact regardless of
+//                   merge order; overflow guarded by u64 range for typical
+//                   trace magnitudes).
+//   m2, m3, m4   -> REPURPOSED. Now hold raw power sums:
+//                     m2 = sum_x^2
+//                     m3 = sum_x^3
+//                     m4 = sum_x^4
+//                   Instead of Welford central moments. Merge becomes
+//                   plain addition, making it commutative + associative.
+//                   While every running power sum stays an integer
+//                   <= 2^53 (exact in a double), additions are exact, so
+//                   serial and MPI outputs match bit-for-bit. Stddev /
+//                   skewness / kurtosis are computed at read time by
+//                   converting power sums to central moments.
+//   mean         -> Derived, never accumulated: update() and merge_from()
+//                   recompute it as total / count.
+void MetricStats::update(std::uint64_t value, bool compute_percentiles) {
+    count++;
     total += value;
     if (value < min) min = value;
     if (value > max) max = value;
 
-    double n = static_cast<double>(count);
-    double delta = static_cast<double>(value) - mean;
-    double delta_n = delta / n;
-    double delta_n2 = delta_n * delta_n;
-    double term1 = delta * delta_n * (n - 1);
-
-    m4 += term1 * delta_n2 * (n * n - 3 * n + 3) + 6 * delta_n2 * m2 -
-          4 * delta_n * m3;
-    m3 += term1 * delta_n * (n - 2) - 3 * delta_n * m2;
-    m2 += term1;
-    mean += delta_n;
+    const double v = static_cast<double>(value);
+    const double v2 = v * v;
+    m2 += v2;
+    m3 += v2 * v;
+    m4 += v2 * v2;
+    mean = static_cast<double>(total) / static_cast<double>(count);
 
     if (compute_percentiles) {
         if (!sketch) {
             sketch = std::make_unique<DDSketch>(sketch_accuracy_);
         }
-        sketch->add(static_cast<double>(value));
+        sketch->add(v);
     }
 }
 
-void MetricStats::merge_from(const MetricStats& other, std::uint64_t n1,
-                             std::uint64_t n2, std::uint64_t n) {
+void MetricStats::merge_from(const MetricStats& other) {
+    count += other.count;
     total += other.total;
     min = std::min(min, other.min);
     max = std::max(max, other.max);
-
-    if (n > 0) {
-        double delta = other.mean - mean;
-        double delta2 = delta * delta;
-        double delta3 = delta * delta2;
-        double delta4 = delta2 * delta2;
-
-        double n1_d = static_cast<double>(n1);
-        double n2_d = static_cast<double>(n2);
-        double n_d = static_cast<double>(n);
-
-        double mean_new = (n1_d * mean + n2_d * other.mean) / n_d;
-
-        m4 = m4 + other.m4 +
-             delta4 * n1_d * n2_d * (n1_d * n1_d - n1_d * n2_d + n2_d * n2_d) /
-                 (n_d * n_d * n_d) +
-             6 * delta2 * (n1_d * n1_d * other.m2 + n2_d * n2_d * m2) /
-                 (n_d * n_d) +
-             4 * delta * (n1_d * other.m3 - n2_d * m3) / n_d;
-
-        m3 = m3 + other.m3 +
-             delta3 * n1_d * n2_d * (n1_d - n2_d) / (n_d * n_d) +
-             3 * delta * (n1_d * other.m2 - n2_d * m2) / n_d;
-
-        m2 = m2 + other.m2 + delta2 * n1_d * n2_d / n_d;
-
-        mean = mean_new;
-    }
+    m2 += other.m2;
+    m3 += other.m3;
+    m4 += other.m4;
+    mean = count > 0 ? static_cast<double>(total) / static_cast<double>(count)
+                     : 0.0;
 
     if (other.sketch) {
         if (!sketch) {
@@ -73,32 +62,59 @@ void MetricStats::merge_from(const MetricStats& other, std::uint64_t n1,
     }
 }
 
-double MetricStats::get_stddev(std::uint64_t count) const {
+// Convert power sums (m2=sum_x^2, m3=sum_x^3, m4=sum_x^4) and
+// (count, total) into the central moments needed for stddev / skewness
+// / kurtosis. Well-known identities:
+//   mu = total / n
+//   M2 = sum_x^2 - n * mu^2
+//   M3 = sum_x^3 - 3 * mu * sum_x^2 + 2 * n * mu^3
+//   M4 = sum_x^4 - 4 * mu * sum_x^3 + 6 * mu^2 * sum_x^2 - 3 * n * mu^4
+static void central_moments(std::uint64_t count, std::uint64_t total, double m2,
+                            double m3, double m4, double& M2, double& M3,
+                            double& M4, double& n, double& mu) {
+    n = static_cast<double>(count);
+    mu = static_cast<double>(total) / n;
+    M2 = m2 - n * mu * mu;
+    M3 = m3 - 3.0 * mu * m2 + 2.0 * n * mu * mu * mu;
+    M4 = m4 - 4.0 * mu * m3 + 6.0 * mu * mu * m2 - 3.0 * n * mu * mu * mu * mu;
+    // Rounding can push nonneg moments slightly negative.
+    if (M2 < 0.0) M2 = 0.0;
+    if (M4 < 0.0) M4 = 0.0;
+}
+
+double MetricStats::get_stddev() const {
     if (count < 2) return 0.0;
-    return std::sqrt(m2 / static_cast<double>(count - 1));
+    double M2, M3, M4, n, mu;
+    central_moments(count, total, m2, m3, m4, M2, M3, M4, n, mu);
+    const double var = M2 / (n - 1.0);
+    return var > 0.0 ? std::sqrt(var) : 0.0;
 }
 
-double MetricStats::get_skewness(std::uint64_t count) const {
-    if (count < 3 || m2 == 0.0) return 0.0;
-    double n = static_cast<double>(count);
-    return std::sqrt(n) * m3 / std::pow(m2, 1.5);
+double MetricStats::get_skewness() const {
+    if (count < 3) return 0.0;
+    double M2, M3, M4, n, mu;
+    central_moments(count, total, m2, m3, m4, M2, M3, M4, n, mu);
+    if (M2 == 0.0) return 0.0;
+    return std::sqrt(n) * M3 / std::pow(M2, 1.5);
 }
 
-double MetricStats::get_kurtosis(std::uint64_t count) const {
-    if (count < 4 || m2 == 0.0) return 0.0;
-    double n = static_cast<double>(count);
-    return n * m4 / (m2 * m2) - 3.0;
+double MetricStats::get_kurtosis() const {
+    if (count < 4) return 0.0;
+    double M2, M3, M4, n, mu;
+    central_moments(count, total, m2, m3, m4, M2, M3, M4, n, mu);
+    if (M2 == 0.0) return 0.0;
+    return n * M4 / (M2 * M2) - 3.0;
 }
 
 void AggregationMetrics::update_duration(std::uint64_t dur,
                                          bool compute_percentiles) {
     count++;
-    duration.update(dur, count, compute_percentiles);
+    duration.update(dur, compute_percentiles);
 }
 
 void AggregationMetrics::update_size(std::uint64_t sz,
                                      bool compute_percentiles) {
-    size.update(sz, count, compute_percentiles);
+    size.update(sz, compute_percentiles);
 }
 
 void AggregationMetrics::update_timestamp(std::uint64_t event_ts,
@@ -122,42 +138,26 @@ void AggregationMetrics::update_timestamp_clamped(std::uint64_t event_ts,
     if (clamped_te > te) te = clamped_te;
 }
 
-void AggregationMetrics::update_custom_metric(const std::string& name,
+void AggregationMetrics::update_custom_metric(std::string_view name,
                                               std::uint64_t value,
                                               bool compute_percentiles) {
     if (!custom_metrics) {
         custom_metrics = std::make_unique<CustomMetricsMap>();
     }
-    if (custom_metrics->find(name) == custom_metrics->end()) {
-        custom_metrics->emplace(name, MetricStats(sketch_accuracy));
-    }
-    (*custom_metrics)[name].update(value, count, compute_percentiles);
-}
-
-double AggregationMetrics::get_stddev_duration() const {
-    return duration.get_stddev(count);
-}
-
-double AggregationMetrics::get_stddev_size() const {
-    return size.get_stddev(count);
-}
-
-double AggregationMetrics::get_custom_stddev(const std::string& name) const {
-    if (!custom_metrics) return 0.0;
     auto it = custom_metrics->find(name);
-    if (it == custom_metrics->end()) return 0.0;
-    return it->second.get_stddev(count);
+    if (it == custom_metrics->end()) {
+        auto [new_it, _] = custom_metrics->emplace(
+            std::string(name), MetricStats(sketch_accuracy));
+        it = new_it;
+    }
+    it->second.update(value, compute_percentiles);
 }
 
 void AggregationMetrics::merge_from(const AggregationMetrics& other) {
-    std::uint64_t n1 = count;
-    std::uint64_t n2 = other.count;
-    std::uint64_t n = n1 + n2;
-
-    count = n;
+    count += other.count;
 
-    duration.merge_from(other.duration, n1, n2, n);
-    size.merge_from(other.size, n1, n2, n);
+    duration.merge_from(other.duration);
+    size.merge_from(other.size);
 
     ts = std::min(ts, other.ts);
     te = std::max(te, other.te);
@@ -167,7 +167,13 @@ void AggregationMetrics::merge_from(const AggregationMetrics& other) {
             custom_metrics = std::make_unique<CustomMetricsMap>();
         }
         for (const auto& [name, other_metric] : *other.custom_metrics) {
-            (*custom_metrics)[name].merge_from(other_metric, n1, n2, n);
+            auto it = custom_metrics->find(name);
+            if (it == custom_metrics->end()) {
+                auto [new_it, _] =
+                    custom_metrics->emplace(name, MetricStats(sketch_accuracy));
+                it = new_it;
+            }
+            it->second.merge_from(other_metric);
         }
     }
 }
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp
new file mode 100644
index 00000000..faa8b849
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.cpp
@@ -0,0 +1,453 @@
+#include <dftracer/utils/utilities/common/serialization/binary_codec.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/hash/fnv1a_hasher_utility.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+
+#include <cstring>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+namespace {
+
+namespace hash = dftracer::utils::utilities::hash;
+
+using common::serialization::BinaryReader;
+using common::serialization::put_be16;
+using common::serialization::put_blob;
+using common::serialization::put_double;
+using common::serialization::put_str;
+using common::serialization::put_u8;
+using common::serialization::put_varint;
+using common::serialization::write_double;
+using common::serialization::write_str;
+using common::serialization::write_varint;
+
+// Shard for (cat, name, pid, tid), with a one-entry thread-local memo.
+std::uint16_t compute_shard(std::string_view cat, std::string_view name,
+                            std::uint64_t pid, std::uint64_t tid) {
+    struct Memo {
+        char cat_buf[64];
+        char name_buf[64];
+        std::size_t cat_len = SIZE_MAX;  // SIZE_MAX: nothing remembered
+        std::size_t name_len = SIZE_MAX;
+        std::uint64_t pid = 0;
+        std::uint64_t tid = 0;
+        std::uint16_t shard = 0;
+    };
+    thread_local Memo memo;
+
+    const bool hit =
+        memo.pid == pid && memo.tid == tid && memo.cat_len == cat.size() &&
+        memo.name_len == name.size() &&
+        std::memcmp(memo.cat_buf, cat.data(), cat.size()) == 0 &&
+        std::memcmp(memo.name_buf, name.data(), name.size()) == 0;
+    if (hit) return memo.shard;
+
+    hash::Fnv1aHashBuilder hasher;
+    hasher.update(cat);
+    hasher.update(name);
+    hasher.update_value(pid);
+    hasher.update_value(tid);
+    const auto shard =
+        static_cast<std::uint16_t>(hasher.finish() % AGG_KEY_NUM_SHARDS);
+    if (cat.size() > sizeof(memo.cat_buf) ||
+        name.size() > sizeof(memo.name_buf)) {
+        memo.cat_len = SIZE_MAX;  // too long to remember: drop the entry
+        return shard;
+    }
+    std::memcpy(memo.cat_buf, cat.data(), cat.size());
+    std::memcpy(memo.name_buf, name.data(), name.size());
+    memo.cat_len = cat.size();
+    memo.name_len = name.size();
+    memo.pid = pid;
+    memo.tid = tid;
+    memo.shard = shard;
+    return shard;
+}
+
+// Wire layout (FULL / FULL_WITH_SKETCH):
+//   fmt:u8, count:varint, total:varint, min:varint, max:varint,
+//   mean:f64, m2:f64, [sketch blob]
+// m2 is the raw power sum sum_x^2; mean is persisted redundantly. m3/m4
+// (sum_x^3, sum_x^4) are not written yet, so they do not round-trip.
+inline char* write_metric_stats(char* p, const MetricStats& ms) {
+    // Writes `ms` at `p` and returns the advanced pointer. COMPACT is one
+    // varint: 0 for an empty stat, else the value of a lone sample. A lone
+    // sample of value 0 must use FULL instead, because the reader maps a
+    // 0 varint back to count=0. A stat holding a sketch never uses
+    // COMPACT.
+    if (!ms.sketch) {
+        const double total_d = static_cast<double>(ms.total);
+        const bool empty = ms.count == 0 && ms.total == 0 && ms.m2 == 0.0;
+        const bool single =
+            ms.count == 1 && ms.total > 0 && ms.m2 == total_d * total_d;
+        if (empty || single) {
+            *p++ = static_cast<char>(METRIC_FMT_COMPACT);
+            return write_varint(p, empty ? 0 : ms.total);
+        }
+    }
+    *p++ = static_cast<char>(METRIC_FMT_FULL);
+    p = write_varint(p, ms.count);
+    p = write_varint(p, ms.total);
+    p = write_varint(p, ms.min);
+    p = write_varint(p, ms.max);
+    p = write_double(p, ms.mean);
+    p = write_double(p, ms.m2);
+    // m3/m4 not persisted yet -- skewness/kurtosis recomputed in memory.
+    // p = write_double(p, ms.m3);
+    // p = write_double(p, ms.m4);
+    return p;
+}
+
+// Upper bound for MetricStats (FULL fmt, no sketch):
+//   1 (fmt) + 4*10 (varints) + 2*8 (doubles) = 57 bytes
+constexpr std::size_t METRIC_STATS_MAX_BYTES_NO_SKETCH = 57;
+
+// Appends `ms` to `out`. With a sketch the FULL_WITH_SKETCH layout is
+// emitted field by field; without one, write_metric_stats picks the form.
+void serialize_metric_stats(std::string& out, const MetricStats& ms) {
+    if (ms.sketch) {
+        put_u8(out, METRIC_FMT_FULL_WITH_SKETCH);
+        put_varint(out, ms.count);
+        put_varint(out, ms.total);
+        put_varint(out, ms.min);
+        put_varint(out, ms.max);
+        put_double(out, ms.mean);
+        put_double(out, ms.m2);
+        auto blob = ms.sketch->serialize();
+        put_blob(out, blob);
+        return;
+    }
+    // Grow by the worst case, write in place, then trim to what was used.
+    const std::size_t base = out.size();
+    out.resize(base + METRIC_STATS_MAX_BYTES_NO_SKETCH);
+    char* const first = out.data() + base;
+    char* const last = write_metric_stats(first, ms);
+    out.resize(base + static_cast<std::size_t>(last - first));
+}
+
+// Reads one MetricStats in any of the three wire formats. COMPACT carries
+// a single varint (0 = empty, otherwise one sample of that value); the
+// FULL variants carry count/total/min/max/mean/m2, plus a sketch blob for
+// FULL_WITH_SKETCH. m3/m4 are not on the wire and are never assigned here.
+MetricStats deserialize_metric_stats(BinaryReader& r, double accuracy) {
+    const auto fmt = r.u8();
+    MetricStats ms(accuracy);
+
+    if (fmt == METRIC_FMT_COMPACT) {
+        const auto val = r.varint();
+        if (!(val > 0)) return ms;  // empty stat
+        const double v = static_cast<double>(val);
+        ms.count = 1;
+        ms.total = val;
+        ms.min = val;
+        ms.max = val;
+        ms.mean = v;
+        ms.m2 = v * v;
+        return ms;
+    }
+
+    // Every other tag is decoded with the FULL field order.
+    ms.count = r.varint();
+    ms.total = r.varint();
+    ms.min = r.varint();
+    ms.max = r.varint();
+    ms.mean = r.f64();
+    ms.m2 = r.f64();
+    if (fmt == METRIC_FMT_FULL_WITH_SKETCH) {
+        const auto blob = r.blob();
+        ms.sketch = std::make_unique<DDSketch>(DDSketch::deserialize(
+            reinterpret_cast<const std::uint8_t*>(blob.data()), blob.size()));
+    }
+    return ms;
+}
+
+}  // namespace
+
+// Encodes `key` into `out` (cleared first): be16 shard, u8 map type, seven
+// varint fields, a be16 pair count, then the extra (key id, value id) pairs.
+void serialize_agg_key_into(std::string& out, std::uint32_t /*config_hash*/,
+                            AggMapType map_type, const AggregationKey& key) {
+    out.clear();
+    auto& intern = aggregation_intern();
+    const auto cat = intern.resolve(key.cat_id);
+    const auto name = intern.resolve(key.name_id);
+    put_be16(out, compute_shard(cat, name, key.pid, key.tid));
+    put_u8(out, static_cast<std::uint8_t>(map_type));
+    put_varint(out, key.cat_id);
+    put_varint(out, key.name_id);
+    put_varint(out, key.pid);
+    put_varint(out, key.tid);
+    put_varint(out, key.hhash_id);
+    put_varint(out, key.fhash_id);
+    put_varint(out, key.time_bucket);
+    const std::size_t extras = key.extra_keys ? key.extra_keys->size() : 0;
+    put_be16(out, static_cast<std::uint16_t>(extras));
+    if (extras == 0) return;
+    for (const auto& [k, v] : *key.extra_keys) {
+        put_varint(out, k);
+        put_varint(out, v);
+    }
+}
+
+// String-based variant: interns cat/name, non-empty hashes and the extra
+// key/value strings, then emits the key layout. `out` is cleared first.
+void serialize_agg_key_into(
+    std::string& out, std::uint32_t /*config_hash*/, AggMapType map_type,
+    std::string_view cat, std::string_view name, std::uint64_t pid,
+    std::uint64_t tid, std::string_view hhash, std::string_view fhash,
+    std::uint64_t time_bucket,
+    const std::vector<std::pair<std::string_view, std::string_view>>*
+        extra_keys) {
+    auto& intern = aggregation_intern();
+    const std::uint16_t shard = compute_shard(cat, name, pid, tid);
+    const std::uint16_t num_extra =
+        extra_keys ? static_cast<std::uint16_t>(extra_keys->size()) : 0;
+    // Worst case: ids (assumed u32) <= 5 B each; pid/tid/bucket u64 <= 10 B.
+    std::size_t total = 2 + 1 + 4 * 5 + 3 * 10 + 2 + num_extra * 2 * 5;
+
+    out.clear();
+    out.reserve(total);
+    put_be16(out, shard);
+    out.push_back(static_cast<char>(map_type));
+    put_varint(out, intern.get_or_insert(cat));
+    put_varint(out, intern.get_or_insert(name));
+    put_varint(out, pid);
+    put_varint(out, tid);
+    put_varint(out, hhash.empty() ? 0 : intern.get_or_insert(hhash));
+    put_varint(out, fhash.empty() ? 0 : intern.get_or_insert(fhash));
+    put_varint(out, time_bucket);
+    put_be16(out, num_extra);
+    if (extra_keys) {
+        for (const auto& [k, v] : *extra_keys) {
+            put_varint(out, intern.get_or_insert(k));
+            put_varint(out, intern.get_or_insert(v));
+        }
+    }
+}
+
+std::string serialize_agg_key(std::uint32_t config_hash, AggMapType map_type,
+                              const AggregationKey& key) {
+    std::string encoded;
+    encoded.reserve(47);  // capacity hint only
+    serialize_agg_key_into(encoded, config_hash, map_type, key);
+    return encoded;
+}
+
+// Decodes a key written by serialize_agg_key_into; the shard is skipped.
+DeserializedAggKey deserialize_agg_key(std::string_view data) {
+    BinaryReader r(data);
+    (void)r.be16();  // shard prefix, not needed to rebuild the key
+    const auto map_type = static_cast<AggMapType>(r.u8());
+    AggregationKey key;
+    key.cat_id = static_cast<std::uint32_t>(r.varint());
+    key.name_id = static_cast<std::uint32_t>(r.varint());
+    key.pid = r.varint();
+    key.tid = r.varint();
+    key.hhash_id = static_cast<std::uint32_t>(r.varint());
+    key.fhash_id = static_cast<std::uint32_t>(r.varint());
+    key.time_bucket = r.varint();
+    if (const auto num_extra = r.be16(); num_extra > 0) {
+        key.extra_keys = std::make_unique<
+            std::vector<std::pair<std::uint32_t, std::uint32_t>>>();
+        key.extra_keys->reserve(num_extra);
+        for (auto left = num_extra; left > 0; --left) {
+            const auto k = static_cast<std::uint32_t>(r.varint());
+            const auto v = static_cast<std::uint32_t>(r.varint());
+            key.extra_keys->emplace_back(k, v);
+        }
+    }
+    return {0, map_type, std::move(key)};
+}
+
+void serialize_agg_value_into(std::string& out, const AggregationMetrics& m) {
+    // Serializes `m` into `out`, replacing its contents. Fast path (no
+    // sketch anywhere): pre-size to an upper bound, write raw, shrink.
+    bool has_sketch = m.duration.sketch || m.size.sketch;
+    if (!has_sketch && m.custom_metrics) {
+        for (const auto& [_, ms] : *m.custom_metrics) {
+            if (ms.sketch) {
+                has_sketch = true;
+                break;
+            }
+        }
+    }
+    // NOTE(review): custom_bytes assumes write_str adds <= 2 prefix bytes.
+    if (!has_sketch) {
+        std::size_t custom_bytes = 0;
+        if (m.custom_metrics) {
+            for (const auto& [name, _] : *m.custom_metrics) {
+                custom_bytes +=
+                    2 + name.size() + METRIC_STATS_MAX_BYTES_NO_SKETCH;
+            }
+        }
+        const std::size_t max_total =
+            10 /*count*/ + METRIC_STATS_MAX_BYTES_NO_SKETCH /*dur*/ +
+            METRIC_STATS_MAX_BYTES_NO_SKETCH /*size*/ + 10 + 10 +
+            10 /*ts/te/parent*/ + 10 /*num_custom*/ + custom_bytes;
+        out.resize(max_total);
+        char* begin = out.data();
+        char* p = begin;
+        p = write_varint(p, m.count);
+        p = write_metric_stats(p, m.duration);
+        p = write_metric_stats(p, m.size);
+        p = write_varint(p, m.ts);
+        p = write_varint(p, m.te);
+        p = write_varint(p, m.parent_pid);
+        const std::uint32_t num_custom =
+            m.custom_metrics
+                ? static_cast<std::uint32_t>(m.custom_metrics->size())
+                : 0;
+        p = write_varint(p, num_custom);
+        if (m.custom_metrics) {
+            for (const auto& [name, ms] : *m.custom_metrics) {
+                p = write_str(p, name);
+                p = write_metric_stats(p, ms);
+            }
+        }
+        out.resize(static_cast<std::size_t>(p - begin));
+        return;
+    }
+    // Slow path: at least one sketch is present; append field by field.
+    out.clear();
+    put_varint(out, m.count);
+    serialize_metric_stats(out, m.duration);
+    serialize_metric_stats(out, m.size);
+    put_varint(out, m.ts);
+    put_varint(out, m.te);
+    put_varint(out, m.parent_pid);
+
+    std::uint32_t num_custom =
+        m.custom_metrics ? static_cast<std::uint32_t>(m.custom_metrics->size())
+                         : 0;
+    put_varint(out, num_custom);
+    if (m.custom_metrics) {
+        for (const auto& [name, ms] : *m.custom_metrics) {
+            put_str(out, name);
+            serialize_metric_stats(out, ms);
+        }
+    }
+}
+// Convenience wrapper around serialize_agg_value_into returning a new string.
+std::string serialize_agg_value(const AggregationMetrics& m) {
+    std::string encoded;
+    encoded.reserve(256);
+    serialize_agg_value_into(encoded, m);
+    return encoded;
+}
+
+AggregationMetrics deserialize_agg_value(std::string_view data) {
+    // Fields are read in the order serialize_agg_value_into writes them:
+    // count, duration, size, ts, te, parent_pid, then the custom metrics.
+    BinaryReader in(data);
+    AggregationMetrics agg;
+    agg.count = in.varint();
+    agg.duration = deserialize_metric_stats(in, agg.sketch_accuracy);
+    agg.size = deserialize_metric_stats(in, agg.sketch_accuracy);
+    agg.ts = in.varint();
+    agg.te = in.varint();
+    agg.parent_pid = in.varint();
+    const auto custom_total = in.varint();
+    if (!(custom_total > 0)) return agg;  // leave custom_metrics unallocated
+    agg.custom_metrics = std::make_unique<CustomMetricsMap>();
+    for (std::uint32_t idx = 0; idx < custom_total; ++idx) {
+        auto key = in.str();
+        auto stats = deserialize_metric_stats(in, agg.sketch_accuracy);
+        agg.custom_metrics->emplace(std::string(key), std::move(stats));
+    }
+    return agg;
+}
+// Ids below this watermark were already loaded or emitted by a flush.
+namespace {
+std::atomic<std::uint32_t>& intern_flushed_watermark() {
+    static std::atomic<std::uint32_t> watermark{0};
+    return watermark;
+}
+}  // namespace
+
+// Load persisted intern-dictionary entries from the AGGREGATION column family
+// into aggregation_intern() and set the flush watermark to max id + 1.
+void load_intern_dictionary(dftracer::utils::rocksdb::RocksDatabase& db) {
+    namespace rcf = dftracer::utils::rocksdb::cf;
+    auto& intern = aggregation_intern();
+    auto it = db.new_iterator(rcf::AGGREGATION);
+    std::uint32_t max_id_plus_one = 0;
+    for (it->Seek({AGG_INTERN_DICT_PREFIX, AGG_INTERN_DICT_PREFIX_LEN});
+         it->Valid(); it->Next()) {
+        auto key_slice = it->key();
+        if (key_slice.size() < AGG_INTERN_DICT_PREFIX_LEN) break;
+        if (static_cast<std::uint8_t>(key_slice[0]) != 0xFF ||
+            static_cast<std::uint8_t>(key_slice[1]) != 0xFD)
+            break;  // key no longer starts with the 0xFF 0xFD bytes
+        // Decode the id encoded as varint after the prefix. RocksDB key order
+        // is lex, which is NOT varint-numeric order past 127, so we cannot
+        // infer the id from iteration order. Read it explicitly.
+        common::serialization::BinaryReader key_reader(
+            std::string_view(key_slice.data() + AGG_INTERN_DICT_PREFIX_LEN,
+                             key_slice.size() - AGG_INTERN_DICT_PREFIX_LEN));
+        std::uint32_t id = 0;
+        try {
+            id = static_cast<std::uint32_t>(key_reader.varint());
+        } catch (const std::exception&) {
+            continue;  // skip keys whose id cannot be decoded
+        }
+        auto val_slice = it->value();
+        intern.insert_at_id(
+            id, std::string_view(val_slice.data(), val_slice.size()));
+        if (id + 1u > max_id_plus_one) max_id_plus_one = id + 1u;
+    }
+    intern_flushed_watermark().store(max_id_plus_one,
+                                     std::memory_order_relaxed);
+}
+
+// Write intern-dictionary entries with ids in [watermark, intern.size()) into
+// `batch` (AGGREGATION column family), then advance the shared watermark.
+void flush_intern_dictionary(
+    dftracer::utils::rocksdb::RocksDatabase& db,
+    dftracer::utils::rocksdb::RocksDatabase::Batch& batch) {
+    namespace rcf = dftracer::utils::rocksdb::cf;
+    auto& intern = aggregation_intern();
+    auto current = static_cast<std::uint32_t>(intern.size());
+    auto flushed = intern_flushed_watermark().load(std::memory_order_relaxed);
+    if (current <= flushed) return;
+    std::string key;  // hoisted: one buffer reused for every id
+    for (std::uint32_t id = flushed; id < current; ++id) {
+        key.assign(AGG_INTERN_DICT_PREFIX, AGG_INTERN_DICT_PREFIX_LEN);
+        common::serialization::put_varint(key, id);
+        auto sv = intern.resolve(id);
+        db.put(batch, rcf::AGGREGATION, key,
+               std::string_view(sv.data(), sv.size()));
+    }
+    // Advance the watermark monotonically. A failed CAS reloads `flushed`;
+    // stop once another thread has already moved it to >= current.
+    while (flushed < current &&
+           !intern_flushed_watermark().compare_exchange_weak(
+               flushed, current, std::memory_order_relaxed)) {
+    }
+}
+// Sink variant: emits unflushed intern-dictionary entries through `sink`.
+void flush_intern_dictionary(
+    dftracer::utils::utilities::indexer::IndexBatchSink& sink) {
+    auto& intern = aggregation_intern();
+    auto current = static_cast<std::uint32_t>(intern.size());
+    auto flushed = intern_flushed_watermark().load(std::memory_order_relaxed);
+    if (current <= flushed) return;
+    // Each key is the dictionary prefix followed by the varint-encoded id.
+    std::string key;
+    for (std::uint32_t id = flushed; id < current; ++id) {
+        key.assign(AGG_INTERN_DICT_PREFIX, AGG_INTERN_DICT_PREFIX_LEN);
+        common::serialization::put_varint(key, id);
+        auto sv = intern.resolve(id);
+        sink.insert_aggregation_put(key,
+                                    std::string_view(sv.data(), sv.size()));
+    }
+    // Advance the watermark unless another thread already moved it past us.
+    while (flushed < current) {
+        if (intern_flushed_watermark().compare_exchange_weak(
+                flushed, current, std::memory_order_relaxed))
+            break;
+        if (flushed >= current) break;
+    }
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp
new file mode 100644
index 00000000..5db29fcb
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.cpp
@@ -0,0 +1,461 @@
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+namespace rcf = dftracer::utils::rocksdb::cf;
+
+namespace {
+
+inline bool is_reserved_arg(std::string_view k) {
+    // Matches the hash and dur/ret statistic arg names. The first byte picks
+    // the candidate group so only that group does string comparisons.
+    const char lead = k.empty() ? '\0' : k.front();
+    if (lead == 'h') return k == "hhash";
+    if (lead == 'f') return k == "fhash";
+    if (lead == 'd') {
+        return k == "dur" || k == "dur_sum" || k == "dur_min" ||
+               k == "dur_max" || k == "dft_cnt";
+    }
+    if (lead == 'r') {
+        return k == "ret" || k == "ret_sum" || k == "ret_min" ||
+               k == "ret_max";
+    }
+    return false;
+}
+
+inline bool is_preagg_suffix(std::string_view k) {
+    // True for "<stem>_sum", "<stem>_min" or "<stem>_max" with non-empty stem.
+    const std::string_view last4 = k.size() > 4 ? k.substr(k.size() - 4) : "";
+    return last4 == "_sum" || last4 == "_min" || last4 == "_max";
+}
+
+}  // namespace
+
+namespace {
+
+/// Build "<prefix>_<16 lowercase hex digits>", where the digits are a 64-bit
+/// FNV-1a-style hash of `file_path`; intended to give concurrent visitors on
+/// different files distinct subdirectories under the staging root.
+std::string make_per_file_batch_id(std::string_view prefix,
+                                   std::string_view file_path) {
+    // NOTE(review): the seed is one digit short of the canonical FNV-1a
+    // offset basis (14695981039346656037); kept so generated ids are unchanged.
+    std::uint64_t digest = 1469598103934665603ULL;
+    constexpr std::uint64_t kPrime = 1099511628211ULL;
+    for (const char ch : file_path) {
+        digest = (digest ^ static_cast<unsigned char>(ch)) * kPrime;
+    }
+    static constexpr char kHex[] = "0123456789abcdef";
+    std::string out;
+    out.reserve(prefix.size() + 1 + 16);
+    out.append(prefix);
+    out.push_back('_');
+    for (int shift = 60; shift >= 0; shift -= 4) {
+        out.push_back(kHex[(digest >> shift) & 0xF]);
+    }
+    return out;
+}
+
+}  // namespace
+// Legacy constructor: flushes go to RocksDatabase batches on `db`.
+AggregationVisitor::AggregationVisitor(
+    std::shared_ptr<rocksdb::RocksDatabase> db, std::uint32_t config_hash,
+    AggregationConfig config, std::string file_path)
+    : db_(std::move(db)),
+      config_hash_(config_hash),
+      config_(std::move(config)),
+      file_path_(std::move(file_path)) {
+    if (config_.track_process_parents || !config_.boundary_events.empty()) {
+        tracker_ = std::make_shared<AssociationTracker>();
+    }
+    local_buffer_.reserve(65536);  // initial capacity hints
+    key_buf_.reserve(128);
+    val_buf_.reserve(256);
+}
+// SST-staging mode: output goes to per-flush SST writers in staging_dir.
+AggregationVisitor::AggregationVisitor(std::string staging_dir,
+                                       std::string batch_id_prefix,
+                                       std::uint32_t config_hash,
+                                       AggregationConfig config,
+                                       std::string file_path)
+    : sst_staging_dir_(std::move(staging_dir)),
+      config_hash_(config_hash),
+      config_(std::move(config)),
+      file_path_(std::move(file_path)) {
+    // Built here, from the member: init order could see a moved-from argument.
+    sst_batch_prefix_ = make_per_file_batch_id(batch_id_prefix, file_path_);
+    if (config_.track_process_parents || !config_.boundary_events.empty()) {
+        tracker_ = std::make_shared<AssociationTracker>();
+    }
+    local_buffer_.reserve(65536);
+    key_buf_.reserve(128);
+    val_buf_.reserve(256);
+    sst_sink_ = std::make_unique<indexer::IndexDatabaseSstWriterContext>(
+        sst_staging_dir_,  // first writer; seal_local_buffer rotates it
+        sst_batch_prefix_ + "_" + std::to_string(sst_flush_counter_++));
+}
+
+void AggregationVisitor::begin(std::size_t /*num_checkpoints*/) {}
+// The begin and on_checkpoint hooks are intentionally empty in this visitor.
+void AggregationVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {}
+
+// Folds one trace event into the in-memory aggregation buffers; metadata
+// events are ignored and raw system samples go to handle_system_event.
+void AggregationVisitor::on_event(const EventRecord& record) {
+    const auto& ev = record.ev;
+    if (ev.is_metadata()) {
+        return;
+    }
+    // tracker_ is only set when parent or boundary tracking is configured.
+    if (tracker_) {
+        tracker_->extract_from_event(ev.name, ev.pid, ev.ts, ev.dur, ev.args,
+                                     config_);
+    }
+    // System events lacking a numeric count/dft_cnt take a separate path.
+    bool is_preaggregated_system = false;
+    if (ev.is_system()) {
+        auto cnt = ev.args["count"];
+        auto dft_cnt = ev.args["dft_cnt"];
+        bool is_preaggregated = cnt.is_number() || dft_cnt.is_number();
+        if (!is_preaggregated) {
+            handle_system_event(record);
+            return;
+        }
+        is_preaggregated_system = true;
+    }
+
+    AggMapType map_type = AggMapType::EVENT;
+    if (is_preaggregated_system) {
+        map_type = AggMapType::SYSTEM;
+    } else if (ev.is_profile()) {
+        map_type = AggMapType::PROFILE;
+    }
+
+    auto hhash = ev.args["hhash"].get<std::string_view>();
+    auto fhash = ev.args["fhash"].get<std::string_view>();
+    auto time_bucket = compute_time_bucket(ev.ts, ev.dur, config_);
+    if (time_bucket < min_time_bucket_) min_time_bucket_ = time_bucket;
+    if (time_bucket > max_time_bucket_) max_time_bucket_ = time_bucket;
+    // Gather non-empty values for the configured extra grouping keys.
+    std::vector<std::pair<std::string_view, std::string_view>> extra_keys_vec;
+    std::vector<std::pair<std::string_view, std::string_view>>* extra_ptr =
+        nullptr;
+    if (!config_.extra_group_keys.empty()) {
+        for (const auto& extra_key : config_.extra_group_keys) {
+            auto value = ev.args[extra_key].get<std::string_view>();
+            if (!value.empty()) {
+                extra_keys_vec.emplace_back(extra_key, value);
+                observed_extra_keys_.emplace(extra_key);
+            }
+        }
+        if (!extra_keys_vec.empty()) extra_ptr = &extra_keys_vec;
+    }
+
+    serialize_agg_key_into(key_buf_, config_hash_, map_type, ev.cat, ev.name,
+                           ev.pid, ev.tid, hhash, fhash, time_bucket,
+                           extra_ptr);
+    // Reuse the previously resolved entry when the serialized key repeats.
+    AggregationMetrics* entry_ptr;
+    if (last_entry_ != nullptr && last_key_ == key_buf_) {
+        entry_ptr = last_entry_;
+    } else {
+        auto [it, inserted] =
+            local_buffer_.try_emplace(key_buf_, config_.sketch_accuracy);
+        entry_ptr = &it->second;
+        last_entry_ = entry_ptr;
+        last_key_ = it->first;
+    }
+    auto& entry = *entry_ptr;
+    const bool compute_percentiles = config_.compute_percentiles;
+    // Counter events supply their own count and *_sum/_min/_max statistics.
+    std::uint64_t ev_count = 1;
+    if (ev.is_counter()) {
+        auto a_count = ev.args["dft_cnt"];
+        if (!a_count.exists()) a_count = ev.args["count"];
+        ev_count = a_count.exists() ? a_count.get<std::uint64_t>() : 1;
+        entry.count += ev_count;
+
+        auto a_dur = ev.args["dur_sum"];
+        if (!a_dur.exists()) a_dur = ev.args["dur"];
+        if (a_dur.exists()) {
+            MetricStats tmp(config_.sketch_accuracy);
+            tmp.count = ev_count;
+            tmp.total = a_dur.get<std::uint64_t>();
+            auto a_dur_min = ev.args["dur_min"];
+            if (!a_dur_min.exists()) a_dur_min = a_dur;
+            tmp.min = a_dur_min.get<std::uint64_t>();
+            auto a_dur_max = ev.args["dur_max"];
+            if (!a_dur_max.exists()) a_dur_max = a_dur;
+            tmp.max = a_dur_max.get<std::uint64_t>();
+            if (tmp.count > 0) {
+                tmp.mean = static_cast<double>(tmp.total) /
+                           static_cast<double>(tmp.count);
+            }
+            entry.duration.merge_from(tmp);
+        }
+
+        auto a_size = ev.args["ret_sum"];
+        if (!a_size.exists()) a_size = ev.args["ret"];
+        if (a_size.exists()) {
+            MetricStats tmp(config_.sketch_accuracy);
+            tmp.count = ev_count;
+            tmp.total = a_size.get<std::uint64_t>();
+            auto a_min = ev.args["ret_min"];
+            if (!a_min.exists()) a_min = a_size;
+            tmp.min = a_min.get<std::uint64_t>();
+            auto a_max = ev.args["ret_max"];
+            if (!a_max.exists()) a_max = a_size;
+            tmp.max = a_max.get<std::uint64_t>();
+            if (tmp.count > 0) {
+                tmp.mean = static_cast<double>(tmp.total) /
+                           static_cast<double>(tmp.count);
+            }
+            entry.size.merge_from(tmp);
+        }
+
+        entry.update_timestamp(ev.ts, config_.time_interval_us);
+    } else {
+        entry.update_duration(ev.dur, compute_percentiles);
+        entry.update_timestamp(ev.ts, ev.dur);
+
+        auto ret = ev.args["ret"];
+        if (ret.exists() && internal::is_data_transfer_op(ev.cat, ev.name)) {
+            entry.update_size(ret.get<std::uint64_t>(), compute_percentiles);
+        }
+    }
+    // With track_default_args, every other numeric arg is a custom metric.
+    if (config_.track_default_args) {
+        const bool is_counter_ev = ev.is_counter();
+        ev.args.for_each_member([&](std::string_view k, ArgsValueProxy v) {
+            if (!v.is_number()) return;
+            if (is_reserved_arg(k)) return;
+            if (is_counter_ev && is_preagg_suffix(k)) return;
+            for (const auto& gk : config_.extra_group_keys) {
+                if (gk == k) return;
+            }
+            for (const auto& cf : config_.custom_metric_fields) {
+                if (cf == k) return;
+            }
+            entry.update_custom_metric(k, v.get<std::uint64_t>(),
+                                       compute_percentiles);
+        });
+    }
+    // Explicitly configured custom metric fields.
+    for (const auto& field : config_.custom_metric_fields) {
+        if (ev.is_counter()) {
+            std::string sum_key = std::string(field) + "_sum";
+            auto a_sum = ev.args[sum_key];
+            if (!a_sum.exists()) a_sum = ev.args[field];
+            std::string min_key = std::string(field) + "_min";
+            auto a_min = ev.args[min_key];
+            if (!a_min.exists()) a_min = ev.args[field];
+            std::string max_key = std::string(field) + "_max";
+            auto a_max = ev.args[max_key];
+            if (!a_max.exists()) a_max = ev.args[field];
+            if (a_sum.exists() && a_sum.is_number()) {
+                if (!entry.custom_metrics) {
+                    entry.custom_metrics = std::make_unique<CustomMetricsMap>();
+                }
+                auto& cm = *entry.custom_metrics;
+                auto cm_it = cm.find(field);
+                if (cm_it == cm.end()) {
+                    cm_it = cm.emplace(std::string(field),
+                                       MetricStats(config_.sketch_accuracy))
+                                .first;
+                }
+                auto& stats = cm_it->second;
+                stats.count += ev_count;
+                stats.total += a_sum.get<std::uint64_t>();
+                if (a_min.exists() && a_min.is_number()) {
+                    stats.min = std::min(stats.min, a_min.get<std::uint64_t>());
+                }
+                if (a_max.exists() && a_max.is_number()) {
+                    stats.max = std::max(stats.max, a_max.get<std::uint64_t>());
+                }
+                if (stats.count > 0) {
+                    stats.mean = static_cast<double>(stats.total) /
+                                 static_cast<double>(stats.count);
+                }
+                observed_custom_metrics_.insert(field);
+            }
+        } else {
+            auto field_val = ev.args[field];
+            if (field_val.exists() && field_val.is_number()) {
+                entry.update_custom_metric(
+                    field, field_val.get<std::uint64_t>(), compute_percentiles);
+            }
+        }
+    }
+    events_processed_++;
+    // Flush once the aggregation map reaches FLUSH_THRESHOLD entries.
+    if (local_buffer_.size() >= FLUSH_THRESHOLD) {
+        seal_local_buffer();
+    }
+}
+
+void AggregationVisitor::handle_system_event(const EventRecord& record) {
+    // Accumulates one raw system sample into system_buffer_, keyed by the
+    // event's hhash and time bucket. Every numeric arg other than
+    // hhash/fhash is tracked as a metric on that entry.
+    const auto& ev = record.ev;
+    auto hhash = ev.args["hhash"].get<std::string_view>();
+    auto bucket = compute_time_bucket(ev.ts, ev.dur, config_);
+
+    // Widen the observed bucket range.
+    if (bucket < min_time_bucket_) min_time_bucket_ = bucket;
+    if (bucket > max_time_bucket_) max_time_bucket_ = bucket;
+
+    // Find or create the metrics entry for this key.
+    serialize_system_key_into(system_key_buf_, hhash, bucket);
+    auto& slot =
+        system_buffer_.try_emplace(system_key_buf_, config_.sketch_accuracy)
+            .first->second;
+    slot.count++;
+    slot.update_timestamp(ev.ts);
+
+    const bool want_percentiles = config_.compute_percentiles;
+    ev.args.for_each_member([&](std::string_view name, ArgsValueProxy value) {
+        // Non-numeric args and the hhash/fhash args are skipped.
+        if (!value.is_number() || name == "hhash" || name == "fhash") return;
+        slot.update_metric(name, value.get<double>(), want_percentiles);
+        observed_system_metrics_.insert(std::string(name));
+    });
+
+    events_processed_++;
+
+    // Seal once the system map reaches FLUSH_THRESHOLD entries.
+    if (system_buffer_.size() >= FLUSH_THRESHOLD) {
+        seal_local_buffer();
+    }
+}
+
+// Drains local_buffer_ and system_buffer_ into the active backend: a rotated
+// SST writer when sst_sink_ is set, otherwise a queued RocksDatabase batch.
+void AggregationVisitor::seal_local_buffer() {
+    if (local_buffer_.empty() && system_buffer_.empty()) return;
+    // Record custom metric names seen in this buffer before it is cleared.
+    for (const auto& [key, metrics] : local_buffer_) {
+        if (metrics.custom_metrics) {
+            for (const auto& [name, _] : *metrics.custom_metrics) {
+                observed_custom_metrics_.insert(name);
+            }
+        }
+    }
+
+    if (sst_sink_) {
+        // Distributed mode: flush the current in-memory maps into the
+        // active per-flush SstWriterContext, then rotate to a fresh one
+        // so the next flush (or on_file_complete) writes to its own SST
+        // with a fresh, strictly-ascending key space.
+        for (auto& [k, m] : local_buffer_) {
+            serialize_agg_value_into(val_buf_, m);
+            sst_sink_->insert_aggregation_merge(k, val_buf_);
+        }
+        local_buffer_.clear();
+        last_entry_ = nullptr;  // cached entry pointed into the cleared map
+        last_key_ = {};
+
+        for (auto& [k, m] : system_buffer_) {
+            serialize_system_value_into(system_val_buf_, m);
+            sst_sink_->insert_system_metrics_merge(k, system_val_buf_);
+        }
+        system_buffer_.clear();
+        flush_intern_dictionary(*sst_sink_);
+
+        // Commit this flush's SSTs and open a new SstWriterContext for
+        // the next flush. The context is always replaced; an empty commit
+        // yields no artifacts and is simply not recorded.
+        auto a = sst_sink_->commit();
+        if (!a.empty()) sst_artifacts_.push_back(std::move(a));
+        sst_sink_ = std::make_unique<indexer::IndexDatabaseSstWriterContext>(
+            sst_staging_dir_,
+            sst_batch_prefix_ + "_" + std::to_string(sst_flush_counter_++));
+        return;
+    }
+
+    // Legacy mode: flush to a RocksDatabase batch; commit at
+    // on_file_complete.
+    if (!db_) return;
+    auto batch = db_->begin_batch();
+    for (auto& [k, m] : local_buffer_) {
+        serialize_agg_value_into(val_buf_, m);
+        db_->merge(batch, rcf::AGGREGATION, k, val_buf_);
+    }
+    local_buffer_.clear();
+    last_entry_ = nullptr;  // cached entry pointed into the cleared map
+    last_key_ = {};
+
+    for (auto& [k, m] : system_buffer_) {
+        serialize_system_value_into(system_val_buf_, m);
+        db_->merge(batch, rcf::SYSTEM_METRICS, k, system_val_buf_);
+    }
+    system_buffer_.clear();
+    flush_intern_dictionary(*db_, batch);
+    pending_batches_.push_back(std::move(batch));
+}
+
+coro::CoroTask<void> AggregationVisitor::on_file_complete() {
+    // Flush whatever is still buffered, then finalize the active backend.
+    seal_local_buffer();
+
+    if (!sst_sink_) {
+        // Legacy mode: commit every batch queued by seal_local_buffer.
+        for (auto& queued : pending_batches_) {
+            db_->commit_batch(queued);
+        }
+        pending_batches_.clear();
+        co_return;
+    }
+
+    // SST mode: commit the writer that is still open and keep its
+    // artifacts only when the commit produced any.
+    auto residue = sst_sink_->commit();
+    if (!residue.empty()) sst_artifacts_.push_back(std::move(residue));
+    sst_sink_.reset();
+}
+
+void AggregationVisitor::flush_to_batch(rocksdb::RocksDatabase::Batch& batch) {
+    // Legacy-only helper (SST mode never calls this); drains into `batch`.
+    if (!db_) return;
+    for (auto& [k, m] : local_buffer_) {
+        serialize_agg_value_into(val_buf_, m);
+        db_->merge(batch, rcf::AGGREGATION, k, val_buf_);
+    }
+    local_buffer_.clear();
+    // Reset on_event's cache: it points into the map that was just cleared.
+    last_entry_ = nullptr;
+    last_key_ = {};
+    for (auto& [k, m] : system_buffer_) {
+        serialize_system_value_into(system_val_buf_, m);
+        db_->merge(batch, rcf::SYSTEM_METRICS, k, system_val_buf_);
+    }
+    system_buffer_.clear();
+    flush_intern_dictionary(*db_, batch);
+}
+
+ChunkAggregationOutput AggregationVisitor::take_output() {
+    // Packages this visitor's per-file results for the caller. The tracker,
+    // when present, is finalized before it is handed over.
+    if (tracker_) tracker_->finalize();
+
+    ChunkAggregationOutput result;
+    result.success = true;
+    result.events_processed = events_processed_;
+    result.min_time_bucket = min_time_bucket_;
+    result.max_time_bucket = max_time_bucket_;
+    // Ownership transfer: file_path_ and tracker_ are left moved-from.
+    result.file_path = std::move(file_path_);
+    result.local_tracker = std::move(tracker_);
+    return result;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp
index be37ccd2..3fae1eca 100644
--- a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.cpp
@@ -1,9 +1,9 @@
+#include <dftracer/utils/core/common/transparent_string_hash.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregator_summary_utility.h>
 
 #include <cstdint>
 #include <cstdio>
 #include <string>
-#include <unordered_map>
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
@@ -21,9 +21,15 @@ coro::CoroTask<void> AggregatorSummaryUtility::process(
     std::printf("Total events aggregated: %llu\n",
                 static_cast<unsigned long long>(total_events));
 
-    std::unordered_map<std::string, std::uint64_t> category_counts;
+    StringViewMap<std::uint64_t> category_counts;
     for (const auto& [key, metrics] : aggregations) {
-        category_counts[std::string(key.cat())] += metrics.count;
+        auto cat = key.cat();
+        auto it = category_counts.find(cat);
+        if (it == category_counts.end()) {
+            category_counts.emplace(std::string(cat), metrics.count);
+        } else {
+            it->second += metrics.count;
+        }
     }
 
     std::printf("\nEvents by category:\n");
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp
index 465d4029..475d2913 100644
--- a/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.cpp
@@ -1,26 +1,28 @@
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/platform_compat.h>
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_output.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/chunk_mapper_utility.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h>
-#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
-#include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
-#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
 
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 #include <dftracer/utils/utilities/common/arrow/column_builder.h>
 #endif
 
-#include <unistd.h>
+#include <dftracer/utils/core/common/transparent_string_hash.h>
 
 #include <algorithm>
 #include <atomic>
-#include <ctime>
 #include <set>
+#include <unordered_set>
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
@@ -53,13 +55,8 @@ AggregatorInput& AggregatorInput::with_force_rebuild(bool force) {
     return *this;
 }
 
-AggregatorInput& AggregatorInput::with_chunk_size_mb(std::size_t mb) {
-    chunk_size_mb = mb;
-    return *this;
-}
-
-AggregatorInput& AggregatorInput::with_batch_size_mb(std::size_t mb) {
-    batch_size_mb = mb;
+AggregatorInput& AggregatorInput::with_parallelism(std::size_t n) {
+    parallelism = n;
     return *this;
 }
 
@@ -80,25 +77,35 @@ using common::arrow::RecordBatchBuilder;
 ArrowExportResult AggregationBatch::to_arrow() const {
     RecordBatchBuilder builder;
 
-    // Discover the union of extra key IDs and custom metric names.
-    std::set<std::uint32_t> extra_key_id_set;
-    std::set<std::string_view, std::less<>> custom_metric_name_set;
-    for (const auto& [key, metrics] : entries) {
-        if (key.extra_keys && !key.extra_keys->empty()) {
-            for (const auto& [k, v] : *key.extra_keys) {
-                extra_key_id_set.insert(k);
+    // Use precomputed global columns if available, otherwise discover locally.
+    std::vector<std::uint32_t> local_extra_key_ids;
+    std::vector<std::string> local_custom_metric_names;
+    if (!global_extra_key_ids || !global_custom_metric_names) {
+        std::set<std::uint32_t> extra_key_id_set;
+        std::set<std::string_view, std::less<>> custom_metric_name_set;
+        for (const auto& entry : entries) {
+            if (entry.key.extra_keys && !entry.key.extra_keys->empty()) {
+                for (const auto& [k, v] : *entry.key.extra_keys) {
+                    extra_key_id_set.insert(k);
+                }
             }
-        }
-        if (metrics.custom_metrics && !metrics.custom_metrics->empty()) {
-            for (const auto& [name, _] : *metrics.custom_metrics) {
-                custom_metric_name_set.insert(name);
+            if (entry.metrics.custom_metrics &&
+                !entry.metrics.custom_metrics->empty()) {
+                for (const auto& [name, _] : *entry.metrics.custom_metrics) {
+                    custom_metric_name_set.insert(name);
+                }
             }
         }
+        local_extra_key_ids.assign(extra_key_id_set.begin(),
+                                   extra_key_id_set.end());
+        local_custom_metric_names.assign(custom_metric_name_set.begin(),
+                                         custom_metric_name_set.end());
     }
-    std::vector<std::uint32_t> extra_key_ids(extra_key_id_set.begin(),
-                                             extra_key_id_set.end());
-    std::vector<std::string_view> custom_metric_names(
-        custom_metric_name_set.begin(), custom_metric_name_set.end());
+    const auto& extra_key_ids =
+        global_extra_key_ids ? *global_extra_key_ids : local_extra_key_ids;
+    const auto& custom_metric_names = global_custom_metric_names
+                                          ? *global_custom_metric_names
+                                          : local_custom_metric_names;
 
     // Build schema: batch_type + fixed columns + extra keys + custom metrics
     std::vector<common::arrow::ColumnSpec> schema = {
@@ -114,11 +121,16 @@ ArrowExportResult AggregationBatch::to_arrow() const {
         {"size_std", ColumnType::DOUBLE},   {"ts", ColumnType::UINT64},
         {"te", ColumnType::UINT64},
     };
+    // Add CI columns when batch has approximated entries
+    if (has_approximated_entries) {
+        schema.push_back({"count_ci_lower", ColumnType::DOUBLE});
+        schema.push_back({"count_ci_upper", ColumnType::DOUBLE});
+    }
     for (auto id : extra_key_ids) {
         schema.push_back({std::string(aggregation_intern().resolve(id)),
                           ColumnType::STRING});
     }
-    // Custom metric suffixed names — need owned strings for ColumnSpec
+    // Custom metric suffixed names
     struct MetricSuffix {
         const char* suffix;
         ColumnType type;
@@ -141,7 +153,9 @@ ArrowExportResult AggregationBatch::to_arrow() const {
     builder.declare_schema(schema);
     builder.reserve(entries.size());
 
-    for (const auto& [key, metrics] : entries) {
+    for (const auto& entry : entries) {
+        const auto& key = entry.key;
+        const auto& metrics = entry.metrics;
         std::size_t ci = 0;
         builder.append_int64(ci++, static_cast<int64_t>(batch_type));
         builder.append_string(ci++, key.cat());
@@ -157,12 +171,12 @@ ArrowExportResult AggregationBatch::to_arrow() const {
                               metrics.count > 0 ? metrics.duration.min : 0);
         builder.append_uint64(ci++, metrics.duration.max);
         builder.append_double(ci++, metrics.duration.mean);
-        builder.append_double(ci++, metrics.get_stddev_duration());
+        builder.append_double(ci++, metrics.duration.get_stddev());
         builder.append_uint64(ci++, metrics.size.total);
         builder.append_uint64(ci++, metrics.count > 0 ? metrics.size.min : 0);
         builder.append_uint64(ci++, metrics.size.max);
         builder.append_double(ci++, metrics.size.mean);
-        builder.append_double(ci++, metrics.get_stddev_size());
+        builder.append_double(ci++, metrics.size.get_stddev());
         builder.append_uint64(ci++, metrics.ts);
         builder.append_uint64(ci++, metrics.te);
 
@@ -192,7 +206,7 @@ ArrowExportResult AggregationBatch::to_arrow() const {
                     builder.append_uint64(ci++, metrics.count > 0 ? ms.min : 0);
                     builder.append_uint64(ci++, ms.max);
                     builder.append_double(ci++, ms.mean);
-                    builder.append_double(ci++, ms.get_stddev(metrics.count));
+                    builder.append_double(ci++, ms.get_stddev());
                     continue;
                 }
             }
@@ -200,187 +214,513 @@ ArrowExportResult AggregationBatch::to_arrow() const {
                 builder.append_null(ci++);
         }
 
+        // Add CI columns when batch has approximated entries
+        if (has_approximated_entries) {
+            builder.append_double(ci++, entry.count_ci.lower);
+            builder.append_double(ci++, entry.count_ci.upper);
+        }
+
         builder.end_row();
     }
 
     return builder.finish();
 }
+
+// ---------------------------------------------------------------------------
+// AggregationBatch::to_dfanalyzer_arrow
+// ---------------------------------------------------------------------------
+
+namespace {
+
+// IO category codes; numeric values match the dfanalyzer IOCategory enum.
+enum class IOCategory : std::int8_t {
+    READ = 1,
+    WRITE = 2,
+    METADATA = 3,
+    PCTL = 4,
+    IPC = 5,
+    OTHER = 6,
+    SYNC = 7,
+};
+
+IOCategory get_io_category(std::string_view func_name) {
+    // Exact, case-sensitive membership test against a list of call names.
+    const auto is_one_of =
+        [func_name](std::initializer_list<std::string_view> candidates) {
+            for (const std::string_view candidate : candidates)
+                if (candidate == func_name) return true;
+            return false;
+        };
+    // Read calls.
+    if (is_one_of({"read", "pread", "readv", "preadv", "fread"})) {
+        return IOCategory::READ;
+    }
+    // Write calls.
+    if (is_one_of({"write", "pwrite", "writev", "pwritev", "fwrite"})) {
+        return IOCategory::WRITE;
+    }
+    // Sync calls.
+    if (is_one_of({"fsync", "fdatasync", "msync", "sync"})) {
+        return IOCategory::SYNC;
+    }
+    // Metadata calls: open/close, stat variants, seeks, directory/link ops.
+    if (is_one_of({"open",     "open64",     "close",    "fopen",
+                   "fopen64",  "fclose",     "stat",     "fstat",
+                   "lstat",    "fstatat",    "__xstat",  "__xstat64",
+                   "__lxstat", "__lxstat64", "__fxstat", "__fxstat64",
+                   "access",   "lseek",      "lseek64",  "fseek",
+                   "ftell",    "seek",       "fcntl",    "ftruncate",
+                   "mkdir",    "rmdir",      "unlink",   "remove",
+                   "rename",   "link",       "readlink", "opendir",
+                   "closedir", "readdir"})) {
+        return IOCategory::METADATA;
+    }
+    // Every other name, including the empty string, maps to OTHER.
+    return IOCategory::OTHER;
+}
+
+// Looks `hash` up in `hash_table` and returns the mapped string; returns
+// `hash` itself when the table is null, the hash is empty, or it is unmapped.
+std::string resolve_hash(
+    const std::unordered_map<std::string, std::string>* hash_table,
+    std::string_view hash) {
+    std::string key(hash);
+    if (hash_table == nullptr || key.empty()) return key;
+    const auto found = hash_table->find(key);
+    return found == hash_table->end() ? key : found->second;
+}
+
+// Formats "app#<host>#<pid>#<tid>", where <host> is host_name if non-empty,
+// otherwise hhash if non-empty, otherwise the literal "unknown".
+std::string build_proc_name(std::string_view host_name, std::string_view hhash,
+                            std::uint64_t pid, std::uint64_t tid) {
+    std::string_view host_part = "unknown";
+    if (!host_name.empty()) {
+        host_part = host_name;
+    } else if (!hhash.empty()) {
+        host_part = hhash;
+    }
+    std::string proc_name("app#");
+    proc_name.append(host_part).push_back('#');
+    proc_name.append(std::to_string(pid)).push_back('#');
+    proc_name.append(std::to_string(tid));
+    return proc_name;
+}
+
+}  // namespace
+
+ArrowExportResult AggregationBatch::to_dfanalyzer_arrow(
+    const DfanalyzerContext& ctx) const {
+    RecordBatchBuilder builder;
+    // Emits one row per entry: SYSTEM batches get host metrics, every other
+    // batch type gets per-call rows. Bucket width is in microseconds.
+    auto bucket_width_us =
+        static_cast<std::uint64_t>(ctx.time_granularity * ctx.time_resolution);
+
+    if (batch_type == AggregationBatchType::SYSTEM) {
+        // SYSTEM schema; the appends below must follow this column order
+        std::vector<common::arrow::ColumnSpec> schema = {
+            {"host_hash", ColumnType::STRING},
+            {"time_range", ColumnType::INT64},
+            {"sys_cpu_iowait_pct", ColumnType::DOUBLE},
+            {"sys_cpu_user_pct", ColumnType::DOUBLE},
+            {"sys_cpu_system_pct", ColumnType::DOUBLE},
+            {"sys_cpu_idle_pct", ColumnType::DOUBLE},
+            {"sys_core_iowait_pct_max", ColumnType::DOUBLE},
+            {"sys_core_iowait_pct_p95", ColumnType::DOUBLE},
+            {"sys_mem_dirty", ColumnType::DOUBLE},
+            {"sys_mem_cached", ColumnType::DOUBLE},
+            {"sys_mem_available", ColumnType::DOUBLE},
+        };
+        builder.declare_schema(schema);
+        builder.reserve(entries.size());
+
+        for (const auto& entry : entries) {
+            const auto& key = entry.key;
+            const auto& metrics = entry.metrics;
+            std::size_t ci = 0;
+
+            builder.append_string(ci++, key.hhash());
+            auto time_range =
+                bucket_width_us > 0
+                    ? static_cast<std::int64_t>(
+                          (key.time_bucket - ctx.time_origin) / bucket_width_us)
+                    : 0;  // a zero bucket width maps every row to index 0
+            builder.append_int64(ci++, time_range);
+
+            // Lookup helpers: a missing map or missing metric reads as 0.0
+            auto get_metric = [&](const char* name) -> double {
+                if (!metrics.custom_metrics) return 0.0;
+                auto it = metrics.custom_metrics->find(name);
+                if (it == metrics.custom_metrics->end()) return 0.0;
+                return it->second.mean;
+            };
+            auto get_metric_max = [&](const char* name) -> double {
+                if (!metrics.custom_metrics) return 0.0;
+                auto it = metrics.custom_metrics->find(name);
+                if (it == metrics.custom_metrics->end()) return 0.0;
+                return static_cast<double>(it->second.max);
+            };
+
+            builder.append_double(ci++, get_metric("iowait_pct"));
+            builder.append_double(ci++, get_metric("user_pct"));
+            builder.append_double(ci++, get_metric("system_pct"));
+            builder.append_double(ci++, get_metric("idle_pct"));
+            builder.append_double(ci++, get_metric_max("iowait_pct"));
+            builder.append_double(ci++,
+                                  get_metric("iowait_pct"));  // mean as p95
+            builder.append_double(ci++, get_metric("Dirty"));
+            builder.append_double(ci++, get_metric("Cached"));
+            builder.append_double(ci++, get_metric("MemAvailable"));
+
+            builder.end_row();
+        }
+    } else {
+        // Schema for every non-SYSTEM batch; appends follow this column order
+        std::vector<common::arrow::ColumnSpec> schema = {
+            {"cat", ColumnType::STRING},
+            {"func_name", ColumnType::STRING},
+            {"pid", ColumnType::INT64},
+            {"tid", ColumnType::INT64},
+            {"file_hash", ColumnType::STRING},
+            {"host_hash", ColumnType::STRING},
+            {"file_name", ColumnType::STRING},
+            {"host_name", ColumnType::STRING},
+            {"proc_name", ColumnType::STRING},
+            {"io_cat", ColumnType::INT64},
+            {"acc_pat", ColumnType::INT64},
+            {"count", ColumnType::INT64},
+            {"time", ColumnType::DOUBLE},
+            {"size", ColumnType::INT64},
+            {"time_min", ColumnType::DOUBLE},
+            {"time_max", ColumnType::DOUBLE},
+            {"size_min", ColumnType::INT64},
+            {"size_max", ColumnType::INT64},
+            {"time_range", ColumnType::INT64},
+            {"time_start", ColumnType::INT64},
+            {"time_end", ColumnType::INT64},
+        };
+        builder.declare_schema(schema);
+        builder.reserve(entries.size());
+
+        for (const auto& entry : entries) {
+            const auto& key = entry.key;
+            const auto& metrics = entry.metrics;
+            std::size_t ci = 0;
+
+            auto fhash = key.fhash();
+            auto hhash = key.hhash();
+            auto file_name = resolve_hash(ctx.file_hashes, fhash);
+            auto host_name = resolve_hash(ctx.host_hashes, hhash);
+            auto proc_name =
+                build_proc_name(host_name, hhash, key.pid, key.tid);
+            auto io_cat = get_io_category(key.name());
+
+            builder.append_string(ci++, key.cat());
+            builder.append_string(ci++, key.name());
+            builder.append_int64(ci++, static_cast<std::int64_t>(key.pid));
+            builder.append_int64(ci++, static_cast<std::int64_t>(key.tid));
+            builder.append_string(ci++, fhash);
+            builder.append_string(ci++, hhash);
+            builder.append_string(ci++, file_name);
+            builder.append_string(ci++, host_name);
+            builder.append_string(ci++, proc_name);
+            builder.append_int64(ci++, static_cast<std::int64_t>(io_cat));
+            builder.append_int64(ci++, 0);  // acc_pat always 0
+
+            builder.append_int64(ci++,
+                                 static_cast<std::int64_t>(metrics.count));
+            // time: duration.total / time_resolution (us -> seconds)
+            builder.append_double(ci++,
+                                  static_cast<double>(metrics.duration.total) /
+                                      ctx.time_resolution);
+            // size: written as null when the summed size is 0
+            if (metrics.size.total > 0) {
+                builder.append_int64(
+                    ci++, static_cast<std::int64_t>(metrics.size.total));
+            } else {
+                builder.append_null(ci++);
+            }
+            // time_min/max: same scaling as time; min reads 0.0 if count is 0
+            builder.append_double(
+                ci++, metrics.count > 0
+                          ? static_cast<double>(metrics.duration.min) /
+                                ctx.time_resolution
+                          : 0.0);
+            builder.append_double(ci++,
+                                  static_cast<double>(metrics.duration.max) /
+                                      ctx.time_resolution);
+            // size_min/max: null unless size.total and count are both positive
+            if (metrics.size.total > 0 && metrics.count > 0) {
+                builder.append_int64(
+                    ci++, static_cast<std::int64_t>(metrics.size.min));
+                builder.append_int64(
+                    ci++, static_cast<std::int64_t>(metrics.size.max));
+            } else {
+                builder.append_null(ci++);
+                builder.append_null(ci++);
+            }
+
+            // time_range: bucket index relative to time_origin (0 if width 0)
+            auto time_range =
+                bucket_width_us > 0
+                    ? static_cast<std::int64_t>(
+                          (key.time_bucket - ctx.time_origin) / bucket_width_us)
+                    : 0;
+            builder.append_int64(ci++, time_range);
+            // time_start/end: ts/te minus time_origin, not rescaled (still us)
+            builder.append_int64(
+                ci++, static_cast<std::int64_t>(metrics.ts - ctx.time_origin));
+            builder.append_int64(
+                ci++, static_cast<std::int64_t>(metrics.te - ctx.time_origin));
+
+            builder.end_row();
+        }
+    }
+
+    return builder.finish();
+}
 #endif  // DFTRACER_UTILS_ENABLE_ARROW
 
 // ---------------------------------------------------------------------------
-// AggregatorUtility::process
+// AggregatorUtility::process - parallel, RocksDB-backed, fused pipeline
 // ---------------------------------------------------------------------------
 
 coro::AsyncGenerator<AggregationBatch> AggregatorUtility::process(
     const AggregatorInput& input) {
-    // Resolve index directory — create a temp one if not specified.
-    std::string effective_index_dir = input.index_dir;
-    std::string temp_index_dir;
-    if (effective_index_dir.empty()) {
-        try {
-            auto temp_path = fs::temp_directory_path();
-            temp_path /= "dftracer_idx_" + std::to_string(std::time(nullptr)) +
-                         "_" + std::to_string(getpid());
-            temp_index_dir = temp_path.string();
-            fs::create_directories(temp_index_dir);
-        } catch (const fs::filesystem_error&) {
-            temp_index_dir = "/tmp/dftracer_idx_" +
-                             std::to_string(std::time(nullptr)) + "_" +
-                             std::to_string(getpid());
-            fs::create_directories(temp_index_dir);
-        }
-        effective_index_dir = temp_index_dir;
+    if (!has_context()) {
+        DFTRACER_UTILS_LOG_ERROR(
+            "AggregatorUtility requires CoroScope context. "
+            "Use Runtime::scope() to run this utility.");
+        co_return;
     }
+    CoroScope& scope = context();
 
-    // Discover input files.
-    filesystem::PatternDirectoryScannerUtility scanner;
-    filesystem::PatternDirectoryScannerUtilityInput scan_input{
-        input.directory, {".pfw", ".pfw.gz"}, false};
-    auto matched_entries = co_await scanner.process(scan_input);
-
-    std::vector<std::string> input_files;
-    input_files.reserve(matched_entries.size());
-    for (const auto& entry : matched_entries) {
-        input_files.push_back(entry.path.string());
+    // Determine parallelism
+    std::size_t parallelism = input.parallelism;
+    if (parallelism == 0) {
+        parallelism = dftracer_utils_hardware_concurrency();
     }
 
-    if (input_files.empty()) {
+    // Resolve files and index path with aggregation cache check
+    indexing::IndexResolverUtility resolver;
+    indexing::ResolverInput resolver_input;
+    resolver_input.directory = input.directory;
+    resolver_input.index_dir = input.index_dir;
+    resolver_input.require_aggregation = !input.force_rebuild;
+    resolver_input.aggregation_config = input.config;
+    auto scan_result = co_await scope.spawn(resolver, resolver_input);
+
+    if (scan_result.all_files.empty()) {
         DFTRACER_UTILS_LOG_WARN("No .pfw or .pfw.gz files found in: %s",
                                 input.directory.c_str());
-        co_yield AggregationBatch{};
         co_return;
     }
 
-    // Sequential pipeline: index → metadata → chunk map → aggregate → merge.
-    // Parallelism at the file/chunk level is left to the caller (e.g. the
-    // CLI binary uses CoroScope workers; Python callers use the Runtime).
-    EventAggregatorUtility merger;
-    std::atomic<int> global_chunk_idx{0};
-
-    if (input.force_rebuild && !input_files.empty()) {
-        const std::string shared_index_path =
-            composites::dft::internal::determine_index_path(
-                input_files.front(), effective_index_dir);
-        if (fs::exists(shared_index_path)) {
-            fs::remove_all(shared_index_path);
-        }
+    DFTRACER_UTILS_LOG_INFO(
+        "Found %zu files (%zu need checkpoint, %zu need aggregation, %zu "
+        "cached)",
+        scan_result.all_files.size(), scan_result.needs_checkpoint.size(),
+        scan_result.needs_aggregation.size(), scan_result.cached.size());
+
+    const auto& shared_index_path = scan_result.index_path;
+
+    // Force rebuild: clear existing index
+    if (input.force_rebuild && fs::exists(shared_index_path)) {
+        DFTRACER_UTILS_LOG_INFO("Clearing shared index store: %s",
+                                shared_index_path.c_str());
+        fs::remove_all(shared_index_path);
     }
 
-    for (const auto& file_path : input_files) {
-        bool is_compressed =
-            file_path.size() >= 3 &&
-            file_path.compare(file_path.size() - 3, 3, ".gz") == 0;
-
-        std::string idx_path;
-        if (is_compressed) {
-            idx_path = composites::dft::internal::determine_index_path(
-                file_path, effective_index_dir);
-            auto idx_input = indexer::IndexBuildConfig::for_file(file_path)
-                                 .with_checkpoint_size(input.checkpoint_size)
-                                 .with_force_rebuild(false)
-                                 .with_index_dir(effective_index_dir);
-            co_await indexer::IndexBuilderUtility{}.process(idx_input);
-        }
+    // Open RocksDB-backed aggregator with merge operator
+    auto agg_db = EventAggregator::open_with_merge_operator(shared_index_path);
+    auto merger = std::make_unique<EventAggregator>(agg_db, 0);
 
-        // Collect file metadata (line count, size, etc.).
-        auto meta_input =
-            composites::dft::MetadataCollectorUtilityInput::from_file(file_path)
-                .with_checkpoint_size(input.checkpoint_size)
-                .with_force_rebuild(false)
-                .with_index(idx_path);
-        auto metadata =
-            co_await composites::dft::MetadataCollectorUtility{}.process(
-                meta_input);
-
-        if (!metadata.success) {
-            DFTRACER_UTILS_LOG_WARN("Skipping file (metadata failed): %s",
-                                    file_path.c_str());
-            continue;
-        }
+    // Build list of files needing work (checkpoint or aggregation)
+    std::vector<std::string> files_needing_work;
+    files_needing_work.reserve(scan_result.needs_checkpoint.size() +
+                               scan_result.needs_aggregation.size());
+    for (auto& item : scan_result.needs_checkpoint) {
+        files_needing_work.push_back(std::move(item.file_path));
+    }
+    for (auto& item : scan_result.needs_aggregation) {
+        files_needing_work.push_back(std::move(item.file_path));
+    }
 
-        // Partition the file into byte-range chunks.
-        FileChunkMapperUtility file_mapper;
-        auto file_chunks = co_await file_mapper.process(
-            FileChunkMapperInput::from_metadata(metadata)
-                .with_config(input.config)
-                .with_checkpoint_size(input.checkpoint_size)
-                .with_target_chunk_size(input.chunk_size_mb)
-                .with_batch_size(input.batch_size_mb * 1024 * 1024));
-
-        int start_idx =
-            global_chunk_idx.fetch_add(static_cast<int>(file_chunks.size()));
-        for (int i = 0; i < static_cast<int>(file_chunks.size()); ++i) {
-            file_chunks[i].chunk_index = start_idx + i;
+    // Index and aggregate in parallel using fused pipeline
+    if (!files_needing_work.empty()) {
+        auto agg_config_ptr = std::make_shared<AggregationConfig>(input.config);
+
+        auto batch_config = std::make_shared<indexer::IndexBuildBatchConfig>();
+        batch_config->file_paths = std::move(files_needing_work);
+        batch_config->index_dir = input.index_dir;
+        batch_config->checkpoint_size = input.checkpoint_size;
+        batch_config->parallelism = parallelism;
+        batch_config->force_rebuild = false;  // Already handled above
+        batch_config->use_batch_write = true;
+
+        // Attach AggregationVisitor to each file during parsing
+        batch_config->dft_visitor_factory =
+            [agg_db, agg_config_ptr](const std::string& file_path)
+            -> std::vector<std::unique_ptr<composites::dft::DftEventVisitor>> {
+            std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>
+                visitors;
+            visitors.push_back(std::make_unique<AggregationVisitor>(
+                agg_db, 0, *agg_config_ptr, file_path));
+            return visitors;
+        };
+
+        auto batch_result = co_await indexer::IndexBatchBuilderUtility::process(
+            &scope, std::move(batch_config));
+
+        // Drain visitors and merge results
+        std::vector<std::string> processed_files;
+        for (auto& file_visitors : batch_result.extra_visitors) {
+            for (auto& visitor : file_visitors) {
+                auto* agg_visitor =
+                    dynamic_cast<AggregationVisitor*>(visitor.get());
+                if (agg_visitor) {
+                    for (const auto& k : agg_visitor->observed_extra_keys())
+                        merger->add_observed_extra_key(k);
+                    for (const auto& m : agg_visitor->observed_custom_metrics())
+                        merger->add_observed_custom_metric(m);
+                    auto output = agg_visitor->take_output();
+                    processed_files.push_back(output.file_path);
+                    merger->merge_chunk(std::move(output));
+                }
+            }
+            file_visitors.clear();
         }
 
-        for (auto& chunk : file_chunks) {
-            ChunkAggregatorUtility agg;
-            auto output = co_await agg.process(chunk);
-            merger.merge_chunk(std::move(output));
+        // Write global config and per-file markers for cache detection
+        if (!processed_files.empty()) {
+            namespace rcf = dftracer::utils::rocksdb::cf;
+            indexer::IndexDatabase idx_db(
+                shared_index_path,
+                dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+
+            auto batch = agg_db->begin_batch();
+
+            // Write global config (0xFFFE key)
+            AggGlobalConfig global_cfg;
+            global_cfg.time_interval_us = input.config.time_interval_us;
+            global_cfg.config_hash = input.config.compute_hash();
+            agg_db->put(batch, rcf::AGGREGATION,
+                        std::string_view(AGG_GLOBAL_CONFIG_KEY, 2),
+                        serialize_agg_global_config(global_cfg));
+
+            // Write per-file markers (0xFFFF + file_id keys)
+            for (const auto& file_path : processed_files) {
+                int file_id = idx_db.find_file(file_path);
+                if (file_id >= 0) {
+                    agg_db->put(batch, rcf::AGGREGATION,
+                                make_agg_file_key(file_id), "");
+                }
+            }
+
+            agg_db->commit_batch(batch);
         }
     }
 
-    // Finalize the merged aggregation map.
-    auto agg_results = merger.finalize();
-
-    // Resolve process-parent associations and boundary events.
-    AssociationResolverInput resolver_input;
-    resolver_input.trackers = std::move(agg_results.trackers);
-    resolver_input.aggregations = std::move(agg_results);
-    resolver_input.config = input.config;
+    // Get observed columns for consistent Arrow schema
+    auto obs = merger->observed_columns();
+    auto global_extra_key_ids =
+        std::make_shared<std::vector<std::uint32_t>>(obs.extra_key_ids);
+    auto global_custom_metric_names =
+        std::make_shared<std::vector<std::string>>(obs.custom_metric_names);
 
-    AssociationResolverUtility resolver;
-    auto resolver_output = co_await resolver.process(resolver_input);
+    // Stable, deterministic schema ordering
+    std::sort(global_extra_key_ids->begin(), global_extra_key_ids->end());
+    std::sort(global_custom_metric_names->begin(),
+              global_custom_metric_names->end());
 
-    // Yield resolved aggregations in bounded batches, separated by type.
+    // Yield batches by scanning the merged aggregator
     const std::size_t batch_sz = input.event_batch_size;
-    const auto& resolved = resolver_output.aggregations;
-
-    auto yield_map = [&](AggregationMap& map, AggregationBatchType type)
-        -> coro::AsyncGenerator<AggregationBatch> {
-        AggregationBatch batch;
-        batch.batch_type = type;
-        batch.total_events_processed = resolved.total_events_processed;
-        batch.total_files_processed = resolved.total_files_processed;
-        batch.total_bytes_processed = resolved.total_bytes_processed;
-        for (auto& [key, metrics] : map) {
-            batch.entries.emplace_back(std::move(key), std::move(metrics));
-            if (batch.entries.size() >= batch_sz) {
-                co_yield std::move(batch);
-                batch = AggregationBatch{};
-                batch.batch_type = type;
-                batch.total_events_processed = resolved.total_events_processed;
-                batch.total_files_processed = resolved.total_files_processed;
-                batch.total_bytes_processed = resolved.total_bytes_processed;
-            }
+
+    const std::size_t total_events = merger->total_events();
+    const std::size_t total_files = merger->total_files();
+
+    auto make_batch = [&](AggregationBatchType type) {
+        AggregationBatch b;
+        b.batch_type = type;
+        b.total_events_processed = total_events;
+        b.total_files_processed = total_files;
+        b.global_extra_key_ids = global_extra_key_ids.get();
+        b.global_custom_metric_names = global_custom_metric_names.get();
+        return b;
+    };
+
+    // Collect entries grouped by type (scan callback is synchronous)
+    std::vector<AggregationEntry> event_entries;
+    std::vector<AggregationEntry> profile_entries;
+    std::vector<AggregationEntry> system_entries;
+
+    std::size_t total_keys = 0;
+    merger->scan([&](AggMapType map_type, const AggregationKey& key,
+                     AggregationMetrics& metrics) {
+        total_keys++;
+        switch (map_type) {
+            case AggMapType::EVENT:
+                event_entries.emplace_back(key, std::move(metrics));
+                break;
+            case AggMapType::PROFILE:
+                profile_entries.emplace_back(key, std::move(metrics));
+                break;
+            case AggMapType::SYSTEM:
+                system_entries.emplace_back(key, std::move(metrics));
+                break;
         }
-        if (!batch.entries.empty()) {
-            co_yield std::move(batch);
+        return true;
+    });
+
+    // Setup augmentation if needed
+    std::optional<AugmentationConfig> aug_config;
+    if (scan_result.needs_augmentation) {
+        aug_config = AugmentationConfig{scan_result.stored_time_interval_us,
+                                        input.config.time_interval_us};
+        DFTRACER_UTILS_LOG_INFO("Augmenting time interval: %lu us -> %lu us",
+                                scan_result.stored_time_interval_us,
+                                input.config.time_interval_us);
+    }
+
+    auto yield_batch = [&](AggregationBatch batch) -> AggregationBatch {
+        if (aug_config) {
+            return augment_batch(batch, *aug_config);
         }
+        return batch;
     };
 
-    // Events
-    auto event_gen = yield_map(resolver_output.aggregations.aggregations,
-                               AggregationBatchType::EVENT);
-    while (auto b = co_await event_gen.next()) co_yield std::move(*b);
-
-    // Profiles
-    auto profile_gen =
-        yield_map(resolver_output.aggregations.profile_aggregations,
-                  AggregationBatchType::PROFILE);
-    while (auto b = co_await profile_gen.next()) co_yield std::move(*b);
-
-    // System
-    auto system_gen =
-        yield_map(resolver_output.aggregations.system_aggregations,
-                  AggregationBatchType::SYSTEM);
-    while (auto b = co_await system_gen.next()) co_yield std::move(*b);
-
-    // Clean up the temporary index directory if we created it.
-    if (!temp_index_dir.empty()) {
-        std::error_code ec;
-        fs::remove_all(temp_index_dir, ec);
+    // Yield event batches
+    for (std::size_t i = 0; i < event_entries.size(); i += batch_sz) {
+        AggregationBatch batch = make_batch(AggregationBatchType::EVENT);
+        std::size_t end = std::min(i + batch_sz, event_entries.size());
+        for (std::size_t j = i; j < end; ++j) {
+            batch.entries.push_back(std::move(event_entries[j]));
+        }
+        co_yield yield_batch(std::move(batch));
+    }
+
+    // Yield profile batches
+    for (std::size_t i = 0; i < profile_entries.size(); i += batch_sz) {
+        AggregationBatch batch = make_batch(AggregationBatchType::PROFILE);
+        std::size_t end = std::min(i + batch_sz, profile_entries.size());
+        for (std::size_t j = i; j < end; ++j) {
+            batch.entries.push_back(std::move(profile_entries[j]));
+        }
+        co_yield yield_batch(std::move(batch));
     }
+
+    // Yield system batches
+    for (std::size_t i = 0; i < system_entries.size(); i += batch_sz) {
+        AggregationBatch batch = make_batch(AggregationBatchType::SYSTEM);
+        std::size_t end = std::min(i + batch_sz, system_entries.size());
+        for (std::size_t j = i; j < end; ++j) {
+            batch.entries.push_back(std::move(system_entries[j]));
+        }
+        co_yield yield_batch(std::move(batch));
+    }
+
+    DFTRACER_UTILS_LOG_INFO("Aggregation complete: %zu keys", total_keys);
 }
 
 }  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp
index c9fc8f00..5df75775 100644
--- a/src/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/association_resolver_utility.cpp
@@ -109,7 +109,7 @@ coro::CoroTask<AssociationResolverOutput> AssociationResolverUtility::process(
 
 void AssociationResolverUtility::compute_trace_metadata(
     const AssociationTracker& tracker,
-    const EventAggregatorUtilityOutput& /*aggregations*/,
+    const EventAggregatorOutput& /*aggregations*/,
     AssociationResolverOutput& output) {
     const auto& intervals = tracker.get_all_intervals();
 
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp
index 67ac119a..eab0cc9a 100644
--- a/src/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/association_tracker.cpp
@@ -1,15 +1,15 @@
+#include <dftracer/utils/core/rocksdb/key_codec.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h>
 
 #include <algorithm>
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
-void AssociationTracker::extract_from_event(const JsonValue& json,
-                                            const JsonValue& args,
+void AssociationTracker::extract_from_event(std::string_view name,
+                                            std::uint64_t pid, std::uint64_t ts,
+                                            std::uint64_t dur,
+                                            const ArgsMap& args,
                                             const AggregationConfig& config) {
-    std::string_view name = json["name"].get<std::string_view>();
-    std::uint64_t pid = json["pid"].get<std::uint64_t>();
-
     if (config.track_process_parents && pid > 0) {
         all_pids_.insert(pid);
     }
@@ -38,9 +38,6 @@ void AssociationTracker::extract_from_event(const JsonValue& json,
                 final_value = std::to_string(counter);
             }
 
-            std::uint64_t ts = json["ts"].get<std::uint64_t>();
-            std::uint64_t dur = json["dur"].get<std::uint64_t>();
-
             BoundaryInterval interval;
             interval.name = boundary_config.output_name;
             interval.value = final_value;
@@ -132,4 +129,116 @@ void AssociationTracker::merge(const AssociationTracker& other) {
     }
 }
 
+namespace {
+namespace rocks = dftracer::utils::rocksdb;
+// Blob helpers. The read_* functions advance `p` and do no bounds checks.
+void put_be64(std::string& out, std::uint64_t v) {
+    rocks::KeyCodec::append_be64(out, v);
+}
+void put_be32(std::string& out, std::uint32_t v) {
+    rocks::KeyCodec::append_be32(out, v);
+}
+void put_str(std::string& out, const std::string& s) {
+    put_be32(out, static_cast<std::uint32_t>(s.size()));  // length prefix
+    out.append(s.data(), s.size());
+}
+std::uint64_t read_be64(const char*& p) {
+    const std::string_view field(p, 8);
+    p += field.size();
+    return rocks::KeyCodec::decode_be64(field);
+}
+std::uint32_t read_be32(const char*& p) {
+    const std::string_view field(p, 4);
+    p += field.size();
+    return rocks::KeyCodec::decode_be32(field);
+}
+std::string read_str(const char*& p) {
+    const std::size_t length = read_be32(p);
+    const char* const first = p;
+    p += length;
+    return std::string(first, length);
+}
+}  // namespace
+
+std::string AssociationTracker::serialize() const {
+    // Emits four sections in order: pids, (child, parent) pairs, all
+    // intervals, intervals per pid. Each section starts with a be32 element
+    // count; strings carry a be32 length prefix.
+    std::string blob;
+    blob.reserve(4096);
+    const auto put_count = [&blob](std::size_t n) {
+        put_be32(blob, static_cast<std::uint32_t>(n));
+    };
+    const auto put_interval = [&blob](const auto& interval) {
+        put_str(blob, interval.name);
+        put_str(blob, interval.value);
+        put_be64(blob, interval.start_ts);
+        put_be64(blob, interval.end_ts);
+    };
+    put_count(all_pids_.size());
+    for (const auto pid : all_pids_) put_be64(blob, pid);
+
+    put_count(process_parents_.size());
+    for (const auto& entry : process_parents_) {
+        put_be64(blob, entry.first);
+        put_be64(blob, entry.second);
+    }
+
+    put_count(all_intervals_.size());
+    for (const auto& interval : all_intervals_) put_interval(interval);
+
+    put_count(process_intervals_.size());
+    for (const auto& entry : process_intervals_) {
+        put_be64(blob, entry.first);
+        put_count(entry.second.size());
+        for (const auto& interval : entry.second) put_interval(interval);
+    }
+    return blob;
+}
+
+AssociationTracker AssociationTracker::deserialize(std::string_view data) {
+    // Bounds-checked decode of a serialize() blob. On truncated or corrupt
+    // input, decoding stops and the entries parsed so far are returned.
+    AssociationTracker t;
+    const char* p = data.data();
+    const char* const end = p + data.size();
+    bool ok = true;  // cleared by the first read that would overrun `data`
+    auto has = [&](std::uint64_t n) {
+        return ok = ok && static_cast<std::uint64_t>(end - p) >= n;
+    };
+    auto u32 = [&] { return has(4) ? read_be32(p) : std::uint32_t{0}; };
+    auto u64 = [&] { return has(8) ? read_be64(p) : std::uint64_t{0}; };
+    auto str = [&] {
+        const char* peek = p;  // validate the length before read_str() runs
+        const std::uint64_t len = has(4) ? read_be32(peek) : 0;
+        return has(4 + len) ? read_str(p) : std::string();
+    };
+    // be32 count + that many intervals -> dst (untrusted count: no reserve).
+    auto intervals = [&](auto& dst) {
+        for (auto n = u32(); ok && n > 0; --n) {
+            BoundaryInterval iv;
+            iv.name = str();
+            iv.value = str();
+            iv.start_ts = u64();
+            iv.end_ts = u64();
+            if (ok) dst.push_back(std::move(iv));
+        }
+    };
+    for (auto n = u32(); ok && n > 0; --n) {
+        const auto pid = u64();
+        if (ok) t.all_pids_.insert(pid);
+    }
+    for (auto n = u32(); ok && n > 0; --n) {
+        const auto child = u64();
+        const auto parent = u64();
+        if (ok) t.process_parents_[child] = parent;
+    }
+    intervals(t.all_intervals_);
+    for (auto n = u32(); ok && n > 0; --n) {
+        const auto pid = u64();
+        if (ok) intervals(t.process_intervals_[pid]);
+    }
+    return t;
+}
+
 }  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp
index 4618338d..9ba717cf 100644
--- a/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.cpp
@@ -1,190 +1,15 @@
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/common/string_intern.h>
-#include <dftracer/utils/utilities/common/json/json.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_logic.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/chunk_aggregator_utility.h>
 #include <dftracer/utils/utilities/composites/dft/event.h>
-#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
-#include <dftracer/utils/utilities/composites/indexed_file_reader_utility.h>
-#include <dftracer/utils/utilities/composites/types.h>
-#include <dftracer/utils/utilities/reader/internal/stream_config.h>
 #include <dftracer/utils/utilities/reader/trace_reader.h>
-#include <yyjson.h>
 
-#include <chrono>
-#include <cstring>
 #include <string_view>
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
 using dftracer::utils::utilities::composites::dft::DFTracerEvent;
 
-namespace {
-
-void apply_preaggregated_metric(MetricStats& stats, std::uint64_t count,
-                                const JsonValue& sum_val,
-                                const JsonValue& min_val,
-                                const JsonValue& max_val) {
-    if (!sum_val.exists()) return;
-
-    const auto total = sum_val.get<std::uint64_t>();
-    stats.total += total;
-    if (min_val.exists()) {
-        stats.min = std::min(stats.min, min_val.get<std::uint64_t>());
-    }
-    if (max_val.exists()) {
-        stats.max = std::max(stats.max, max_val.get<std::uint64_t>());
-    }
-
-    if (count > 0) {
-        stats.mean =
-            static_cast<double>(stats.total) / static_cast<double>(count);
-        // Counter rows carry pre-aggregated totals/min/max. Higher moments
-        // are not available from the source trace, so stddev/skew/kurtosis
-        // remain 0 unless enough information is accumulated elsewhere.
-        stats.m2 = 0.0;
-        stats.m3 = 0.0;
-        stats.m4 = 0.0;
-    }
-}
-
-}  // namespace
-
-std::uint64_t ChunkAggregatorUtility::compute_time_bucket(
-    std::uint64_t timestamp, std::uint64_t duration,
-    const AggregationConfig& config) const {
-    std::uint64_t midpoint = timestamp + (duration / 2);
-
-    if (config.use_relative_time) {
-        midpoint -= config.reference_timestamp;
-    }
-    if (config.time_interval_us == 0) return midpoint;
-    return (midpoint / config.time_interval_us) * config.time_interval_us;
-}
-
-AggregationKey ChunkAggregatorUtility::build_key(
-    const DFTracerEvent& ev, const AggregationConfig& config) const {
-    auto& intern = aggregation_intern();
-
-    AggregationKey key;
-    key.cat_id = intern.get_or_insert(ev.cat);
-    key.name_id = intern.get_or_insert(ev.name);
-    key.pid = ev.pid;
-    key.tid = ev.tid;
-
-    auto hhash_sv = ev.args["hhash"].get<std::string_view>();
-    if (!hhash_sv.empty()) {
-        key.hhash_id = intern.get_or_insert(hhash_sv);
-    }
-    auto fhash_sv = ev.args["fhash"].get<std::string_view>();
-    if (!fhash_sv.empty()) {
-        key.fhash_id = intern.get_or_insert(fhash_sv);
-    }
-
-    key.time_bucket = compute_time_bucket(ev.ts, ev.dur, config);
-
-    if (!config.extra_group_keys.empty()) {
-        key.extra_keys = std::make_unique<
-            std::vector<std::pair<std::uint32_t, std::uint32_t>>>();
-        for (const auto& extra_key : config.extra_group_keys) {
-            std::string_view value = ev.args[extra_key].get<std::string_view>();
-            if (!value.empty()) {
-                key.extra_keys->emplace_back(intern.get_or_insert(extra_key),
-                                             intern.get_or_insert(value));
-            }
-        }
-    }
-
-    return key;
-}
-
-void ChunkAggregatorUtility::update_entry(const DFTracerEvent& ev,
-                                          const AggregationConfig& config,
-                                          AggregationMap& aggregations,
-                                          const AggregationKey& key) {
-    auto it = aggregations.find(key);
-    if (it == aggregations.end()) {
-        it = aggregations
-                 .emplace(key, AggregationMetrics(config.sketch_accuracy))
-                 .first;
-    }
-    auto& metrics = it->second;
-
-    if (ev.is_counter()) {
-        // Profile/system events carry pre-aggregated data in args.
-        // Use args.count for the event count, args.dur_sum for total duration,
-        // etc.
-        JsonValue a_count = ev.args["dft_cnt"];
-        if (!a_count.exists()) a_count = ev.args["count"];
-        std::uint64_t ev_count =
-            a_count.exists() ? a_count.get<std::uint64_t>() : 1;
-        metrics.count += ev_count;
-
-        JsonValue a_dur = ev.args["dur_sum"];
-        if (!a_dur.exists()) a_dur = ev.args["dur"];
-        JsonValue a_dur_min = ev.args["dur_min"];
-        if (!a_dur_min.exists()) a_dur_min = ev.args["dur"];
-        JsonValue a_dur_max = ev.args["dur_max"];
-        if (!a_dur_max.exists()) a_dur_max = ev.args["dur"];
-        apply_preaggregated_metric(metrics.duration, metrics.count, a_dur,
-                                   a_dur_min, a_dur_max);
-
-        JsonValue a_size_sum = ev.args["ret_sum"];
-        if (!a_size_sum.exists()) a_size_sum = ev.args["ret"];
-        JsonValue a_size_min = ev.args["ret_min"];
-        if (!a_size_min.exists()) a_size_min = ev.args["ret"];
-        JsonValue a_size_max = ev.args["ret_max"];
-        if (!a_size_max.exists()) a_size_max = ev.args["ret"];
-        apply_preaggregated_metric(metrics.size, metrics.count, a_size_sum,
-                                   a_size_min, a_size_max);
-
-        metrics.update_timestamp(ev.ts, config.time_interval_us);
-    } else {
-        // Regular events: count += 1, use event's own dur/size.
-        metrics.update_duration(ev.dur, config.compute_percentiles);
-        metrics.update_timestamp(ev.ts, ev.dur);
-
-        JsonValue ret = ev.args["ret"];
-        if (ret.exists() &&
-            internal::is_data_transfer_op(key.cat(), key.name())) {
-            std::uint64_t size = ret.get<std::uint64_t>();
-            metrics.update_size(size, config.compute_percentiles);
-        }
-    }
-
-    if (!config.custom_metric_fields.empty()) {
-        for (const auto& field : config.custom_metric_fields) {
-            if (ev.is_counter()) {
-                // Profile/system: read pre-aggregated field_sum/min/max
-                std::string sum_key = field + "_sum";
-                JsonValue a_sum = ev.args[sum_key];
-                if (!a_sum.exists()) a_sum = ev.args[field];
-                std::string min_key = field + "_min";
-                JsonValue a_min = ev.args[min_key];
-                if (!a_min.exists()) a_min = ev.args[field];
-                std::string max_key = field + "_max";
-                JsonValue a_max = ev.args[max_key];
-                if (!a_max.exists()) a_max = ev.args[field];
-                if (a_sum.exists() || a_min.exists() || a_max.exists()) {
-                    if (!metrics.custom_metrics) {
-                        metrics.custom_metrics =
-                            std::make_unique<CustomMetricsMap>();
-                    }
-                    auto& ms = (*metrics.custom_metrics)[field];
-                    apply_preaggregated_metric(ms, metrics.count, a_sum, a_min,
-                                               a_max);
-                }
-            } else {
-                JsonValue field_val = ev.args[field];
-                if (field_val.exists()) {
-                    std::uint64_t value = field_val.get<std::uint64_t>();
-                    metrics.update_custom_metric(field, value,
-                                                 config.compute_percentiles);
-                }
-            }
-        }
-    }
-}
-
 coro::CoroTask<ChunkAggregationOutput> ChunkAggregatorUtility::process(
     const ChunkAggregatorInput& input) {
     ChunkAggregationOutput output;
@@ -217,8 +42,11 @@ coro::CoroTask<ChunkAggregationOutput> ChunkAggregatorUtility::process(
     rc.start_byte = input.start_byte;
     rc.end_byte = input.end_byte;
     rc.buffer_size = input.batch_size;
+    if (input.query) {
+        rc.query = input.query->source();
+    }
 
-    auto line_gen = trace_reader.read_lines(rc);
+    auto json_gen = trace_reader.read_json(rc);
 
     AggregationMap local_aggregations;
     AggregationMap local_profiles;
@@ -230,51 +58,24 @@ coro::CoroTask<ChunkAggregationOutput> ChunkAggregatorUtility::process(
         local_tracker = std::make_shared<AssociationTracker>();
     }
 
-    char yy_buf[common::json::YYJSON_LINE_POOL_SIZE];
-    yyjson_alc yy_alc;
-    yyjson_alc_pool_init(&yy_alc, yy_buf, sizeof(yy_buf));
-
-    while (auto opt = co_await line_gen.next()) {
-        const char* line_start = opt->content.data();
-        std::size_t line_len = opt->content.size();
-        if (line_len == 0) continue;
+    while (auto opt = co_await json_gen.next()) {
+        DFTracerEvent ev;
+        if (!DFTracerEvent::parse_ondemand(*opt->parser, ev)) continue;
+        if (ev.is_metadata()) continue;
 
-        yyjson_doc* doc =
-            yyjson_read_opts(const_cast<char*>(line_start), line_len,
-                             YYJSON_READ_NOFLAG, &yy_alc, nullptr);
-        if (!doc) continue;
-
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        if (root && yyjson_is_obj(root)) {
-            bool pass = true;
-            if (input.query) {
-                JsonValue json(root);
-                std::string_view ph = json["ph"].get<std::string_view>();
-                if (ph != "M") {
-                    pass = input.query->evaluate(json);
-                }
-            }
-            if (pass) {
-                DFTracerEvent ev;
-                if (DFTracerEvent::parse(root, ev) && !ev.is_metadata()) {
-                    if (local_tracker) {
-                        JsonValue json(root);
-                        local_tracker->extract_from_event(json, ev.args,
-                                                          input.config);
-                    }
-                    auto key = build_key(ev, input.config);
-                    if (ev.is_system()) {
-                        update_entry(ev, input.config, local_system, key);
-                    } else if (ev.is_profile()) {
-                        update_entry(ev, input.config, local_profiles, key);
-                    } else {
-                        update_entry(ev, input.config, local_aggregations, key);
-                    }
-                    output.events_processed++;
-                }
-            }
+        if (local_tracker) {
+            local_tracker->extract_from_event(ev.name, ev.pid, ev.ts, ev.dur,
+                                              ev.args, input.config);
+        }
+        auto key = build_aggregation_key(ev, input.config);
+        if (ev.is_system()) {
+            update_aggregation_entry(ev, input.config, local_system, key);
+        } else if (ev.is_profile()) {
+            update_aggregation_entry(ev, input.config, local_profiles, key);
+        } else {
+            update_aggregation_entry(ev, input.config, local_aggregations, key);
         }
-        yyjson_doc_free(doc);
+        output.events_processed++;
     }
 
     if (local_tracker) {
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp
new file mode 100644
index 00000000..a840a2a0
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.cpp
@@ -0,0 +1,468 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/core/rocksdb/db_manager.h>
+#include <dftracer/utils/core/rocksdb/key_codec.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h>
+#include <rocksdb/table.h>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+namespace rcf = dftracer::utils::rocksdb::cf;
+namespace rocks = dftracer::utils::rocksdb;
+
+static constexpr std::string_view TIME_BOUNDS_DB_KEY = "__time_bounds__";
+
+EventAggregator::EventAggregator() : rocksdb_mode_(false) {}  // in-memory
+
+EventAggregator::EventAggregator(std::shared_ptr<rocksdb::RocksDatabase> db,
+                                 std::uint32_t config_hash)
+    : rocksdb_mode_(true), db_(std::move(db)), config_hash_(config_hash) {
+    load_intern_dictionary(*db_);  // dereferences db_: db must be non-null
+}
+
+void EventAggregator::merge_chunk(ChunkAggregationOutput&& chunk_output) {
+    // Route the chunk to the backend chosen at construction time.
+    if (!rocksdb_mode_) {
+        return merge_chunk_memory(std::move(chunk_output));
+    }
+    merge_chunk_rocksdb(std::move(chunk_output));
+}
+
+void EventAggregator::merge_chunk_memory(
+    ChunkAggregationOutput&& chunk_output) {
+    // Folds one successful chunk into state_; failed chunks are ignored.
+    if (!chunk_output.success) return;
+    state_.total_events_processed += chunk_output.events_processed;
+    state_.total_bytes_processed += chunk_output.bytes_processed;
+    unique_files_.insert(chunk_output.file_path);
+    auto merge_into = [](AggregationMap& dst, AggregationMap& src) {
+        for (auto& [key, metrics] : src) {
+            if (auto it = dst.find(key); it != dst.end()) {
+                it->second.merge_from(metrics);
+            } else {
+                dst.emplace(key, std::move(metrics));  // new key: take it
+            }
+        }
+    };
+    merge_into(state_.aggregations, chunk_output.aggregations);
+    merge_into(state_.profile_aggregations, chunk_output.profile_aggregations);
+    merge_into(state_.system_aggregations, chunk_output.system_aggregations);
+    if (chunk_output.local_tracker) {
+        state_.trackers.push_back(std::move(chunk_output.local_tracker));
+    }
+    // Skip inverted ranges: an event-less chunk presumably keeps its initial
+    // min > max sentinels (TODO confirm), which would stretch the bounds.
+    if (chunk_output.min_time_bucket <= chunk_output.max_time_bucket) {
+        update_time_bounds(chunk_output.min_time_bucket);
+        update_time_bounds(chunk_output.max_time_bucket);
+    }
+}
+
+void EventAggregator::merge_chunk_rocksdb(
+    ChunkAggregationOutput&& chunk_output) {
+    // Books a successful chunk's counters, bounds, file and tracker.
+    if (!chunk_output.success) return;
+    total_events_ += chunk_output.events_processed;
+    total_bytes_ += chunk_output.bytes_processed;
+    // Skip inverted ranges: an event-less chunk presumably keeps its initial
+    // min > max sentinels (TODO confirm), which would stretch the bounds.
+    if (chunk_output.min_time_bucket <= chunk_output.max_time_bucket) {
+        update_time_bounds(chunk_output.min_time_bucket);
+        update_time_bounds(chunk_output.max_time_bucket);
+    }
+    unique_files_.insert(std::move(chunk_output.file_path));
+    if (auto& tracker = chunk_output.local_tracker)
+        trackers_.push_back(std::move(tracker));
+}
+
+void EventAggregator::add_observed_extra_key(const std::string& key) {
+    // Extra keys are remembered by interned id rather than by name.
+    observed_extra_key_ids_.insert(aggregation_intern().get_or_insert(key));
+}
+
+void EventAggregator::add_observed_custom_metric(const std::string& name) {
+    observed_custom_metric_names_.emplace(name);  // set: duplicates ignored
+}
+
+EventAggregatorOutput EventAggregator::finalize() {
+    // RocksDB mode rebuilds the result maps from the database via scan() and
+    // persists the time bounds; in-memory mode hands over state_ directly.
+    if (rocksdb_mode_) {
+        EventAggregatorOutput result;
+        result.success = true;
+        result.total_events_processed = total_events_.load();
+        result.total_bytes_processed = total_bytes_.load();
+        result.total_files_processed = unique_files_.size();
+        result.trackers = std::move(trackers_);
+
+        // Every stored row lands in the map that matches its type tag;
+        // anything that is neither PROFILE nor SYSTEM counts as an event.
+        scan([&result](AggMapType map_type, const AggregationKey& key,
+                       AggregationMetrics& metrics) {
+            AggregationMap* target = &result.aggregations;
+            if (map_type == AggMapType::PROFILE) {
+                target = &result.profile_aggregations;
+            } else if (map_type == AggMapType::SYSTEM) {
+                target = &result.system_aggregations;
+            }
+            target->emplace(key, std::move(metrics));
+            return true;  // never stop early
+        });
+
+        // Store the bounds as be64(min) + be64(max), but only when they
+        // describe a usable range.
+        const auto lo = min_time_bucket_.load(std::memory_order_relaxed);
+        const auto hi = max_time_bucket_.load(std::memory_order_relaxed);
+        const bool usable = lo != UINT64_MAX && hi != 0 && lo <= hi;
+        if (usable) {
+            std::string encoded = rocks::KeyCodec::encode_be64(lo);
+            encoded += rocks::KeyCodec::encode_be64(hi);
+            db_->put(TIME_BOUNDS_DB_KEY, encoded, rcf::AGGREGATION);
+        }
+
+        DFTRACER_UTILS_LOG_INFO(
+            "Aggregation complete: %zu unique keys, %zu total events, %zu "
+            "files",
+            result.aggregations.size(), result.total_events_processed,
+            result.total_files_processed);
+        return result;
+    }
+
+    state_.total_files_processed = unique_files_.size();
+    state_.success = true;
+
+    DFTRACER_UTILS_LOG_INFO(
+        "Aggregation complete: %zu unique keys, %zu total events, %zu files",
+        state_.aggregations.size(), state_.total_events_processed,
+        state_.total_files_processed);
+
+    return std::move(state_);
+}
+
+std::size_t EventAggregator::scan(ScanCallback callback) const {
+    // RocksDB mode walks every shard. In-memory mode walks the event, profile
+    // and system maps in that order until the callback returns false.
+    if (rocksdb_mode_) return scan_shard_range(0, AGG_KEY_NUM_SHARDS, callback);
+
+    std::size_t visited = 0;
+    // Returns false once the callback asked to stop.
+    auto visit_all = [&](const AggregationMap& entries, AggMapType kind) {
+        for (const auto& [key, metrics] : entries) {
+            ++visited;
+            auto& writable = const_cast<AggregationMetrics&>(metrics);
+            if (!callback(kind, key, writable)) return false;
+        }
+        return true;
+    };
+    if (visit_all(state_.aggregations, AggMapType::EVENT) &&
+        visit_all(state_.profile_aggregations, AggMapType::PROFILE)) {
+        visit_all(state_.system_aggregations, AggMapType::SYSTEM);
+    }
+    return visited;
+}
+
+std::size_t EventAggregator::scan_shard_range_raw_fn(std::uint16_t shard_begin,
+                                                     std::uint16_t shard_end,
+                                                     RawScanCallbackFn fn,
+                                                     void* ctx) const {
+    // Passes the undecoded key/value bytes of each row in shards
+    // [shard_begin, shard_end) to fn, stopping early when fn returns false.
+    if (!rocksdb_mode_ || !db_) return 0;
+
+    // Keys open with the shard id as two big-endian bytes: seek to it.
+    const char seek_key[2] = {static_cast<char>(shard_begin >> 8),
+                              static_cast<char>(shard_begin)};
+    std::size_t visited = 0;
+    auto iter = db_->new_iterator(rcf::AGGREGATION);
+    for (iter->Seek({seek_key, 2}); iter->Valid(); iter->Next()) {
+        const auto key = iter->key();
+        if (key.size() < 3) continue;
+        const std::uint16_t shard = (static_cast<std::uint8_t>(key[0]) << 8) |
+                                    static_cast<std::uint8_t>(key[1]);
+        if (shard >= AGG_KEY_NUM_SHARDS || shard >= shard_end) break;
+
+        const auto value = iter->value();
+        ++visited;
+        const std::string_view key_bytes(key.data(), key.size());
+        const std::string_view value_bytes(value.data(), value.size());
+        if (!fn(ctx, key_bytes, value_bytes)) break;
+    }
+    return visited;
+}
+
+std::size_t EventAggregator::scan_shard_range(std::uint16_t shard_begin,
+                                              std::uint16_t shard_end,
+                                              ScanCallback callback) const {
+    // Decodes each row in shards [shard_begin, shard_end) and passes it to
+    // callback, stopping early when callback returns false. Returns the number
+    // of rows decoded; keys shorter than 3 bytes are skipped.
+    if (!rocksdb_mode_ || !db_) return 0;
+
+    // Keys open with the shard id as two big-endian bytes: seek to it.
+    const char seek_key[2] = {static_cast<char>(shard_begin >> 8),
+                              static_cast<char>(shard_begin)};
+    std::size_t visited = 0;
+    auto iter = db_->new_iterator(rcf::AGGREGATION);
+    for (iter->Seek({seek_key, 2}); iter->Valid(); iter->Next()) {
+        const auto key = iter->key();
+        if (key.size() < 3) continue;
+        const std::uint16_t shard = (static_cast<std::uint8_t>(key[0]) << 8) |
+                                    static_cast<std::uint8_t>(key[1]);
+        if (shard >= AGG_KEY_NUM_SHARDS || shard >= shard_end) break;
+
+        const auto value = iter->value();
+        const std::string_view key_bytes(key.data(), key.size());
+        const std::string_view value_bytes(value.data(), value.size());
+        auto decoded = deserialize_agg_key(key_bytes);
+        auto metrics = deserialize_agg_value(value_bytes);
+        ++visited;
+        if (!callback(decoded.map_type, decoded.key, metrics)) break;
+    }
+    return visited;
+}
+
+namespace {
+
+// Encodes the observed columns as two sections (extra keys, then custom
+// metrics); each is a be32 count followed by be32-length-prefixed names.
+// Extra-key ids are written as the strings they resolve to.
+std::string serialize_observed_columns(
+    const std::set<std::uint32_t>& extra_key_ids,
+    const std::set<std::string>& custom_metric_names) {
+    namespace rocks = dftracer::utils::rocksdb;
+    auto& intern = aggregation_intern();
+    std::string out;
+    auto put_str = [&](std::string_view s) {
+        rocks::KeyCodec::append_be32(out, static_cast<std::uint32_t>(s.size()));
+        out.append(s.data(), s.size());
+    };
+    rocks::KeyCodec::append_be32(
+        out, static_cast<std::uint32_t>(extra_key_ids.size()));
+    for (auto id : extra_key_ids) put_str(intern.resolve(id));
+    rocks::KeyCodec::append_be32(
+        out, static_cast<std::uint32_t>(custom_metric_names.size()));
+    for (const auto& name : custom_metric_names) put_str(name);
+    return out;
+}
+
+void deserialize_observed_columns(std::string_view data,
+                                  std::set<std::uint32_t>& extra_key_ids,
+                                  std::set<std::string>& custom_metric_names) {
+    // Inverse of serialize_observed_columns(); decoded names are added to the
+    // sets. Decoding stops at the first field that does not fit in `data`.
+    namespace rocks = dftracer::utils::rocksdb;
+    auto& intern = aggregation_intern();
+    std::size_t off = 0;
+    bool ok = true;  // cleared by the first out-of-bounds read
+    auto has = [&](std::size_t n) { return ok = ok && data.size() - off >= n; };
+    auto read_u32 = [&]() -> std::uint32_t {
+        if (!has(4)) return 0;
+        auto v = rocks::KeyCodec::decode_be32(data.substr(off, 4));
+        off += 4;
+        return v;
+    };
+    auto read_str = [&]() -> std::string_view {
+        auto len = read_u32();
+        if (!has(len)) return {};
+        auto sv = data.substr(off, len);
+        off += len;
+        return sv;
+    };
+    for (auto n = read_u32(); ok && n > 0; --n) {
+        auto sv = read_str();
+        if (!sv.empty()) extra_key_ids.insert(intern.get_or_insert(sv));
+    }
+    for (auto n = read_u32(); ok && n > 0; --n) {
+        auto sv = read_str();
+        if (!sv.empty()) custom_metric_names.emplace(sv);
+    }
+}
+
+}  // namespace
+
+static constexpr std::string_view COLUMNS_DB_KEY = "__observed_columns__";
+// Returns the observed columns. With a database attached, the stored record
+// is merged into the in-memory sets first and the union is written back.
+EventAggregator::ObservedColumns EventAggregator::observed_columns() {
+    if (rocksdb_mode_ && db_) {
+        std::string stored;
+        auto status = db_->get(COLUMNS_DB_KEY, &stored, rcf::AGGREGATION);
+        if (status.ok() && !stored.empty()) {
+            deserialize_observed_columns(stored, observed_extra_key_ids_,
+                                         observed_custom_metric_names_);
+        }
+        db_->put(COLUMNS_DB_KEY,
+                 serialize_observed_columns(observed_extra_key_ids_,
+                                            observed_custom_metric_names_),
+                 rcf::AGGREGATION);
+    }
+    ObservedColumns columns;
+    columns.extra_key_ids.assign(observed_extra_key_ids_.begin(),
+                                 observed_extra_key_ids_.end());
+    columns.custom_metric_names.assign(observed_custom_metric_names_.begin(),
+                                       observed_custom_metric_names_.end());
+    return columns;
+}
+
+std::vector<std::shared_ptr<AssociationTracker>>
+EventAggregator::take_trackers() {
+    return std::move(trackers_);  // moves the member out to the caller
+}
+
+static constexpr std::string_view TRACKER_DB_KEY = "__tracker__";
+// Merges the collected per-chunk trackers and, with a database attached, the
+// previously stored tracker; finalizes the result and stores it back.
+std::unique_ptr<AssociationTracker> EventAggregator::build_global_tracker() {
+    const bool persistent = rocksdb_mode_ && db_;
+    auto merged = std::make_unique<AssociationTracker>();
+    for (const auto& chunk_tracker : trackers_) {
+        if (!chunk_tracker) continue;
+        merged->merge(*chunk_tracker);
+    }
+    trackers_.clear();
+
+    if (persistent) {
+        std::string stored;
+        auto status = db_->get(TRACKER_DB_KEY, &stored, rcf::AGGREGATION);
+        if (status.ok() && !stored.empty()) {
+            merged->merge(AssociationTracker::deserialize(stored));
+        }
+    }
+
+    merged->finalize();
+    if (persistent) {
+        db_->put(TRACKER_DB_KEY, merged->serialize(), rcf::AGGREGATION);
+    }
+    return merged;
+}
+
+std::shared_ptr<rocksdb::RocksDatabase>
+EventAggregator::open_with_merge_operator(const std::string& index_path) {
+    // Opens index_path read-write with merge operators and per-CF options.
+    auto agg_merge_op = std::make_shared<AggregationMergeOperator>();
+    auto sys_merge_op = std::make_shared<SystemMetricsMergeOperator>();
+    auto cf_override = [agg_merge_op, sys_merge_op](
+                           const std::string& cf_name,
+                           ::rocksdb::ColumnFamilyOptions& opts) {
+        if (cf_name == rcf::AGGREGATION) {
+            opts.merge_operator = agg_merge_op;
+            // Block-based table options, then compaction settings.
+            ::rocksdb::BlockBasedTableOptions bbt;
+            bbt.block_size = 32 * 1024;
+            bbt.format_version = 5;
+            bbt.index_block_restart_interval = 16;
+            bbt.whole_key_filtering = false;
+            opts.table_factory.reset(::rocksdb::NewBlockBasedTableFactory(bbt));
+            opts.level0_file_num_compaction_trigger = 2;
+            opts.max_bytes_for_level_multiplier = 20;
+            // Compression preference: ZSTD, then LZ4, then zlib (build flags).
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+            opts.compression = ::rocksdb::kZSTD;
+            opts.compression_opts.level = 9;
+            opts.compression_opts.max_dict_bytes = 262144;
+            opts.compression_opts.zstd_max_train_bytes = 1048576;
+            opts.compression_opts.enabled = true;
+            opts.bottommost_compression = ::rocksdb::kZSTD;
+            opts.bottommost_compression_opts.level = 9;
+            opts.bottommost_compression_opts.max_dict_bytes = 262144;
+            opts.bottommost_compression_opts.zstd_max_train_bytes = 1048576;
+            opts.bottommost_compression_opts.enabled = true;
+#elif defined(DFTRACER_UTILS_ENABLE_LZ4)
+            opts.compression = ::rocksdb::kLZ4Compression;
+            opts.bottommost_compression = ::rocksdb::kLZ4Compression;
+#else
+            opts.compression = ::rocksdb::kZlibCompression;
+            opts.bottommost_compression = ::rocksdb::kZlibCompression;
+#endif
+        } else if (cf_name == rcf::SYSTEM_METRICS) {
+            opts.merge_operator = sys_merge_op;
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+            opts.compression = ::rocksdb::kZSTD;
+            opts.bottommost_compression = ::rocksdb::kZSTD;
+#elif defined(DFTRACER_UTILS_ENABLE_LZ4)
+            opts.compression = ::rocksdb::kLZ4Compression;
+            opts.bottommost_compression = ::rocksdb::kLZ4Compression;
+#else
+            opts.compression = ::rocksdb::kZlibCompression;
+            opts.bottommost_compression = ::rocksdb::kZlibCompression;
+#endif
+        }
+    };
+    auto& mgr = rocksdb::RocksDBManager::instance();
+    mgr.reset(index_path);
+    return mgr.get_or_open(
+        index_path, rocksdb::RocksDatabase::OpenMode::ReadWrite, cf_override);
+}
+
+std::shared_ptr<rocksdb::RocksDatabase>
+EventAggregator::open_read_only_with_merge_operator(
+    const std::string& index_path) {
+    // Read-only open: installs the two merge operators and nothing else
+    // (no table, compaction or compression overrides).
+    auto cf_hook = [agg_op = std::make_shared<AggregationMergeOperator>(),
+                    sys_op = std::make_shared<SystemMetricsMergeOperator>()](
+        const std::string& cf_name, ::rocksdb::ColumnFamilyOptions& opts) {
+        if (cf_name == rcf::AGGREGATION) {
+            opts.merge_operator = agg_op;
+            return;
+        }
+        if (cf_name == rcf::SYSTEM_METRICS) opts.merge_operator = sys_op;
+    };
+    auto& manager = rocksdb::RocksDBManager::instance();
+    manager.reset(index_path);
+    return manager.get_or_open(
+        index_path, rocksdb::RocksDatabase::OpenMode::ReadOnly, cf_hook);
+}
+
+void EventAggregator::update_time_bounds(std::uint64_t time_bucket) {
+    // Widens [min_time_bucket_, max_time_bucket_] to include time_bucket.
+    constexpr auto relaxed = std::memory_order_relaxed;
+    // CAS retry: a failed exchange refreshes `current`, then re-checks.
+    const auto extend = [&](auto& bound, auto improves) {
+        auto current = bound.load(relaxed);
+        while (improves(time_bucket, current) &&
+               !bound.compare_exchange_weak(current, time_bucket, relaxed)) {
+        }
+    };
+    extend(min_time_bucket_, [](auto a, auto b) { return a < b; });
+    extend(max_time_bucket_, [](auto a, auto b) { return a > b; });
+}
+
+std::uint64_t EventAggregator::min_time_bucket() const {
+    return min_time_bucket_.load(std::memory_order_relaxed);  // in-memory only
+}
+
+std::uint64_t EventAggregator::max_time_bucket() const {
+    return max_time_bucket_.load(std::memory_order_relaxed);  // in-memory only
+}
+
+EventAggregator::TimeBoundsResult EventAggregator::query_time_bounds() const {
+    // Prefers the bounds persisted under TIME_BOUNDS_DB_KEY (be64 min followed
+    // by be64 max); otherwise falls back to the in-memory atomics.
+    TimeBoundsResult bounds;
+    std::string stored;
+    if (rocksdb_mode_ && db_ &&
+        db_->get(TIME_BOUNDS_DB_KEY, &stored, rcf::AGGREGATION).ok() &&
+        stored.size() >= 16) {
+        const std::string_view bytes(stored);
+        bounds.min_time_bucket =
+            rocks::KeyCodec::decode_be64(bytes.substr(0, 8));
+        bounds.max_time_bucket =
+            rocks::KeyCodec::decode_be64(bytes.substr(8, 8));
+        bounds.valid = true;
+        return bounds;
+    }
+    const std::uint64_t lo = min_time_bucket_.load(std::memory_order_relaxed);
+    const std::uint64_t hi = max_time_bucket_.load(std::memory_order_relaxed);
+    bounds.min_time_bucket = lo;
+    bounds.max_time_bucket = hi;
+    // Valid only when both ends differ from UINT64_MAX / 0 and are ordered.
+    bounds.valid = lo != UINT64_MAX && hi != 0 && lo <= hi;
+    return bounds;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp
deleted file mode 100644
index 5eb27e5f..00000000
--- a/src/dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-#include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h>
-
-namespace dftracer::utils::utilities::composites::dft::aggregators {
-
-void EventAggregatorUtility::merge_chunk(
-    ChunkAggregationOutput&& chunk_output) {
-    if (!chunk_output.success) return;
-
-    state_.total_events_processed += chunk_output.events_processed;
-    state_.total_bytes_processed += chunk_output.bytes_processed;
-    unique_files_.insert(chunk_output.file_path);
-
-    auto merge_into = [](AggregationMap& dst, AggregationMap& src) {
-        for (auto& [key, metrics] : src) {
-            auto it = dst.find(key);
-            if (it == dst.end()) {
-                dst.emplace(key, std::move(metrics));
-            } else {
-                it->second.merge_from(metrics);
-            }
-        }
-    };
-    merge_into(state_.aggregations, chunk_output.aggregations);
-    merge_into(state_.profile_aggregations, chunk_output.profile_aggregations);
-    merge_into(state_.system_aggregations, chunk_output.system_aggregations);
-
-    if (chunk_output.local_tracker) {
-        state_.trackers.push_back(std::move(chunk_output.local_tracker));
-    }
-}
-
-EventAggregatorUtilityOutput EventAggregatorUtility::finalize() {
-    state_.total_files_processed = unique_files_.size();
-    state_.success = true;
-
-    DFTRACER_UTILS_LOG_INFO(
-        "Aggregation complete: %zu unique keys, %zu total events, %zu files",
-        state_.aggregations.size(), state_.total_events_processed,
-        state_.total_files_processed);
-
-    return std::move(state_);
-}
-
-coro::CoroTask<EventAggregatorUtilityOutput> EventAggregatorUtility::process(
-    const EventAggregatorUtilityInput& input) {
-    for (auto& output : const_cast<std::vector<ChunkAggregationOutput>&>(
-             input.chunk_outputs)) {
-        merge_chunk(std::move(output));
-    }
-
-    co_return finalize();
-}
-
-}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp
index a0d60260..ba0606d8 100644
--- a/src/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.cpp
@@ -1,449 +1,706 @@
 #include <dftracer/utils/core/common/byte_view.h>
 #include <dftracer/utils/core/common/logging.h>
 #include <dftracer/utils/core/io/io.h>
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_config.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/perfetto_trace_writer_utility.h>
 #include <dftracer/utils/utilities/compression/zlib/streaming_compressor_utility.h>
+#include <dftracer/utils/utilities/fileio/parallel/layout.h>
+#include <dftracer/utils/utilities/fileio/parallel/merge.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
 #include <dftracer/utils/utilities/fileio/streaming_file_writer_utility.h>
 #include <dftracer/utils/utilities/hash/hasher_utility.h>
 #include <fcntl.h>
 
-#include <algorithm>
+#include <charconv>
 #include <cmath>
+#include <cstdarg>
+#include <cstring>
 #include <limits>
+#include <memory>
 #include <string>
 #include <string_view>
 
 namespace dftracer::utils::utilities::composites::dft::aggregators {
 
-std::uint64_t PerfettoTraceWriterUtility::generate_synthetic_tid(
-    const AggregationKey& key) const {
-    dftracer::utils::utilities::hash::HasherUtility hasher;
-    std::string key_str =
-        std::string(key.cat()) + ":" + std::string(key.name()) + ":" +
-        std::to_string(key.pid) + ":" + std::to_string(key.time_bucket);
+namespace {
 
-    if (!key.fhash().empty()) {
-        key_str += ":";
-        key_str += key.fhash();
+class JsonBuffer {  // fixed-capacity, append-only text buffer
+   public:
+    explicit JsonBuffer(std::size_t capacity)  // allocated once, never grown
+        : data_(std::make_unique<char[]>(capacity)),
+          capacity_(capacity),
+          size_(0) {}
+
+    void append(const char* ptr, std::size_t len) {  // NOTE(review): unchecked
+        std::memcpy(data_.get() + size_, ptr, len);
+        size_ += len;  // caller must have left `len` bytes of room
+    }
+
+    void append(std::string_view sv) { append(sv.data(), sv.size()); }
+
+    void push_back(char c) { data_[size_++] = c; }  // unchecked as well
+
+    template <std::size_t N>
+    void append_literal(const char (&lit)[N]) {
+        append(lit, N - 1);  // N counts the NUL terminator; skip it
+    }
+
+    void append_u64(std::uint64_t v) {
+        auto res =
+            std::to_chars(data_.get() + size_, data_.get() + capacity_, v);
+        size_ = static_cast<std::size_t>(res.ptr - data_.get());  // ec ignored
+    }
+
+    void append_i64(std::int64_t v) {  // same contract as append_u64
+        auto res =
+            std::to_chars(data_.get() + size_, data_.get() + capacity_, v);
+        size_ = static_cast<std::size_t>(res.ptr - data_.get());
     }
 
-    if (key.extra_keys) {
-        auto& intern = aggregation_intern();
-        for (const auto& [k, v] : *key.extra_keys) {
-            key_str += ":";
-            key_str += intern.resolve(k);
-            key_str += "=";
-            key_str += intern.resolve(v);
+    void append_double(double value) {  // integer text if whole, else %.2f
+        int n;
+        if (std::abs(value - std::round(value)) < 1e-9) {
+            n = std::snprintf(data_.get() + size_, remaining(), "%lld",
+                              static_cast<long long>(std::round(value)));
+        } else {
+            n = std::snprintf(data_.get() + size_, remaining(), "%.2f", value);
         }
+        const auto written = static_cast<std::size_t>(n);  // n < 0 wraps high
+        if (written < remaining()) size_ += written;       // else: truncated
     }
 
-    // CPU-bound hash — .get() intentional
-    std::size_t hash = hasher.process(key_str).get().value;
-    return 1000000000ULL + (hash % 1000000ULL);
-}
+    int format(const char* fmt, ...) __attribute__((format(printf, 2, 3))) {
+        va_list ap;
+        va_start(ap, fmt);
+        int n = std::vsnprintf(data_.get() + size_, capacity_ - size_, fmt, ap);
+        va_end(ap);  // cursor only advances when the text fit untruncated
+        if (n >= 0 && std::size_t(n) < remaining()) size_ += std::size_t(n);
+        return n;  // raw vsnprintf result, even if nothing was appended
+    }
 
-void PerfettoTraceWriterUtility::append_json_string(
-    std::string& buffer, std::string_view str) const {
-    for (char c : str) {
-        switch (c) {
-            case '"':
-                buffer += "\\\"";
-                break;
-            case '\\':
-                buffer += "\\\\";
-                break;
-            case '\b':
-                buffer += "\\b";
-                break;
-            case '\f':
-                buffer += "\\f";
-                break;
-            case '\n':
-                buffer += "\\n";
-                break;
-            case '\r':
-                buffer += "\\r";
-                break;
-            case '\t':
-                buffer += "\\t";
-                break;
-            default:
-                if (c >= 32 && c < 127) {
-                    buffer += c;
-                } else {
-                    char hex[7];
-                    std::snprintf(hex, sizeof(hex), "\\u%04x",
-                                  (unsigned char)c);
-                    buffer += hex;
-                }
-                break;
+    void append_json_escaped(std::string_view str) {  // body only, no quotes
+        const char* start = str.data();
+        const char* end = start + str.size();
+        const char* safe_start = start;  // start of the pending verbatim run
+        for (const char* p = start; p < end; ++p) {
+            unsigned char c = static_cast<unsigned char>(*p);
+            if (c >= 32 && c < 127 && c != '"' && c != '\\') continue;
+            if (p > safe_start) {  // flush the run that precedes `c`
+                append(safe_start, static_cast<std::size_t>(p - safe_start));
+            }
+            switch (c) {
+                case '"':
+                    append_literal("\\\"");
+                    break;
+                case '\\':
+                    append_literal("\\\\");
+                    break;
+                case '\b':
+                    append_literal("\\b");
+                    break;
+                case '\f':
+                    append_literal("\\f");
+                    break;
+                case '\n':
+                    append_literal("\\n");
+                    break;
+                case '\r':
+                    append_literal("\\r");
+                    break;
+                case '\t':
+                    append_literal("\\t");
+                    break;
+                default:
+                    format("\\u%04x", c);  // per byte; splits UTF-8 sequences
+                    break;
+            }
+            safe_start = p + 1;
+        }
+        if (end > safe_start) {  // trailing run with nothing to escape
+            append(safe_start, static_cast<std::size_t>(end - safe_start));
         }
     }
-}
 
-void PerfettoTraceWriterUtility::append_double(std::string& buffer,
-                                               double value) const {
-    char temp[64];
-    if (std::abs(value - std::round(value)) < 1e-9) {
-        std::snprintf(temp, sizeof(temp), "%lld",
-                      static_cast<long long>(std::round(value)));
-    } else {
-        std::snprintf(temp, sizeof(temp), "%.2f", value);
+    const char* data() const { return data_.get(); }
+    std::size_t size() const { return size_; }
+    std::size_t capacity() const { return capacity_; }
+    std::size_t remaining() const { return capacity_ - size_; }
+    bool empty() const { return size_ == 0; }
+    void clear() { size_ = 0; }  // keeps the allocation
+    ByteView view() const { return ByteView(data_.get(), size_); }
+
+   private:
+    std::unique_ptr<char[]> data_;
+    std::size_t capacity_;
+    std::size_t size_;  // bytes written so far; also the write cursor
+};
+
+class ByteReader {  // NOTE(review): fixed-width reads are unchecked
+   public:
+    explicit ByteReader(std::string_view data) : data_(data), off_(0) {}
+
+    std::uint8_t u8() { return static_cast<std::uint8_t>(data_[off_++]); }
+
+    std::uint16_t be16() {  // big-endian: high byte first
+        auto hi = static_cast<std::uint8_t>(data_[off_++]);
+        auto lo = static_cast<std::uint8_t>(data_[off_++]);
+        return static_cast<std::uint16_t>((hi << 8) | lo);
     }
-    buffer += temp;
-}
 
-void PerfettoTraceWriterUtility::append_metric_stats(
-    std::string& buffer, const MetricStats& stats, std::uint64_t count,
-    bool compute_statistics, bool compute_percentiles,
-    const std::vector<double>& percentiles) const {
-    char temp[256];
+    void skip(std::size_t n) { off_ += n; }  // unchecked
 
-    std::snprintf(temp, sizeof(temp), "\"sum\":%llu",
-                  static_cast<unsigned long long>(stats.total));
-    buffer += temp;
+    std::uint64_t varint() {  // 7 payload bits per byte, low group first
+        std::uint64_t v = 0;
+        unsigned shift = 0;
+        while (off_ < data_.size() && shift < 64) {  // shift >= 64 is UB
+            auto b = static_cast<std::uint8_t>(data_[off_++]);
+            v |= static_cast<std::uint64_t>(b & 0x7F) << shift;
+            if ((b & 0x80) == 0) return v;  // high bit clear: last byte
+            shift += 7;
+        }
+        return v;  // input exhausted or more than 10 bytes consumed
+    }
 
-    buffer += ",\"avg\":";
-    append_double(buffer, stats.mean);
+    std::uint64_t be64() {  // 8 bytes, most significant first
+        std::uint64_t v = 0;
+        for (int i = 0; i < 8; ++i) {
+            v = (v << 8) | static_cast<std::uint8_t>(data_[off_++]);
+        }
+        return v;
+    }
 
-    if (stats.min != std::numeric_limits<std::uint64_t>::max()) {
-        std::snprintf(temp, sizeof(temp), ",\"min\":%llu",
-                      static_cast<unsigned long long>(stats.min));
-        buffer += temp;
+    double f64() {
+        std::uint64_t bits = be64();
+        double v;
+        std::memcpy(&v, &bits, 8);  // reuse the 64 decoded bits as a double
+        return v;
     }
 
-    if (stats.max > 0) {
-        std::snprintf(temp, sizeof(temp), ",\"max\":%llu",
-                      static_cast<unsigned long long>(stats.max));
-        buffer += temp;
+    std::string_view str() {  // 2-byte big-endian length, then the bytes
+        auto len_hi = static_cast<std::uint8_t>(data_[off_++]);
+        auto len_lo = static_cast<std::uint8_t>(data_[off_++]);
+        std::size_t len = (static_cast<std::size_t>(len_hi) << 8) | len_lo;
+        auto s = data_.substr(off_, len);  // view into the input, no copy
+        off_ += len;
+        return s;
     }
 
-    if (compute_statistics && count >= 2) {
-        buffer += ",\"std\":";
-        append_double(buffer, stats.get_stddev(count));
+    void skip_blob() {  // 4-byte big-endian length, then that many bytes
+        std::uint32_t len = 0;
+        for (int i = 0; i < 4; ++i) {
+            len = (len << 8) | static_cast<std::uint8_t>(data_[off_++]);
+        }
+        off_ += len;
     }
-    if (compute_statistics && count >= 3) {
-        buffer += ",\"skw\":";
-        append_double(buffer, stats.get_skewness(count));
+
+    std::size_t offset() const { return off_; }
+
+   private:
+    std::string_view data_;
+    std::size_t off_;  // index of the next unread byte
+};
+
+inline void emit_metric_stats_from_bytes(ByteReader& r, std::string_view prefix,
+                                         bool compute_statistics,
+                                         JsonBuffer& buf) {
+    auto fmt = r.u8();  // leading tag selects the encoding that follows
+    if (fmt == METRIC_FMT_COMPACT) {
+        auto val = r.varint();
+        if (val == 0) return;  // zero compact value: emit no keys at all
+        buf.append_literal(",\"");
+        buf.append_json_escaped(prefix);  // prefix may be a trace-derived name
+        buf.append_literal("_sum\":");
+        buf.append_u64(val);
+        buf.append_literal(",\"");
+        buf.append_json_escaped(prefix);
+        buf.append_literal("_min\":");
+        buf.append_u64(val);
+        buf.append_literal(",\"");
+        buf.append_json_escaped(prefix);
+        buf.append_literal("_max\":");
+        buf.append_u64(val);
+        return;
     }
-    if (compute_statistics && count >= 4) {
-        buffer += ",\"krt\":";
-        append_double(buffer, stats.get_kurtosis(count));
+    // FULL or FULL_WITH_SKETCH
+    auto count = r.varint();
+    auto total = r.varint();
+    auto min = r.varint();
+    auto max = r.varint();
+    (void)r.f64();  // mean
+    auto m2 = r.f64();
+    if (fmt == METRIC_FMT_FULL_WITH_SKETCH) {
+        r.skip_blob();  // the sketch is consumed but not emitted
     }
 
-    if (compute_percentiles && stats.sketch && !stats.sketch->empty()) {
-        for (double p : percentiles) {
-            double percentile_value = stats.sketch->quantile(p);
-            int p_percent = static_cast<int>(p * 100);
-            std::snprintf(temp, sizeof(temp), ",\"p%d\":", p_percent);
-            buffer += temp;
-            append_double(buffer, percentile_value);
-        }
+    buf.append_literal(",\"");
+    buf.append_json_escaped(prefix);
+    buf.append_literal("_sum\":");
+    buf.append_u64(total);
+
+    if (min != std::numeric_limits<std::uint64_t>::max()) {
+        buf.append_literal(",\"");
+        buf.append_json_escaped(prefix);
+        buf.append_literal("_min\":");
+        buf.append_u64(min);
+    }
+    if (max > 0) {
+        buf.append_literal(",\"");
+        buf.append_json_escaped(prefix);
+        buf.append_literal("_max\":");
+        buf.append_u64(max);
     }
-}
 
-void PerfettoTraceWriterUtility::append_event_args(
-    std::string& buffer, const AggregationKey& key,
-    const AggregationMetrics& metrics, bool compute_statistics,
-    bool compute_percentiles, const std::vector<double>& percentiles,
-    std::uint64_t real_tid) const {
-    char temp[512];
-
-    buffer += "\"hhash\":\"";
-    append_json_string(buffer, key.hhash());
-    buffer += "\"";
-
-    if (real_tid > 0) {
-        std::snprintf(temp, sizeof(temp), ",\"real_tid\":%llu",
-                      static_cast<unsigned long long>(real_tid));
-        buffer += temp;
+    if (compute_statistics && count >= 2) {
+        // `m2` is the raw power sum of squares (sum of x^2), not Welford's
+        // central M2. Convert it to the central second moment, then to the
+        // sample variance (divide by n - 1). Clamp at zero because float
+        // cancellation can push the difference slightly negative.
+        const double n = static_cast<double>(count);
+        const double sum_x = static_cast<double>(total);
+        const double central = m2 - sum_x * sum_x / n;
+        const double var = (central > 0.0 ? central : 0.0) / (n - 1.0);
+        const double stddev = var > 0.0 ? std::sqrt(var) : 0.0;
+        buf.append_literal(",\"");
+        buf.append_json_escaped(prefix);
+        buf.append_literal("_std\":");
+        buf.append_double(stddev);
     }
+}
 
-    if (!key.fhash().empty()) {
-        buffer += ",\"fhash\":\"";
-        append_json_string(buffer, key.fhash());
-        buffer += "\"";
+inline void skip_metric_stats(ByteReader& r) {
+    const auto tag = r.u8();
+    if (tag == METRIC_FMT_COMPACT) {
+        (void)r.varint();  // compact form is a single varint
+        return;
     }
+    // FULL / FULL_WITH_SKETCH: four varints (count, total, min, max), then
+    // two 8-byte doubles (mean, m2); the sketch variant appends a blob.
+    for (int field = 0; field < 4; ++field) {
+        (void)r.varint();
+    }
+    r.skip(16);
+    if (tag == METRIC_FMT_FULL_WITH_SKETCH) r.skip_blob();
+}
 
-    if (key.extra_keys) {
-        auto& intern = aggregation_intern();
-        for (const auto& [k, v] : *key.extra_keys) {
-            buffer += ",\"";
-            append_json_string(buffer, intern.resolve(k));
-            buffer += "\":\"";
-            append_json_string(buffer, intern.resolve(v));
-            buffer += "\"";
+// Compress `data` into a standalone gzip member, replacing the contents of
+// `out`, so the result can sit at any offset in a concatenated-gzip file.
+coro::CoroTask<bool> compress_to_gzip_member(int level, ByteView data,
+                                             std::vector<unsigned char>& out) {
+    out.clear();
+    compression::zlib::ManualStreamingCompressorUtility comp(
+        level, compression::zlib::CompressionFormat::GZIP);
+    if (data.size() > 0) {  // empty input still gets a finalized stream
+        auto gen = comp.compress(data);
+        while (auto view = co_await gen.next()) {
+            const auto* p =
+                reinterpret_cast<const unsigned char*>(view->data());
+            out.insert(out.end(), p, p + view->size());
         }
     }
-
-    std::snprintf(temp, sizeof(temp), ",\"count\":%llu",
-                  static_cast<unsigned long long>(metrics.count));
-    buffer += temp;
-
-    buffer += ",\"dur\":{";
-    append_metric_stats(buffer, metrics.duration, metrics.count,
-                        compute_statistics, compute_percentiles, percentiles);
-    buffer += "}";
-
-    if (metrics.size.total > 0) {
-        buffer += ",\"size\":{";
-        append_metric_stats(buffer, metrics.size, metrics.count,
-                            compute_statistics, compute_percentiles,
-                            percentiles);
-        buffer += "}";
+    auto fin = comp.finalize_stream();  // drain whatever finalization yields
+    while (auto view = co_await fin.next()) {
+        const auto* p = reinterpret_cast<const unsigned char*>(view->data());
+        out.insert(out.end(), p, p + view->size());
     }
+    co_return true;  // NOTE(review): no failure path is ever reported
+}
 
-    if (metrics.custom_metrics)
-        for (const auto& [metric_name, metric_stats] :
-             *metrics.custom_metrics) {
-            buffer += ",\"";
-            append_json_string(buffer, metric_name);
-            buffer += "\":{";
-            append_metric_stats(buffer, metric_stats, metrics.count,
-                                compute_statistics, compute_percentiles,
-                                percentiles);
-            buffer += "}";
+coro::CoroTask<bool> write_shard_events(
+    std::size_t worker_idx, std::uint16_t shard_begin, std::uint16_t shard_end,
+    std::size_t flush_threshold, std::size_t buffer_capacity,
+    fileio::parallel::ParallelWriter* writer,
+    const PerfettoTraceWriterInput* input) {
+    using namespace dftracer::utils::utilities;
+
+    JsonBuffer buf(buffer_capacity);  // JSON text of the chunk being built
+    std::vector<unsigned char> compressed;
+
+    auto flush_buffer = [&]() -> coro::CoroTask<int> {  // writer rc, 0 = ok
+        if (buf.empty()) co_return 0;
+        int rc;
+        if (input->compress) {
+            co_await compress_to_gzip_member(input->compression_level,
+                                             buf.view(), compressed);
+            rc = co_await writer->write_chunk(
+                worker_idx,
+                ByteView(reinterpret_cast<const char*>(compressed.data()),
+                         compressed.size()));
+        } else {
+            rc = co_await writer->write_chunk(worker_idx, buf.view());
         }
+        buf.clear();
+        co_return rc;
+    };
+
+    std::size_t local_keys = 0;
+    std::vector<std::string> pending_chunks;  // written after the scan ends
+    input->aggregator->scan_shard_range_raw(
+        shard_begin, shard_end,
+        [&](std::string_view key_bytes, std::string_view value_bytes) {
+            local_keys++;
+
+            // Key layout: shard(2) map_type(1) cat(varint ID) name(varint ID)
+            //   pid(varint) tid(varint) hhash(varint ID) fhash(varint ID)
+            //   time_bucket(varint) num_extra(2)
+            //   then num_extra pairs of [k(varint ID) v(varint ID)]
+            auto& intern = aggregation_intern();
+            ByteReader kr(key_bytes);
+            kr.skip(2);     // shard
+            (void)kr.u8();  // map_type
+            auto cat = intern.resolve(static_cast<std::uint32_t>(kr.varint()));
+            auto name = intern.resolve(static_cast<std::uint32_t>(kr.varint()));
+            auto pid = kr.varint();
+            auto tid = kr.varint();
+            auto hhash_id = static_cast<std::uint32_t>(kr.varint());
+            auto hhash =
+                hhash_id ? intern.resolve(hhash_id) : std::string_view{};
+            auto fhash_id = static_cast<std::uint32_t>(kr.varint());
+            auto fhash =
+                fhash_id ? intern.resolve(fhash_id) : std::string_view{};
+            auto time_bucket = kr.varint();
+            auto num_extra = kr.be16();
+
+            // For REGULAR, pre-parse ts/te by skipping through value bytes.
+            std::uint64_t regular_ts = 0, regular_te = 0;
+            if (input->format == PerfettoEventFormat::REGULAR) {
+                ByteReader tmp(value_bytes);
+                tmp.varint();            // count
+                skip_metric_stats(tmp);  // duration
+                skip_metric_stats(tmp);  // size
+                regular_ts = tmp.varint();
+                regular_te = tmp.varint();
+            }
+
+            // Emit event header. NOTE(review): any other format value emits
+            // no header here, yet the args below are still appended.
+            if (input->format == PerfettoEventFormat::COUNTER) {
+                buf.append_literal("{\"name\":\"");
+                buf.append_json_escaped(name);
+                buf.append_literal("\",\"cat\":\"");
+                buf.append_json_escaped(cat);
+                buf.append_literal("\",\"ts\":");
+                buf.append_u64(time_bucket);
+                buf.append_literal(",\"ph\":\"C\",\"pid\":");
+                buf.append_u64(pid);
+                buf.append_literal(",\"tid\":");
+                buf.append_u64(tid);
+                buf.append_literal(",\"args\":{");
+            } else if (input->format == PerfettoEventFormat::REGULAR) {
+                std::uint64_t duration = regular_te - regular_ts;  // may wrap
+                buf.append_literal("{\"name\":\"");
+                buf.append_json_escaped(name);
+                buf.append_literal("\",\"cat\":\"");
+                buf.append_json_escaped(cat);
+                buf.append_literal("\",\"ts\":");
+                buf.append_u64(regular_ts);
+                buf.append_literal(",\"dur\":");
+                buf.append_u64(duration);
+                buf.append_literal(",\"ph\":\"X\",\"pid\":");
+                buf.append_u64(pid);
+                buf.append_literal(",\"tid\":");
+                buf.append_u64(tid);
+                buf.append_literal(",\"args\":{");
+            }
+
+            // hhash (always emitted, possibly as an empty string)
+            buf.append_literal("\"hhash\":\"");
+            buf.append_json_escaped(hhash);
+            buf.append_literal("\"");
+
+            // fhash (only when non-empty)
+            if (!fhash.empty()) {
+                buf.append_literal(",\"fhash\":\"");
+                buf.append_json_escaped(fhash);
+                buf.append_literal("\"");
+            }
+
+            // extra keys (varint intern IDs)
+            for (std::uint16_t i = 0; i < num_extra; ++i) {
+                auto ek =
+                    intern.resolve(static_cast<std::uint32_t>(kr.varint()));
+                auto ev =
+                    intern.resolve(static_cast<std::uint32_t>(kr.varint()));
+                buf.append_literal(",\"");
+                buf.append_json_escaped(ek);
+                buf.append_literal("\":\"");
+                buf.append_json_escaped(ev);
+                buf.append_literal("\"");
+            }
+
+            // Value bytes: count, dur, size, ts, te, parent_pid, num_custom,
+            // customs
+            ByteReader vr(value_bytes);
+            auto count = vr.varint();
+
+            buf.append_literal(",\"dft_cnt\":");
+            buf.append_u64(count);
+
+            emit_metric_stats_from_bytes(vr, "dur", input->compute_statistics,
+                                         buf);
+            emit_metric_stats_from_bytes(vr, "ret", input->compute_statistics,
+                                         buf);
+
+            auto m_ts = vr.varint();
+            auto m_te = vr.varint();
+            auto m_parent_pid = vr.varint();
+
+            // Custom metrics come AFTER ts/te in the stream but BEFORE ts/te
+            // in the JSON output order, so emit them now.
+            auto num_custom = vr.varint();
+            for (std::uint64_t i = 0; i < num_custom; ++i) {
+                auto cname = vr.str();
+                emit_metric_stats_from_bytes(vr, cname,
+                                             input->compute_statistics, buf);
+            }
+
+            buf.append_literal(",\"ts\":");
+            buf.append_u64(m_ts);
+            buf.append_literal(",\"te\":");
+            buf.append_u64(m_te);
+
+            // Compute effective parent_pid (tracker may override)
+            std::uint64_t effective_parent = m_parent_pid;
+            if (input->tracker && input->agg_config &&
+                input->agg_config->track_process_parents &&
+                input->tracker->has_process_tree()) {
+                auto pp = input->tracker->get_parent_pid(pid);
+                if (pp != 0) effective_parent = pp;
+            }
+
+            // Boundary associations (emitted between ts/te and parent_pid)
+            if (input->tracker && input->agg_config &&
+                !input->agg_config->boundary_events.empty() &&
+                input->tracker->has_boundary_events()) {
+                auto mid = (m_ts + m_te) / 2;  // midpoint of [ts, te]
+                auto bpid = effective_parent > 0 ? effective_parent : pid;
+                auto assoc =
+                    input->tracker->get_boundary_associations(bpid, mid);
+                for (const auto& [an, av] : assoc) {
+                    buf.append_literal(",\"");
+                    buf.append_json_escaped(an);
+                    buf.append_literal("\":\"");
+                    buf.append_json_escaped(av);
+                    buf.append_literal("\"");
+                }
+            }
+
+            if (effective_parent > 0) {
+                buf.append_literal(",\"parent_pid\":");
+                buf.append_u64(effective_parent);
+            }
 
-    buffer += ",\"ts\":";
-    std::snprintf(temp, sizeof(temp), "%llu",
-                  static_cast<unsigned long long>(metrics.ts));
-    buffer += temp;
-    buffer += ",\"te\":";
-    std::snprintf(temp, sizeof(temp), "%llu",
-                  static_cast<unsigned long long>(metrics.te));
-    buffer += temp;
-
-    if (metrics.boundary_associations)
-        for (const auto& [assoc_name, assoc_value] :
-             *metrics.boundary_associations) {
-            buffer += ",\"";
-            append_json_string(buffer, assoc_name);
-            buffer += "\":\"";
-            append_json_string(buffer, assoc_value);
-            buffer += "\"";
+            buf.append_literal("}}\n");
+
+            if (buf.size() >= flush_threshold) {  // checked once per event
+                pending_chunks.emplace_back(buf.data(), buf.size());
+                buf.clear();
+            }
+            return true;  // presumably "keep scanning" - TODO confirm
+        });
+
+    for (auto& s : pending_chunks) {  // one write_chunk call per chunk
+        if (input->compress) {
+            co_await compress_to_gzip_member(
+                input->compression_level,
+                ByteView(reinterpret_cast<const std::byte*>(s.data()),
+                         s.size()),
+                compressed);
+            auto rc = co_await writer->write_chunk(
+                worker_idx,
+                ByteView(reinterpret_cast<const std::byte*>(compressed.data()),
+                         compressed.size()));
+            if (rc != 0) co_return false;
+        } else {
+            auto rc = co_await writer->write_chunk(
+                worker_idx,
+                ByteView(reinterpret_cast<const std::byte*>(s.data()),
+                         s.size()));
+            if (rc != 0) co_return false;
         }
+    }
+
+    if (co_await flush_buffer() != 0) co_return false;  // tail below threshold
 
-    if (metrics.parent_pid > 0) {
-        std::snprintf(temp, sizeof(temp), ",\"parent_pid\":%llu",
-                      static_cast<unsigned long long>(metrics.parent_pid));
-        buffer += temp;
+    if (input->keys_written) {  // counter is optional; bumped on success only
+        input->keys_written->fetch_add(local_keys, std::memory_order_relaxed);
     }
+    co_return true;
 }
 
+}  // namespace
+
 coro::CoroTask<bool> PerfettoTraceWriterUtility::process(
     const PerfettoTraceWriterInput& input) {
-    const auto& aggregations = input.resolver_output.aggregations.aggregations;
-    const auto& root_pids = input.resolver_output.root_pids;
+    using namespace dftracer::utils::utilities;
+
+    constexpr std::size_t HEADER_BUFFER_BYTES = 4 * 1024 * 1024;
+    constexpr std::size_t DEFAULT_FLUSH_BYTES = 12 * 1024 * 1024;
+    constexpr std::size_t BUFFER_HEADROOM_BYTES = 4 * 1024 * 1024;
+
+    auto layout_info = fileio::parallel::detect_layout(input.output_path);
+    const std::size_t executor_threads =
+        this->context().get_executor()->get_num_threads();
+    const std::size_t baseline =
+        std::min<std::size_t>(executor_threads, AGG_KEY_NUM_SHARDS);
+    // Mirror make_writer's padded-layout gate so sizing picks the matching
+    // flush_threshold.
+    const bool uses_padded =
+        layout_info.layout == fileio::parallel::FileLayout::STRIPED &&
+        input.compress &&
+        layout_info.stripe_size >= fileio::parallel::MIN_PADDED_STRIPE_BYTES;
+    const auto sizing = fileio::parallel::compute_writer_sizing(
+        layout_info, baseline, DEFAULT_FLUSH_BYTES, BUFFER_HEADROOM_BYTES,
+        uses_padded);
+    const std::size_t num_workers = sizing.num_workers;
+    const std::size_t flush_threshold = sizing.flush_threshold;
+    const std::size_t buffer_capacity = sizing.buffer_capacity;
+    fileio::parallel::WriterConfig wcfg;
+    wcfg.layout = layout_info.layout;
+    wcfg.stripe_size = layout_info.stripe_size;
+    wcfg.gzip = input.compress;
+    auto writer = fileio::parallel::make_writer(wcfg);
+    if (co_await writer->open(input.output_path, num_workers, input.compress,
+                              &this->context()) != 0) {
+        co_return false;
+    }
 
-    std::string buffer;
-    buffer.reserve(1024 * 1024);
+    auto write_section = [&](ByteView data,
+                             bool is_footer) -> coro::CoroTask<bool> {
+        std::vector<unsigned char> compressed;
+        ByteView payload = data;
+        if (input.compress) {
+            co_await compress_to_gzip_member(input.compression_level, data,
+                                             compressed);
+            payload =
+                ByteView(reinterpret_cast<const std::byte*>(compressed.data()),
+                         compressed.size());
+        }
+        int rc = is_footer ? co_await writer->write_footer(payload)
+                           : co_await writer->write_header(payload);
+        co_return rc == 0;
+    };
 
-    buffer += "[\n";
+    JsonBuffer header(HEADER_BUFFER_BYTES);
+    if (input.emit_header) header.append_literal("[\n");
 
-    if (input.resolver_output.trace_duration > 0 ||
-        !input.resolver_output.boundary_ranges.empty()) {
-        buffer +=
+    if (input.emit_header &&
+        (input.trace_duration > 0 || !input.boundary_ranges.empty())) {
+        header.append_literal(
             "{\"name\":\"trace_metadata\",\"cat\":\"metadata\",\"ph\":"
-            "\"M\",\"args\":{";
-
-        char temp[512];
-        std::snprintf(temp, sizeof(temp), "\"trace_duration\":%llu",
-                      static_cast<unsigned long long>(
-                          input.resolver_output.trace_duration));
-        buffer += temp;
+            "\"M\",\"args\":{");
+        header.format("\"trace_duration\":%llu",
+                      static_cast<unsigned long long>(input.trace_duration));
 
-        if (!input.resolver_output.boundary_ranges.empty()) {
-            buffer += ",\"boundary_ranges\":{";
+        if (!input.boundary_ranges.empty()) {
+            header.append_literal(",\"boundary_ranges\":{");
             bool first_boundary = true;
 
             for (const auto& [boundary_name, value_map] :
-                 input.resolver_output.boundary_ranges) {
-                if (!first_boundary) {
-                    buffer += ",";
-                }
+                 input.boundary_ranges) {
+                if (!first_boundary) header.append_literal(",");
                 first_boundary = false;
-
-                buffer += "\"";
-                append_json_string(buffer, boundary_name);
-                buffer += "\":{";
-
+                header.append_literal("\"");
+                header.append_json_escaped(boundary_name);
+                header.append_literal("\":{");
                 bool first_value = true;
                 for (const auto& [value, time_range] : value_map) {
-                    if (!first_value) {
-                        buffer += ",";
-                    }
+                    if (!first_value) header.append_literal(",");
                     first_value = false;
-
-                    buffer += "\"";
-                    append_json_string(buffer, value);
-                    buffer += "\":{";
-
-                    std::snprintf(
-                        temp, sizeof(temp), "\"ts\":%llu,\"te\":%llu",
+                    header.append_literal("\"");
+                    header.append_json_escaped(value);
+                    header.append_literal("\":{");
+                    header.format(
+                        "\"ts\":%llu,\"te\":%llu",
                         static_cast<unsigned long long>(time_range.ts),
                         static_cast<unsigned long long>(time_range.te));
-                    buffer += temp;
-
-                    buffer += "}";
+                    header.append_literal("}");
                 }
-
-                buffer += "}";
+                header.append_literal("}");
             }
-
-            buffer += "}";
+            header.append_literal("}");
         }
-
-        buffer += "}}\n";
+        header.append_literal("}}\n");
     }
 
-    if (!root_pids.empty()) {
-        for (std::uint64_t pid : root_pids) {
-            buffer +=
+    if (input.emit_header) {
+        for (std::uint64_t pid : input.root_pids) {
+            header.format(
                 "{\"name\":\"root_process\",\"cat\":\"dftracer\",\"ph\":"
-                "\"M\",\"pid\":";
-            buffer += std::to_string(pid);
-            buffer += ",\"tid\":";
-            buffer += std::to_string(pid);
-            buffer += ",\"args\":{\"is_root\":\"true\"}}\n";
+                "\"M\",\"pid\":%llu,\"tid\":%llu,"
+                "\"args\":{\"is_root\":\"true\"}}\n",
+                static_cast<unsigned long long>(pid),
+                static_cast<unsigned long long>(pid));
         }
     }
 
-    for (const auto& [key, metrics] : aggregations) {
-        char temp[512];
-
-        if (input.format == PerfettoEventFormat::COUNTER) {
-            buffer += "{\"name\":\"";
-            append_json_string(buffer, key.name());
-            buffer += "\",\"cat\":\"";
-            append_json_string(buffer, key.cat());
-            std::snprintf(temp, sizeof(temp),
-                          "\",\"ts\":%llu,\"ph\":\"C\",\"pid\":%llu,"
-                          "\"tid\":%llu,\"args\":{",
-                          static_cast<unsigned long long>(key.time_bucket),
-                          static_cast<unsigned long long>(key.pid),
-                          static_cast<unsigned long long>(key.tid));
-            buffer += temp;
-            append_event_args(buffer, key, metrics, input.compute_statistics,
-                              input.compute_percentiles, input.percentiles);
-            buffer += "}}\n";
-
-        } else if (input.format == PerfettoEventFormat::REGULAR) {
-            std::uint64_t duration = metrics.te - metrics.ts;
-
-            buffer += "{\"name\":\"";
-            append_json_string(buffer, key.name());
-            buffer += "\",\"cat\":\"";
-            append_json_string(buffer, key.cat());
-            std::snprintf(
-                temp, sizeof(temp),
-                "\",\"ts\":%llu,\"dur\":%llu,\"ph\":\"X\",\"pid\":%llu,"
-                "\"tid\":%llu,\"args\":{",
-                static_cast<unsigned long long>(metrics.ts),
-                static_cast<unsigned long long>(duration),
-                static_cast<unsigned long long>(key.pid),
-                static_cast<unsigned long long>(key.tid));
-            buffer += temp;
-            append_event_args(buffer, key, metrics, input.compute_statistics,
-                              input.compute_percentiles, input.percentiles);
-            buffer += "}}\n";
-
-        } else {
-            std::string event_id =
-                std::string(key.cat()) + ":" + std::string(key.name()) + ":" +
-                std::to_string(key.pid) + ":" + std::to_string(key.tid) + ":" +
-                std::to_string(key.time_bucket);
-            if (!key.fhash().empty()) {
-                event_id += ":";
-                event_id += key.fhash();
-            }
-            if (key.extra_keys) {
-                auto& intern = aggregation_intern();
-                for (const auto& [k, v] : *key.extra_keys) {
-                    event_id += ":";
-                    event_id += intern.resolve(k);
-                    event_id += "=";
-                    event_id += intern.resolve(v);
-                }
+    if (!co_await write_section(header.view(), false)) co_return false;
+
+    std::atomic<bool> worker_success{true};
+    const std::uint16_t range_begin = input.shard_begin;
+    const std::uint16_t range_end =
+        input.shard_end == 0 ? AGG_KEY_NUM_SHARDS : input.shard_end;
+    const std::uint16_t range_width =
+        range_end > range_begin
+            ? static_cast<std::uint16_t>(range_end - range_begin)
+            : std::uint16_t{0};
+    std::uint16_t shards_per_worker =
+        num_workers > 0 ? static_cast<std::uint16_t>(range_width / num_workers)
+                        : std::uint16_t{0};
+
+    co_await this->context().scope(
+        [&](CoroScope& child) -> coro::CoroTask<void> {
+            for (std::size_t i = 0; i < num_workers; ++i) {
+                auto shard_begin = static_cast<std::uint16_t>(
+                    range_begin + i * shards_per_worker);
+                auto shard_end =
+                    (i + 1 == num_workers)
+                        ? range_end
+                        : static_cast<std::uint16_t>(
+                              range_begin + (i + 1) * shards_per_worker);
+                const auto* input_ptr = &input;
+                auto* success_ptr = &worker_success;
+                auto* writer_ptr = writer.get();
+                child.spawn([i, shard_begin, shard_end, flush_threshold,
+                             buffer_capacity, writer_ptr, input_ptr,
+                             success_ptr](CoroScope&) -> coro::CoroTask<void> {
+                    auto ok = co_await write_shard_events(
+                        i, shard_begin, shard_end, flush_threshold,
+                        buffer_capacity, writer_ptr, input_ptr);
+                    if (!ok) success_ptr->store(false);
+                });
             }
+            co_return;
+        });
 
-            buffer += "{\"name\":\"";
-            append_json_string(buffer, key.name());
-            buffer += "\",\"cat\":\"";
-            append_json_string(buffer, key.cat());
-            std::snprintf(temp, sizeof(temp),
-                          "\",\"ts\":%llu,\"ph\":\"b\",\"pid\":%llu,"
-                          "\"tid\":%llu,\"id\":\"",
-                          static_cast<unsigned long long>(metrics.ts),
-                          static_cast<unsigned long long>(key.pid),
-                          static_cast<unsigned long long>(key.tid));
-            buffer += temp;
-            append_json_string(buffer, event_id);
-            buffer += "\",\"args\":{";
-            append_event_args(buffer, key, metrics, input.compute_statistics,
-                              input.compute_percentiles, input.percentiles);
-            buffer += "}}\n";
-
-            buffer += "{\"name\":\"";
-            append_json_string(buffer, key.name());
-            buffer += "\",\"cat\":\"";
-            append_json_string(buffer, key.cat());
-            std::snprintf(temp, sizeof(temp),
-                          "\",\"ts\":%llu,\"ph\":\"e\",\"pid\":%llu,"
-                          "\"tid\":%llu,\"id\":\"",
-                          static_cast<unsigned long long>(metrics.te),
-                          static_cast<unsigned long long>(key.pid),
-                          static_cast<unsigned long long>(key.tid));
-            buffer += temp;
-            append_json_string(buffer, event_id);
-            buffer += "\"}\n";
-        }
+    if (!worker_success.load()) {
+        co_await writer->close();
+        co_return false;
     }
 
-    buffer += "]\n";
-
-    try {
-        if (input.compress) {
-            using namespace dftracer::utils::utilities;
-
-            compression::zlib::ManualStreamingCompressorUtility compressor(
-                input.compression_level,
-                compression::zlib::CompressionFormat::GZIP);
-
-            fileio::StreamingFileWriterUtility writer(input.output_path);
+    if (input.emit_footer) {
+        const char footer[] = "]\n";
+        if (!co_await write_section(
+                ByteView(reinterpret_cast<const std::byte*>(footer), 2), true))
+            co_return false;
+    }
 
-            {
-                auto gen = compressor.compress(ByteView(buffer));
-                while (auto chunk = co_await gen.next()) {
-                    co_await writer.process(*chunk);
-                }
-            }
-            {
-                auto gen = compressor.finalize_stream();
-                while (auto chunk = co_await gen.next()) {
-                    co_await writer.process(*chunk);
-                }
-            }
+    if (co_await writer->close() != 0) co_return false;
 
-            writer.close();
-        } else {
-            ssize_t fd = co_await ::dftracer::utils::io::open(
-                input.output_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
-            if (fd < 0) {
-                DFTRACER_UTILS_LOG_ERROR("Failed to open output file: %s",
-                                         input.output_path.c_str());
-                co_return false;
-            }
-            co_await ::dftracer::utils::io::write(static_cast<int>(fd),
-                                                  buffer.data(), buffer.size());
-            co_await ::dftracer::utils::io::close(static_cast<int>(fd));
+    if (input.merge_on_sharded &&
+        layout_info.layout == fileio::parallel::FileLayout::SHARDED) {
+        auto shards = writer->output_paths();
+        if (co_await fileio::parallel::merge_shards(input.output_path,
+                                                    shards) != 0) {
+            co_return false;
         }
-
-        co_return true;
-    } catch (const std::exception& e) {
-        DFTRACER_UTILS_LOG_ERROR("Failed to write output: %s", e.what());
-        co_return false;
     }
+
+    co_return true;
 }
 
 }  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp
new file mode 100644
index 00000000..6829db00
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.cpp
@@ -0,0 +1,54 @@
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h>
+
+#include <string_view>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+bool SystemMetricsMergeOperator::FullMergeV2(
+    const MergeOperationInput& merge_in,
+    MergeOperationOutput* merge_out) const {  // Folds the existing value (if any) and every operand into one serialized value.
+    SystemAggregationMetrics result;  // accumulator; stays default-constructed when there is no existing value
+
+    if (merge_in.existing_value) {
+        try {
+            result = deserialize_system_value(
+                std::string_view(merge_in.existing_value->data(),
+                                 merge_in.existing_value->size()));
+        } catch (...) {
+            return false;  // any exception while decoding the stored value fails the merge
+        }
+    }
+
+    for (const auto& operand : merge_in.operand_list) {  // operands are applied in list order
+        try {
+            auto other = deserialize_system_value(
+                std::string_view(operand.data(), operand.size()));
+            result.merge_from(other);
+        } catch (...) {
+            return false;  // a single undecodable operand aborts the whole merge
+        }
+    }
+
+    merge_out->new_value = serialize_system_value(result);  // re-encode the combined metrics
+    return true;
+}
+
+bool SystemMetricsMergeOperator::PartialMerge(
+    const ::rocksdb::Slice& /*key*/, const ::rocksdb::Slice& left_operand,
+    const ::rocksdb::Slice& right_operand, std::string* new_value,
+    ::rocksdb::Logger* /*logger*/) const {  // Combines two operands into one; key and logger are unused.
+    try {
+        auto left = deserialize_system_value(
+            std::string_view(left_operand.data(), left_operand.size()));
+        auto right = deserialize_system_value(
+            std::string_view(right_operand.data(), right_operand.size()));
+        left.merge_from(right);  // left becomes the combined result
+        *new_value = serialize_system_value(left);
+        return true;
+    } catch (...) {
+        return false;  // any exception from decode/merge/encode is reported as a failed partial merge
+    }
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp b/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp
new file mode 100644
index 00000000..d2db20c7
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.cpp
@@ -0,0 +1,126 @@
+#include <dftracer/utils/utilities/common/serialization/binary_codec.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h>
+
+#include <cstring>
+
+namespace dftracer::utils::utilities::composites::dft::aggregators {
+
+namespace {
+
+using common::serialization::BinaryReader;
+using common::serialization::put_blob;
+using common::serialization::put_double;
+using common::serialization::put_str;
+using common::serialization::put_varint;
+
+void serialize_float_metric_stats(std::string& out,
+                                  const FloatMetricStats& ms) {  // Appends ms to out; field order must match deserialize_float_metric_stats.
+    put_varint(out, ms.count);
+    put_double(out, ms.total);
+    put_double(out, ms.min);
+    put_double(out, ms.max);
+    put_double(out, ms.mean);
+    put_double(out, ms.m2);
+    // m3/m4 are intentionally left out of the persisted format for now;
+    // skewness/kurtosis are recomputed in-memory only.
+    // put_double(out, ms.m3);
+    // put_double(out, ms.m4);
+    if (ms.sketch) {
+        out.push_back(1);  // presence flag: a sketch blob follows
+        auto blob = ms.sketch->serialize();
+        put_blob(out, blob);
+    } else {
+        out.push_back(0);  // presence flag: no sketch stored
+    }
+}
+
+FloatMetricStats deserialize_float_metric_stats(BinaryReader& r,
+                                                double accuracy) {  // Reads one FloatMetricStats from r, advancing the reader.
+    FloatMetricStats ms(accuracy);  // accuracy is supplied by the caller, not read from the stream
+    ms.count = r.varint();
+    ms.total = r.f64();
+    ms.min = r.f64();
+    ms.max = r.f64();
+    ms.mean = r.f64();
+    ms.m2 = r.f64();
+    // ms.m3 = r.f64();
+    // ms.m4 = r.f64();
+    if (r.u8()) {  // non-zero flag byte means a sketch blob follows
+        auto blob = r.blob();
+        ms.sketch = std::make_unique<DDSketch>(DDSketch::deserialize(
+            reinterpret_cast<const std::uint8_t*>(blob.data()), blob.size()));
+    }
+    return ms;
+}
+
+}  // namespace
+
+void serialize_system_key_into(std::string& out, std::string_view hhash,
+                               std::uint64_t time_bucket) {  // Overwrites out with the encoded key: hhash string, then time_bucket varint.
+    out.clear();  // previous contents of out are discarded
+    out.reserve(2 + hhash.size() + 10);  // size hint only; presumably string-length prefix + max varint bytes -- TODO confirm against put_str/put_varint
+    put_str(out, hhash);
+    put_varint(out, time_bucket);
+}
+
+std::string serialize_system_key(std::string_view hhash,
+                                 std::uint64_t time_bucket) {  // Returning-by-value wrapper around serialize_system_key_into.
+    std::string out;
+    serialize_system_key_into(out, hhash, time_bucket);
+    return out;
+}
+
+DeserializedSystemKey deserialize_system_key(std::string_view data) {  // Decodes a key written by serialize_system_key_into.
+    BinaryReader r(data);
+    auto hhash = r.str();  // read order mirrors the writer: string first
+    auto time_bucket = r.varint();  // ...then the time bucket
+    return {{std::string(hhash), time_bucket}};  // hhash is copied into an owning std::string
+}
+
+void serialize_system_value_into(std::string& out,
+                                 const SystemAggregationMetrics& m) {  // Overwrites out with the encoded form of m.
+    out.clear();  // previous contents of out are discarded
+    out.reserve(128);  // initial size hint only
+
+    put_varint(out, m.count);
+    put_varint(out, m.ts);
+    put_varint(out, m.te);
+
+    std::uint32_t num_metrics =
+        m.metrics ? static_cast<std::uint32_t>(m.metrics->size()) : 0;  // a null metrics map is written as a count of 0
+    put_varint(out, num_metrics);
+
+    if (m.metrics) {
+        for (const auto& [name, stats] : *m.metrics) {  // one (name, stats) record per metric
+            put_str(out, name);
+            serialize_float_metric_stats(out, stats);
+        }
+    }
+}
+
+std::string serialize_system_value(const SystemAggregationMetrics& m) {  // Returning-by-value wrapper around serialize_system_value_into.
+    std::string out;
+    serialize_system_value_into(out, m);
+    return out;
+}
+
+SystemAggregationMetrics deserialize_system_value(std::string_view data) {  // Decodes a value written by serialize_system_value_into.
+    BinaryReader r(data);
+    SystemAggregationMetrics m;
+    m.count = r.varint();
+    m.ts = r.varint();
+    m.te = r.varint();
+
+    auto num_metrics = r.varint();
+    if (num_metrics > 0) {  // the metrics map is only allocated when at least one metric was stored
+        m.metrics = std::make_unique<FloatMetricsMap>();
+        for (std::uint32_t i = 0; i < num_metrics; ++i) {  // NOTE(review): i is 32-bit but num_metrics comes straight from varint(); confirm a corrupt count above UINT32_MAX cannot spin here
+            auto name = r.str();
+            auto stats = deserialize_float_metric_stats(r, m.sketch_accuracy);  // accuracy comes from the default-constructed m, not from the stream
+            m.metrics->emplace(std::string(name), std::move(stats));
+        }
+    }
+    return m;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::aggregators
diff --git a/src/dftracer/utils/utilities/composites/dft/comparator/comparison_config.cpp b/src/dftracer/utils/utilities/composites/dft/comparator/comparison_config.cpp
index cc757d53..64743716 100644
--- a/src/dftracer/utils/utilities/composites/dft/comparator/comparison_config.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/comparator/comparison_config.cpp
@@ -1,6 +1,7 @@
 #include <dftracer/utils/utilities/composites/dft/comparator/comparison_config.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
+#include <fstream>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -27,78 +28,74 @@ std::vector<std::string> split_csv(const std::string& s) {
 }  // namespace
 
 // static
-bool ComparisonConfig::parse_node(void* yyjson_val_ptr, ComparisonNode& node,
-                                  std::string& error) {
-    auto* val = static_cast<yyjson_val*>(yyjson_val_ptr);
-    if (!val || !yyjson_is_obj(val)) {
+bool ComparisonConfig::parse_node(simdjson::dom::element val,
+                                  ComparisonNode& node, std::string& error) {
+    if (!val.is_object()) {
         error = "node must be a JSON object";
         return false;
     }
 
-    yyjson_val* name_val = yyjson_obj_get(val, "name");
-    if (!name_val || !yyjson_is_str(name_val)) {
+    auto name_result = val["name"];
+    if (name_result.error() || !name_result.value_unsafe().is_string()) {
         error = "node missing required string field 'name'";
         return false;
     }
-    node.name = yyjson_get_str(name_val);
+    node.name = std::string(name_result.value_unsafe().get_string().value());
 
-    yyjson_val* query_val = yyjson_obj_get(val, "query");
-    if (query_val && yyjson_is_str(query_val)) {
-        node.query = yyjson_get_str(query_val);
+    auto query_result = val["query"];
+    if (!query_result.error() && query_result.value_unsafe().is_string()) {
+        node.query =
+            std::string(query_result.value_unsafe().get_string().value());
     }
 
-    yyjson_val* gb_val = yyjson_obj_get(val, "group_by");
-    if (gb_val && yyjson_is_arr(gb_val)) {
-        std::size_t idx, max;
-        yyjson_val* elem;
-        yyjson_arr_foreach(gb_val, idx, max, elem) {
-            if (yyjson_is_str(elem)) {
-                node.group_by.push_back(yyjson_get_str(elem));
+    auto gb_result = val["group_by"];
+    if (!gb_result.error() && gb_result.value_unsafe().is_array()) {
+        for (auto elem : gb_result.value_unsafe().get_array()) {
+            if (elem.is_string()) {
+                node.group_by.push_back(std::string(elem.get_string().value()));
             }
         }
     }
 
-    yyjson_val* metrics_val = yyjson_obj_get(val, "metrics");
-    if (metrics_val && yyjson_is_arr(metrics_val)) {
+    auto metrics_result = val["metrics"];
+    if (!metrics_result.error() && metrics_result.value_unsafe().is_array()) {
         std::vector<std::string> metrics;
-        std::size_t idx, max;
-        yyjson_val* elem;
-        yyjson_arr_foreach(metrics_val, idx, max, elem) {
-            if (yyjson_is_str(elem)) {
-                metrics.push_back(yyjson_get_str(elem));
+        for (auto elem : metrics_result.value_unsafe().get_array()) {
+            if (elem.is_string()) {
+                metrics.push_back(std::string(elem.get_string().value()));
             }
         }
         node.metrics = std::move(metrics);
     }
 
-    yyjson_val* pct_val = yyjson_obj_get(val, "percentiles");
-    if (pct_val && yyjson_is_arr(pct_val)) {
+    auto pct_result = val["percentiles"];
+    if (!pct_result.error() && pct_result.value_unsafe().is_array()) {
         std::vector<double> percentiles;
-        std::size_t idx, max;
-        yyjson_val* elem;
-        yyjson_arr_foreach(pct_val, idx, max, elem) {
-            if (yyjson_is_num(elem)) {
-                percentiles.push_back(yyjson_get_num(elem));
+        for (auto elem : pct_result.value_unsafe().get_array()) {
+            if (elem.is_double() || elem.is_int64() || elem.is_uint64()) {
+                percentiles.push_back(elem.get_double().value());
             }
         }
         node.percentiles = std::move(percentiles);
     }
 
-    yyjson_val* thr_val = yyjson_obj_get(val, "threshold_pct");
-    if (thr_val && yyjson_is_num(thr_val)) {
-        node.threshold_pct = yyjson_get_num(thr_val);
+    auto thr_result = val["threshold_pct"];
+    if (!thr_result.error()) {
+        auto thr_val = thr_result.value_unsafe();
+        if (thr_val.is_double() || thr_val.is_int64() || thr_val.is_uint64()) {
+            node.threshold_pct = thr_val.get_double().value();
+        }
     }
 
-    yyjson_val* sort_val = yyjson_obj_get(val, "sort_by");
-    if (sort_val && yyjson_is_str(sort_val)) {
-        node.sort_by = yyjson_get_str(sort_val);
+    auto sort_result = val["sort_by"];
+    if (!sort_result.error() && sort_result.value_unsafe().is_string()) {
+        node.sort_by =
+            std::string(sort_result.value_unsafe().get_string().value());
     }
 
-    yyjson_val* children_val = yyjson_obj_get(val, "children");
-    if (children_val && yyjson_is_arr(children_val)) {
-        std::size_t idx, max;
-        yyjson_val* child_elem;
-        yyjson_arr_foreach(children_val, idx, max, child_elem) {
+    auto children_result = val["children"];
+    if (!children_result.error() && children_result.value_unsafe().is_array()) {
+        for (auto child_elem : children_result.value_unsafe().get_array()) {
             ComparisonNode child;
             if (!parse_node(child_elem, child, error)) return false;
             node.children.push_back(std::move(child));
@@ -111,94 +108,107 @@ bool ComparisonConfig::parse_node(void* yyjson_val_ptr, ComparisonNode& node,
 // static
 std::optional<ComparisonConfig> ComparisonConfig::from_json_file(
     const std::string& path, std::string& error) {
-    yyjson_doc* doc = yyjson_read_file(path.c_str(), 0, nullptr, nullptr);
-    if (!doc) {
+    std::ifstream file(path);  // closed by its destructor on every return path
+    if (!file) {
         error = "failed to read or parse JSON file: " + path;
         return std::nullopt;
     }
+    std::string content((std::istreambuf_iterator<char>(file)),
+                        std::istreambuf_iterator<char>());  // read the whole file into memory
+
+    simdjson::dom::parser parser;
+    auto result = parser.parse(content);
+    if (result.error()) {
+        error = "failed to parse JSON file: " + path;
+        return std::nullopt;
+    }
 
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root || !yyjson_is_obj(root)) {
-        yyjson_doc_free(doc);
+    auto root = result.value_unsafe();  // error() was checked just above
+    if (!root.is_object()) {
         error = "JSON root must be an object";
         return std::nullopt;
     }
 
     ComparisonConfig cfg;
 
-    yyjson_val* baseline_val = yyjson_obj_get(root, "baseline");
-    if (!baseline_val || !yyjson_is_str(baseline_val)) {
-        yyjson_doc_free(doc);
+    auto baseline_result = root["baseline"];  // required field
+    if (baseline_result.error() ||
+        !baseline_result.value_unsafe().is_string()) {
         error = "missing required string field 'baseline'";
         return std::nullopt;
     }
-    cfg.baseline = yyjson_get_str(baseline_val);
+    cfg.baseline =
+        std::string(baseline_result.value_unsafe().get_string().value());  // copied into an owning string
 
-    yyjson_val* variant_val = yyjson_obj_get(root, "variant");
-    if (!variant_val || !yyjson_is_str(variant_val)) {
-        yyjson_doc_free(doc);
+    auto variant_result = root["variant"];  // required field
+    if (variant_result.error() || !variant_result.value_unsafe().is_string()) {
         error = "missing required string field 'variant'";
         return std::nullopt;
     }
-    cfg.variant = yyjson_get_str(variant_val);
+    cfg.variant =
+        std::string(variant_result.value_unsafe().get_string().value());  // copied into an owning string
 
-    yyjson_val* defaults_val = yyjson_obj_get(root, "defaults");
-    if (defaults_val && yyjson_is_obj(defaults_val)) {
-        yyjson_val* dm = yyjson_obj_get(defaults_val, "metrics");
-        if (dm && yyjson_is_arr(dm)) {
+    auto defaults_result = root["defaults"];  // optional; missing or non-object is silently ignored
+    if (!defaults_result.error() &&
+        defaults_result.value_unsafe().is_object()) {
+        auto defaults_val = defaults_result.value_unsafe();
+
+        auto dm_result = defaults_val["metrics"];
+        if (!dm_result.error() && dm_result.value_unsafe().is_array()) {
             cfg.defaults.metrics.clear();
-            std::size_t idx, max;
-            yyjson_val* elem;
-            yyjson_arr_foreach(dm, idx, max, elem) {
-                if (yyjson_is_str(elem)) {
-                    cfg.defaults.metrics.push_back(yyjson_get_str(elem));
+            for (auto elem : dm_result.value_unsafe().get_array()) {
+                if (elem.is_string()) {  // non-string entries are skipped, not reported
+                    cfg.defaults.metrics.push_back(
+                        std::string(elem.get_string().value()));
                 }
             }
         }
 
-        yyjson_val* dp = yyjson_obj_get(defaults_val, "percentiles");
-        if (dp && yyjson_is_arr(dp)) {
+        auto dp_result = defaults_val["percentiles"];
+        if (!dp_result.error() && dp_result.value_unsafe().is_array()) {
             cfg.defaults.percentiles.clear();
-            std::size_t idx, max;
-            yyjson_val* elem;
-            yyjson_arr_foreach(dp, idx, max, elem) {
-                if (yyjson_is_num(elem)) {
-                    cfg.defaults.percentiles.push_back(yyjson_get_num(elem));
+            for (auto elem : dp_result.value_unsafe().get_array()) {
+                if (elem.is_double() || elem.is_int64() || elem.is_uint64()) {  // any JSON number; other entries are skipped
+                    cfg.defaults.percentiles.push_back(
+                        elem.get_double().value());
                 }
             }
         }
 
-        yyjson_val* dt = yyjson_obj_get(defaults_val, "threshold_pct");
-        if (dt && yyjson_is_num(dt)) {
-            cfg.defaults.threshold_pct = yyjson_get_num(dt);
+        auto dt_result = defaults_val["threshold_pct"];
+        if (!dt_result.error()) {
+            auto dt_val = dt_result.value_unsafe();
+            if (dt_val.is_double() || dt_val.is_int64() || dt_val.is_uint64()) {  // non-numeric values leave the default in place
+                cfg.defaults.threshold_pct = dt_val.get_double().value();
+            }
         }
 
-        yyjson_val* ti = yyjson_obj_get(defaults_val, "time_interval_ms");
-        if (ti && yyjson_is_num(ti)) {
-            cfg.defaults.time_interval_ms = yyjson_get_num(ti);
+        auto ti_result = defaults_val["time_interval_ms"];
+        if (!ti_result.error()) {
+            auto ti_val = ti_result.value_unsafe();
+            if (ti_val.is_double() || ti_val.is_int64() || ti_val.is_uint64()) {  // non-numeric values leave the default in place
+                cfg.defaults.time_interval_ms = ti_val.get_double().value();
+            }
         }
 
-        yyjson_val* ds = yyjson_obj_get(defaults_val, "sort_by");
-        if (ds && yyjson_is_str(ds)) {
-            cfg.defaults.sort_by = yyjson_get_str(ds);
+        auto ds_result = defaults_val["sort_by"];
+        if (!ds_result.error() && ds_result.value_unsafe().is_string()) {
+            cfg.defaults.sort_by =
+                std::string(ds_result.value_unsafe().get_string().value());
         }
     }
 
-    yyjson_val* nodes_val = yyjson_obj_get(root, "nodes");
-    if (nodes_val && yyjson_is_arr(nodes_val)) {
-        std::size_t idx, max;
-        yyjson_val* node_elem;
-        yyjson_arr_foreach(nodes_val, idx, max, node_elem) {
+    auto nodes_result = root["nodes"];  // optional; missing or non-array leaves cfg.nodes untouched
+    if (!nodes_result.error() && nodes_result.value_unsafe().is_array()) {
+        for (auto node_elem : nodes_result.value_unsafe().get_array()) {  // first node that fails to parse aborts the load
             ComparisonNode node;
             if (!parse_node(node_elem, node, error)) {
-                yyjson_doc_free(doc);
                 return std::nullopt;
             }
             cfg.nodes.push_back(std::move(node));
         }
     }
 
-    yyjson_doc_free(doc);
     return cfg;
 }
 
diff --git a/src/dftracer/utils/utilities/composites/dft/comparator/comparison_result.cpp b/src/dftracer/utils/utilities/composites/dft/comparator/comparison_result.cpp
index 06f5c829..f1e88fa5 100644
--- a/src/dftracer/utils/utilities/composites/dft/comparator/comparison_result.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/comparator/comparison_result.cpp
@@ -87,8 +87,16 @@ std::vector<MetricComparison> build_metadata_metrics(
 double compute_cohens_d(const MetricStats& base, std::uint64_t n_base,
                         const MetricStats& var, std::uint64_t n_var) {
     if (n_base < 2 || n_var < 2) return 0.0;
-    double var_base = base.m2 / static_cast<double>(n_base);
-    double var_var = var.m2 / static_cast<double>(n_var);
+    // `m2` now holds the raw power sum sum_x^2 (not Welford central M2).
+    // Convert to population variance: Var = (sum_x^2 - (sum_x)^2 / n) / n.
+    auto pop_var = [](const MetricStats& ms, std::uint64_t n) {
+        const double nd = static_cast<double>(n);
+        const double sx = static_cast<double>(ms.total);
+        const double central = ms.m2 - sx * sx / nd;
+        return (central > 0.0 ? central : 0.0) / nd;
+    };
+    double var_base = pop_var(base, n_base);
+    double var_var = pop_var(var, n_var);
     double pooled = std::sqrt((var_base + var_var) / 2.0);
     if (pooled < 1e-15) return 0.0;
     return (var.mean - base.mean) / pooled;
diff --git a/src/dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.cpp b/src/dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.cpp
index 6be146b4..8fb0c65e 100644
--- a/src/dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.cpp
@@ -1,5 +1,5 @@
+#include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/utilities/composites/dft/comparator/tree_table_formatter.h>
-#include <yyjson.h>
 
 #include <algorithm>
 #include <cstdio>
@@ -625,81 +625,119 @@ const char* sig_str(Significance s) {
     return "NEGLIGIBLE";
 }
 
-yyjson_mut_val* build_metric_json(yyjson_mut_doc* doc,
-                                  const MetricComparison& mc) {
+// Escapes `s` for use inside a JSON string literal (RFC 8259, section 7).
+// Quote, backslash and the common control characters get their two-character
+// escapes; any other byte below 0x20 becomes \u00XX so the output stays valid
+// JSON. Everything else (including UTF-8 multi-byte data) is copied as-is.
+std::string escape_json_string(const std::string& s) {
+    std::string result;
+    result.reserve(s.size());
+    for (const char c : s) {
+        const unsigned uc = static_cast<unsigned char>(c);
+        if (c == '"') {
+            result += "\\\"";
+        } else if (c == '\\') {
+            result += "\\\\";
+        } else if (c == '\b') {
+            result += "\\b";
+        } else if (c == '\f') {
+            result += "\\f";
+        } else if (c == '\n') {
+            result += "\\n";
+        } else if (c == '\r') {
+            result += "\\r";
+        } else if (c == '\t') {
+            result += "\\t";
+        } else if (uc < 0x20) {
+            char buf[8];
+            std::snprintf(buf, sizeof(buf), "\\u%04x", uc);
+            result += buf;
+        } else {
+            result += c;
+        }
+    }
+    return result;
+}
+
+// Formats `v` as a JSON number with 15 significant digits; NaN/Inf become 0.
+std::string double_to_json(double v) {
+    char text[32] = "0";
+    if (std::isfinite(v)) std::snprintf(text, sizeof(text), "%.15g", v);
+    return text;
+}
+
+void build_metric_json(std::ostringstream& out, const MetricComparison& mc) {
     auto safe = [](double v) { return std::isfinite(v) ? v : 0.0; };
-    yyjson_mut_val* obj = yyjson_mut_obj(doc);
-    yyjson_mut_obj_add_str(doc, obj, "name", mc.metric_name.c_str());
-    yyjson_mut_obj_add_real(doc, obj, "baseline", safe(mc.baseline_value));
-    yyjson_mut_obj_add_real(doc, obj, "variant", safe(mc.variant_value));
-    yyjson_mut_obj_add_real(doc, obj, "delta", safe(mc.delta));
-    yyjson_mut_obj_add_real(doc, obj, "pct_change", safe(mc.pct_change));
-    yyjson_mut_obj_add_real(doc, obj, "cohens_d", safe(mc.cohens_d));
-    yyjson_mut_obj_add_str(doc, obj, "significance", sig_str(mc.significance));
-    yyjson_mut_obj_add_bool(doc, obj, "is_regression", mc.is_regression);
-    return obj;
-}
-
-yyjson_mut_val* build_metrics_arr(yyjson_mut_doc* doc,
-                                  const std::vector<MetricComparison>& ms) {
-    yyjson_mut_val* arr = yyjson_mut_arr(doc);
-    for (const auto& mc : ms) {
-        yyjson_mut_arr_append(arr, build_metric_json(doc, mc));
+    out << "{";
+    out << "\"name\":\"" << escape_json_string(mc.metric_name) << "\",";
+    out << "\"baseline\":" << double_to_json(safe(mc.baseline_value)) << ",";
+    out << "\"variant\":" << double_to_json(safe(mc.variant_value)) << ",";
+    out << "\"delta\":" << double_to_json(safe(mc.delta)) << ",";
+    out << "\"pct_change\":" << double_to_json(safe(mc.pct_change)) << ",";
+    out << "\"cohens_d\":" << double_to_json(safe(mc.cohens_d)) << ",";
+    out << "\"significance\":\"" << sig_str(mc.significance) << "\",";
+    out << "\"is_regression\":" << (mc.is_regression ? "true" : "false");
+    out << "}";
+}
+
+void build_metrics_arr(std::ostringstream& out,
+                       const std::vector<MetricComparison>& ms) {
+    out << "[";
+    for (std::size_t i = 0; i < ms.size(); ++i) {
+        if (i > 0) out << ",";
+        build_metric_json(out, ms[i]);
     }
-    return arr;
+    out << "]";
 }
 
-yyjson_mut_val* build_group_json(yyjson_mut_doc* doc,
-                                 const GroupComparison& g) {
-    yyjson_mut_val* obj = yyjson_mut_obj(doc);
-    yyjson_mut_obj_add_str(doc, obj, "label", g.label.c_str());
-    yyjson_mut_obj_add_val(doc, obj, "metrics",
-                           build_metrics_arr(doc, g.metrics));
-    return obj;
+void build_group_json(std::ostringstream& out, const GroupComparison& g) {
+    out << "{";
+    out << "\"label\":\"" << escape_json_string(g.label) << "\",";
+    out << "\"metrics\":";
+    build_metrics_arr(out, g.metrics);
+    out << "}";
 }
 
-yyjson_mut_val* build_node_json(yyjson_mut_doc* doc, const NodeResult& node);
+void build_node_json(std::ostringstream& out, const NodeResult& node);
 
-yyjson_mut_val* build_node_json(yyjson_mut_doc* doc, const NodeResult& node) {
-    yyjson_mut_val* obj = yyjson_mut_obj(doc);
-    yyjson_mut_obj_add_str(doc, obj, "name", node.name.c_str());
-    yyjson_mut_obj_add_str(doc, obj, "query", node.composed_query.c_str());
+void build_node_json(std::ostringstream& out, const NodeResult& node) {
+    out << "{";
+    out << "\"name\":\"" << escape_json_string(node.name) << "\",";
+    out << "\"query\":\"" << escape_json_string(node.composed_query) << "\",";
 
     // summary
-    yyjson_mut_val* summary = yyjson_mut_obj(doc);
-    yyjson_mut_obj_add_val(doc, summary, "metrics",
-                           build_metrics_arr(doc, node.summary.metrics));
-    yyjson_mut_obj_add_val(doc, obj, "summary", summary);
+    out << "\"summary\":{\"metrics\":";
+    build_metrics_arr(out, node.summary.metrics);
+    out << "},";
 
     // groups
-    yyjson_mut_val* groups_arr = yyjson_mut_arr(doc);
-    for (const auto& g : node.groups) {
-        yyjson_mut_arr_append(groups_arr, build_group_json(doc, g));
+    out << "\"groups\":[";
+    for (std::size_t i = 0; i < node.groups.size(); ++i) {
+        if (i > 0) out << ",";
+        build_group_json(out, node.groups[i]);
     }
-    yyjson_mut_obj_add_val(doc, obj, "groups", groups_arr);
+    out << "],";
 
     // children
-    yyjson_mut_val* children_arr = yyjson_mut_arr(doc);
-    for (const auto& child : node.children) {
-        yyjson_mut_arr_append(children_arr, build_node_json(doc, child));
+    out << "\"children\":[";
+    for (std::size_t i = 0; i < node.children.size(); ++i) {
+        if (i > 0) out << ",";
+        build_node_json(out, node.children[i]);
     }
-    yyjson_mut_obj_add_val(doc, obj, "children", children_arr);
+    out << "]";
 
-    return obj;
+    out << "}";
 }
 
-yyjson_mut_val* build_meta_json(yyjson_mut_doc* doc, const TraceMetadata& m) {
-    yyjson_mut_val* obj = yyjson_mut_obj(doc);
-    yyjson_mut_obj_add_int(doc, obj, "files",
-                           static_cast<int64_t>(m.file_count));
-    yyjson_mut_obj_add_int(doc, obj, "processes",
-                           static_cast<int64_t>(m.process_count));
-    yyjson_mut_obj_add_int(doc, obj, "threads",
-                           static_cast<int64_t>(m.thread_count));
-    yyjson_mut_obj_add_real(doc, obj, "total_bytes", m.total_bytes);
-    yyjson_mut_obj_add_real(doc, obj, "total_io_time_us", m.total_io_time_us);
-    yyjson_mut_obj_add_real(doc, obj, "makespan_us", m.makespan_us);
-    return obj;
+void build_meta_json(std::ostringstream& out, const TraceMetadata& m) {
+    out << "{";
+    out << "\"files\":" << m.file_count << ",";
+    out << "\"processes\":" << m.process_count << ",";
+    out << "\"threads\":" << m.thread_count << ",";
+    out << "\"total_bytes\":" << double_to_json(m.total_bytes) << ",";
+    out << "\"total_io_time_us\":" << double_to_json(m.total_io_time_us) << ",";
+    out << "\"makespan_us\":" << double_to_json(m.makespan_us);
+    out << "}";
 }
 
 }  // namespace
@@ -710,40 +748,31 @@ yyjson_mut_val* build_meta_json(yyjson_mut_doc* doc, const TraceMetadata& m) {
 
 std::string TreeTableFormatter::render_json(
     const ComparisonOutput& output) const {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* root = yyjson_mut_obj(doc);
-    yyjson_mut_doc_set_root(doc, root);
-
-    yyjson_mut_obj_add_str(doc, root, "baseline", output.baseline_path.c_str());
-    yyjson_mut_obj_add_str(doc, root, "variant", output.variant_path.c_str());
-    yyjson_mut_obj_add_val(doc, root, "baseline_meta",
-                           build_meta_json(doc, output.baseline_meta));
-    yyjson_mut_obj_add_val(doc, root, "variant_meta",
-                           build_meta_json(doc, output.variant_meta));
-    yyjson_mut_obj_add_real(doc, root, "execution_time_ms",
-                            output.execution_time_ms);
-
-    yyjson_mut_val* nodes_arr = yyjson_mut_arr(doc);
-    for (const auto& node : output.nodes) {
-        yyjson_mut_arr_append(nodes_arr, build_node_json(doc, node));
-    }
-    yyjson_mut_obj_add_val(doc, root, "nodes", nodes_arr);
-
-    yyjson_write_err write_err = {};
-    std::size_t json_len = 0;
-    char* json = yyjson_mut_write_opts(doc, YYJSON_WRITE_PRETTY, nullptr,
-                                       &json_len, &write_err);
-    if (!json) {
-        yyjson_mut_doc_free(doc);
-        throw std::runtime_error(
-            std::string("JSON serialization failed: ") +
-            (write_err.msg ? write_err.msg : "unknown error"));
+    std::ostringstream out;
+
+    out << "{";
+    out << "\"baseline\":\"" << escape_json_string(output.baseline_path)
+        << "\",";
+    out << "\"variant\":\"" << escape_json_string(output.variant_path) << "\",";
+    out << "\"baseline_meta\":";
+    build_meta_json(out, output.baseline_meta);
+    out << ",";
+    out << "\"variant_meta\":";
+    build_meta_json(out, output.variant_meta);
+    out << ",";
+    out << "\"execution_time_ms\":" << double_to_json(output.execution_time_ms)
+        << ",";
+
+    out << "\"nodes\":[";
+    for (std::size_t i = 0; i < output.nodes.size(); ++i) {
+        if (i > 0) out << ",";
+        build_node_json(out, output.nodes[i]);
     }
-    std::string result(json, json_len);
-    free(json);  // NOLINT(cppcoreguidelines-no-malloc)
-    yyjson_mut_doc_free(doc);
+    out << "]";
 
-    return result;
+    out << "}";
+
+    return out.str();
 }
 
 }  // namespace dftracer::utils::utilities::composites::dft::comparator
diff --git a/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp b/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp
index 5a6ebee1..c586bac1 100644
--- a/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/event_collector_utility.cpp
@@ -6,7 +6,7 @@
 #include <dftracer/utils/utilities/fileio/lines/sources/async_plain_file_line_generator.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer_factory.h>
 #include <dftracer/utils/utilities/reader/internal/reader_factory.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <algorithm>
 
@@ -19,6 +19,7 @@ class EventIdCollector : public reader::internal::LineProcessor {
    public:
     std::vector<EventId>& events;
     bool trim_commas;
+    simdjson::dom::parser parser;
 
     explicit EventIdCollector(std::vector<EventId>& event_list,
                               bool should_trim_commas = false)
@@ -29,7 +30,6 @@ class EventIdCollector : public reader::internal::LineProcessor {
         const char* trimmed;
         std::size_t trimmed_length;
 
-        // Use comma-trimming variant if requested (for JSON array format)
         bool valid = trim_commas ? json_trim_and_validate_with_comma(
                                        data, length, trimmed, trimmed_length)
                                  : json_trim_and_validate(data, length, trimmed,
@@ -39,36 +39,32 @@ class EventIdCollector : public reader::internal::LineProcessor {
             co_return true;
         }
 
-        yyjson_doc* doc = yyjson_read(trimmed, trimmed_length, 0);
-        if (!doc) co_return true;
+        auto result = parser.parse(trimmed, trimmed_length);
+        if (result.error()) co_return true;
 
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        if (!yyjson_is_obj(root)) {
-            yyjson_doc_free(doc);
-            co_return true;
-        }
+        auto root = result.value_unsafe();
+        if (!root.is_object()) co_return true;
 
         EventId event;
-        yyjson_val* id_val = yyjson_obj_get(root, "id");
-        if (id_val && yyjson_is_int(id_val)) {
-            event.id = yyjson_get_int(id_val);
+        auto id_result = root["id"].get_int64();
+        if (!id_result.error()) {
+            event.id = id_result.value_unsafe();
         }
 
-        yyjson_val* pid_val = yyjson_obj_get(root, "pid");
-        if (pid_val && yyjson_is_int(pid_val)) {
-            event.pid = yyjson_get_int(pid_val);
+        auto pid_result = root["pid"].get_int64();
+        if (!pid_result.error()) {
+            event.pid = pid_result.value_unsafe();
         }
 
-        yyjson_val* tid_val = yyjson_obj_get(root, "tid");
-        if (tid_val && yyjson_is_int(tid_val)) {
-            event.tid = yyjson_get_int(tid_val);
+        auto tid_result = root["tid"].get_int64();
+        if (!tid_result.error()) {
+            event.tid = tid_result.value_unsafe();
         }
 
         if (event.is_valid()) {
             events.push_back(event);
         }
 
-        yyjson_doc_free(doc);
         co_return true;
     }
 };
diff --git a/src/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp b/src/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp
index 095022e6..b3edb434 100644
--- a/src/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/event_id_extractor_utility.cpp
@@ -1,6 +1,6 @@
 #include <dftracer/utils/core/coro/task.h>
 #include <dftracer/utils/utilities/composites/dft/event_id_extractor_utility.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 namespace dftracer::utils::utilities::composites::dft {
 
@@ -8,37 +8,32 @@ coro::CoroTask<EventIdExtractionOutput> EventIdExtractor::process(
     const EventIdExtractionInput& input) {
     EventId event;
 
-    yyjson_doc* doc =
-        yyjson_read(input.json_data.data(), input.json_data.size(), 0);
-    if (!doc) {
-        co_return event;  // Invalid JSON
+    simdjson::dom::parser parser;
+    auto result = parser.parse(input.json_data.data(), input.json_data.size());
+    if (result.error()) {
+        co_return event;
     }
 
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!yyjson_is_obj(root)) {
-        yyjson_doc_free(doc);
-        co_return event;  // Not a JSON object
+    auto root = result.value_unsafe();
+    if (!root.is_object()) {
+        co_return event;
     }
 
-    // Extract id
-    yyjson_val* id_val = yyjson_obj_get(root, "id");
-    if (id_val && yyjson_is_int(id_val)) {
-        event.id = yyjson_get_int(id_val);
+    auto id_result = root["id"].get_int64();
+    if (!id_result.error()) {
+        event.id = id_result.value_unsafe();
     }
 
-    // Extract pid
-    yyjson_val* pid_val = yyjson_obj_get(root, "pid");
-    if (pid_val && yyjson_is_int(pid_val)) {
-        event.pid = yyjson_get_int(pid_val);
+    auto pid_result = root["pid"].get_int64();
+    if (!pid_result.error()) {
+        event.pid = pid_result.value_unsafe();
     }
 
-    // Extract tid
-    yyjson_val* tid_val = yyjson_obj_get(root, "tid");
-    if (tid_val && yyjson_is_int(tid_val)) {
-        event.tid = yyjson_get_int(tid_val);
+    auto tid_result = root["tid"].get_int64();
+    if (!tid_result.error()) {
+        event.tid = tid_result.value_unsafe();
     }
 
-    yyjson_doc_free(doc);
     co_return event;
 }
 
diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.cpp
index a9bdc02c..edc9c96e 100644
--- a/src/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/indexing/bloom_filter.cpp
@@ -8,11 +8,23 @@
 namespace dftracer::utils::utilities::composites::dft::indexing {
 
 namespace {
-constexpr std::size_t HEADER_SIZE =
-    12;  // 4 bytes num_hashes + 4 bytes num_entries + 4 bytes num_bits
+constexpr std::size_t HEADER_SIZE = 12;
+constexpr std::size_t BLOCK_BYTES = 32;  // 8 x u32 = 256 bits
+constexpr std::size_t BLOCK_BITS = BLOCK_BYTES * 8;
+constexpr std::size_t BLOCK_WORDS = BLOCK_BYTES / 4;
+
+// Split block Bloom filter SALT array, taken verbatim from the Apache
+// Parquet spec (parquet-format/BloomFilter.md). Eight odd 32-bit
+// constants; each (h2 * SALT[i]) >> 27 picks one of 32 bits in word i
+// of the 256-bit block, with the 8 bit-selectors empirically
+// uncorrelated. See Jim Apple, "Split block Bloom filters", arXiv:2101.01719.
+constexpr std::uint32_t SALT[BLOCK_WORDS] = {
+    0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
+    0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U,
+};
 
 void write_u32_le(unsigned char* buf, std::uint32_t val) {
-    if (!buf) return;  // Defensive check to silence compiler warning
+    if (!buf) return;
     buf[0] = static_cast<unsigned char>(val & 0xFF);
     buf[1] = static_cast<unsigned char>((val >> 8) & 0xFF);
     buf[2] = static_cast<unsigned char>((val >> 16) & 0xFF);
@@ -25,6 +37,20 @@ std::uint32_t read_u32_le(const unsigned char* buf) {
            (static_cast<std::uint32_t>(buf[2]) << 16) |
            (static_cast<std::uint32_t>(buf[3]) << 24);
 }
+
+inline std::size_t block_index(std::uint64_t h1, std::size_t num_blocks) {
+    const __uint128_t wide = static_cast<__uint128_t>(h1) * num_blocks;
+    return static_cast<std::size_t>(wide >> 64);  // high half of the product
+}
+
+// Builds the per-word mask for one block: a single bit is set in each of the
+// BLOCK_WORDS 32-bit words, chosen by the top 5 bits of (h32 * SALT[i]).
+inline void compute_block_mask(std::uint64_t h2,
+                               std::uint32_t (&out)[BLOCK_WORDS]) {
+    const std::uint32_t h32 = static_cast<std::uint32_t>(h2 ^ (h2 >> 32));
+    std::size_t word = 0;
+    for (std::uint32_t salt : SALT) out[word++] = 1U << ((h32 * salt) >> 27);
+}
 }  // namespace
 
 std::size_t BloomFilter::optimal_num_bits(std::size_t n, double p) {
@@ -34,7 +60,12 @@ std::size_t BloomFilter::optimal_num_bits(std::size_t n, double p) {
     auto m = static_cast<std::size_t>(
         std::ceil(-static_cast<double>(n) * std::log(p) /
                   (std::log(2.0) * std::log(2.0))));
-    return std::max(m, static_cast<std::size_t>(64));
+    // Round up to a whole number of 256-bit blocks. Blocked bloom filters
+    // pay ~10-15% extra memory for the same FPR vs classical; bump the
+    // requested bit count to compensate before rounding.
+    m = static_cast<std::size_t>(static_cast<double>(m) * 1.15);
+    m = std::max(m, BLOCK_BITS);
+    return ((m + BLOCK_BITS - 1) / BLOCK_BITS) * BLOCK_BITS;
 }
 
 std::size_t BloomFilter::optimal_num_hashes(std::size_t m, std::size_t n) {
@@ -49,8 +80,7 @@ BloomFilter::BloomFilter(std::size_t expected_entries,
     : num_bits_(optimal_num_bits(expected_entries, false_positive_rate)),
       num_hashes_(optimal_num_hashes(num_bits_, expected_entries)),
       num_entries_(0) {
-    std::size_t num_bytes = (num_bits_ + 7) / 8;
-    bits_.resize(num_bytes, 0);
+    bits_.assign(num_bits_ / 8, 0);
 }
 
 BloomFilter::BloomFilter(std::vector<unsigned char> bits, std::size_t num_bits,
@@ -85,41 +115,67 @@ void BloomFilter::compute_hashes(std::string_view value, std::uint64_t& h1,
                                  std::uint64_t& h2) const {
     hasher_.reset();
     hasher_.update(value);
-    h1 = hasher_.get_hash().value;
-    // Second hash: mix with a different seed using FNV-like mixing
-    std::uint64_t seed = 0x517cc1b727220a95ULL;
-    h2 = h1 * seed + 0x9e3779b97f4a7c15ULL;
-    h2 ^= (h2 >> 33);
-    h2 *= 0xff51afd7ed558ccdULL;
-    h2 ^= (h2 >> 33);
+    std::uint64_t raw = hasher_.get_hash().value;
+    // FNV-1a leaves correlated high bits for similar short keys, which
+    // breaks Lemire reduction in the blocked path. Run a SplitMix64-style
+    // finisher to fully avalanche, then derive a second hash for masking.
+    h1 = raw;
+    h1 = (h1 ^ (h1 >> 30)) * 0xbf58476d1ce4e5b9ULL;
+    h1 = (h1 ^ (h1 >> 27)) * 0x94d049bb133111ebULL;
+    h1 ^= (h1 >> 31);
+    h2 = raw + 0x9e3779b97f4a7c15ULL;
+    h2 = (h2 ^ (h2 >> 30)) * 0xbf58476d1ce4e5b9ULL;
+    h2 = (h2 ^ (h2 >> 27)) * 0x94d049bb133111ebULL;
+    h2 ^= (h2 >> 31);
 }
 
 std::size_t BloomFilter::nth_hash(std::uint64_t h1, std::uint64_t h2,
                                   std::size_t n) const {
-    // Kirsch-Mitzenmacher: g_i(x) = h1(x) + i * h2(x)
     return static_cast<std::size_t>((h1 + n * h2) % num_bits_);
 }
 
 void BloomFilter::add(std::string_view value) {
+    if (last_value_valid_ && value.size() == last_value_size_ &&
+        std::memcmp(last_value_buf_.data(), value.data(), value.size()) == 0) {
+        ++num_entries_;
+        return;
+    }
+
     std::uint64_t h1, h2;
     compute_hashes(value, h1, h2);
 
-    for (std::size_t i = 0; i < num_hashes_; ++i) {
-        std::size_t bit_pos = nth_hash(h1, h2, i);
-        bits_[bit_pos / 8] |= static_cast<std::uint8_t>(1u << (bit_pos % 8));
-    }
+    std::size_t num_blocks = num_bits_ / BLOCK_BITS;
+    std::size_t blk = block_index(h1, num_blocks);
+    auto* block =
+        reinterpret_cast<std::uint32_t*>(bits_.data() + blk * BLOCK_BYTES);
+
+    std::uint32_t mask[BLOCK_WORDS];
+    compute_block_mask(h2, mask);
+    for (std::size_t i = 0; i < BLOCK_WORDS; ++i) block[i] |= mask[i];
     ++num_entries_;
+
+    if (value.size() <= LAST_VALUE_CAP) {
+        std::memcpy(last_value_buf_.data(), value.data(), value.size());
+        last_value_size_ = value.size();
+        last_value_valid_ = true;
+    } else {
+        last_value_valid_ = false;
+    }
 }
 
 bool BloomFilter::possibly_contains(std::string_view value) const {
     std::uint64_t h1, h2;
     compute_hashes(value, h1, h2);
 
-    for (std::size_t i = 0; i < num_hashes_; ++i) {
-        std::size_t bit_pos = nth_hash(h1, h2, i);
-        if (!(bits_[bit_pos / 8] & (1u << (bit_pos % 8)))) {
-            return false;
-        }
+    std::size_t num_blocks = num_bits_ / BLOCK_BITS;
+    std::size_t blk = block_index(h1, num_blocks);
+    const auto* block = reinterpret_cast<const std::uint32_t*>(
+        bits_.data() + blk * BLOCK_BYTES);
+
+    std::uint32_t mask[BLOCK_WORDS];
+    compute_block_mask(h2, mask);
+    for (std::size_t i = 0; i < BLOCK_WORDS; ++i) {
+        if ((block[i] & mask[i]) != mask[i]) return false;
     }
     return true;
 }
@@ -131,9 +187,11 @@ void BloomFilter::merge_from(const BloomFilter& other) {
             "BloomFilter::merge_from: incompatible filter parameters");
     }
 
-    for (std::size_t i = 0; i < bits_.size(); ++i) {
-        bits_[i] |= other.bits_[i];
-    }
+    auto* dst = reinterpret_cast<std::uint64_t*>(bits_.data());
+    const auto* src =
+        reinterpret_cast<const std::uint64_t*>(other.bits_.data());
+    std::size_t n = bits_.size() / 8;
+    for (std::size_t i = 0; i < n; ++i) dst[i] |= src[i];
     num_entries_ += other.num_entries_;
 }
 
diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp
index 8866a804..2f436be5 100644
--- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.cpp
@@ -8,48 +8,52 @@
 namespace dftracer::utils::utilities::composites::dft::indexing {
 
 void ChunkDimensionStats::observe(std::string_view value) {
+    if (last_key_ != nullptr && *last_key_ == value) {
+        ++*last_counter_;
+        return;
+    }
+
     if (!value_counts) {
         value_counts.emplace();
     }
 
-    // NOTE(perf): transparent lookup: find with string_view, only construct
-    // string on insert
     auto it = value_counts->find(value);
-    bool inserted = false;
-    if (it == value_counts->end()) {
-        auto [new_it, _] = value_counts->emplace(std::string(value), 0);
-        it = new_it;
-        inserted = true;
+    if (it != value_counts->end()) {
+        it->second++;
+        last_key_ = &it->first;
+        last_counter_ = &it->second;
+        return;
     }
-    it->second++;
 
-    if (inserted) {
-        distinct_count = value_counts->size();
-    }
+    auto [new_it, _] = value_counts->emplace(std::string(value), 1);
+    it = new_it;
+    distinct_count = value_counts->size();
+    last_key_ = &it->first;
+    last_counter_ = &it->second;
+
+    const std::string& val_ref = it->first;
 
-    // NOTE(perf): min/max: fast-path for uint dimensions compare as integers
     if (value_type == "uint") {
         std::uint64_t val = 0;
         auto [ptr, ec] =
             std::from_chars(value.data(), value.data() + value.size(), val);
         if (ec == std::errc()) {
             if (min_value.empty()) {
-                min_value = it->first;
-                max_value = it->first;
+                min_value = val_ref;
+                max_value = val_ref;
             } else {
                 std::uint64_t cur_min = 0, cur_max = 0;
                 std::from_chars(min_value.data(),
                                 min_value.data() + min_value.size(), cur_min);
                 std::from_chars(max_value.data(),
                                 max_value.data() + max_value.size(), cur_max);
-                if (val < cur_min) min_value = it->first;
-                if (val > cur_max) max_value = it->first;
+                if (val < cur_min) min_value = val_ref;
+                if (val > cur_max) max_value = val_ref;
             }
             return;
         }
     }
 
-    const std::string& val_ref = it->first;
     if (min_value.empty() || val_ref < min_value) {
         min_value = val_ref;
     }
@@ -58,6 +62,23 @@ void ChunkDimensionStats::observe(std::string_view value) {
     }
 }
 
+void ChunkDimensionStats::observe_range_only(std::uint64_t value) {
+    // Bumps distinct_count and widens the decimal-string min/max bounds.
+    const auto as_u64 = [](const auto& text) {
+        std::uint64_t parsed = 0;
+        std::from_chars(text.data(), text.data() + text.size(), parsed);
+        return parsed;
+    };
+    distinct_count++;
+    const auto str = std::to_string(value);
+    if (min_value.empty()) {
+        min_value = max_value = str;
+        return;
+    }
+    if (value < as_u64(min_value)) min_value = str;
+    if (value > as_u64(max_value)) max_value = str;
+}
+
 std::vector<std::uint8_t> ChunkDimensionStats::serialize_value_counts() const {
     if (!value_counts || value_counts->empty()) return {};
 
@@ -99,23 +120,10 @@ ChunkDimensionStats::compress_value_counts(std::size_t cap_bytes) const {
     auto raw = serialize_value_counts();
     if (raw.empty()) return std::nullopt;
 
-    // NOTE(perf): Reuse zlib stream across calls, deflateReset resets state
-    // without reallocating internal buffers.
-    struct ZlibDeflater {
-        z_stream strm{};
-        bool init = false;
-        ~ZlibDeflater() {
-            if (init) deflateEnd(&strm);
-        }
-    };
-    static thread_local ZlibDeflater zd;
-    if (!zd.init) {
-        deflateInit(&zd.strm, Z_DEFAULT_COMPRESSION);
-        zd.init = true;
-    } else {
-        deflateReset(&zd.strm);
+    z_stream strm{};
+    if (deflateInit(&strm, Z_DEFAULT_COMPRESSION) != Z_OK) {
+        return std::nullopt;
     }
-    auto& strm = zd.strm;
 
     uLongf compressed_len = compressBound(static_cast<uLong>(raw.size()));
     std::vector<std::uint8_t> compressed(compressed_len);
@@ -126,9 +134,13 @@ ChunkDimensionStats::compress_value_counts(std::size_t cap_bytes) const {
     strm.avail_out = static_cast<uInt>(compressed_len);
 
     int rc = deflate(&strm, Z_FINISH);
-    if (rc != Z_STREAM_END) return std::nullopt;
+    if (rc != Z_STREAM_END) {
+        deflateEnd(&strm);
+        return std::nullopt;
+    }
 
     compressed.resize(strm.total_out);
+    deflateEnd(&strm);
     if (compressed.size() > cap_bytes) return std::nullopt;
 
     return compressed;
@@ -153,10 +165,10 @@ std::uint64_t read_u64_le(const std::uint8_t* p) {
 }
 }  // namespace
 
-std::unordered_map<std::string, std::uint64_t>
+dftracer::utils::StringViewMap<std::uint64_t>
 ChunkDimensionStats::deserialize_value_counts(const std::uint8_t* data,
                                               std::size_t len) {
-    std::unordered_map<std::string, std::uint64_t> result;
+    dftracer::utils::StringViewMap<std::uint64_t> result;
     if (!data || len < 4) return result;
 
     std::size_t pos = 0;
@@ -182,7 +194,7 @@ ChunkDimensionStats::deserialize_value_counts(const std::uint8_t* data,
     return result;
 }
 
-std::unordered_map<std::string, std::uint64_t>
+dftracer::utils::StringViewMap<std::uint64_t>
 ChunkDimensionStats::decompress_value_counts(const std::uint8_t* data,
                                              std::size_t len) {
     if (!data || len == 0) return {};
diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp
index 8f9ce32a..40a09c3e 100644
--- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.cpp
@@ -1,19 +1,17 @@
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/utilities/common/json/json_value.h>
-#include <dftracer/utils/utilities/composites/dft/event.h>
+#include <dftracer/utils/utilities/common/json/json_doc_guard.h>
+#include <dftracer/utils/utilities/common/json/parser.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_indexer_utility.h>
 #include <dftracer/utils/utilities/composites/indexed_file_reader_utility.h>
 #include <dftracer/utils/utilities/composites/types.h>
 #include <dftracer/utils/utilities/reader/internal/stream_config.h>
-#include <yyjson.h>
 
 #include <cstring>
 #include <map>
 #include <string_view>
 
-// Import JsonValue from common json namespace
-using dftracer::utils::utilities::common::json::JsonValue;
-using dftracer::utils::utilities::composites::dft::DFTracerEvent;
+using dftracer::utils::utilities::common::json::JsonParser;
+using dftracer::utils::utilities::common::json::ondemand_value_to_string;
 
 namespace dftracer::utils::utilities::composites::dft::indexing {
 
@@ -30,23 +28,6 @@ static const std::string DIM_CAT = "cat";
 static const std::string DIM_PID = "pid";
 static const std::string DIM_TID = "tid";
 
-// Convert a JsonValue to string for bloom filter insertion.
-// Handles strings, integers, floats, bools.
-std::string json_value_to_string(const JsonValue& val) {
-    if (val.is_string()) {
-        return val.get<std::string>();
-    } else if (val.is_uint()) {
-        return std::to_string(val.get<std::uint64_t>());
-    } else if (val.is_int()) {
-        return std::to_string(val.get<std::int64_t>());
-    } else if (val.is_number()) {
-        return std::to_string(val.get<double>());
-    } else if (val.is_bool()) {
-        return val.get<bool>() ? "true" : "false";
-    }
-    return {};
-}
-
 // Build set of dimensions to index based on config
 std::vector<std::string> get_target_dimensions(
     const ChunkIndexerConfig& config) {
@@ -206,6 +187,19 @@ coro::CoroTask<ChunkIndexerOutput> ChunkIndexerUtility::process(
         event_lines;
     std::map<std::string, std::vector<std::uint32_t>> metadata_lines;
 
+    // On-Demand parser for lazy field access - only parses what we use
+    JsonParser parser;
+
+    // Pre-check which bloom filters we need
+    const bool need_name = output.bloom_filters.count(DIM_NAME) > 0;
+    const bool need_cat = output.bloom_filters.count(DIM_CAT) > 0;
+    const bool need_pid = output.bloom_filters.count(DIM_PID) > 0;
+    const bool need_tid = output.bloom_filters.count(DIM_TID) > 0;
+    const bool need_hhash = output.bloom_filters.count(DIM_HHASH) > 0;
+    const bool need_fhash = output.bloom_filters.count(DIM_FHASH) > 0;
+    const bool need_shash = output.bloom_filters.count(DIM_SHASH) > 0;
+    const bool has_extra_dims = !input.config.extra_dimensions.empty();
+
     while (!stream->done()) {
         auto chunk = co_await stream->read_async();
 
@@ -229,153 +223,209 @@ coro::CoroTask<ChunkIndexerOutput> ChunkIndexerUtility::process(
             std::size_t line_len = newline - line_start;
 
             if (line_len > 0) {
-                yyjson_read_flag flg = YYJSON_READ_NOFLAG;
-                yyjson_doc* doc =
-                    yyjson_read_opts(const_cast<char*>(line_start), line_len,
-                                     flg, nullptr, nullptr);
-
-                if (doc) {
-                    yyjson_val* root = yyjson_doc_get_root(doc);
-                    if (root && yyjson_is_obj(root)) {
-                        JsonValue json(root);
-                        DFTracerEvent ev;
-                        if (!DFTracerEvent::parse(json, ev)) {
-                            yyjson_doc_free(doc);
-                            pos = (newline - data) + 1;
-                            line_number++;
-                            continue;
-                        }
+                std::string_view line_sv(line_start, line_len);
+                if (!parser.parse(line_sv)) {
+                    pos = (newline - data) + 1;
+                    line_number++;
+                    continue;
+                }
+
+                // Extract ph first to determine event type
+                auto ph = parser.get_string("ph");
+                if (!ph) {
+                    pos = (newline - data) + 1;
+                    line_number++;
+                    continue;
+                }
 
-                        if (ev.is_metadata()) {
-                            // Metadata event: collect hash resolutions
-                            if (ev.args.exists()) {
-                                std::string hash_val =
-                                    ev.args["value"].get<std::string>();
-                                std::string resolved =
-                                    ev.args["name"].get<std::string>();
-
-                                if (!hash_val.empty() && !resolved.empty()) {
-                                    if (ev.name == "HH") {
-                                        output.hash_resolutions[DIM_HHASH]
-                                                               [hash_val] =
-                                            resolved;
-                                    } else if (ev.name == "FH") {
-                                        output.hash_resolutions[DIM_FHASH]
-                                                               [hash_val] =
-                                            resolved;
-                                    } else if (ev.name == "SH") {
-                                        output.hash_resolutions[DIM_SHASH]
-                                                               [hash_val] =
-                                            resolved;
+                bool is_metadata = (*ph == "M");
+
+                if (is_metadata) {
+                    // Metadata event: extract name and args in single pass
+                    // Re-parse to get fresh document state
+                    parser.parse(line_sv);
+
+                    std::string event_name;
+                    std::string hash_val;
+                    std::string resolved;
+
+                    parser.for_each_field([&](std::string_view key,
+                                              simdjson::ondemand::value val) {
+                        if (key == "name") {
+                            auto s = val.get_string();
+                            if (!s.error()) event_name = std::string(s.value());
+                        } else if (key == "args") {
+                            auto obj = val.get_object();
+                            if (!obj.error()) {
+                                for (auto field : obj.value()) {
+                                    if (field.error()) continue;
+                                    auto fkey = field.unescaped_key();
+                                    if (fkey.error()) continue;
+                                    auto fval = field.value();
+                                    if (fval.error()) continue;
+
+                                    if (fkey.value() == "value") {
+                                        auto s = fval.value().get_string();
+                                        if (!s.error())
+                                            hash_val = std::string(s.value());
+                                    } else if (fkey.value() == "name") {
+                                        auto s = fval.value().get_string();
+                                        if (!s.error())
+                                            resolved = std::string(s.value());
                                     }
                                 }
                             }
-                            if (collect_manifest) {
-                                std::string meta_type(ev.name);
-                                metadata_lines[meta_type].push_back(
-                                    line_number);
-                            }
-                        } else {
-                            // Regular event: index into bloom filters + stats
-
-                            // Update statistics (always update for accuracy)
-                            output.statistics.update_from_event(
-                                ev.name, ev.cat, ev.pid, ev.tid, ev.ts, ev.dur);
-
-                            // Add to bloom filters for missing dimensions only
-                            auto it = output.bloom_filters.find(DIM_NAME);
-                            if (it != output.bloom_filters.end() &&
-                                !ev.name.empty()) {
-                                it->second.add(ev.name);
-                            }
+                        }
+                    });
+
+                    if (!hash_val.empty() && !resolved.empty()) {
+                        if (event_name == "HH") {
+                            output.hash_resolutions[DIM_HHASH][hash_val] =
+                                resolved;
+                        } else if (event_name == "FH") {
+                            output.hash_resolutions[DIM_FHASH][hash_val] =
+                                resolved;
+                        } else if (event_name == "SH") {
+                            output.hash_resolutions[DIM_SHASH][hash_val] =
+                                resolved;
+                        }
+                    }
 
-                            it = output.bloom_filters.find(DIM_CAT);
-                            if (it != output.bloom_filters.end() &&
-                                !ev.cat.empty()) {
-                                it->second.add(ev.cat);
-                            }
+                    if (collect_manifest) {
+                        metadata_lines[event_name].push_back(line_number);
+                    }
+                } else {
+                    // Regular event: re-parse for fresh state and extract
+                    // fields
+                    parser.parse(line_sv);
+                    auto name_opt = parser.get_string("name");
+                    std::string_view name = name_opt.value_or("");
+                    auto cat_opt = parser.get_string("cat");
+                    std::string_view cat = cat_opt.value_or("");
+
+                    auto pid = parser.get_uint64("pid").value_or(0);
+                    auto tid = parser.get_uint64("tid").value_or(0);
+                    auto ts = parser.get_uint64("ts").value_or(0);
+                    auto dur = parser.get_uint64("dur").value_or(0);
+
+                    // Update statistics
+                    output.statistics.update_from_event(name, cat, pid, tid, ts,
+                                                        dur);
+
+                    // Add to bloom filters
+                    if (need_name && !name.empty()) {
+                        output.bloom_filters[DIM_NAME].add(name);
+                    }
 
-                            it = output.bloom_filters.find(DIM_PID);
-                            if (it != output.bloom_filters.end()) {
-                                char pid_buf[32];
-                                int n = std::snprintf(
-                                    pid_buf, sizeof(pid_buf), "%llu",
-                                    static_cast<unsigned long long>(ev.pid));
-                                it->second.add(std::string_view(pid_buf, n));
-                            }
+                    if (need_cat && !cat.empty()) {
+                        output.bloom_filters[DIM_CAT].add(cat);
+                    }
 
-                            it = output.bloom_filters.find(DIM_TID);
-                            if (it != output.bloom_filters.end()) {
-                                char tid_buf[32];
-                                int n = std::snprintf(
-                                    tid_buf, sizeof(tid_buf), "%llu",
-                                    static_cast<unsigned long long>(ev.tid));
-                                it->second.add(std::string_view(tid_buf, n));
-                            }
+                    if (need_pid) {
+                        char pid_buf[32];
+                        int n =
+                            std::snprintf(pid_buf, sizeof(pid_buf), "%llu",
+                                          static_cast<unsigned long long>(pid));
+                        output.bloom_filters[DIM_PID].add(
+                            std::string_view(pid_buf, n));
+                    }
 
-                            if (ev.args.exists()) {
-                                // Hash dimensions: add hash to bloom
-                                it = output.bloom_filters.find(DIM_HHASH);
-                                if (it != output.bloom_filters.end()) {
-                                    std::string_view hhash =
-                                        ev.args["hhash"]
-                                            .get<std::string_view>();
-                                    if (!hhash.empty()) {
-                                        it->second.add(hhash);
-                                    }
-                                }
+                    if (need_tid) {
+                        char tid_buf[32];
+                        int n =
+                            std::snprintf(tid_buf, sizeof(tid_buf), "%llu",
+                                          static_cast<unsigned long long>(tid));
+                        output.bloom_filters[DIM_TID].add(
+                            std::string_view(tid_buf, n));
+                    }
 
-                                it = output.bloom_filters.find(DIM_FHASH);
-                                if (it != output.bloom_filters.end()) {
-                                    std::string_view fhash =
-                                        ev.args["fhash"]
-                                            .get<std::string_view>();
-                                    if (!fhash.empty()) {
-                                        it->second.add(fhash);
-                                    }
+                    // Process args for hash dimensions and extra dimensions
+                    if (need_hhash || need_fhash || need_shash ||
+                        has_extra_dims) {
+                        parser.for_each_field("args", [&](std::string_view key,
+                                                          simdjson::ondemand::
+                                                              value val) {
+                            if (need_hhash && key == "hhash") {
+                                auto s = val.get_string();
+                                if (!s.error() && !s.value().empty()) {
+                                    output.bloom_filters[DIM_HHASH].add(
+                                        s.value());
                                 }
-
-                                it = output.bloom_filters.find(DIM_SHASH);
-                                if (it != output.bloom_filters.end()) {
-                                    // shash can be under cmd_hash or exec_hash
-                                    std::string_view shash =
-                                        ev.args["cmd_hash"]
-                                            .get<std::string_view>();
-                                    if (shash.empty()) {
-                                        shash = ev.args["exec_hash"]
-                                                    .get<std::string_view>();
-                                    }
-                                    if (!shash.empty()) {
-                                        it->second.add(shash);
-                                    }
+                            } else if (need_fhash && key == "fhash") {
+                                auto s = val.get_string();
+                                if (!s.error() && !s.value().empty()) {
+                                    output.bloom_filters[DIM_FHASH].add(
+                                        s.value());
                                 }
-
-                                // Extra dimensions: arbitrary nested dot-paths
+                            } else if (need_shash && (key == "cmd_hash" ||
+                                                      key == "exec_hash")) {
+                                auto s = val.get_string();
+                                if (!s.error() && !s.value().empty()) {
+                                    output.bloom_filters[DIM_SHASH].add(
+                                        s.value());
+                                }
+                            } else if (has_extra_dims) {
+                                // Check if this key matches any extra dimension
                                 for (const auto& dim :
                                      input.config.extra_dimensions) {
-                                    it = output.bloom_filters.find(dim);
-                                    if (it != output.bloom_filters.end()) {
-                                        JsonValue val = ev.args.at(dim.c_str());
-                                        if (val.exists()) {
-                                            std::string str_val =
-                                                json_value_to_string(val);
-                                            if (!str_val.empty()) {
-                                                it->second.add(str_val);
+                                    // Check for exact match (flat key)
+                                    if (key == dim) {
+                                        std::string str_val =
+                                            ondemand_value_to_string(val);
+                                        if (!str_val.empty()) {
+                                            output.bloom_filters[dim].add(
+                                                str_val);
+                                        }
+                                        break;
+                                    }
+                                    // Check for nested key (e.g., "io.size")
+                                    auto dot_pos = dim.find('.');
+                                    if (dot_pos != std::string::npos) {
+                                        std::string_view prefix(dim.data(),
+                                                                dot_pos);
+                                        if (key == prefix) {
+                                            // Navigate into nested object
+                                            std::string_view suffix(
+                                                dim.data() + dot_pos + 1,
+                                                dim.size() - dot_pos - 1);
+                                            auto obj = val.get_object();
+                                            if (!obj.error()) {
+                                                for (auto field : obj.value()) {
+                                                    if (field.error()) continue;
+                                                    auto fkey =
+                                                        field.unescaped_key();
+                                                    if (fkey.error()) continue;
+                                                    if (fkey.value() ==
+                                                        suffix) {
+                                                        auto fval =
+                                                            field.value();
+                                                        if (fval.error())
+                                                            continue;
+                                                        std::string str_val =
+                                                            ondemand_value_to_string(
+                                                                fval.value());
+                                                        if (!str_val.empty()) {
+                                                            output
+                                                                .bloom_filters
+                                                                    [dim]
+                                                                .add(str_val);
+                                                        }
+                                                        break;
+                                                    }
+                                                }
                                             }
                                         }
                                     }
                                 }
                             }
+                        });
+                    }
 
-                            if (collect_manifest) {
-                                event_lines[{std::string(ev.cat),
-                                             std::string(ev.name)}]
-                                    .push_back(line_number);
-                            }
-                            output.events_processed++;
-                        }
+                    if (collect_manifest) {
+                        event_lines[{std::string(cat), std::string(name)}]
+                            .push_back(line_number);
                     }
-                    yyjson_doc_free(doc);
+                    output.events_processed++;
                 }
             }
 
diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp
index b30ecbf7..98288a06 100644
--- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.cpp
@@ -1,14 +1,16 @@
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/rocksdb/async.h>
 #include <dftracer/utils/utilities/common/query/ast.h>
+#include <dftracer/utils/utilities/common/statistics/timestamp_histogram.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/queries/queries.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
 
 #include <algorithm>
+#include <optional>
 #include <set>
 #include <unordered_map>
 #include <unordered_set>
@@ -37,6 +39,14 @@ bool looks_like_hash(const std::string& value) {
     return true;
 }
 
+std::optional<IndexDatabase::HashType> dim_to_hash_type(
+    const std::string& dim) {  // maps a dimension name to its HashType
+    if (dim == "fhash") return IndexDatabase::HashType::FILE;
+    if (dim == "hhash") return IndexDatabase::HashType::HOST;
+    if (dim == "shash") return IndexDatabase::HashType::STRING;
+    return std::nullopt;  // any other dimension has no HashType
+}
+
 struct PrunerContext {
     int file_info_id;
     std::uint64_t total_chunks;
@@ -45,9 +55,13 @@ struct PrunerContext {
     std::unordered_map<std::string,
                        std::unordered_map<std::uint64_t, BloomFilter>>
         bloom_filters;
+    std::unordered_map<std::uint64_t, common::statistics::TimestampHistogram>
+        ts_histograms;
 
     // Hash resolution: human-readable value → hash strings
     std::unordered_map<std::string, std::vector<std::string>> hash_cache;
+    std::unordered_map<std::string, std::set<std::uint64_t>> name_postings;
+    std::unordered_map<std::string, bool> name_file_membership;
     const IndexDatabase* db = nullptr;
     int fid = -1;
 
@@ -67,13 +81,47 @@ struct PrunerContext {
         if (it != hash_cache.end()) return it->second;
 
         if (db) {
-            auto hashes = db->query_hash_by_resolved(dim, val);
-            auto& cached = hash_cache[key];
-            cached = std::move(hashes);
-            return cached;
+            auto hash_type = dim_to_hash_type(dim);
+            if (hash_type) {
+                auto hash = db->resolve_name_to_hash(*hash_type, val);
+                auto& cached = hash_cache[key];
+                if (hash) {
+                    cached.push_back(std::move(*hash));
+                }
+                return cached;
+            }
         }
         return empty;
     }
+
+    const std::set<std::uint64_t>& resolve_name_chunks(const std::string& val) {
+        // Repeated lookups of a name are served from the posting cache.
+        if (auto hit = name_postings.find(val); hit != name_postings.end())
+            return hit->second;
+        static const std::set<std::uint64_t> no_chunks;
+        if (db == nullptr || fid < 0) return no_chunks;
+        const auto postings = db->query_name_chunk_postings(val, fid);
+        auto& slot = name_postings[val];
+        slot.insert(postings.begin(), postings.end());
+        return slot;
+    }
+
+    std::optional<bool> file_contains_name(const std::string& val) {
+        // Memoized answer from an earlier lookup of the same name.
+        if (auto hit = name_file_membership.find(val);
+            hit != name_file_membership.end()) {
+            return hit->second;
+        }
+        if (db == nullptr || fid < 0) return std::nullopt;  // cannot decide
+        // No id for this name: report unknown and leave the cache untouched.
+        if (!db->query_name_id(val).has_value()) return std::nullopt;
+
+        const auto postings = db->query_name_file_postings(val);
+        const bool in_file =
+            std::find(postings.begin(), postings.end(), fid) != postings.end();
+        name_file_membership[val] = in_file;
+        return in_file;
+    }
 };
 
 std::string literal_to_string(const query_ns::LiteralNode& lit) {
@@ -103,6 +151,8 @@ std::optional<bool> dict_contains(const ChunkMeta& meta, const std::string& dim,
                                   const std::string& val) {
     auto it = meta.dim_stats.find(dim);
     if (it == meta.dim_stats.end()) return std::nullopt;
+    if (!it->second.has_value_counts_payload()) return std::nullopt;
+    it->second.ensure_value_counts_decoded();
     if (!it->second.value_counts) return std::nullopt;
     return it->second.value_counts->count(val) > 0;
 }
@@ -113,6 +163,8 @@ std::optional<bool> dict_excludes(const ChunkMeta& meta, const std::string& dim,
                                   const std::string& val) {
     auto it = meta.dim_stats.find(dim);
     if (it == meta.dim_stats.end()) return std::nullopt;
+    if (!it->second.has_value_counts_payload()) return std::nullopt;
+    it->second.ensure_value_counts_decoded();
     if (!it->second.value_counts) return std::nullopt;
     auto& vc = *it->second.value_counts;
     // If the only value in the chunk IS val, all events match val
@@ -129,13 +181,26 @@ int compare_values(const std::string& a, const std::string& b,
                    const std::string& vtype) {
     if (is_numeric_type(vtype)) {
         try {
+            if (vtype == "uint") {
+                auto ua = std::stoull(a);
+                auto ub = std::stoull(b);
+                if (ua < ub) return -1;
+                if (ua > ub) return 1;
+                return 0;
+            }
+            if (vtype == "int") {
+                auto ia = std::stoll(a);
+                auto ib = std::stoll(b);
+                if (ia < ib) return -1;
+                if (ia > ib) return 1;
+                return 0;
+            }
             double da = std::stod(a);
             double db = std::stod(b);
             if (da < db) return -1;
             if (da > db) return 1;
             return 0;
         } catch (...) {
-            // Fall through to string comparison
         }
     }
     if (a < b) return -1;
@@ -187,6 +252,28 @@ bool bloom_may_contain(PrunerContext& ctx, const std::string& dim,
     return bloom_probe(ctx, dim, ckpt, val);
 }
 
+// Tier 4: Timestamp histogram check -- zero events in range means skip.
+bool histogram_has_events(PrunerContext& ctx, std::uint64_t ckpt,
+                          query_ns::CompareOp op, std::uint64_t ts_val) {
+    auto it = ctx.ts_histograms.find(ckpt);
+    if (it == ctx.ts_histograms.end() || it->second.empty()) return true;
+    const auto& hist = it->second;
+    // ts_val + 1 wraps to 0 at UINT64_MAX; handle that bound explicitly.
+    const bool at_max = ts_val == UINT64_MAX;
+    switch (op) {
+        case query_ns::CompareOp::GT:  // nothing is > UINT64_MAX
+            return !at_max && hist.count_in_range(ts_val + 1, UINT64_MAX) > 0;
+        case query_ns::CompareOp::GE:
+            return hist.count_in_range(ts_val, UINT64_MAX) > 0;
+        case query_ns::CompareOp::LT:
+            return hist.count_in_range(0, ts_val) > 0;
+        case query_ns::CompareOp::LE:  // everything is <= UINT64_MAX
+            return at_max || hist.count_in_range(0, ts_val + 1) > 0;
+        default:
+            return true;
+    }
+}
+
 // Recursive AST evaluation: returns candidate chunk set
 std::set<std::uint64_t> evaluate_node(const query_ns::QueryNode& node,
                                       PrunerContext& ctx);
@@ -196,6 +283,19 @@ std::set<std::uint64_t> eval_compare(const query_ns::CompareNode& n,
     std::set<std::uint64_t> result;
     auto val_str = literal_to_string(n.value);
 
+    if (n.field.path == "name" && n.op == query_ns::CompareOp::EQ) {
+        auto contains = ctx.file_contains_name(val_str);
+        if (contains.has_value()) {
+            if (!*contains) {
+                return result;
+            }
+            auto exact_chunks = ctx.resolve_name_chunks(val_str);
+            if (!exact_chunks.empty()) {
+                return exact_chunks;
+            }
+        }
+    }
+
     for (auto ckpt : ctx.all_chunks) {
         auto chunk_it = ctx.chunks.find(ckpt);
         ChunkMeta empty_meta;
@@ -220,8 +320,17 @@ std::set<std::uint64_t> eval_compare(const query_ns::CompareNode& n,
             result.insert(ckpt);
         } else {
             // Range operators: Tier 2
-            if (range_may_match(meta, n.field.path, n.op, val_str))
-                result.insert(ckpt);
+            if (!range_may_match(meta, n.field.path, n.op, val_str)) continue;
+            // Tier 4: histogram for ts queries
+            if (n.field.path == "ts") {
+                try {
+                    auto ts_val = std::stoull(val_str);
+                    if (!histogram_has_events(ctx, ckpt, n.op, ts_val))
+                        continue;
+                } catch (...) {
+                }
+            }
+            result.insert(ckpt);
         }
     }
     return result;
@@ -229,6 +338,26 @@ std::set<std::uint64_t> eval_compare(const query_ns::CompareNode& n,
 std::set<std::uint64_t> eval_in(const query_ns::InNode& n, PrunerContext& ctx) {
     std::set<std::uint64_t> result;
 
+    if (n.field.path == "name") {
+        // Exact postings may replace the per-chunk scan only when EVERY listed
+        // name resolved; returning a partial union would silently drop chunks
+        // that hold a name whose membership or postings are unknown.
+        bool all_resolved = true;
+        for (const auto& elem : n.values.elements) {
+            auto val_str = literal_to_string(elem);
+            auto contains = ctx.file_contains_name(val_str);
+            if (contains.has_value() && !*contains) continue;  // not in file
+            all_resolved = contains.has_value();
+            if (!all_resolved) break;
+            const auto& exact_chunks = ctx.resolve_name_chunks(val_str);
+            all_resolved = !exact_chunks.empty();
+            if (!all_resolved) break;
+            result.insert(exact_chunks.begin(), exact_chunks.end());
+        }
+        if (all_resolved && !result.empty()) return result;
+        result.clear();  // partial union is unusable; rescan every chunk
+    }
+
     for (auto ckpt : ctx.all_chunks) {
         auto chunk_it = ctx.chunks.find(ckpt);
         ChunkMeta empty_meta;
@@ -272,11 +401,17 @@ std::set<std::uint64_t> eval_not_in(const query_ns::NotInNode& n,
         auto& meta = chunk_it->second;
 
         auto dim_it = meta.dim_stats.find(n.field.path);
-        if (dim_it == meta.dim_stats.end() || !dim_it->second.value_counts) {
+        if (dim_it == meta.dim_stats.end() ||
+            !dim_it->second.has_value_counts_payload()) {
             // No dictionary — cannot safely skip
             result.insert(ckpt);
             continue;
         }
+        dim_it->second.ensure_value_counts_decoded();
+        if (!dim_it->second.value_counts) {
+            result.insert(ckpt);
+            continue;
+        }
 
         auto& vc = *dim_it->second.value_counts;
         bool all_excluded = true;
@@ -347,7 +482,15 @@ coro::CoroTask<ChunkPrunerOutput> ChunkPrunerUtility::process(
         out.file_may_match = false;
 
         try {
-            IndexDatabase idx_db(input.index_path);
+            std::optional<IndexDatabase> owned_db;
+            IndexDatabase* db_ptr = input.external_db;
+            if (!db_ptr) {
+                owned_db.emplace(input.index_path,
+                                 dftracer::utils::rocksdb::RocksDatabase::
+                                     OpenMode::ReadOnly);
+                db_ptr = &*owned_db;
+            }
+            IndexDatabase& idx_db = *db_ptr;
             int fid =
                 idx_db.get_file_info_id(get_logical_path(input.file_path));
             if (fid < 0) {
@@ -371,6 +514,15 @@ coro::CoroTask<ChunkPrunerOutput> ChunkPrunerUtility::process(
                 ctx.chunks[ds.checkpoint_idx].dim_stats[ds.dimension] = ds;
             }
 
+            // Load timestamp histograms for Tier 4 pruning
+            auto chunk_stats = idx_db.query_chunk_statistics(fid);
+            for (auto& row : chunk_stats) {
+                if (!row.stats.timestamp_histogram.empty()) {
+                    ctx.ts_histograms[row.checkpoint_idx] =
+                        std::move(row.stats.timestamp_histogram);
+                }
+            }
+
             // Load bloom filters for all dimensions
             auto indexed_dims = idx_db.query_index_dimensions(fid);
             auto all_chunk_blooms =
@@ -426,7 +578,132 @@ coro::CoroTask<ChunkPrunerOutput> ChunkPrunerUtility::process(
         return out;
     };
 
-    co_return co_await rocksdb::run(do_query);
+    co_return do_query();
+}
+
+ChunkPrunerBatchOutput ChunkPrunerUtility::process_batch(
+    const ChunkPrunerBatchInput& input) {
+    ChunkPrunerBatchOutput batch_out;
+    batch_out.outputs.resize(input.items.size());
+    batch_out.success = false;
+    // outputs[i] pairs with input.items[i]; success is set only at the end.
+    try {
+        std::optional<IndexDatabase> owned_db;
+        IndexDatabase* db_ptr = input.external_db;
+        if (!db_ptr) {
+            owned_db.emplace(
+                input.index_path,
+                dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+            db_ptr = &*owned_db;
+        }
+        IndexDatabase& idx_db = *db_ptr;
+
+        // Collect all (item_idx -> fid) mappings up front.
+        std::vector<int> fids;
+        fids.reserve(input.items.size());
+        std::vector<int> item_to_fid(input.items.size(), -1);
+        for (std::size_t i = 0; i < input.items.size(); ++i) {
+            int fid = idx_db.get_file_info_id(
+                get_logical_path(input.items[i].file_path));
+            item_to_fid[i] = fid;
+            if (fid >= 0) fids.push_back(fid);
+        }
+
+        // Batch-load per-fid dim_stats and chunk_statistics with one
+        // RocksDB column-family scan each instead of N scans.
+        auto all_dim_stats = idx_db.query_chunk_dimension_stats_batch(fids);
+        auto all_chunk_stats = idx_db.query_chunk_statistics_batch(fids);
+
+        // Per-file eval (blooms / index_dimensions still queried per file).
+        // Batch rows are copied, never moved: several items may share a fid.
+        for (std::size_t i = 0; i < input.items.size(); ++i) {
+            const auto& item = input.items[i];
+            auto& out = batch_out.outputs[i];
+            out.success = false;
+            out.file_may_match = false;
+
+            int fid = item_to_fid[i];
+            if (fid < 0) {
+                out.success = true;
+                out.file_may_match = true;
+                continue;
+            }
+
+            try {
+                PrunerContext ctx;
+                ctx.file_info_id = fid;
+                ctx.cache = input.cache;
+                ctx.index_path = input.index_path;
+                ctx.db = &idx_db;
+                ctx.fid = fid;
+
+                auto dim_it = all_dim_stats.find(fid);
+                if (dim_it != all_dim_stats.end()) {
+                    for (const auto& ds : dim_it->second) {
+                        ctx.all_chunks.insert(ds.checkpoint_idx);
+                        ctx.chunks[ds.checkpoint_idx].dim_stats[ds.dimension] =
+                            ds;
+                    }
+                }
+
+                auto cs_it = all_chunk_stats.find(fid);
+                if (cs_it != all_chunk_stats.end()) {
+                    for (const auto& row : cs_it->second) {
+                        if (!row.stats.timestamp_histogram.empty()) {
+                            ctx.ts_histograms[row.checkpoint_idx] =
+                                row.stats.timestamp_histogram;
+                        }
+                    }
+                }
+
+                auto indexed_dims = idx_db.query_index_dimensions(fid);
+                auto all_chunk_blooms =
+                    idx_db.query_chunk_bloom_filters_batch(fid, indexed_dims);
+                for (const auto& [dim, chunk_blooms] : all_chunk_blooms) {
+                    for (const auto& cb : chunk_blooms) {
+                        ctx.all_chunks.insert(cb.checkpoint_idx);
+                        BloomFilter bf = BloomFilter::from_blob(
+                            cb.bloom_data.data(), cb.bloom_data.size());
+                        if (input.cache) {
+                            input.cache->put(input.index_path, dim,
+                                             cb.checkpoint_idx, bf);
+                        }
+                        ctx.bloom_filters[dim][cb.checkpoint_idx] =
+                            std::move(bf);
+                    }
+                }
+
+                ctx.total_chunks =
+                    ctx.all_chunks.empty() ? 0 : *ctx.all_chunks.rbegin() + 1;
+                out.total_checkpoints = ctx.total_chunks;
+
+                if (ctx.all_chunks.empty()) {
+                    out.file_may_match = true;
+                    out.success = true;
+                    continue;
+                }
+
+                auto candidates = evaluate_node(item.query.root(), ctx);
+                out.candidate_checkpoints.assign(candidates.begin(),
+                                                 candidates.end());
+                out.file_may_match = !out.candidate_checkpoints.empty();
+                out.success = true;
+            } catch (const std::exception& e) {
+                DFTRACER_UTILS_LOG_WARN(
+                    "ChunkPruner: error for %s: %s, assuming match",
+                    item.file_path.c_str(), e.what());
+                out.file_may_match = true;
+                out.success = true;
+            }
+        }
+
+        batch_out.success = true;
+    } catch (const std::exception& e) {
+        DFTRACER_UTILS_LOG_WARN("ChunkPruner: batch error for index %s: %s",
+                                input.index_path.c_str(), e.what());
+    }
+
+    return batch_out;
 }
 
 }  // namespace dftracer::utils::utilities::composites::dft::indexing
diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp
index 6aa594a9..2c9ad274 100644
--- a/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.cpp
@@ -1,11 +1,13 @@
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <algorithm>
 #include <charconv>
 #include <cmath>
 #include <cstring>
+#include <iomanip>
 #include <limits>
+#include <sstream>
 #include <stdexcept>
 
 namespace dftracer::utils::utilities::composites::dft::indexing {
@@ -30,9 +32,19 @@ void ChunkStatistics::update_from_event(std::string_view name,
     }
     std::string_view pt_sv(pt_buf, tp - pt_buf);
 
-    category_counts[std::string(cat)]++;
-    name_counts[std::string(name)]++;
-    pid_tid_counts[std::string(pt_sv)]++;
+    // Increment counts with a single lookup — allocate a string only on
+    // first observation.
+    auto bump = [](auto& map, std::string_view key) {
+        auto it = map.find(key);
+        if (it == map.end()) {
+            map.emplace(std::string(key), 1);
+        } else {
+            it->second++;
+        }
+    };
+    bump(category_counts, cat);
+    bump(name_counts, name);
+    bump(pid_tid_counts, pt_sv);
 
     if (ts < min_timestamp_us) min_timestamp_us = ts;
     std::uint64_t end_ts = ts + dur;
@@ -58,9 +70,16 @@ void ChunkStatistics::update_from_event(std::string_view name,
     double dur_d = static_cast<double>(dur);
     duration_sketch.add(dur_d);
     duration_histogram.add(dur);
-
-    auto [sketch_it, _3] =
-        name_duration_sketches.try_emplace(std::string(name));
+    timestamp_histogram.add(ts);
+
+    // name_duration_sketches: transparent find, allocate only on first
+    // observation, then reuse the interned key for the other name_*_ maps.
+    auto sketch_it = name_duration_sketches.find(name);
+    if (sketch_it == name_duration_sketches.end()) {
+        auto [new_it, _] = name_duration_sketches.emplace(
+            std::string(name), common::statistics::DDSketch{});
+        sketch_it = new_it;
+    }
     sketch_it->second.add(dur_d);
 
     const std::string& name_key = sketch_it->first;
@@ -105,6 +124,7 @@ void ChunkStatistics::merge_from(const ChunkStatistics& other) {
 
     duration_sketch.merge(other.duration_sketch);
     duration_histogram.merge(other.duration_histogram);
+    timestamp_histogram.merge(other.timestamp_histogram);
 
     for (const auto& [k, v] : other.name_duration_sketches) {
         name_duration_sketches[k].merge(v);
@@ -135,82 +155,66 @@ double ChunkStatistics::duration_variance() const {
 }
 
 std::string ChunkStatistics::name_category_json() const {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* root = yyjson_mut_obj(doc);
-    yyjson_mut_doc_set_root(doc, root);
-
+    std::ostringstream ss;
+    ss << '{';
+    const char* sep = "";
     for (const auto& [key, value] : name_category) {
-        yyjson_mut_obj_add_str(doc, root, key.c_str(), value.c_str());
+        // Escape '"' and '\' via std::quoted (control chars are not handled).
+        ss << sep << std::quoted(key) << ':' << std::quoted(value);
+        sep = ",";
     }
-
-    char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_NOFLAG, nullptr);
-    std::string result(json_str ? json_str : "{}");
-    if (json_str) free(json_str);
-    yyjson_mut_doc_free(doc);
-    return result;
+    ss << '}';
+    return ss.str();
 }
 
-std::unordered_map<std::string, std::string>
-ChunkStatistics::parse_string_map_json(const std::string& json) {
-    std::unordered_map<std::string, std::string> result;
+StringViewMap<std::string> ChunkStatistics::parse_string_map_json(
+    const std::string& json) {
+    StringViewMap<std::string> result;
 
-    yyjson_doc* doc =
-        yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-    if (!doc) return result;
+    simdjson::dom::parser parser;
+    auto parse_result = parser.parse(json.data(), json.size());
+    if (parse_result.error()) return result;  // invalid JSON -> empty map
 
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root || !yyjson_is_obj(root)) {
-        yyjson_doc_free(doc);
-        return result;
-    }
+    auto root = parse_result.value_unsafe();
+    if (!root.is_object()) return result;  // non-object root -> empty map
 
-    yyjson_obj_iter iter;
-    yyjson_obj_iter_init(root, &iter);
-    yyjson_val* key;
-    while ((key = yyjson_obj_iter_next(&iter))) {
-        yyjson_val* val = yyjson_obj_iter_get_val(key);
-        if (yyjson_is_str(val)) {
-            result[yyjson_get_str(key)] = yyjson_get_str(val);
+    auto obj = root.get_object().value_unsafe();
+    for (auto field : obj) {
+        auto val_result = field.value.get_string();
+        if (!val_result.error()) {  // non-string values are skipped
+            result[std::string(field.key)] =
+                std::string(val_result.value_unsafe());
         }
     }
-
-    yyjson_doc_free(doc);
     return result;
 }
 
 std::string ChunkStatistics::name_duration_histograms_json() const {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* root = yyjson_mut_obj(doc);
-    yyjson_mut_doc_set_root(doc, root);
-
+    std::ostringstream ss;
+    ss << '{';
+    const char* sep = "";
     for (const auto& [key, hist] : name_duration_histograms) {
-        yyjson_mut_val* arr = hist.to_yyjson(doc);
-        yyjson_mut_obj_add_val(doc, root, key.c_str(), arr);
+        // Escape '"' and '\' via std::quoted (control chars are not handled).
+        ss << sep << std::quoted(key) << ':' << hist.to_json();
+        sep = ",";
     }
-
-    char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_NOFLAG, nullptr);
-    std::string result(json_str ? json_str : "{}");
-    if (json_str) free(json_str);
-    yyjson_mut_doc_free(doc);
-    return result;
+    ss << '}';
+    return ss.str();
 }
 
 namespace {
-std::string double_map_to_json(
-    const std::unordered_map<std::string, double>& map) {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* root = yyjson_mut_obj(doc);
-    yyjson_mut_doc_set_root(doc, root);
-
+template <typename Map>
+std::string double_map_to_json(const Map& map) {
+    std::ostringstream ss;
+    ss << std::setprecision(17) << '{';
+    const char* sep = "";
     for (const auto& [key, value] : map) {
-        yyjson_mut_obj_add_real(doc, root, key.c_str(), value);
+        // Escape '"' and '\' via std::quoted (control chars are not handled).
+        ss << sep << std::quoted(key) << ':' << value;
+        sep = ",";
     }
-
-    char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_NOFLAG, nullptr);
-    std::string result(json_str ? json_str : "{}");
-    if (json_str) free(json_str);
-    yyjson_mut_doc_free(doc);
-    return result;
+    ss << '}';
+    return ss.str();
 }
 }  // namespace
 
@@ -258,89 +262,82 @@ std::vector<std::uint8_t> ChunkStatistics::serialize_name_duration_sketches()
     return buf;
 }
 
-std::unordered_map<std::string, double> ChunkStatistics::parse_double_map_json(
+StringViewMap<double> ChunkStatistics::parse_double_map_json(
     const std::string& json) {
-    std::unordered_map<std::string, double> result;
-
-    yyjson_doc* doc =
-        yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-    if (!doc) return result;
-
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root || !yyjson_is_obj(root)) {
-        yyjson_doc_free(doc);
-        return result;
-    }
-
-    yyjson_obj_iter iter;
-    yyjson_obj_iter_init(root, &iter);
-    yyjson_val* key;
-    while ((key = yyjson_obj_iter_next(&iter))) {
-        yyjson_val* val = yyjson_obj_iter_get_val(key);
-        if (yyjson_is_real(val)) {
-            result[yyjson_get_str(key)] = yyjson_get_real(val);
-        } else if (yyjson_is_int(val)) {
-            result[yyjson_get_str(key)] =
-                static_cast<double>(yyjson_get_int(val));
-        } else if (yyjson_is_uint(val)) {
-            result[yyjson_get_str(key)] =
-                static_cast<double>(yyjson_get_uint(val));
+    StringViewMap<double> result;
+
+    simdjson::dom::parser parser;
+    auto parse_result = parser.parse(json.data(), json.size());
+    if (parse_result.error()) return result;  // invalid JSON -> empty map
+
+    auto root = parse_result.value_unsafe();
+    if (!root.is_object()) return result;  // non-object root -> empty map
+
+    auto obj = root.get_object().value_unsafe();
+    for (auto field : obj) {
+        auto double_result = field.value.get_double();
+        if (!double_result.error()) {
+            result[std::string(field.key)] = double_result.value_unsafe();
+        } else {  // NOTE(review): dom get_double() may already cover ints
+            auto int_result = field.value.get_int64();
+            if (!int_result.error()) {
+                result[std::string(field.key)] =
+                    static_cast<double>(int_result.value_unsafe());
+            } else {
+                auto uint_result = field.value.get_uint64();
+                if (!uint_result.error()) {
+                    result[std::string(field.key)] =
+                        static_cast<double>(uint_result.value_unsafe());
+                }
+            }
         }
     }
-
-    yyjson_doc_free(doc);
     return result;
 }
 
-std::unordered_map<std::string, common::statistics::Log2Histogram>
+StringViewMap<common::statistics::Log2Histogram>
 ChunkStatistics::parse_histogram_map_json(const std::string& json) {
-    std::unordered_map<std::string, common::statistics::Log2Histogram> result;
+    StringViewMap<common::statistics::Log2Histogram> result;
 
-    yyjson_doc* doc =
-        yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-    if (!doc) return result;
+    simdjson::dom::parser parser;
+    auto parse_result = parser.parse(json.data(), json.size());
+    if (parse_result.error()) return result;  // invalid JSON -> empty map
 
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root || !yyjson_is_obj(root)) {
-        yyjson_doc_free(doc);
-        return result;
-    }
+    auto root = parse_result.value_unsafe();
+    if (!root.is_object()) return result;  // non-object root -> empty map
 
-    yyjson_obj_iter iter;
-    yyjson_obj_iter_init(root, &iter);
-    yyjson_val* key;
-    while ((key = yyjson_obj_iter_next(&iter))) {
-        yyjson_val* val = yyjson_obj_iter_get_val(key);
-        if (!yyjson_is_arr(val)) continue;
+    auto obj = root.get_object().value_unsafe();
+    for (auto field : obj) {
+        if (!field.value.is_array()) continue;  // skip non-array values
 
         common::statistics::Log2Histogram hist;
-        std::size_t idx, max;
-        yyjson_val* pair;
-        yyjson_arr_foreach(val, idx, max, pair) {
-            if (!yyjson_is_arr(pair) || yyjson_arr_size(pair) != 2) continue;
-            yyjson_val* bin_idx_val = yyjson_arr_get(pair, 0);
-            yyjson_val* count_val = yyjson_arr_get(pair, 1);
-            if (!yyjson_is_uint(bin_idx_val) || !yyjson_is_uint(count_val))
-                continue;
+        auto arr = field.value.get_array().value_unsafe();
+        for (auto pair : arr) {
+            if (!pair.is_array()) continue;  // entries must be arrays
+            auto pair_arr = pair.get_array().value_unsafe();
+            if (pair_arr.size() != 2) continue;  // want [bin_idx, count]
+
+            auto bin_idx_result = pair_arr.at(0).get_uint64();
+            auto count_result = pair_arr.at(1).get_uint64();
+            if (bin_idx_result.error() || count_result.error()) continue;
+
             auto bin_idx =
-                static_cast<std::size_t>(yyjson_get_uint(bin_idx_val));
-            auto count = yyjson_get_uint(count_val);
+                static_cast<std::size_t>(bin_idx_result.value_unsafe());
+            auto count = count_result.value_unsafe();
             if (bin_idx < common::statistics::Log2Histogram::NUM_BINS) {
                 hist.add(common::statistics::Log2Histogram::bin_lower(bin_idx),
                          count);
             }
         }
-        result[yyjson_get_str(key)] = std::move(hist);
+        result[std::string(field.key)] = std::move(hist);
     }
-
-    yyjson_doc_free(doc);
     return result;
 }
 
-std::unordered_map<std::string, common::statistics::DDSketch>
+StringViewMap<common::statistics::DDSketch>
 ChunkStatistics::deserialize_name_duration_sketches(const std::uint8_t* data,
                                                     std::size_t len) {
-    std::unordered_map<std::string, common::statistics::DDSketch> result;
+    StringViewMap<common::statistics::DDSketch> result;
     if (!data || len < sizeof(std::uint32_t)) return result;
 
     const std::uint8_t* p = data;
diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp
new file mode 100644
index 00000000..2cad9e6b
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.cpp
@@ -0,0 +1,324 @@
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/when_all.h>
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/internal/helpers.h>
+
+#include <unordered_map>
+#include <unordered_set>
+
+namespace dftracer::utils::utilities::composites::dft::indexing {
+
+namespace {
+
+namespace rcf = dftracer::utils::rocksdb::cf;
+
+using aggregators::AGG_FILE_KEY_LEN;
+using aggregators::AGG_FILE_KEY_PREFIX;
+using aggregators::AGG_GLOBAL_CONFIG_KEY;
+using aggregators::deserialize_agg_global_config;
+using indexer::has_capability;
+using indexer::IndexDatabase;
+using indexer::IndexFileEntryCapability;
+
+struct PendingFile {
+    std::size_t file_index;    // index into ResolverResult::all_files
+    std::string file_path;     // path exactly as stored in all_files
+    std::string logical_path;  // get_logical_path(file_path); registry key
+};
+
+struct ResolveGroupInput {
+    std::string index_path;            // index shared by every entry in files
+    std::vector<PendingFile> files;    // files that resolve to index_path
+    bool require_checkpoints = false;  // demand CHECKPOINTS/FILE_SUMMARY
+    bool require_bloom = false;        // demand BLOOM capability
+    bool require_manifest = false;     // demand MANIFEST capability
+    bool require_aggregation = false;  // demand cached aggregation data
+    std::optional<aggregators::AggregationConfig> aggregation_config;
+};
+
+struct ResolveGroupOutput {
+    std::vector<FileWorkItem> needs_checkpoint;   // unregistered or unindexed
+    std::vector<FileWorkItem> needs_bloom;        // registered, lacks BLOOM
+    std::vector<FileWorkItem> needs_manifest;     // registered, lacks MANIFEST
+    std::vector<FileWorkItem> needs_aggregation;  // no cached aggregation key
+    std::vector<ResolvedFile> cached;             // nothing left to build
+    bool success = true;                          // false if resolution threw
+    std::string error_message;                    // e.what() when it threw
+
+    // Aggregation augmentation info
+    bool needs_augmentation = false;            // stored interval != requested
+    std::uint64_t stored_time_interval_us = 0;  // interval read from the index
+};
+
+ResolveGroupOutput resolve_group_sync(ResolveGroupInput input) {
+    ResolveGroupOutput result;
+    // Missing index: queue every file for checkpointing with file_id -1.
+    if (input.index_path.empty() || !fs::exists(input.index_path)) {
+        for (auto& f : input.files) {
+            result.needs_checkpoint.push_back(
+                FileWorkItem{f.file_index, std::move(f.file_path), -1});
+        }
+        return result;
+    }
+
+    try {
+        IndexDatabase db(
+            input.index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        auto registry = db.query_all_file_registry();
+
+        // Check global aggregation config first
+        bool agg_config_compatible = false;
+        if (input.require_aggregation && input.aggregation_config) {
+            std::string global_config_val;
+            auto status =
+                db.db()->get(std::string_view(AGG_GLOBAL_CONFIG_KEY, 2),
+                             &global_config_val, rcf::AGGREGATION);
+            if (status.ok() && !global_config_val.empty()) {
+                auto global_cfg =
+                    deserialize_agg_global_config(global_config_val);
+                // config_hash == 0 means "any config is compatible"
+                // Otherwise recompute hash with stored time_interval to check
+                bool hashes_match = global_cfg.config_hash == 0;
+                if (!hashes_match) {
+                    auto check_config = *input.aggregation_config;
+                    check_config.time_interval_us = global_cfg.time_interval_us;
+                    hashes_match =
+                        check_config.compute_hash() == global_cfg.config_hash;
+                }
+                if (hashes_match) {
+                    agg_config_compatible = true;
+                    result.stored_time_interval_us =
+                        global_cfg.time_interval_us;
+                    if (global_cfg.time_interval_us !=
+                        input.aggregation_config->time_interval_us) {
+                        result.needs_augmentation = true;
+                    }
+                }
+            }
+        }
+
+        // Build set of file_ids with aggregation data (key existence = cached)
+        std::unordered_set<std::int32_t> agg_cached_file_ids;
+        if (input.require_aggregation && agg_config_compatible) {
+            auto iter = db.db()->new_iterator(rcf::AGGREGATION);
+            if (iter) {
+                iter->Seek(AGG_FILE_KEY_PREFIX);
+                while (iter->Valid()) {
+                    auto key = iter->key();
+                    if (key.size() < AGG_FILE_KEY_LEN ||
+                        key[0] != AGG_FILE_KEY_PREFIX[0] ||
+                        key[1] != AGG_FILE_KEY_PREFIX[1]) {
+                        break;  // not a per-file key: stop scanning
+                    }
+                    std::int32_t file_id =  // big-endian key bytes 2..5
+                        (static_cast<std::int32_t>(
+                             static_cast<std::uint8_t>(key[2]))
+                         << 24) |
+                        (static_cast<std::int32_t>(
+                             static_cast<std::uint8_t>(key[3]))
+                         << 16) |
+                        (static_cast<std::int32_t>(
+                             static_cast<std::uint8_t>(key[4]))
+                         << 8) |
+                        static_cast<std::int32_t>(
+                            static_cast<std::uint8_t>(key[5]));
+                    agg_cached_file_ids.insert(file_id);
+                    iter->Next();
+                }
+            }
+        }
+        // Route each file to the first requirement it fails, else `cached`.
+        for (auto& f : input.files) {
+            auto reg_it = registry.find(f.logical_path);
+            if (reg_it == registry.end()) {
+                result.needs_checkpoint.push_back(
+                    FileWorkItem{f.file_index, std::move(f.file_path), -1});
+                continue;
+            }
+
+            const auto& reg = reg_it->second;
+            auto caps = reg.capabilities;
+            bool has_checkpoints =
+                has_capability(caps, IndexFileEntryCapability::CHECKPOINTS) ||
+                has_capability(caps, IndexFileEntryCapability::FILE_SUMMARY);
+            bool has_bloom =
+                has_capability(caps, IndexFileEntryCapability::BLOOM);
+            bool has_manifest =
+                has_capability(caps, IndexFileEntryCapability::MANIFEST);
+
+            if (input.require_checkpoints && !has_checkpoints) {
+                result.needs_checkpoint.push_back(FileWorkItem{
+                    f.file_index, std::move(f.file_path), reg.file_id});
+                continue;
+            }
+
+            if (input.require_bloom && !has_bloom) {
+                result.needs_bloom.push_back(FileWorkItem{
+                    f.file_index, std::move(f.file_path), reg.file_id});
+                continue;
+            }
+
+            if (input.require_manifest && !has_manifest) {
+                result.needs_manifest.push_back(FileWorkItem{
+                    f.file_index, std::move(f.file_path), reg.file_id});
+                continue;
+            }
+
+            if (input.require_aggregation &&
+                agg_cached_file_ids.find(reg.file_id) ==
+                    agg_cached_file_ids.end()) {
+                result.needs_aggregation.push_back(FileWorkItem{
+                    f.file_index, std::move(f.file_path), reg.file_id});
+                continue;
+            }
+
+            result.cached.push_back(ResolvedFile{
+                f.file_index, std::move(f.file_path), reg.file_id, caps});
+        }
+
+        result.success = true;
+    } catch (const std::exception& e) {
+        result.success = false;
+        result.error_message = e.what();
+        for (auto& f : input.files) {  // NOTE: paths may already be moved-from
+            result.needs_checkpoint.push_back(
+                FileWorkItem{f.file_index, std::move(f.file_path), -1});
+        }
+    }
+
+    return result;
+}
+
+}  // namespace
+
+// Resolves which requested files still need index work: gathers the file
+// list, groups it by index path, resolves each group (spawned on the context
+// when one is set and there are several groups) and merges the outcomes.
+coro::CoroTask<ResolverResult> IndexResolverUtility::process(
+    const ResolverInput& input) {
+    ResolverResult result;
+
+    if (!input.directory.empty()) {
+        filesystem::PatternDirectoryScannerUtilityInput scan_input{
+            input.directory, {".pfw", ".pfw.gz"}, false};
+        std::vector<filesystem::FileEntry> matched;
+        if (this->has_context()) {
+            matched = co_await this->context().spawn(scanner_, scan_input);
+        } else {
+            matched = co_await scanner_.process(scan_input);
+        }
+        result.all_files.reserve(matched.size());
+        result.all_file_sizes.reserve(matched.size());
+        for (const auto& entry : matched) {
+            result.all_files.push_back(entry.path.string());
+            result.all_file_sizes.push_back(entry.size);
+        }
+    } else {
+        result.all_files = input.files;
+        result.all_file_sizes.assign(input.files.size(), 0);
+        for (std::size_t i = 0; i < input.files.size(); ++i) {
+            std::error_code ec;
+            auto sz = fs::file_size(input.files[i], ec);
+            if (!ec) result.all_file_sizes[i] = static_cast<std::size_t>(sz);
+        }
+    }
+
+    if (result.all_files.empty()) {
+        co_return result;
+    }
+
+    result.index_path = internal::determine_index_path(result.all_files.front(),
+                                                       input.index_dir);
+
+    // Group files by index path and prepare for resolution
+    std::unordered_map<std::string, std::vector<PendingFile>> groups;
+    for (std::size_t i = 0; i < result.all_files.size(); ++i) {
+        const auto& file_path = result.all_files[i];
+        auto idx_path =
+            internal::determine_index_path(file_path, input.index_dir);
+        auto logical = indexer::internal::get_logical_path(file_path);
+        groups[idx_path].push_back(
+            PendingFile{i, file_path, std::move(logical)});
+    }
+
+    // Builds one group's request; `files` is moved out of `groups`.
+    auto make_group_input = [&input](const std::string& idx_path,
+                                     std::vector<PendingFile>& files) {
+        ResolveGroupInput group_input;
+        group_input.index_path = idx_path;
+        group_input.files = std::move(files);
+        group_input.require_checkpoints = input.require_checkpoints;
+        group_input.require_bloom = input.require_bloom;
+        group_input.require_manifest = input.require_manifest;
+        group_input.require_aggregation = input.require_aggregation;
+        group_input.aggregation_config = input.aggregation_config;
+        return group_input;
+    };
+
+    std::vector<ResolveGroupOutput> outputs;
+    outputs.reserve(groups.size());
+
+    if (this->has_context() && groups.size() > 1) {
+        std::vector<coro::SpawnFuture<ResolveGroupOutput>> futures;
+        futures.reserve(groups.size());
+
+        for (auto& [idx_path, files] : groups) {
+            futures.push_back(this->context().spawn(
+                [gi = make_group_input(idx_path, files)](
+                    CoroScope&) mutable -> coro::CoroTask<ResolveGroupOutput> {
+                    co_return resolve_group_sync(std::move(gi));
+                }));
+        }
+
+        for (auto& f : futures) {
+            outputs.push_back(co_await f);
+        }
+    } else {
+        for (auto& [idx_path, files] : groups) {
+            outputs.push_back(
+                resolve_group_sync(make_group_input(idx_path, files)));
+        }
+    }
+
+    // Merge results
+    for (auto& out : outputs) {
+        for (auto& item : out.needs_checkpoint) {
+            result.needs_checkpoint.push_back(std::move(item));
+        }
+        for (auto& item : out.needs_bloom) {
+            result.needs_bloom.push_back(std::move(item));
+        }
+        for (auto& item : out.needs_manifest) {
+            result.needs_manifest.push_back(std::move(item));
+        }
+        for (auto& item : out.needs_aggregation) {
+            result.needs_aggregation.push_back(std::move(item));
+        }
+        for (auto& item : out.cached) {
+            result.cached.push_back(std::move(item));
+        }
+        // Merge augmentation info (all groups should have same global config)
+        if (out.needs_augmentation) {
+            result.needs_augmentation = true;
+            result.stored_time_interval_us = out.stored_time_interval_us;
+        }
+    }
+
+    DFTRACER_UTILS_LOG_INFO(
+        "Resolver: %zu total, %zu cached, %zu need checkpoint, %zu need bloom, "
+        "%zu need manifest, %zu need aggregation",
+        result.all_files.size(), result.cached.size(),
+        result.needs_checkpoint.size(), result.needs_bloom.size(),
+        result.needs_manifest.size(), result.needs_aggregation.size());
+
+    co_return result;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::indexing
diff --git a/src/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp b/src/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp
new file mode 100644
index 00000000..fb8e65a3
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.cpp
@@ -0,0 +1,214 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/platform_compat.h>
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/resolve_and_build.h>
+#include <dftracer/utils/utilities/indexer/index_builder_utility.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/internal/helpers.h>
+
+#include <cstring>
+#include <set>
+
+namespace dftracer::utils::utilities::composites::dft::indexing {
+
+using aggregators::AGG_GLOBAL_CONFIG_KEY;
+using aggregators::AggGlobalConfig;
+using aggregators::AggregationVisitor;
+using aggregators::EventAggregator;
+using aggregators::serialize_agg_global_config;
+using indexer::internal::get_logical_path;
+// Resolves index state for the inputs, builds what is missing, then refreshes.
+coro::CoroTask<ResolverResult> resolve_and_build_index(
+    CoroScope* scope, ResolveAndBuildInput input) {
+    // A parallelism of 0 selects the detected hardware concurrency.
+    std::size_t parallelism = input.parallelism;
+    if (parallelism == 0) {
+        parallelism = dftracer_utils_hardware_concurrency();
+    }
+
+    // Initial resolve; note that directory and files are moved out of `input`.
+    IndexResolverUtility resolver;
+    ResolverInput resolve_input;
+    resolve_input.directory = std::move(input.directory);
+    resolve_input.files = std::move(input.files);
+    resolve_input.index_dir = input.index_dir;
+    resolve_input.require_checkpoints = input.require_checkpoints;
+    resolve_input.require_bloom = input.require_bloom;
+    resolve_input.require_manifest = input.require_manifest;
+    resolve_input.require_aggregation = input.require_aggregation;
+    resolve_input.aggregation_config = input.aggregation_config;
+
+    auto result = co_await resolver.process(resolve_input);
+
+    if (result.all_files.empty()) {
+        co_return result;  // nothing resolved, nothing to build
+    }
+
+    // Collect files that need work (checkpoint or aggregation)
+    // When force_rebuild is set, process all files
+    std::vector<std::string> files_needing_work;
+    if (input.force_rebuild) {
+        // all_files is already a vector of strings
+        files_needing_work = result.all_files;
+    } else {
+        std::set<std::string> files_needing_work_set;  // sorted, de-duplicated
+        for (const auto& item : result.needs_checkpoint) {
+            files_needing_work_set.insert(item.file_path);
+        }
+        for (const auto& item : result.needs_aggregation) {
+            files_needing_work_set.insert(item.file_path);
+        }
+        files_needing_work.assign(files_needing_work_set.begin(),
+                                  files_needing_work_set.end());
+    }
+
+    if (!files_needing_work.empty()) {
+        DFTRACER_UTILS_LOG_INFO(
+            "Building index for %zu files (checkpoint: %zu, aggregation: %zu)",
+            files_needing_work.size(), result.needs_checkpoint.size(),
+            result.needs_aggregation.size());
+
+        // Set up aggregation components if needed (all stay null otherwise)
+        std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> agg_db;
+        std::unique_ptr<EventAggregator> merger;
+        std::shared_ptr<aggregators::AggregationConfig> agg_config_ptr;
+
+        if (input.require_aggregation && input.aggregation_config) {
+            agg_db =
+                EventAggregator::open_with_merge_operator(result.index_path);
+            merger = std::make_unique<EventAggregator>(agg_db, 0);
+            agg_config_ptr = std::make_shared<aggregators::AggregationConfig>(
+                *input.aggregation_config);
+        }
+
+        auto batch_config = std::make_shared<indexer::IndexBuildBatchConfig>();
+        batch_config->file_paths = std::move(files_needing_work);
+        batch_config->index_dir = input.index_dir;
+        batch_config->checkpoint_size = input.checkpoint_size;
+        batch_config->parallelism = parallelism;
+        batch_config->force_rebuild = input.force_rebuild;
+        batch_config->build_manifest = input.require_manifest;
+        batch_config->use_batch_write = true;
+        batch_config->rebuild_root_summaries = true;
+
+        // Attach one AggregationVisitor per file if aggregation is required
+        if (agg_db && agg_config_ptr) {
+            batch_config->dft_visitor_factory =
+                [agg_db, agg_config_ptr](const std::string& file_path)
+                -> std::vector<
+                    std::unique_ptr<composites::dft::DftEventVisitor>> {
+                std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>
+                    visitors;
+                visitors.push_back(std::make_unique<AggregationVisitor>(
+                    agg_db, 0, *agg_config_ptr, file_path));
+                return visitors;
+            };
+        }
+
+        auto batch_result = co_await indexer::IndexBatchBuilderUtility::process(
+            scope, std::move(batch_config));
+
+        // Drain visitors and merge aggregation results
+        std::vector<std::string> processed_files;
+        if (merger) {
+            for (auto& file_visitors : batch_result.extra_visitors) {
+                for (auto& visitor : file_visitors) {
+                    auto* agg_visitor =
+                        dynamic_cast<AggregationVisitor*>(visitor.get());
+                    if (agg_visitor) {
+                        for (const auto& k : agg_visitor->observed_extra_keys())
+                            merger->add_observed_extra_key(k);
+                        for (const auto& m :
+                             agg_visitor->observed_custom_metrics())
+                            merger->add_observed_custom_metric(m);
+                        auto output = agg_visitor->take_output();
+                        processed_files.push_back(output.file_path);
+                        merger->merge_chunk(std::move(output));
+                    }
+                }
+                file_visitors.clear();  // release the drained visitors
+            }
+
+            // Write global config and per-file markers
+            if (!processed_files.empty()) {
+                namespace rcf = dftracer::utils::rocksdb::cf;
+                indexer::IndexDatabase idx_db(
+                    result.index_path, dftracer::utils::rocksdb::RocksDatabase::
+                                           OpenMode::ReadOnly);
+
+                auto batch = agg_db->begin_batch();
+
+                // Write global config (0xFFFE key)
+                AggGlobalConfig global_cfg;
+                global_cfg.time_interval_us =
+                    input.aggregation_config->time_interval_us;
+                global_cfg.config_hash = 0;
+                agg_db->put(batch, rcf::AGGREGATION,
+                            std::string_view(AGG_GLOBAL_CONFIG_KEY, 2),
+                            serialize_agg_global_config(global_cfg));
+                // Per-file marker key: 0xFF 0xFF, then the 32-bit file id
+                // byte-swapped via __builtin_bswap32; the value is empty.
+                for (const auto& file_path : processed_files) {
+                    int file_id =
+                        idx_db.get_file_info_id(get_logical_path(file_path));
+                    if (file_id >= 0) {  // negative ids get no marker
+                        char marker_key[6];
+                        marker_key[0] = '\xFF';
+                        marker_key[1] = '\xFF';
+                        auto fid_u32 = static_cast<std::uint32_t>(file_id);
+                        std::uint32_t fid_be = __builtin_bswap32(fid_u32);
+                        std::memcpy(&marker_key[2], &fid_be, 4);
+                        agg_db->put(batch, rcf::AGGREGATION,
+                                    std::string_view(marker_key, 6),
+                                    std::string_view());
+                    }
+                }
+
+                agg_db->commit_batch(batch);
+
+                // Compact aggregation CFs so all Merge entries become Puts.
+                // This allows concurrent ReadOnly access without merge
+                // operators.
+                agg_db->compact(rcf::AGGREGATION);
+                agg_db->compact(rcf::SYSTEM_METRICS);
+            }
+        }
+
+        // Re-resolve the files that needed checkpoints to get their file_ids
+        ResolverInput refresh_input;
+        refresh_input.files.reserve(result.needs_checkpoint.size());
+        for (const auto& item : result.needs_checkpoint) {
+            refresh_input.files.push_back(item.file_path);
+        }
+        refresh_input.index_dir = input.index_dir;
+        refresh_input.require_checkpoints = true;
+
+        if (!refresh_input.files.empty()) {
+            auto refresh_result = co_await resolver.process(refresh_input);
+
+            // Merge newly indexed into cached
+            for (auto& entry : refresh_result.cached) {
+                result.cached.push_back(std::move(entry));
+            }
+
+            // Update needs_checkpoint with any that still failed
+            result.needs_checkpoint =
+                std::move(refresh_result.needs_checkpoint);
+        }
+
+        // Clear needs_aggregation since we processed them
+        result.needs_aggregation.clear();
+    }
+
+    DFTRACER_UTILS_LOG_INFO(
+        "Resolve complete: %zu total, %zu cached, %zu failed checkpoint",
+        result.all_files.size(), result.cached.size(),
+        result.needs_checkpoint.size());
+
+    co_return result;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::indexing
diff --git a/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp b/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp
index 150a61d8..c5865387 100644
--- a/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/internal/utils.cpp
@@ -1,6 +1,7 @@
 #include <dftracer/utils/core/common/constants.h>
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/indexer/internal/helpers.h>
 
 #include <functional>
 #include <sstream>
@@ -8,12 +9,14 @@
 
 namespace dftracer::utils::utilities::composites::dft::internal {
 
-std::string determine_index_path(const std::string& file_path,
+std::string determine_index_path(const std::string& path,
                                  const std::string& index_dir) {
-    fs::path data_path(file_path);
-    fs::path root =
-        index_dir.empty() ? data_path.parent_path() : fs::path(index_dir);
-    return (root / ".dftindex").string();
+    // Default root: `path` itself when it is a directory, else its parent.
+    const fs::path target(path);
+    fs::path root(index_dir);
+    if (index_dir.empty())
+        root = fs::is_directory(target) ? target : target.parent_path();
+    return indexer::internal::normalize_index_root(root.string());
 }
 
 std::string determine_provenance_index_path(const std::string& data_path,
diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp
new file mode 100644
index 00000000..6a2a44f1
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.cpp
@@ -0,0 +1,852 @@
+#include <dftracer/utils/core/common/byte_view.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/pipeline/executor.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/group_writer_task.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h>
+#include <dftracer/utils/utilities/compression/zlib/streaming_compressor_utility.h>
+#include <dftracer/utils/utilities/fileio/chunk_writer.h>
+#include <dftracer/utils/utilities/fileio/parallel/layout.h>
+#include <dftracer/utils/utilities/fileio/parallel/merge.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/checkpoint.h>
+
+#include <algorithm>
+// #include <cstdlib>  // re-enable with the DFT_MOCK_PADDED_STRIPE_BYTES block
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <vector>
+
+namespace dftracer::utils::utilities::composites::dft::reorganize {
+
+namespace {
+
+constexpr std::size_t DEFAULT_FLUSH_BYTES = std::size_t{32} << 20;  // 32 MiB
+constexpr std::size_t BUFFER_HEADROOM_BYTES = std::size_t{1} << 20;  // 1 MiB
+
+coro::CoroTask<void> compress_to_gzip_member(int level, ByteView data,
+                                             std::vector<unsigned char>& out) {
+    // Replaces the contents of `out` with `data` compressed in GZIP format.
+    out.clear();
+    const auto append = [&out](auto&& span) {
+        const auto* first =
+            reinterpret_cast<const unsigned char*>(span.data());
+        out.insert(out.end(), first, first + span.size());
+    };
+    compression::zlib::ManualStreamingCompressorUtility encoder(
+        level, compression::zlib::CompressionFormat::GZIP);
+    if (data.size() > 0) {
+        auto body = encoder.compress(data);
+        while (auto piece = co_await body.next()) append(*piece);
+    }
+    // The stream is finalized even when `data` is empty.
+    auto tail = encoder.finalize_stream();
+    while (auto piece = co_await tail.next()) append(*piece);
+    co_return;
+}
+
+std::string make_chunk_path(const std::string& dir, int index, bool compress) {
+    std::string out = dir + "/chunk_chunk" + std::to_string(index) + ".pfw";
+    return compress ? out + ".gz" : out;  // gzip output gets a ".gz" suffix
+}
+
+struct WorkerBuf {  // staging state of one writer worker
+    std::vector<char> payload;  // '\n'-terminated lines awaiting dispatch
+    std::size_t lines_in_flush{0};  // lines currently held in `payload`
+    std::vector<std::size_t> flush_line_counts;  // line count of each flush
+};
+
+struct PendingSegment {  // run of lines awaiting a provenance record
+    int source_file_idx;  // passed through to ProvenanceTracker::record
+    int checkpoint_idx;   // passed through to ProvenanceTracker::record
+    std::size_t worker_idx;  // selects the per-worker flush table
+    std::size_t flush_idx;   // index into that worker's flush_line_counts
+    std::size_t offset_in_flush;  // line offset inside that flush
+    std::size_t count;            // number of lines in the run
+};
+
+struct FlushTask {  // one worker buffer handed to run_flusher
+    std::vector<char> payload;        // uncompressed bytes to write
+    std::uint64_t uc_offset{0};       // uncompressed bytes dispatched before
+    std::uint64_t line_count{0};      // lines contained in `payload`
+    std::uint64_t first_line_num{0};  // lines dispatched before this task
+    std::size_t dispatch_idx{0};      // 0-based dispatch sequence number
+};
+
+struct IndexBatch {  // a written flush, as seen by run_index_feeder
+    std::shared_ptr<std::string> payload;  // copy of the uncompressed bytes
+    std::size_t dispatch_idx{0};      // FlushTask::dispatch_idx of the flush
+    std::size_t worker_idx{0};        // flusher that wrote the flush
+    std::uint64_t c_offset{0};        // offset of the writer's last member
+    std::uint64_t c_size{0};          // length of the writer's last member
+    std::uint64_t uc_offset{0};       // FlushTask::uc_offset
+    std::uint64_t uc_size{0};         // payload size in bytes
+    std::uint64_t line_count{0};      // FlushTask::line_count
+    std::uint64_t first_line_num{0};  // FlushTask::first_line_num
+};
+
+constexpr std::size_t FLUSH_CHANNEL_CAPACITY{3};  // per-worker flush channels
+constexpr std::size_t INDEX_CHANNEL_CAPACITY{8};  // inline index channel
+
+struct InlineIndexState {  // per-chunk inline indexing machinery
+    std::unique_ptr<visitors::BloomVisitor> bloom;
+    std::unique_ptr<visitors::HashTableVisitor> hash_table;
+    std::unique_ptr<visitors::ManifestVisitor> manifest;
+    std::unique_ptr<aggregators::AggregationVisitor> aggregation;  // optional
+    std::unique_ptr<DftEventDispatcher> dispatcher;  // fans out to the above
+    std::shared_ptr<coro::Channel<IndexBatch>> index_channel;  // to the feeder
+    std::vector<IndexBatch> finalized_batches;  // metadata of fed batches
+    int file_id{-1};  // id from register_files; -1 until assigned
+    std::unique_ptr<indexer::IndexDatabaseSstWriterContext> sink;
+    std::uint64_t slice_uc_bytes{0};  // bytes fed since the last visitor flush
+    std::uint64_t sink_uc_bytes{0};   // bytes accounted to the current sink
+};
+
+constexpr std::uint64_t SLICE_UC_THRESHOLD = 64ULL << 20;  // 64 MiB
+constexpr std::uint64_t SINK_UC_THRESHOLD = 1ULL << 30;    // 1 GiB
+
+struct ChunkState {  // everything needed to write one output chunk
+    std::unique_ptr<fileio::parallel::ParallelWriter> writer;
+    std::string output_path;
+    fileio::parallel::LayoutInfo layout_info;  // from detect_layout
+    std::size_t num_workers{1};
+    bool compress{false};
+    int compression_level{Z_DEFAULT_COMPRESSION};
+    std::size_t flush_threshold{0};
+    std::size_t buffer_capacity{0};  // reserve() size of worker payloads
+    std::size_t bytes_uncompressed{0};  // payload bytes appended so far
+    std::size_t events_written{0};      // lines appended so far
+    bool has_any_events{false};
+    std::vector<WorkerBuf> workers;        // one entry per worker
+    std::vector<PendingSegment> segments;  // consumed by emit_segments
+    std::vector<unsigned char> compressed_scratch;  // header/footer gzip
+    std::vector<std::shared_ptr<coro::Channel<FlushTask>>> flush_channels;
+
+    bool inline_index_enabled{false};
+    std::uint64_t inline_uc_dispatched{0};     // bytes handed to flushers
+    std::uint64_t inline_lines_dispatched{0};  // lines handed to flushers
+    std::size_t inline_dispatch_counter{0};    // next FlushTask::dispatch_idx
+    InlineIndexState inline_index;
+};
+
+coro::CoroTask<bool> run_flusher(
+    std::size_t worker_idx, std::shared_ptr<coro::Channel<FlushTask>> channel,
+    ChunkState* st) {
+    // Drains `channel` until receive() yields nothing: writes each non-empty
+    // payload via worker slot `worker_idx` and, with inline indexing on,
+    // forwards the written member's location to the index channel.
+    // Returns false on the first write, member-lookup or send failure.
+    std::vector<unsigned char> gz_buf;  // reused across tasks
+    while (auto received = co_await channel->receive()) {
+        FlushTask& job = *received;
+        if (job.payload.empty()) continue;
+        const ByteView raw(job.payload.data(), job.payload.size());
+        ByteView to_write = raw;
+        if (st->compress) {
+            gz_buf.clear();
+            co_await compress_to_gzip_member(st->compression_level, raw,
+                                             gz_buf);
+            to_write =
+                ByteView(reinterpret_cast<const char*>(gz_buf.data()),
+                         gz_buf.size());
+        }
+        const int rc = co_await st->writer->write_chunk(worker_idx, to_write);
+        if (rc != 0) co_return false;
+        if (!st->inline_index_enabled) continue;
+
+        // Describe where this flush landed so the feeder can index it.
+        auto member = st->writer->last_member(worker_idx);
+        if (!member) co_return false;
+        IndexBatch rec;
+        rec.payload = std::make_shared<std::string>(job.payload.data(),
+                                                    job.payload.size());
+        rec.dispatch_idx = job.dispatch_idx;
+        rec.worker_idx = worker_idx;
+        rec.c_offset = member->offset;
+        rec.c_size = member->length;
+        rec.uc_offset = job.uc_offset;
+        rec.uc_size = job.payload.size();
+        rec.line_count = job.line_count;
+        rec.first_line_num = job.first_line_num;
+        if (!co_await st->inline_index.index_channel->send(std::move(rec)))
+            co_return false;
+    }
+    co_return true;
+}
+// Commits the current inline sink, queues its artifacts, opens a new sink.
+void rotate_inline_sink(InlineIndexState& idx,
+                        const GroupWriterConfig& config) {
+    using SstSink = indexer::IndexDatabaseSstWriterContext;
+    auto done = idx.sink->commit();  // commit the sink being replaced
+    if (!done.empty()) config.artifacts_queue->enqueue(std::move(done));
+    std::string name = "inline_" + std::to_string(
+        config.batch_counter->fetch_add(1, std::memory_order_relaxed));
+    idx.sink = std::make_unique<SstSink>(config.staging_root, std::move(name));
+    idx.sink_uc_bytes = 0;
+}
+
+void flush_slice_visitors(InlineIndexState& idx,
+                          const GroupWriterConfig& config) {
+    // Nothing can be flushed without a sink and a non-negative file id.
+    const bool ready = idx.sink && idx.file_id >= 0;
+    if (!ready) return;
+    auto& sink = *idx.sink;
+    if (idx.bloom) idx.bloom->flush_per_checkpoint_to_sink(sink, idx.file_id);
+    if (idx.manifest)
+        idx.manifest->flush_per_checkpoint_to_sink(sink, idx.file_id);
+    // Fold the finished slice into the sink total and start a new slice.
+    idx.sink_uc_bytes += idx.slice_uc_bytes;
+    idx.slice_uc_bytes = 0;
+    // Rotate once the sink total reaches SINK_UC_THRESHOLD.
+    if (idx.sink_uc_bytes >= SINK_UC_THRESHOLD) rotate_inline_sink(idx, config);
+}
+
+coro::CoroTask<bool> run_index_feeder(ChunkState* st,
+                                      const GroupWriterConfig* config) {
+    // Dispatch each batch, keep only its metadata, flush per slice.
+    InlineIndexState& state = st->inline_index;
+    while (auto incoming = co_await state.index_channel->receive()) {
+        IndexBatch item = std::move(*incoming);
+        if (item.payload && !item.payload->empty()) {
+            std::string& text = *item.payload;
+            co_await state.dispatcher->on_chunk(text.data(), text.size(),
+                                                item.dispatch_idx);
+            co_await state.dispatcher->on_checkpoint(item.dispatch_idx);
+        }
+        state.slice_uc_bytes += item.uc_size;
+        item.payload.reset();  // drop the text; only metadata is kept
+        state.finalized_batches.push_back(std::move(item));
+        if (state.slice_uc_bytes < SLICE_UC_THRESHOLD) continue;
+        co_await state.dispatcher->flush();
+        flush_slice_visitors(state, *config);
+    }
+    co_return true;
+}
+
+coro::CoroTask<bool> dispatch_flush(ChunkState& st, std::size_t w) {
+    // Hands worker `w`'s buffered lines to its flush channel (no-op if empty).
+    WorkerBuf& buf = st.workers[w];
+    if (buf.payload.empty()) co_return true;
+    buf.flush_line_counts.push_back(buf.lines_in_flush);
+    FlushTask out;
+    out.payload = std::move(buf.payload);
+    out.line_count = buf.lines_in_flush;
+    out.uc_offset = st.inline_uc_dispatched;
+    out.first_line_num = st.inline_lines_dispatched;
+    out.dispatch_idx = st.inline_dispatch_counter++;
+    st.inline_uc_dispatched += out.payload.size();
+    st.inline_lines_dispatched += out.line_count;
+    buf.payload = std::vector<char>();  // fresh buffer for the next flush
+    buf.payload.reserve(st.buffer_capacity);
+    buf.lines_in_flush = 0;
+    co_return co_await st.flush_channels[w]->send(std::move(out));
+}
+
+coro::CoroTask<bool> write_section(ChunkState& st, ByteView data,
+                                   bool is_footer) {
+    // Writes `data` as the chunk footer (is_footer) or header, running it
+    // through compress_to_gzip_member first when the chunk is compressed.
+    ByteView bytes = data;
+    if (st.compress) {
+        auto& gz = st.compressed_scratch;
+        co_await compress_to_gzip_member(st.compression_level, data, gz);
+        bytes = ByteView(reinterpret_cast<const char*>(gz.data()), gz.size());
+    }
+    const int rc = is_footer ? co_await st.writer->write_footer(bytes)
+                             : co_await st.writer->write_header(bytes);
+    co_return rc == 0;
+}
+// Prepares `st` for a new output chunk at `path`; false if open/header fails.
+coro::CoroTask<bool> open_chunk(ChunkState& st, const std::string& path,
+                                bool compress, int compression_level,
+                                std::size_t chunk_size_bytes,
+                                std::size_t baseline_workers, CoroScope* scope,
+                                const GroupWriterConfig& config) {
+    st.output_path = path;
+    st.compress = compress;
+    st.compression_level = compression_level;
+    st.bytes_uncompressed = 0;
+    st.events_written = 0;
+    st.has_any_events = false;
+    st.segments.clear();
+    // Layout reported for `path`; it selects the writer strategy below.
+    st.layout_info = fileio::parallel::detect_layout(path);
+
+    // Local validation aid: force padded-striped layout when no Lustre is
+    // available. Uncomment to exercise the padded-striped writer path.
+    // if (const char* mock = std::getenv("DFT_MOCK_PADDED_STRIPE_BYTES")) {
+    //     const auto sz =
+    //         static_cast<std::size_t>(std::strtoull(mock, nullptr, 10));
+    //     if (sz >= fileio::parallel::MIN_PADDED_STRIPE_BYTES) {
+    //         st.layout_info.layout = fileio::parallel::FileLayout::STRIPED;
+    //         st.layout_info.stripe_size = sz;
+    //     }
+    // }
+
+    if (st.layout_info.layout == fileio::parallel::FileLayout::STRIPED &&
+        st.layout_info.stripe_size == 0) {  // no stripe size: use SHARDED
+        st.layout_info.layout = fileio::parallel::FileLayout::SHARDED;
+    }
+
+    const bool uses_padded =
+        st.layout_info.layout == fileio::parallel::FileLayout::STRIPED &&
+        compress &&  // padded striping is only used for compressed output
+        st.layout_info.stripe_size >= fileio::parallel::MIN_PADDED_STRIPE_BYTES;
+    const bool uses_sharded =
+        st.layout_info.layout == fileio::parallel::FileLayout::SHARDED;
+    // Plain striped writes at an atomic offset so cross-worker order is
+    // non-deterministic; keep one worker so we can resolve absolute line
+    // numbers. Padded-striped and sharded layouts expose deterministic
+    // worker slotting so we can fan out.
+    const std::size_t effective_baseline =
+        (uses_padded || uses_sharded)
+            ? std::max<std::size_t>(baseline_workers, 1)
+            : 1;
+    const auto sizing = fileio::parallel::compute_writer_sizing(
+        st.layout_info, effective_baseline, DEFAULT_FLUSH_BYTES,
+        BUFFER_HEADROOM_BYTES, uses_padded);
+    st.num_workers = sizing.num_workers;
+    st.flush_threshold = sizing.flush_threshold;
+    st.buffer_capacity = sizing.buffer_capacity;
+    if (chunk_size_bytes > 0 && st.flush_threshold > chunk_size_bytes) {
+        st.flush_threshold = chunk_size_bytes;  // cap at the chunk size
+    }
+    // One empty buffer per worker, pre-reserved to buffer_capacity.
+    st.workers.clear();
+    st.workers.resize(st.num_workers);
+    for (auto& w : st.workers) {
+        w.payload.reserve(st.buffer_capacity);
+    }
+
+    fileio::parallel::WriterConfig wcfg;
+    wcfg.layout = st.layout_info.layout;
+    wcfg.stripe_size = st.layout_info.stripe_size;
+    wcfg.gzip = compress;
+    st.writer = fileio::parallel::make_writer(wcfg);
+
+    if (co_await st.writer->open(path, st.num_workers, compress, scope) != 0) {
+        co_return false;
+    }
+    // Inline indexing is enabled only when every condition below holds.
+    st.inline_index_enabled =
+        !config.index_dir.empty() && config.artifacts_queue &&
+        config.batch_counter &&
+        (st.layout_info.layout == fileio::parallel::FileLayout::STRIPED ||
+         st.layout_info.layout == fileio::parallel::FileLayout::SHARDED);
+    st.inline_uc_dispatched = 0;  // dispatch accounting restarts per chunk
+    st.inline_lines_dispatched = 0;
+    st.inline_dispatch_counter = 0;
+    if (st.inline_index_enabled) {
+        st.inline_index = InlineIndexState{};  // start from a clean state
+        st.inline_index.bloom = std::make_unique<visitors::BloomVisitor>(
+            config.bloom_config, config.bloom_dimensions);
+        st.inline_index.hash_table =
+            std::make_unique<visitors::HashTableVisitor>();
+        st.inline_index.manifest =
+            std::make_unique<visitors::ManifestVisitor>();
+        if (config.with_aggregation) {
+            aggregators::AggregationConfig agg_cfg;
+            agg_cfg.time_interval_us =
+                static_cast<std::uint64_t>(config.agg_time_interval_us);
+            agg_cfg.compute_statistics = true;
+            agg_cfg.track_process_parents = true;
+            agg_cfg.track_default_args = true;
+            const std::size_t batch_idx =
+                config.batch_counter->fetch_add(1, std::memory_order_relaxed);
+            st.inline_index.aggregation =
+                std::make_unique<aggregators::AggregationVisitor>(
+                    config.staging_root, "agg_" + std::to_string(batch_idx),
+                    /*config_hash=*/0u, agg_cfg, path);
+        }
+        DftEventDispatcher::VisitorList visitors;
+        visitors.emplace_back(*st.inline_index.bloom);
+        visitors.emplace_back(*st.inline_index.hash_table);
+        visitors.emplace_back(*st.inline_index.manifest);
+        if (st.inline_index.aggregation) {  // set only when with_aggregation
+            visitors.emplace_back(*st.inline_index.aggregation);
+        }
+        st.inline_index.dispatcher = std::make_unique<DftEventDispatcher>(
+            std::move(visitors), /*force_serial=*/true);
+        st.inline_index.dispatcher->begin(0);
+        st.inline_index.index_channel =
+            coro::make_channel<IndexBatch>(INDEX_CHANNEL_CAPACITY);
+    }
+    // Every chunk starts with the two-byte header "[\n".
+    const char header[] = "[\n";
+    if (!co_await write_section(
+            st, ByteView(reinterpret_cast<const char*>(header), 2), false)) {
+        co_return false;
+    }
+    co_return true;
+}
+
+void emit_segments(const ChunkState& st, int chunk_idx,
+                   ProvenanceTracker& prov) {
+    // Records each pending segment in `prov` with absolute chunk line numbers.
+    const std::size_t n = st.num_workers;
+    // first_line[w][k]: absolute line index where worker w's k-th flush starts.
+    std::vector<std::vector<std::size_t>> first_line(n);
+    std::size_t deepest = 0;
+    for (std::size_t w = 0; w < n; ++w) {
+        const std::size_t flushes = st.workers[w].flush_line_counts.size();
+        first_line[w].assign(flushes, 0);
+        deepest = std::max(deepest, flushes);
+    }
+    std::size_t running = 0;
+    const auto place = [&](std::size_t w, std::size_t k) {
+        first_line[w][k] = running;
+        running += st.workers[w].flush_line_counts[k];
+    };
+    const bool interleaved =
+        st.layout_info.layout == fileio::parallel::FileLayout::STRIPED && n > 1;
+    if (interleaved) {
+        // Flush k of every worker precedes flush k + 1 of any worker.
+        for (std::size_t k = 0; k < deepest; ++k) {
+            for (std::size_t w = 0; w < n; ++w) {
+                if (k < first_line[w].size()) place(w, k);
+            }
+        }
+    } else {
+        // All flushes of worker w precede those of worker w + 1.
+        for (std::size_t w = 0; w < n; ++w) {
+            for (std::size_t k = 0; k < first_line[w].size(); ++k) place(w, k);
+        }
+    }
+    for (const PendingSegment& seg : st.segments) {
+        const std::size_t first =
+            first_line[seg.worker_idx][seg.flush_idx] + seg.offset_in_flush;
+        const std::size_t last = first + seg.count - 1;
+        prov.record(seg.source_file_idx, seg.checkpoint_idx, chunk_idx,
+                    static_cast<int>(first), static_cast<int>(last),
+                    static_cast<int>(seg.count));
+    }
+}
+
+coro::CoroTask<bool> dispatch_flush_all(ChunkState& st) {
+    // Flush workers in index order, stopping at the first failed dispatch.
+    std::size_t w = 0;
+    while (w < st.num_workers && co_await dispatch_flush(st, w)) ++w;
+    co_return w >= st.num_workers;
+}
+
+// Buffers `line` followed by '\n' in worker w's payload and bumps the line,
+// byte and event counters. Never flushes; the caller picks the flush point.
+void append_line(ChunkState& st, std::size_t w, ByteView line) {
+    const char* first = line.as<char>();
+    WorkerBuf& slot = st.workers[w];
+    slot.payload.insert(slot.payload.end(), first, first + line.size());
+    slot.payload.push_back('\n');
+    ++slot.lines_in_flush;
+    ++st.events_written;
+    st.bytes_uncompressed += line.size() + 1;
+    st.has_any_events = true;
+}
+
+}  // namespace
+
+// Drains `config.input_channel` into rotating chunk files for one group.
+// Each chunk is finalized (footer, close, shard merge, optional inline
+// index) by its own spawned task; at most MAX_IN_FLIGHT_CHUNKS run at once.
+// A batch interrupted by a size-triggered rotation resumes in the next chunk.
+// Failures are caught and reported through `success` / `error_message`.
+coro::CoroTask<GroupWriterResult> run_group_writer(CoroScope* scope,
+                                                   GroupWriterConfig config) {
+    auto result = std::make_unique<GroupWriterResult>();
+    result->group_name = config.group_name;
+
+    try {
+        if (!scope) throw std::runtime_error("CoroScope is required");
+        std::string group_output_dir =
+            config.output_dir + "/" + config.group_name;
+        fs::create_directories(group_output_dir);
+
+        auto provenance = std::make_unique<ProvenanceTracker>();
+
+        int current_chunk_idx = 0;
+        std::vector<fileio::ChunkInfo> chunks_info;
+        std::unique_ptr<indexer::IndexDatabase> coord_db;
+        const bool inline_index_active = !config.index_dir.empty() &&
+                                         config.artifacts_queue &&
+                                         config.batch_counter;
+        if (inline_index_active) {
+            coord_db =
+                std::make_unique<indexer::IndexDatabase>(config.index_dir);
+            coord_db->init_schema();
+        }
+
+        const std::size_t baseline_workers =
+            (scope && scope->get_executor())
+                ? scope->get_executor()->get_num_threads()
+                : 1;
+
+        auto open_inline_sink = [&](ChunkState& cs, const std::string& path) {
+            if (!cs.inline_index_enabled) return;
+            // Sharded writers don't materialize the merged path until
+            // finalize_chunk runs `merge_shards`. Touch it so register_files
+            // can stat/hash it now.
+            if (!fs::exists(path)) {
+                std::ofstream(path).close();
+            }
+            std::vector<int> ids =
+                coord_db->register_files({path}, /*build_manifest=*/true);
+            cs.inline_index.file_id = ids.empty() ? -1 : ids.front();
+            const auto idx =
+                config.batch_counter->fetch_add(1, std::memory_order_relaxed);
+            cs.inline_index.sink =
+                std::make_unique<indexer::IndexDatabaseSstWriterContext>(
+                    config.staging_root, "inline_" + std::to_string(idx));
+            cs.inline_index.slice_uc_bytes = 0;
+            cs.inline_index.sink_uc_bytes = 0;
+        };
+        // `sem` is pre-filled below with one token per chunk allowed in flight.
+        constexpr std::size_t MAX_IN_FLIGHT_CHUNKS = 4;
+        auto sync_mutex = std::make_shared<std::mutex>();
+        auto inline_indexed_flag = std::make_shared<std::atomic<bool>>(false);
+        auto sem = coro::make_channel<int>(MAX_IN_FLIGHT_CHUNKS);
+
+        co_await scope->scope([&](CoroScope& group_scope)
+                                  -> coro::CoroTask<void> {
+            for (std::size_t i = 0; i < MAX_IN_FLIGHT_CHUNKS; ++i) {
+                co_await sem->send(0);
+            }
+
+            auto cs = std::make_shared<ChunkState>();
+            {
+                const auto path = make_chunk_path(
+                    group_output_dir, current_chunk_idx, config.compress);
+                if (!co_await open_chunk(
+                        *cs, path, config.compress, config.compression_level,
+                        config.chunk_size_bytes, baseline_workers, &group_scope,
+                        config)) {
+                    throw std::runtime_error("Failed to open initial chunk");
+                }
+                open_inline_sink(*cs, path);
+            }
+            // Current input batch; it outlives a rotation so its tail resumes.
+            auto batch_opt = co_await config.input_channel->receive();
+            std::size_t resume_idx = 0;  // first unwritten line of batch_opt
+            bool input_eof = false;
+            while (!input_eof) {
+                cs->flush_channels.clear();
+                cs->flush_channels.reserve(cs->num_workers);
+                for (std::size_t i = 0; i < cs->num_workers; ++i) {
+                    cs->flush_channels.push_back(
+                        coro::make_channel<FlushTask>(FLUSH_CHANNEL_CAPACITY));
+                }
+
+                co_await sem->receive();
+
+                const int captured_chunk_idx = current_chunk_idx;
+                const GroupWriterConfig* cfg_ptr = &config;
+                ProvenanceTracker* prov_ptr = provenance.get();
+                std::vector<fileio::ChunkInfo>* chunks_info_ptr = &chunks_info;
+                GroupWriterResult* result_raw = result.get();
+                auto sync_mtx = sync_mutex;
+                auto indexed_flag = inline_indexed_flag;
+                auto sem_release = sem;
+                // Per-chunk finalizer, spawned before any line is appended.
+                group_scope.spawn([cs, captured_chunk_idx, cfg_ptr, prov_ptr,
+                                   chunks_info_ptr, result_raw, sync_mtx,
+                                   indexed_flag,
+                                   sem_release](CoroScope& orch_scope)
+                                      -> coro::CoroTask<void> {
+                    ChunkState* cs_p = cs.get();
+                    co_await orch_scope.scope([cs_p,
+                                               cfg_ptr](CoroScope& work_scope)
+                                                  -> coro::CoroTask<void> {
+                        if (cs_p->inline_index_enabled) {
+                            work_scope.spawn(
+                                [cs_p,
+                                 cfg_ptr](CoroScope&) -> coro::CoroTask<void> {
+                                    co_await run_index_feeder(cs_p, cfg_ptr);
+                                });
+                        }
+                        co_await work_scope.scope([cs_p](CoroScope& flush_scope)
+                                                      -> coro::CoroTask<void> {
+                            for (std::size_t i = 0; i < cs_p->num_workers;
+                                 ++i) {
+                                auto ch = cs_p->flush_channels[i];
+                                flush_scope.spawn(
+                                    [i, ch,
+                                     cs_p](CoroScope&) -> coro::CoroTask<void> {
+                                        co_await run_flusher(i, ch, cs_p);
+                                    });
+                            }
+                            co_return;
+                        });
+                        if (cs_p->inline_index_enabled &&
+                            cs_p->inline_index.index_channel) {
+                            cs_p->inline_index.index_channel->close();
+                        }
+                    });
+
+                    if (cs_p->inline_index_enabled) {
+                        co_await cs_p->inline_index.dispatcher->flush();
+                    }
+
+                    {
+                        std::lock_guard<std::mutex> lk(*sync_mtx);
+                        emit_segments(*cs_p, captured_chunk_idx, *prov_ptr);
+                    }
+                    cs_p->segments.clear();
+
+                    if (!co_await write_section(*cs_p, ByteView("]\n", 2),
+                                                true)) {
+                        throw std::runtime_error("Failed to write footer");
+                    }
+                    if (co_await cs_p->writer->close() != 0) {
+                        throw std::runtime_error("Failed to close writer");
+                    }
+                    if (cs_p->inline_index_enabled) {
+                        auto bases = cs_p->writer->shard_base_offsets();
+                        for (auto& b : cs_p->inline_index.finalized_batches) {
+                            if (b.worker_idx < bases.size()) {
+                                b.c_offset += bases[b.worker_idx];
+                            }
+                        }
+                    }
+                    if (cs_p->layout_info.layout ==
+                        fileio::parallel::FileLayout::SHARDED) {
+                        auto shards = cs_p->writer->output_paths();
+                        if (co_await fileio::parallel::merge_shards(
+                                cs_p->output_path, shards) != 0) {
+                            throw std::runtime_error("merge_shards failed");
+                        }
+                    }
+
+                    {
+                        std::lock_guard<std::mutex> lk(*sync_mtx);
+                        chunks_info_ptr->push_back(fileio::ChunkInfo{
+                            .path = cs_p->output_path,
+                            .bytes_written = cs_p->bytes_uncompressed,
+                            .events_written = cs_p->events_written,
+                            .chunk_index = captured_chunk_idx,
+                        });
+                        result_raw->output_files.push_back(cs_p->output_path);
+                        auto span = cs_p->writer->member_layout();
+                        if (!span.empty()) {
+                            ChunkMemberLayout layout;
+                            layout.path = cs_p->output_path;
+                            layout.members.assign(span.begin(), span.end());
+                            result_raw->chunk_layouts.push_back(
+                                std::move(layout));
+                        }
+                    }
+
+                    if (cs_p->inline_index_enabled && cs_p->inline_index.sink &&
+                        cs_p->inline_index.file_id >= 0) {
+                        auto& idx = cs_p->inline_index;
+
+                        std::vector<IndexBatch*> ordered;
+                        ordered.reserve(idx.finalized_batches.size());
+                        for (auto& b : idx.finalized_batches)
+                            ordered.push_back(&b);
+                        std::sort(ordered.begin(), ordered.end(),
+                                  [](const IndexBatch* a, const IndexBatch* b) {
+                                      return a->c_offset < b->c_offset;
+                                  });
+                        std::uint64_t running = 0;
+                        for (auto* b : ordered) {
+                            b->first_line_num = running;
+                            running += b->line_count;
+                        }
+                        for (const auto& b : idx.finalized_batches) {
+                            indexer::internal::IndexerCheckpoint cp;
+                            cp.checkpoint_idx = b.dispatch_idx;
+                            cp.uc_offset = b.uc_offset;
+                            cp.uc_size = b.uc_size;
+                            cp.c_offset = b.c_offset;
+                            cp.c_size = b.c_size;
+                            cp.bits = 0;
+                            cp.num_lines = b.line_count;
+                            cp.first_line_num = b.first_line_num;
+                            cp.last_line_num =
+                                b.line_count > 0
+                                    ? b.first_line_num + b.line_count - 1
+                                    : b.first_line_num;
+                            idx.sink->insert_checkpoint(idx.file_id, cp);
+                        }
+                        idx.sink->insert_file_metadata(
+                            idx.file_id, /*checkpoint_size=*/0,
+                            cs_p->inline_lines_dispatched,
+                            cs_p->inline_uc_dispatched);
+
+                        if (idx.bloom) {
+                            idx.bloom->finalize_file_to_sink(*idx.sink,
+                                                             idx.file_id);
+                        }
+                        if (idx.manifest) {
+                            idx.manifest->finalize_file_to_sink(*idx.sink,
+                                                                idx.file_id);
+                        }
+                        if (idx.hash_table) {
+                            idx.hash_table->finalize(*idx.sink, idx.file_id);
+                        }
+                        if (idx.aggregation) {
+                            co_await idx.aggregation->on_file_complete();
+
+                            aggregators::AggGlobalConfig agg_global;
+                            agg_global.time_interval_us =
+                                static_cast<std::uint64_t>(
+                                    cfg_ptr->agg_time_interval_us);
+                            agg_global.config_hash = 0;
+                            idx.sink->insert_aggregation_put(
+                                std::string_view(
+                                    aggregators::AGG_GLOBAL_CONFIG_KEY, 2),
+                                aggregators::serialize_agg_global_config(
+                                    agg_global));
+                            idx.sink->insert_aggregation_put(
+                                aggregators::make_agg_file_key(idx.file_id),
+                                "");
+
+                            for (auto& a :
+                                 idx.aggregation->aggregation_artifacts()) {
+                                if (!a.empty()) {
+                                    cfg_ptr->artifacts_queue->enqueue(
+                                        std::move(a));
+                                }
+                            }
+                        }
+                        auto a = idx.sink->commit();
+                        if (!a.empty()) {
+                            cfg_ptr->artifacts_queue->enqueue(std::move(a));
+                        }
+                        indexed_flag->store(true, std::memory_order_release);
+                    }
+
+                    co_await sem_release->send(0);
+                });
+
+                bool rotated = false;
+                ChunkState* cs_ptr = cs.get();
+                std::size_t batch_counter = 0;
+                // Append until input ends or this chunk reaches its size limit.
+                while (batch_opt) {
+                    LineBatch* batch_ptr = batch_opt->get();
+                    const std::size_t line_count = batch_ptr->lines.size();
+                    if (resume_idx >= line_count) {
+                        // Empty or fully consumed batch: fetch the next one.
+                        resume_idx = 0;
+                        batch_opt = co_await config.input_channel->receive();
+                        continue;
+                    }
+
+                    const std::size_t worker =
+                        batch_counter++ %
+                        std::max<std::size_t>(cs_ptr->num_workers, 1);
+                    const int src_file =
+                        static_cast<int>(batch_ptr->lines[0].source_file_idx);
+                    const int ckpt =
+                        static_cast<int>(batch_ptr->lines[0].checkpoint_idx);
+
+                    std::size_t line_idx = resume_idx;
+                    while (line_idx < line_count) {
+                        auto& ww = cs_ptr->workers[worker];
+                        PendingSegment seg{
+                            src_file,
+                            ckpt,
+                            worker,
+                            ww.flush_line_counts.size(),
+                            ww.lines_in_flush,
+                            0,
+                        };
+                        bool inner_rotated = false;
+
+                        while (line_idx < line_count) {
+                            auto view = batch_ptr->line_view(line_idx);
+                            ByteView line(view.data(), view.size());
+                            append_line(*cs_ptr, worker, line);
+                            seg.count++;
+                            line_idx++;
+                            if (ww.payload.size() >= cs_ptr->flush_threshold) {
+                                if (!co_await dispatch_flush(*cs_ptr, worker)) {
+                                    throw std::runtime_error(
+                                        "Failed to dispatch flush");
+                                }
+                                break;
+                            }
+                            if (config.chunk_size_bytes > 0 &&
+                                cs_ptr->bytes_uncompressed >=
+                                    config.chunk_size_bytes) {
+                                inner_rotated = true;
+                                break;
+                            }
+                        }
+
+                        if (seg.count > 0) cs_ptr->segments.push_back(seg);
+                        if (inner_rotated) {
+                            rotated = true;
+                            break;
+                        }
+                    }
+                    // Count only the lines actually appended in this pass.
+                    result->events_written += line_idx - resume_idx;
+                    resume_idx = line_idx;
+                    if (rotated) break;
+                }
+
+                if (!co_await dispatch_flush_all(*cs_ptr)) {
+                    throw std::runtime_error(
+                        "Failed to dispatch trailing flush");
+                }
+                for (auto& ch : cs_ptr->flush_channels) ch->close();
+
+                if (!rotated) {
+                    input_eof = true;
+                    break;
+                }
+
+                current_chunk_idx++;
+                cs = std::make_shared<ChunkState>();
+                const auto next_path = make_chunk_path(
+                    group_output_dir, current_chunk_idx, config.compress);
+                if (!co_await open_chunk(
+                        *cs, next_path, config.compress,
+                        config.compression_level, config.chunk_size_bytes,
+                        baseline_workers, &group_scope, config)) {
+                    throw std::runtime_error("Failed to open next chunk");
+                }
+                open_inline_sink(*cs, next_path);
+            }
+        });
+
+        result->bytes_written = 0;
+        for (const auto& c : chunks_info) {
+            result->bytes_written += c.bytes_written;
+        }
+        result->chunks_created = chunks_info.size();
+
+        if (config.source_files) {
+            ExtractionPlan plan;
+            plan.source_files = *config.source_files;
+            co_await provenance->flush_to_db(plan, config.group_name,
+                                             config.group_query, chunks_info,
+                                             config.output_dir);
+        }
+
+        if (inline_indexed_flag->load(std::memory_order_acquire)) {
+            result->indexed_inline = true;
+        }
+
+        result->success = true;
+
+    } catch (const std::exception& e) {
+        result->error_message = e.what();
+        DFTRACER_UTILS_LOG_ERROR("GroupWriter failed for %s: %s",
+                                 config.group_name.c_str(), e.what());
+    }
+
+    co_return std::move(*result);
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::reorganize
diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp
new file mode 100644
index 00000000..2cd8a036
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.cpp
@@ -0,0 +1,176 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/utilities/common/query/ast.h>
+#include <dftracer/utils/utilities/common/query/query.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/manifest_extractor.h>
+#include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+
+#include <unordered_map>
+#include <variant>
+
+namespace dftracer::utils::utilities::composites::dft::reorganize {
+
+namespace {
+// Lookup from an input line number to the index of the group it is sent to.
+struct LineGroupMapping {
+    std::unordered_map<std::size_t, std::size_t> line_to_group;
+};
+
+// True iff `root` is one EQ comparison of `cat`/`name` with the given string.
+bool query_matches_cat_name(const common::query::QueryNode& root,
+                            const std::string& cat, const std::string& name) {
+    const auto* cmp = std::get_if<common::query::CompareNode>(&root.data);
+    if (cmp == nullptr || cmp->op != common::query::CompareOp::EQ) {
+        return false;
+    }
+    // Pick the event attribute the comparison targets, if it is one we know.
+    const std::string* expected = nullptr;
+    if (cmp->field.path == "cat") {
+        expected = &cat;
+    } else if (cmp->field.path == "name") {
+        expected = &name;
+    }
+    if (expected == nullptr) return false;
+    const auto* literal = std::get_if<std::string>(&cmp->value.value);
+    return literal != nullptr && *literal == *expected;
+}
+
+// Builds the line-number -> group-index lookup from the manifest's event
+// ranges. For each range the first group wins whose query is absent
+// (catch-all) or matches the range's cat/name; ranges that no group accepts
+// contribute nothing.
+LineGroupMapping build_line_group_mapping(
+    const std::vector<indexer::EventRangeResult>& event_ranges,
+    const std::vector<PredicateGroup>& groups,
+    const std::vector<std::optional<common::query::Query>>& parsed_queries) {
+    // Index of the first accepting group, or SIZE_MAX when there is none.
+    const auto pick_group = [&](const indexer::EventRangeResult& range) {
+        for (std::size_t idx = 0; idx < groups.size(); ++idx) {
+            const auto& candidate = parsed_queries[idx];
+            if (!candidate || query_matches_cat_name(candidate->root(),
+                                                     range.cat, range.name)) {
+                return idx;
+            }
+        }
+        return static_cast<std::size_t>(SIZE_MAX);
+    };
+
+    LineGroupMapping mapping;
+    for (const auto& range : event_ranges) {
+        const std::size_t chosen = pick_group(range);
+        if (chosen == SIZE_MAX) {
+            continue;
+        }
+        // A line listed by several ranges keeps the group of the last one.
+        for (std::uint32_t line_num : range.line_numbers) {
+            mapping.line_to_group[static_cast<std::size_t>(line_num)] = chosen;
+        }
+    }
+    return mapping;
+}
+
+}  // namespace
+
+// Streams `config.file_path` line by line and forwards every line that the
+// index manifest maps to a group into that group's channel, in batches of
+// `config.batch_size`. Lines without a mapping are counted as unmatched.
+// Errors are reported via `error_message`; `success` is set only at the end.
+coro::CoroTask<ManifestExtractorResult> extract_from_manifest(
+    ManifestExtractorConfig config) {
+    ManifestExtractorResult result;
+
+    try {
+        indexer::IndexDatabase db(
+            config.index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+
+        int file_id = db.get_file_info_id(config.file_path);
+        if (file_id < 0) {
+            result.error_message =
+                "File not found in index: " + config.file_path;
+            co_return result;
+        }
+
+        if (!db.has_manifest_data(file_id)) {
+            result.error_message =
+                "No manifest data for file: " + config.file_path;
+            co_return result;
+        }
+
+        auto event_ranges = db.query_event_ranges(file_id);
+
+        std::vector<std::optional<common::query::Query>> parsed_queries;
+        parsed_queries.reserve(config.groups.size());
+        for (const auto& group : config.groups) {
+            auto& slot = parsed_queries.emplace_back();  // nullopt: catch-all
+            if (group.query.empty()) continue;
+            auto q = common::query::Query::from_string(group.query);
+            if (q) {
+                slot.emplace(std::move(*q));
+            } else {
+                DFTRACER_UTILS_LOG_ERROR(
+                    "Unparsable group query '%s'; matching all events",
+                    std::string(group.query).c_str());
+            }
+        }
+
+        auto mapping = build_line_group_mapping(event_ranges, config.groups,
+                                                parsed_queries);
+
+        std::vector<LineBatch> pending_batches(config.groups.size());
+        for (auto& batch : pending_batches) {
+            batch.reserve(config.batch_size);
+        }
+        // NOTE(review): assumes manifest line numbers are 0-based - confirm.
+        using fileio::lines::sources::async_streaming_gz_lines;
+        std::size_t line_num = 0;
+        auto gen = async_streaming_gz_lines(config.file_path);
+
+        while (auto line_opt = co_await gen.next()) {
+            const auto& line = *line_opt;
+
+            auto it = mapping.line_to_group.find(line_num);
+            if (it != mapping.line_to_group.end()) {
+                std::size_t group_idx = it->second;
+                auto& batch = pending_batches[group_idx];
+
+                batch.append_line(line.content, config.source_file_idx,
+                                  /*checkpoint_idx=*/0, line_num);
+
+                result.events_extracted++;
+
+                if (batch.size() >= config.batch_size) {
+                    auto& channel = config.group_channels[group_idx];
+                    if (channel) {
+                        co_await channel->send(
+                            std::make_shared<LineBatch>(std::move(batch)));
+                    }
+                    batch.clear();
+                    batch.reserve(config.batch_size);
+                }
+            } else {
+                result.events_unmatched++;
+            }
+
+            line_num++;
+        }
+
+        for (std::size_t i = 0; i < pending_batches.size(); ++i) {
+            auto& batch = pending_batches[i];
+            if (batch.empty() || !config.group_channels[i]) continue;
+            co_await config.group_channels[i]->send(
+                std::make_shared<LineBatch>(std::move(batch)));
+        }
+
+        result.success = true;
+
+    } catch (const std::exception& e) {
+        result.error_message = e.what();
+        DFTRACER_UTILS_LOG_ERROR("ManifestExtractor failed for %s: %s",
+                                 config.file_path.c_str(), e.what());
+    }
+
+    co_return result;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::reorganize
diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp
new file mode 100644
index 00000000..fe315112
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.cpp
@@ -0,0 +1,145 @@
+#include <dftracer/utils/utilities/composites/dft/reorganize/organize_visitor.h>
+
+namespace dftracer::utils::utilities::composites::dft::reorganize {
+
+// Parses each group's query once and sizes the per-group batch buffers.
+OrganizeVisitor::OrganizeVisitor(OrganizeVisitorConfig config)
+    : config_(std::move(config)) {
+    const std::size_t group_count = config_.groups.size();
+    // nullopt marks a group whose query is empty or failed to parse.
+    parsed_queries_.reserve(group_count);
+    for (const auto& group : config_.groups) {
+        if (!group.query.empty()) {
+            if (auto parsed = common::query::Query::from_string(group.query)) {
+                parsed_queries_.push_back(std::move(*parsed));
+                continue;
+            }
+        }
+        parsed_queries_.push_back(std::nullopt);
+    }
+
+    pending_batches_.resize(group_count);
+    for (auto& pending : pending_batches_) {
+        pending.reserve(config_.batch_size);
+    }
+    drain_queue_.resize(group_count);
+}
+
+// Clears all buffered state: pending batches, queued slice batches, and the
+// routed/unmatched counters. The checkpoint count argument is ignored.
+void OrganizeVisitor::begin(std::size_t /*num_checkpoints*/) {
+    events_routed_ = 0;
+    events_unmatched_ = 0;
+
+    // Only the contents are cleared; the outer vectors keep their size.
+    for (auto& pending : pending_batches_) pending.clear();
+    for (auto& queued : drain_queue_) queued.clear();
+}
+// Stores the checkpoint index passed by the caller in current_checkpoint_.
+void OrganizeVisitor::on_checkpoint(std::size_t checkpoint_idx) {
+    current_checkpoint_ = checkpoint_idx;
+}
+
+// Returns the index of the first group that accepts `json`, or SIZE_MAX.
+// A group without a parsed query accepts every event.
+std::size_t OrganizeVisitor::evaluate_event(
+    const DFTracerEvent& /*ev*/, const common::json::JsonValue& json) {
+    std::size_t group_idx = 0;
+    for (const auto& candidate : parsed_queries_) {
+        const bool accepts = !candidate || candidate->evaluate(json);
+        if (accepts) return group_idx;
+        ++group_idx;
+    }
+
+    return SIZE_MAX;
+}
+
+// Routes one event line into its group's pending batch. Metadata events are
+// skipped without being counted; events no group accepts count as unmatched.
+void OrganizeVisitor::on_event(const EventRecord& record) {
+    if (record.ev.is_metadata()) return;
+
+    const std::size_t target = evaluate_event(record.ev, record.json);
+    if (target == SIZE_MAX) {
+        ++events_unmatched_;
+        return;
+    }
+
+    pending_batches_[target].append_line(record.line, config_.source_file_idx,
+                                         record.checkpoint_idx,
+                                         record.line_number);
+
+    ++events_routed_;
+}
+// True if any group has queued slice batches or a pending batch at batch_size.
+bool OrganizeVisitor::wants_drain() const noexcept {
+    for (std::size_t g = 0; g != pending_batches_.size(); ++g) {
+        const bool full = pending_batches_[g].size() >= config_.batch_size;
+        if (full || !drain_queue_[g].empty()) return true;
+    }
+    return false;
+}
+
+// Sends out what is ready for each group: all queued slice batches first,
+// then the pending batch once its size() has reached batch_size. When a
+// group has no channel its batches are dropped instead of sent.
+coro::CoroTask<void> OrganizeVisitor::drain_pending() {
+    for (std::size_t g = 0; g < pending_batches_.size(); ++g) {
+        auto& out = config_.group_channels[g];
+        auto& queued = drain_queue_[g];
+        for (auto& slice_batch : queued) {
+            if (out) co_await out->send(std::move(slice_batch));
+        }
+        queued.clear();
+        auto& pending = pending_batches_[g];
+        if (pending.size() < config_.batch_size) continue;
+        if (out) {
+            co_await out->send(
+                std::make_shared<LineBatch>(std::move(pending)));
+        }
+        pending.clear();
+        pending.reserve(config_.batch_size);
+    }
+}
+
+// Final flush: sends every queued slice batch and non-empty pending batch.
+coro::CoroTask<void> OrganizeVisitor::on_file_complete() {
+    for (std::size_t g = 0; g < pending_batches_.size(); ++g) {
+        auto& out = config_.group_channels[g];
+        auto& queued = drain_queue_[g];
+        for (auto& slice_batch : queued) {
+            if (out) co_await out->send(std::move(slice_batch));
+        }
+        queued.clear();
+        auto& pending = pending_batches_[g];
+        if (pending.empty()) continue;
+        if (out) {
+            co_await out->send(
+                std::make_shared<LineBatch>(std::move(pending)));
+        }
+        pending.clear();
+        pending.reserve(config_.batch_size);
+    }
+}
+// Builds a separate OrganizeVisitor from a copy of this visitor's config.
+std::unique_ptr<DftEventVisitor> OrganizeVisitor::create_parallel_slice()
+    const {
+    return std::make_unique<OrganizeVisitor>(config_);
+}
+// Queues a slice's non-empty pending batches for draining; adds its counters.
+void OrganizeVisitor::merge_parallel_slice(DftEventVisitor& slice_base) {
+    auto* peer = dynamic_cast<OrganizeVisitor*>(&slice_base);
+    if (peer == nullptr) return;
+    for (std::size_t g = 0;
+         g < drain_queue_.size() && g < peer->pending_batches_.size(); ++g) {
+        auto& buf = peer->pending_batches_[g];
+        if (buf.empty()) continue;
+        drain_queue_[g].push_back(std::make_shared<LineBatch>(std::move(buf)));
+        buf.clear();
+        buf.reserve(config_.batch_size);
+    }
+    events_routed_ += peer->events_routed_;
+    events_unmatched_ += peer->events_unmatched_;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::reorganize
diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp
index 31a97cce..e50257b6 100644
--- a/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.cpp
@@ -1,10 +1,12 @@
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
-#include <dftracer/utils/core/rocksdb/async.h>
 #include <dftracer/utils/utilities/composites/dft/reorganize/provenance_tracker.h>
 #include <dftracer/utils/utilities/indexer/internal/transaction_scope.h>
 #include <dftracer/utils/utilities/indexer/provenance_database.h>
 
+#include <cstdint>
+#include <unordered_map>
+
 namespace dftracer::utils::utilities::composites::dft::reorganize {
 
 void ProvenanceTracker::record(int source_file_idx, int checkpoint_idx,
@@ -23,51 +25,39 @@ coro::CoroTask<void> ProvenanceTracker::flush_to_db(
     using indexer::ProvenanceDatabase;
 
     for (const auto& chunk : chunks) {
-        auto provenance_path = std::make_shared<std::string>(
-            indexer::determine_provenance_index_path(chunk.path));
-        const auto* plan_ptr = &plan;
-        const auto* group_name_ptr = &group_name;
-        const auto* group_query_ptr = &group_query;
-        const auto* chunk_ptr = &chunk;
-        const auto* records_ptr = &records_;
-
         try {
-            co_await rocksdb::run([plan_ptr, group_name_ptr, group_query_ptr,
-                                   chunk_ptr, records_ptr, provenance_path] {
-                ProvenanceDatabase pdb(*provenance_path);
-                pdb.init_schema();
-
-                std::uint64_t out_hash = 0;
-                if (fs::exists(chunk_ptr->path)) {
-                    out_hash = static_cast<std::uint64_t>(
-                        fs::file_size(chunk_ptr->path));
-                }
-                int fid =
-                    pdb.get_or_create_file_info(chunk_ptr->path, out_hash);
-
-                indexer::internal::TransactionScope txn(pdb);
-                pdb.insert_info(fid, "version", "2.0");
-                pdb.insert_info(fid, "tool", "dftracer_organize");
-                pdb.insert_group(fid, *group_name_ptr, *group_query_ptr);
+            std::string provenance_path =
+                indexer::determine_provenance_index_path(chunk.path);
+            ProvenanceDatabase pdb(provenance_path);
+            pdb.init_schema();
 
-                for (std::size_t si = 0; si < plan_ptr->source_files.size();
-                     ++si) {
-                    const auto& src = plan_ptr->source_files[si];
-                    pdb.insert_source(fid, static_cast<int>(si), src.file_path,
-                                      static_cast<int>(src.num_checkpoints));
-                }
+            std::uint64_t out_hash = 0;
+            if (fs::exists(chunk.path)) {
+                out_hash =
+                    static_cast<std::uint64_t>(fs::file_size(chunk.path));
+            }
+            int fid = pdb.get_or_create_file_info(chunk.path, out_hash);
+            pdb.insert_info(fid, "version", "2.0");
+            pdb.insert_info(fid, "tool", "dftracer_organize");
+            pdb.insert_group(fid, group_name, group_query);
 
-                for (const auto& rec : *records_ptr) {
-                    if (rec.output_chunk_idx != chunk_ptr->chunk_index)
-                        continue;
-                    pdb.insert_segment(fid, rec.source_file_idx,
-                                       rec.checkpoint_idx,
-                                       rec.output_line_start,
-                                       rec.output_line_end, rec.event_count);
-                }
+            for (std::size_t si = 0; si < plan.source_files.size(); ++si) {
+                const auto& src = plan.source_files[si];
+                pdb.insert_source(fid, static_cast<int>(si), src.file_path,
+                                  static_cast<int>(src.num_checkpoints));
+            }
 
-                txn.commit();
-            });
+            std::unordered_map<std::uint64_t, int> seq_counter;
+            for (const auto& rec : records_) {
+                if (rec.output_chunk_idx != chunk.chunk_index) continue;
+                std::uint64_t k =
+                    (static_cast<std::uint64_t>(rec.source_file_idx) << 32) |
+                    static_cast<std::uint32_t>(rec.checkpoint_idx);
+                int seq = seq_counter[k]++;
+                pdb.insert_segment(fid, rec.source_file_idx, rec.checkpoint_idx,
+                                   seq, rec.output_line_start,
+                                   rec.output_line_end, rec.event_count);
+            }
         } catch (const std::exception& e) {
             DFTRACER_UTILS_LOG_ERROR("Provenance write failed for %s: %s",
                                      chunk.path.c_str(), e.what());
diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp
new file mode 100644
index 00000000..42c83158
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.cpp
@@ -0,0 +1,410 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/platform_compat.h>
+#include <dftracer/utils/core/coro/channel.h>
+#include <dftracer/utils/core/coro/spawn_future.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/reconstruction_planner.h>
+#include <dftracer/utils/utilities/composites/dft/reorganize/reconstructor_utility.h>
+#include <dftracer/utils/utilities/composites/indexed_file_reader_utility.h>
+#include <dftracer/utils/utilities/fileio/chunk_writer.h>
+#include <dftracer/utils/utilities/filesystem/pattern_directory_scanner_utility.h>
+#include <dftracer/utils/utilities/indexer/index_visitor.h>
+#include <dftracer/utils/utilities/reader/internal/stream_config.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstring>
+#include <mutex>
+#include <span>
+#include <unordered_map>
+
+namespace dftracer::utils::utilities::composites::dft::reorganize {
+
+using fileio::ChunkWriter;
+using fileio::ChunkWriterConfig;
+using indexer::SharedLineBuffer;
+
+ReconstructorInput& ReconstructorInput::with_input_dir(std::string dir) {
+    input_dir = std::move(dir);  // directory process() scans for .pfw/.pfw.gz
+    return *this;                // same object is returned so calls can chain
+}
+
+ReconstructorInput& ReconstructorInput::with_output_dir(std::string dir) {
+    output_dir = std::move(dir);  // created by process(), given to the writers
+    return *this;                 // same object is returned so calls can chain
+}
+
+ReconstructorInput& ReconstructorInput::with_checkpoint_size(std::size_t sz) {
+    checkpoint_size = sz;  // forwarded to run_reader's metadata/reader inputs
+    return *this;          // same object is returned so calls can chain
+}
+
+ReconstructorInput& ReconstructorInput::with_parallelism(std::size_t n) {
+    parallelism = n;  // 0 makes process() fall back to hardware concurrency
+    return *this;     // same object is returned so calls can chain
+}
+
+ReconstructorInput& ReconstructorInput::with_compress(bool c) {
+    compress = c;   // ends up in ChunkWriterConfig::with_compression()
+    return *this;   // same object is returned so calls can chain
+}
+
+namespace {
+
+struct SegmentInterval {
+    int line_start;            // first line covered; inclusive in find_segment
+    int line_end;              // last line covered; inclusive in find_segment
+    std::size_t original_idx;  // Index into original files vector
+};
+
+// Returns the interval containing `line_number` (bounds inclusive), or
+// nullptr. `intervals` must be sorted by line_start.
+const SegmentInterval* find_segment(
+    const std::vector<SegmentInterval>& intervals, int line_number) {
+    const auto after = std::upper_bound(
+        intervals.begin(), intervals.end(), line_number,
+        [](int ln, const SegmentInterval& s) { return ln < s.line_start; });
+    if (after == intervals.begin()) return nullptr;  // every start is beyond
+    const auto& candidate = *(after - 1);
+    const bool inside = line_number >= candidate.line_start &&
+                        line_number <= candidate.line_end;
+    return inside ? &candidate : nullptr;
+}
+
+// Basename of `original_path`, minus one trailing ".gz" when present.
+std::string output_filename(const std::string& original_path) {
+    auto name = fs::path(original_path).filename().string();
+    if (name.size() > 3 && name.compare(name.size() - 3, 3, ".gz") == 0)
+        name.erase(name.size() - 3);
+    return name;
+}
+
+struct ReconstructLineRecord {
+    SharedLineBuffer buffer;  // shared chunk storage that `line` points into
+    std::string_view line;    // one line of `buffer`, without its '\n'
+};
+
+struct ReconstructLineBatch {  // payload type of the reader->writer channels
+    std::vector<ReconstructLineRecord> lines;  // records in push_back order
+
+    void reserve(std::size_t n) { lines.reserve(n); }  // forwards to `lines`
+    std::size_t size() const { return lines.size(); }  // number of records
+    bool empty() const { return lines.empty(); }       // true if no records
+    void clear() { lines.clear(); }                    // drops all records
+};
+
+struct WriterContext {  // passed by value to each run_writer coroutine
+    std::string output_dir;                  // given to ChunkWriterConfig
+    bool compress;                           // given to with_compression()
+    std::atomic<std::size_t>* total_events;  // non-owning; summed by writers
+    std::atomic<std::size_t>* total_bytes;   // non-owning; summed by writers
+    std::vector<ReconstructedFileInfo>* file_results;  // non-owning; see mutex
+    std::mutex* results_mutex;               // non-owning; guards file_results
+};
+
+struct ReaderContext {  // passed by value to each run_reader coroutine
+    std::size_t checkpoint_size;  // used for the metadata and reader inputs
+    const std::vector<std::string>* original_paths;  // non-owning; unread here
+    const std::vector<std::shared_ptr<coro::Channel<ReconstructLineBatch>>>*
+        channels;  // non-owning; indexed by SegmentInterval::original_idx
+};
+
+// Named writer coroutine (CP.51): one per original file. Drains `channel`
+// into a ChunkWriter, then publishes the per-file totals through `wctx`.
+static coro::CoroTask<void> run_writer(
+    std::shared_ptr<coro::Channel<ReconstructLineBatch>> channel,
+    std::string orig_path, WriterContext wctx) {
+    // Base name: source basename with ".gz" and then ".pfw" stripped.
+    std::string stem = output_filename(orig_path);
+    if (stem.size() > 4 && stem.compare(stem.size() - 4, 4, ".pfw") == 0) {
+        stem.erase(stem.size() - 4);
+    }
+
+    auto writer_cfg =
+        ChunkWriterConfig()
+            .with_output_dir(wctx.output_dir)
+            .with_base_name(stem)
+            .with_chunk_size(std::numeric_limits<std::size_t>::max())
+            .with_compression(wctx.compress);
+    ChunkWriter writer(writer_cfg);
+    co_await writer.open();
+
+    // Stop at the first empty receive() result (expected once closed).
+    for (;;) {
+        auto received = co_await channel->receive();
+        if (!received) break;
+        for (const auto& rec : (*received).lines) {
+            co_await writer.write_line(
+                ByteView(rec.line.data(), rec.line.size()));
+        }
+    }
+    co_await writer.close();
+
+    ReconstructedFileInfo summary;
+    summary.events_written = writer.total_events_written();
+    summary.bytes_written = writer.total_bytes_written();
+    if (const auto& chunks = writer.chunks(); !chunks.empty()) {
+        summary.output_path = chunks.front().path;
+    }
+    summary.original_path = std::move(orig_path);
+    wctx.total_events->fetch_add(summary.events_written);
+    wctx.total_bytes->fetch_add(summary.bytes_written);
+    {
+        std::lock_guard<std::mutex> guard(*wctx.results_mutex);
+        wctx.file_results->push_back(std::move(summary));
+    }
+}
+
+// Named reader/producer coroutine (CP.51) for one reorganized file: streams
+// it chunk by chunk, numbers its event lines, and forwards each line inside a
+// planned interval to the channel of the original file that interval maps to.
+static coro::CoroTask<void> run_reader(
+    CoroScope& scope, std::string reorg_file,
+    const std::vector<SegmentInterval>* intervals, ReaderContext rctx) {
+    (void)scope;  // parameter is not used by this coroutine
+    std::string index_path = internal::determine_index_path(reorg_file, "");
+
+    // Metadata first: its uncompressed size is the end of the byte range.
+    MetadataCollectorUtility meta_collector;
+    auto meta_input = MetadataCollectorUtilityInput::from_file(reorg_file)
+                          .with_index(index_path)
+                          .with_checkpoint_size(rctx.checkpoint_size);
+    auto meta = co_await meta_collector.process(meta_input);
+
+    auto reader_input = IndexedReadInput::from_file(reorg_file)
+                            .with_index(index_path)
+                            .with_checkpoint_size(rctx.checkpoint_size);
+    IndexedFileReaderUtility reader_utility;
+    auto reader = co_await reader_utility.process(reader_input);
+
+    auto stream = reader->stream(
+        reader::internal::StreamConfig()
+            .stream_type(reader::internal::StreamType::MULTI_LINES_BYTES)
+            .range_type(reader::internal::RangeType::BYTE_RANGE)
+            .buffer_size(4 * 1024 * 1024)
+            .from(0)
+            .to(meta.uncompressed_size));
+
+    // Lines are accumulated per destination file and sent BATCH_SIZE at a time.
+    constexpr std::size_t BATCH_SIZE = 1024;
+    std::unordered_map<std::size_t, ReconstructLineBatch> pending_batches;
+    int event_number = 0;  // running count of lines that start with '{'
+
+    while (!stream->done()) {
+        std::span<const char> chunk = co_await stream->read_async();
+        if (chunk.empty()) break;
+
+        // Copy the chunk into shared storage: every queued record keeps a
+        // reference to it plus a string_view into it.
+        auto owner = std::make_shared<std::string>(
+            chunk.data(), static_cast<std::string::size_type>(chunk.size()));
+        const char* cursor = owner->data();
+        const char* const stop = cursor + owner->size();
+
+        while (cursor < stop) {
+            const auto* eol = static_cast<const char*>(std::memchr(
+                cursor, '\n', static_cast<std::size_t>(stop - cursor)));
+            // NOTE(review): assumes every chunk ends with '\n' — confirm.
+            if (eol == nullptr) break;  // otherwise the tail is dropped
+
+            const std::string_view text(
+                cursor, static_cast<std::size_t>(eol - cursor));
+            cursor = eol + 1;
+            if (text.empty() || text.front() != '{') continue;
+
+            // Every '{' line consumes an event number, matched or not.
+            const int this_event = event_number++;
+            const auto* seg = find_segment(*intervals, this_event);
+            if (seg == nullptr) continue;
+
+            auto& batch = pending_batches[seg->original_idx];
+            ReconstructLineRecord record;
+            record.buffer = owner;
+            record.line = text;
+            batch.lines.push_back(std::move(record));
+            if (batch.size() < BATCH_SIZE) continue;
+
+            auto& channel = (*rctx.channels)[seg->original_idx];
+            co_await channel->send(std::move(batch));
+            batch.clear();  // reset the moved-from batch before reuse
+        }
+    }
+
+    // Flush the partially filled batches left over at end of stream.
+    for (auto& [idx, batch] : pending_batches) {
+        if (batch.empty()) continue;
+        auto& channel = (*rctx.channels)[idx];
+        co_await channel->send(std::move(batch));
+    }
+}
+
+}  // namespace
+
+// Rebuilds the original trace files from the reorganized files under
+// `input.input_dir`: reader coroutines route event lines into per-file
+// channels drained by writer coroutines. Errors go to `error_message`.
+coro::CoroTask<ReconstructorResult> ReconstructorUtility::process(
+    const ReconstructorInput& input) {
+    ReconstructorResult result;
+    if (!has_context()) {
+        result.error_message = "No context bound";
+        co_return result;
+    }
+    CoroScope& ctx = context();
+
+    std::vector<std::string> reorg_files;
+    if (fs::exists(input.input_dir)) {
+        filesystem::PatternDirectoryScannerUtility scanner;
+        filesystem::PatternDirectoryScannerUtilityInput scan_input{
+            input.input_dir, {".pfw", ".pfw.gz"}, true};
+        auto matched = co_await scanner.process(scan_input);
+        for (const auto& entry : matched) {
+            reorg_files.push_back(entry.path.string());
+        }
+    }
+    if (reorg_files.empty()) {
+        result.error_message = "No reorganized files found";
+        co_return result;
+    }
+
+    ReconstructionPlannerUtility planner;
+    ReconstructionPlannerInput planner_input;
+    planner_input.reorganized_files = reorg_files;
+    planner_input.index_dir = "";
+
+    ReconstructionPlan plan;
+    try {
+        plan = co_await planner.process(planner_input);
+    } catch (const std::exception& e) {
+        result.error_message = std::string("Planning failed: ") + e.what();
+        co_return result;
+    }
+
+    if (plan.files.empty()) {
+        result.success = true;
+        co_return result;
+    }
+
+    result.total_segments = plan.total_segments;
+    fs::create_directories(input.output_dir);
+
+    // Build original paths vector and index map
+    std::vector<std::string> original_paths;
+    std::unordered_map<std::string, std::size_t> path_to_idx;
+    for (const auto& [orig_path, recon] : plan.files) {
+        path_to_idx[orig_path] = original_paths.size();
+        original_paths.push_back(orig_path);
+    }
+
+    // Build segment intervals using indices instead of strings
+    std::unordered_map<std::string, std::vector<SegmentInterval>>
+        per_reorg_segments;
+    for (const auto& [orig_path, recon] : plan.files) {
+        std::size_t orig_idx = path_to_idx[orig_path];
+        for (const auto& [ckpt, segs] : recon.checkpoint_segments) {
+            for (const auto& seg : segs) {
+                SegmentInterval si;
+                si.line_start = seg.output_line_start;
+                si.line_end = seg.output_line_end;
+                si.original_idx = orig_idx;
+                per_reorg_segments[seg.reorg_file].push_back(si);
+            }
+        }
+    }
+
+    // find_segment() binary-searches, so sort every list by line_start.
+    for (auto& [file, segs] : per_reorg_segments) {
+        std::sort(segs.begin(), segs.end(),
+                  [](const SegmentInterval& a, const SegmentInterval& b) {
+                      return a.line_start < b.line_start;
+                  });
+    }
+
+    // Create channels indexed by original file index
+    std::vector<std::shared_ptr<coro::Channel<ReconstructLineBatch>>> channels;
+    channels.reserve(original_paths.size());
+    for (std::size_t i = 0; i < original_paths.size(); ++i) {
+        channels.push_back(
+            std::make_shared<coro::Channel<ReconstructLineBatch>>(16));
+    }
+
+    std::atomic<std::size_t> total_events{0};
+    std::atomic<std::size_t> total_bytes{0};
+    std::vector<ReconstructedFileInfo> file_results;
+    std::mutex results_mutex;
+
+    // Writers get raw pointers to the locals above: join before returning.
+    WriterContext wctx;
+    wctx.output_dir = input.output_dir;
+    wctx.compress = input.compress;
+    wctx.total_events = &total_events;
+    wctx.total_bytes = &total_bytes;
+    wctx.file_results = &file_results;
+    wctx.results_mutex = &results_mutex;
+
+    // Spawn writers (consumers)
+    for (std::size_t i = 0; i < original_paths.size(); ++i) {
+        ctx.spawn([channel = channels[i], orig_path = original_paths[i],
+                   wctx](CoroScope&) -> coro::CoroTask<void> {
+            co_await run_writer(channel, orig_path, wctx);
+        });
+    }
+
+    // Never run with zero permits: every reader would wait forever.
+    const std::size_t parallelism = std::max<std::size_t>(
+        1, input.parallelism > 0 ? input.parallelism
+                                 : dftracer_utils_hardware_concurrency());
+
+    ReaderContext rctx;
+    rctx.checkpoint_size = input.checkpoint_size;
+    rctx.original_paths = &original_paths;
+    rctx.channels = &channels;
+
+    // Run readers; record a failure so the shutdown below always happens.
+    try {
+        co_await ctx.scope([per_reorg_ptr = &per_reorg_segments,
+                            rctx_ptr = &rctx, parallelism](
+                               CoroScope& producer_scope)
+                               -> coro::CoroTask<void> {
+            auto permits = coro::make_channel<bool>(parallelism * 2);
+            for (std::size_t i = 0; i < parallelism * 2; ++i) {
+                permits->try_send(true);
+            }
+            for (auto& [reorg_file, intervals] : *per_reorg_ptr) {
+                producer_scope.spawn(
+                    [reorg_file_copy = reorg_file, intervals_ptr = &intervals,
+                     rctx_ptr, permits](CoroScope& s) -> coro::CoroTask<void> {
+                        co_await s.receive(permits);
+                        try {
+                            co_await run_reader(s, reorg_file_copy,
+                                                intervals_ptr, *rctx_ptr);
+                            permits->try_send(true);
+                        } catch (...) {
+                            permits->try_send(true);
+                            throw;
+                        }
+                    });
+            }
+            co_return;
+        });
+    } catch (const std::exception& e) {
+        result.error_message = std::string("Reader failed: ") + e.what();
+    } catch (...) {
+        result.error_message = "Reader failed: unknown error";
+    }
+
+    // Producers are done (or failed): close channels, then wait for writers.
+    for (auto& channel : channels) channel->close();
+    co_await ctx.join_all();
+    if (!result.error_message.empty()) co_return result;
+
+    result.files = std::move(file_results);
+    result.total_events = total_events.load();
+    result.total_bytes = total_bytes.load();
+    result.success = true;
+    co_return result;
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::reorganize
diff --git a/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp b/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp
index b64904be..30b7431d 100644
--- a/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.cpp
@@ -1,8 +1,11 @@
 #include <dftracer/utils/core/common/constants.h>
 #include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/when_all.h>
 #include <dftracer/utils/core/utils/string.h>
 #include <dftracer/utils/utilities/common/json/json_value.h>
 #include <dftracer/utils/utilities/common/query/query.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/index_resolver_utility.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/composites/dft/metadata_collector_utility.h>
 #include <dftracer/utils/utilities/composites/dft/reorganize/reorganization_planner.h>
@@ -10,7 +13,7 @@
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <algorithm>
 #include <cctype>
@@ -23,10 +26,12 @@ namespace dftracer::utils::utilities::composites::dft::reorganize {
 namespace {
 
 using common::query::Query;
-using dftracer::utils::utilities::indexer::IndexBuildConfig;
-using dftracer::utils::utilities::indexer::IndexBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBatchBuilderUtility;
+using dftracer::utils::utilities::indexer::IndexBuildBatchConfig;
 using dftracer::utils::utilities::indexer::IndexDatabase;
 using fileio::lines::sources::async_streaming_gz_lines;
+using indexing::IndexResolverUtility;
+using indexing::ResolverInput;
 
 }  // namespace
 
@@ -49,6 +54,12 @@ std::vector<PredicateGroup> parse_group_specs(
 
 coro::CoroTask<ExtractionPlan> ReorganizationPlannerUtility::process(
     const ReorganizationPlannerInput& input) {
+    if (!has_context()) {
+        throw std::runtime_error(
+            "ReorganizationPlannerUtility requires CoroScope context");
+    }
+    CoroScope& scope = context();
+
     ExtractionPlan plan;
     plan.groups = input.groups;
 
@@ -67,8 +78,6 @@ coro::CoroTask<ExtractionPlan> ReorganizationPlannerUtility::process(
         }
     }
 
-    // Ensure "remainder" group exists if not already
-    // specified
     bool has_remainder = false;
     for (const auto& g : input.groups) {
         if (g.query.empty()) {
@@ -90,41 +99,94 @@ coro::CoroTask<ExtractionPlan> ReorganizationPlannerUtility::process(
         }
     }
 
-    // Process each source file
-    for (std::size_t fi = 0; fi < input.source_files.size(); ++fi) {
-        const auto& file_path = input.source_files[fi];
+    if (input.source_files.empty()) {
+        co_return plan;
+    }
 
-        // Build the shared `.dftindex` store if needed.
-        IndexBuilderUtility idx_builder;
-        auto idx_input = IndexBuildConfig::for_file(file_path).with_index_dir(
-            input.index_dir);
+    // Use IndexResolverUtility to scan files and check capabilities
+    IndexResolverUtility resolver;
+    ResolverInput resolver_input;
+    resolver_input.files = input.source_files;
+    resolver_input.index_dir = input.index_dir;
+    resolver_input.require_checkpoints = true;
+    resolver_input.require_manifest = true;
+    auto scan_result = co_await scope.spawn(resolver, resolver_input);
+
+    DFTRACER_UTILS_LOG_INFO(
+        "ReorganizationPlanner: %zu files, %zu cached, %zu need checkpoint, "
+        "%zu need manifest",
+        scan_result.all_files.size(), scan_result.cached.size(),
+        scan_result.needs_checkpoint.size(), scan_result.needs_manifest.size());
+
+    // Build indices in parallel for files needing work
+    std::vector<std::string> files_needing_index;
+    for (const auto& item : scan_result.needs_checkpoint) {
+        files_needing_index.push_back(item.file_path);
+    }
+    for (const auto& item : scan_result.needs_manifest) {
+        files_needing_index.push_back(item.file_path);
+    }
+
+    if (!files_needing_index.empty()) {
+        auto batch_config = std::make_shared<IndexBuildBatchConfig>();
+        batch_config->file_paths = std::move(files_needing_index);
+        batch_config->index_dir = input.index_dir;
         if (input.checkpoint_size > 0) {
-            idx_input.with_checkpoint_size(input.checkpoint_size);
+            batch_config->checkpoint_size = input.checkpoint_size;
         }
-        auto idx_result = co_await idx_builder.process(idx_input);
-        if (!idx_result.success) {
-            throw std::runtime_error("Failed to build index for: " + file_path);
+        batch_config->build_manifest = true;
+        batch_config->use_batch_write = true;
+
+        auto batch_result =
+            co_await IndexBatchBuilderUtility::process(&scope, batch_config);
+
+        for (const auto& result : batch_result.results) {
+            if (!result.success) {
+                throw std::runtime_error(
+                    "Failed to build index for: " + result.file_path + ": " +
+                    result.error_message);
+            }
         }
+    }
 
-        // Collect metadata
-        MetadataCollectorUtility metadata_collector;
+    // Collect metadata in parallel using when_all
+    // Store inputs in vector to ensure lifetime across co_await
+    std::vector<MetadataCollectorUtilityInput> meta_inputs;
+    meta_inputs.reserve(input.source_files.size());
+    for (const auto& file_path : input.source_files) {
+        auto index_path =
+            internal::determine_index_path(file_path, input.index_dir);
         auto meta_input =
             MetadataCollectorUtilityInput::from_file(file_path).with_index(
-                idx_result.index_path);
+                index_path);
         if (input.checkpoint_size > 0) {
             meta_input.with_checkpoint_size(input.checkpoint_size);
         }
-        auto meta = co_await metadata_collector.process(meta_input);
+        meta_inputs.push_back(std::move(meta_input));
+    }
+
+    std::vector<coro::CoroTask<MetadataCollectorUtilityOutput>> metadata_tasks;
+    metadata_tasks.reserve(meta_inputs.size());
+    for (const auto& meta_input : meta_inputs) {
+        MetadataCollectorUtility collector;
+        metadata_tasks.push_back(collector.process(meta_input));
+    }
+
+    auto metadata_results = co_await coro::when_all(std::move(metadata_tasks));
+
+    // Build source file info from metadata results (same order as input)
+    plan.source_files.reserve(input.source_files.size());
+    for (std::size_t fi = 0; fi < input.source_files.size(); ++fi) {
+        const auto& file_path = input.source_files[fi];
+        const auto& meta = metadata_results[fi];
+
         if (!meta.success) {
             throw std::runtime_error("Failed to collect metadata for: " +
                                      file_path);
         }
 
-        // Determine the root-local `.dftindex` store path.
         std::string index_path =
             internal::determine_index_path(file_path, input.index_dir);
-
-        // Effective checkpoint count: treat 0 as 1
         std::size_t eff_ckpts =
             meta.num_checkpoints > 0 ? meta.num_checkpoints : 1;
 
@@ -135,12 +197,16 @@ coro::CoroTask<ExtractionPlan> ReorganizationPlannerUtility::process(
         sfi.uncompressed_size = meta.uncompressed_size;
         sfi.checkpoint_size = meta.checkpoint_size;
         plan.source_files.push_back(std::move(sfi));
+    }
+
+    // Plan extraction tasks for each file
+    for (std::size_t fi = 0; fi < input.source_files.size(); ++fi) {
+        const auto& file_path = input.source_files[fi];
+        const auto& meta = metadata_results[fi];
+        const auto& sfi = plan.source_files[fi];
 
-        // Open the shared index store and try manifest-based planning. Fall
-        // back to whole-file streaming when manifest tables are absent (file
-        // was below index_threshold).
         IndexDatabase idx_db(
-            index_path,
+            sfi.index_path,
             dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
         int file_info_id = idx_db.get_file_info_id(
             indexer::internal::get_logical_path(file_path));
@@ -150,6 +216,7 @@ coro::CoroTask<ExtractionPlan> ReorganizationPlannerUtility::process(
         }
 
         const bool has_manifest = idx_db.has_manifest_data(file_info_id);
+        const std::size_t eff_ckpts = sfi.num_checkpoints;
 
         if (has_manifest) {
             // Manifest-based planning: per-checkpoint extraction tasks.
@@ -216,23 +283,12 @@ coro::CoroTask<ExtractionPlan> ReorganizationPlannerUtility::process(
         } else {
             // Whole-file fallback: stream line-by-line, route each
             // event to a group, emit one task per group covering
-            // the entire file.  Only enabled for files at or below
-            // the default index threshold to avoid pathological
-            // memory usage on large traces with corrupt/partial
-            // indexes.
-            auto file_size = fs::file_size(file_path);
-            if (file_size > constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD) {
-                throw std::runtime_error(
-                    "Manifest tables missing for large file (>" +
-                    std::to_string(
-                        constants::indexer::DEFAULT_INDEX_SIZE_THRESHOLD) +
-                    " bytes): " + file_path +
-                    ". Re-index with index_threshold=0.");
-            }
-
+            // the entire file.
             std::map<std::string, std::vector<std::uint32_t>> group_lines;
             std::vector<std::uint32_t> meta_line_numbers;
 
+            simdjson::dom::parser parser;
+
             auto gen = async_streaming_gz_lines(file_path);
             while (auto line_opt = co_await gen.next()) {
                 const auto& line = *line_opt;
@@ -252,42 +308,33 @@ coro::CoroTask<ExtractionPlan> ReorganizationPlannerUtility::process(
                     continue;
                 }
 
-                yyjson_doc* doc =
-                    yyjson_read(begin, static_cast<size_t>(end - begin),
-                                YYJSON_READ_NOFLAG);
-                if (!doc) continue;
+                auto result =
+                    parser.parse(begin, static_cast<size_t>(end - begin));
+                if (result.error()) continue;
 
-                yyjson_val* root = yyjson_doc_get_root(doc);
-                if (!root || !yyjson_is_obj(root)) {
-                    yyjson_doc_free(doc);
-                    continue;
-                }
+                auto root = result.value_unsafe();
+                if (!root.is_object()) continue;
 
                 auto line_num = static_cast<std::uint32_t>(line.line_number);
-                yyjson_val* ph_val = yyjson_obj_get(root, "ph");
+                auto ph_result = root["ph"].get_string();
                 const bool is_metadata =
-                    ph_val && yyjson_is_str(ph_val) &&
-                    std::string_view(yyjson_get_str(ph_val),
-                                     yyjson_get_len(ph_val)) == "M";
+                    !ph_result.error() && ph_result.value_unsafe() == "M";
 
                 if (is_metadata) {
                     meta_line_numbers.push_back(line_num);
-                    yyjson_doc_free(doc);
                     continue;
                 }
 
                 std::string cat_str;
-                if (yyjson_val* cat_val = yyjson_obj_get(root, "cat");
-                    cat_val && yyjson_is_str(cat_val)) {
-                    cat_str.assign(yyjson_get_str(cat_val),
-                                   yyjson_get_len(cat_val));
+                auto cat_result = root["cat"].get_string();
+                if (!cat_result.error()) {
+                    cat_str = std::string(cat_result.value_unsafe());
                 }
 
                 std::string name_str;
-                if (yyjson_val* name_val = yyjson_obj_get(root, "name");
-                    name_val && yyjson_is_str(name_val)) {
-                    name_str.assign(yyjson_get_str(name_val),
-                                    yyjson_get_len(name_val));
+                auto name_result = root["name"].get_string();
+                if (!name_result.error()) {
+                    name_str = std::string(name_result.value_unsafe());
                 }
 
                 bool matched = false;
@@ -306,8 +353,6 @@ coro::CoroTask<ExtractionPlan> ReorganizationPlannerUtility::process(
                     group_lines[remainder_name].push_back(line_num);
                 }
                 plan.total_events++;
-
-                yyjson_doc_free(doc);
             }
 
             for (auto& [gname, lines] : group_lines) {
diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp
index 8664442d..4a0611a3 100644
--- a/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.cpp
@@ -1,11 +1,12 @@
 #include <dftracer/utils/core/common/logging.h>
 #include <dftracer/utils/utilities/common/json/json.h>
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
 #include <dftracer/utils/utilities/composites/dft/event.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/chunk_detail_scanner_utility.h>
 #include <dftracer/utils/utilities/composites/indexed_file_reader_utility.h>
 #include <dftracer/utils/utilities/composites/types.h>
 #include <dftracer/utils/utilities/reader/internal/stream_config.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <array>
 #include <charconv>
@@ -14,18 +15,13 @@
 #include <string_view>
 #include <unordered_set>
 
-// Import JsonValue from common json namespace
 using dftracer::utils::utilities::common::json::JsonValue;
 using dftracer::utils::utilities::composites::dft::DFTracerEvent;
 
 namespace dftracer::utils::utilities::composites::dft::statistics {
 
-// Constant for global (non-grouped) I/O metrics
 inline constexpr std::string_view GLOBAL_GROUP_KEY = "__global__";
 
-// Event names where args.ret represents bytes transferred (actual I/O).
-// Other syscalls like lseek64 (returns offset) and fork (returns PID)
-// have ret with different semantics and must be excluded.
 static constexpr auto IO_EVENT_NAMES =
     std::to_array<std::string_view>({"read", "write", "pread", "pwrite",
                                      "pread64", "pwrite64", "readv", "writev"});
@@ -35,46 +31,49 @@ static bool is_io_event(std::string_view name) {
            IO_EVENT_NAMES.end();
 }
 
-// Build a composite group key from the requested dimensions.
 static void build_group_key(std::string& key,
                             const std::vector<std::string>& group_by,
-                            const JsonValue& json, const JsonValue& args) {
+                            const DFTracerEvent& ev) {
     key.clear();
 
     for (std::size_t i = 0; i < group_by.size(); ++i) {
         if (i > 0) key.push_back('|');
         const auto& dim = group_by[i];
         if (dim == "name") {
-            key += json["name"].get<std::string_view>();
+            key += ev.name;
         } else if (dim == "cat") {
-            key += json["cat"].get<std::string_view>();
-        } else if (dim == "pid" || dim == "tid") {
-            std::uint64_t val = json[dim].get<std::uint64_t>();
+            key += ev.cat;
+        } else if (dim == "pid") {
             char buf[32];
-            auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), val);
+            auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), ev.pid);
+            if (ec == std::errc()) {
+                key.append(buf, ptr - buf);
+            }
+        } else if (dim == "tid") {
+            char buf[32];
+            auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), ev.tid);
             if (ec == std::errc()) {
                 key.append(buf, ptr - buf);
             }
         } else if (dim == "pid_tid") {
-            std::uint64_t pid = json["pid"].get<std::uint64_t>();
-            std::uint64_t tid = json["tid"].get<std::uint64_t>();
             char buf[64];
-            auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), pid);
+            auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), ev.pid);
             if (ec == std::errc()) {
                 key.append(buf, ptr - buf);
                 key.push_back(':');
-                auto [ptr2, ec2] = std::to_chars(ptr, buf + sizeof(buf), tid);
+                auto [ptr2, ec2] =
+                    std::to_chars(ptr, buf + sizeof(buf), ev.tid);
                 if (ec2 == std::errc()) {
                     key.append(ptr, ptr2 - ptr);
                 }
             }
         } else if (dim == "fhash") {
-            if (args.exists()) {
-                key += args["fhash"].get<std::string_view>();
+            if (ev.args.exists()) {
+                key += ev.args["fhash"].get<std::string_view>();
             }
         } else if (dim == "hhash") {
-            if (args.exists()) {
-                key += args["hhash"].get<std::string_view>();
+            if (ev.args.exists()) {
+                key += ev.args["hhash"].get<std::string_view>();
             }
         }
     }
@@ -85,7 +84,6 @@ coro::CoroTask<ChunkDetailScanOutput> ChunkDetailScannerUtility::process(
     ChunkDetailScanOutput output;
     output.success = false;
 
-    // Build filter sets for O(1) lookup
     std::unordered_set<std::string_view> name_filter;
     std::unordered_set<std::string_view> cat_filter;
     if (input.filter_names) {
@@ -103,7 +101,6 @@ coro::CoroTask<ChunkDetailScanOutput> ChunkDetailScannerUtility::process(
     bool has_cat_filter = !cat_filter.empty();
     bool has_grouping = input.group_by && !input.group_by->empty();
 
-    // Create reader (same pattern as chunk_indexer_utility.cpp)
     auto reader_input = composites::IndexedReadInput::from_file(input.file_path)
                             .with_checkpoint_size(input.checkpoint_size)
                             .with_index(input.index_path);
@@ -141,9 +138,7 @@ coro::CoroTask<ChunkDetailScanOutput> ChunkDetailScannerUtility::process(
     group_key_buf.reserve(128);
     static const std::string global_key{GLOBAL_GROUP_KEY};
 
-    char yy_buf[common::json::YYJSON_LINE_POOL_SIZE];
-    yyjson_alc yy_alc;
-    yyjson_alc_pool_init(&yy_alc, yy_buf, sizeof(yy_buf));
+    simdjson::dom::parser parser;
 
     while (!stream->done()) {
         auto chunk = co_await stream->read_async();
@@ -168,26 +163,18 @@ coro::CoroTask<ChunkDetailScanOutput> ChunkDetailScannerUtility::process(
             std::size_t line_len = newline - line_start;
 
             if (line_len > 0) {
-                yyjson_read_flag flg = YYJSON_READ_NOFLAG;
-                yyjson_doc* doc =
-                    yyjson_read_opts(const_cast<char*>(line_start), line_len,
-                                     flg, &yy_alc, nullptr);
-
-                if (doc) {
-                    yyjson_val* root = yyjson_doc_get_root(doc);
-                    if (root && yyjson_is_obj(root)) {
+                auto result = parser.parse(line_start, line_len);
+                if (!result.error()) {
+                    auto root = result.value_unsafe();
+                    if (root.is_object()) {
                         JsonValue json(root);
                         DFTracerEvent ev;
                         if (!DFTracerEvent::parse(json, ev)) {
-                            yyjson_doc_free(doc);
                             pos = (newline - data) + 1;
                             continue;
                         }
 
                         if (!ev.is_metadata()) {
-                            // Regular event
-
-                            // Apply filters
                             bool passes = true;
                             if (has_name_filter && name_filter.find(ev.name) ==
                                                        name_filter.end()) {
@@ -201,21 +188,17 @@ coro::CoroTask<ChunkDetailScanOutput> ChunkDetailScannerUtility::process(
                             if (passes) {
                                 double dur = static_cast<double>(ev.dur);
 
-                                // Global duration
                                 output.stats.duration.update(dur);
 
-                                // Determine I/O key for this event
                                 bool is_io = is_io_event(ev.name);
                                 const std::string* io_key_ptr;
 
                                 if (has_grouping) {
                                     build_group_key(group_key_buf,
-                                                    *input.group_by, json,
-                                                    ev.args);
+                                                    *input.group_by, ev);
 
                                     output.stats.grouped_duration[group_key_buf]
                                         .update(dur);
-                                    // Only insert category on first occurrence
                                     output.stats.group_key_category.try_emplace(
                                         group_key_buf, ev.cat);
                                     io_key_ptr = &group_key_buf;
@@ -223,7 +206,6 @@ coro::CoroTask<ChunkDetailScanOutput> ChunkDetailScannerUtility::process(
                                     io_key_ptr = &global_key;
                                 }
 
-                                // I/O metrics: only for actual I/O events
                                 if (is_io && ev.args.exists()) {
                                     auto ret_opt =
                                         ev.args["ret"]
@@ -255,7 +237,6 @@ coro::CoroTask<ChunkDetailScanOutput> ChunkDetailScannerUtility::process(
                             }
                         }
                     }
-                    yyjson_doc_free(doc);
                 }
             }
 
diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp
index 57682886..856843cc 100644
--- a/src/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.cpp
@@ -1,8 +1,9 @@
 #include <dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h>
-#include <yyjson.h>
 
 #include <cmath>
 #include <cstdint>
+#include <iomanip>
+#include <sstream>
 
 namespace dftracer::utils::utilities::composites::dft::statistics {
 
@@ -71,96 +72,87 @@ void DetailedStatistics::merge(const DetailedStatistics& other) {
     chunks_skipped += other.chunks_skipped;
 }
 
-// Helper: serialize a DistributionStats into a yyjson mutable object
-static yyjson_mut_val* distribution_to_json(yyjson_mut_doc* doc,
-                                            const DistributionStats& dist) {
-    yyjson_mut_val* obj = yyjson_mut_obj(doc);
-
-    yyjson_mut_obj_add_uint(doc, obj, "count", dist.count());
-    yyjson_mut_obj_add_real(doc, obj, "sum", dist.sum);
-    yyjson_mut_obj_add_real(doc, obj, "mean", dist.mean());
-    yyjson_mut_obj_add_real(doc, obj, "stddev", dist.stddev());
+namespace {
+// Writes `dist` to `ss` as one JSON object: count/sum/mean/stddev always,
+// min/max/percentiles only when the sketch holds data, then the histogram.
+void write_distribution_json(std::ostringstream& ss,
+                             const DistributionStats& dist) {
+    ss << "{\"count\":" << dist.count() << ",\"sum\":" << dist.sum;
+    ss << ",\"mean\":" << dist.mean() << ",\"stddev\":" << dist.stddev();
 
     if (dist.count() > 0 && !dist.sketch.empty()) {
-        yyjson_mut_obj_add_real(doc, obj, "min", dist.sketch.min());
-        yyjson_mut_obj_add_real(doc, obj, "max", dist.sketch.max());
-
-        yyjson_mut_val* pctls = yyjson_mut_obj(doc);
-        yyjson_mut_obj_add_real(doc, pctls, "p10", dist.sketch.quantile(0.1));
-        yyjson_mut_obj_add_real(doc, pctls, "p25", dist.sketch.quantile(0.25));
-        yyjson_mut_obj_add_real(doc, pctls, "p50", dist.sketch.quantile(0.5));
-        yyjson_mut_obj_add_real(doc, pctls, "p75", dist.sketch.quantile(0.75));
-        yyjson_mut_obj_add_real(doc, pctls, "p90", dist.sketch.quantile(0.9));
-        yyjson_mut_obj_add_real(doc, pctls, "p95", dist.sketch.quantile(0.95));
-        yyjson_mut_obj_add_real(doc, pctls, "p99", dist.sketch.quantile(0.99));
-        yyjson_mut_obj_add_val(doc, obj, "percentiles", pctls);
+        // Percentile labels pair with quantile fractions (p10 -> 0.1, ...).
+        // Doubles use whatever precision the caller configured on `ss`.
+        auto& sk = dist.sketch;
+        ss << ",\"min\":" << sk.min() << ",\"max\":" << sk.max();
+        ss << ",\"percentiles\":{\"p10\":" << sk.quantile(0.1);
+        ss << ",\"p25\":" << sk.quantile(0.25);
+        ss << ",\"p50\":" << sk.quantile(0.5);
+        ss << ",\"p75\":" << sk.quantile(0.75);
+        ss << ",\"p90\":" << sk.quantile(0.9);
+        ss << ",\"p95\":" << sk.quantile(0.95);
+        ss << ",\"p99\":" << sk.quantile(0.99) << '}';
     }
 
-    // Direct serialization to avoid string roundtrip
-    yyjson_mut_val* hist_val = dist.histogram.to_yyjson(doc);
-    yyjson_mut_obj_add_val(doc, obj, "histogram", hist_val);
-
-    return obj;
+    // The histogram serializes itself; its output is embedded verbatim.
+    ss << ",\"histogram\":" << dist.histogram.to_json() << '}';
 }
 
-// Helper: serialize IOEventMetrics into a yyjson mutable object
-static yyjson_mut_val* io_metrics_to_json(yyjson_mut_doc* doc,
-                                          const IOEventMetrics& io) {
-    yyjson_mut_val* obj = yyjson_mut_obj(doc);
-    yyjson_mut_obj_add_val(doc, obj, "duration",
-                           distribution_to_json(doc, io.duration));
-    yyjson_mut_obj_add_val(doc, obj, "size",
-                           distribution_to_json(doc, io.size));
+void write_io_metrics_json(std::ostringstream& ss, const IOEventMetrics& io) {
+    ss << "{\"duration\":";  // "duration" and "size" are always written
+    write_distribution_json(ss, io.duration);
+    ss << ",\"size\":";
+    write_distribution_json(ss, io.size);
     if (io.bandwidth.count() > 0) {
-        yyjson_mut_obj_add_val(doc, obj, "bandwidth",
-                               distribution_to_json(doc, io.bandwidth));
+        ss << ",\"bandwidth\":";  // optional: omitted when count() is 0
+        write_distribution_json(ss, io.bandwidth);
     }
     if (io.offset.count() > 0) {
-        yyjson_mut_obj_add_val(doc, obj, "offset",
-                               distribution_to_json(doc, io.offset));
+        ss << ",\"offset\":";  // optional: omitted when count() is 0
+        write_distribution_json(ss, io.offset);
     }
-    return obj;
+    ss << '}';  // closes the object opened by the "duration" write above
 }
+}  // namespace
 
 std::string DetailedStatistics::to_json() const {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* root = yyjson_mut_obj(doc);
-    yyjson_mut_doc_set_root(doc, root);
+    // Compact JSON written by hand; doubles use 17 significant digits.
+    // Group keys go through std::quoted so '"' and '\\' are escaped.
+    std::ostringstream ss;
+    ss << std::setprecision(17) << '{';
 
-    // Scan progress
-    yyjson_mut_obj_add_uint(doc, root, "events_scanned", events_scanned);
-    yyjson_mut_obj_add_uint(doc, root, "chunks_scanned", chunks_scanned);
-    yyjson_mut_obj_add_uint(doc, root, "chunks_skipped", chunks_skipped);
+    ss << "\"events_scanned\":" << events_scanned;
+    ss << ",\"chunks_scanned\":" << chunks_scanned;
+    ss << ",\"chunks_skipped\":" << chunks_skipped;
 
-    // Global duration
-    yyjson_mut_obj_add_val(doc, root, "duration",
-                           distribution_to_json(doc, duration));
+    ss << ",\"duration\":";
+    write_distribution_json(ss, duration);
 
-    // Grouped duration
     if (!grouped_duration.empty()) {
-        yyjson_mut_val* gd = yyjson_mut_obj(doc);
+        ss << ",\"grouped_duration\":{";
+        bool first = true;
         for (const auto& [key, dist] : grouped_duration) {
-            yyjson_mut_obj_add_val(doc, gd, key.c_str(),
-                                   distribution_to_json(doc, dist));
+            ss << (first ? "" : ",") << std::quoted(key) << ':';
+            first = false;
+            write_distribution_json(ss, dist);
         }
-        yyjson_mut_obj_add_val(doc, root, "grouped_duration", gd);
+        ss << '}';
     }
 
-    // Grouped I/O
     if (!grouped_io.empty()) {
-        yyjson_mut_val* gio = yyjson_mut_obj(doc);
+        // Same layout as grouped_duration, one I/O metrics object per key.
+        ss << ",\"grouped_io\":{";
+        bool first = true;
         for (const auto& [key, io] : grouped_io) {
-            yyjson_mut_obj_add_val(doc, gio, key.c_str(),
-                                   io_metrics_to_json(doc, io));
+            ss << (first ? "" : ",") << std::quoted(key) << ':';
+            first = false;
+            write_io_metrics_json(ss, io);
         }
-        yyjson_mut_obj_add_val(doc, root, "grouped_io", gio);
+        ss << '}';
     }
 
-    char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_PRETTY, nullptr);
-    std::string result(json_str ? json_str : "{}");
-    if (json_str) free(json_str);
-    yyjson_mut_doc_free(doc);
-    return result;
+    ss << '}';
+    return ss.str();
 }
 
 }  // namespace dftracer::utils::utilities::composites::dft::statistics
diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp
new file mode 100644
index 00000000..80870278
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.cpp
@@ -0,0 +1,5 @@
+#include <dftracer/utils/utilities/composites/dft/statistics/shared_index_statistics_reader.h>
+
+namespace dftracer::utils::utilities::composites::dft::statistics {
+
+}  // namespace dftracer::utils::utilities::composites::dft::statistics
diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp
index 6f34cdd0..8a72329f 100644
--- a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.cpp
@@ -1,16 +1,16 @@
 #include <dftracer/utils/core/common/filesystem.h>
-#include <dftracer/utils/core/rocksdb/async.h>
 #include <dftracer/utils/utilities/common/json/json_value.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h>
 #include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 namespace dftracer::utils::utilities::composites::dft::statistics {
 
 using dftracer::utils::utilities::common::json::JsonValue;
+using dftracer::utils::utilities::indexer::ChunkStatisticsResult;
 using dftracer::utils::utilities::indexer::IndexDatabase;
 using dftracer::utils::utilities::indexer::internal::get_logical_path;
 using fileio::lines::sources::async_streaming_gz_lines;
@@ -35,33 +35,29 @@ coro::CoroTask<TraceStatistics> StatisticsAggregatorUtility::process(
     }
 
     bool needs_streaming_fallback = false;
-    auto do_query = [&input, &result,
-                     &needs_streaming_fallback]() -> TraceStatistics {
-        try {
-            IndexDatabase idx_db(result.index_path);
-
-            int fid =
-                idx_db.get_file_info_id(get_logical_path(input.file_path));
-            if (fid < 0) {
-                result.success = false;
-                result.error_message =
-                    "File not found in index: " + input.file_path;
-                return result;
-            }
+    try {
+        IndexDatabase idx_db(result.index_path);
 
-            std::vector<IndexDatabase::ChunkStatisticsResult> chunks;
-            try {
-                chunks = idx_db.query_chunk_statistics(fid);
-            } catch (const std::exception&) {
-                needs_streaming_fallback = true;
-                return result;
-            }
+        int fid = idx_db.get_file_info_id(get_logical_path(input.file_path));
+        if (fid < 0) {
+            result.success = false;
+            result.error_message =
+                "File not found in index: " + input.file_path;
+            co_return result;
+        }
 
-            if (chunks.empty()) {
-                needs_streaming_fallback = true;
-                return result;
-            }
+        std::vector<ChunkStatisticsResult> chunks;
+        try {
+            chunks = idx_db.query_chunk_statistics(fid);
+        } catch (const std::exception&) {
+            needs_streaming_fallback = true;
+        }
 
+        if (!needs_streaming_fallback && chunks.empty()) {
+            needs_streaming_fallback = true;
+        }
+
+        if (!needs_streaming_fallback) {
             result.num_chunks = chunks.size();
             result.merged = chunks[0].stats;
             for (std::size_t i = 1; i < chunks.size(); ++i) {
@@ -70,6 +66,8 @@ coro::CoroTask<TraceStatistics> StatisticsAggregatorUtility::process(
 
             auto dim_stats = idx_db.query_chunk_dimension_stats(fid);
             for (const auto& ds : dim_stats) {
+                if (!ds.has_value_counts_payload()) continue;
+                ds.ensure_value_counts_decoded();
                 if (!ds.value_counts) continue;
                 if (ds.dimension == "cat") {
                     for (const auto& [k, v] : *ds.value_counts)
@@ -84,14 +82,13 @@ coro::CoroTask<TraceStatistics> StatisticsAggregatorUtility::process(
             }
 
             result.success = true;
-        } catch (const std::exception& e) {
-            result.success = false;
-            result.error_message = e.what();
+            co_return result;
         }
-        return result;
-    };
-
-    result = co_await rocksdb::run(do_query);
+    } catch (const std::exception& e) {
+        result.success = false;
+        result.error_message = e.what();
+        co_return result;
+    }
 
     if (!needs_streaming_fallback) {
         co_return result;
@@ -104,24 +101,21 @@ coro::CoroTask<TraceStatistics> StatisticsAggregatorUtility::process(
     }
 
     /// Sequential fallback: stream the file line-by-line and compute
-    /// statistics on-the-fly when the index has no chunk_statistics
-    /// (e.g. file was below the index_threshold).
+    /// statistics on-the-fly when the index has no chunk_statistics.
     try {
         indexing::ChunkStatistics stats;
+        simdjson::dom::parser parser;
         auto gen = async_streaming_gz_lines(input.file_path);
         while (auto line_opt = co_await gen.next()) {
             const auto& line = *line_opt;
             if (line.content.empty()) continue;
 
-            yyjson_doc* doc = yyjson_read(
-                line.content.data(), line.content.size(), YYJSON_READ_NOFLAG);
-            if (!doc) continue;
+            auto parse_result =
+                parser.parse(line.content.data(), line.content.size());
+            if (parse_result.error()) continue;
 
-            yyjson_val* root = yyjson_doc_get_root(doc);
-            if (!root || !yyjson_is_obj(root)) {
-                yyjson_doc_free(doc);
-                continue;
-            }
+            auto root = parse_result.value_unsafe();
+            if (!root.is_object()) continue;
 
             try {
                 JsonValue json(root);
@@ -138,11 +132,7 @@ coro::CoroTask<TraceStatistics> StatisticsAggregatorUtility::process(
                     stats.update_from_event(name, cat, pid, tid, ts, dur);
                 }
             } catch (const std::exception&) {
-                // Skip malformed or partial events without
-                // aborting the entire aggregation.
             }
-
-            yyjson_doc_free(doc);
         }
 
         result.merged = std::move(stats);
@@ -156,4 +146,104 @@ coro::CoroTask<TraceStatistics> StatisticsAggregatorUtility::process(
     co_return result;
 }
 
+/// Resolves statistics for several trace files against a single index.
+/// The index at `input.index_path` is opened read-only once; every path in
+/// `input.file_paths` is mapped to its file id and filled from the batched
+/// per-file queries.
+///
+/// @param input  file paths to resolve and the index store to read.
+/// @return one entry per input path, in input order (none for no paths);
+///         failures are reported per entry via success / error_message.
+coro::CoroTask<std::vector<TraceStatistics>>
+StatisticsAggregatorUtility::process_batch(
+    const StatisticsAggregatorBatchInput& input) {
+    const auto& files = input.file_paths;
+    const auto& index_path = input.index_path;
+    if (files.empty()) {
+        co_return std::vector<TraceStatistics>{};
+    }
+
+    // One pre-labelled result slot per requested file.
+    std::vector<TraceStatistics> results(files.size());
+    for (std::size_t i = 0; i < files.size(); ++i) {
+        results[i].file_path = files[i];
+        results[i].index_path = index_path;
+    }
+
+    if (!fs::exists(index_path)) {
+        for (auto& entry : results) {
+            entry.success = false;
+            entry.error_message = "Index store not found: " + index_path;
+        }
+        co_return results;
+    }
+
+    try {
+        IndexDatabase db(
+            index_path,
+            dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
+
+        // Resolve file ids; unknown files are marked failed here and left
+        // out of the batched queries below.
+        std::vector<int> file_ids(files.size(), -1);
+        std::vector<int> valid_ids;
+        valid_ids.reserve(files.size());
+        for (std::size_t i = 0; i < files.size(); ++i) {
+            file_ids[i] = db.get_file_info_id(get_logical_path(files[i]));
+            if (file_ids[i] >= 0) {
+                valid_ids.push_back(file_ids[i]);
+                continue;
+            }
+            results[i].success = false;
+            results[i].error_message = "File not found in index: " + files[i];
+        }
+
+        auto scalar_batch = db.query_file_scalar_stats_batch(valid_ids);
+        auto cat_batch = db.query_file_category_counts_batch(valid_ids);
+        auto pid_tid_batch = db.query_file_pid_tid_counts_batch(valid_ids);
+        auto name_batch = db.query_file_name_summaries_batch(valid_ids);
+
+        for (std::size_t i = 0; i < files.size(); ++i) {
+            const int fid = file_ids[i];
+            if (fid < 0) continue;
+            auto& r = results[i];
+
+            auto scalar_it = scalar_batch.find(fid);
+            if (scalar_it == scalar_batch.end()) {
+                r.success = false;
+                r.error_message = "No file summary in index for: " + files[i];
+                continue;
+            }
+
+            r.merged = scalar_it->second.stats;
+            r.num_chunks = scalar_it->second.num_chunks;
+            r.success = true;
+
+            // NOTE(review): the maps are moved out of the batch results, so
+            // a path listed twice would find them already moved-from;
+            // confirm callers never pass duplicates.
+            if (auto it = cat_batch.find(fid); it != cat_batch.end()) {
+                r.merged.category_counts = std::move(it->second);
+            }
+            if (auto it = pid_tid_batch.find(fid); it != pid_tid_batch.end()) {
+                r.merged.pid_tid_counts = std::move(it->second);
+            }
+            if (auto it = name_batch.find(fid); it != name_batch.end()) {
+                r.merged.name_counts = std::move(it->second.counts);
+            }
+        }
+    } catch (const std::exception& e) {
+        // Attribute the failure only to entries that have neither succeeded
+        // nor already recorded their own error.
+        for (auto& entry : results) {
+            if (!entry.success && entry.error_message.empty()) {
+                entry.success = false;
+                entry.error_message = e.what();
+            }
+        }
+    }
+
+    co_return results;
+}
+
 }  // namespace dftracer::utils::utilities::composites::dft::statistics
diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp
index f5f8b641..f70933b6 100644
--- a/src/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.cpp
@@ -1,24 +1,29 @@
 #include <dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.h>
-#include <yyjson.h>
 
 #include <algorithm>
 #include <cmath>
+#include <iomanip>
 #include <limits>
+#include <sstream>
 
 namespace dftracer::utils::utilities::composites::dft::statistics {
 
 namespace {
-std::vector<std::pair<std::string, std::uint64_t>> sorted_desc(
-    const std::unordered_map<std::string, std::uint64_t>& m) {
+template <typename Map>  // returns m's entries copied, highest count first
+std::vector<std::pair<std::string, std::uint64_t>> sorted_desc(const Map& m) {
     std::vector<std::pair<std::string, std::uint64_t>> v(m.begin(), m.end());
     std::sort(v.begin(), v.end(),
               [](const auto& a, const auto& b) { return a.second > b.second; });
     return v;
 }
 
-std::vector<std::pair<std::string, std::uint64_t>> top_n(
-    const std::unordered_map<std::string, std::uint64_t>& m, std::uint64_t n) {
+template <typename Map>
+std::vector<std::pair<std::string, std::uint64_t>> top_n(const Map& m,
+                                                         std::uint64_t n) {
     auto v = sorted_desc(m);
+    if (n == 0) {
+        return v;
+    }
     if (v.size() > n) {
         v.resize(static_cast<std::size_t>(n));
     }
@@ -129,48 +134,44 @@ coro::CoroTask<StatisticsQueryOutput> StatisticsQueryUtility::process(
 }
 
 std::string StatisticsQueryOutput::to_json() const {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* root = yyjson_mut_obj(doc);
-    yyjson_mut_doc_set_root(doc, root);
+    // Compact JSON written by hand; doubles use 17 significant digits.
+    // Strings go through std::quoted so embedded '"' and '\\' are escaped.
+    std::ostringstream ss;
+    ss << std::setprecision(17) << '{';
 
-    yyjson_mut_obj_add_str(doc, root, "query_type", query_type_name.c_str());
-    yyjson_mut_obj_add_uint(doc, root, "total_events", total_events);
+    ss << "\"query_type\":" << std::quoted(query_type_name);
+    ss << ",\"total_events\":" << total_events;
 
     if (!results.empty()) {
-        yyjson_mut_val* arr = yyjson_mut_arr(doc);
+        ss << ",\"results\":[";
+        bool first = true;
         for (const auto& [name, count] : results) {
-            yyjson_mut_val* item = yyjson_mut_obj(doc);
-            yyjson_mut_obj_add_str(doc, item, "name", name.c_str());
-            yyjson_mut_obj_add_uint(doc, item, "count", count);
-            yyjson_mut_arr_append(arr, item);
+            ss << (first ? "" : ",") << "{\"name\":" << std::quoted(name);
+            ss << ",\"count\":" << count << '}';
+            first = false;
         }
-        yyjson_mut_obj_add_val(doc, root, "results", arr);
+        ss << ']';
     }
 
     if (min_timestamp_us > 0 || max_timestamp_us > 0) {
-        yyjson_mut_val* tr = yyjson_mut_obj(doc);
-        yyjson_mut_obj_add_uint(doc, tr, "min_timestamp_us", min_timestamp_us);
-        yyjson_mut_obj_add_uint(doc, tr, "max_timestamp_us", max_timestamp_us);
-        yyjson_mut_obj_add_real(doc, tr, "time_span_seconds",
-                                time_span_seconds);
-        yyjson_mut_obj_add_val(doc, root, "time_range", tr);
+        ss << ",\"time_range\":{";
+        ss << "\"min_timestamp_us\":" << min_timestamp_us;
+        ss << ",\"max_timestamp_us\":" << max_timestamp_us;
+        ss << ",\"time_span_seconds\":" << time_span_seconds << '}';
     }
 
     if (duration_count > 0) {
-        yyjson_mut_val* dur = yyjson_mut_obj(doc);
-        yyjson_mut_obj_add_uint(doc, dur, "count", duration_count);
-        yyjson_mut_obj_add_real(doc, dur, "mean_us", duration_mean_us);
-        yyjson_mut_obj_add_real(doc, dur, "stddev_us", duration_stddev_us);
-        yyjson_mut_obj_add_uint(doc, dur, "min_us", duration_min_us);
-        yyjson_mut_obj_add_uint(doc, dur, "max_us", duration_max_us);
-        yyjson_mut_obj_add_val(doc, root, "duration", dur);
+        ss << ",\"duration\":{";
+        ss << "\"count\":" << duration_count;
+        ss << ",\"mean_us\":" << duration_mean_us;
+        ss << ",\"stddev_us\":" << duration_stddev_us;
+        ss << ",\"min_us\":" << duration_min_us;
+        ss << ",\"max_us\":" << duration_max_us;
+        ss << '}';
     }
 
-    char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_PRETTY, nullptr);
-    std::string result(json_str ? json_str : "{}");
-    if (json_str) free(json_str);
-    yyjson_mut_doc_free(doc);
-    return result;
+    ss << '}';
+    return ss.str();
 }
 
 }  // namespace dftracer::utils::utilities::composites::dft::statistics
diff --git a/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp b/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp
index 584c3a4c..59778407 100644
--- a/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/statistics/trace_statistics.cpp
@@ -1,7 +1,8 @@
 #include <dftracer/utils/utilities/composites/dft/statistics/trace_statistics.h>
-#include <yyjson.h>
 
 #include <cmath>
+#include <iomanip>
+#include <sstream>
 
 namespace dftracer::utils::utilities::composites::dft::statistics {
 
@@ -40,80 +41,77 @@ std::size_t TraceStatistics::num_pid_tids() const {
 }
 
 namespace {
-void add_counts_object(
-    yyjson_mut_doc* doc, yyjson_mut_val* parent, const char* key,
-    const std::unordered_map<std::string, std::uint64_t>& m) {
-    yyjson_mut_val* obj = yyjson_mut_obj(doc);
+// Appends "key":{...} to ss (comma first unless first_field is set), with
+// one member per entry of m; keys, assumed string-like, use std::quoted.
+template <typename Map>
+void add_counts_object(std::ostringstream& ss, const char* key, const Map& m,
+                       bool& first_field) {
+    ss << (first_field ? "\"" : ",\"") << key << "\":{";
+    first_field = false;
+    bool first = true;
     for (const auto& [k, v] : m) {
-        yyjson_mut_obj_add_uint(doc, obj, k.c_str(), v);
+        ss << (first ? "" : ",") << std::quoted(k) << ':' << v;
+        first = false;
     }
-    yyjson_mut_obj_add_val(doc, parent, key, obj);
+    ss << '}';
 }
 }  // namespace
 
 std::string TraceStatistics::to_json() const {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* root = yyjson_mut_obj(doc);
-    yyjson_mut_doc_set_root(doc, root);
+    // Compact JSON built by hand. String members go through std::quoted so
+    // embedded '"' and '\\' are escaped; control characters are not.
+    std::ostringstream ss;
+    ss << std::setprecision(17) << '{';
 
-    yyjson_mut_obj_add_str(doc, root, "file_path", file_path.c_str());
-    yyjson_mut_obj_add_str(doc, root, "index_path", index_path.c_str());
-    yyjson_mut_obj_add_bool(doc, root, "success", success);
+    ss << "\"file_path\":" << std::quoted(file_path);
+    ss << ",\"index_path\":" << std::quoted(index_path);
+    ss << ",\"success\":" << (success ? "true" : "false");
+    ss << ",\"total_events\":" << total_events();
 
     if (!success) {
-        yyjson_mut_obj_add_str(doc, root, "error", error_message.c_str());
+        ss << ",\"error\":" << std::quoted(error_message);
     } else {
-        yyjson_mut_obj_add_uint(doc, root, "num_chunks", num_chunks);
-        yyjson_mut_obj_add_uint(doc, root, "total_events", total_events());
-        yyjson_mut_obj_add_uint(doc, root, "num_categories", num_categories());
-        yyjson_mut_obj_add_uint(doc, root, "num_unique_names",
-                                num_unique_names());
-        yyjson_mut_obj_add_uint(doc, root, "num_pid_tids", num_pid_tids());
+        ss << ",\"num_chunks\":" << num_chunks;
+        ss << ",\"num_categories\":" << num_categories();
+        ss << ",\"num_unique_names\":" << num_unique_names();
+        ss << ",\"num_pid_tids\":" << num_pid_tids();
 
         // Time range
-        yyjson_mut_val* time_range = yyjson_mut_obj(doc);
+        ss << ",\"time_range\":{";
         if (merged.min_timestamp_us !=
             std::numeric_limits<std::uint64_t>::max()) {
-            yyjson_mut_obj_add_uint(doc, time_range, "min_timestamp_us",
-                                    merged.min_timestamp_us);
-            yyjson_mut_obj_add_uint(doc, time_range, "max_timestamp_us",
-                                    merged.max_timestamp_us);
+            ss << "\"min_timestamp_us\":" << merged.min_timestamp_us;
+            ss << ",\"max_timestamp_us\":" << merged.max_timestamp_us << ',';
         }
-        yyjson_mut_obj_add_real(doc, time_range, "time_span_seconds",
-                                time_span_seconds());
-        yyjson_mut_obj_add_val(doc, root, "time_range", time_range);
+        ss << "\"time_span_seconds\":" << time_span_seconds();
+        ss << '}';
 
         // Duration stats
-        yyjson_mut_val* duration = yyjson_mut_obj(doc);
-        yyjson_mut_obj_add_uint(doc, duration, "count", merged.duration_count);
+        ss << ",\"duration\":{";
+        ss << "\"count\":" << merged.duration_count;
         if (merged.duration_count > 0) {
-            yyjson_mut_obj_add_int(doc, duration, "sum_us",
-                                   merged.duration_sum_us);
-            yyjson_mut_obj_add_real(doc, duration, "mean_us",
-                                    duration_mean_us());
-            yyjson_mut_obj_add_real(doc, duration, "stddev_us",
-                                    duration_stddev_us());
+            ss << ",\"sum_us\":" << merged.duration_sum_us;
+            ss << ",\"mean_us\":" << duration_mean_us();
+            ss << ",\"stddev_us\":" << duration_stddev_us();
             if (merged.duration_min_us !=
                 std::numeric_limits<std::uint64_t>::max()) {
-                yyjson_mut_obj_add_uint(doc, duration, "min_us",
-                                        merged.duration_min_us);
+                ss << ",\"min_us\":" << merged.duration_min_us;
             }
-            yyjson_mut_obj_add_uint(doc, duration, "max_us",
-                                    merged.duration_max_us);
+            ss << ",\"max_us\":" << merged.duration_max_us;
         }
-        yyjson_mut_obj_add_val(doc, root, "duration", duration);
+        ss << '}';
 
         // Count maps
-        add_counts_object(doc, root, "category_counts", merged.category_counts);
-        add_counts_object(doc, root, "name_counts", merged.name_counts);
-        add_counts_object(doc, root, "pid_tid_counts", merged.pid_tid_counts);
+        bool first_field = false;
+        add_counts_object(ss, "category_counts", merged.category_counts,
+                          first_field);
+        add_counts_object(ss, "name_counts", merged.name_counts, first_field);
+        add_counts_object(ss, "pid_tid_counts", merged.pid_tid_counts,
+                          first_field);
     }
 
-    char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_PRETTY, nullptr);
-    std::string result(json_str ? json_str : "{}");
-    if (json_str) free(json_str);
-    yyjson_mut_doc_free(doc);
-    return result;
+    ss << '}';
+    return ss.str();
 }
 
 }  // namespace dftracer::utils::utilities::composites::dft::statistics
diff --git a/src/dftracer/utils/utilities/composites/dft/views/view_definition.cpp b/src/dftracer/utils/utilities/composites/dft/views/view_definition.cpp
index ac132b15..476b2919 100644
--- a/src/dftracer/utils/utilities/composites/dft/views/view_definition.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/views/view_definition.cpp
@@ -1,7 +1,8 @@
 #include <dftracer/utils/utilities/composites/dft/views/view_definition.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <cstdlib>
+#include <sstream>
 #include <string>
 
 namespace dftracer::utils::utilities::composites::dft::views {
@@ -34,55 +35,99 @@ ViewDefinition& ViewDefinition::with_include_metadata(bool v) {
     return *this;
 }
 
-std::string ViewDefinition::to_json() const {
-    yyjson_mut_doc* doc = yyjson_mut_doc_new(nullptr);
-    yyjson_mut_val* root = yyjson_mut_obj(doc);
-    yyjson_mut_doc_set_root(doc, root);
+namespace {
+
+/// Escape `s` so it can be embedded between double quotes in JSON output.
+/// Quote and backslash are backslash-escaped, and every control byte below
+/// 0x20 is escaped too (RFC 8259 section 7 forbids them raw): the short
+/// two-character forms for BS, TAB, LF, FF and CR, the six-character
+/// unicode form (u00XX) for the rest. All other bytes are copied verbatim.
+///
+/// @param s Raw string value.
+/// @return The escaped text, without surrounding quotes.
+std::string escape_json_string(const std::string& s) {
+    static constexpr char HEX[] = "0123456789abcdef";
+    // Short escape letters for bytes 0x08..0x0D; 0 means "no short form".
+    static constexpr char ABBR[] = {'b', 't', 'n', 0, 'f', 'r'};
+    std::string result;
+    result.reserve(s.size());
+    for (const char c : s) {
+        const auto byte = static_cast<unsigned char>(c);
+        if (c == '"' || c == '\\') {
+            result += '\\';
+            result += c;
+        } else if (byte >= 0x08 && byte <= 0x0D && ABBR[byte - 0x08] != 0) {
+            result += '\\';
+            result += ABBR[byte - 0x08];
+        } else if (byte < 0x20) {
+            // No short form exists: emit the generic unicode escape.
+            result += "\\u00";
+            result += HEX[byte >> 4];
+            result += HEX[byte & 0x0F];
+        } else {
+            result += c;
+        }
+    }
+    return result;
+}
 
-    yyjson_mut_obj_add_str(doc, root, "name", name.c_str());
-    yyjson_mut_obj_add_str(doc, root, "description", description.c_str());
+}  // namespace
+
+std::string ViewDefinition::to_json() const {
+    std::ostringstream out;
+    out << "{\n";  // hand-written object, two-space indent
+    out << "  \"name\": \"" << escape_json_string(name) << "\",\n";
+    out << "  \"description\": \"" << escape_json_string(description) << "\"";
 
     if (query) {
-        yyjson_mut_obj_add_str(doc, root, "query", query->source().c_str());
+        out << ",\n  \"query\": \"" << escape_json_string(query->source())
+            << "\"";
     }
 
-    yyjson_mut_obj_add_bool(doc, root, "include_metadata", include_metadata);
-
-    char* json_str = yyjson_mut_write(doc, YYJSON_WRITE_PRETTY, nullptr);
-    std::string result(json_str);
-    free(json_str);
-    yyjson_mut_doc_free(doc);
-    return result;
+    out << ",\n  \"include_metadata\": "
+        << (include_metadata ? "true" : "false") << "\n";
+    out << "}";  // no trailing newline after the closing brace
+    return out.str();
 }
 
 ViewDefinition ViewDefinition::from_json(const std::string& json) {
-    yyjson_doc* doc =
-        yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-    yyjson_val* root = yyjson_doc_get_root(doc);
-
     ViewDefinition view_def;
 
-    yyjson_val* name_val = yyjson_obj_get(root, "name");
-    if (name_val && yyjson_is_str(name_val)) {
-        view_def.name = yyjson_get_str(name_val);
+    simdjson::dom::parser parser;
+    auto result = parser.parse(json);
+    if (result.error()) {
+        return view_def;  // unparsable input yields a default view
+    }
+
+    auto root = result.value_unsafe();
+    if (!root.is_object()) {
+        return view_def;  // a non-object root is ignored as well
+    }
+
+    auto name_result = root["name"];
+    if (!name_result.error() && name_result.value_unsafe().is_string()) {
+        view_def.name =
+            std::string(name_result.value_unsafe().get_string().value());
     }
 
-    yyjson_val* desc_val = yyjson_obj_get(root, "description");
-    if (desc_val && yyjson_is_str(desc_val)) {
-        view_def.description = yyjson_get_str(desc_val);
+    auto desc_result = root["description"];
+    if (!desc_result.error() && desc_result.value_unsafe().is_string()) {
+        view_def.description =
+            std::string(desc_result.value_unsafe().get_string().value());
     }
 
-    yyjson_val* query_val = yyjson_obj_get(root, "query");
-    if (query_val && yyjson_is_str(query_val)) {
-        view_def.with_query(yyjson_get_str(query_val));
+    auto query_result = root["query"];
+    if (!query_result.error() && query_result.value_unsafe().is_string()) {
+        view_def.with_query(
+            std::string(query_result.value_unsafe().get_string().value()));
     }
 
-    yyjson_val* meta_val = yyjson_obj_get(root, "include_metadata");
-    if (meta_val && yyjson_is_bool(meta_val)) {
-        view_def.include_metadata = yyjson_get_bool(meta_val);
+    auto meta_result = root["include_metadata"];
+    if (!meta_result.error() && meta_result.value_unsafe().is_bool()) {
+        view_def.include_metadata =
+            meta_result.value_unsafe().get_bool().value();
     }
 
-    yyjson_doc_free(doc);
     return view_def;
 }
 
diff --git a/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp b/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp
index 5759ff4e..64403843 100644
--- a/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/dft/views/view_reader_utility.cpp
@@ -7,7 +7,7 @@
 #include <dftracer/utils/utilities/composites/indexed_file_reader_utility.h>
 #include <dftracer/utils/utilities/composites/types.h>
 #include <dftracer/utils/utilities/reader/internal/stream_config.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <cstring>
 #include <string>
@@ -134,10 +134,7 @@ coro::AsyncGenerator<ViewReaderBatch> ViewReaderUtility::process(
 
     ViewReaderBatch batch;
 
-    // NOTE(perf): reusable yyjson allocator
-    char yy_buf[common::json::YYJSON_LINE_POOL_SIZE];
-    yyjson_alc yy_alc;
-    yyjson_alc_pool_init(&yy_alc, yy_buf, sizeof(yy_buf));
+    simdjson::dom::parser parser;
 
     while (!stream->done()) {
         auto chunk = co_await stream->read_async();
@@ -155,13 +152,10 @@ coro::AsyncGenerator<ViewReaderBatch> ViewReaderUtility::process(
             std::size_t line_len = newline - line_start;
 
             if (line_len > 0) {
-                yyjson_doc* doc =
-                    yyjson_read_opts(const_cast<char*>(line_start), line_len,
-                                     YYJSON_READ_NOFLAG, &yy_alc, nullptr);
-
-                if (doc) {
-                    yyjson_val* root = yyjson_doc_get_root(doc);
-                    if (root && yyjson_is_obj(root)) {
+                auto result = parser.parse(line_start, line_len);
+                if (!result.error()) {
+                    auto root = result.value_unsafe();
+                    if (root.is_object()) {
                         JsonValue json(root);
                         std::string_view ph =
                             json["ph"].get<std::string_view>();
@@ -206,7 +200,6 @@ coro::AsyncGenerator<ViewReaderBatch> ViewReaderUtility::process(
                             }
                         }
                     }
-                    yyjson_doc_free(doc);
                 }
             }
 
@@ -241,69 +234,56 @@ using common::arrow::RecordBatchBuilder;
 
 ArrowExportResult ViewReaderBatch::to_arrow() const {
     RecordBatchBuilder builder;
+    return to_arrow(builder);  // delegates to the builder-taking overload
+}
+
+ArrowExportResult ViewReaderBatch::to_arrow(RecordBatchBuilder& builder) const {
     builder.reserve(events.size());
-    std::vector<yyjson_doc*> held_docs;
     std::vector<std::string> held_serialized;
+    simdjson::dom::parser parser;  // NOTE(review): parse() invalidates old views
 
     for (const auto& event_str : events) {
-        yyjson_doc* doc = yyjson_read(event_str.data(), event_str.size(), 0);
-        if (!doc) continue;
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        if (!root || !yyjson_is_obj(root)) {
-            yyjson_doc_free(doc);
-            continue;
-        }
-        held_docs.push_back(doc);
+        auto result = parser.parse(event_str.data(), event_str.size());
+        if (result.error()) continue;
+        auto elem = result.value_unsafe();
+        if (!elem.is_object()) continue;
+
+        auto obj_result = elem.get_object();
+        if (obj_result.error()) continue;
+        auto obj = obj_result.value_unsafe();
 
-        yyjson_obj_iter it;
-        yyjson_obj_iter_init(root, &it);
-        yyjson_val* key;
-        while ((key = yyjson_obj_iter_next(&it))) {
-            yyjson_val* val = yyjson_obj_iter_get_val(key);
-            std::string_view key_sv(yyjson_get_str(key), yyjson_get_len(key));
+        for (auto field : obj) {
+            std::string_view key_sv = field.key;
+            auto val = field.value;
 
-            if (yyjson_is_int(val)) {
+            if (val.is_int64()) {
                 auto ci = builder.add_or_get_column(key_sv, ColumnType::INT64);
-                builder.append_int64(ci, yyjson_get_sint(val));
-            } else if (yyjson_is_uint(val)) {
+                builder.append_int64(ci, val.get_int64().value_unsafe());
+            } else if (val.is_uint64()) {
                 auto ci = builder.add_or_get_column(key_sv, ColumnType::UINT64);
-                builder.append_uint64(ci, yyjson_get_uint(val));
-            } else if (yyjson_is_real(val)) {
+                builder.append_uint64(ci, val.get_uint64().value_unsafe());
+            } else if (val.is_double()) {
                 auto ci = builder.add_or_get_column(key_sv, ColumnType::DOUBLE);
-                builder.append_double(ci, yyjson_get_real(val));
-            } else if (yyjson_is_bool(val)) {
+                builder.append_double(ci, val.get_double().value_unsafe());
+            } else if (val.is_bool()) {
                 auto ci = builder.add_or_get_column(key_sv, ColumnType::BOOL);
-                builder.append_bool(ci, yyjson_get_bool(val));
-            } else if (yyjson_is_str(val)) {
+                builder.append_bool(ci, val.get_bool().value_unsafe());
+            } else if (val.is_string()) {
                 auto ci = builder.add_or_get_column(key_sv, ColumnType::STRING);
-                builder.append_string(
-                    ci,
-                    std::string_view(yyjson_get_str(val), yyjson_get_len(val)));
-            } else if (yyjson_is_null(val)) {
-                // Only append null to an existing column; skip if new —
-                // we don't know the type yet and STRING would corrupt later
-                // typed appends.
+                builder.append_string(ci, val.get_string().value_unsafe());
+            } else if (val.is_null()) {
                 auto existing = builder.find_column(key_sv);
                 if (existing) builder.append_null(*existing);
             } else {
                 auto ci = builder.add_or_get_column(key_sv, ColumnType::STRING);
-                std::size_t jlen;
-                char* js = yyjson_val_write(val, 0, &jlen);
-                if (js) {
-                    held_serialized.emplace_back(js, jlen);
-                    free(js);
-                    builder.append_string(ci, held_serialized.back());
-                } else {
-                    builder.append_null(ci);
-                }
+                held_serialized.push_back(simdjson::minify(val));
+                builder.append_string(ci, held_serialized.back());
             }
         }
         builder.end_row();
     }
 
-    auto result = builder.finish();
-    for (auto* d : held_docs) yyjson_doc_free(d);
-    return result;
+    return builder.finish();
 }
 
 }  // namespace dftracer::utils::utilities::composites::dft::views
diff --git a/src/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp
new file mode 100644
index 00000000..b42975e6
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.cpp
@@ -0,0 +1,652 @@
+#include <dftracer/utils/utilities/composites/dft/args_map.h>
+#include <dftracer/utils/utilities/composites/dft/event.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h>
+#include <dftracer/utils/utilities/hash/fnv1a_hasher_utility.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
+
+#include <charconv>
+#include <cstring>
+#include <string>
+
+using dftracer::utils::utilities::composites::dft::indexing::BloomFilter;
+namespace dftracer::utils::utilities::composites::dft::visitors {
+
+namespace {
+
+constexpr std::string_view DIM_NAME = "name";
+constexpr std::string_view DIM_CAT = "cat";
+constexpr std::string_view DIM_PID = "pid";
+constexpr std::string_view DIM_TID = "tid";
+constexpr std::string_view DIM_PID_TID = "pid_tid";
+constexpr std::string_view DIM_HHASH = "hhash";
+constexpr std::string_view DIM_FHASH = "fhash";
+constexpr std::string_view DIM_SHASH = "shash";
+constexpr std::string_view DIM_TS = "ts";
+constexpr std::string_view DIM_DUR = "dur";
+// Slot b names fixed_blooms[b]; order must stay in sync with the BF_* ids.
+constexpr std::array<std::string_view, BloomVisitor::BF_COUNT>
+    FIXED_BLOOM_NAMES = {DIM_NAME,  DIM_CAT,   DIM_PID,  DIM_TID,
+                         DIM_HHASH, DIM_FHASH, DIM_SHASH};
+// Slot d names fixed_dim_stats[d]; order must stay in sync with FD_* ids.
+constexpr std::array<std::string_view, BloomVisitor::FD_COUNT> FIXED_DIM_NAMES =
+    {DIM_NAME,  DIM_CAT,   DIM_PID,   DIM_TID, DIM_PID_TID,
+     DIM_HHASH, DIM_FHASH, DIM_SHASH, DIM_TS,  DIM_DUR};
+
+// Returns the FIXED_BLOOM_NAMES slot holding `name`, or -1 if absent.
+int fixed_bloom_index(std::string_view name) {
+    std::size_t i = 0;
+    while (i < FIXED_BLOOM_NAMES.size() && FIXED_BLOOM_NAMES[i] != name) ++i;
+    return i < FIXED_BLOOM_NAMES.size() ? static_cast<int>(i) : -1;
+}
+
+// String stored under `key`, or an empty view if absent or not a string.
+inline std::string_view dom_string(simdjson::dom::element obj,
+                                   std::string_view key) {
+    auto lookup = obj[key];
+    const bool usable = !lookup.error() && lookup.value_unsafe().is_string();
+    if (!usable) return {};
+    return lookup.value_unsafe().get_string().value_unsafe();
+}
+
+// Renders a scalar DOM value as text into `out`, which is cleared first.
+//
+// Strings are copied unchanged and count as success only when non-empty.
+// Unsigned, signed and floating-point numbers are formatted with
+// std::to_chars, booleans become "true" / "false". For any other kind of
+// value the function returns false and leaves `out` empty.
+bool dom_value_to_string(simdjson::dom::element val, std::string& out) {
+    out.clear();
+    if (val.is_string()) {
+        const std::string_view text = val.get_string().value_unsafe();
+        out.assign(text.data(), text.size());
+        return !out.empty();
+    }
+
+    // One formatter for all numeric kinds; to_chars picks the overload.
+    const auto write_number = [&out](auto number) {
+        char buf[32];
+        const auto res = std::to_chars(buf, buf + sizeof(buf), number);
+        out.assign(buf, res.ptr);
+        return true;
+    };
+    // Checked in this order: unsigned, then signed, then double.
+    if (val.is_uint64()) return write_number(val.get_uint64().value_unsafe());
+    if (val.is_int64()) return write_number(val.get_int64().value_unsafe());
+    if (val.is_double()) return write_number(val.get_double().value_unsafe());
+
+    if (val.is_bool()) {
+        out = val.get_bool().value_unsafe() ? "true" : "false";
+        return true;
+    }
+    return false;
+}
+
+/// Emit bloom/stats/dimension records to a sink that might be either a
+/// RocksDB-backed writer or an SST file emitter. Returns the accumulated
+/// file-level statistics so downstream callers can use them for name
+/// postings and root-summary refresh on the concrete writer.
+///
+/// @param db               Sink receiving every record.
+/// @param file_id          File the records are keyed under.
+/// @param extra_dim_names  Non-fixed dimensions, parallel to each chunk's
+///                         extra_blooms.
+/// @param chunks           Per-checkpoint state; the vector position is used
+///                         as the checkpoint index.
+/// @param config           Bloom sizing parameters and value_counts_cap.
+/// @return Statistics merged across all of `chunks`.
+BloomVisitor::ChunkStatistics persist_bloom_sink_writes(
+    indexer::IndexBatchSink& db, int file_id,
+    const std::vector<std::string>& extra_dim_names,
+    const std::vector<BloomVisitor::ChunkState>& chunks,
+    const BloomVisitor::ChunkIndexerConfig& config) {
+    BloomVisitor::ChunkStatistics file_statistics;
+
+    // Accumulate file-level blooms per slot. Every slot is sized in a loop
+    // (as ensure_chunk does) so the count always follows BF_COUNT.
+    std::array<BloomFilter, BloomVisitor::BF_COUNT> file_fixed_blooms;
+    for (auto& bf : file_fixed_blooms) {
+        bf = BloomFilter(config.expected_entries_per_chunk,
+                         config.false_positive_rate);
+    }
+    std::vector<BloomFilter> file_extra_blooms;
+    file_extra_blooms.reserve(extra_dim_names.size());
+    for (std::size_t i = 0; i < extra_dim_names.size(); ++i) {
+        file_extra_blooms.emplace_back(config.expected_entries_per_chunk,
+                                       config.false_positive_rate);
+    }
+
+    // Scratch buffer reused for every serialized filter.
+    std::vector<unsigned char> blob;
+
+    for (std::size_t i = 0; i < chunks.size(); ++i) {
+        const auto& chunk = chunks[i];
+        auto checkpoint_idx = static_cast<std::uint64_t>(i);
+
+        // Fixed blooms
+        for (std::size_t b = 0; b < BloomVisitor::BF_COUNT; ++b) {
+            const BloomFilter& bf = chunk.fixed_blooms[b];
+            bf.serialize_into(blob);
+            db.insert_chunk_bloom_filter(
+                file_id, checkpoint_idx, std::string(FIXED_BLOOM_NAMES[b]),
+                std::span<const unsigned char>(blob.data(), blob.size()),
+                static_cast<std::uint64_t>(bf.num_entries()));
+            file_fixed_blooms[b].merge_from(bf);
+        }
+        // Extra blooms
+        for (std::size_t e = 0;
+             e < extra_dim_names.size() && e < chunk.extra_blooms.size(); ++e) {
+            const BloomFilter& bf = chunk.extra_blooms[e];
+            bf.serialize_into(blob);
+            db.insert_chunk_bloom_filter(
+                file_id, checkpoint_idx, extra_dim_names[e],
+                std::span<const unsigned char>(blob.data(), blob.size()),
+                static_cast<std::uint64_t>(bf.num_entries()));
+            file_extra_blooms[e].merge_from(bf);
+        }
+
+        db.insert_chunk_statistics(file_id, checkpoint_idx, chunk.statistics);
+        file_statistics.merge_from(chunk.statistics);
+
+        // Fixed dim_stats
+        for (std::size_t d = 0; d < BloomVisitor::FD_COUNT; ++d) {
+            db.insert_chunk_dimension_stats(file_id, checkpoint_idx,
+                                            chunk.fixed_dim_stats[d],
+                                            config.value_counts_cap);
+        }
+        // Extra dim_stats
+        for (const auto& ds : chunk.extra_dim_stats) {
+            db.insert_chunk_dimension_stats(file_id, checkpoint_idx, ds,
+                                            config.value_counts_cap);
+        }
+    }
+
+    // File-level blooms
+    for (std::size_t b = 0; b < BloomVisitor::BF_COUNT; ++b) {
+        const BloomFilter& bf = file_fixed_blooms[b];
+        bf.serialize_into(blob);
+        db.insert_file_bloom_filter(
+            file_id, std::string(FIXED_BLOOM_NAMES[b]),
+            std::span<const unsigned char>(blob.data(), blob.size()),
+            static_cast<std::uint64_t>(bf.num_entries()));
+    }
+    for (std::size_t e = 0; e < extra_dim_names.size(); ++e) {
+        const BloomFilter& bf = file_extra_blooms[e];
+        bf.serialize_into(blob);
+        db.insert_file_bloom_filter(
+            file_id, extra_dim_names[e],
+            std::span<const unsigned char>(blob.data(), blob.size()),
+            static_cast<std::uint64_t>(bf.num_entries()));
+    }
+
+    for (std::size_t b = 0; b < BloomVisitor::BF_COUNT; ++b) {
+        db.insert_index_dimension(file_id, std::string(FIXED_BLOOM_NAMES[b]));
+    }
+    for (const auto& dim : extra_dim_names) {
+        db.insert_index_dimension(file_id, dim);
+    }
+    db.insert_index_dimension(file_id, std::string(DIM_TS));
+    db.insert_index_dimension(file_id, std::string(DIM_DUR));
+
+    db.insert_file_scalar_stats(file_id, file_statistics, chunks.size());
+    db.insert_file_category_counts(file_id, file_statistics.category_counts);
+    db.insert_file_name_counts(file_id, file_statistics.name_counts);
+    db.insert_file_pid_tid_counts(file_id, file_statistics.pid_tid_counts);
+
+    // Name dictionary + postings. name_id is a pure FNV1a hash of the name
+    // so this is safe on any sink backend (RocksDB or SST). The dictionary
+    // entries are idempotent; duplicate inserts across workers are folded
+    // together at ingest time via `ingest_behind=true`.
+    std::unordered_map<std::string, std::uint64_t> file_name_ids;
+    file_name_ids.reserve(file_statistics.name_counts.size());
+    for (const auto& [name, _] : file_statistics.name_counts) {
+        const auto name_id = hash::fnv1a_hash(name);
+        file_name_ids.emplace(name, name_id);
+        db.insert_name_dictionary_entry(name_id, name);
+        db.insert_name_file_posting(name_id, file_id);
+    }
+
+    for (std::size_t i = 0; i < chunks.size(); ++i) {
+        const auto checkpoint_idx = static_cast<std::uint64_t>(i);
+        const auto& chunk = chunks[i];
+        for (const auto& [name, _] : chunk.statistics.name_counts) {
+            auto name_id_it = file_name_ids.find(name);
+            if (name_id_it != file_name_ids.end()) {
+                db.insert_name_chunk_posting(name_id_it->second, file_id,
+                                             checkpoint_idx);
+            }
+        }
+    }
+
+    return file_statistics;
+}
+
+/// Concrete-only tail: root-summary refresh. Needs a read-through
+/// (`has_file_scalar_stats`) and ROOT_* writes the SST path does not cover.
+void persist_bloom_concrete_tail(
+    indexer::IndexDatabaseWriterContext& db, int file_id,
+    const BloomVisitor::ChunkStatistics& file_statistics,
+    std::size_t num_chunks, bool refresh_root_summaries) {
+    if (refresh_root_summaries) {
+        const bool already_summarized = db.has_file_scalar_stats(file_id);
+        db.refresh_root_summaries_after_file_write(
+            file_id, file_statistics, num_chunks, already_summarized);
+    }
+}
+
+}  // namespace
+
+BloomVisitor::ChunkState::ChunkState() = default;  // defined out of line
+
+// Builds the visitor and derives extra_dim_names_, i.e. every requested
+// dimension that does not map to a fixed bloom slot. Names can come from
+// `dimensions` and from config.extra_dimensions.
+BloomVisitor::BloomVisitor(ChunkIndexerConfig config,
+                           std::vector<std::string> dimensions) {
+    config_ = std::move(config);
+
+    // `dimensions` historically mixes fixed and extra names; only the
+    // extras are kept, in the order given.
+    for (auto& candidate : dimensions) {
+        if (fixed_bloom_index(candidate) >= 0) continue;
+        extra_dim_names_.push_back(std::move(candidate));
+    }
+
+    // Backwards compatibility: callers may instead put their extras in
+    // config_.extra_dimensions. Append those not collected above.
+    for (const auto& candidate : config_.extra_dimensions) {
+        bool known = false;
+        for (std::size_t i = 0; !known && i < extra_dim_names_.size(); ++i) {
+            known = (extra_dim_names_[i] == candidate);
+        }
+        if (known || fixed_bloom_index(candidate) >= 0) continue;
+        extra_dim_names_.push_back(candidate);
+    }
+}
+
+void BloomVisitor::begin(std::size_t /*num_checkpoints*/) {
+    file_acc_.initialized = false;  // file accumulator state first
+    file_acc_.num_chunks_emitted = 0;
+    file_acc_.statistics = ChunkStatistics{};
+    file_acc_.extra_blooms.clear();
+    chunks_base_idx_ = 0;  // then the per-chunk state
+    chunks_.clear();
+}
+
+void BloomVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {}  // no-op
+
+// Ensures a chunk exists for `checkpoint_idx`, initializing any new ones.
+void BloomVisitor::ensure_chunk(std::size_t checkpoint_idx) {
+    if (checkpoint_idx < chunks_base_idx_) return;
+    const std::size_t slot = checkpoint_idx - chunks_base_idx_;
+    const std::size_t first_new = chunks_.size();
+    if (slot < first_new) return;
+    const auto is_uint_dim = [](std::size_t d) {
+        return d == FD_PID || d == FD_TID || d == FD_TS || d == FD_DUR;
+    };
+    const std::size_t num_extras = extra_dim_names_.size();
+    chunks_.resize(slot + 1);
+    for (std::size_t i = first_new; i <= slot; ++i) {
+        ChunkState& fresh = chunks_[i];
+        for (std::size_t b = 0; b < BF_COUNT; ++b) {
+            fresh.fixed_blooms[b] =
+                BloomFilter(config_.expected_entries_per_chunk,
+                            config_.false_positive_rate);
+        }
+        for (std::size_t d = 0; d < FD_COUNT; ++d) {
+            auto& stats = fresh.fixed_dim_stats[d];
+            stats.dimension = std::string(FIXED_DIM_NAMES[d]);
+            stats.value_type = is_uint_dim(d) ? "uint" : "string";
+        }
+        fresh.extra_blooms.clear();
+        fresh.extra_dim_stats.clear();
+        fresh.extra_blooms.reserve(num_extras);
+        fresh.extra_dim_stats.resize(num_extras);
+        for (std::size_t e = 0; e < num_extras; ++e) {
+            fresh.extra_blooms.emplace_back(config_.expected_entries_per_chunk,
+                                            config_.false_positive_rate);
+            auto& stats = fresh.extra_dim_stats[e];
+            stats.dimension = extra_dim_names_[e];
+            stats.value_type = "string";
+        }
+    }
+}
+
+// Folds one event into its chunk: hash-resolution metadata or statistics.
+void BloomVisitor::on_event(const EventRecord& record) {
+    if (record.checkpoint_idx < chunks_base_idx_) return;
+    ensure_chunk(record.checkpoint_idx);
+
+    const auto& ev = record.ev;
+    ChunkState& chunk = chunks_[record.checkpoint_idx - chunks_base_idx_];
+
+    if (ev.is_metadata()) {
+        if (record.has_args) {
+            std::string_view hash_val = dom_string(record.args_dom, "value");
+            std::string_view resolved = dom_string(record.args_dom, "name");
+
+            if (!hash_val.empty() && !resolved.empty()) {
+                std::string_view dim;
+                if (ev.name == "HH") {
+                    dim = DIM_HHASH;
+                } else if (ev.name == "FH") {
+                    dim = DIM_FHASH;
+                } else if (ev.name == "SH") {
+                    dim = DIM_SHASH;
+                }
+                if (!dim.empty()) {
+                    // Outer StringViewMap: transparent find, emplace on miss.
+                    auto outer_it = chunk.hash_resolutions.find(dim);
+                    if (outer_it == chunk.hash_resolutions.end()) {
+                        outer_it = chunk.hash_resolutions
+                                       .emplace(std::string(dim),
+                                                StringViewMap<std::string>{})
+                                       .first;
+                    }
+                    auto& inner = outer_it->second;
+                    // Inner StringViewMap: find + emplace/update.
+                    auto inner_it = inner.find(hash_val);
+                    if (inner_it == inner.end()) {
+                        inner.emplace(std::string(hash_val),
+                                      std::string(resolved));
+                    } else {
+                        inner_it->second.assign(resolved.data(),
+                                                resolved.size());
+                    }
+                }
+            }
+        }
+    } else {
+        chunk.statistics.update_from_event(ev.name, ev.cat, ev.pid, ev.tid,
+                                           ev.ts, ev.dur);
+
+        // Adds val to the slot's bloom (when bloom_idx >= 0) and dim_stats.
+        auto observe_fixed = [&chunk](int bloom_idx, std::size_t dim_idx,
+                                      std::string_view val) {
+            if (val.empty()) return;
+            if (bloom_idx >= 0) {
+                chunk.fixed_blooms[bloom_idx].add(val);
+            }
+            chunk.fixed_dim_stats[dim_idx].observe(val);
+        };
+
+        observe_fixed(BF_NAME, FD_NAME, ev.name);
+        observe_fixed(BF_CAT, FD_CAT, ev.cat);
+        // Decimal pid/tid text is cached and redone only when the id changes.
+        if (ev.pid != last_pid_ || last_pid_len_ == 0) {
+            auto [pp, _1] = std::to_chars(
+                last_pid_buf_, last_pid_buf_ + sizeof(last_pid_buf_), ev.pid);
+            last_pid_len_ = static_cast<std::uint8_t>(pp - last_pid_buf_);
+            last_pid_ = ev.pid;
+        }
+        if (ev.tid != last_tid_ || last_tid_len_ == 0) {
+            auto [tp, _2] = std::to_chars(
+                last_tid_buf_, last_tid_buf_ + sizeof(last_tid_buf_), ev.tid);
+            last_tid_len_ = static_cast<std::uint8_t>(tp - last_tid_buf_);
+            last_tid_ = ev.tid;
+        }
+        std::string_view pid_sv(last_pid_buf_, last_pid_len_);
+        std::string_view tid_sv(last_tid_buf_, last_tid_len_);
+
+        observe_fixed(BF_PID, FD_PID, pid_sv);
+        observe_fixed(BF_TID, FD_TID, tid_sv);
+        // Combined "<pid>:<tid>" key built from the two cached texts.
+        char pt_buf[52];
+        std::memcpy(pt_buf, last_pid_buf_, last_pid_len_);
+        pt_buf[last_pid_len_] = ':';
+        std::memcpy(pt_buf + last_pid_len_ + 1, last_tid_buf_, last_tid_len_);
+        std::string_view pt_sv(pt_buf, last_pid_len_ + 1 + last_tid_len_);
+        // pid_tid has no bloom slot — only dim_stats.
+        observe_fixed(-1, FD_PID_TID, pt_sv);
+
+        chunk.fixed_dim_stats[FD_TS].observe_range_only(ev.ts);
+        chunk.fixed_dim_stats[FD_DUR].observe_range_only(ev.dur);
+
+        if (record.has_args) {
+            std::string_view hhash = dom_string(record.args_dom, "hhash");
+            observe_fixed(BF_HHASH, FD_HHASH, hhash);
+
+            std::string_view fhash = dom_string(record.args_dom, "fhash");
+            observe_fixed(BF_FHASH, FD_FHASH, fhash);
+
+            std::string_view shash = dom_string(record.args_dom, "cmd_hash");
+            if (shash.empty()) {
+                shash = dom_string(record.args_dom, "exec_hash");
+            }
+            observe_fixed(BF_SHASH, FD_SHASH, shash);
+            // Extra dimensions: scalar args values, rendered as text.
+            std::string scratch;
+            for (std::size_t e = 0; e < extra_dim_names_.size(); ++e) {
+                auto r = record.args_dom[extra_dim_names_[e]];
+                if (r.error()) continue;
+                if (dom_value_to_string(r.value_unsafe(), scratch) &&
+                    !scratch.empty()) {
+                    chunk.extra_blooms[e].add(scratch);
+                    chunk.extra_dim_stats[e].observe(scratch);
+                }
+            }
+        }
+
+        chunk.events_processed++;  // non-metadata events only
+    }
+}
+
+// New visitor with the same config and dimension set, for a parallel slice.
+std::unique_ptr<DftEventVisitor> BloomVisitor::create_parallel_slice() const {
+    std::vector<std::string> dims(FIXED_BLOOM_NAMES.begin(),
+                                  FIXED_BLOOM_NAMES.end());
+    dims.insert(dims.end(), extra_dim_names_.begin(), extra_dim_names_.end());
+    return std::make_unique<BloomVisitor>(config_, std::move(dims));
+}
+
+// Folds the chunk state collected by a parallel slice into this visitor.
+//
+// Only slice chunks at or after chunks_base_idx_ that processed at least one
+// event are merged. Blooms, dimension stats, chunk statistics and hash
+// resolutions are combined; a slice that is not a BloomVisitor is ignored.
+void BloomVisitor::merge_parallel_slice(DftEventVisitor& slice_base) {
+    auto* slice = dynamic_cast<BloomVisitor*>(&slice_base);
+    if (slice == nullptr) return;
+
+    // Merges one dimension's value counts and min/max bounds. The same rule
+    // applies to the fixed and the extra dimension lists.
+    const auto merge_dim_stats = [](const auto& from, auto& into) {
+        if (from.value_counts) {
+            if (!into.value_counts) into.value_counts.emplace();
+            for (const auto& [value, count] : *from.value_counts) {
+                (*into.value_counts)[value] += count;
+            }
+            into.distinct_count = into.value_counts->size();
+        }
+        // An empty destination minimum always takes the source value.
+        const bool lower_min =
+            !from.min_value.empty() && from.min_value < into.min_value;
+        if (into.min_value.empty() || lower_min) {
+            into.min_value = from.min_value;
+        }
+        if (from.max_value > into.max_value) {
+            into.max_value = from.max_value;
+        }
+    };
+
+    for (std::size_t idx = chunks_base_idx_; idx < slice->chunks_.size();
+         ++idx) {
+        auto& src = slice->chunks_[idx];
+        if (src.events_processed == 0) continue;
+        ensure_chunk(idx);
+        auto& dst = chunks_[idx - chunks_base_idx_];
+
+        // Bloom filters, slot by slot.
+        for (std::size_t b = 0; b < BF_COUNT; ++b) {
+            dst.fixed_blooms[b].merge_from(src.fixed_blooms[b]);
+        }
+        for (std::size_t e = 0;
+             e < src.extra_blooms.size() && e < dst.extra_blooms.size(); ++e) {
+            dst.extra_blooms[e].merge_from(src.extra_blooms[e]);
+        }
+
+        // Dimension statistics, fixed first and then extras.
+        for (std::size_t d = 0; d < FD_COUNT; ++d) {
+            merge_dim_stats(src.fixed_dim_stats[d], dst.fixed_dim_stats[d]);
+        }
+        for (std::size_t e = 0;
+             e < src.extra_dim_stats.size() && e < dst.extra_dim_stats.size();
+             ++e) {
+            merge_dim_stats(src.extra_dim_stats[e], dst.extra_dim_stats[e]);
+        }
+
+        dst.statistics.merge_from(src.statistics);
+
+        // Hash resolutions: a dimension missing here is taken over whole;
+        // otherwise only keys not yet present are added, so entries already
+        // held by this visitor are kept.
+        for (auto& [dim, table] : src.hash_resolutions) {
+            auto found = dst.hash_resolutions.find(dim);
+            if (found == dst.hash_resolutions.end()) {
+                dst.hash_resolutions.emplace(dim, std::move(table));
+                continue;
+            }
+            for (auto& [key, resolved] : table) {
+                found->second.try_emplace(key, std::move(resolved));
+            }
+        }
+
+        dst.events_processed += src.events_processed;
+    }
+}
+
+// Persists the collected chunk data for file_id via the two persist_bloom_* helpers.
+void BloomVisitor::finalize(indexer::IndexDatabaseWriterContext& db, int file_id) {
+    auto per_file_stats =
+        persist_bloom_sink_writes(db, file_id, extra_dim_names_, chunks_, config_);
+    persist_bloom_concrete_tail(db, file_id, per_file_stats, chunks_.size(),
+                                /*refresh_root_summaries=*/true);
+}
+
+void BloomVisitor::finalize_sink_only(indexer::IndexBatchSink& sink,
+                                      int file_id) {
+    flush_per_checkpoint_to_sink(sink, file_id);  // emit rows for the buffered chunks and clear chunks_
+    finalize_file_to_sink(sink, file_id);  // flushes again (chunks_ is empty by now), then writes file-level rows
+}
+
+// Writes the sink rows for every buffered chunk (bloom filters, statistics,
+// dimension stats, name postings), merges each chunk into file_acc_, then
+// advances chunks_base_idx_ past the flushed chunks and empties chunks_.
+void BloomVisitor::flush_per_checkpoint_to_sink(indexer::IndexBatchSink& sink,
+                                                int file_id) {
+    if (chunks_.empty()) return;
+
+    // First non-empty flush: build the file-level filters from the config.
+    if (!file_acc_.initialized) {
+        for (std::size_t b = 0; b < BF_COUNT; ++b) {
+            file_acc_.fixed_blooms[b] = BloomFilter(
+                config_.expected_entries_per_chunk, config_.false_positive_rate);
+        }
+        const std::size_t extra_count = extra_dim_names_.size();
+        file_acc_.extra_blooms.reserve(extra_count);
+        for (std::size_t e = 0; e < extra_count; ++e) {
+            file_acc_.extra_blooms.emplace_back(config_.expected_entries_per_chunk,
+                                                config_.false_positive_rate);
+        }
+        file_acc_.initialized = true;
+    }
+
+    std::vector<unsigned char> scratch;  // serialization buffer reused per filter
+    for (std::size_t pos = 0; pos < chunks_.size(); ++pos) {
+        const auto& current = chunks_[pos];
+        const auto ckpt = static_cast<std::uint64_t>(chunks_base_idx_ + pos);
+
+        // Fixed dimensions: one chunk-level row each, folded into file_acc_.
+        for (std::size_t b = 0; b < BF_COUNT; ++b) {
+            const BloomFilter& filter = current.fixed_blooms[b];
+            filter.serialize_into(scratch);
+            const std::span<const unsigned char> bytes(scratch.data(), scratch.size());
+            sink.insert_chunk_bloom_filter(
+                file_id, ckpt, std::string(FIXED_BLOOM_NAMES[b]), bytes,
+                static_cast<std::uint64_t>(filter.num_entries()));
+            file_acc_.fixed_blooms[b].merge_from(filter);
+        }
+        for (std::size_t e = 0;
+             e < extra_dim_names_.size() && e < current.extra_blooms.size(); ++e) {
+            const BloomFilter& filter = current.extra_blooms[e];
+            filter.serialize_into(scratch);
+            const std::span<const unsigned char> bytes(scratch.data(), scratch.size());
+            sink.insert_chunk_bloom_filter(
+                file_id, ckpt, extra_dim_names_[e], bytes,
+                static_cast<std::uint64_t>(filter.num_entries()));
+            file_acc_.extra_blooms[e].merge_from(filter);
+        }
+
+        sink.insert_chunk_statistics(file_id, ckpt, current.statistics);
+        file_acc_.statistics.merge_from(current.statistics);
+
+        for (std::size_t d = 0; d < FD_COUNT; ++d) {
+            sink.insert_chunk_dimension_stats(
+                file_id, ckpt, current.fixed_dim_stats[d], config_.value_counts_cap);
+        }
+        for (const auto& extra_stats : current.extra_dim_stats) {
+            sink.insert_chunk_dimension_stats(file_id, ckpt, extra_stats,
+                                              config_.value_counts_cap);
+        }
+
+        // One posting per key of the chunk's name_counts, keyed by its hash.
+        for (const auto& [event_name, unused] : current.statistics.name_counts) {
+            sink.insert_name_chunk_posting(hash::fnv1a_hash(event_name), file_id, ckpt);
+        }
+    }
+    file_acc_.num_chunks_emitted += chunks_.size();
+    chunks_base_idx_ += chunks_.size();
+    chunks_.clear();
+}
+
+// Flushes any buffered chunks, then writes the file-level rows for file_id:
+// accumulated bloom filters, indexed dimension names, statistics and the name
+// dictionary/postings. Returns early when file_acc_ is uninitialised and empty.
+void BloomVisitor::finalize_file_to_sink(indexer::IndexBatchSink& sink,
+                                         int file_id) {
+    if (!file_acc_.initialized && chunks_.empty()) return;
+    flush_per_checkpoint_to_sink(sink, file_id);
+
+    std::vector<unsigned char> scratch;
+    for (std::size_t b = 0; b < BF_COUNT; ++b) {
+        const BloomFilter& filter = file_acc_.fixed_blooms[b];
+        filter.serialize_into(scratch);
+        const std::span<const unsigned char> bytes(scratch.data(), scratch.size());
+        sink.insert_file_bloom_filter(
+            file_id, std::string(FIXED_BLOOM_NAMES[b]), bytes,
+            static_cast<std::uint64_t>(filter.num_entries()));
+    }
+    for (std::size_t e = 0; e < extra_dim_names_.size(); ++e) {
+        const BloomFilter& filter = file_acc_.extra_blooms[e];
+        filter.serialize_into(scratch);
+        const std::span<const unsigned char> bytes(scratch.data(), scratch.size());
+        sink.insert_file_bloom_filter(
+            file_id, extra_dim_names_[e], bytes,
+            static_cast<std::uint64_t>(filter.num_entries()));
+    }
+
+    // Dimension registry: fixed names, extra names, then DIM_TS and DIM_DUR.
+    for (std::size_t b = 0; b < BF_COUNT; ++b) {
+        sink.insert_index_dimension(file_id, std::string(FIXED_BLOOM_NAMES[b]));
+    }
+    for (const auto& extra_name : extra_dim_names_) {
+        sink.insert_index_dimension(file_id, extra_name);
+    }
+    sink.insert_index_dimension(file_id, std::string(DIM_TS));
+    sink.insert_index_dimension(file_id, std::string(DIM_DUR));
+    auto& totals = file_acc_.statistics;
+    sink.insert_file_scalar_stats(file_id, totals, file_acc_.num_chunks_emitted);
+    sink.insert_file_category_counts(file_id, totals.category_counts);
+    sink.insert_file_name_counts(file_id, totals.name_counts);
+    sink.insert_file_pid_tid_counts(file_id, totals.pid_tid_counts);
+    for (const auto& [event_name, unused] : totals.name_counts) {
+        const auto name_hash = hash::fnv1a_hash(event_name);
+        sink.insert_name_dictionary_entry(name_hash, event_name);
+        sink.insert_name_file_posting(name_hash, file_id);
+    }
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::visitors
diff --git a/src/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp
new file mode 100644
index 00000000..e0a14297
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.cpp
@@ -0,0 +1,96 @@
+#include <dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+
+namespace dftracer::utils::utilities::composites::dft::visitors {
+
+void HashTableVisitor::begin(std::size_t /*num_checkpoints*/) {  // drops every entry collected so far
+    file_hashes_.clear();    // filled from "FH" metadata events
+    host_hashes_.clear();    // filled from "HH" metadata events
+    string_hashes_.clear();  // filled from "SH" metadata events
+    proc_metadata_.clear();  // filled from "PR" metadata events
+}
+
+void HashTableVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {}  // intentionally a no-op for this visitor
+
+// Records hash-table metadata events. Only metadata events that carry args
+// with non-empty string "name" and "value" fields are considered; the event
+// name selects the table ("FH", "HH", "SH" or "PR") and the first entry seen
+// for a given "value" is kept.
+void HashTableVisitor::on_event(const EventRecord& record) {
+    const auto& ev = record.ev;
+    if (!ev.is_metadata() || !record.has_args) return;
+
+    // Returns the string stored under `key`, or an empty view when the key is
+    // missing or its value is not a string.
+    const auto string_field = [](simdjson::dom::element obj,
+                                 std::string_view key) -> std::string_view {
+        auto lookup = obj[key];
+        if (lookup.error()) return {};
+        auto field = lookup.value_unsafe();
+        return field.is_string() ? field.get_string().value_unsafe()
+                                 : std::string_view{};
+    };
+
+    const std::string_view resolved = string_field(record.args_dom, "name");
+    const std::string_view hashed = string_field(record.args_dom, "value");
+    if (resolved.empty() || hashed.empty()) return;
+
+    // Pick the destination table from the event name; other names are ignored.
+    std::unordered_map<std::string, std::string>* table = nullptr;
+    if (ev.name == "FH") {
+        table = &file_hashes_;
+    } else if (ev.name == "HH") {
+        table = &host_hashes_;
+    } else if (ev.name == "SH") {
+        table = &string_hashes_;
+    } else if (ev.name == "PR") {
+        table = &proc_metadata_;
+    }
+    if (table == nullptr) return;
+    table->try_emplace(std::string(hashed), std::string(resolved));
+}
+
+std::unique_ptr<DftEventVisitor> HashTableVisitor::create_parallel_slice()
+    const {
+    return std::make_unique<HashTableVisitor>();  // default-constructed; no entries are copied from this visitor
+}
+
+// Absorbs the four hash tables of a parallel slice into this visitor. Keys
+// already present here keep their current value; a slice_base that is not a
+// HashTableVisitor is ignored.
+void HashTableVisitor::merge_parallel_slice(DftEventVisitor& slice_base) {
+    auto* slice = dynamic_cast<HashTableVisitor*>(&slice_base);
+    if (!slice) return;
+    // unordered_map::merge splices the slice's nodes across without copying
+    // and leaves colliding keys behind in the slice. Moving out of the map's
+    // const keys through const_cast instead would be undefined behaviour and
+    // would leave the slice's tables with keys that no longer match buckets.
+    file_hashes_.merge(slice->file_hashes_);
+    host_hashes_.merge(slice->host_hashes_);
+    string_hashes_.merge(slice->string_hashes_);
+    proc_metadata_.merge(slice->proc_metadata_);
+}
+
+// Writes every collected entry to the sink as (type tag, hash, name), table
+// by table in the order FILE, HOST, STRING, PROC. file_id is unused.
+void HashTableVisitor::finalize(indexer::IndexBatchSink& writer, int /*file_id*/) {
+    const auto emit_table =
+        [&writer](HashType type,
+                  const std::unordered_map<std::string, std::string>& table) {
+            for (const auto& entry : table) {
+                writer.insert_hash_table_entry(static_cast<std::uint8_t>(type),
+                                               entry.first, entry.second);
+            }
+        };
+    emit_table(HashType::FILE, file_hashes_);
+    emit_table(HashType::HOST, host_hashes_);
+    emit_table(HashType::STRING, string_hashes_);
+    emit_table(HashType::PROC, proc_metadata_);
+}
+
+std::size_t HashTableVisitor::num_entries() const {
+    const std::size_t hashes = file_hashes_.size() + host_hashes_.size() + string_hashes_.size();
+    return hashes + proc_metadata_.size();  // FH + HH + SH tables, plus PR metadata
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::visitors
diff --git a/src/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp b/src/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp
new file mode 100644
index 00000000..1a92b3b9
--- /dev/null
+++ b/src/dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.cpp
@@ -0,0 +1,128 @@
+#include <dftracer/utils/utilities/composites/dft/event.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+
+namespace dftracer::utils::utilities::composites::dft::visitors {
+
+void ManifestVisitor::begin(std::size_t /*num_checkpoints*/) {  // resets all accumulated state
+    event_lines_.clear();     // per-chunk map: (cat, name) -> line numbers
+    metadata_lines_.clear();  // per-chunk map: metadata event name -> line numbers
+    observed_pids_.clear();   // pids of non-metadata events
+    event_count_ = 0;
+    line_offset_ = 0;  // added to a slice's line numbers in merge_parallel_slice()
+    base_idx_ = 0;  // checkpoint index of the first buffered chunk; advanced by flush_per_checkpoint_to_sink()
+}
+
+void ManifestVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {}  // no-op; on_event() creates chunks on demand via ensure_chunk()
+
+// Grows both per-chunk tables so they cover checkpoint_idx; indices below base_idx_ are ignored.
+void ManifestVisitor::ensure_chunk(std::size_t checkpoint_idx) {
+    if (checkpoint_idx < base_idx_) return;
+    const std::size_t needed = checkpoint_idx - base_idx_ + 1;
+    if (event_lines_.size() < needed) event_lines_.resize(needed);  // sized independently and never shrunk:
+    if (metadata_lines_.size() < needed) metadata_lines_.resize(needed);  // merge_parallel_slice() can leave the two tables unequal
+}
+
+// Records the line number of one event under its checkpoint's chunk: metadata
+// events by name, all other events by (cat, name) plus their pid. Events for
+// checkpoints below base_idx_ are dropped without being counted.
+void ManifestVisitor::on_event(const EventRecord& record) {
+    if (record.checkpoint_idx < base_idx_) return;
+    const auto line_no = static_cast<std::uint32_t>(record.line_number);
+    ensure_chunk(record.checkpoint_idx);
+    ++event_count_;
+
+    const auto slot = record.checkpoint_idx - base_idx_;
+    const auto& ev = record.ev;
+    if (!ev.is_metadata()) {
+        event_lines_[slot][{std::string(ev.cat), std::string(ev.name)}].push_back(line_no);
+        observed_pids_.insert(ev.pid);
+        return;
+    }
+    // Metadata events with an empty name are counted but not recorded.
+    std::string meta_name(ev.name);
+    if (!meta_name.empty()) metadata_lines_[slot][meta_name].push_back(line_no);
+}
+
+std::unique_ptr<DftEventVisitor> ManifestVisitor::create_parallel_slice()
+    const {
+    return std::make_unique<ManifestVisitor>();  // default-constructed; no state is copied from this visitor
+}
+
+// Merges the line tables, observed pids and event count of a parallel slice
+// into this visitor. Slice chunks are read from index base_idx_ upward and
+// stored at (index - base_idx_); every merged line number is shifted by the
+// slice's line_offset_. A slice_base that is not a ManifestVisitor is
+// ignored.
+void ManifestVisitor::merge_parallel_slice(DftEventVisitor& slice_base) {
+    auto* other = dynamic_cast<ManifestVisitor*>(&slice_base);
+    if (other == nullptr) return;
+    const auto shift = static_cast<std::uint32_t>(other->line_offset_);
+
+    // Appends the line lists of every non-empty chunk in `from` to the
+    // matching chunk of `into`, growing `into` when the slot does not exist
+    // yet. The same routine serves the event and the metadata table, which
+    // are grown independently of each other.
+    const auto absorb = [this, shift](auto& from, auto& into) {
+        for (std::size_t ci = base_idx_; ci < from.size(); ++ci) {
+            if (from[ci].empty()) continue;
+            const std::size_t slot = ci - base_idx_;
+            if (slot >= into.size()) into.resize(slot + 1);
+            for (auto& [key, lines] : from[ci]) {
+                auto& merged = into[slot][key];
+                merged.reserve(merged.size() + lines.size());
+                for (auto ln : lines) {
+                    merged.push_back(ln + shift);
+                }
+            }
+        }
+    };
+
+    // Event lines are merged first, then metadata lines.
+    absorb(other->event_lines_, event_lines_);
+    absorb(other->metadata_lines_, metadata_lines_);
+
+    // Carry over the slice-wide aggregates.
+    for (auto pid : other->observed_pids_) observed_pids_.insert(pid);
+    event_count_ += other->event_count_;
+}
+
+void ManifestVisitor::finalize(indexer::IndexBatchSink& db, int file_id) {
+    flush_per_checkpoint_to_sink(db, file_id);  // per-chunk rows; clears both line tables
+    finalize_file_to_sink(db, file_id);  // flushes again (tables are empty by now), then writes the observed pids
+}
+
+// Emits one sink row per non-empty line list in every buffered chunk, then
+// advances base_idx_ past those chunks and clears both line tables.
+void ManifestVisitor::flush_per_checkpoint_to_sink(
+    indexer::IndexBatchSink& sink, int file_id) {
+    // The two tables may differ in length; walk as far as the longer one.
+    const std::size_t chunk_count = std::max(event_lines_.size(), metadata_lines_.size());
+    for (std::size_t pos = 0; pos < chunk_count; ++pos) {
+        const auto ckpt = static_cast<std::uint64_t>(base_idx_ + pos);
+        if (pos < event_lines_.size()) {
+            for (auto& [key, lines] : event_lines_[pos]) {
+                if (!lines.empty()) sink.insert_event_range(file_id, ckpt, key.first, key.second, lines);
+            }
+        }
+        if (pos < metadata_lines_.size()) {
+            for (auto& [meta_type, lines] : metadata_lines_[pos]) {
+                if (!lines.empty()) sink.insert_metadata_lines(file_id, ckpt, meta_type, lines);
+            }
+        }
+    }
+    base_idx_ += chunk_count;
+    event_lines_.clear();
+    metadata_lines_.clear();
+}
+
+// Flushes any buffered chunks, then reports the observed pids (if any) for file_id.
+void ManifestVisitor::finalize_file_to_sink(indexer::IndexBatchSink& sink,
+                                            int file_id) {
+    flush_per_checkpoint_to_sink(sink, file_id);
+    if (observed_pids_.empty()) return;
+    sink.insert_file_pids(file_id, observed_pids_);
+}
+
+}  // namespace dftracer::utils::utilities::composites::dft::visitors
diff --git a/src/dftracer/utils/utilities/composites/streaming_file_merger_utility.cpp b/src/dftracer/utils/utilities/composites/streaming_file_merger_utility.cpp
index 042f9357..8b257ada 100644
--- a/src/dftracer/utils/utilities/composites/streaming_file_merger_utility.cpp
+++ b/src/dftracer/utils/utilities/composites/streaming_file_merger_utility.cpp
@@ -5,6 +5,7 @@
 #include <dftracer/utils/utilities/compression/zlib/streaming_compressor_utility.h>
 #include <dftracer/utils/utilities/fileio/lines/streaming_line_reader.h>
 #include <dftracer/utils/utilities/fileio/streaming_file_writer_utility.h>
+#include <dftracer/utils/utilities/hash/fnv1a_hasher_utility.h>
 
 #include <utility>
 
@@ -12,15 +13,7 @@ namespace dftracer::utils::utilities::composites {
 
 namespace {
 
-// FNV-1a hash for byte-level verification
-inline std::size_t fnv1a_line(const char* data, std::size_t len) {
-    std::size_t h = 14695981039346656037ULL;
-    for (std::size_t i = 0; i < len; ++i) {
-        h ^= static_cast<std::size_t>(static_cast<unsigned char>(data[i]));
-        h *= 1099511628211ULL;
-    }
-    return h;
-}
+namespace hash = dftracer::utils::utilities::hash;
 
 // Check if a line is an array delimiter ([ or ]) after trimming whitespace.
 inline bool is_array_delimiter(const char* data, std::size_t len) {
@@ -102,7 +95,7 @@ StreamingFileProducerUtility::process_async(
             ++result.events_sent;
 
             if (input.verify) {
-                batch_hash += fnv1a_line(trimmed, trimmed_length);
+                batch_hash += hash::fnv1a_hash(trimmed, trimmed_length);
             }
 
             if (local_buf.size() >= batch_budget) {
diff --git a/src/dftracer/utils/utilities/fileio/chunk_writer.cpp b/src/dftracer/utils/utilities/fileio/chunk_writer.cpp
index 5a0f8d83..4ed91d08 100644
--- a/src/dftracer/utils/utilities/fileio/chunk_writer.cpp
+++ b/src/dftracer/utils/utilities/fileio/chunk_writer.cpp
@@ -1,10 +1,14 @@
 #include <dftracer/utils/core/common/byte_view.h>
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/generator.h>
+#include <dftracer/utils/core/coro/yield.h>
 #include <dftracer/utils/core/io/io.h>
 #include <dftracer/utils/utilities/fileio/chunk_writer.h>
 #include <fcntl.h>
 
+#include <memory>
+
 namespace dftracer::utils::utilities::fileio {
 
 ChunkWriter::ChunkWriter(ChunkWriterConfig config)
@@ -77,6 +81,13 @@ coro::CoroTask<void> ChunkWriter::write_line(ByteView line) {
         chunk_index_++;
         co_await open_next_chunk();
     }
+
+    // Yield every 256 events to prevent stack overflow from synchronous
+    // coroutine completion chains. This is internal to ChunkWriter so
+    // callers don't need to manage yielding.
+    if ((total_events_ & 0xff) == 0) {
+        co_await coro::yield();
+    }
 }
 
 coro::CoroTask<void> ChunkWriter::write_bytes(ByteView data) {
@@ -93,11 +104,17 @@ coro::CoroTask<void> ChunkWriter::flush_buffer() {
     if (write_buffer_.empty()) co_return;
 
     if (compressor_) {
-        auto gen = compressor_->compress(
-            ByteView(write_buffer_.data(), write_buffer_.size()));
-        while (auto chunk = co_await gen.next()) {
-            co_await io::write(fd_, chunk->as<char>(), chunk->size());
-            total_bytes_ += chunk->size();
+        using GenType = coro::AsyncGenerator<ByteView>;
+        auto gen = std::make_unique<GenType>(compressor_->compress(
+            ByteView(write_buffer_.data(), write_buffer_.size())));
+        while (true) {
+            auto chunk = co_await gen->next();
+            if (!chunk) break;
+            const char* data = chunk->as<char>();
+            std::size_t size = chunk->size();
+            chunk.reset();
+            co_await io::write(fd_, data, size);
+            total_bytes_ += size;
         }
     } else {
         co_await io::write(fd_, write_buffer_.data(), write_buffer_.size());
@@ -108,10 +125,17 @@ coro::CoroTask<void> ChunkWriter::flush_buffer() {
 
 coro::CoroTask<void> ChunkWriter::flush_raw(const char* data, std::size_t len) {
     if (compressor_) {
-        auto gen = compressor_->compress(ByteView(data, len));
-        while (auto chunk = co_await gen.next()) {
-            co_await io::write(fd_, chunk->as<char>(), chunk->size());
-            total_bytes_ += chunk->size();
+        using GenType = coro::AsyncGenerator<ByteView>;
+        auto gen = std::make_unique<GenType>(
+            compressor_->compress(ByteView(data, len)));
+        while (true) {
+            auto chunk = co_await gen->next();
+            if (!chunk) break;
+            const char* cdata = chunk->as<char>();
+            std::size_t csize = chunk->size();
+            chunk.reset();
+            co_await io::write(fd_, cdata, csize);
+            total_bytes_ += csize;
         }
     } else {
         co_await io::write(fd_, data, len);
@@ -131,10 +155,16 @@ coro::CoroTask<void> ChunkWriter::finalize_current_chunk() {
     }
 
     if (compressor_) {
-        auto fin = compressor_->finalize_stream();
-        while (auto chunk = co_await fin.next()) {
-            co_await io::write(fd_, chunk->as<char>(), chunk->size());
-            total_bytes_ += chunk->size();
+        using GenType = coro::AsyncGenerator<ByteView>;
+        auto fin = std::make_unique<GenType>(compressor_->finalize_stream());
+        while (true) {
+            auto chunk = co_await fin->next();
+            if (!chunk) break;
+            const char* data = chunk->as<char>();
+            std::size_t size = chunk->size();
+            chunk.reset();
+            co_await io::write(fd_, data, size);
+            total_bytes_ += size;
         }
         compressor_.reset();
     }
@@ -142,12 +172,18 @@ coro::CoroTask<void> ChunkWriter::finalize_current_chunk() {
     co_await io::close(fd_);
     fd_ = -1;
 
+    auto path = chunk_path(chunk_index_);
     chunks_.push_back(ChunkInfo{
-        .path = chunk_path(chunk_index_),
+        .path = path,
         .bytes_written = current_chunk_bytes_,
         .events_written = current_chunk_events_,
         .chunk_index = chunk_index_,
     });
+
+    if (config_.on_chunk_complete) {
+        config_.on_chunk_complete(static_cast<std::size_t>(chunk_index_), path,
+                                  current_chunk_events_, current_chunk_bytes_);
+    }
 }
 
 coro::CoroTask<void> ChunkWriter::close() {
diff --git a/src/dftracer/utils/utilities/fileio/parallel/layout.cpp b/src/dftracer/utils/utilities/fileio/parallel/layout.cpp
new file mode 100644
index 00000000..ab1c3c73
--- /dev/null
+++ b/src/dftracer/utils/utilities/fileio/parallel/layout.cpp
@@ -0,0 +1,148 @@
+#include <dftracer/utils/core/common/config.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/utilities/fileio/parallel/layout.h>
+
+#ifdef __linux__
+#include <sys/vfs.h>
+#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
+    defined(__NetBSD__)
+#include <sys/mount.h>
+#include <sys/param.h>
+
+#include <string_view>
+#endif
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdlib>
+#include <cstring>
+
+#ifdef DFTRACER_UTILS_HAVE_LUSTREAPI
+#include <lustre/lustreapi.h>
+#endif
+
+namespace dftracer::utils::utilities::fileio::parallel {
+
+namespace {
+
+#ifdef __linux__
+// From linux/magic.h; inlined to avoid a hard kernel-header dep.
+constexpr unsigned long NFS_MAGIC = 0x6969;  // matched against statfs f_type in classify_magic()
+constexpr unsigned long LUSTRE_MAGIC = 0x0BD00BD0;  // NOTE(review): presumably defined by the Lustre headers rather than linux/magic.h — confirm
+constexpr unsigned long GPFS_MAGIC = 0x47504653;  // "GPFS"
+constexpr unsigned long BEEGFS_MAGIC = 0x19830326;  // NOTE(review): same provenance question as LUSTRE_MAGIC — confirm
+
+// Maps a statfs f_type magic number to a FilesystemKind. Values that match
+// none of the four magic constants are reported as LOCAL.
+FilesystemKind classify_magic(unsigned long magic) noexcept {
+    struct Known { unsigned long value; FilesystemKind kind; };
+    constexpr Known table[] = {
+        {NFS_MAGIC, FilesystemKind::NFS},
+        {LUSTRE_MAGIC, FilesystemKind::LUSTRE},
+        {GPFS_MAGIC, FilesystemKind::GPFS},
+        {BEEGFS_MAGIC, FilesystemKind::BEEGFS},
+    };
+    for (const Known& entry : table)
+        if (entry.value == magic) return entry.kind;
+    return FilesystemKind::LOCAL;
+}
+#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
+    defined(__NetBSD__)
+FilesystemKind classify_fstype(const char* fstype) noexcept {
+    // Type name as reported by statfs; null or unrecognised names count as LOCAL.
+    const std::string_view fs_name{fstype != nullptr ? fstype : ""};
+    if (fs_name == "beegfs") return FilesystemKind::BEEGFS;
+    if (fs_name == "gpfs") return FilesystemKind::GPFS;
+    if (fs_name == "lustre") return FilesystemKind::LUSTRE;
+    if (fs_name == "nfs") return FilesystemKind::NFS;
+    return FilesystemKind::LOCAL;
+}
+#endif
+
+// Picks an existing path to stat: `path` itself, else its parent directory, else ".".
+std::string probe_path(const std::string& path) noexcept {
+    std::error_code ignored;
+    if (fs::exists(path, ignored)) return path;
+    const fs::path dir = fs::path(path).parent_path();
+    const bool usable = !dir.empty() && fs::exists(dir, ignored);
+    return usable ? dir.string() : std::string(".");
+}
+
+void query_lustre_stripe(const std::string& probe, LayoutInfo& info) noexcept {  // fills info.stripe_size / info.stripe_count; does nothing without lustreapi
+#ifdef DFTRACER_UTILS_HAVE_LUSTREAPI
+    // When the target file does not exist yet we fall back to the parent dir;
+    // the file inherits the directory's default stripe on creation.
+    const std::size_t lum_size =
+        sizeof(struct lov_user_md) +
+        LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data_v1);  // header plus room for LOV_MAX_STRIPE_COUNT per-stripe entries
+    auto* raw = std::calloc(1, lum_size);
+    if (!raw) return;  // allocation failed: leave info untouched
+    auto* lum = reinterpret_cast<struct lov_user_md*>(raw);
+    lum->lmm_magic = LOV_USER_MAGIC;
+    if (llapi_file_get_stripe(probe.c_str(), lum) == 0) {  // info is only updated when the call returns 0
+        info.stripe_size = static_cast<std::size_t>(lum->lmm_stripe_size);
+        info.stripe_count = static_cast<std::size_t>(lum->lmm_stripe_count);
+    }
+    std::free(raw);
+#else
+    (void)probe;  // parameters are unused in builds without lustreapi
+    (void)info;
+#endif
+}
+
+}  // namespace
+
+LayoutInfo detect_layout(const std::string& path) noexcept {  // classifies the filesystem behind `path` and picks a file layout
+    LayoutInfo info{};
+    info.layout = FileLayout::STRIPED;  // default; only NFS switches to SHARDED below
+    info.fs = FilesystemKind::UNKNOWN;  // stays UNKNOWN when statfs fails or the platform has no branch here
+    info.stripe_size = 0;
+    info.stripe_count = 0;
+
+    const auto target = probe_path(path);  // `path`, its parent directory, or "."
+#if defined(__linux__)
+    struct statfs st{};
+    if (::statfs(target.c_str(), &st) != 0) {
+        return info;
+    }
+    info.fs = classify_magic(static_cast<unsigned long>(st.f_type));
+#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
+    defined(__NetBSD__)
+    struct statfs st{};
+    if (::statfs(target.c_str(), &st) != 0) {
+        return info;
+    }
+    info.fs = classify_fstype(st.f_fstypename);
+#else
+    (void)target;
+#endif
+    if (info.fs == FilesystemKind::NFS) {
+        info.layout = FileLayout::SHARDED;
+    }
+    if (info.fs == FilesystemKind::LUSTRE) {
+        query_lustre_stripe(target, info);  // may fill stripe_size / stripe_count; they stay 0 otherwise
+    }
+    return info;
+}
+
+// Derives writer settings from the detected layout. The worker count starts
+// at baseline_workers (at least 1) and, for non-padded layouts, is capped at
+// the stripe count when one is known. The flush threshold is one stripe for
+// padded layouts with a known stripe size, otherwise the larger of
+// default_flush_bytes and the stripe size; buffer capacity adds the headroom.
+WriterSizing compute_writer_sizing(const LayoutInfo& info,
+                                   std::size_t baseline_workers,
+                                   std::size_t default_flush_bytes,
+                                   std::size_t buffer_headroom_bytes,
+                                   bool padded_layout) noexcept {
+    const bool cap_by_stripes = !padded_layout && info.stripe_count > 0;
+    const bool flush_one_stripe = padded_layout && info.stripe_size > 0;  // uncompressed flush sized to one stripe; compressed fits easily
+    WriterSizing sizing{};
+    sizing.num_workers = baseline_workers == 0 ? 1 : baseline_workers;
+    if (cap_by_stripes) sizing.num_workers = std::min(sizing.num_workers, info.stripe_count);
+    sizing.flush_threshold = flush_one_stripe ? info.stripe_size : std::max(default_flush_bytes, info.stripe_size);
+    sizing.buffer_capacity = sizing.flush_threshold + buffer_headroom_bytes;
+    return sizing;
+}
+
+}  // namespace dftracer::utils::utilities::fileio::parallel
diff --git a/src/dftracer/utils/utilities/fileio/parallel/merge.cpp b/src/dftracer/utils/utilities/fileio/parallel/merge.cpp
new file mode 100644
index 00000000..d4d733c0
--- /dev/null
+++ b/src/dftracer/utils/utilities/fileio/parallel/merge.cpp
@@ -0,0 +1,83 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/io/io.h>
+#include <dftracer/utils/utilities/fileio/parallel/merge.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <vector>
+
+namespace dftracer::utils::utilities::fileio::parallel {
+
+namespace {
+
+constexpr std::size_t COPY_BUFFER_BYTES = 256 * 1024;
+
+// Copies all of `shard` to out_fd through a COPY_BUFFER_BYTES buffer, looping
+// over short writes. Returns 0 on success, or -1 after logging when the open,
+// a read or a write fails; once opened, the shard fd is closed on every path.
+coro::CoroTask<int> stream_shard_to_fd(int out_fd, const std::string& shard) {
+    namespace dio = ::dftracer::utils::io;
+    const ssize_t opened = co_await dio::open(shard.c_str(), O_RDONLY, 0);
+    if (opened < 0) {
+        DFTRACER_UTILS_LOG_ERROR("merge_shards: failed to open shard: %s", shard.c_str());
+        co_return -1;
+    }
+    const int in_fd = static_cast<int>(opened);
+    std::vector<char> buffer(COPY_BUFFER_BYTES);
+    for (;;) {
+        auto got = co_await dio::read(in_fd, buffer.data(), buffer.size());
+        if (got == 0) break;  // end of shard
+        if (got < 0) {
+            co_await dio::close(in_fd);
+            DFTRACER_UTILS_LOG_ERROR("merge_shards: read failed on %s", shard.c_str());
+            co_return -1;
+        }
+        // A write may consume only part of the batch, so drain it in a loop.
+        const char* cursor = buffer.data();
+        for (auto left = static_cast<std::size_t>(got); left > 0;) {
+            auto put = co_await dio::write(out_fd, cursor, left);
+            if (put <= 0) {
+                co_await dio::close(in_fd);
+                DFTRACER_UTILS_LOG_ERROR(
+                    "merge_shards: write failed while draining %s",
+                    shard.c_str());
+                co_return -1;
+            }
+            left -= static_cast<std::size_t>(put);
+            cursor += put;
+        }
+    }
+    co_await dio::close(in_fd);
+    co_return 0;
+}
+
+}  // namespace
+
+// Concatenates `shards`, in order, into `target` (created or truncated, mode
+// 0644) and unlinks the shards once all were copied. Returns 0 on success or
+// for an empty list, -1 if the target cannot be opened or a shard copy fails.
+coro::CoroTask<int> merge_shards(const std::string& target,
+                                 const std::vector<std::string>& shards) {
+    namespace dio = ::dftracer::utils::io;
+    if (shards.empty()) co_return 0;
+    const ssize_t opened =
+        co_await dio::open(target.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+    if (opened < 0) {
+        DFTRACER_UTILS_LOG_ERROR("merge_shards: failed to open target: %s",
+                                 target.c_str());
+        co_return -1;
+    }
+    const int out_fd = static_cast<int>(opened);
+    for (const auto& shard : shards) {
+        const int rc = co_await stream_shard_to_fd(out_fd, shard);
+        if (rc == 0) continue;
+        co_await dio::close(out_fd);
+        co_return -1;
+    }
+    co_await dio::close(out_fd);
+    // Shards are removed only after the merged file has been closed.
+    for (const auto& shard : shards) ::unlink(shard.c_str());
+    co_return 0;
+}
+
+}  // namespace dftracer::utils::utilities::fileio::parallel
diff --git a/src/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp b/src/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp
new file mode 100644
index 00000000..8f420285
--- /dev/null
+++ b/src/dftracer/utils/utilities/fileio/parallel/padded_striped_writer.cpp
@@ -0,0 +1,328 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/channel.h>
+#include <dftracer/utils/core/io/io.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <optional>
+#include <utility>
+#include <vector>
+
+namespace dftracer::utils::utilities::fileio::parallel {
+
+namespace {
+
+// FEXTRA-only gzip padding member layout (RFC 1952):
+//   hdr(10) + xlen(2) + xlen FEXTRA bytes + empty_stored_block(5) + trailer(8)
+constexpr std::size_t PAD_MEMBER_FIXED_OVERHEAD = 25;  // 10 + 2 + 5 + 8
+constexpr std::size_t PAD_MEMBER_MAX_SIZE = PAD_MEMBER_FIXED_OVERHEAD + 65535;
+
+// Capacity of the chunk channel between the workers and the packer. One slot
+// per worker is usually enough; the slack keeps bursts from blocking producers.
+constexpr std::size_t CHUNK_CHANNEL_CAPACITY = 64;
+
+// Bound on concurrent stripe pwrites. Each stripe routes to a different OST
+// (offset = (idx+1) * stripe_size, OST = (idx+1) % stripe_count), so this
+// keeps multiple OSTs busy without unbounded outstanding I/O.
+constexpr std::size_t MAX_INFLIGHT_PWRITES = 16;
+
+// Append one gzip member that carries `xlen` zero bytes of FEXTRA and an
+// empty stored deflate block, so it decompresses to zero bytes.
+void append_padding_member(std::vector<std::uint8_t>& out, std::uint16_t xlen) {
+    // Fixed 10-byte header: magic, CM=deflate, FLG=FEXTRA, MTIME=0, XFL=0,
+    // OS=unknown.
+    static constexpr std::uint8_t kHeader[10] = {0x1f, 0x8b, 0x08, 0x04, 0,
+                                                 0,    0,    0,    0,    0xff};
+    // Final stored block (BFINAL=1, BTYPE=00) with LEN=0 and NLEN=0xffff.
+    static constexpr std::uint8_t kEmptyBlock[5] = {0x01, 0x00, 0x00, 0xff,
+                                                    0xff};
+    constexpr std::size_t kTrailerLen = 8;  // CRC32=0 followed by ISIZE=0
+
+    const std::size_t base = out.size();
+    out.resize(base + PAD_MEMBER_FIXED_OVERHEAD + xlen);
+    std::uint8_t* cursor = out.data() + base;
+
+    std::memcpy(cursor, kHeader, sizeof(kHeader));
+    cursor += sizeof(kHeader);
+
+    // XLEN is stored little-endian, followed by the zero-filled extra field.
+    *cursor++ = static_cast<std::uint8_t>(xlen & 0xff);
+    *cursor++ = static_cast<std::uint8_t>((xlen >> 8) & 0xff);
+    std::memset(cursor, 0, xlen);
+    cursor += xlen;
+
+    std::memcpy(cursor, kEmptyBlock, sizeof(kEmptyBlock));
+    std::memset(cursor + sizeof(kEmptyBlock), 0, kTrailerLen);
+}
+
+// Pad `out` with empty gzip members until it is exactly stripe_size bytes. A
+// gap under 25 bytes cannot hold a member and is left as is; so is an `out`
+// that already meets or exceeds stripe_size.
+void pad_to_stripe(std::vector<std::uint8_t>& out, std::size_t stripe_size) {
+    constexpr std::size_t kMin = PAD_MEMBER_FIXED_OVERHEAD;
+    while (out.size() < stripe_size && stripe_size - out.size() >= kMin) {
+        const std::size_t gap = stripe_size - out.size();
+        std::size_t len = gap < PAD_MEMBER_MAX_SIZE ? gap : PAD_MEMBER_MAX_SIZE;
+        // Never strand 1..24 bytes behind a member: shrink it so one more fits.
+        if (gap != len && gap - len < kMin) len = gap - kMin;
+        append_padding_member(out, static_cast<std::uint16_t>(len - kMin));
+    }
+}
+
+// ParallelWriter that packs worker chunks into stripe_size-byte slots of one
+// file: slot 0 holds the header, data slot i starts at (i + 1) * stripe_size.
+class PaddedStripedWriter : public ParallelWriter {
+   public:
+    struct Chunk {
+        std::vector<std::uint8_t> data;
+        std::shared_ptr<coro::Channel<MemberSpan>> ack;
+    };
+
+    explicit PaddedStripedWriter(std::size_t stripe_size)
+        : stripe_size_(stripe_size) {}
+
+    // Opens `path`, writes a placeholder header slot and spawns the packer.
+    coro::CoroTask<int> open(std::string path, std::size_t num_workers,
+                             bool /*gzip_extension*/,
+                             CoroScope* scope) override {
+        if (!scope) {
+            DFTRACER_UTILS_LOG_ERROR(
+                "PaddedStripedWriter requires a CoroScope to spawn its packer");
+            co_return -1;
+        }
+        path_ = std::move(path);
+        ssize_t fd = co_await ::dftracer::utils::io::open(
+            path_.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+        if (fd < 0) {
+            DFTRACER_UTILS_LOG_ERROR("Failed to open padded output: %s",
+                                     path_.c_str());
+            co_return -1;
+        }
+        fd_ = static_cast<int>(fd);
+        next_stripe_idx_.store(0, std::memory_order_relaxed);
+        per_worker_last_.assign(num_workers, std::nullopt);
+
+        // Valid-gzip placeholder so callers that skip write_header still
+        // produce a file gunzip can walk.
+        std::vector<std::uint8_t> pad;
+        pad.reserve(stripe_size_);
+        pad_to_stripe(pad, stripe_size_);
+        if (co_await pwrite_bytes(pad.data(), pad.size(), 0) != 0) {
+            co_await ::dftracer::utils::io::close(fd_);  // don't leak the fd
+            fd_ = -1;
+            co_return -1;
+        }
+
+        // Set up the chunk channel + N pre-registered producers (one per
+        // worker). The packer exits once all producers are released.
+        channel_ = coro::make_channel<Chunk>(CHUNK_CHANNEL_CAPACITY);
+        producers_.reserve(num_workers);
+        for (std::size_t i = 0; i < num_workers; ++i) {
+            producers_.emplace_back(channel_->producer());
+        }
+        packer_future_ =
+            scope->spawn([this, consumer = channel_->consumer()](
+                             CoroScope& s) mutable -> coro::CoroTask<int> {
+                co_return co_await run_packer(s, std::move(consumer));
+            });
+        co_return 0;
+    }
+
+    // Writes `data`, padded to a full stripe, into slot 0.
+    coro::CoroTask<int> write_header(ByteView data) override {
+        if (data.size() + PAD_MEMBER_FIXED_OVERHEAD > stripe_size_) {
+            DFTRACER_UTILS_LOG_ERROR(
+                "padded writer: header %zu + pad overhead exceeds stripe %zu",
+                data.size(), stripe_size_);
+            co_return -1;
+        }
+        const auto* first = reinterpret_cast<const std::uint8_t*>(data.data());
+        std::vector<std::uint8_t> buf;
+        buf.reserve(stripe_size_);
+        buf.insert(buf.end(), first, first + data.size());
+        pad_to_stripe(buf, stripe_size_);
+        co_return co_await pwrite_bytes(buf.data(), buf.size(), 0);
+    }
+
+    // Queues `data` for the packer and records the position it assigned. The
+    // bytes are written later; I/O errors surface from write_footer()/close().
+    coro::CoroTask<int> write_chunk(std::size_t worker_idx,
+                                    ByteView data) override {
+        if (worker_idx >= producers_.size()) {
+            DFTRACER_UTILS_LOG_ERROR("padded writer: worker_idx %zu >= %zu",
+                                     worker_idx, producers_.size());
+            co_return -1;
+        }
+        if (data.size() + PAD_MEMBER_FIXED_OVERHEAD > stripe_size_) {
+            DFTRACER_UTILS_LOG_ERROR(
+                "padded writer: chunk %zu + pad overhead exceeds stripe %zu",
+                data.size(), stripe_size_);
+            co_return -1;
+        }
+        const auto* first = reinterpret_cast<const std::uint8_t*>(data.data());
+        Chunk c;
+        c.data.assign(first, first + data.size());
+        c.ack = coro::make_channel<MemberSpan>(1);
+        auto ack = c.ack;
+        if (!co_await producers_[worker_idx].send(std::move(c))) co_return -1;
+        auto span = co_await ack->receive();
+        if (!span) co_return -1;
+        per_worker_last_[worker_idx] = *span;
+        co_return 0;
+    }
+
+    // Drains the packer, then writes `data` right after the last data slot.
+    coro::CoroTask<int> write_footer(ByteView data) override {
+        if (co_await drain_packer() != 0) co_return -1;
+        const auto stripes = next_stripe_idx_.load(std::memory_order_relaxed);
+        const auto offset = (stripes + 1) * stripe_size_;  // +1 for header
+        co_return co_await pwrite_all(data, static_cast<off_t>(offset));
+    }
+
+    // Drain + close: returns -1 if the packer failed, else the close() result.
+    coro::CoroTask<int> close() override {
+        const int drain_rc = co_await drain_packer();
+        if (fd_ < 0) co_return drain_rc != 0 ? -1 : 0;
+        auto rc = co_await ::dftracer::utils::io::close(fd_);
+        fd_ = -1;
+        co_return drain_rc != 0 ? -1 : static_cast<int>(rc);
+    }
+
+    std::vector<std::string> output_paths() const override { return {path_}; }
+
+    std::optional<MemberSpan> last_member(
+        std::size_t worker_idx) const override {
+        if (worker_idx >= per_worker_last_.size()) return std::nullopt;
+        return per_worker_last_[worker_idx];
+    }
+
+   private:
+    // Drop all producer slots so the channel reports EOF to the packer, then
+    // wait for the packer to emit its final stripe. Safe to call twice: the
+    // packer result is kept, so a failure is reported on every later call.
+    coro::CoroTask<int> drain_packer() {
+        if (packer_drained_) co_return packer_rc_;
+        producers_.clear();
+        if (packer_future_.has_value()) {
+            packer_rc_ = co_await *packer_future_;
+            packer_future_.reset();
+        }
+        packer_drained_ = true;
+        co_return packer_rc_;
+    }
+
+    // Sole consumer of the chunk channel: packs chunks into slot buffers and
+    // writes full slots with up to MAX_INFLIGHT_PWRITES pwrites in flight.
+    coro::CoroTask<int> run_packer(CoroScope& parent_scope,
+                                   coro::ChannelConsumer<Chunk> consumer) {
+        std::vector<std::uint8_t> buf;
+        buf.reserve(stripe_size_);
+        // Slots are claimed on demand: a slot claimed up front but never
+        // written would become a hole that write_footer() skips past.
+        bool have_stripe = false;
+        std::uint64_t current_stripe_idx = 0;
+        std::deque<coro::SpawnFuture<int>> in_flight;
+        int final_rc = 0;
+        auto await_one = [&]() -> coro::CoroTask<void> {
+            auto f = std::move(in_flight.front());
+            in_flight.pop_front();
+            auto rc = co_await std::move(f);
+            if (rc != 0 && final_rc == 0) final_rc = rc;
+        };
+
+        auto launch_emit = [&](std::vector<std::uint8_t>&& payload,
+                               std::uint64_t emit_idx) -> coro::CoroTask<void> {
+            while (in_flight.size() >= MAX_INFLIGHT_PWRITES) {
+                co_await await_one();
+            }
+            in_flight.push_back(parent_scope.spawn(
+                [this, p = std::move(payload),
+                 emit_idx](CoroScope&) mutable -> coro::CoroTask<int> {
+                    pad_to_stripe(p, stripe_size_);
+                    const auto offset = (emit_idx + 1) * stripe_size_;
+                    co_return co_await pwrite_bytes(p.data(), p.size(),
+                                                    static_cast<off_t>(offset));
+                }));
+            co_return;
+        };
+
+        while (auto chunk = co_await consumer.receive()) {
+            if (!buf.empty() &&
+                buf.size() + chunk->data.size() + PAD_MEMBER_FIXED_OVERHEAD >
+                    stripe_size_) {
+                co_await launch_emit(std::move(buf), current_stripe_idx);
+                buf.clear();
+                buf.reserve(stripe_size_);
+                have_stripe = false;
+            }
+            if (!have_stripe) {
+                current_stripe_idx =
+                    next_stripe_idx_.fetch_add(1, std::memory_order_relaxed);
+                have_stripe = true;
+            }
+            if (chunk->ack) {
+                MemberSpan span{
+                    (current_stripe_idx + 1) * stripe_size_ + buf.size(),
+                    static_cast<std::uint64_t>(chunk->data.size())};
+                co_await chunk->ack->send(std::move(span));
+                chunk->ack->close();
+            }
+            buf.insert(buf.end(), chunk->data.begin(), chunk->data.end());
+        }
+        // A claimed slot is always written, even if it holds no bytes yet.
+        if (have_stripe) co_await launch_emit(std::move(buf), current_stripe_idx);
+        while (!in_flight.empty()) co_await await_one();
+        co_return final_rc;
+    }
+
+    coro::CoroTask<int> pwrite_all(ByteView data, off_t offset) {
+        co_return co_await pwrite_bytes(
+            reinterpret_cast<const std::uint8_t*>(data.data()), data.size(),
+            offset);
+    }
+
+    coro::CoroTask<int> pwrite_bytes(const std::uint8_t* bytes,
+                                     std::size_t size, off_t offset) {
+        std::size_t written = 0;
+        while (written < size) {
+            auto n = co_await ::dftracer::utils::io::pwrite(
+                fd_, bytes + written, size - written,
+                offset + static_cast<off_t>(written));
+            if (n <= 0) {
+                DFTRACER_UTILS_LOG_ERROR(
+                    "padded writer pwrite failed at %lld on %s",
+                    static_cast<long long>(offset), path_.c_str());
+                co_return -1;
+            }
+            written += static_cast<std::size_t>(n);
+        }
+        co_return 0;
+    }
+
+    std::string path_;
+    int fd_ = -1;
+    std::size_t stripe_size_;
+    std::atomic<std::uint64_t> next_stripe_idx_{0};
+    std::shared_ptr<coro::Channel<Chunk>> channel_;
+    std::vector<coro::ChannelProducer<Chunk>> producers_;
+    std::optional<coro::SpawnFuture<int>> packer_future_;
+    bool packer_drained_ = false;
+    int packer_rc_ = 0;  // packer result, kept so repeated drains agree
+    std::vector<std::optional<MemberSpan>> per_worker_last_;
+};
+
+}  // namespace
+
+std::unique_ptr<ParallelWriter> make_padded_striped_writer(
+    std::size_t stripe_size) {  // stripe_size: size in bytes of every slot
+    return std::make_unique<PaddedStripedWriter>(stripe_size);
+}
+
+}  // namespace dftracer::utils::utilities::fileio::parallel
diff --git a/src/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp b/src/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp
new file mode 100644
index 00000000..f7388785
--- /dev/null
+++ b/src/dftracer/utils/utilities/fileio/parallel/sharded_writer.cpp
@@ -0,0 +1,135 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/io/io.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <vector>
+
+namespace dftracer::utils::utilities::fileio::parallel {
+
+namespace {
+
+// ParallelWriter that appends each worker's chunks to its own shard file.
+class ShardedWriter final : public ParallelWriter {
+   public:
+    // Opens "<path>.shard_<i>[.gz]" per worker; closes them all on failure.
+    coro::CoroTask<int> open(std::string path, std::size_t num_workers,
+                             bool gzip_extension,
+                             CoroScope* /*scope*/) override {
+        base_path_ = std::move(path);
+        const std::string ext = gzip_extension ? ".gz" : "";
+        shard_paths_.resize(num_workers);
+        shard_fds_.assign(num_workers, -1);
+        shard_offsets_.assign(num_workers, 0);
+        per_worker_last_.assign(num_workers, std::nullopt);
+        for (std::size_t i = 0; i < num_workers; ++i) {
+            shard_paths_[i] = base_path_ + ".shard_" + std::to_string(i) + ext;
+            ssize_t fd = co_await ::dftracer::utils::io::open(
+                shard_paths_[i].c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+            if (fd < 0) {
+                DFTRACER_UTILS_LOG_ERROR("Failed to open shard: %s",
+                                         shard_paths_[i].c_str());
+                co_await close();  // don't leak the shards opened so far
+                co_return -1;
+            }
+            shard_fds_[i] = static_cast<int>(fd);
+        }
+        co_return 0;
+    }
+
+    coro::CoroTask<int> write_header(ByteView data) override {
+        if (shard_fds_.empty()) co_return -1;
+        auto rc = co_await write_all(shard_fds_.front(), data);
+        if (rc == 0) shard_offsets_.front() += data.size();
+        co_return rc;
+    }
+
+    coro::CoroTask<int> write_chunk(std::size_t worker_idx,
+                                    ByteView data) override {
+        if (worker_idx >= shard_fds_.size()) co_return -1;
+        const auto base = shard_offsets_[worker_idx];
+        auto rc = co_await write_all(shard_fds_[worker_idx], data);
+        if (rc != 0) co_return rc;
+        shard_offsets_[worker_idx] += data.size();
+        per_worker_last_[worker_idx] = MemberSpan{base, data.size()};
+        co_return 0;
+    }
+
+    coro::CoroTask<int> write_footer(ByteView data) override {
+        if (shard_fds_.empty()) co_return -1;
+        auto rc = co_await write_all(shard_fds_.back(), data);
+        if (rc == 0) shard_offsets_.back() += data.size();
+        co_return rc;
+    }
+
+    coro::CoroTask<int> close() override {
+        int status = 0;
+        for (auto& fd : shard_fds_) {
+            if (fd < 0) continue;
+            if (co_await ::dftracer::utils::io::close(fd) < 0) status = -1;
+            fd = -1;
+        }
+        co_return status;
+    }
+
+    std::vector<std::string> output_paths() const override {
+        return shard_paths_;
+    }
+
+    std::optional<MemberSpan> last_member(
+        std::size_t worker_idx) const override {
+        if (worker_idx >= per_worker_last_.size()) return std::nullopt;
+        return per_worker_last_[worker_idx];
+    }
+
+    // Entry i is the total number of bytes written to shards 0..i-1.
+    std::vector<std::uint64_t> shard_base_offsets() const override {
+        std::vector<std::uint64_t> bases(shard_offsets_.size(), 0);
+        for (std::size_t i = 1; i < bases.size(); ++i) {
+            bases[i] = bases[i - 1] + shard_offsets_[i - 1];
+        }
+        return bases;
+    }
+
+   private:
+    coro::CoroTask<int> write_all(int fd, ByteView data) {
+        const auto* bytes = reinterpret_cast<const char*>(data.data());
+        std::size_t written = 0;
+        while (written < data.size()) {
+            auto n = co_await ::dftracer::utils::io::write(
+                fd, bytes + written, data.size() - written);
+            if (n <= 0) {
+                DFTRACER_UTILS_LOG_ERROR("write failed on shard fd=%d", fd);
+                co_return -1;
+            }
+            written += static_cast<std::size_t>(n);
+        }
+        co_return 0;
+    }
+
+    std::string base_path_;
+    std::vector<std::string> shard_paths_;
+    std::vector<int> shard_fds_;
+    std::vector<std::uint64_t> shard_offsets_;
+    std::vector<std::optional<MemberSpan>> per_worker_last_;
+};
+
+}  // namespace
+
+std::unique_ptr<ParallelWriter> make_sharded_writer() {
+    return std::make_unique<ShardedWriter>();  // one shard file per worker
+}
+
+std::unique_ptr<ParallelWriter> make_writer(const WriterConfig& cfg) {
+    if (cfg.layout == FileLayout::SHARDED) return make_sharded_writer();
+
+    // The padded writer is only picked for gzip output whose stripe reaches
+    // the minimum slot size (a compressed flush must fit one slot); every
+    // other configuration gets the plain atomic-byte-offset striped writer.
+    const bool padded = cfg.gzip && cfg.stripe_size >= MIN_PADDED_STRIPE_BYTES;
+    return padded ? make_padded_striped_writer(cfg.stripe_size)
+                  : make_striped_writer();
+}
+
+}  // namespace dftracer::utils::utilities::fileio::parallel
diff --git a/src/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp b/src/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp
new file mode 100644
index 00000000..7d04dcce
--- /dev/null
+++ b/src/dftracer/utils/utilities/fileio/parallel/striped_writer.cpp
@@ -0,0 +1,147 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/io/io.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+#include <vector>
+
+namespace dftracer::utils::utilities::fileio::parallel {
+
+namespace {
+
+// ParallelWriter where all workers share one file: each write reserves its
+// byte range by atomically advancing a running offset, then pwrites into it.
+class StripedWriter final : public ParallelWriter {
+   public:
+    // Creates/truncates `path` and resets the offset and layout bookkeeping.
+    coro::CoroTask<int> open(std::string path, std::size_t num_workers,
+                             bool /*gzip_extension*/,
+                             CoroScope* /*scope*/) override {
+        path_ = std::move(path);
+        const ssize_t opened = co_await ::dftracer::utils::io::open(
+            path_.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+        if (opened < 0) {
+            DFTRACER_UTILS_LOG_ERROR("Failed to open striped output: %s",
+                                     path_.c_str());
+            co_return -1;
+        }
+        fd_ = static_cast<int>(opened);
+        offset_.store(0, std::memory_order_relaxed);
+        const std::size_t buckets = std::max<std::size_t>(num_workers, 1);
+        per_worker_layout_.assign(buckets, std::vector<MemberSpan>{});
+        merged_layout_.clear();
+        merged_layout_built_ = false;
+        co_return 0;
+    }
+
+    // Header and footer are plain appends at the current running offset.
+    coro::CoroTask<int> write_header(ByteView data) override {
+        co_return co_await pwrite_all(data);
+    }
+
+    // Appends `data`; its span is recorded only for in-range worker_idx.
+    coro::CoroTask<int> write_chunk(std::size_t worker_idx,
+                                    ByteView data) override {
+        if (data.size() == 0) co_return 0;
+        const auto base =
+            offset_.fetch_add(data.size(), std::memory_order_relaxed);
+        // Each worker is sequential (one write_chunk in flight per worker), so
+        // no lock needed when appending to its own bucket.
+        if (worker_idx < per_worker_layout_.size()) {
+            per_worker_layout_[worker_idx].push_back({base, data.size()});
+        }
+        co_return co_await pwrite_at(base, data);
+    }
+
+    coro::CoroTask<int> write_footer(ByteView data) override {
+        co_return co_await pwrite_all(data);
+    }
+
+    coro::CoroTask<int> close() override {
+        if (fd_ < 0) co_return 0;
+        const auto rc = co_await ::dftracer::utils::io::close(fd_);
+        fd_ = -1;
+        co_return static_cast<int>(rc);
+    }
+
+    std::vector<std::string> output_paths() const override { return {path_}; }
+
+    // Most recent span recorded for `worker_idx`, if any.
+    std::optional<MemberSpan> last_member(
+        std::size_t worker_idx) const override {
+        if (worker_idx >= per_worker_layout_.size()) return std::nullopt;
+        const auto& spans = per_worker_layout_[worker_idx];
+        if (spans.empty()) return std::nullopt;
+        return spans.back();
+    }
+
+    // Offset-sorted view over every recorded span, built lazily on first use
+    // and then cached. Caller contract: only invoked after `close()`, no
+    // concurrent writers.
+    std::span<const MemberSpan> member_layout() const override {
+        if (merged_layout_built_) {
+            return std::span<const MemberSpan>(merged_layout_);
+        }
+        std::size_t total = 0;
+        for (const auto& bucket : per_worker_layout_) total += bucket.size();
+        merged_layout_.clear();
+        merged_layout_.reserve(total);
+        for (const auto& bucket : per_worker_layout_) {
+            merged_layout_.insert(merged_layout_.end(), bucket.begin(),
+                                  bucket.end());
+        }
+        std::sort(merged_layout_.begin(), merged_layout_.end(),
+                  [](const MemberSpan& lhs, const MemberSpan& rhs) {
+                      return lhs.offset < rhs.offset;
+                  });
+        merged_layout_built_ = true;
+        return std::span<const MemberSpan>(merged_layout_);
+    }
+
+   private:
+    // Reserves data.size() bytes at the running offset and writes them there.
+    coro::CoroTask<int> pwrite_all(ByteView data) {
+        if (data.size() == 0) co_return 0;
+        const auto base =
+            offset_.fetch_add(data.size(), std::memory_order_relaxed);
+        co_return co_await pwrite_at(base, data);
+    }
+
+    // Writes all of `data` at file offset `base`, looping over short writes.
+    // Returns 0 on success, -1 (after logging) when pwrite reports <= 0.
+    coro::CoroTask<int> pwrite_at(std::uint64_t base, ByteView data) {
+        const auto* bytes = reinterpret_cast<const char*>(data.data());
+        for (std::size_t done = 0; done < data.size();) {
+            const auto n = co_await ::dftracer::utils::io::pwrite(
+                fd_, bytes + done, data.size() - done,
+                static_cast<off_t>(base + done));
+            if (n <= 0) {
+                DFTRACER_UTILS_LOG_ERROR("pwrite failed on %s (offset=%llu)",
+                                         path_.c_str(),
+                                         static_cast<unsigned long long>(base));
+                co_return -1;
+            }
+            done += static_cast<std::size_t>(n);
+        }
+        co_return 0;
+    }
+
+    std::string path_;
+    int fd_ = -1;
+    std::atomic<std::uint64_t> offset_{0};
+    std::vector<std::vector<MemberSpan>> per_worker_layout_;
+    mutable std::vector<MemberSpan> merged_layout_;
+    mutable bool merged_layout_built_ = false;
+};
+
+}  // namespace
+
+std::unique_ptr<ParallelWriter> make_striped_writer() {
+    return std::make_unique<StripedWriter>();  // single shared output file
+}
+
+}  // namespace dftracer::utils::utilities::fileio::parallel
diff --git a/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp b/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp
index 97b15098..f254a92b 100644
--- a/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp
+++ b/src/dftracer/utils/utilities/indexer/index_builder_utility.cpp
@@ -1,30 +1,37 @@
+#include <dftracer/utils/core/common/config.h>
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/core/rocksdb/async.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/composites/dft/dft_event_dispatcher.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h>
 #include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
+#include <dftracer/utils/utilities/indexer/internal/index_batch_writer.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer_factory.h>
 #include <dftracer/utils/utilities/indexer/internal/transaction_scope.h>
-#include <dftracer/utils/utilities/indexer/visitors/bloom_visitor.h>
-#include <dftracer/utils/utilities/indexer/visitors/manifest_visitor.h>
 
+#include <atomic>
 #include <chrono>
 #include <optional>
 
 namespace dftracer::utils::utilities::indexer {
 
 using composites::dft::internal::determine_index_path;
+using composites::dft::visitors::BloomVisitor;
+using composites::dft::visitors::HashTableVisitor;
+using composites::dft::visitors::ManifestVisitor;
 using internal::IndexerFactory;
-namespace rocks = dftracer::utils::rocksdb;
-
-// ---------------------------------------------------------------------------
-// IndexBuildConfig builder methods
-// ---------------------------------------------------------------------------
 
 IndexBuildConfig IndexBuildConfig::for_file(const std::string& path) {
     IndexBuildConfig cfg;
@@ -42,22 +49,11 @@ IndexBuildConfig& IndexBuildConfig::with_checkpoint_size(std::size_t size) {
     return *this;
 }
 
-IndexBuildConfig& IndexBuildConfig::with_index_threshold(
-    std::size_t threshold) {
-    index_threshold = threshold;
-    return *this;
-}
-
 IndexBuildConfig& IndexBuildConfig::with_force_rebuild(bool force) {
     force_rebuild = force;
     return *this;
 }
 
-IndexBuildConfig& IndexBuildConfig::with_bloom(bool enable) {
-    build_bloom = enable;
-    return *this;
-}
-
 IndexBuildConfig& IndexBuildConfig::with_manifest(bool enable) {
     build_manifest = enable;
     return *this;
@@ -75,8 +71,8 @@ IndexBuildConfig& IndexBuildConfig::with_bloom_dimensions(
     return *this;
 }
 
-coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
-    const IndexBuildConfig& config) {
+static coro::CoroTask<IndexBuildResult> run_index_build(
+    IndexBuildConfig config) {
     IndexBuildResult result;
     result.file_path = config.file_path;
 
@@ -85,15 +81,9 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
             determine_index_path(config.file_path, config.index_dir);
         result.index_path = index_path;
 
-        // Check compressed file size against threshold (0 = always index).
-        std::uintmax_t file_sz = 0;
-        if (fs::exists(config.file_path)) {
-            file_sz = fs::file_size(config.file_path);
-        }
-        const bool below_threshold =
-            config.index_threshold != 0 && file_sz < config.index_threshold;
-
+#if DFTRACER_UTILS_LOGGER_LEVEL_DEBUG
         auto build_start = std::chrono::steady_clock::now();
+#endif
 
         auto indexer = IndexerFactory::create(
             config.file_path, index_path,
@@ -106,15 +96,15 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
             co_return result;
         }
 
-        // NOTE(perf): compute need_rebuild once: used for both skip decision
-        // and checkpoints_valid below. Avoids duplicate fingerprint check.
-        bool idx_exists = !below_threshold && indexer->exists();
+        bool idx_exists = indexer->exists();
         bool needs_rebuild = idx_exists ? indexer->need_rebuild() : true;
 
-        // Skip if index exists, is current, and all requested features present.
-        if (idx_exists && !config.force_rebuild && !needs_rebuild) {
+        // Skip if index exists, is current, all features present,
+        // and no extra visitors need to run.
+        if (idx_exists && !config.force_rebuild && !needs_rebuild &&
+            config.extra_dft_visitors.empty()) {
             auto logical = internal::get_logical_path(config.file_path);
-            bool bloom_ok = !config.build_bloom || [&] {
+            bool bloom_ok = [&] {
                 try {
                     IndexDatabase db(index_path,
                                      dftracer::utils::rocksdb::RocksDatabase::
@@ -138,8 +128,8 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
             }();
 
             if (bloom_ok && manifest_ok) {
-                DFTRACER_UTILS_LOG_INFO("Skipping already-indexed file: %s",
-                                        config.file_path.c_str());
+                DFTRACER_UTILS_LOG_DEBUG("Skipping already-indexed file: %s",
+                                         config.file_path.c_str());
                 result.success = true;
                 result.was_skipped = true;
                 result.index_created = true;
@@ -147,33 +137,39 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
             }
         }
 
-        // Resolve effective bloom dimensions.
         std::vector<std::string> dims =
-            (config.build_bloom && config.bloom_dimensions.empty())
-                ? default_bloom_dimensions()
+            config.bloom_dimensions.empty()
+                ? std::vector<std::string>(DEFAULT_BLOOM_DIMENSIONS.begin(),
+                                           DEFAULT_BLOOM_DIMENSIONS.end())
                 : config.bloom_dimensions;
 
-        // Construct visitors.
-        std::optional<BloomVisitor> bloom_visitor;
+        // Construct DFT event visitors and wrap in single dispatcher.
+        BloomVisitor bloom_visitor(config.bloom_config, dims);
+        HashTableVisitor hash_table_visitor;
         std::optional<ManifestVisitor> manifest_visitor;
 
-        internal::Indexer::VisitorList visitor_list;
-        if (config.build_bloom) {
-            bloom_visitor.emplace(config.bloom_config, dims);
-            visitor_list.emplace_back(*bloom_visitor);
-        }
+        composites::dft::DftEventDispatcher::VisitorList dft_visitors;
+        dft_visitors.emplace_back(bloom_visitor);
+        dft_visitors.emplace_back(hash_table_visitor);
         if (config.build_manifest) {
             manifest_visitor.emplace();
-            visitor_list.emplace_back(*manifest_visitor);
+            dft_visitors.emplace_back(*manifest_visitor);
+        }
+        for (auto& extra : config.extra_dft_visitors) {
+            dft_visitors.emplace_back(extra);
         }
 
+        composites::dft::DftEventDispatcher dispatcher(std::move(dft_visitors));
+        internal::Indexer::VisitorList visitor_list;
+        visitor_list.emplace_back(dispatcher);
+
         // Decide whether checkpoints need rebuilding.
         // Reuses the need_rebuild result computed above.
         bool checkpoints_valid =
             !config.force_rebuild && idx_exists && !needs_rebuild;
 
         if (checkpoints_valid && !visitor_list.empty()) {
-            // Checkpoints exist — only need a streaming pass for visitors.
+            // Checkpoints exist, only need a streaming pass for visitors.
             using fileio::lines::sources::async_streaming_gz_lines;
             for (auto& v : visitor_list) {
                 v.get().begin(0);
@@ -193,12 +189,17 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
                     }
                     ckpt_idx = new_ckpt;
                 }
+                auto buffer = std::make_shared<std::string>(line.content);
+                std::string_view sv(buffer->data(), buffer->size());
                 for (auto& v : visitor_list) {
-                    v.get().on_line(line.content, ckpt_idx);
+                    v.get().on_line(sv, buffer, ckpt_idx);
+                    if (v.get().wants_drain()) {
+                        co_await v.get().drain_pending();
+                    }
                 }
             }
         } else {
-            // Need full checkpoint build — visitors run inline.
+            // Need full checkpoint build, visitors run inline.
             if (!visitor_list.empty()) {
                 indexer->set_visitors(std::move(visitor_list));
             }
@@ -208,10 +209,10 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
         result.total_lines = indexer->get_num_lines();
         result.chunks_processed =
             static_cast<std::size_t>(indexer->get_checkpoints().size());
+        result.events_processed =
+            static_cast<std::size_t>(bloom_visitor.total_events());
 
-        // Persist visitor data into the `.dftindex` store only when the file
-        // meets the size threshold (or threshold is disabled).
-        if (!below_threshold && (config.build_bloom || config.build_manifest)) {
+        {
             const std::string& built_index_path = indexer->get_index_path();
 
             try {
@@ -219,27 +220,24 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
                 auto logical = internal::get_logical_path(config.file_path);
                 const auto hash =
                     internal::calculate_file_hash(config.file_path);
-                auto* db_ptr = &db;
-                auto* logical_ptr = &logical;
-                auto* config_ptr = &config;
-                auto* bloom_visitor_ptr = &bloom_visitor;
-                auto* manifest_visitor_ptr = &manifest_visitor;
-                co_await rocks::run([db_ptr, logical_ptr, hash, config_ptr,
-                                     bloom_visitor_ptr, manifest_visitor_ptr] {
-                    int fid =
-                        db_ptr->get_or_create_file_info(*logical_ptr, hash);
-                    internal::TransactionScope txn(*db_ptr);
-                    if (config_ptr->build_bloom && *bloom_visitor_ptr) {
-                        db_ptr->init_bloom_schema();
-                        db_ptr->delete_chunk_statistics(fid);
-                        (*bloom_visitor_ptr)->finalize(*db_ptr, fid);
-                    }
-                    if (config_ptr->build_manifest && *manifest_visitor_ptr) {
-                        db_ptr->init_manifest_schema();
-                        (*manifest_visitor_ptr)->finalize(*db_ptr, fid);
-                    }
-                    txn.commit();
-                });
+
+                IndexFileEntryCapability caps =
+                    IndexFileEntryCapability::INDEXING_COMPLETE |
+                    IndexFileEntryCapability::BLOOM |
+                    IndexFileEntryCapability::CHECKPOINTS |
+                    IndexFileEntryCapability::FILE_SUMMARY;
+                if (config.build_manifest && manifest_visitor) {
+                    caps |= IndexFileEntryCapability::MANIFEST;
+                }
+                auto writer = db.begin_write();
+                int fid = writer->get_or_create_file_info(logical, hash, caps);
+                writer->delete_chunk_statistics(fid);
+                bloom_visitor.finalize(*writer, fid);
+                hash_table_visitor.finalize(*writer, fid);
+                if (config.build_manifest && manifest_visitor) {
+                    manifest_visitor->finalize(*writer, fid);
+                }
+                writer->commit();
             } catch (const std::exception& e) {
                 result.error_message =
                     std::string("Failed to persist index data: ") + e.what();
@@ -250,16 +248,18 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
             }
         }
 
-        result.index_created = !below_threshold;
+        result.index_created = true;
         result.success = true;
 
+#if DFTRACER_UTILS_LOGGER_LEVEL_DEBUG
         auto build_end = std::chrono::steady_clock::now();
         double elapsed_s =
             std::chrono::duration<double>(build_end - build_start).count();
-        DFTRACER_UTILS_LOG_INFO(
+        DFTRACER_UTILS_LOG_DEBUG(
             "Built index for %s (%zu chunks, %zu lines, %.2fs)",
             config.file_path.c_str(), result.chunks_processed,
             result.total_lines, elapsed_s);
+#endif
     } catch (const std::exception& e) {
         result.error_message = e.what();
         DFTRACER_UTILS_LOG_ERROR("IndexBuilder failed for %s: %s",
@@ -269,4 +269,639 @@ coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
     co_return result;
 }
 
+coro::CoroTask<IndexBuildResult> IndexBuilderUtility::process(
+    const IndexBuildConfig& config) {
+    return run_index_build(config);  // plain forwarder, not a coroutine itself
+}
+
+// Per-file fallback path: a producer feeds file indices into a bounded channel
+// and `parallelism` workers each run IndexBuilderUtility::process() on the
+// file index they receive. Results keep input order; counters are summed.
+static coro::CoroTask<IndexBuildBatchResult> process_batch_per_file(
+    CoroScope* scope, std::shared_ptr<IndexBuildBatchConfig> shared_config) {
+    auto results = std::make_shared<std::vector<IndexBuildResult>>(
+        shared_config->file_paths.size());
+    auto indexed = std::make_shared<std::atomic<std::size_t>>(0);
+    auto skipped = std::make_shared<std::atomic<std::size_t>>(0);
+    auto failed = std::make_shared<std::atomic<std::size_t>>(0);
+    auto total_events = std::make_shared<std::atomic<std::uint64_t>>(0);
+    auto file_paths = std::make_shared<std::vector<std::string>>(
+        std::move(shared_config->file_paths));
+    // With zero workers nothing would drain the channel; keep at least one.
+    const auto parallelism =
+        std::max<std::size_t>(1, shared_config->parallelism);
+
+    co_await scope->scope([file_paths, results, indexed, skipped, failed,
+                           total_events, shared_config, parallelism](
+                              CoroScope& child) -> coro::CoroTask<void> {
+        auto file_chan = coro::make_channel<std::size_t>(parallelism * 2);
+        child.spawn(
+            [ch = file_chan->producer(), num_files = file_paths->size()](
+                CoroScope&) mutable -> coro::CoroTask<void> {
+                auto guard = ch.guard();
+                for (std::size_t i = 0; i < num_files; ++i) {
+                    if (!co_await ch.send(i)) co_return;
+                }
+                co_return;
+            });
+        for (std::size_t w = 0; w < parallelism; ++w) {
+            child.spawn([file_chan, file_paths, results, shared_config, indexed,
+                         skipped, failed,
+                         total_events](CoroScope&) -> coro::CoroTask<void> {
+                while (auto idx_opt = co_await file_chan->receive()) {
+                    std::size_t idx = *idx_opt;
+                    IndexBuilderUtility builder;
+                    auto file_config =
+                        IndexBuildConfig::for_file((*file_paths)[idx])
+                            .with_index_dir(shared_config->index_dir)
+                            .with_checkpoint_size(
+                                shared_config->checkpoint_size)
+                            .with_force_rebuild(shared_config->force_rebuild)
+                            .with_manifest(shared_config->build_manifest)
+                            .with_bloom_config(shared_config->bloom_config)
+                            .with_bloom_dimensions(
+                                shared_config->bloom_dimensions);
+                    auto result = co_await builder.process(file_config);
+                    if (result.was_skipped) {
+                        skipped->fetch_add(1, std::memory_order_relaxed);
+                    } else if (result.success) {
+                        indexed->fetch_add(1, std::memory_order_relaxed);
+                        total_events->fetch_add(result.events_processed,
+                                                std::memory_order_relaxed);
+                    } else {
+                        failed->fetch_add(1, std::memory_order_relaxed);
+                    }
+                    (*results)[idx] = std::move(result);
+                }
+                co_return;
+            });
+        }
+        co_return;
+    });
+
+    IndexBuildBatchResult batch_result;
+    batch_result.results = std::move(*results);
+    batch_result.indexed = indexed->load(std::memory_order_relaxed);
+    batch_result.skipped = skipped->load(std::memory_order_relaxed);
+    batch_result.failed = failed->load(std::memory_order_relaxed);
+    batch_result.total_events = total_events->load(std::memory_order_relaxed);
+    co_return batch_result;
+}
+
+namespace {
+
+struct PreparedFile {  // identity of one input file, fixed before parsing
+    std::size_t index = 0;        // position within the batch's file_paths
+    std::string file_path;        // path as given in the batch's file_paths
+    std::string logical_path;     // internal::get_logical_path(file_path)
+    std::string index_path;       // index location used for this batch
+    std::uint64_t file_hash = 0;  // internal::calculate_file_hash(file_path)
+    int file_id = 0;              // registry id, looked up or preassigned
+    IndexBuildBatchConfig::FileSlice slice;  // member slice; may be unset
+};
+
+struct ParsedBloomJob {  // everything one parse pass produced for a file
+    PreparedFile identity;
+    IndexBuildResult result;
+    internal::gzip::GzipBuildArtifacts artifacts;
+    std::unique_ptr<BloomVisitor> bloom_visitor;  // null if never created
+    std::unique_ptr<HashTableVisitor> hash_table_visitor;
+    std::unique_ptr<ManifestVisitor> manifest_visitor;
+    std::vector<std::unique_ptr<composites::dft::DftEventVisitor>>
+        extra_visitors;  // visitors supplied by the DftVisitorFactory
+};
+
+// Registers every file in the index database inside one write context and
+// returns the resulting identities (path, hash, file id) in input order.
+std::vector<PreparedFile> prepare_file_identities(
+    const std::string& index_path, const std::vector<std::string>& file_paths,
+    bool build_manifest) {
+    // The capability set is the same for every file, so build it only once.
+    IndexFileEntryCapability caps =
+        IndexFileEntryCapability::BLOOM |
+        IndexFileEntryCapability::CHECKPOINTS |
+        IndexFileEntryCapability::FILE_SUMMARY |
+        IndexFileEntryCapability::INDEXING_COMPLETE;
+    if (build_manifest) caps |= IndexFileEntryCapability::MANIFEST;
+    IndexDatabase db(index_path);
+    auto writer = db.begin_write();
+    std::vector<PreparedFile> prepared;
+    prepared.reserve(file_paths.size());
+    for (std::size_t i = 0; i < file_paths.size(); ++i) {
+        PreparedFile pf;
+        pf.index = i;
+        pf.file_path = file_paths[i];
+        pf.logical_path = internal::get_logical_path(file_paths[i]);
+        pf.index_path = index_path;
+        pf.file_hash = internal::calculate_file_hash(file_paths[i]);
+        pf.file_id = writer->get_or_create_file_info(pf.logical_path,
+                                                     pf.file_hash, caps);
+        prepared.push_back(std::move(pf));
+    }
+    writer->commit();
+    return prepared;
+}
+
+}  // namespace
+
+struct BatchWriteState {  // per-sub-batch state shared by pipeline stages
+    std::shared_ptr<std::vector<IndexBuildResult>> results;  // one per file
+    std::shared_ptr<std::vector<std::optional<ParsedBloomJob>>> parsed_jobs;
+    std::shared_ptr<std::vector<PreparedFile>> prepared;  // one per file
+    std::shared_ptr<std::vector<std::string>> bloom_dims;
+    std::string index_path;
+    IndexBuildBatchMetrics metrics;  // filled in by run_streaming_pipeline
+    composites::dft::indexing::ChunkIndexerConfig bloom_config;
+    std::size_t num_files = 0;
+    std::size_t parallelism = 0;  // number of parse workers
+    std::size_t checkpoint_size = 0;
+    bool build_manifest = false;
+    IndexBuildBatchConfig::DftVisitorFactory visitor_factory;  // may be empty
+    IndexBuildBatchConfig::SinkFactory sink_factory;  // empty => RocksDB path
+    IndexBuildBatchConfig::SinkCommitFn sink_commit;  // set iff sink_factory
+};
+
+// Claims files one at a time through the shared atomic next_index counter,
+// parses each and streams its bloom/hash/manifest payload to the write
+// channel so write workers can commit while parsing is still in progress.
+// extra_visitors and the result stay behind in parsed_jobs[idx] for
+// finalize_batch_result; the channel item only carries what the write phase
+// needs. A file that fails to parse only records its result in results[idx].
+static coro::CoroTask<void> parse_and_emit_worker(
+    CoroScope* scope, std::atomic<std::size_t>* next_index_ptr,
+    std::vector<IndexBuildResult>* results_ptr,
+    std::vector<std::optional<ParsedBloomJob>>* parsed_jobs_ptr,
+    std::vector<PreparedFile>* prepared_ptr, std::size_t checkpoint_size,
+    composites::dft::indexing::ChunkIndexerConfig bloom_config,
+    const std::vector<std::string>* bloom_dims_ptr,
+    std::atomic<std::uint64_t>* parse_ns_ptr,
+    const IndexBuildBatchConfig::DftVisitorFactory* visitor_factory_ptr,
+    bool build_manifest, coro::ChannelProducer<internal::ParsedIndexJob> ch) {
+    namespace gzip_indexer = internal::gzip;
+    auto guard = ch.guard();
+
+    while (true) {
+        const auto idx =
+            next_index_ptr->fetch_add(1, std::memory_order_relaxed);
+        if (idx >= prepared_ptr->size()) break;
+
+        const auto& pf = (*prepared_ptr)[idx];
+        IndexBuildResult result;
+        result.file_path = pf.file_path;
+        result.index_path = pf.index_path;
+        auto t0 = std::chrono::steady_clock::now();
+        ParsedBloomJob job;
+        job.identity = pf;
+        bool parse_ok = false;
+        try {
+            composites::dft::DftEventDispatcher::VisitorList dft_vis;
+            // Built-in file-scoped visitors are skipped for sliced files
+            // where file-scoped writes are disabled (non-first slice of a
+            // cross-rank-split file). BloomVisitor::ensure_chunk would also
+            // resize chunks_ with a large checkpoint_idx_base.
+            if (!pf.slice.skip_file_scoped_writes) {
+                job.bloom_visitor = std::make_unique<BloomVisitor>(
+                    bloom_config, *bloom_dims_ptr);
+                job.hash_table_visitor = std::make_unique<HashTableVisitor>();
+                dft_vis.emplace_back(*job.bloom_visitor);
+                dft_vis.emplace_back(*job.hash_table_visitor);
+                if (build_manifest) {
+                    job.manifest_visitor = std::make_unique<ManifestVisitor>();
+                    dft_vis.emplace_back(*job.manifest_visitor);
+                }
+            }
+            if (visitor_factory_ptr && *visitor_factory_ptr) {
+                job.extra_visitors = (*visitor_factory_ptr)(pf.file_path);
+                for (auto& v : job.extra_visitors) {
+                    dft_vis.emplace_back(*v);
+                }
+            }
+
+            composites::dft::DftEventDispatcher batch_dispatcher(
+                std::move(dft_vis));
+            internal::Indexer::VisitorList visitors;
+            visitors.emplace_back(batch_dispatcher);
+            gzip_indexer::GzipMemberSlice slice_arg;
+            const gzip_indexer::GzipMemberSlice* slice_ptr = nullptr;
+            if (pf.slice.members != nullptr &&
+                pf.slice.member_end > pf.slice.member_begin) {
+                slice_arg.members = pf.slice.members;
+                slice_arg.member_begin = pf.slice.member_begin;
+                slice_arg.member_end = pf.slice.member_end;
+                slice_arg.checkpoint_idx_base = pf.slice.checkpoint_idx_base;
+                slice_ptr = &slice_arg;
+            }
+            auto arts = co_await gzip_indexer::build_gzip_index_artifacts(
+                pf.file_path, checkpoint_size, visitors, scope, slice_ptr);
+            if (!arts) {
+                result.error_message = "Failed to build gzip index artifacts";
+            } else {
+                job.artifacts = std::move(*arts);
+                result.total_lines =
+                    static_cast<std::size_t>(job.artifacts.total_lines);
+                result.chunks_processed = job.artifacts.checkpoints.size();
+                if (job.bloom_visitor) {
+                    result.events_processed = static_cast<std::size_t>(
+                        job.bloom_visitor->total_events());
+                }
+                // Mark success only after every extra visitor completed, so
+                // a throwing on_file_complete() cannot leave success == true.
+                for (auto& v : job.extra_visitors) {
+                    co_await v->on_file_complete();
+                }
+                result.index_created = true;
+                result.success = true;
+                job.result = result;
+                parse_ok = true;
+            }
+        } catch (const std::exception& e) {
+            result.error_message = e.what();
+        }
+
+        auto t1 = std::chrono::steady_clock::now();
+        parse_ns_ptr->fetch_add(
+            static_cast<std::uint64_t>(
+                std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0)
+                    .count()),
+            std::memory_order_relaxed);
+
+        if (!parse_ok) {
+            (*results_ptr)[idx] = std::move(result);
+            continue;
+        }
+
+        // Sliced rank with member_begin > 0: skip file-scoped channel send;
+        // aggregation SSTs already produced via extra visitors are kept in
+        // parsed_jobs[idx] for downstream collection.
+        if (pf.slice.skip_file_scoped_writes) {
+            (*results_ptr)[idx] = std::move(result);
+            (*parsed_jobs_ptr)[idx] = std::move(job);
+            continue;
+        }
+
+        // Build the channel-bound payload (move bloom/hash/manifest into it),
+        // and leave extra_visitors + result behind in parsed_jobs[idx].
+        internal::ParsedIndexJob send_job;
+        send_job.file_id = pf.file_id;
+        send_job.file_path = pf.file_path;
+        send_job.artifacts = std::move(job.artifacts);
+        send_job.bloom_visitor = std::move(job.bloom_visitor);
+        send_job.hash_table_visitor = std::move(job.hash_table_visitor);
+        send_job.manifest_visitor = std::move(job.manifest_visitor);
+        send_job.success = true;
+
+        ParsedBloomJob holder;
+        holder.identity = pf;
+        holder.extra_visitors = std::move(job.extra_visitors);
+        holder.result = result;
+        (*parsed_jobs_ptr)[idx] = std::move(holder);
+        (*results_ptr)[idx] = std::move(result);
+
+        if (!co_await ch.send(std::move(send_job))) co_return;
+    }
+    co_return;
+}
+
+// Streaming parse + write pipeline: parse-and-emit workers and write workers
+// run concurrently inside one scope. Parse workers act as multiple producers
+// on the write channel (each holds its own ProducerGuard); the channel closes
+// for sends when all parse workers exit, after which write workers finish
+// draining buffered items, do their final flush, and exit. Memory is bounded
+// by the channel capacity (write_workers * WRITE_BATCH_SIZE) so peak heap
+// stays bounded regardless of total file count.
+static coro::CoroTask<void> run_streaming_pipeline(CoroScope* scope,
+                                                   BatchWriteState* state) {
+    static constexpr std::size_t WRITE_BATCH_SIZE = 64;
+    // A parallelism of 0 would spawn no workers at all; always run at least 1.
+    const auto parse_workers = std::max<std::size_t>(1, state->parallelism);
+    // Write workers are decoupled from parse workers to control SST count.
+    // Lower bound parse_workers / 3 reflects the empirical write-vs-parse CPU
+    // ratio for the bloom indexer (~3x). Lower bound num_files / batch_size
+    // gives large workloads, where total SST count is bounded by
+    // ceil(num_files / batch_size) anyway, full parallelism. The larger of
+    // the two (but at least 4) is used, capped at parse_workers.
+    const auto write_workers = std::min(
+        parse_workers, std::max<std::size_t>(
+                           4, std::max(parse_workers / 3,
+                                       state->num_files / WRITE_BATCH_SIZE)));
+
+    DFTRACER_UTILS_LOG_INFO(
+        "IndexBatch: streaming pipeline begin (%zu files, parse_workers=%zu "
+        "write_workers=%zu)",
+        state->num_files, parse_workers, write_workers);
+
+    // GCC 12 coroutine bug: capturing shared_ptr by value in coroutine
+    // lambdas corrupts refcount. Keep shared_ptrs at this scope and pass
+    // raw pointers to lambdas.
+    auto write_chan = coro::make_channel<internal::ParsedIndexJob>(
+        write_workers * WRITE_BATCH_SIZE);
+    auto writer_metrics = std::make_shared<internal::BatchWriterMetrics>();
+
+    // Only open the RocksDB-backed DB when no external sink factory is
+    // provided. The distributed SST path routes writes through caller-owned
+    // SstWriterContext instances and must not hold a process-exclusive
+    // RocksDB handle on the target index dir.
+    std::shared_ptr<IndexDatabase> writer_db;
+    if (!state->sink_factory) {
+        writer_db = std::make_shared<IndexDatabase>(state->index_path);
+    }
+
+    auto next_index = std::make_shared<std::atomic<std::size_t>>(0);
+    auto parse_ns = std::make_shared<std::atomic<std::uint64_t>>(0);
+    auto bloom_config_holder =
+        std::make_shared<composites::dft::indexing::ChunkIndexerConfig>(
+            state->bloom_config);
+    auto* next_index_ptr = next_index.get();
+    auto* parse_ns_ptr = parse_ns.get();
+    auto* results_ptr = state->results.get();
+    auto* parsed_jobs_ptr = state->parsed_jobs.get();
+    auto* prepared_ptr = state->prepared.get();
+    auto* db_ptr = writer_db.get();
+    auto* metrics_ptr = writer_metrics.get();
+    auto* write_chan_ptr = write_chan.get();
+    const auto* bloom_config_ptr = bloom_config_holder.get();
+    const auto* bloom_dims_ptr = state->bloom_dims.get();
+    const auto checkpoint_size = state->checkpoint_size;
+    const bool build_manifest = state->build_manifest;
+    const IndexBuildBatchConfig::DftVisitorFactory* visitor_factory_ptr =
+        state->visitor_factory ? &state->visitor_factory : nullptr;
+    auto* sink_factory_ptr = &state->sink_factory;
+    auto* sink_commit_ptr = &state->sink_commit;
+
+    co_await scope->scope([parse_workers, write_workers, next_index_ptr,
+                           parse_ns_ptr, results_ptr, parsed_jobs_ptr,
+                           prepared_ptr, checkpoint_size, bloom_config_ptr,
+                           bloom_dims_ptr, visitor_factory_ptr, build_manifest,
+                           write_chan_ptr, db_ptr, metrics_ptr,
+                           sink_factory_ptr, sink_commit_ptr](
+                              CoroScope& child) -> coro::CoroTask<void> {
+        for (std::size_t w = 0; w < parse_workers; ++w) {
+            child.spawn(
+                [next_index_ptr, parse_ns_ptr, results_ptr, parsed_jobs_ptr,
+                 prepared_ptr, checkpoint_size, bloom_config_ptr,
+                 bloom_dims_ptr, visitor_factory_ptr, build_manifest,
+                 ch = write_chan_ptr->producer()](
+                    CoroScope& own_scope) mutable -> coro::CoroTask<void> {
+                    co_await parse_and_emit_worker(
+                        &own_scope, next_index_ptr, results_ptr,
+                        parsed_jobs_ptr, prepared_ptr, checkpoint_size,
+                        *bloom_config_ptr, bloom_dims_ptr, parse_ns_ptr,
+                        visitor_factory_ptr, build_manifest, std::move(ch));
+                });
+        }
+
+        for (std::size_t w = 0; w < write_workers; ++w) {
+            child.spawn([write_chan_ptr, db_ptr, metrics_ptr, sink_factory_ptr,
+                         sink_commit_ptr](CoroScope&) -> coro::CoroTask<void> {
+                if (*sink_factory_ptr) {
+                    co_await internal::index_batch_write_worker(
+                        write_chan_ptr, WRITE_BATCH_SIZE, metrics_ptr,
+                        *sink_factory_ptr, *sink_commit_ptr);
+                } else {
+                    co_await internal::index_batch_write_worker(
+                        write_chan_ptr, WRITE_BATCH_SIZE, metrics_ptr,
+                        [db_ptr] { return db_ptr->begin_write(); },
+                        [](IndexBatchSink& sink) {
+                            static_cast<IndexDatabaseWriterContext&>(sink)
+                                .commit();
+                        });
+                }
+            });
+        }
+        co_return;
+    });
+
+    state->metrics.parse_ns = parse_ns->load(std::memory_order_relaxed);
+    state->metrics.files_parsed = state->num_files;
+    state->metrics.write_ns =
+        writer_metrics->write_ns.load(std::memory_order_relaxed);
+    state->metrics.files_written =
+        writer_metrics->files_written.load(std::memory_order_relaxed);
+    DFTRACER_UTILS_LOG_INFO(
+        "IndexBatch: streaming pipeline complete (parsed=%zu written=%zu)",
+        state->num_files, state->metrics.files_written);
+    co_return;
+}
+
+// Builds the per-batch state from `config`. Validation runs first so that a
+// bad config throws before the registry is written or `config` is moved from.
+static std::unique_ptr<BatchWriteState> init_batch_write_state(
+    IndexBuildBatchConfig& config) {
+    if (!config.file_slices.empty() &&
+        config.file_slices.size() != config.file_paths.size()) {
+        throw std::runtime_error(
+            "file_slices.size() must match file_paths.size() (or be empty)");
+    }
+    if (!config.preassigned_file_ids.empty() &&
+        config.preassigned_file_ids.size() != config.file_paths.size()) {
+        throw std::runtime_error(
+            "preassigned_file_ids.size() must match file_paths.size()");
+    }
+    if (static_cast<bool>(config.sink_factory) !=
+        static_cast<bool>(config.sink_commit)) {
+        throw std::runtime_error(
+            "IndexBuildBatchConfig: sink_factory and sink_commit must be set "
+            "together (either both null for the default RocksDB path, or "
+            "both non-null for the distributed SST path).");
+    }
+    auto state = std::make_unique<BatchWriteState>();
+    state->num_files = config.file_paths.size();
+    state->parallelism = config.parallelism;
+    state->checkpoint_size = config.checkpoint_size;
+    state->bloom_config = config.bloom_config;
+    state->build_manifest = config.build_manifest;
+    state->bloom_dims = std::make_shared<std::vector<std::string>>(
+        config.bloom_dimensions.empty()
+            ? std::vector<std::string>(DEFAULT_BLOOM_DIMENSIONS.begin(),
+                                       DEFAULT_BLOOM_DIMENSIONS.end())
+            : std::move(config.bloom_dimensions));
+    state->results =
+        std::make_shared<std::vector<IndexBuildResult>>(state->num_files);
+    state->index_path =
+        determine_index_path(config.file_paths.front(), config.index_dir);
+    std::vector<PreparedFile> prepared;
+    if (!config.preassigned_file_ids.empty()) {
+        // Distributed path: coordinator has already registered files and
+        // assigned ids. Skip the DEFAULT-CF registry open/write step.
+        prepared.reserve(config.file_paths.size());
+        for (std::size_t i = 0; i < config.file_paths.size(); ++i) {
+            PreparedFile pf;
+            pf.index = i;
+            pf.file_path = config.file_paths[i];
+            pf.logical_path = internal::get_logical_path(config.file_paths[i]);
+            pf.index_path = state->index_path;
+            pf.file_hash = internal::calculate_file_hash(config.file_paths[i]);
+            pf.file_id = config.preassigned_file_ids[i];
+            prepared.push_back(std::move(pf));
+        }
+    } else {
+        prepared = prepare_file_identities(
+            state->index_path, config.file_paths, config.build_manifest);
+    }
+    if (!config.file_slices.empty()) {
+        for (std::size_t i = 0; i < prepared.size(); ++i) {
+            prepared[i].slice = config.file_slices[i];
+        }
+    }
+    state->prepared =
+        std::make_shared<std::vector<PreparedFile>>(std::move(prepared));
+    state->parsed_jobs =
+        std::make_shared<std::vector<std::optional<ParsedBloomJob>>>(
+            state->num_files);
+    if (config.dft_visitor_factory) {
+        state->visitor_factory = std::move(config.dft_visitor_factory);
+    }
+    state->sink_factory = std::move(config.sink_factory);
+    state->sink_commit = std::move(config.sink_commit);
+    return state;
+}
+
+// Moves results and leftover extra visitors into `out` and tallies counts.
+static void finalize_batch_result(BatchWriteState* state,
+                                  IndexBuildBatchResult* out) {
+    out->results = std::move(*state->results);
+    out->metrics = state->metrics;
+    out->metrics.files_enqueued = state->num_files;
+    auto& parsed = *state->parsed_jobs;
+    out->extra_visitors.resize(state->num_files);
+    for (std::size_t slot = 0; slot < state->num_files; ++slot) {
+        if (!parsed[slot] || parsed[slot]->extra_visitors.empty()) continue;
+        out->extra_visitors[slot] = std::move(parsed[slot]->extra_visitors);
+    }
+    for (const auto& entry : out->results) {
+        if (entry.was_skipped) {
+            ++out->skipped;
+            continue;
+        }
+        if (!entry.success) {
+            ++out->failed;
+            continue;
+        }
+        ++out->indexed;
+        out->total_events += entry.events_processed;
+    }
+}
+
+static void run_rebuild_root_summaries(const std::string& index_path) {
+    IndexDatabase index_db(index_path);  // opened only for this rebuild
+    auto txn = index_db.begin_write();
+    txn->rebuild_root_summaries();
+    txn->commit();
+}
+
+static coro::CoroTask<IndexBuildBatchResult> run_single_batch(
+    CoroScope* scope, IndexBuildBatchConfig chunk_config) {
+    auto batch_state = init_batch_write_state(chunk_config);  // may throw
+    co_await run_streaming_pipeline(scope, batch_state.get());
+    IndexBuildBatchResult batch_out;
+    finalize_batch_result(batch_state.get(), &batch_out);
+    co_return batch_out;
+}
+
+// Appends one sub-batch outcome to the running total; `partial` is consumed.
+static void merge_partial_into(IndexBuildBatchResult& out,
+                               IndexBuildBatchResult partial) {
+    for (auto& one : partial.results) out.results.push_back(std::move(one));
+    out.indexed += partial.indexed;
+    out.skipped += partial.skipped;
+    out.failed += partial.failed;
+    out.total_events += partial.total_events;
+    auto& totals = out.metrics;
+    totals.parse_ns += partial.metrics.parse_ns;
+    totals.write_ns += partial.metrics.write_ns;
+    totals.files_enqueued += partial.metrics.files_enqueued;
+    totals.files_parsed += partial.metrics.files_parsed;
+    totals.files_written += partial.metrics.files_written;
+    for (auto& visitors : partial.extra_visitors) {
+        out.extra_visitors.push_back(std::move(visitors));
+    }
+}
+
+// Splits the batch into sub-batches of at most flush_every_files files, runs
+// each through run_single_batch() in order and merges the partial results.
+// Root summaries are rebuilt once at the end when the config asks for it.
+static coro::CoroTask<IndexBuildBatchResult> run_batch_write_pipeline(
+    CoroScope* scope, std::shared_ptr<IndexBuildBatchConfig> config_ptr) {
+    const auto& cfg = *config_ptr;  // only used before config_ptr.reset()
+    const bool rebuild_at_end = cfg.rebuild_root_summaries;
+    const std::size_t total = cfg.file_paths.size();
+    const std::size_t limit = cfg.flush_every_files;
+    // A limit of 0, or one covering the whole batch, means a single pass.
+    const std::size_t chunk_size =
+        (limit == 0 || limit >= total) ? total : limit;
+    const std::size_t num_sub_batches = (total + chunk_size - 1) / chunk_size;
+
+    IndexBuildBatchResult merged;
+    const auto index_path =
+        determine_index_path(cfg.file_paths.front(), cfg.index_dir);
+
+    std::size_t sub_batch_no = 1;
+    for (std::size_t first = 0; first < total; first += chunk_size) {
+        const std::size_t last = std::min(first + chunk_size, total);
+        DFTRACER_UTILS_LOG_INFO(
+            "IndexBatch: sub-batch %zu/%zu begin (files %zu..%zu of %zu)",
+            sub_batch_no, num_sub_batches, first, last - 1, total);
+        const auto lo = static_cast<std::ptrdiff_t>(first);
+        const auto hi = static_cast<std::ptrdiff_t>(last);
+
+        IndexBuildBatchConfig chunk;
+        chunk.file_paths.assign(cfg.file_paths.begin() + lo,
+                                cfg.file_paths.begin() + hi);
+        if (!cfg.preassigned_file_ids.empty()) {
+            chunk.preassigned_file_ids.assign(
+                cfg.preassigned_file_ids.begin() + lo,
+                cfg.preassigned_file_ids.begin() + hi);
+        }
+        if (!cfg.file_slices.empty()) {
+            chunk.file_slices.assign(cfg.file_slices.begin() + lo,
+                                     cfg.file_slices.begin() + hi);
+        }
+        chunk.sink_factory = cfg.sink_factory;
+        chunk.sink_commit = cfg.sink_commit;
+        chunk.index_dir = cfg.index_dir;
+        chunk.checkpoint_size = cfg.checkpoint_size;
+        chunk.parallelism = cfg.parallelism;
+        chunk.force_rebuild = cfg.force_rebuild;
+        chunk.build_manifest = cfg.build_manifest;
+        chunk.bloom_config = cfg.bloom_config;
+        chunk.bloom_dimensions = cfg.bloom_dimensions;
+        chunk.use_batch_write = true;
+        chunk.rebuild_root_summaries = false;
+        chunk.dft_visitor_factory = cfg.dft_visitor_factory;
+
+        auto partial = co_await run_single_batch(scope, std::move(chunk));
+        if (config_ptr->extra_visitors_drain) {
+            auto handed_off = std::move(partial.extra_visitors);
+            partial.extra_visitors.clear();
+            config_ptr->extra_visitors_drain(std::move(handed_off));
+        }
+        DFTRACER_UTILS_LOG_INFO(
+            "IndexBatch: sub-batch %zu/%zu complete (indexed=%zu skipped=%zu "
+            "failed=%zu)",
+            sub_batch_no, num_sub_batches, partial.indexed, partial.skipped,
+            partial.failed);
+        merge_partial_into(merged, std::move(partial));
+        ++sub_batch_no;
+    }
+
+    config_ptr.reset();
+    if (rebuild_at_end) {
+        run_rebuild_root_summaries(index_path);
+    }
+
+    co_return merged;
+}
+
+coro::CoroTask<IndexBuildBatchResult> IndexBatchBuilderUtility::process(
+    CoroScope* scope, std::shared_ptr<IndexBuildBatchConfig> config_ptr) {
+    // A missing config or an empty file list yields an empty batch result.
+    const bool has_work = config_ptr && !config_ptr->file_paths.empty();
+    if (!has_work) co_return IndexBuildBatchResult{};
+    if (!config_ptr->use_batch_write) {
+        co_return co_await process_batch_per_file(scope,
+                                                  std::move(config_ptr));
+    }
+    co_return co_await run_batch_write_pipeline(scope, std::move(config_ptr));
+}
+
 }  // namespace dftracer::utils::utilities::indexer
diff --git a/src/dftracer/utils/utilities/indexer/index_database.cpp b/src/dftracer/utils/utilities/indexer/index_database.cpp
index 4cc3165c..453787e1 100644
--- a/src/dftracer/utils/utilities/indexer/index_database.cpp
+++ b/src/dftracer/utils/utilities/indexer/index_database.cpp
@@ -1,17 +1,27 @@
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
 #include <dftracer/utils/core/rocksdb/key_codec.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_merge_operator.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/association_tracker.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/error.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
+#include <dftracer/utils/utilities/indexer/internal/index_encoding.h>
+#include <dftracer/utils/utilities/indexer/internal/payload_codec.h>
 #include <dftracer/utils/utilities/indexer/internal/scan_prefix.h>
+#include <dftracer/utils/utilities/indexer/internal/statistics_codec.h>
 
 #include <algorithm>
 #include <array>
 #include <cstring>
 #include <limits>
 #include <optional>
+#include <shared_mutex>
 #include <stdexcept>
 #include <utility>
 
@@ -19,12 +29,13 @@ namespace dftracer::utils::utilities::indexer {
 
 namespace queries = composites::dft::indexing::queries;
 namespace rocks = dftracer::utils::rocksdb;
+namespace cf = rocks::cf;
 
-using internal::IndexerError;
+using namespace internal;
 
 namespace {
 
-constexpr std::uint32_t kSchemaVersion = 1;
+constexpr std::uint32_t SCHEMA_VERSION = 1;
 
 [[noreturn]] void throw_db_error(std::string_view message,
                                  const ::rocksdb::Status& status) {
@@ -32,80 +43,6 @@ constexpr std::uint32_t kSchemaVersion = 1;
                        std::string(message) + ": " + status.ToString());
 }
 
-void append_u8(std::string& out, std::uint8_t value) {
-    out.push_back(static_cast<char>(value));
-}
-
-void append_i64(std::string& out, std::int64_t value) {
-    rocks::KeyCodec::append_be64(out, static_cast<std::uint64_t>(value));
-}
-
-void append_u64(std::string& out, std::uint64_t value) {
-    rocks::KeyCodec::append_be64(out, value);
-}
-
-void append_double(std::string& out, double value) {
-    static_assert(sizeof(double) == sizeof(std::uint64_t));
-    std::uint64_t bits = 0;
-    std::memcpy(&bits, &value, sizeof(bits));
-    append_u64(out, bits);
-}
-
-void append_string(std::string& out, std::string_view value) {
-    rocks::KeyCodec::append_be32(out, static_cast<std::uint32_t>(value.size()));
-    out.append(value.data(), value.size());
-}
-
-void append_blob(std::string& out, std::span<const unsigned char> blob) {
-    rocks::KeyCodec::append_be32(out, static_cast<std::uint32_t>(blob.size()));
-    out.append(reinterpret_cast<const char*>(blob.data()), blob.size());
-}
-
-class Cursor {
-   public:
-    explicit Cursor(std::string_view data) : data_(data) {}
-
-    std::uint8_t u8() { return static_cast<std::uint8_t>(take(1)[0]); }
-
-    std::uint32_t u32() { return rocks::KeyCodec::decode_be32(take(4)); }
-
-    std::uint64_t u64() { return rocks::KeyCodec::decode_be64(take(8)); }
-
-    std::int64_t i64() { return static_cast<std::int64_t>(u64()); }
-
-    double f64() {
-        std::uint64_t bits = u64();
-        double value = 0.0;
-        std::memcpy(&value, &bits, sizeof(value));
-        return value;
-    }
-
-    std::string str() {
-        auto len = static_cast<std::size_t>(u32());
-        auto bytes = take(len);
-        return std::string(bytes.data(), bytes.size());
-    }
-
-    std::vector<unsigned char> blob() {
-        auto len = static_cast<std::size_t>(u32());
-        auto bytes = take(len);
-        return std::vector<unsigned char>(bytes.begin(), bytes.end());
-    }
-
-   private:
-    std::string_view take(std::size_t len) {
-        if (offset_ + len > data_.size()) {
-            throw std::runtime_error("Corrupt RocksDB payload");
-        }
-        auto chunk = data_.substr(offset_, len);
-        offset_ += len;
-        return chunk;
-    }
-
-    std::string_view data_;
-    std::size_t offset_ = 0;
-};
-
 std::string file_lookup_key(std::string_view logical_name) {
     return std::string("f|") + std::string(logical_name);
 }
@@ -116,16 +53,14 @@ std::string file_reverse_key(int file_id) {
     return key;
 }
 
-std::string next_file_id_key() { return "_next_file_id"; }
 std::string schema_version_key() { return "_schema_version"; }
 
-std::string encode_file_record(int file_id, std::uint64_t file_hash) {
-    std::string value;
-    rocks::KeyCodec::append_be32(value, static_cast<std::uint32_t>(file_id));
-    append_u64(value, 0);
-    append_u64(value, 0);
-    append_u64(value, file_hash);
-    return value;
+IndexFileEntryCapability decode_file_capabilities(std::string_view record) {
+    if (record.size() < 5) {
+        return IndexFileEntryCapability::NONE;
+    }
+    return static_cast<IndexFileEntryCapability>(
+        static_cast<std::uint8_t>(record[4]));
 }
 
 int decode_file_id(std::string_view record) {
@@ -135,6 +70,13 @@ int decode_file_id(std::string_view record) {
     return static_cast<int>(rocks::KeyCodec::decode_be32(record.substr(0, 4)));
 }
 
+// Decodes the big-endian 32-bit file id stored in the first 4 bytes of `key`.
+int decode_prefixed_file_id(std::string_view key) {
+    if (key.size() < 4) throw std::runtime_error("Corrupt file-prefixed key");
+    const auto id_bytes = key.substr(0, 4);
+    return static_cast<int>(rocks::KeyCodec::decode_be32(id_bytes));
+}
+
 std::uint64_t decode_file_hash(std::string_view record) {
     if (record.size() < 28) {
         throw std::runtime_error("Corrupt file record");
@@ -142,41 +84,7 @@ std::uint64_t decode_file_hash(std::string_view record) {
     return rocks::KeyCodec::decode_be64(record.substr(20, 8));
 }
 
-std::string prefix_for_file(int file_id) {
-    return rocks::KeyCodec::encode_be32(static_cast<std::uint32_t>(file_id));
-}
-
-std::string make_hash_owner_key(int file_id, std::string_view dimension,
-                                std::string_view hash_value) {
-    std::string key("o|");
-    rocks::KeyCodec::append_be32(key, static_cast<std::uint32_t>(file_id));
-    key.push_back('\0');
-    key.append(dimension);
-    key.push_back('\0');
-    key.append(hash_value);
-    return key;
-}
-
-std::string make_hash_forward_key(std::string_view dimension,
-                                  std::string_view hash_value) {
-    std::string key("h|");
-    key.append(dimension);
-    key.push_back('\0');
-    key.append(hash_value);
-    return key;
-}
-
-std::string make_hash_reverse_key(std::string_view dimension,
-                                  std::string_view resolved_value,
-                                  std::string_view hash_value) {
-    std::string key("H|");
-    key.append(dimension);
-    key.push_back('\0');
-    key.append(resolved_value);
-    key.push_back('\0');
-    key.append(hash_value);
-    return key;
-}
+using encoding::prefix_for_file;
 
 std::string make_dimension_key(int file_id, std::string_view dimension) {
     std::string key("d|");
@@ -185,88 +93,38 @@ std::string make_dimension_key(int file_id, std::string_view dimension) {
     return key;
 }
 
-std::string chunk_bloom_key(int file_id, std::string_view dimension,
-                            std::uint64_t checkpoint_idx) {
-    std::string key = prefix_for_file(file_id);
-    key.append(dimension);
-    key.push_back('\0');
-    append_u64(key, checkpoint_idx);
-    return key;
-}
-
 std::string file_bloom_key(int file_id, std::string_view dimension) {
     std::string key = prefix_for_file(file_id);
     key.append(dimension);
     return key;
 }
 
-std::string chunk_stats_key(int file_id, std::uint64_t checkpoint_idx) {
-    std::string key = prefix_for_file(file_id);
-    append_u64(key, checkpoint_idx);
-    return key;
+using encoding::metadata_key;
+std::string file_scalar_stats_key(int file_id) {
+    return prefix_for_file(file_id);
 }
-
-std::string checkpoint_key(int file_id, std::uint64_t uc_offset,
-                           std::uint64_t checkpoint_idx) {
-    std::string key = prefix_for_file(file_id);
-    append_u64(key, uc_offset);
-    append_u64(key, checkpoint_idx);
-    return key;
+std::string file_category_counts_key(int file_id) {
+    return prefix_for_file(file_id);
 }
-
-std::string chunk_dim_stats_key(int file_id, std::uint64_t checkpoint_idx,
-                                std::string_view dimension) {
-    std::string key = prefix_for_file(file_id);
-    append_u64(key, checkpoint_idx);
-    key.append(dimension);
-    return key;
+std::string file_pid_tid_counts_key(int file_id) {
+    return prefix_for_file(file_id);
 }
-
-std::string manifest_event_key(int file_id, std::uint64_t checkpoint_idx,
-                               std::string_view cat, std::string_view name) {
-    std::string key("E|");
-    rocks::KeyCodec::append_be32(key, static_cast<std::uint32_t>(file_id));
-    append_u64(key, checkpoint_idx);
-    key.append(cat);
-    key.push_back('\0');
-    key.append(name);
-    return key;
+std::string file_name_counts_key(int file_id) {
+    return prefix_for_file(file_id);
 }
+std::string root_scalar_stats_key() { return "_root"; }
+std::string root_category_counts_key() { return "_root"; }
+std::string root_name_counts_key() { return "_root"; }
+std::string root_pid_tid_counts_key() { return "_root"; }
 
-std::string manifest_metadata_key(int file_id, std::uint64_t checkpoint_idx,
-                                  std::string_view meta_type) {
-    std::string key("M|");
-    rocks::KeyCodec::append_be32(key, static_cast<std::uint32_t>(file_id));
-    append_u64(key, checkpoint_idx);
-    key.append(meta_type);
-    return key;
-}
-
-std::string metadata_key(int file_id) { return prefix_for_file(file_id); }
-
+using encoding::name_lookup_key;
+using encoding::name_reverse_key;
 std::string tar_archive_key(int file_id) { return prefix_for_file(file_id); }
 
-std::string tar_file_key(int file_id, std::uint64_t uncompressed_offset,
-                         std::string_view file_name) {
-    std::string key = prefix_for_file(file_id);
-    append_u64(key, uncompressed_offset);
-    key.push_back('\0');
-    key.append(file_name);
-    return key;
-}
-
-std::string encode_bloom_value(std::span<const unsigned char> blob,
-                               std::uint64_t num_entries) {
-    std::string value;
-    append_u64(value, num_entries);
-    value.append(reinterpret_cast<const char*>(blob.data()), blob.size());
-    return value;
-}
-
-IndexDatabase::ChunkBloomResult decode_chunk_bloom(std::string_view key,
-                                                   std::string_view value,
-                                                   std::size_t prefix_size) {
-    IndexDatabase::ChunkBloomResult result;
+ChunkBloomResult decode_chunk_bloom(std::string_view key,
+                                    std::string_view value,
+                                    std::size_t prefix_size) {
+    ChunkBloomResult result;
     auto checkpoint_pos = key.find('\0', prefix_size);
     if (checkpoint_pos == std::string_view::npos ||
         checkpoint_pos + 1 + 8 > key.size()) {
@@ -282,47 +140,19 @@ IndexDatabase::ChunkBloomResult decode_chunk_bloom(std::string_view key,
     return result;
 }
 
-IndexDatabase::FileBloomResult decode_file_bloom(std::string_view value) {
+FileBloomResult decode_file_bloom(std::string_view value) {
     if (value.size() < 8) {
         throw std::runtime_error("Corrupt file bloom value");
     }
-    IndexDatabase::FileBloomResult result;
+    FileBloomResult result;
     result.num_entries = rocks::KeyCodec::decode_be64(value.substr(0, 8));
     result.bloom_data.assign(value.begin() + 8, value.end());
     return result;
 }
 
-std::string encode_chunk_statistics_value(
-    const IndexDatabase::ChunkStatistics& stats) {
-    std::string value;
-    append_u64(value, stats.total_events);
-    append_u64(value, stats.min_timestamp_us);
-    append_u64(value, stats.max_timestamp_us);
-    append_i64(value, stats.duration_sum_us);
-    append_u64(value, stats.duration_min_us);
-    append_u64(value, stats.duration_max_us);
-    append_u64(value, stats.duration_count);
-    append_double(value, stats.duration_m2);
-
-    auto duration_sketch = stats.duration_sketch.serialize();
-    append_blob(value, duration_sketch);
-
-    auto duration_histogram = stats.duration_histogram.to_json();
-    append_string(value, duration_histogram);
-
-    auto name_sketches = stats.serialize_name_duration_sketches();
-    append_blob(value, name_sketches);
-    append_string(value, stats.name_duration_histograms_json());
-    append_string(value, stats.name_duration_sums_json());
-    append_string(value, stats.name_duration_sum_sqs_json());
-    append_string(value, stats.name_category_json());
-    return value;
-}
-
-IndexDatabase::ChunkStatistics decode_chunk_statistics_value(
-    std::string_view value) {
+ChunkStatistics decode_chunk_statistics_value(std::string_view value) {
     Cursor cursor(value);
-    IndexDatabase::ChunkStatistics stats;
+    ChunkStatistics stats;
     stats.total_events = cursor.u64();
     stats.min_timestamp_us = cursor.u64();
     stats.max_timestamp_us = cursor.u64();
@@ -347,42 +177,35 @@ IndexDatabase::ChunkStatistics decode_chunk_statistics_value(
     auto name_sketches = cursor.blob();
     if (!name_sketches.empty()) {
         stats.name_duration_sketches =
-            IndexDatabase::ChunkStatistics::deserialize_name_duration_sketches(
+            ChunkStatistics::deserialize_name_duration_sketches(
                 name_sketches.data(), name_sketches.size());
     }
 
     stats.name_duration_histograms =
-        IndexDatabase::ChunkStatistics::parse_histogram_map_json(cursor.str());
+        ChunkStatistics::parse_histogram_map_json(cursor.str());
     stats.name_duration_sums =
-        IndexDatabase::ChunkStatistics::parse_double_map_json(cursor.str());
+        ChunkStatistics::parse_double_map_json(cursor.str());
     stats.name_duration_sum_sqs =
-        IndexDatabase::ChunkStatistics::parse_double_map_json(cursor.str());
-    stats.name_category =
-        IndexDatabase::ChunkStatistics::parse_string_map_json(cursor.str());
-    return stats;
-}
+        ChunkStatistics::parse_double_map_json(cursor.str());
+    stats.name_category = ChunkStatistics::parse_string_map_json(cursor.str());
 
-std::string encode_checkpoint_value(
-    const IndexDatabase::IndexerCheckpoint& checkpoint) {
-    std::string value;
-    append_u64(value, checkpoint.uc_size);
-    append_u64(value, checkpoint.c_offset);
-    append_u64(value, checkpoint.c_size);
-    append_i64(value, checkpoint.bits);
-    append_blob(value, checkpoint.dict_compressed);
-    append_u64(value, checkpoint.num_lines);
-    append_u64(value, checkpoint.first_line_num);
-    append_u64(value, checkpoint.last_line_num);
-    return value;
+    auto ts_hist_blob = cursor.blob();
+    if (!ts_hist_blob.empty()) {
+        stats.timestamp_histogram =
+            common::statistics::TimestampHistogram::deserialize(
+                ts_hist_blob.data(), ts_hist_blob.size());
+    }
+
+    return stats;
 }
 
-IndexDatabase::IndexerCheckpoint decode_checkpoint(std::string_view key,
-                                                   std::string_view value) {
+IndexerCheckpoint decode_checkpoint(std::string_view key,
+                                    std::string_view value) {
     if (key.size() < 20) {
         throw std::runtime_error("Corrupt checkpoint key");
     }
 
-    IndexDatabase::IndexerCheckpoint checkpoint;
+    IndexerCheckpoint checkpoint;
     checkpoint.uc_offset = rocks::KeyCodec::decode_be64(key.substr(4, 8));
     checkpoint.checkpoint_idx = rocks::KeyCodec::decode_be64(key.substr(12, 8));
 
@@ -398,25 +221,9 @@ IndexDatabase::IndexerCheckpoint decode_checkpoint(std::string_view key,
     return checkpoint;
 }
 
-std::string encode_chunk_dimension_stats_value(
-    const IndexDatabase::ChunkDimensionStats& stats,
-    std::size_t value_counts_cap) {
-    std::string value;
-    append_u64(value, stats.distinct_count);
-    append_string(value, stats.min_value);
-    append_string(value, stats.max_value);
-    append_string(value, stats.value_type);
-    auto compressed = stats.compress_value_counts(value_counts_cap);
-    append_u8(value, compressed.has_value() ? 1 : 0);
-    if (compressed) {
-        append_blob(value, *compressed);
-    }
-    return value;
-}
-
-IndexDatabase::ChunkDimensionStatsResult decode_chunk_dimension_stats_value(
+ChunkDimensionStatsResult decode_chunk_dimension_stats_value(
     std::string_view key, std::string_view value) {
-    IndexDatabase::ChunkDimensionStatsResult result;
+    ChunkDimensionStatsResult result;
     if (key.size() < 12) {
         throw std::runtime_error("Corrupt chunk dimension stats key");
     }
@@ -430,63 +237,71 @@ IndexDatabase::ChunkDimensionStatsResult decode_chunk_dimension_stats_value(
     result.value_type = cursor.str();
     if (cursor.u8() != 0) {
         auto compressed = cursor.blob();
-        result.value_counts =
-            IndexDatabase::ChunkDimensionStats::decompress_value_counts(
-                compressed.data(), compressed.size());
+        // Defer decompression
+        result.compressed_value_counts.assign(compressed.begin(),
+                                              compressed.end());
     }
     return result;
 }
 
-std::string encode_event_range_value(std::span<const std::uint32_t> lines) {
-    std::vector<std::uint32_t> vec(lines.begin(), lines.end());
-    auto blob = queries::pack_line_numbers(vec);
-    std::string value;
-    append_u64(value, vec.size());
-    append_blob(value, blob);
-    return value;
-}
-
 std::vector<std::uint32_t> decode_line_numbers(Cursor& cursor) {
     auto blob = cursor.blob();
     return queries::unpack_line_numbers(blob.data(), blob.size());
 }
 
-std::string encode_metadata_value(std::span<const std::uint32_t> lines) {
-    std::vector<std::uint32_t> vec(lines.begin(), lines.end());
-    auto blob = queries::pack_line_numbers(vec);
-    std::string value;
-    append_blob(value, blob);
-    return value;
+StringViewMap<std::uint64_t> decode_count_map_value(std::string_view value) {
+    Cursor cursor(value);
+    StringViewMap<std::uint64_t> counts;
+    auto num_entries = cursor.u32();
+    counts.reserve(num_entries);
+    for (std::uint32_t i = 0; i < num_entries; ++i) {
+        auto key = cursor.str();
+        counts.emplace(std::move(key), cursor.u64());
+    }
+    return counts;
 }
 
-std::string encode_metadata_record(std::uint64_t checkpoint_size,
-                                   std::uint64_t total_lines,
-                                   std::uint64_t total_uc_size) {
-    std::string value;
-    append_u64(value, checkpoint_size);
-    append_u64(value, total_lines);
-    append_u64(value, total_uc_size);
-    return value;
+NameSummaryResult decode_name_summary_value(std::string_view value) {
+    Cursor cursor(value);
+    NameSummaryResult result;
+    auto num_entries = cursor.u32();
+    result.other_count = cursor.u64();
+    result.unique_count = cursor.u64();
+    result.counts.reserve(num_entries);
+    for (std::uint32_t i = 0; i < num_entries; ++i) {
+        auto key = cursor.str();
+        result.counts.emplace(std::move(key), cursor.u64());
+    }
+    return result;
 }
 
-std::string encode_tar_archive_value(std::string_view archive_name,
-                                     std::uint64_t checkpoint_size,
-                                     std::uint64_t total_lines,
-                                     std::uint64_t total_uc_size,
-                                     std::uint64_t total_files) {
-    std::string value;
-    append_string(value, archive_name);
-    append_u64(value, checkpoint_size);
-    append_u64(value, total_lines);
-    append_u64(value, total_uc_size);
-    append_u64(value, total_files);
-    return value;
+template <typename Callback>
+void for_each_count_map_entry(std::string_view value, Callback&& callback) {
+    Cursor cursor(value);
+    auto num_entries = cursor.u32();
+    for (std::uint32_t i = 0; i < num_entries; ++i) {
+        auto key = cursor.str_view();
+        auto count = cursor.u64();
+        callback(key, count);
+    }
 }
 
-IndexDatabase::TarArchiveMetadata decode_tar_archive_value(
-    std::string_view value) {
+template <typename Callback>
+void for_each_name_summary_entry(std::string_view value, Callback&& callback) {
     Cursor cursor(value);
-    IndexDatabase::TarArchiveMetadata metadata;
+    auto num_entries = cursor.u32();
+    (void)cursor.u64();  // other_count
+    (void)cursor.u64();  // unique_count
+    for (std::uint32_t i = 0; i < num_entries; ++i) {
+        auto key = cursor.str_view();
+        auto count = cursor.u64();
+        callback(key, count);
+    }
+}
+
+TarArchiveMetadata decode_tar_archive_value(std::string_view value) {
+    Cursor cursor(value);
+    TarArchiveMetadata metadata;
     metadata.archive_name = cursor.str();
     metadata.checkpoint_size = cursor.u64();
     metadata.total_lines = cursor.u64();
@@ -495,17 +310,7 @@ IndexDatabase::TarArchiveMetadata decode_tar_archive_value(
     return metadata;
 }
 
-std::string encode_tar_file_value(const IndexDatabase::TarFileRecord& record) {
-    std::string value;
-    append_u64(value, record.file_size);
-    append_u64(value, record.file_mtime);
-    append_u8(value, static_cast<std::uint8_t>(record.typeflag));
-    append_u64(value, record.data_offset);
-    return value;
-}
-
-IndexDatabase::TarFileRecord decode_tar_file(std::string_view key,
-                                             std::string_view value) {
+TarFileRecord decode_tar_file(std::string_view key, std::string_view value) {
     if (key.size() < 13) {
         throw std::runtime_error("Corrupt tar file key");
     }
@@ -516,7 +321,7 @@ IndexDatabase::TarFileRecord decode_tar_file(std::string_view key,
     }
 
     Cursor cursor(value);
-    IndexDatabase::TarFileRecord record;
+    TarFileRecord record;
     record.uncompressed_offset = rocks::KeyCodec::decode_be64(key.substr(4, 8));
     record.file_name = std::string(key.substr(name_pos + 1));
     record.file_size = cursor.u64();
@@ -551,22 +356,287 @@ void scan_prefix(const rocks::RocksDatabase& db, std::string_view column_family,
 
 }  // namespace
 
+namespace {
+
+/// Compression codec chosen at build time: ZSTD if enabled, otherwise LZ4 if
+/// enabled, otherwise zlib.
+::rocksdb::CompressionType select_compression_type() {
+#if defined(DFTRACER_UTILS_ENABLE_ZSTD)
+    return ::rocksdb::kZSTD;
+#elif defined(DFTRACER_UTILS_ENABLE_LZ4)
+    return ::rocksdb::kLZ4Compression;
+#else
+    return ::rocksdb::kZlibCompression;
+#endif
+}
+
+/// Builds the column-family override applied on every IndexDatabase open: it
+/// registers merge operators for the AGGREGATION and SYSTEM_METRICS CFs.
+/// Previously these were set only on the separate handle returned by
+/// EventAggregator::open_with_merge_operator, so the main IndexDatabase did
+/// NOT know how to combine merge operands. Ingested SSTs rely on the first
+/// opener of a DB path registering them (RocksDBManager caches one instance
+/// per path, so later callers get the same, already configured handle).
+rocks::RocksDatabase::CfOptionsOverride make_aggregation_cf_override() {
+    namespace agg = dftracer::utils::utilities::composites::dft::aggregators;
+    auto agg_merge_op = std::make_shared<agg::AggregationMergeOperator>();
+    auto sys_merge_op = std::make_shared<agg::SystemMetricsMergeOperator>();
+    const auto compression = select_compression_type();
+    return [agg_merge_op, sys_merge_op, compression](
+               const std::string& cf_name,
+               ::rocksdb::ColumnFamilyOptions& opts) {
+        if (cf_name == cf::AGGREGATION) {
+            opts.merge_operator = agg_merge_op;
+            ::rocksdb::BlockBasedTableOptions bbt;
+            bbt.block_size = 32 * 1024;
+            bbt.format_version = 5;
+            bbt.index_block_restart_interval = 16;
+            bbt.whole_key_filtering = false;
+            opts.table_factory.reset(::rocksdb::NewBlockBasedTableFactory(bbt));
+            opts.level0_file_num_compaction_trigger = 2;
+            opts.max_bytes_for_level_multiplier = 20;
+            opts.compression = compression;
+            opts.bottommost_compression = compression;
+        } else if (cf_name == cf::SYSTEM_METRICS) {
+            opts.merge_operator = sys_merge_op;
+            opts.compression = compression;
+            opts.bottommost_compression = compression;
+        }
+    };
+}
+
+}  // namespace
+
 IndexDatabase::IndexDatabase(const std::string& index_path,
                              rocks::RocksDatabase::OpenMode open_mode)
     : db_path_(internal::normalize_index_root(index_path)),
       open_mode_(open_mode),
-      db_(rocks::RocksDBManager::instance().get_or_open(db_path_, open_mode_)) {
+      db_(rocks::RocksDBManager::instance().get_or_open(
+          db_path_, open_mode_, make_aggregation_cf_override())) {
     if (open_mode_ == rocks::RocksDatabase::OpenMode::ReadWrite) {
-        init_base_schema();
+        init_schema();
+    }
+}
+
+std::unique_ptr<IndexDatabaseWriterContext> IndexDatabase::begin_write() {
+    using Context = IndexDatabaseWriterContext;  // writer built over db_
+    return std::unique_ptr<Context>(new Context(db_));
+}
+
+/// Ingests the externally built SST files listed in `registry`.
+///
+/// Column families are processed in a fixed order, and every family named in
+/// `skip_cfs` is left untouched. Most families are ingested with one call
+/// per family; the families whose SSTs overlap across workers are ingested
+/// one file at a time.
+///
+/// @param registry per-column-family lists of SST file paths.
+/// @param skip_cfs names of the column families to skip.
+/// @throws through throw_db_error on the first failed ingest call; files
+///         ingested before the failure are not rolled back by this function.
+void IndexDatabase::bulk_ingest(
+    const SstArtifactRegistry& registry,
+    const std::unordered_set<std::string>& skip_cfs) {
+    // skip_cfs is keyed by std::string, so the view is materialized to probe.
+    const auto skipped = [&](std::string_view cf_name) {
+        return skip_cfs.find(std::string(cf_name)) != skip_cfs.end();
+    };
+
+    // Whole-list ingest: one call that covers every SST of the family. Used
+    // for every family that is not described as overlapping further down.
+    const auto ingest = [&](std::string_view cf_name,
+                            const std::vector<std::string>& files) {
+        if (skipped(cf_name)) return;
+        auto status = db_->ingest_external_files(cf_name, files,
+                                                 /*ingest_behind=*/false);
+        if (!status.ok()) {
+            throw_db_error("Failed to ingest SSTs into column family '" +
+                               std::string(cf_name) + "'",
+                           status);
+        }
+    };
+
+    // Per-file ingest for families whose SSTs overlap across workers.
+    // Regular ingest forbids overlap *within a single call*, so each SST gets
+    // its own call; rocksdb can then place overlapping files at L0 with new
+    // seqnos, without requiring `ingest_behind`.
+    const auto ingest_each = [&](std::string_view cf_name,
+                                 const std::vector<std::string>& files) {
+        if (skipped(cf_name)) return;
+        for (const auto& path : files) {
+            // Same call as `ingest`, with a one-element file list.
+            const std::vector<std::string> single{path};
+            auto status = db_->ingest_external_files(cf_name, single,
+                                                     /*ingest_behind=*/false);
+            if (!status.ok()) {
+                throw_db_error("Failed to ingest SST into column family '" +
+                                   std::string(cf_name) + "'",
+                               status);
+            }
+        }
+    };
+
+    // NOTE(review): one call per family below presumably relies on the SSTs
+    // of these families having disjoint key ranges -- confirm with writers.
+    ingest(cf::METADATA, registry.metadata());
+    ingest(cf::CHECKPOINTS, registry.checkpoints());
+    ingest(cf::MANIFEST, registry.manifest());
+    ingest(cf::CHUNK_BLOOM, registry.chunk_bloom());
+    ingest(cf::FILE_BLOOM, registry.file_bloom());
+    ingest(cf::CHUNK_STATS, registry.chunk_stats());
+    ingest(cf::CHUNK_DIM_STATS, registry.chunk_dim_stats());
+    ingest(cf::DIMENSIONS, registry.dimensions());
+    ingest(cf::FILE_SCALAR_STATS, registry.file_scalar_stats());
+    ingest(cf::FILE_CAT_COUNTS, registry.file_cat_counts());
+    ingest(cf::FILE_PID_TID_COUNTS, registry.file_pid_tid_counts());
+    ingest(cf::FILE_NAME_COUNTS, registry.file_name_counts());
+
+    // Multiple workers emit identical (name_id, name) dictionary pairs for
+    // shared event names, so SSTs across workers have overlapping key ranges.
+    // The content-addressed values are deterministic (same name -> same
+    // hash), so the normal LSM sequence-number semantics (later ingest
+    // shadows earlier with identical value) preserve correctness.
+    ingest_each(cf::NAME_DICTIONARY, registry.name_dictionary());
+
+    // Posting lists use the whole-list path again.
+    ingest(cf::NAME_FILE_POSTINGS, registry.name_file_postings());
+    ingest(cf::NAME_CHUNK_POSTINGS, registry.name_chunk_postings());
+
+    // HASH_TABLES is content-addressed: same hash -> same name across workers.
+    // Same rationale as NAME_DICTIONARY: each SST is ingested on its own so
+    // overlapping files are accepted, and deterministic values mean
+    // last-writer-wins resolves correctly.
+    ingest_each(cf::HASH_TABLES, registry.hash_tables());
+
+    // AGGREGATION + SYSTEM_METRICS: workers emit mixed Put+Merge SSTs with
+    // overlapping (pid, time_bucket, ...) keys across workers. The rocksdb
+    // merge_operator on these CFs collapses cross-worker merge operands at
+    // read/compaction time.
+    ingest_each(cf::AGGREGATION, registry.aggregation());
+    ingest_each(cf::SYSTEM_METRICS, registry.system_metrics());
+}
+
+void IndexDatabase::rebuild_root_summaries() {
+    const auto ctx = begin_write();  // fresh writer context over db_
+    ctx->rebuild_root_summaries();   // delegate the rebuild to it
+    ctx->commit();                   // then commit that writer
+}
+
+// Stores the aggregation global config (time interval and config hash) in
+// the AGGREGATION column family.
+void IndexDatabase::write_agg_global_config(std::uint64_t time_interval_us,
+                                            std::uint32_t config_hash) {
+    namespace agg = dftracer::utils::utilities::composites::dft::aggregators;
+
+    agg::AggGlobalConfig cfg;
+    cfg.time_interval_us = time_interval_us;
+    cfg.config_hash = config_hash;
+    // The key is AGG_GLOBAL_CONFIG_KEY taken with an explicit length of 2.
+    const std::string_view key(agg::AGG_GLOBAL_CONFIG_KEY, 2);
+
+    auto status =
+        db_->put(key, agg::serialize_agg_global_config(cfg), cf::AGGREGATION);
+    if (!status.ok()) {
+        throw_db_error("Failed to write aggregation global config", status);
+    }
+}
+
+// Merges the serialized AssociationTracker blobs (empty ones are ignored),
+// finalizes the result and stores it under "__tracker__" in AGGREGATION.
+void IndexDatabase::write_aggregation_tracker(
+    const std::vector<std::string>& blobs) {
+    using Tracker = dftracer::utils::utilities::composites::dft::aggregators::
+        AssociationTracker;
+    constexpr std::string_view TRACKER_KEY = "__tracker__";
+    Tracker unified;
+    for (const auto& blob : blobs) {
+        if (!blob.empty()) unified.merge(Tracker::deserialize(blob));
+    }
+    unified.finalize();
+    auto status = db_->put(TRACKER_KEY, unified.serialize(), cf::AGGREGATION);
+    if (!status.ok()) {
+        throw_db_error("Failed to write aggregation tracker", status);
+    }
+}
+
+// Writes an empty-valued marker key into AGGREGATION for every non-negative
+// file id and commits all of them as a single batch.
+void IndexDatabase::write_agg_file_markers(const std::vector<int>& file_ids) {
+    namespace agg = dftracer::utils::utilities::composites::dft::aggregators;
+    auto batch = db_->begin_batch();
+    for (const int file_id : file_ids) {
+        if (file_id < 0) continue;  // negative ids get no marker
+        const auto id32 = static_cast<std::int32_t>(file_id);
+        db_->put(batch, cf::AGGREGATION, agg::make_agg_file_key(id32), "");
+    }
+    auto status = db_->commit_batch(batch);
+    if (!status.ok()) {
+        throw_db_error("Failed to write aggregation file markers", status);
+    }
+}
+
+// Registers each path with the BLOOM, CHECKPOINTS, FILE_SUMMARY and
+// INDEXING_COMPLETE capabilities (plus MANIFEST when `build_manifest` is
+// set) through one writer context; returns the file ids in input order.
+std::vector<int> IndexDatabase::register_files(
+    const std::vector<std::string>& file_paths, bool build_manifest) {
+    using Cap = IndexFileEntryCapability;
+    Cap caps = Cap::BLOOM | Cap::CHECKPOINTS | Cap::FILE_SUMMARY |
+               Cap::INDEXING_COMPLETE;
+    if (build_manifest) caps |= Cap::MANIFEST;
+
+    std::vector<int> ids;
+    ids.reserve(file_paths.size());
+    const auto writer = begin_write();
+    for (const auto& path : file_paths) {
+        const auto name = internal::get_logical_path(path);
+        const auto file_hash = internal::calculate_file_hash(path);
+        const auto id = writer->get_or_create_file_info(name, file_hash, caps);
+        ids.push_back(id);
+    }
+    writer->commit();
+    return ids;
+}
 
-void IndexDatabase::init_base_schema() {
+// Reserves `count` consecutive file ids and returns the first one; with
+// count == 0 the next id is only read. NOTE(review): plain get-then-put, so
+// concurrent reservers presumably need external serialization.
+int IndexDatabase::reserve_file_id_range(std::size_t count) {
+    const auto key = std::string(encoding::NEXT_FILE_ID_KEY);
+    std::string value;
+    const auto status = db_->get(key, &value);
+
+    std::uint32_t first = 1;  // no stored counter yet: ids start at 1
+    if (status.ok()) {
+        if (value.size() < 4) {
+            throw std::runtime_error("Corrupt next file id record");
+        }
+        first = rocks::KeyCodec::decode_be32(value);
+    } else if (!status.IsNotFound()) {
+        throw_db_error("Failed to read next file id", status);
+    }
+    // Peek only: report the next id without advancing the counter.
+    if (count == 0) return static_cast<int>(first);
+
+    // Ids are handed out as `int`: refuse ranges that would overflow it.
+    constexpr std::uint64_t MAX_ID = std::numeric_limits<int>::max();
+    if (first > MAX_ID || count > MAX_ID - first) {
+        throw std::runtime_error("File id space exhausted");
+    }
+    const auto next = first + static_cast<std::uint32_t>(count);
+    const auto put_status = db_->put(key, rocks::KeyCodec::encode_be32(next));
+    if (!put_status.ok()) {
+        throw_db_error("Failed to advance next file id", put_status);
+    }
+    return static_cast<int>(first);
+}
+
+void IndexDatabase::init_schema() {
     std::string value;
     auto status = db_->get(schema_version_key(), &value);
     if (status.IsNotFound()) {
         status = db_->put(schema_version_key(),
-                          rocks::KeyCodec::encode_be32(kSchemaVersion));
+                          rocks::KeyCodec::encode_be32(SCHEMA_VERSION));
         if (!status.ok()) {
             throw_db_error("Failed to initialize schema version", status);
         }
@@ -575,404 +645,844 @@ void IndexDatabase::init_base_schema() {
     }
 }
 
-void IndexDatabase::init_bloom_schema() {
-    // RocksDB column families are provisioned at DB open; bloom-specific
-    // schema initialization is intentionally a no-op.
-}
-
-void IndexDatabase::init_manifest_schema() {
-    // RocksDB column families are provisioned at DB open; manifest-specific
-    // schema initialization is intentionally a no-op.
-}
-
 bool IndexDatabase::has_bloom_data(int file_id) const {
+    auto caps = get_file_capabilities(file_id);
+    if (has_capability(caps, IndexFileEntryCapability::BLOOM)) return true;
     bool found = false;
     auto prefix = prefix_for_file(file_id);
-    scan_prefix(*db_, "chunk_bloom", prefix,
+    scan_prefix(*db_, cf::CHUNK_BLOOM, prefix,
                 [&found](::rocksdb::Iterator&) { found = true; });
     return found;
 }
 
 bool IndexDatabase::has_manifest_data(int file_id) const {
+    auto caps = get_file_capabilities(file_id);
+    if (has_capability(caps, IndexFileEntryCapability::MANIFEST)) return true;
     bool found = false;
     std::string prefix("E|");
     rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));
-    scan_prefix(*db_, "manifest", prefix,
+    scan_prefix(*db_, cf::MANIFEST, prefix,
                 [&found](::rocksdb::Iterator&) { found = true; });
     return found;
 }
 
-int IndexDatabase::get_or_create_file_info(std::string_view path,
-                                           std::uint64_t file_hash) {
-    const auto logical_name = std::string(path);
-    const auto lookup = file_lookup_key(logical_name);
-    std::string existing;
-    auto status = db_->get(lookup, &existing);
-    if (status.ok()) {
-        const auto file_id = decode_file_id(existing);
-        if (decode_file_hash(existing) == file_hash) {
-            return file_id;
+// Follows file_id -> stored name -> registry record and decodes the record's
+// capabilities; any non-ok lookup (missing or failed read) yields NONE.
+IndexFileEntryCapability IndexDatabase::get_file_capabilities(
+    int file_id) const {
+    std::string logical_name;
+    std::string registry_record;
+    const bool resolved =
+        db_->get(file_reverse_key(file_id), &logical_name).ok() &&
+        db_->get(file_lookup_key(logical_name), &registry_record).ok();
+    if (!resolved) return IndexFileEntryCapability::NONE;
+    return decode_file_capabilities(registry_record);
+}
+
+// Returns the file id decoded from the registry record for `path`, or -1
+// when no record exists; other lookup failures go through throw_db_error.
+int IndexDatabase::get_file_info_id(std::string_view path) const {
+    std::string record;
+    auto lookup = db_->get(file_lookup_key(path), &record);
+    if (lookup.IsNotFound()) return -1;
+    if (!lookup.ok()) {
+        throw_db_error("Failed to look up file info id", lookup);
+    }
+    return decode_file_id(record);
+}
+
+// Returns the hash decoded from the registry record for `path`, or nullopt
+// when no record exists; other lookup failures go through throw_db_error.
+std::optional<std::uint64_t> IndexDatabase::get_file_hash(
+    std::string_view path) const {
+    std::string record;
+    auto lookup = db_->get(file_lookup_key(path), &record);
+    if (lookup.IsNotFound()) return std::nullopt;
+    if (!lookup.ok()) {
+        throw_db_error("Failed to look up file hash", lookup);
+    }
+    return decode_file_hash(record);
+}
+
+// Maps each "f|"-prefixed registry key (prefix stripped) to its file id.
+std::unordered_map<std::string, int> IndexDatabase::query_all_file_info_ids()
+    const {
+    std::unordered_map<std::string, int> ids_by_name;
+    const auto open_iterator = [this] { return db_->new_iterator(); };
+    const auto collect = [&ids_by_name](::rocksdb::Iterator& cursor) {
+        ids_by_name.emplace(iterator_key(cursor).substr(2),
+                            decode_file_id(iterator_value(cursor)));
+    };
+    internal::scan_prefix_iterator("Failed to scan file registry", "f|",
+                                   open_iterator, collect);
+    return ids_by_name;
+}
+
+// Maps each "f|"-prefixed registry key (prefix stripped) to its decoded entry.
+std::unordered_map<std::string, FileRegistryEntry>
+IndexDatabase::query_all_file_registry() const {
+    std::unordered_map<std::string, FileRegistryEntry> registry;
+    const auto open_iterator = [this] { return db_->new_iterator(); };
+    const auto collect = [&registry](::rocksdb::Iterator& cursor) {
+        const auto record = iterator_value(cursor);
+        FileRegistryEntry decoded;
+        decoded.file_id = decode_file_id(record);
+        decoded.capabilities = decode_file_capabilities(record);
+        registry.emplace(iterator_key(cursor).substr(2), decoded);
+    };
+    internal::scan_prefix_iterator("Failed to scan file registry", "f|",
+                                   open_iterator, collect);
+    return registry;
+}
+
+std::unordered_set<int> IndexDatabase::query_files_with_file_scalar_stats()
+    const {
+    std::unordered_set<int> results;
+    auto it = db_->new_iterator(cf::FILE_SCALAR_STATS);
+    for (it->SeekToFirst(); it->Valid();) {
+        auto key = iterator_key(*it);
+        int file_id = decode_prefixed_file_id(key);
+        results.insert(file_id);
+        if (file_id == std::numeric_limits<int>::max()) {
+            break;
         }
-        delete_file_data(file_id);
-        auto registry = encode_file_record(file_id, file_hash);
-        if (txn_batch_) {
-            status = db_->put(*txn_batch_, "default", lookup, registry);
-            if (!status.ok()) {
-                throw_db_error("Failed to update file registry", status);
-            }
-            status = db_->put(*txn_batch_, "default", file_reverse_key(file_id),
-                              logical_name);
-            if (!status.ok()) {
-                throw_db_error("Failed to update reverse file registry",
-                               status);
-            }
+        auto next_prefix = prefix_for_file(file_id + 1);
+        it->Seek(::rocksdb::Slice(next_prefix.data(), next_prefix.size()));
+    }
+
+    const auto status = it->status();
+    if (!status.ok()) {
+        throw IndexerError(
+            IndexerError::Type::DATABASE_ERROR,
+            "Failed to scan file scalar stats: " + status.ToString());
+    }
+
+    return results;
+}
+
+// Collects the distinct file ids present in the chunk bloom column family,
+// seeking to the next file's prefix after each id instead of stepping.
+std::unordered_set<int> IndexDatabase::query_files_with_bloom_data() const {
+    std::unordered_set<int> file_ids;
+    auto cursor = db_->new_iterator(cf::CHUNK_BLOOM);
+    cursor->SeekToFirst();
+    while (cursor->Valid()) {
+        const int current = decode_prefixed_file_id(iterator_key(*cursor));
+        file_ids.insert(current);
+        // current + 1 would overflow at INT_MAX, so stop there.
+        if (current == std::numeric_limits<int>::max()) break;
+        const auto skip_to = prefix_for_file(current + 1);
+        cursor->Seek(::rocksdb::Slice(skip_to.data(), skip_to.size()));
+    }
+    if (const auto scan_status = cursor->status(); !scan_status.ok()) {
+        throw IndexerError(
+            IndexerError::Type::DATABASE_ERROR,
+            "Failed to scan bloom data: " + scan_status.ToString());
+    }
+    return file_ids;
+}
+
+int IndexDatabase::find_file(std::string_view file_path) const {
+    return get_file_info_id(internal::get_logical_path(file_path));
+}
+
+// Returns the 64-bit id stored for `name` in the name dictionary (decoded
+// with decode_be64), or std::nullopt when the name has no entry. Other
+// lookup failures go through throw_db_error.
+std::optional<std::uint64_t> IndexDatabase::query_name_id(
+    std::string_view name) const {
+    std::string encoded_id;
+    const auto key = name_lookup_key(name);
+    auto lookup = db_->get(key, &encoded_id, cf::NAME_DICTIONARY);
+    if (lookup.IsNotFound()) return std::nullopt;
+    if (!lookup.ok()) throw_db_error("Failed to query name dictionary", lookup);
+    return rocks::KeyCodec::decode_be64(encoded_id);
+}
+
+// Reverse dictionary lookup: returns the name stored for `name_id`, or
+// nullopt when absent; other lookup failures go through throw_db_error.
+std::optional<std::string> IndexDatabase::query_name_by_id(
+    std::uint64_t name_id) const {
+    std::string name;
+    const auto key = name_reverse_key(name_id);
+    auto lookup = db_->get(key, &name, cf::NAME_DICTIONARY);
+    if (lookup.IsNotFound()) return std::nullopt;
+    if (!lookup.ok()) {
+        throw_db_error("Failed to query name reverse dictionary", lookup);
+    }
+    return name;
+}
+
+// Reports whether a file-level scalar stats record exists for `file_id`.
+// A missing key is false; other lookup failures go through throw_db_error.
+bool IndexDatabase::has_file_scalar_stats(int file_id) const {
+    std::string ignored;  // only the lookup status matters
+    const auto key = file_scalar_stats_key(file_id);
+    auto lookup = db_->get(key, &ignored, cf::FILE_SCALAR_STATS);
+    if (lookup.IsNotFound()) return false;
+    if (!lookup.ok()) {
+        throw_db_error("Failed to check file scalar statistics", lookup);
+    }
+    return true;
+}
+
+// Returns the chunk bloom filters stored under <file prefix><dimension>'\0'.
+std::vector<ChunkBloomResult> IndexDatabase::query_chunk_bloom_filters(
+    int file_id, std::string_view dimension) const {
+    std::string scan_key = prefix_for_file(file_id);
+    scan_key.append(dimension).push_back('\0');
+    std::vector<ChunkBloomResult> blooms;
+    scan_prefix(*db_, cf::CHUNK_BLOOM, scan_key, [&](::rocksdb::Iterator& it) {
+        blooms.push_back(decode_chunk_bloom(
+            iterator_key(it), iterator_value(it), scan_key.size() - 1));
+    });
+    return blooms;
+}
+
+std::unordered_map<std::string, std::vector<ChunkBloomResult>>
+IndexDatabase::query_chunk_bloom_filters_batch(
+    int file_id, const std::vector<std::string>& dimensions) const {
+    std::unordered_map<std::string, std::vector<ChunkBloomResult>> results;
+    for (const auto& dimension : dimensions) {
+        if (results.contains(dimension)) continue;  // scan each name once
+        results[dimension] = query_chunk_bloom_filters(file_id, dimension);
+    }
+    return results;
+}
+
+// Returns the decoded file-level bloom filter for (file_id, dimension), or
+// nullopt when none is stored; other failures go through throw_db_error.
+std::optional<FileBloomResult> IndexDatabase::query_file_bloom_filter(
+    int file_id, std::string_view dimension) const {
+    std::string encoded;
+    const auto key = file_bloom_key(file_id, dimension);
+    auto lookup = db_->get(key, &encoded, cf::FILE_BLOOM);
+    if (lookup.IsNotFound()) return std::nullopt;
+    if (!lookup.ok()) {
+        throw_db_error("Failed to query file bloom filter", lookup);
+    }
+    return decode_file_bloom(encoded);
+}
+
+// Per-dimension file bloom lookup; dimensions without a filter are omitted.
+std::unordered_map<std::string, FileBloomResult>
+IndexDatabase::query_file_bloom_filters_batch(
+    int file_id, const std::vector<std::string>& dimensions) const {
+    std::unordered_map<std::string, FileBloomResult> found;
+    for (const auto& dimension : dimensions) {
+        if (auto bloom = query_file_bloom_filter(file_id, dimension)) {
+            found.emplace(dimension, std::move(*bloom));
+        }
+    }
+    return found;
+}
+
+// Lists the dimension names stored after the "d|"<file id> key prefix.
+std::vector<std::string> IndexDatabase::query_index_dimensions(
+    int file_id) const {
+    std::string prefix("d|");
+    rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));
+    std::vector<std::string> names;
+    scan_prefix(*db_, cf::DIMENSIONS, prefix, [&](::rocksdb::Iterator& it) {
+        names.push_back(iterator_key(it).substr(prefix.size()));
+    });
+    return names;
+}
+
+// NOTE(review): any non-ok status, including read errors, reports false.
+bool IndexDatabase::has_index_dimension(int file_id,
+                                        std::string_view dimension) const {
+    std::string ignored;
+    const auto key = make_dimension_key(file_id, dimension);
+    return db_->get(key, &ignored, cf::DIMENSIONS).ok();
+}
+
+// Returns the chunk statistics of `file_id`, sorted by checkpoint index.
+std::vector<ChunkStatisticsResult> IndexDatabase::query_chunk_statistics(
+    int file_id) const {
+    std::vector<ChunkStatisticsResult> chunks;
+    const auto collect = [&chunks](::rocksdb::Iterator& cursor) {
+        const auto key = iterator_key(cursor);
+        auto& chunk = chunks.emplace_back();
+        chunk.checkpoint_idx =
+            rocks::KeyCodec::decode_be64(std::string_view(key).substr(4, 8));
+        chunk.stats = decode_chunk_statistics_value(iterator_value(cursor));
+    };
+    scan_prefix(*db_, cf::CHUNK_STATS, prefix_for_file(file_id), collect);
+    const auto by_checkpoint = [](const auto& lhs, const auto& rhs) {
+        return lhs.checkpoint_idx < rhs.checkpoint_idx;
+    };
+    std::sort(chunks.begin(), chunks.end(), by_checkpoint);
+    return chunks;
+}
+
+// Batch form of query_chunk_statistics: one forward scan of the chunk stats
+// column family from the smallest requested id's prefix up to the largest
+// requested id, keeping only rows whose file id was asked for. Each file's
+// entries are sorted by checkpoint index; files without rows are absent.
+std::unordered_map<int, std::vector<ChunkStatisticsResult>>
+IndexDatabase::query_chunk_statistics_batch(
+    const std::vector<int>& file_ids) const {
+    std::unordered_map<int, std::vector<ChunkStatisticsResult>> by_file;
+    if (file_ids.empty()) {
+        return by_file;
+    }
+    by_file.reserve(file_ids.size());
+
+    const std::unordered_set<int> requested(file_ids.begin(), file_ids.end());
+    const auto [lowest, highest] =
+        std::minmax_element(file_ids.begin(), file_ids.end());
+    const int last_file_id = *highest;
+    const auto start_key = prefix_for_file(*lowest);
+
+    auto cursor = db_->new_iterator(cf::CHUNK_STATS);
+    cursor->Seek(::rocksdb::Slice(start_key.data(), start_key.size()));
+    for (; cursor->Valid(); cursor->Next()) {
+        const auto key = iterator_key(*cursor);
+        const int file_id = decode_prefixed_file_id(key);
+        // Relies on keys being ordered by file id: past the max, nothing matches.
+        if (file_id > last_file_id) break;
+        if (!requested.contains(file_id)) continue;
+
+        auto& chunk = by_file[file_id].emplace_back();
+        chunk.checkpoint_idx =
+            rocks::KeyCodec::decode_be64(std::string_view(key).substr(4, 8));
+        chunk.stats = decode_chunk_statistics_value(iterator_value(*cursor));
+    }
+
+    if (const auto scan_status = cursor->status(); !scan_status.ok()) {
+        throw IndexerError(
+            IndexerError::Type::DATABASE_ERROR,
+            "Failed to batch query chunk statistics: " +
+                scan_status.ToString());
+    }
+
+    const auto by_checkpoint = [](const auto& lhs, const auto& rhs) {
+        return lhs.checkpoint_idx < rhs.checkpoint_idx;
+    };
+    for (auto& [_, chunks] : by_file) {
+        std::sort(chunks.begin(), chunks.end(), by_checkpoint);
+    }
+    return by_file;
+}
+
+std::unordered_map<int, MergedStatisticsResult>
+IndexDatabase::query_merged_statistics_batch(
+    const std::vector<int>& file_ids) const {
+    std::unordered_map<int, MergedStatisticsResult> results;
+    if (file_ids.empty()) {
+        return results;
+    }
+    results.reserve(file_ids.size());
+
+    std::unordered_set<int> wanted(file_ids.begin(), file_ids.end());
+    const auto [min_it, max_it] =
+        std::minmax_element(file_ids.begin(), file_ids.end());
+    const auto min_prefix = prefix_for_file(*min_it);
+    const int max_file_id = *max_it;
+
+    auto stats_it = db_->new_iterator(cf::CHUNK_STATS);
+    for (stats_it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size()));
+         stats_it->Valid(); stats_it->Next()) {
+        auto key = iterator_key(*stats_it);
+        int file_id = decode_prefixed_file_id(key);
+        if (file_id > max_file_id) {
+            break;
+        }
+        if (!wanted.contains(file_id)) {
+            continue;
+        }
+
+        auto decoded = decode_chunk_statistics_value(iterator_value(*stats_it));
+        auto& merged = results[file_id];
+        if (merged.num_chunks == 0) {
+            merged.stats = std::move(decoded);
         } else {
-            status = db_->put(lookup, registry);
-            if (!status.ok()) {
-                throw_db_error("Failed to update file registry", status);
+            merged.stats.merge_from(decoded);
+        }
+        ++merged.num_chunks;
+    }
+
+    auto stats_status = stats_it->status();
+    if (!stats_status.ok()) {
+        throw IndexerError(IndexerError::Type::DATABASE_ERROR,
+                           "Failed to batch merge chunk statistics: " +
+                               stats_status.ToString());
+    }
+
+    auto dims_it = db_->new_iterator(cf::CHUNK_DIM_STATS);
+    for (dims_it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size()));
+         dims_it->Valid(); dims_it->Next()) {
+        auto key = iterator_key(*dims_it);
+        int file_id = decode_prefixed_file_id(key);
+        if (file_id > max_file_id) {
+            break;
+        }
+        if (!wanted.contains(file_id)) {
+            continue;
+        }
+
+        auto decoded =
+            decode_chunk_dimension_stats_value(key, iterator_value(*dims_it));
+        if (!decoded.has_value_counts_payload()) continue;
+        decoded.ensure_value_counts_decoded();
+        if (!decoded.value_counts) continue;
+
+        auto& merged = results[file_id].stats;
+        if (decoded.dimension == "cat") {
+            for (const auto& [k, v] : *decoded.value_counts) {
+                merged.category_counts[k] += v;
             }
-            status = db_->put(file_reverse_key(file_id), logical_name);
-            if (!status.ok()) {
-                throw_db_error("Failed to update reverse file registry",
-                               status);
+        } else if (decoded.dimension == "name") {
+            for (const auto& [k, v] : *decoded.value_counts) {
+                merged.name_counts[k] += v;
+            }
+        } else if (decoded.dimension == "pid_tid") {
+            for (const auto& [k, v] : *decoded.value_counts) {
+                merged.pid_tid_counts[k] += v;
             }
         }
-        return file_id;
     }
-    if (!status.IsNotFound()) {
-        throw_db_error("Failed to query file registry", status);
+
+    auto dims_status = dims_it->status();
+    if (!dims_status.ok()) {
+        throw IndexerError(IndexerError::Type::DATABASE_ERROR,
+                           "Failed to batch merge chunk dimension stats: " +
+                               dims_status.ToString());
     }
 
-    std::uint32_t next_id = 1;
-    std::string next_value;
-    status = db_->get(next_file_id_key(), &next_value);
-    if (status.ok()) {
-        next_id = rocks::KeyCodec::decode_be32(next_value);
-    } else if (!status.IsNotFound()) {
-        throw_db_error("Failed to read next file id", status);
+    return results;
+}
+
+std::unordered_map<int, MergedStatisticsResult>
+IndexDatabase::query_file_scalar_stats_batch(
+    const std::vector<int>& file_ids) const {
+    std::unordered_map<int, MergedStatisticsResult> results;
+    results.reserve(file_ids.size());
+    for (const auto file_id : file_ids) {
+        std::string value;
+        auto status = db_->get(file_scalar_stats_key(file_id), &value,
+                               cf::FILE_SCALAR_STATS);
+        if (status.IsNotFound()) {
+            continue;
+        }
+        if (!status.ok()) {
+            throw_db_error("Failed to read file scalar statistics", status);
+        }
+        try {
+            DecodeContextGuard ctx("file_scalar_stats file_id=%d size=%zu",
+                                   file_id, value.size());
+            results.emplace(file_id, decode_file_scalar_stats_value(value));
+        } catch (const std::exception& e) {
+            throw std::runtime_error(
+                "Corrupt file_scalar_stats payload file_id=" +
+                std::to_string(file_id) +
+                " size=" + std::to_string(value.size()) + ": " + e.what());
+        }
     }
+    return results;
+}
 
-    const auto file_id = static_cast<int>(next_id);
-    const auto new_registry = encode_file_record(file_id, file_hash);
-    const auto next_registry = rocks::KeyCodec::encode_be32(next_id + 1);
+std::unordered_map<int, FileMetadataResult>
+IndexDatabase::query_file_metadata_batch(
+    const std::vector<int>& file_ids) const {
+    std::unordered_map<int, FileMetadataResult> results;
+    if (file_ids.empty()) {
+        return results;
+    }
+    results.reserve(file_ids.size());
+
+    std::unordered_set<int> wanted(file_ids.begin(), file_ids.end());
+    const auto [min_it, max_it] =
+        std::minmax_element(file_ids.begin(), file_ids.end());
+    const auto min_prefix = prefix_for_file(*min_it);
+    const int max_file_id = *max_it;
+
+    auto it = db_->new_iterator(cf::METADATA);
+    for (it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size()));
+         it->Valid(); it->Next()) {
+        auto key = iterator_key(*it);
+        int file_id = decode_prefixed_file_id(key);
+        if (file_id > max_file_id) {
+            break;
+        }
+        if (!wanted.contains(file_id)) {
+            continue;
+        }
 
-    if (txn_batch_) {
-        status = db_->put(*txn_batch_, "default", lookup, new_registry);
+        auto value = iterator_value(*it);
+        DecodeContextGuard ctx("metadata file_id=%d size=%zu", file_id,
+                               value.size());
+        auto decoded = decode_metadata_record(value);
+        auto& meta = results[file_id];
+        meta.checkpoint_size = decoded[0];
+        meta.num_lines = decoded[1];
+        meta.max_bytes = decoded[2];
+    }
+
+    const auto status = it->status();
+    if (!status.ok()) {
+        throw IndexerError(
+            IndexerError::Type::DATABASE_ERROR,
+            "Failed to batch read file metadata: " + status.ToString());
+    }
+    return results;
+}
+
+std::unordered_map<int, StringViewMap<std::uint64_t>>
+IndexDatabase::query_file_category_counts_batch(
+    const std::vector<int>& file_ids) const {
+    std::unordered_map<int, StringViewMap<std::uint64_t>> results;
+    results.reserve(file_ids.size());
+    for (const auto file_id : file_ids) {
+        std::string value;
+        auto status = db_->get(file_category_counts_key(file_id), &value,
+                               cf::FILE_CAT_COUNTS);
+        if (status.IsNotFound()) {
+            continue;
+        }
         if (!status.ok()) {
-            throw_db_error("Failed to insert file registry", status);
+            throw_db_error("Failed to read file category counts", status);
+        }
+        try {
+            DecodeContextGuard ctx("file_cat_counts file_id=%d size=%zu",
+                                   file_id, value.size());
+            results.emplace(file_id, decode_count_map_value(value));
+        } catch (const std::exception& e) {
+            throw std::runtime_error(
+                "Corrupt file_cat_counts payload file_id=" +
+                std::to_string(file_id) +
+                " size=" + std::to_string(value.size()) + ": " + e.what());
+        }
+    }
+    return results;
+}
+
+void IndexDatabase::merge_file_category_counts_batch_into(
+    const std::vector<int>& file_ids,
+    std::unordered_map<int, ChunkStatistics*>& targets) const {
+    for (const auto file_id : file_ids) {
+        auto target_it = targets.find(file_id);
+        if (target_it == targets.end() || target_it->second == nullptr) {
+            continue;
+        }
+
+        std::string value;
+        auto status = db_->get(file_category_counts_key(file_id), &value,
+                               cf::FILE_CAT_COUNTS);
+        if (status.IsNotFound()) {
+            continue;
         }
-        status = db_->put(*txn_batch_, "default", file_reverse_key(file_id),
-                          logical_name);
         if (!status.ok()) {
-            throw_db_error("Failed to insert reverse file registry", status);
+            throw_db_error("Failed to read file category counts", status);
+        }
+
+        auto* stats = target_it->second;
+        DecodeContextGuard ctx("file_cat_counts merge file_id=%d size=%zu",
+                               file_id, value.size());
+        for_each_count_map_entry(
+            value, [stats](std::string_view key, std::uint64_t count) {
+                auto entry =
+                    stats->category_counts.try_emplace(std::string(key), 0);
+                entry.first->second += count;
+            });
+    }
+}
+
+std::unordered_map<int, StringViewMap<std::uint64_t>>
+IndexDatabase::query_file_pid_tid_counts_batch(
+    const std::vector<int>& file_ids) const {
+    std::unordered_map<int, StringViewMap<std::uint64_t>> results;
+    results.reserve(file_ids.size());
+    for (const auto file_id : file_ids) {
+        std::string value;
+        auto status = db_->get(file_pid_tid_counts_key(file_id), &value,
+                               cf::FILE_PID_TID_COUNTS);
+        if (status.IsNotFound()) {
+            continue;
         }
-        status =
-            db_->put(*txn_batch_, "default", next_file_id_key(), next_registry);
         if (!status.ok()) {
-            throw_db_error("Failed to update next file id", status);
+            throw_db_error("Failed to read file pid_tid counts", status);
+        }
+        try {
+            DecodeContextGuard ctx("file_pid_tid_counts file_id=%d size=%zu",
+                                   file_id, value.size());
+            results.emplace(file_id, decode_count_map_value(value));
+        } catch (const std::exception& e) {
+            throw std::runtime_error(
+                "Corrupt file_pid_tid_counts payload file_id=" +
+                std::to_string(file_id) +
+                " size=" + std::to_string(value.size()) + ": " + e.what());
+        }
+    }
+    return results;
+}
+
+std::unordered_map<int, NameSummaryResult>
+IndexDatabase::query_file_name_summaries_batch(
+    const std::vector<int>& file_ids) const {
+    std::unordered_map<int, NameSummaryResult> results;
+    results.reserve(file_ids.size());
+    for (const auto file_id : file_ids) {
+        std::string value;
+        auto status = db_->get(file_name_counts_key(file_id), &value,
+                               cf::FILE_NAME_COUNTS);
+        if (status.IsNotFound()) {
+            continue;
         }
-    } else {
-        status = db_->put(lookup, new_registry);
         if (!status.ok()) {
-            throw_db_error("Failed to insert file registry", status);
+            throw_db_error("Failed to read file name counts", status);
+        }
+        try {
+            DecodeContextGuard ctx("file_name_counts file_id=%d size=%zu",
+                                   file_id, value.size());
+            results.emplace(file_id, decode_name_summary_value(value));
+        } catch (const std::exception& e) {
+            throw std::runtime_error(
+                "Corrupt file_name_counts payload file_id=" +
+                std::to_string(file_id) +
+                " size=" + std::to_string(value.size()) + ": " + e.what());
+        }
+    }
+    return results;
+}
+
+void IndexDatabase::merge_file_pid_tid_counts_batch_into(
+    const std::vector<int>& file_ids,
+    std::unordered_map<int, ChunkStatistics*>& targets) const {
+    for (const auto file_id : file_ids) {
+        auto target_it = targets.find(file_id);
+        if (target_it == targets.end() || target_it->second == nullptr) {
+            continue;
+        }
+
+        std::string value;
+        auto status = db_->get(file_pid_tid_counts_key(file_id), &value,
+                               cf::FILE_PID_TID_COUNTS);
+        if (status.IsNotFound()) {
+            continue;
         }
-        status = db_->put(file_reverse_key(file_id), logical_name);
         if (!status.ok()) {
-            throw_db_error("Failed to insert reverse file registry", status);
+            throw_db_error("Failed to read file pid_tid counts", status);
+        }
+
+        auto* stats = target_it->second;
+        DecodeContextGuard ctx("file_pid_tid_counts merge file_id=%d size=%zu",
+                               file_id, value.size());
+        for_each_count_map_entry(value, [stats](std::string_view key,
+                                                std::uint64_t count) {
+            auto entry = stats->pid_tid_counts.try_emplace(std::string(key), 0);
+            entry.first->second += count;
+        });
+    }
+}
+
+void IndexDatabase::merge_file_name_counts_batch_into(
+    const std::vector<int>& file_ids,
+    std::unordered_map<int, ChunkStatistics*>& targets) const {
+    for (const auto file_id : file_ids) {
+        auto target_it = targets.find(file_id);
+        if (target_it == targets.end() || target_it->second == nullptr) {
+            continue;
+        }
+
+        std::string value;
+        auto status = db_->get(file_name_counts_key(file_id), &value,
+                               cf::FILE_NAME_COUNTS);
+        if (status.IsNotFound()) {
+            continue;
         }
-        status = db_->put(next_file_id_key(), next_registry);
         if (!status.ok()) {
-            throw_db_error("Failed to update next file id", status);
+            throw_db_error("Failed to read file name counts", status);
         }
+
+        auto* stats = target_it->second;
+        DecodeContextGuard ctx("file_name_counts merge file_id=%d size=%zu",
+                               file_id, value.size());
+        for_each_name_summary_entry(value, [stats](std::string_view key,
+                                                   std::uint64_t count) {
+            auto entry = stats->name_counts.try_emplace(std::string(key), 0);
+            entry.first->second += count;
+        });
     }
-
-    return file_id;
 }
 
-int IndexDatabase::get_file_info_id(std::string_view path) const {
+std::optional<RootStatisticsResult> IndexDatabase::query_root_scalar_stats()
+    const {
     std::string value;
-    auto status = db_->get(file_lookup_key(path), &value);
+    auto status =
+        db_->get(root_scalar_stats_key(), &value, cf::ROOT_SCALAR_STATS);
     if (status.IsNotFound()) {
-        return -1;
+        return std::nullopt;
     }
     if (!status.ok()) {
-        throw_db_error("Failed to look up file info id", status);
+        throw_db_error("Failed to read root scalar statistics", status);
+    }
+    try {
+        DecodeContextGuard ctx("root_scalar_stats size=%zu", value.size());
+        return decode_root_scalar_stats_value(value);
+    } catch (const std::exception& e) {
+        throw std::runtime_error("Corrupt root_scalar_stats payload size=" +
+                                 std::to_string(value.size()) + ": " +
+                                 e.what());
     }
-    return decode_file_id(value);
 }
 
-std::optional<std::uint64_t> IndexDatabase::get_file_hash(
-    std::string_view path) const {
+StringViewMap<std::uint64_t> IndexDatabase::query_root_category_counts() const {
     std::string value;
-    auto status = db_->get(file_lookup_key(path), &value);
+    auto status =
+        db_->get(root_category_counts_key(), &value, cf::ROOT_CAT_COUNTS);
     if (status.IsNotFound()) {
-        return std::nullopt;
+        return {};
     }
     if (!status.ok()) {
-        throw_db_error("Failed to look up file hash", status);
+        throw_db_error("Failed to read root category counts", status);
+    }
+    try {
+        DecodeContextGuard ctx("root_cat_counts size=%zu", value.size());
+        return decode_count_map_value(value);
+    } catch (const std::exception& e) {
+        throw std::runtime_error("Corrupt root_cat_counts payload size=" +
+                                 std::to_string(value.size()) + ": " +
+                                 e.what());
     }
-    return decode_file_hash(value);
-}
-
-int IndexDatabase::find_file(std::string_view file_path) const {
-    return get_file_info_id(internal::get_logical_path(file_path));
-}
-
-void IndexDatabase::begin_transaction() {
-    txn_batch_ =
-        std::make_unique<rocks::RocksDatabase::Batch>(db_->begin_batch());
 }
 
-void IndexDatabase::commit_transaction() {
-    if (!txn_batch_) {
-        return;
+StringViewMap<std::uint64_t> IndexDatabase::query_root_pid_tid_counts() const {  // Reads the value stored under root_pid_tid_counts_key() in cf::ROOT_PID_TID_COUNTS and returns it decoded by decode_count_map_value().
+    std::string value;
+    auto status =
+        db_->get(root_pid_tid_counts_key(), &value, cf::ROOT_PID_TID_COUNTS);
+    if (status.IsNotFound()) {
+        return {};  // key absent: return an empty map instead of raising an error
     }
-    auto status = db_->commit_batch(*txn_batch_);
-    txn_batch_.reset();
     if (!status.ok()) {
-        throw_db_error("Failed to commit RocksDB batch", status);
+        throw_db_error("Failed to read root pid_tid counts", status);  // every other non-ok status is handed to throw_db_error
     }
-}
-
-void IndexDatabase::rollback_transaction() noexcept { txn_batch_.reset(); }
-
-void IndexDatabase::insert_chunk_bloom_filter(
-    int file_id, std::uint64_t checkpoint_idx, std::string_view dimension,
-    std::span<const unsigned char> blob_data, std::uint64_t num_entries) {
-    const auto key = chunk_bloom_key(file_id, dimension, checkpoint_idx);
-    const auto value = encode_bloom_value(blob_data, num_entries);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "chunk_bloom", key, value)
-                             : db_->put(key, value, "chunk_bloom");
-    if (!status.ok()) {
-        throw_db_error("Failed to insert chunk bloom filter", status);
+    try {
+        DecodeContextGuard ctx("root_pid_tid_counts size=%zu", value.size());  // NOTE(review): presumably labels decode diagnostics with this context - confirm against DecodeContextGuard
+        return decode_count_map_value(value);
+    } catch (const std::exception& e) {  // any std::exception from decoding is rethrown as std::runtime_error with the payload size prepended
+        throw std::runtime_error("Corrupt root_pid_tid_counts payload size=" +
+                                 std::to_string(value.size()) + ": " +
+                                 e.what());
     }
 }
 
-void IndexDatabase::insert_chunk_bloom_filter(
-    int file_id, std::uint64_t checkpoint_idx, std::string_view dimension,
-    const void* blob_data, int blob_size, std::uint64_t num_entries) {
-    auto* bytes = static_cast<const unsigned char*>(blob_data);
-    insert_chunk_bloom_filter(file_id, checkpoint_idx, dimension,
-                              std::span<const unsigned char>(
-                                  bytes, static_cast<std::size_t>(blob_size)),
-                              num_entries);
-}
-
-void IndexDatabase::insert_file_bloom_filter(
-    int file_id, std::string_view dimension,
-    std::span<const unsigned char> blob_data, std::uint64_t num_entries) {
-    const auto key = file_bloom_key(file_id, dimension);
-    const auto value = encode_bloom_value(blob_data, num_entries);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "file_bloom", key, value)
-                             : db_->put(key, value, "file_bloom");
-    if (!status.ok()) {
-        throw_db_error("Failed to insert file bloom filter", status);
+StringViewMap<std::uint64_t> IndexDatabase::query_root_name_counts() const {  // Reads the value stored under root_name_counts_key() in cf::ROOT_NAME_COUNTS and returns it decoded by decode_count_map_value().
+    std::string value;
+    auto status =
+        db_->get(root_name_counts_key(), &value, cf::ROOT_NAME_COUNTS);
+    if (status.IsNotFound()) {
+        return {};  // key absent: return an empty map instead of raising an error
     }
-}
-
-void IndexDatabase::insert_file_bloom_filter(int file_id,
-                                             std::string_view dimension,
-                                             const void* blob_data,
-                                             int blob_size,
-                                             std::uint64_t num_entries) {
-    auto* bytes = static_cast<const unsigned char*>(blob_data);
-    insert_file_bloom_filter(file_id, dimension,
-                             std::span<const unsigned char>(
-                                 bytes, static_cast<std::size_t>(blob_size)),
-                             num_entries);
-}
-
-void IndexDatabase::insert_chunk_statistics(int file_id,
-                                            std::uint64_t checkpoint_idx,
-                                            const ChunkStatistics& stats) {
-    const auto key = chunk_stats_key(file_id, checkpoint_idx);
-    const auto value = encode_chunk_statistics_value(stats);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "chunk_stats", key, value)
-                             : db_->put(key, value, "chunk_stats");
     if (!status.ok()) {
-        throw_db_error("Failed to insert chunk statistics", status);
+        throw_db_error("Failed to read root name counts", status);  // every other non-ok status is handed to throw_db_error
     }
-}
-
-void IndexDatabase::insert_checkpoint(int file_id,
-                                      const IndexerCheckpoint& checkpoint) {
-    const auto key = checkpoint_key(file_id, checkpoint.uc_offset,
-                                    checkpoint.checkpoint_idx);
-    const auto value = encode_checkpoint_value(checkpoint);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "checkpoints", key, value)
-                             : db_->put(key, value, "checkpoints");
-    if (!status.ok()) {
-        throw_db_error("Failed to insert checkpoint", status);
+    try {
+        DecodeContextGuard ctx("root_name_counts size=%zu", value.size());  // NOTE(review): presumably labels decode diagnostics with this context - confirm against DecodeContextGuard
+        return decode_count_map_value(value);
+    } catch (const std::exception& e) {  // any std::exception from decoding is rethrown as std::runtime_error with the payload size prepended
+        throw std::runtime_error("Corrupt root_name_counts payload size=" +
+                                 std::to_string(value.size()) + ": " +
+                                 e.what());
     }
 }
 
-void IndexDatabase::insert_index_dimension(int file_id,
-                                           std::string_view dimension) {
-    const auto key = make_dimension_key(file_id, dimension);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "dimensions", key, "")
-                             : db_->put(key, "", "dimensions");
-    if (!status.ok()) {
-        throw_db_error("Failed to insert index dimension", status);
-    }
-}
-
-void IndexDatabase::insert_hash_resolution(int file_id,
-                                           std::string_view dimension,
-                                           std::string_view hash_value,
-                                           std::string_view resolved_value) {
-    const auto owner = make_hash_owner_key(file_id, dimension, hash_value);
-    const auto forward = make_hash_forward_key(dimension, hash_value);
-    const auto reverse =
-        make_hash_reverse_key(dimension, resolved_value, hash_value);
-    if (txn_batch_) {
-        db_->put(*txn_batch_, "dimensions", owner, std::string(resolved_value));
-        db_->put(*txn_batch_, "dimensions", forward,
-                 std::string(resolved_value));
-        db_->put(*txn_batch_, "dimensions", reverse, "");
+void IndexDatabase::merge_root_category_counts_into(  // Adds every (key, count) entry of the value stored under root_category_counts_key() into target.category_counts.
+    ChunkStatistics& target) const {
+    std::string value;
+    auto status =
+        db_->get(root_category_counts_key(), &value, cf::ROOT_CAT_COUNTS);
+    if (status.IsNotFound()) {  // key absent: the early return below leaves target unchanged
         return;
     }
-    auto status = db_->put(owner, resolved_value, "dimensions");
-    if (!status.ok()) throw_db_error("Failed to insert hash owner", status);
-    status = db_->put(forward, resolved_value, "dimensions");
-    if (!status.ok())
-        throw_db_error("Failed to insert hash resolution", status);
-    status = db_->put(reverse, "", "dimensions");
     if (!status.ok()) {
-        throw_db_error("Failed to insert reverse hash resolution", status);
-    }
-}
-
-void IndexDatabase::insert_chunk_dimension_stats(
-    int file_id, std::uint64_t checkpoint_idx, const ChunkDimensionStats& stats,
-    std::size_t value_counts_cap) {
-    const auto key =
-        chunk_dim_stats_key(file_id, checkpoint_idx, stats.dimension);
-    const auto value =
-        encode_chunk_dimension_stats_value(stats, value_counts_cap);
-    auto status = txn_batch_
-                      ? db_->put(*txn_batch_, "chunk_dim_stats", key, value)
-                      : db_->put(key, value, "chunk_dim_stats");
-    if (!status.ok()) {
-        throw_db_error("Failed to insert chunk dimension stats", status);
+        throw_db_error("Failed to read root category counts", status);  // every other non-ok status is handed to throw_db_error
     }
+    for_each_count_map_entry(value, [&target](std::string_view key,  // NOTE(review): no DecodeContextGuard/try-catch here, unlike the query_root_*_counts() readers - confirm how for_each_count_map_entry reports a corrupt payload
+                                              std::uint64_t count) {
+        auto entry = target.category_counts.try_emplace(std::string(key), 0);  // new keys start at 0; an existing counter is left as it is
+        entry.first->second += count;  // accumulate on top of whatever target already holds
+    });
 }
 
-void IndexDatabase::insert_tar_archive_metadata(int file_id,
-                                                std::string_view archive_name,
-                                                std::uint64_t checkpoint_size,
-                                                std::uint64_t total_lines,
-                                                std::uint64_t total_uc_size,
-                                                std::uint64_t total_files) {
-    const auto key = tar_archive_key(file_id);
-    const auto value = encode_tar_archive_value(
-        archive_name, checkpoint_size, total_lines, total_uc_size, total_files);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "archives", key, value)
-                             : db_->put(key, value, "archives");
-    if (!status.ok()) {
-        throw_db_error("Failed to insert tar archive metadata", status);
+void IndexDatabase::merge_root_pid_tid_counts_into(  // Adds every (key, count) entry of the value stored under root_pid_tid_counts_key() into target.pid_tid_counts.
+    ChunkStatistics& target) const {
+    std::string value;
+    auto status =
+        db_->get(root_pid_tid_counts_key(), &value, cf::ROOT_PID_TID_COUNTS);
+    if (status.IsNotFound()) {
+        return;  // key absent: leave target unchanged
     }
-}
-
-void IndexDatabase::insert_tar_file(int file_id, const TarFileRecord& record) {
-    const auto key =
-        tar_file_key(file_id, record.uncompressed_offset, record.file_name);
-    const auto value = encode_tar_file_value(record);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "tar_files", key, value)
-                             : db_->put(key, value, "tar_files");
     if (!status.ok()) {
-        throw_db_error("Failed to insert tar file metadata", status);
-    }
-}
-
-std::vector<IndexDatabase::ChunkBloomResult>
-IndexDatabase::query_chunk_bloom_filters(int file_id,
-                                         std::string_view dimension) const {
-    std::vector<ChunkBloomResult> results;
-    std::string prefix = prefix_for_file(file_id);
-    prefix.append(dimension);
-    prefix.push_back('\0');
-    scan_prefix(*db_, "chunk_bloom", prefix, [&](::rocksdb::Iterator& it) {
-        results.push_back(decode_chunk_bloom(
-            iterator_key(it), iterator_value(it), prefix.size() - 1));
-    });
-    return results;
-}
-
-std::unordered_map<std::string, std::vector<IndexDatabase::ChunkBloomResult>>
-IndexDatabase::query_chunk_bloom_filters_batch(
-    int file_id, const std::vector<std::string>& dimensions) const {
-    std::unordered_map<std::string, std::vector<ChunkBloomResult>> results;
-    for (const auto& dimension : dimensions) {
-        results.emplace(dimension,
-                        query_chunk_bloom_filters(file_id, dimension));
+        throw_db_error("Failed to read root pid_tid counts", status);  // every other non-ok status is handed to throw_db_error
     }
-    return results;
+    for_each_count_map_entry(  // NOTE(review): no DecodeContextGuard/try-catch here, unlike query_root_pid_tid_counts() - confirm how for_each_count_map_entry reports a corrupt payload
+        value, [&target](std::string_view key, std::uint64_t count) {
+            auto entry = target.pid_tid_counts.try_emplace(std::string(key), 0);  // new keys start at 0; an existing counter is left as it is
+            entry.first->second += count;  // accumulate on top of whatever target already holds
+        });
 }
 
-std::optional<IndexDatabase::FileBloomResult>
-IndexDatabase::query_file_bloom_filter(int file_id,
-                                       std::string_view dimension) const {
+void IndexDatabase::merge_root_name_counts_into(ChunkStatistics& target) const {  // Adds every (key, count) entry of the value stored under root_name_counts_key() into target.name_counts.
     std::string value;
     auto status =
-        db_->get(file_bloom_key(file_id, dimension), &value, "file_bloom");
+        db_->get(root_name_counts_key(), &value, cf::ROOT_NAME_COUNTS);
     if (status.IsNotFound()) {
-        return std::nullopt;
+        return;  // key absent: leave target unchanged
     }
     if (!status.ok()) {
-        throw_db_error("Failed to query file bloom filter", status);
+        throw_db_error("Failed to read root name counts", status);  // every other non-ok status is handed to throw_db_error
     }
-    return decode_file_bloom(value);
+    for_each_count_map_entry(  // NOTE(review): no DecodeContextGuard/try-catch here, unlike query_root_name_counts() - confirm how for_each_count_map_entry reports a corrupt payload
+        value, [&target](std::string_view key, std::uint64_t count) {
+            auto entry = target.name_counts.try_emplace(std::string(key), 0);  // new keys start at 0; an existing counter is left as it is
+            entry.first->second += count;  // accumulate on top of whatever target already holds
+        });
 }
 
-std::unordered_map<std::string, IndexDatabase::FileBloomResult>
-IndexDatabase::query_file_bloom_filters_batch(
-    int file_id, const std::vector<std::string>& dimensions) const {
-    std::unordered_map<std::string, FileBloomResult> results;
-    for (const auto& dimension : dimensions) {
-        auto bloom = query_file_bloom_filter(file_id, dimension);
-        if (bloom) {
-            results.emplace(dimension, std::move(*bloom));
-        }
+std::vector<int> IndexDatabase::query_name_file_postings(  // Returns the file ids decoded from every cf::NAME_FILE_POSTINGS key that starts with "n|" + be64(name id of `name`).
+    std::string_view name) const {
+    auto name_id = query_name_id(name);
+    if (!name_id) {
+        return {};  // query_name_id() produced no id for this name: nothing to scan
     }
-    return results;
-}
 
-std::vector<std::string> IndexDatabase::query_index_dimensions(
-    int file_id) const {
-    std::vector<std::string> dimensions;
-    std::string prefix("d|");
-    rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));
-    scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) {
-        auto key = iterator_key(it);
-        dimensions.push_back(key.substr(prefix.size()));
-    });
-    return dimensions;
+    std::vector<int> results;
+    std::string prefix("n|");
+    rocks::KeyCodec::append_be64(prefix, *name_id);  // scan prefix is the 2-byte tag followed by the 8-byte name id
+    scan_prefix(
+        *db_, cf::NAME_FILE_POSTINGS, prefix,
+        [&results](::rocksdb::Iterator& it) {
+            auto key = iterator_key(it);
+            // "n|" (2) + be64 name_id (8) + be32 file_id (4) = 14 bytes
+            if (key.size() != 14) return;  // keys that do not have exactly this length are skipped silently
+            results.push_back(static_cast<int>(rocks::KeyCodec::decode_be32(  // file id is read from the 4 bytes at offset 10 (after tag + name id)
+                std::string_view(key.data() + 10, 4))));
+        });
+    return results;
 }
 
-bool IndexDatabase::has_index_dimension(int file_id,
-                                        std::string_view dimension) const {
-    std::string value;
-    return db_
-        ->get(make_dimension_key(file_id, dimension), &value, "dimensions")
-        .ok();
-}
+std::vector<std::uint64_t> IndexDatabase::query_name_chunk_postings(  // Returns the checkpoint indices decoded from every cf::NAME_CHUNK_POSTINGS key that starts with "n|" + be64(name id) + be32(file_id).
+    std::string_view name, int file_id) const {
+    auto name_id = query_name_id(name);
+    if (!name_id) {
+        return {};  // query_name_id() produced no id for this name: nothing to scan
+    }
 
-std::vector<IndexDatabase::ChunkStatisticsResult>
-IndexDatabase::query_chunk_statistics(int file_id) const {
-    std::vector<ChunkStatisticsResult> results;
-    const auto prefix = prefix_for_file(file_id);
-    scan_prefix(*db_, "chunk_stats", prefix, [&](::rocksdb::Iterator& it) {
-        ChunkStatisticsResult result;
-        auto key = iterator_key(it);
-        result.checkpoint_idx =
-            rocks::KeyCodec::decode_be64(std::string_view(key).substr(4, 8));
-        result.stats = decode_chunk_statistics_value(iterator_value(it));
-        results.push_back(std::move(result));
-    });
-    std::sort(results.begin(), results.end(),
-              [](const auto& lhs, const auto& rhs) {
-                  return lhs.checkpoint_idx < rhs.checkpoint_idx;
-              });
+    std::vector<std::uint64_t> results;
+    std::string prefix("n|");
+    rocks::KeyCodec::append_be64(prefix, *name_id);
+    rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));  // prefix now pins both the name id and the file id
+    scan_prefix(*db_, cf::NAME_CHUNK_POSTINGS, prefix,
+                [&results](::rocksdb::Iterator& it) {
+                    auto key = iterator_key(it);
+                    // "n|" (2) + be64 name_id (8) + be32 file_id (4) +
+                    //     be64 checkpoint_idx (8) = 22 bytes
+                    if (key.size() != 22) return;  // keys that do not have exactly this length are skipped silently
+                    results.push_back(rocks::KeyCodec::decode_be64(  // checkpoint index is read from the 8 bytes at offset 14 (after tag + name id + file id)
+                        std::string_view(key.data() + 14, 8)));
+                });
     return results;
 }
 
@@ -984,25 +1494,28 @@ bool IndexDatabase::find_checkpoint(int file_id, std::size_t target_offset,
 
     bool found = false;
     const auto prefix = prefix_for_file(file_id);
-    scan_prefix(*db_, "checkpoints", prefix, [&](::rocksdb::Iterator& it) {
-        auto decoded = decode_checkpoint(iterator_key(it), iterator_value(it));
-        if (decoded.uc_offset <= target_offset &&
-            (!found || decoded.uc_offset >= checkpoint.uc_offset)) {
-            checkpoint = std::move(decoded);
-            found = true;
-        }
-    });
+    scan_prefix(*db_, rocks::cf::CHECKPOINTS, prefix,
+                [&](::rocksdb::Iterator& it) {
+                    auto decoded =
+                        decode_checkpoint(iterator_key(it), iterator_value(it));
+                    if (decoded.uc_offset <= target_offset &&
+                        (!found || decoded.uc_offset >= checkpoint.uc_offset)) {
+                        checkpoint = std::move(decoded);
+                        found = true;
+                    }
+                });
     return found;
 }
 
-std::vector<IndexDatabase::IndexerCheckpoint> IndexDatabase::query_checkpoints(
+std::vector<IndexerCheckpoint> IndexDatabase::query_checkpoints(
     int file_id) const {
     std::vector<IndexerCheckpoint> checkpoints;
     const auto prefix = prefix_for_file(file_id);
-    scan_prefix(*db_, "checkpoints", prefix, [&](::rocksdb::Iterator& it) {
-        checkpoints.push_back(
-            decode_checkpoint(iterator_key(it), iterator_value(it)));
-    });
+    scan_prefix(
+        *db_, rocks::cf::CHECKPOINTS, prefix, [&](::rocksdb::Iterator& it) {
+            checkpoints.push_back(
+                decode_checkpoint(iterator_key(it), iterator_value(it)));
+        });
     std::sort(checkpoints.begin(), checkpoints.end(),
               [](const auto& lhs, const auto& rhs) {
                   return std::tie(lhs.uc_offset, lhs.checkpoint_idx) <
@@ -1011,10 +1524,10 @@ std::vector<IndexDatabase::IndexerCheckpoint> IndexDatabase::query_checkpoints(
     return checkpoints;
 }
 
-std::optional<IndexDatabase::TarArchiveMetadata>
-IndexDatabase::query_tar_archive_metadata(int file_id) const {
+std::optional<TarArchiveMetadata> IndexDatabase::query_tar_archive_metadata(
+    int file_id) const {
     std::string value;
-    auto status = db_->get(tar_archive_key(file_id), &value, "archives");
+    auto status = db_->get(tar_archive_key(file_id), &value, cf::ARCHIVES);
     if (status.IsNotFound()) {
         return std::nullopt;
     }
@@ -1024,11 +1537,10 @@ IndexDatabase::query_tar_archive_metadata(int file_id) const {
     return decode_tar_archive_value(value);
 }
 
-std::vector<IndexDatabase::TarFileRecord> IndexDatabase::query_tar_files(
-    int file_id) const {
+std::vector<TarFileRecord> IndexDatabase::query_tar_files(int file_id) const {
     std::vector<TarFileRecord> files;
     const auto prefix = prefix_for_file(file_id);
-    scan_prefix(*db_, "tar_files", prefix, [&](::rocksdb::Iterator& it) {
+    scan_prefix(*db_, cf::TAR_FILES, prefix, [&](::rocksdb::Iterator& it) {
         files.push_back(decode_tar_file(iterator_key(it), iterator_value(it)));
     });
     std::sort(files.begin(), files.end(), [](const auto& lhs, const auto& rhs) {
@@ -1049,9 +1561,8 @@ bool IndexDatabase::find_tar_file(int file_id, std::string_view file_name,
     return false;
 }
 
-std::vector<IndexDatabase::TarFileRecord>
-IndexDatabase::query_tar_files_in_range(int file_id, std::uint64_t start_offset,
-                                        std::uint64_t end_offset) const {
+std::vector<TarFileRecord> IndexDatabase::query_tar_files_in_range(
+    int file_id, std::uint64_t start_offset, std::uint64_t end_offset) const {
     std::vector<TarFileRecord> files;
     for (auto& entry : query_tar_files(file_id)) {
         const auto entry_end = entry.uncompressed_offset + entry.file_size;
@@ -1063,10 +1574,8 @@ IndexDatabase::query_tar_files_in_range(int file_id, std::uint64_t start_offset,
     return files;
 }
 
-std::vector<IndexDatabase::IndexerCheckpoint>
-IndexDatabase::query_checkpoints_for_line_range(int file_id,
-                                                std::uint64_t start_line,
-                                                std::uint64_t end_line) const {
+std::vector<IndexerCheckpoint> IndexDatabase::query_checkpoints_for_line_range(
+    int file_id, std::uint64_t start_line, std::uint64_t end_line) const {
     std::vector<IndexerCheckpoint> checkpoints;
     for (auto& checkpoint : query_checkpoints(file_id)) {
         if ((checkpoint.first_line_num <= end_line &&
@@ -1079,7 +1588,7 @@ IndexDatabase::query_checkpoints_for_line_range(int file_id,
     return checkpoints;
 }
 
-IndexDatabase::TimeBounds IndexDatabase::query_time_bounds(int file_id) const {
+TimeBounds IndexDatabase::query_time_bounds(int file_id) const {
     TimeBounds bounds;
     for (const auto& row : query_chunk_statistics(file_id)) {
         const auto min_ts = row.stats.min_timestamp_us;
@@ -1095,14 +1604,15 @@ IndexDatabase::TimeBounds IndexDatabase::query_time_bounds(int file_id) const {
     return bounds;
 }
 
-std::vector<IndexDatabase::ChunkDimensionStatsResult>
+std::vector<ChunkDimensionStatsResult>
 IndexDatabase::query_chunk_dimension_stats(int file_id) const {
     std::vector<ChunkDimensionStatsResult> results;
     const auto prefix = prefix_for_file(file_id);
-    scan_prefix(*db_, "chunk_dim_stats", prefix, [&](::rocksdb::Iterator& it) {
-        results.push_back(decode_chunk_dimension_stats_value(
-            iterator_key(it), iterator_value(it)));
-    });
+    scan_prefix(*db_, cf::CHUNK_DIM_STATS, prefix,
+                [&](::rocksdb::Iterator& it) {
+                    results.push_back(decode_chunk_dimension_stats_value(
+                        iterator_key(it), iterator_value(it)));
+                });
     std::sort(results.begin(), results.end(),
               [](const auto& lhs, const auto& rhs) {
                   return std::tie(lhs.checkpoint_idx, lhs.dimension) <
@@ -1111,189 +1621,80 @@ IndexDatabase::query_chunk_dimension_stats(int file_id) const {
     return results;
 }
 
-std::vector<IndexDatabase::ChunkDimensionStatsResult>
-IndexDatabase::query_chunk_dimension_stats_for_dimension(
-    int file_id, std::string_view dimension) const {
-    std::vector<ChunkDimensionStatsResult> results;
-    const auto prefix = prefix_for_file(file_id);
-    scan_prefix(*db_, "chunk_dim_stats", prefix, [&](::rocksdb::Iterator& it) {
-        auto decoded = decode_chunk_dimension_stats_value(iterator_key(it),
-                                                          iterator_value(it));
-        if (decoded.dimension == dimension) {
-            results.push_back(std::move(decoded));
-        }
-    });
-    std::sort(results.begin(), results.end(),
-              [](const auto& lhs, const auto& rhs) {
-                  return lhs.checkpoint_idx < rhs.checkpoint_idx;
-              });
-    return results;
-}
-
-std::optional<std::string> IndexDatabase::query_resolved_by_hash(
-    std::string_view dimension, std::string_view hash_value) const {
-    std::string value;
-    auto status = db_->get(make_hash_forward_key(dimension, hash_value), &value,
-                           "dimensions");
-    if (status.IsNotFound()) {
-        return std::nullopt;
-    }
-    if (!status.ok()) {
-        throw_db_error("Failed to query resolved hash", status);
-    }
-    return value;
-}
-
-std::vector<std::string> IndexDatabase::query_hash_by_resolved(
-    std::string_view dimension, std::string_view resolved_value) const {
-    std::vector<std::string> hashes;
-    auto prefix = make_hash_reverse_key(dimension, resolved_value, "");
-    scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) {
-        auto key = iterator_key(it);
-        hashes.push_back(key.substr(prefix.size()));
-    });
-    return hashes;
-}
-
-void IndexDatabase::delete_chunk_bloom_filters(int file_id,
-                                               std::string_view dimension) {
-    std::vector<std::string> keys;
-    std::string prefix = prefix_for_file(file_id);
-    prefix.append(dimension);
-    prefix.push_back('\0');
-    scan_prefix(*db_, "chunk_bloom", prefix, [&](::rocksdb::Iterator& it) {
-        keys.push_back(iterator_key(it));
-    });
-    for (const auto& key : keys) {
-        auto status = txn_batch_ ? db_->del(*txn_batch_, "chunk_bloom", key)
-                                 : db_->del(key, "chunk_bloom");
-        if (!status.ok())
-            throw_db_error("Failed to delete chunk bloom", status);
-    }
-}
-
-void IndexDatabase::delete_file_bloom_filter(int file_id,
-                                             std::string_view dimension) {
-    auto status =
-        txn_batch_ ? db_->del(*txn_batch_, "file_bloom",
-                              file_bloom_key(file_id, dimension))
-                   : db_->del(file_bloom_key(file_id, dimension), "file_bloom");
-    if (!status.ok() && !status.IsNotFound()) {
-        throw_db_error("Failed to delete file bloom", status);
-    }
-}
-
-void IndexDatabase::delete_chunk_statistics(int file_id) {
-    std::vector<std::string> keys;
-    scan_prefix(
-        *db_, "chunk_stats", prefix_for_file(file_id),
-        [&](::rocksdb::Iterator& it) { keys.push_back(iterator_key(it)); });
-    for (const auto& key : keys) {
-        auto status = txn_batch_ ? db_->del(*txn_batch_, "chunk_stats", key)
-                                 : db_->del(key, "chunk_stats");
-        if (!status.ok()) {
-            throw_db_error("Failed to delete chunk statistics", status);
-        }
-    }
-}
-
-void IndexDatabase::delete_chunk_dimension_stats(int file_id) {
-    std::vector<std::string> keys;
-    scan_prefix(
-        *db_, "chunk_dim_stats", prefix_for_file(file_id),
-        [&](::rocksdb::Iterator& it) { keys.push_back(iterator_key(it)); });
-    for (const auto& key : keys) {
-        auto status = txn_batch_ ? db_->del(*txn_batch_, "chunk_dim_stats", key)
-                                 : db_->del(key, "chunk_dim_stats");
-        if (!status.ok()) {
-            throw_db_error("Failed to delete chunk dimension stats", status);
-        }
-    }
-}
-
-void IndexDatabase::delete_hash_resolutions(int file_id) {
-    std::vector<std::pair<std::string, std::string>> owned;
-    std::string prefix("o|");
-    rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));
-    prefix.push_back('\0');
-    scan_prefix(*db_, "dimensions", prefix, [&](::rocksdb::Iterator& it) {
-        owned.emplace_back(iterator_key(it), iterator_value(it));
-    });
-    for (const auto& [owner_key, resolved] : owned) {
-        if (owner_key.size() <= prefix.size()) {
-            DFTRACER_UTILS_LOG_WARN(
-                "Skipping malformed owner key for file_id=%d", file_id);
-            continue;
+std::unordered_map<int, std::vector<ChunkDimensionStatsResult>>  // Collects chunk dimension stats for all requested file ids in one iterator pass over cf::CHUNK_DIM_STATS; a file id with no matching keys gets no map entry.
+IndexDatabase::query_chunk_dimension_stats_batch(
+    const std::vector<int>& file_ids) const {
+    std::unordered_map<int, std::vector<ChunkDimensionStatsResult>> results;
+    if (file_ids.empty()) {
+        return results;  // nothing requested: empty map, no iterator is created
+    }
+    results.reserve(file_ids.size());
+
+    std::unordered_set<int> wanted(file_ids.begin(), file_ids.end());  // membership set for the filter inside the loop
+    const auto [min_it, max_it] =
+        std::minmax_element(file_ids.begin(), file_ids.end());
+    const auto min_prefix = prefix_for_file(*min_it);  // the scan starts at the smallest requested file id
+    const int max_file_id = *max_it;
+
+    auto it = db_->new_iterator(cf::CHUNK_DIM_STATS);
+    for (it->Seek(::rocksdb::Slice(min_prefix.data(), min_prefix.size()));
+         it->Valid(); it->Next()) {
+        auto key = iterator_key(*it);
+        int file_id = decode_prefixed_file_id(key);
+        if (file_id > max_file_id) {  // NOTE(review): stopping here assumes keys iterate in ascending file-id order - confirm against the prefix_for_file() encoding
+            break;
         }
-        const std::string_view payload(owner_key.data() + prefix.size(),
-                                       owner_key.size() - prefix.size());
-        auto split = payload.find('\0');
-        if (split == std::string_view::npos) {
-            DFTRACER_UTILS_LOG_WARN(
-                "Skipping malformed owner key payload for file_id=%d", file_id);
+        if (!wanted.contains(file_id)) {  // ids between min and max that were not requested are stepped over
             continue;
         }
-        auto dimension = payload.substr(0, split);
-        auto hash_value = payload.substr(split + 1);
-        auto forward = make_hash_forward_key(dimension, hash_value);
-        auto reverse = make_hash_reverse_key(dimension, resolved, hash_value);
-        const auto del_one = [&](std::string_view key) {
-            auto status = txn_batch_ ? db_->del(*txn_batch_, "dimensions", key)
-                                     : db_->del(key, "dimensions");
-            if (!status.ok() && !status.IsNotFound()) {
-                throw_db_error("Failed to delete hash resolution", status);
-            }
-        };
-        del_one(owner_key);
-        del_one(forward);
-        del_one(reverse);
+
+        results[file_id].push_back(
+            decode_chunk_dimension_stats_value(key, iterator_value(*it)));
     }
-}
 
-void IndexDatabase::insert_event_range(
-    int file_id, std::uint64_t checkpoint_idx, std::string_view cat,
-    std::string_view name, std::span<const std::uint32_t> line_numbers) {
-    const auto key = manifest_event_key(file_id, checkpoint_idx, cat, name);
-    const auto value = encode_event_range_value(line_numbers);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "manifest", key, value)
-                             : db_->put(key, value, "manifest");
+    const auto status = it->status();  // the loop also ends when Valid() turns false, so the iterator status is checked afterwards
     if (!status.ok()) {
-        throw_db_error("Failed to insert event range", status);
+        throw IndexerError(IndexerError::Type::DATABASE_ERROR,  // thrown directly as IndexerError here, not through throw_db_error
+                           "Failed to batch query chunk dimension stats: " +
+                               status.ToString());
     }
-}
-
-void IndexDatabase::insert_event_range(
-    int file_id, std::uint64_t checkpoint_idx, std::string_view cat,
-    std::string_view name, const std::vector<std::uint32_t>& line_numbers) {
-    insert_event_range(file_id, checkpoint_idx, cat, name,
-                       std::span<const std::uint32_t>(line_numbers));
-}
 
-void IndexDatabase::insert_metadata_lines(
-    int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type,
-    std::span<const std::uint32_t> line_numbers) {
-    const auto key = manifest_metadata_key(file_id, checkpoint_idx, meta_type);
-    const auto value = encode_metadata_value(line_numbers);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "manifest", key, value)
-                             : db_->put(key, value, "manifest");
-    if (!status.ok()) {
-        throw_db_error("Failed to insert metadata lines", status);
+    for (auto& [_, entries] : results) {  // each per-file list is ordered by (checkpoint_idx, dimension)
+        std::sort(entries.begin(), entries.end(),
+                  [](const auto& lhs, const auto& rhs) {
+                      return std::tie(lhs.checkpoint_idx, lhs.dimension) <
+                             std::tie(rhs.checkpoint_idx, rhs.dimension);
+                  });
     }
+    return results;
 }
 
-void IndexDatabase::insert_metadata_lines(
-    int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type,
-    const std::vector<std::uint32_t>& line_numbers) {
-    insert_metadata_lines(file_id, checkpoint_idx, meta_type,
-                          std::span<const std::uint32_t>(line_numbers));
+std::vector<ChunkDimensionStatsResult>  // Returns the cf::CHUNK_DIM_STATS entries of one file whose decoded dimension equals `dimension`, sorted by checkpoint_idx.
+IndexDatabase::query_chunk_dimension_stats_for_dimension(
+    int file_id, std::string_view dimension) const {
+    std::vector<ChunkDimensionStatsResult> results;
+    const auto prefix = prefix_for_file(file_id);  // the scan covers every key under this file's prefix, whatever its dimension
+    scan_prefix(*db_, cf::CHUNK_DIM_STATS, prefix,
+                [&](::rocksdb::Iterator& it) {
+                    auto decoded = decode_chunk_dimension_stats_value(  // every entry is decoded before the dimension filter below is applied
+                        iterator_key(it), iterator_value(it));
+                    if (decoded.dimension == dimension) {
+                        results.push_back(std::move(decoded));
+                    }
+                });
+    std::sort(results.begin(), results.end(),
+              [](const auto& lhs, const auto& rhs) {
+                  return lhs.checkpoint_idx < rhs.checkpoint_idx;  // only checkpoint_idx is compared here
+              });
+    return results;
 }
 
-std::vector<IndexDatabase::EventRangeResult> IndexDatabase::query_event_ranges(
+std::vector<EventRangeResult> IndexDatabase::query_event_ranges(
     int file_id) const {
     std::vector<EventRangeResult> results;
     std::string prefix("E|");
     rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));
-    scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) {
+    scan_prefix(*db_, cf::MANIFEST, prefix, [&](::rocksdb::Iterator& it) {
         auto key = iterator_key(it);
         auto payload = std::string_view(key).substr(2 + 4 + 8);
         auto split = payload.find('\0');
@@ -1319,8 +1720,7 @@ std::vector<IndexDatabase::EventRangeResult> IndexDatabase::query_event_ranges(
     return results;
 }
 
-std::vector<IndexDatabase::EventRangeResult>
-IndexDatabase::query_event_ranges_for_checkpoint(
+std::vector<EventRangeResult> IndexDatabase::query_event_ranges_for_checkpoint(
     int file_id, std::uint64_t checkpoint_idx) const {
     std::vector<EventRangeResult> results;
     for (auto& range : query_event_ranges(file_id)) {
@@ -1331,12 +1731,12 @@ IndexDatabase::query_event_ranges_for_checkpoint(
     return results;
 }
 
-std::vector<IndexDatabase::MetadataLinesResult>
-IndexDatabase::query_metadata_lines(int file_id) const {
+std::vector<MetadataLinesResult> IndexDatabase::query_metadata_lines(
+    int file_id) const {
     std::vector<MetadataLinesResult> results;
     std::string prefix("M|");
     rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));
-    scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) {
+    scan_prefix(*db_, cf::MANIFEST, prefix, [&](::rocksdb::Iterator& it) {
         auto key = iterator_key(it);
         MetadataLinesResult result;
         result.checkpoint_idx =
@@ -1355,7 +1755,7 @@ IndexDatabase::query_metadata_lines(int file_id) const {
     return results;
 }
 
-std::vector<IndexDatabase::MetadataLinesResult>
+std::vector<MetadataLinesResult>
 IndexDatabase::query_metadata_lines_for_checkpoint(
     int file_id, std::uint64_t checkpoint_idx) const {
     std::vector<MetadataLinesResult> results;
@@ -1367,36 +1767,77 @@ IndexDatabase::query_metadata_lines_for_checkpoint(
     return results;
 }
 
-void IndexDatabase::delete_event_ranges(int file_id) {
-    std::vector<std::string> keys;
-    std::string prefix("E|");
-    rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));
-    scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) {
-        keys.push_back(iterator_key(it));
-    });
-    for (const auto& key : keys) {
-        auto status = txn_batch_ ? db_->del(*txn_batch_, "manifest", key)
-                                 : db_->del(key, "manifest");
-        if (!status.ok()) {
-            throw_db_error("Failed to delete manifest event ranges", status);
+/// Returns the PIDs recorded for `file_id`, read from the manifest CF value
+/// stored under "P|" + file_id (4 bytes BE) and laid out as varint(count)
+/// followed by `count` varint PIDs. A missing key yields an empty set.
+std::unordered_set<std::uint64_t> IndexDatabase::query_file_pids(
+    int file_id) const {
+    std::unordered_set<std::uint64_t> pids;
+    std::string key("P|");
+    rocks::KeyCodec::append_be32(key, static_cast<std::uint32_t>(file_id));
+    std::string value;
+    auto status = db_->get(key, &value, cf::MANIFEST);
+    if (status.IsNotFound()) return pids;
+    if (!status.ok()) throw_db_error("Failed to read file PIDs", status);
+
+    // Corrupt values are tolerated: a varint ends after 10 bytes (shifting a
+    // 64-bit value by >= 64 is undefined), the count is clamped to the bytes
+    // left (every PID takes >= 1), and decoding stops at the end of the value.
+    std::size_t off = 0;
+    auto decode_varint = [&value, &off]() -> std::uint64_t {
+        std::uint64_t v = 0;
+        unsigned shift = 0;
+        while (off < value.size() && shift < 64) {
+            auto b = static_cast<std::uint8_t>(value[off++]);
+            v |= static_cast<std::uint64_t>(b & 0x7F) << shift;
+            if ((b & 0x80) == 0) return v;
+            shift += 7;
         }
+        return v;
+    };
+    auto count = decode_varint();
+    if (count > value.size() - off) count = value.size() - off;
+    pids.reserve(static_cast<std::size_t>(count));
+    for (std::uint64_t i = 0; i < count && off < value.size(); ++i) {
+        pids.insert(decode_varint());
     }
+    return pids;
 }
 
-void IndexDatabase::delete_metadata_lines(int file_id) {
-    std::vector<std::string> keys;
-    std::string prefix("M|");
-    rocks::KeyCodec::append_be32(prefix, static_cast<std::uint32_t>(file_id));
-    scan_prefix(*db_, "manifest", prefix, [&](::rocksdb::Iterator& it) {
-        keys.push_back(iterator_key(it));
-    });
-    for (const auto& key : keys) {
-        auto status = txn_batch_ ? db_->del(*txn_batch_, "manifest", key)
-                                 : db_->del(key, "manifest");
-        if (!status.ok()) {
-            throw_db_error("Failed to delete metadata lines", status);
+/// Scans every manifest entry keyed "P|" + file_id (4 bytes BE) and returns
+/// file_id -> PID set; each value is varint(count) + `count` varint PIDs.
+std::unordered_map<int, std::unordered_set<std::uint64_t>>
+IndexDatabase::query_all_file_pids() const {
+    std::unordered_map<int, std::unordered_set<std::uint64_t>> result;
+    std::string prefix("P|");
+    scan_prefix(*db_, cf::MANIFEST, prefix, [&](::rocksdb::Iterator& it) {
+        auto key = iterator_key(it);
+        auto file_id = static_cast<int>(
+            rocks::KeyCodec::decode_be32(std::string_view(key).substr(2, 4)));
+        auto value = iterator_value(it);
+        std::size_t off = 0;
+        // Corrupt-value guards: varint capped at 10 bytes, count clamped below.
+        auto decode_varint = [&value, &off]() -> std::uint64_t {
+            std::uint64_t v = 0;
+            unsigned shift = 0;
+            while (off < value.size() && shift < 64) {
+                auto b = static_cast<std::uint8_t>(value[off++]);
+                v |= static_cast<std::uint64_t>(b & 0x7F) << shift;
+                if ((b & 0x80) == 0) return v;
+                shift += 7;
+            }
+            return v;
+        };
+        auto count = decode_varint();
+        if (count > value.size() - off) count = value.size() - off;
+        std::unordered_set<std::uint64_t> pids;
+        pids.reserve(static_cast<std::size_t>(count));
+        for (std::uint64_t i = 0; i < count && off < value.size(); ++i) {
+            pids.insert(decode_varint());
         }
-    }
+        result[file_id] = std::move(pids);
+    });
+    return result;
 }
 
 std::uint64_t IndexDatabase::get_total_events(int file_id) const {
@@ -1407,23 +1848,9 @@ std::uint64_t IndexDatabase::get_total_events(int file_id) const {
     return total > 0 ? total : get_num_lines(file_id);
 }
 
-void IndexDatabase::insert_file_metadata(int file_id,
-                                         std::uint64_t checkpoint_size,
-                                         std::uint64_t total_lines,
-                                         std::uint64_t total_uc_size) {
-    const auto key = metadata_key(file_id);
-    const auto value =
-        encode_metadata_record(checkpoint_size, total_lines, total_uc_size);
-    auto status = txn_batch_ ? db_->put(*txn_batch_, "metadata", key, value)
-                             : db_->put(key, value, "metadata");
-    if (!status.ok()) {
-        throw_db_error("Failed to insert metadata", status);
-    }
-}
-
 std::uint64_t IndexDatabase::get_checkpoint_size(int file_id) const {
     std::string value;
-    auto status = db_->get(metadata_key(file_id), &value, "metadata");
+    auto status = db_->get(metadata_key(file_id), &value, cf::METADATA);
     if (status.IsNotFound()) {
         return 0;
     }
@@ -1435,7 +1862,7 @@ std::uint64_t IndexDatabase::get_checkpoint_size(int file_id) const {
 
 std::uint64_t IndexDatabase::get_num_lines(int file_id) const {
     std::string value;
-    auto status = db_->get(metadata_key(file_id), &value, "metadata");
+    auto status = db_->get(metadata_key(file_id), &value, cf::METADATA);
     if (status.IsNotFound()) {
         return 0;
     }
@@ -1447,7 +1874,7 @@ std::uint64_t IndexDatabase::get_num_lines(int file_id) const {
 
 std::uint64_t IndexDatabase::get_max_bytes(int file_id) const {
     std::string value;
-    auto status = db_->get(metadata_key(file_id), &value, "metadata");
+    auto status = db_->get(metadata_key(file_id), &value, cf::METADATA);
     if (status.IsNotFound()) {
         return 0;
     }
@@ -1457,52 +1884,139 @@ std::uint64_t IndexDatabase::get_max_bytes(int file_id) const {
     return decode_metadata_record(value)[2];
 }
 
-void IndexDatabase::delete_file_data(int file_id) {
-    auto delete_default_key = [&](std::string_view key) {
-        auto del_status =
-            txn_batch_ ? db_->del(*txn_batch_, "default", key) : db_->del(key);
-        if (!del_status.ok() && !del_status.IsNotFound()) {
-            throw_db_error("Failed to delete file registry entry", del_status);
-        }
-    };
+void IndexDatabase::ensure_hash_tables_cached() const {
+    if (!hash_cache_) {  // NOTE(review): created before any lock is held;
+        hash_cache_ = std::make_unique<HashCache>();  // confirm first calls
+    }  // cannot overlap across threads.
 
-    const auto logical_name_key = file_reverse_key(file_id);
-    std::string logical_name;
-    auto status = db_->get(logical_name_key, &logical_name);
-    if (status.ok()) {
-        delete_default_key(file_lookup_key(logical_name));
-        delete_default_key(logical_name_key);
-    } else if (!status.IsNotFound()) {
-        throw_db_error("Failed to read reverse file registry", status);
+    {
+        std::shared_lock lock(hash_cache_->mutex);  // read-only check first
+        if (hash_cache_->loaded) return;
     }
 
-    auto delete_prefix = [&](std::string_view cf, std::string_view prefix) {
-        std::vector<std::string> keys;
-        scan_prefix(*db_, cf, prefix, [&](::rocksdb::Iterator& it) {
-            keys.push_back(iterator_key(it));
-        });
-        for (const auto& key : keys) {
-            auto del_status =
-                txn_batch_ ? db_->del(*txn_batch_, cf, key) : db_->del(key, cf);
-            if (!del_status.ok() && !del_status.IsNotFound()) {
-                throw_db_error("Failed to delete file-scoped RocksDB data",
-                               del_status);
-            }
-        }
-    };
+    std::unique_lock lock(hash_cache_->mutex);  // exclusive while loading
+    if (hash_cache_->loaded) return;  // re-check under the exclusive lock
+
+    scan_prefix(*db_, cf::HASH_TABLES, "", [this](::rocksdb::Iterator& it) {
+        auto key = iterator_key(it);
+        if (key.empty()) return;
+        auto type = static_cast<std::uint8_t>(key[0]);  // table selector
+        auto payload = key.substr(1);  // remainder is the map key
+        auto value = iterator_value(it);
 
-    delete_prefix("checkpoints", prefix_for_file(file_id));
-    delete_prefix("metadata", prefix_for_file(file_id));
-    delete_prefix("archives", prefix_for_file(file_id));
-    delete_prefix("tar_files", prefix_for_file(file_id));
-    delete_prefix("chunk_bloom", prefix_for_file(file_id));
-    delete_prefix("file_bloom", prefix_for_file(file_id));
-    delete_prefix("chunk_stats", prefix_for_file(file_id));
-    delete_prefix("chunk_dim_stats", prefix_for_file(file_id));
-    delete_prefix("dimensions", std::string("d|") + prefix_for_file(file_id));
-    delete_prefix("manifest", std::string("E|") + prefix_for_file(file_id));
-    delete_prefix("manifest", std::string("M|") + prefix_for_file(file_id));
-    delete_hash_resolutions(file_id);
+        switch (type) {
+            case 0:  // 0-3 fill the *_hash maps
+                hash_cache_->file_hash.emplace(payload, value);
+                break;
+            case 1:
+                hash_cache_->host_hash.emplace(payload, value);
+                break;
+            case 2:
+                hash_cache_->string_hash.emplace(payload, value);
+                break;
+            case 3:
+                hash_cache_->proc_hash.emplace(payload, value);
+                break;
+            case 4:  // 4-7 fill the *_name maps
+                hash_cache_->file_name.emplace(payload, value);
+                break;
+            case 5:
+                hash_cache_->host_name.emplace(payload, value);
+                break;
+            case 6:
+                hash_cache_->string_name.emplace(payload, value);
+                break;
+            case 7:
+                hash_cache_->proc_name.emplace(payload, value);
+                break;
+            default:  // any other leading byte is skipped
+                break;
+        }
+    });
+    hash_cache_->loaded = true;  // later calls return at the first check
+}
+
+/// Returns a copy of the cached hash-keyed table for `type`, or an empty map
+/// when `type` is not one of FILE/HOST/STRING/PROC. The cache is loaded on
+/// demand by ensure_hash_tables_cached().
+std::unordered_map<std::string, std::string> IndexDatabase::query_hash_table(
+    HashType type) const {
+    ensure_hash_tables_cached();
+    std::shared_lock guard(hash_cache_->mutex);
+    const HashCache& tables = *hash_cache_;
+    // The copy is made while the shared lock is still held.
+    if (type == HashType::FILE) return tables.file_hash;
+    if (type == HashType::HOST) return tables.host_hash;
+    if (type == HashType::STRING) return tables.string_hash;
+    if (type == HashType::PROC) return tables.proc_hash;
+
+    return {};
+}
+
+/// Looks `hash` up in the cached hash-keyed table for `type` and returns the
+/// mapped value, or std::nullopt when the type is unrecognised or the hash
+/// is not present.
+std::optional<std::string> IndexDatabase::resolve_hash(
+    HashType type, std::string_view hash) const {
+    ensure_hash_tables_cached();
+    std::shared_lock guard(hash_cache_->mutex);
+
+    using Table = std::unordered_map<std::string, std::string>;
+    const Table* table = nullptr;
+    if (type == HashType::FILE) {
+        table = &hash_cache_->file_hash;
+    } else if (type == HashType::HOST) {
+        table = &hash_cache_->host_hash;
+    } else if (type == HashType::STRING) {
+        table = &hash_cache_->string_hash;
+    } else if (type == HashType::PROC) {
+        table = &hash_cache_->proc_hash;
+    }
+    if (table == nullptr) return std::nullopt;
+
+    const auto found = table->find(std::string(hash));
+    if (found == table->end()) return std::nullopt;
+    return found->second;
+}
+
+/// Looks `name` up in the cached name-keyed table for `type` and returns the
+/// mapped value, or std::nullopt when the type is unrecognised or the name
+/// is not present.
+std::optional<std::string> IndexDatabase::resolve_name_to_hash(
+    HashType type, std::string_view name) const {
+    ensure_hash_tables_cached();
+    std::shared_lock guard(hash_cache_->mutex);
+
+    using Table = std::unordered_map<std::string, std::string>;
+    const Table* table = nullptr;
+    if (type == HashType::FILE) {
+        table = &hash_cache_->file_name;
+    } else if (type == HashType::HOST) {
+        table = &hash_cache_->host_name;
+    } else if (type == HashType::STRING) {
+        table = &hash_cache_->string_name;
+    } else if (type == HashType::PROC) {
+        table = &hash_cache_->proc_name;
+    }
+    if (table == nullptr) return std::nullopt;
+
+    const auto found = table->find(std::string(name));
+    if (found == table->end()) return std::nullopt;
+    return found->second;
+}
+
+/// Returns copies of the four cached hash-keyed tables, keyed by HashType.
+std::unordered_map<IndexDatabase::HashType,
+                   std::unordered_map<std::string, std::string>>
+IndexDatabase::query_all_hash_tables() const {
+    ensure_hash_tables_cached();
+    std::shared_lock guard(hash_cache_->mutex);
+    decltype(query_all_hash_tables()) tables;
+    tables.emplace(HashType::FILE, hash_cache_->file_hash);
+    tables.emplace(HashType::HOST, hash_cache_->host_hash);
+    tables.emplace(HashType::STRING, hash_cache_->string_hash);
+    tables.emplace(HashType::PROC, hash_cache_->proc_hash);
+    return tables;
 }
 
 }  // namespace dftracer::utils::utilities::indexer
diff --git a/src/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp b/src/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp
new file mode 100644
index 00000000..d77473d3
--- /dev/null
+++ b/src/dftracer/utils/utilities/indexer/index_database_sst_writer_context.cpp
@@ -0,0 +1,399 @@
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/error.h>
+#include <dftracer/utils/utilities/indexer/internal/index_encoding.h>
+#include <dftracer/utils/utilities/indexer/internal/statistics_codec.h>
+#include <rocksdb/sst_file_writer.h>
+
+#include <algorithm>
+#include <stdexcept>
+
+namespace dftracer::utils::utilities::indexer {
+
+namespace {
+
+namespace encoding = internal::encoding;
+
+[[noreturn]] void throw_sst_error(std::string_view message,
+                                  const ::rocksdb::Status& status) {
+    using Error = internal::IndexerError;  // raised as "<message>: <status>"
+    throw Error(Error::Type::DATABASE_ERROR,
+                std::string(message) + ": " + status.ToString());
+}
+
+/// Writes `buffer` to a new SST file at `path` and returns `path`. The buffer
+/// is sorted and de-duplicated by key in place; a failed open, append or
+/// finish is reported through throw_sst_error.
+std::string emit_sst(const std::string& path,
+                     std::vector<std::pair<std::string, std::string>>& buffer) {
+    // SstFileWriter requires strict ascending keys. Buffers like
+    // name_dictionary and hash_tables collect duplicate (key, value) pairs
+    // when a batch spans multiple files that share event names / content
+    // hashes. Ordering by (key, value) instead of key alone makes the entry
+    // kept for a repeated key independent of insertion order.
+    std::sort(buffer.begin(), buffer.end());
+    buffer.erase(std::unique(buffer.begin(), buffer.end(),
+                             [](const auto& a, const auto& b) {
+                                 return a.first == b.first;
+                             }),
+                 buffer.end());
+
+    ::rocksdb::EnvOptions env_opts;
+    ::rocksdb::Options writer_options(
+        rocksdb::RocksDatabase::default_options(),
+        rocksdb::RocksDatabase::default_column_family_options());
+    ::rocksdb::SstFileWriter writer(env_opts, writer_options);
+
+    auto status = writer.Open(path);
+    if (!status.ok()) {
+        throw_sst_error("Failed to open SST writer at '" + path + "'", status);
+    }
+    for (const auto& [key, value] : buffer) {
+        status = writer.Put(key, value);
+        if (!status.ok()) {
+            throw_sst_error("Failed to append to SST '" + path + "'", status);
+        }
+    }
+    status = writer.Finish();
+    if (!status.ok()) {
+        throw_sst_error("Failed to finalize SST '" + path + "'", status);
+    }
+    return path;
+}
+
+/// Emit a mixed Put+Merge SST for the AGGREGATION / SYSTEM_METRICS CFs and
+/// return `path`. `buffer` is sorted by key in place; each entry's `is_merge`
+/// decides whether to call SstFileWriter::Merge (operand, combined by
+/// merge_operator at read/compaction time) or ::Put (overrides prior values).
+/// NOTE(review): unlike emit_sst, repeated keys are not collapsed here -
+/// presumably callers never stage the same key twice; confirm.
+std::string emit_mixed_sst(
+    const std::string& path,
+    std::vector<IndexDatabaseSstWriterContext::MergeableKeyValue>& buffer) {
+    std::sort(buffer.begin(), buffer.end(),
+              [](const auto& x, const auto& y) { return x.key < y.key; });
+
+    ::rocksdb::EnvOptions env_opts;
+    ::rocksdb::Options writer_options(
+        rocksdb::RocksDatabase::default_options(),
+        rocksdb::RocksDatabase::default_column_family_options());
+    ::rocksdb::SstFileWriter writer(env_opts, writer_options);
+
+    // Shared failure path: "<what><path>'" plus the status text.
+    const auto require_ok = [&path](const ::rocksdb::Status& st,
+                                    const char* what) {
+        if (!st.ok()) throw_sst_error(what + path + "'", st);
+    };
+
+    require_ok(writer.Open(path), "Failed to open SST writer at '");
+    for (const auto& kv : buffer) {
+        require_ok(kv.is_merge ? writer.Merge(kv.key, kv.value)
+                               : writer.Put(kv.key, kv.value),
+                   "Failed to append to SST '");
+    }
+    require_ok(writer.Finish(), "Failed to finalize SST '");
+    return path;
+}
+
+}  // namespace
+
+namespace {
+
+/// Relocate `src` to `dst` (whose parent directory must exist). Rename is
+/// tried first; on any failure (e.g. cross-FS) the file is copied, then unlinked.
+void move_file(const fs::path& src, const fs::path& dst) {
+    std::error_code rename_ec;
+    fs::rename(src, dst, rename_ec);
+    if (!rename_ec) return;
+
+    std::error_code copy_ec;
+    const auto overwrite = fs::copy_options::overwrite_existing;
+    if (fs::copy_file(src, dst, overwrite, copy_ec); copy_ec) {
+        const auto reason = copy_ec.message();
+        throw std::runtime_error("Failed to move SST '" + src.string() +
+                                 "' to '" + dst.string() + "': " + reason);
+    }
+    fs::remove(src, copy_ec);  // best-effort; staging cleanup handled by caller
+}
+
+// Moves src_slot's file (if set) into dest_dir; dst_slot takes the new path.
+void move_one(const fs::path& dest_dir, std::optional<std::string>& src_slot,
+              std::optional<std::string>& dst_slot) {
+    if (!src_slot) return;
+    const fs::path to = dest_dir / fs::path(*src_slot).filename();
+    move_file(*src_slot, to);
+    dst_slot = to.string();
+    src_slot.reset();
+}
+
+}  // namespace
+
+/// Moves every SST path held by this object into `dest_dir` (created if
+/// needed) and returns an Artifacts with the new locations.
+IndexDatabaseSstWriterContext::Artifacts
+IndexDatabaseSstWriterContext::Artifacts::move_to(
+    std::string_view dest_dir) && {
+    const fs::path target(dest_dir);
+    std::error_code mkdir_ec;
+    if (fs::create_directories(target, mkdir_ec); mkdir_ec) {
+        throw std::runtime_error("Failed to create SST move destination '" +
+                                 std::string(dest_dir) +
+                                 "': " + mkdir_ec.message());
+    }
+    Artifacts moved;
+    move_one(target, metadata_sst, moved.metadata_sst);
+    move_one(target, checkpoints_sst, moved.checkpoints_sst);
+    move_one(target, manifest_sst, moved.manifest_sst);
+    move_one(target, chunk_bloom_sst, moved.chunk_bloom_sst);
+    move_one(target, file_bloom_sst, moved.file_bloom_sst);
+    move_one(target, chunk_stats_sst, moved.chunk_stats_sst);
+    move_one(target, chunk_dim_stats_sst, moved.chunk_dim_stats_sst);
+    move_one(target, dimensions_sst, moved.dimensions_sst);
+    move_one(target, file_scalar_stats_sst, moved.file_scalar_stats_sst);
+    move_one(target, file_cat_counts_sst, moved.file_cat_counts_sst);
+    move_one(target, file_pid_tid_counts_sst, moved.file_pid_tid_counts_sst);
+    move_one(target, file_name_counts_sst, moved.file_name_counts_sst);
+    move_one(target, name_dictionary_sst, moved.name_dictionary_sst);
+    move_one(target, name_file_postings_sst, moved.name_file_postings_sst);
+    move_one(target, name_chunk_postings_sst, moved.name_chunk_postings_sst);
+    move_one(target, hash_tables_sst, moved.hash_tables_sst);
+    move_one(target, aggregation_sst, moved.aggregation_sst);
+    move_one(target, system_metrics_sst, moved.system_metrics_sst);
+    return moved;
+}
+
+// Takes ownership of both strings and creates <staging_dir>/<batch_id>.
+IndexDatabaseSstWriterContext::IndexDatabaseSstWriterContext(
+    std::string staging_dir, std::string batch_id)
+    : staging_dir_(std::move(staging_dir)), batch_id_(std::move(batch_id)) {
+    std::error_code ec;
+    if (fs::create_directories(fs::path(staging_dir_) / batch_id_, ec); ec) {
+        throw std::runtime_error("Failed to create SST staging dir '" +
+                                 staging_dir_ + "/" + batch_id_ +
+                                 "': " + ec.message());
+    }
+}
+
+IndexDatabaseSstWriterContext::IndexDatabaseSstWriterContext(
+    IndexDatabaseSstWriterContext&&) noexcept = default;  // move constructor
+IndexDatabaseSstWriterContext& IndexDatabaseSstWriterContext::operator=(
+    IndexDatabaseSstWriterContext&&) noexcept = default;  // move assignment
+IndexDatabaseSstWriterContext::~IndexDatabaseSstWriterContext() = default;
+
+void IndexDatabaseSstWriterContext::insert_file_metadata(
+    int file_id, std::uint64_t checkpoint_size, std::uint64_t total_lines,
+    std::uint64_t total_uc_size) {
+    // Staged here; commit() emits metadata_buf_ as metadata.sst.
+    auto rec = encoding::encode_metadata_record(checkpoint_size, total_lines,
+                                                total_uc_size);
+    metadata_buf_.emplace_back(encoding::metadata_key(file_id), std::move(rec));
+}
+
+void IndexDatabaseSstWriterContext::insert_checkpoint(
+    int file_id, const IndexerCheckpoint& checkpoint) {  // -> checkpoints.sst
+    auto key = encoding::checkpoint_key(file_id, checkpoint.uc_offset,
+                                        checkpoint.checkpoint_idx);
+    checkpoints_buf_.emplace_back(
+        std::move(key), encoding::encode_checkpoint_value(checkpoint));
+}
+
+void IndexDatabaseSstWriterContext::insert_event_range(
+    int file_id, std::uint64_t checkpoint_idx, std::string_view cat,
+    std::string_view name, std::span<const std::uint32_t> line_numbers) {
+    auto key = encoding::manifest_event_key(file_id, checkpoint_idx, cat, name);
+    auto val = encoding::encode_event_range_value(line_numbers);
+    manifest_buf_.emplace_back(std::move(key), std::move(val));  // manifest.sst
+}
+
+void IndexDatabaseSstWriterContext::insert_metadata_lines(
+    int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type,
+    std::span<const std::uint32_t> line_numbers) {
+    manifest_buf_.emplace_back(  // shares manifest_buf_ with event ranges
+        encoding::manifest_metadata_key(file_id, checkpoint_idx, meta_type),
+        encoding::encode_metadata_value(line_numbers));
+}
+
+void IndexDatabaseSstWriterContext::insert_file_pids(
+    int file_id, const std::unordered_set<std::uint64_t>& pids) {
+    manifest_buf_.emplace_back(encoding::file_pids_key(file_id),
+                               encoding::encode_file_pids_value(pids));
+}  // staged in manifest_buf_ until commit()
+
+void IndexDatabaseSstWriterContext::insert_chunk_bloom_filter(
+    int file_id, std::uint64_t checkpoint_idx, std::string_view dimension,
+    std::span<const unsigned char> blob_data, std::uint64_t num_entries) {
+    auto key = encoding::chunk_bloom_key(file_id, dimension, checkpoint_idx);
+    auto value = encoding::encode_bloom_value(blob_data, num_entries);
+    chunk_bloom_buf_.emplace_back(std::move(key), std::move(value));
+}
+
+void IndexDatabaseSstWriterContext::insert_file_bloom_filter(
+    int file_id, std::string_view dimension,
+    std::span<const unsigned char> blob_data, std::uint64_t num_entries) {
+    auto key = encoding::file_bloom_key(file_id, dimension);
+    auto value = encoding::encode_bloom_value(blob_data, num_entries);
+    file_bloom_buf_.emplace_back(std::move(key), std::move(value));
+}
+
+void IndexDatabaseSstWriterContext::insert_chunk_statistics(
+    int file_id, std::uint64_t checkpoint_idx, const ChunkStatistics& stats) {
+    auto key = encoding::chunk_stats_key(file_id, checkpoint_idx);
+    auto value = encoding::encode_chunk_statistics_value(stats);
+    chunk_stats_buf_.emplace_back(std::move(key), std::move(value));
+}
+
+void IndexDatabaseSstWriterContext::insert_file_scalar_stats(
+    int file_id, const ChunkStatistics& stats, std::uint64_t num_chunks) {
+    auto key = encoding::file_scalar_stats_key(file_id);
+    auto value = internal::encode_file_scalar_stats_value(stats, num_chunks);
+    file_scalar_stats_buf_.emplace_back(std::move(key), std::move(value));
+}
+
+void IndexDatabaseSstWriterContext::insert_file_category_counts(
+    int file_id, const StringViewMap<std::uint64_t>& counts) {
+    auto key = encoding::file_category_counts_key(file_id);
+    auto value = encoding::encode_count_map_value(counts);
+    file_cat_counts_buf_.emplace_back(std::move(key), std::move(value));
+}
+
+void IndexDatabaseSstWriterContext::insert_file_pid_tid_counts(
+    int file_id, const StringViewMap<std::uint64_t>& counts) {
+    auto key = encoding::file_pid_tid_counts_key(file_id);
+    auto value = encoding::encode_count_map_value(counts);
+    file_pid_tid_counts_buf_.emplace_back(std::move(key), std::move(value));
+}
+
+void IndexDatabaseSstWriterContext::insert_file_name_counts(
+    int file_id, const StringViewMap<std::uint64_t>& counts) {
+    auto summary = encoding::encode_name_summary_value(
+        counts, /*other_count=*/0, /*unique_count=*/counts.size());
+    file_name_counts_buf_.emplace_back(encoding::file_name_counts_key(file_id),
+                                       std::move(summary));
+}
+
+void IndexDatabaseSstWriterContext::insert_index_dimension(
+    int file_id, std::string_view dimension) {
+    auto key = encoding::make_dimension_key(file_id, dimension);
+    dimensions_buf_.emplace_back(std::move(key), std::string{});  // key-only
+}
+
+void IndexDatabaseSstWriterContext::insert_chunk_dimension_stats(
+    int file_id, std::uint64_t checkpoint_idx, const ChunkDimensionStats& stats,
+    std::size_t value_counts_cap) {  // cap is forwarded to the encoder
+    chunk_dim_stats_buf_.emplace_back(
+        encoding::chunk_dim_stats_key(file_id, checkpoint_idx, stats.dimension),
+        encoding::encode_chunk_dimension_stats_value(stats, value_counts_cap));
+}
+
+void IndexDatabaseSstWriterContext::insert_name_dictionary_entry(
+    std::uint64_t name_id, std::string_view name) {
+    auto& buf = name_dictionary_buf_;  // lookup row first, then reverse row
+    buf.emplace_back(
+        encoding::name_lookup_key(name),
+        ::dftracer::utils::rocksdb::KeyCodec::encode_be64(name_id));
+    buf.emplace_back(encoding::name_reverse_key(name_id), std::string(name));
+}
+
+void IndexDatabaseSstWriterContext::insert_name_file_posting(
+    std::uint64_t name_id, int file_id) {
+    // Both rows are key-only: the posting key and the owner key.
+    auto& buf = name_file_postings_buf_;
+    buf.emplace_back(encoding::name_file_posting_key(name_id, file_id), "");
+    buf.emplace_back(encoding::name_file_owner_key(file_id, name_id), "");
+}
+
+void IndexDatabaseSstWriterContext::insert_name_chunk_posting(
+    std::uint64_t name_id, int file_id, std::uint64_t checkpoint_idx) {
+    // Both rows are key-only: the posting key and the owner key.
+    auto& buf = name_chunk_postings_buf_;
+    buf.emplace_back(
+        encoding::name_chunk_posting_key(name_id, file_id, checkpoint_idx), "");
+    buf.emplace_back(
+        encoding::name_chunk_owner_key(file_id, name_id, checkpoint_idx), "");
+}
+
+void IndexDatabaseSstWriterContext::insert_hash_table_entry(
+    std::uint8_t type, std::string_view hash, std::string_view name) {
+    // Forward row: (type, hash) -> name; reverse row: (type, name) -> hash.
+    auto& buf = hash_tables_buf_;
+    buf.emplace_back(encoding::hash_table_forward_key(type, hash), name);
+    buf.emplace_back(encoding::hash_table_reverse_key(type, name), hash);
+}
+
+void IndexDatabaseSstWriterContext::insert_aggregation_merge(
+    std::string_view key, std::string_view operand) {
+    MergeableKeyValue entry{std::string(key), std::string(operand), true};
+    aggregation_buf_.push_back(std::move(entry));  // is_merge = true
+}
+
+void IndexDatabaseSstWriterContext::insert_aggregation_put(
+    std::string_view key, std::string_view value) {
+    MergeableKeyValue entry{std::string(key), std::string(value), false};
+    aggregation_buf_.push_back(std::move(entry));  // is_merge = false
+}
+
+void IndexDatabaseSstWriterContext::insert_system_metrics_merge(
+    std::string_view key, std::string_view operand) {
+    MergeableKeyValue entry{std::string(key), std::string(operand), true};
+    system_metrics_buf_.push_back(std::move(entry));  // is_merge = true
+}
+
+// Writes every non-empty staged buffer to <staging_dir>/<batch_id>/<name>.sst
+// and returns the paths. Only the first call does work, even if it threw part
+// way (committed_ is set up front); later calls return an empty Artifacts.
+IndexDatabaseSstWriterContext::Artifacts
+IndexDatabaseSstWriterContext::commit() {
+    Artifacts out;
+    if (committed_) return out;
+    committed_ = true;
+    const auto batch_dir = fs::path(staging_dir_) / batch_id_;
+    // Key/value buffers go through emit_sst.
+    auto emit_into = [&](const char* name, std::vector<KeyValue>& buf,
+                         std::optional<std::string>& slot) {
+        if (!buf.empty()) {
+            slot = emit_sst((batch_dir / name).string(), buf);
+            buf.clear();
+            buf.shrink_to_fit();
+        }
+    };
+    emit_into("metadata.sst", metadata_buf_, out.metadata_sst);
+    emit_into("checkpoints.sst", checkpoints_buf_, out.checkpoints_sst);
+    emit_into("manifest.sst", manifest_buf_, out.manifest_sst);
+    emit_into("chunk_bloom.sst", chunk_bloom_buf_, out.chunk_bloom_sst);
+    emit_into("file_bloom.sst", file_bloom_buf_, out.file_bloom_sst);
+    emit_into("chunk_stats.sst", chunk_stats_buf_, out.chunk_stats_sst);
+    emit_into("chunk_dim_stats.sst", chunk_dim_stats_buf_,
+              out.chunk_dim_stats_sst);
+    emit_into("dimensions.sst", dimensions_buf_, out.dimensions_sst);
+    emit_into("file_scalar_stats.sst", file_scalar_stats_buf_,
+              out.file_scalar_stats_sst);
+    emit_into("file_cat_counts.sst", file_cat_counts_buf_,
+              out.file_cat_counts_sst);
+    emit_into("file_pid_tid_counts.sst", file_pid_tid_counts_buf_,
+              out.file_pid_tid_counts_sst);
+    emit_into("file_name_counts.sst", file_name_counts_buf_,
+              out.file_name_counts_sst);
+    emit_into("name_dictionary.sst", name_dictionary_buf_,
+              out.name_dictionary_sst);
+    emit_into("name_file_postings.sst", name_file_postings_buf_,
+              out.name_file_postings_sst);
+    emit_into("name_chunk_postings.sst", name_chunk_postings_buf_,
+              out.name_chunk_postings_sst);
+    emit_into("hash_tables.sst", hash_tables_buf_, out.hash_tables_sst);
+    // Put/Merge buffers go through emit_mixed_sst.
+    auto emit_mixed_into = [&](const char* name,
+                               std::vector<MergeableKeyValue>& buf,
+                               std::optional<std::string>& slot) {
+        if (!buf.empty()) {
+            slot = emit_mixed_sst((batch_dir / name).string(), buf);
+            buf.clear();
+            buf.shrink_to_fit();
+        }
+    };
+    emit_mixed_into("aggregation.sst", aggregation_buf_, out.aggregation_sst);
+    emit_mixed_into("system_metrics.sst", system_metrics_buf_,
+                    out.system_metrics_sst);
+    return out;
+}
+
+}  // namespace dftracer::utils::utilities::indexer
diff --git a/src/dftracer/utils/utilities/indexer/index_database_writer_context.cpp b/src/dftracer/utils/utilities/indexer/index_database_writer_context.cpp
new file mode 100644
index 00000000..611eb2f9
--- /dev/null
+++ b/src/dftracer/utils/utilities/indexer/index_database_writer_context.cpp
@@ -0,0 +1,1279 @@
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/rocksdb/key_codec.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h>
+#include <dftracer/utils/utilities/hash/fnv1a_hasher_utility.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
+#include <dftracer/utils/utilities/indexer/internal/error.h>
+#include <dftracer/utils/utilities/indexer/internal/index_batch_writer.h>
+#include <dftracer/utils/utilities/indexer/internal/index_encoding.h>
+#include <dftracer/utils/utilities/indexer/internal/payload_codec.h>
+#include <dftracer/utils/utilities/indexer/internal/scan_prefix.h>
+#include <dftracer/utils/utilities/indexer/internal/statistics_codec.h>
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <optional>
+#include <stdexcept>
+#include <unordered_set>
+#include <utility>
+
+namespace dftracer::utils::utilities::indexer {
+
+namespace rocks = dftracer::utils::rocksdb;
+namespace cf = rocks::cf;
+
+using namespace internal;
+
+namespace {
+
+using encoding::checkpoint_key;
+using encoding::chunk_bloom_key;
+using encoding::chunk_dim_stats_key;
+using encoding::chunk_stats_key;
+using encoding::encode_bloom_value;
+using encoding::encode_checkpoint_value;
+using encoding::encode_chunk_dimension_stats_value;
+using encoding::encode_chunk_statistics_value;
+using encoding::encode_count_map_value;
+using encoding::encode_event_range_value;
+using encoding::encode_metadata_record;
+using encoding::encode_metadata_value;
+using encoding::encode_name_summary_value;
+using encoding::file_bloom_key;
+using encoding::file_category_counts_key;
+using encoding::file_name_counts_key;
+using encoding::file_pid_tid_counts_key;
+using encoding::file_scalar_stats_key;
+using encoding::make_dimension_key;
+using encoding::manifest_event_key;
+using encoding::name_chunk_owner_key;
+using encoding::name_chunk_owner_prefix;
+using encoding::name_chunk_posting_key;
+using encoding::name_file_owner_key;
+using encoding::name_file_owner_prefix;
+using encoding::name_file_posting_key;
+using encoding::name_lookup_key;
+using encoding::name_reverse_key;
+
+namespace hash = dftracer::utils::utilities::hash;
+using encoding::manifest_metadata_key;
+using encoding::metadata_key;
+using encoding::prefix_for_file;
+
+// Schema version that init_schema() stores under the "_schema_version" key.
+constexpr std::uint32_t SCHEMA_VERSION = 1;
+
+// Raises an IndexerError of type DATABASE_ERROR whose text is
+// "<message>: <status.ToString()>". Never returns.
+[[noreturn]] void throw_db_error(std::string_view message,
+                                 const ::rocksdb::Status& status) {
+    std::string what(message);
+    what += ": ";
+    what += status.ToString();
+    throw IndexerError(IndexerError::Type::DATABASE_ERROR, std::move(what));
+}
+
+// Builds the forward file-registry key: the literal "f|" followed by the
+// logical file name.
+std::string file_lookup_key(std::string_view logical_name) {
+    std::string key;
+    key.reserve(2 + logical_name.size());
+    key.append("f|");
+    key.append(logical_name);
+    return key;
+}
+
+// Builds the reverse file-registry key: the literal "r|" followed by the
+// file id appended via KeyCodec::append_be32.
+std::string file_reverse_key(int file_id) {
+    const auto raw_id = static_cast<std::uint32_t>(file_id);
+    std::string key = "r|";
+    rocks::KeyCodec::append_be32(key, raw_id);
+    return key;
+}
+
+// Returns an owning copy of encoding::NEXT_FILE_ID_KEY, the key under which
+// the next file id to allocate is stored.
+std::string next_file_id_key() {
+    std::string key(encoding::NEXT_FILE_ID_KEY);
+    return key;
+}
+// Key under which the schema version is stored.
+std::string schema_version_key() {
+    return std::string("_schema_version");
+}
+
+// Serializes a file-registry record. Fields are written in this order:
+// the file id (via KeyCodec::append_be32), one capability byte, seven zero
+// bytes, a zero u64, and finally the file hash.
+std::string encode_file_record(
+    int file_id, std::uint64_t file_hash,
+    IndexFileEntryCapability caps = IndexFileEntryCapability::NONE) {
+    const auto caps_byte = static_cast<std::uint8_t>(caps);
+
+    std::string record;
+    rocks::KeyCodec::append_be32(record, static_cast<std::uint32_t>(file_id));
+    record.push_back(static_cast<char>(caps_byte));
+    record.resize(record.size() + 7, '\0');
+    append_u64(record, 0);
+    append_u64(record, file_hash);
+    return record;
+}
+
+// Extracts the capability byte (index 4) from a file-registry record.
+// Records shorter than five bytes yield IndexFileEntryCapability::NONE.
+IndexFileEntryCapability decode_file_capabilities(std::string_view record) {
+    constexpr std::size_t caps_offset = 4;
+    if (record.size() > caps_offset) {
+        const auto raw = static_cast<std::uint8_t>(record[caps_offset]);
+        return static_cast<IndexFileEntryCapability>(raw);
+    }
+    return IndexFileEntryCapability::NONE;
+}
+
+// Reads the file id from the first four bytes of a file-registry record.
+// Throws std::runtime_error when the record is shorter than four bytes.
+int decode_file_id(std::string_view record) {
+    constexpr std::size_t id_width = 4;
+    if (record.size() < id_width) {
+        throw std::runtime_error("Corrupt file record");
+    }
+    const std::string_view id_bytes = record.substr(0, id_width);
+    return static_cast<int>(rocks::KeyCodec::decode_be32(id_bytes));
+}
+
+// Reads the file id from the first four bytes of a file-prefixed key.
+// Throws std::runtime_error when the key is shorter than four bytes.
+int decode_prefixed_file_id(std::string_view key) {
+    constexpr std::size_t id_width = 4;
+    if (key.size() < id_width) {
+        throw std::runtime_error("Corrupt file-prefixed key");
+    }
+    const std::string_view id_bytes = key.substr(0, id_width);
+    return static_cast<int>(rocks::KeyCodec::decode_be32(id_bytes));
+}
+
+// Reads the file hash stored in bytes [20, 28) of a file-registry record.
+// Throws std::runtime_error when the record is shorter than 28 bytes.
+std::uint64_t decode_file_hash(std::string_view record) {
+    constexpr std::size_t hash_offset = 20;
+    constexpr std::size_t hash_width = 8;
+    if (record.size() < hash_offset + hash_width) {
+        throw std::runtime_error("Corrupt file record");
+    }
+    const std::string_view hash_bytes = record.substr(hash_offset, hash_width);
+    return rocks::KeyCodec::decode_be64(hash_bytes);
+}
+
+// Keys of the root-level summary records. All four return the same literal.
+std::string root_scalar_stats_key() {
+    return std::string("_root");
+}
+std::string root_category_counts_key() {
+    return std::string("_root");
+}
+std::string root_name_counts_key() {
+    return std::string("_root");
+}
+std::string root_pid_tid_counts_key() {
+    return std::string("_root");
+}
+
+// The tar archive record is keyed by the plain per-file prefix.
+std::string tar_archive_key(int file_id) {
+    std::string key = prefix_for_file(file_id);
+    return key;
+}
+
+// Builds the key of one tar member: per-file prefix, the member's
+// uncompressed offset, a NUL separator, then the member file name.
+std::string tar_file_key(int file_id, std::uint64_t uncompressed_offset,
+                         std::string_view file_name) {
+    std::string member_key = prefix_for_file(file_id);
+    append_u64(member_key, uncompressed_offset);
+    member_key += '\0';
+    member_key += file_name;
+    return member_key;
+}
+
+// Decodes a count-map payload: a u32 entry count followed by that many
+// (string key, u64 count) pairs. Returns the pairs as an owning map; when a
+// key repeats, the first occurrence wins (emplace does not overwrite).
+std::unordered_map<std::string, std::uint64_t> decode_count_map_value(
+    std::string_view value) {
+    Cursor cursor(value);
+    std::unordered_map<std::string, std::uint64_t> counts;
+    const auto num_entries = cursor.u32();
+    // The declared count is read straight from the payload, so a corrupt
+    // value could request a reservation of up to 2^32 buckets. A payload of
+    // N bytes cannot hold more than N entries, so cap the hint accordingly;
+    // this only bounds the up-front allocation, not the decoded result.
+    counts.reserve(std::min<std::size_t>(num_entries, value.size()));
+    for (std::uint32_t i = 0; i < num_entries; ++i) {
+        auto key = cursor.str();
+        auto count = cursor.u64();
+        counts.emplace(std::move(key), count);
+    }
+    return counts;
+}
+
+// Walks a count-map payload (u32 entry count, then key/count pairs) and
+// invokes `callback(key, count)` for each entry in payload order. The key is
+// whatever Cursor::str_view() returns, so nothing is materialized into a map.
+template <typename Callback>
+void for_each_count_map_entry(std::string_view value, Callback&& callback) {
+    Cursor reader(value);
+    for (auto remaining = reader.u32(); remaining > 0; --remaining) {
+        auto entry_key = reader.str_view();
+        auto entry_count = reader.u64();
+        callback(entry_key, entry_count);
+    }
+}
+
+// Walks a name-summary payload and invokes `callback(key, count)` for each
+// entry in payload order. The payload starts with a u32 entry count and two
+// u64 header fields (other_count, unique_count) that are read and discarded.
+template <typename Callback>
+void for_each_name_summary_entry(std::string_view value, Callback&& callback) {
+    Cursor reader(value);
+    auto remaining = reader.u32();
+    (void)reader.u64();  // other_count
+    (void)reader.u64();  // unique_count
+    for (; remaining > 0; --remaining) {
+        auto entry_key = reader.str_view();
+        auto entry_count = reader.u64();
+        callback(entry_key, entry_count);
+    }
+}
+
+// Serializes a tar archive record: the archive name followed by four u64
+// fields in the order checkpoint_size, total_lines, total_uc_size,
+// total_files.
+std::string encode_tar_archive_value(std::string_view archive_name,
+                                     std::uint64_t checkpoint_size,
+                                     std::uint64_t total_lines,
+                                     std::uint64_t total_uc_size,
+                                     std::uint64_t total_files) {
+    std::string encoded;
+    append_string(encoded, archive_name);
+    for (const std::uint64_t field :
+         {checkpoint_size, total_lines, total_uc_size, total_files}) {
+        append_u64(encoded, field);
+    }
+    return encoded;
+}
+
+// Serializes one tar member record in the order: file_size, file_mtime,
+// typeflag (narrowed to a single byte), data_offset.
+std::string encode_tar_file_value(
+    const IndexDatabaseWriterContext::TarFileRecord& record) {
+    const auto type_byte = static_cast<std::uint8_t>(record.typeflag);
+
+    std::string encoded;
+    append_u64(encoded, record.file_size);
+    append_u64(encoded, record.file_mtime);
+    append_u8(encoded, type_byte);
+    append_u64(encoded, record.data_offset);
+    return encoded;
+}
+
+// Reads three consecutive u64 fields from a metadata record and returns them
+// in the order they appear in the payload.
+std::array<std::uint64_t, 3> decode_metadata_record(std::string_view value) {
+    Cursor cursor(value);
+    std::array<std::uint64_t, 3> fields{};
+    for (auto& field : fields) {
+        field = cursor.u64();
+    }
+    return fields;
+}
+
+// Copies the iterator's current value into an owning string.
+std::string iterator_value(::rocksdb::Iterator& it) {
+    return it.value().ToString();
+}
+
+// Copies the iterator's current key into an owning string.
+std::string iterator_key(::rocksdb::Iterator& it) {
+    return it.key().ToString();
+}
+
+// Convenience wrapper around internal::scan_prefix_iterator: opens an
+// iterator on `column_family` of `db` and forwards `prefix` and `fn` to it,
+// using "Failed to scan RocksDB prefix" as the error context.
+template <typename Fn>
+void scan_prefix(const rocks::RocksDatabase& db, std::string_view column_family,
+                 std::string_view prefix, Fn&& fn) {
+    auto open_iterator = [&db, &column_family] {
+        return db.new_iterator(column_family);
+    };
+    internal::scan_prefix_iterator("Failed to scan RocksDB prefix", prefix,
+                                   std::move(open_iterator),
+                                   std::forward<Fn>(fn));
+}
+
+}  // namespace
+
+namespace internal {
+
+// Serializes per-file scalar statistics. Field order: total_events,
+// min/max timestamp, duration sum/min/max/count/m2, the serialized duration
+// sketch (blob), the duration histogram as JSON (string), num_chunks, and
+// the serialized timestamp histogram (blob).
+std::string encode_file_scalar_stats_value(const ChunkStatistics& stats,
+                                           std::uint64_t num_chunks) {
+    // Serialize the nested structures first; they are appended below in the
+    // positions the payload layout requires.
+    const auto sketch_bytes = stats.duration_sketch.serialize();
+    const auto histogram_json = stats.duration_histogram.to_json();
+    const auto timestamp_hist_bytes = stats.timestamp_histogram.serialize();
+
+    std::string encoded;
+    append_u64(encoded, stats.total_events);
+    append_u64(encoded, stats.min_timestamp_us);
+    append_u64(encoded, stats.max_timestamp_us);
+    append_i64(encoded, stats.duration_sum_us);
+    append_u64(encoded, stats.duration_min_us);
+    append_u64(encoded, stats.duration_max_us);
+    append_u64(encoded, stats.duration_count);
+    append_double(encoded, stats.duration_m2);
+    append_blob(encoded, sketch_bytes);
+    append_string(encoded, histogram_json);
+    append_u64(encoded, num_chunks);
+    append_blob(encoded, timestamp_hist_bytes);
+    return encoded;
+}
+
+// Serializes root-level scalar statistics: the file-level encoding of
+// `stats`/`num_chunks` followed by three u64 trailer fields (num_files,
+// total_lines, total_uncompressed_bytes).
+std::string encode_root_scalar_stats_value(
+    const ChunkStatistics& stats, std::uint64_t num_chunks,
+    std::uint64_t num_files, std::uint64_t total_lines,
+    std::uint64_t total_uncompressed_bytes) {
+    std::string encoded = encode_file_scalar_stats_value(stats, num_chunks);
+    for (const std::uint64_t trailer :
+         {num_files, total_lines, total_uncompressed_bytes}) {
+        append_u64(encoded, trailer);
+    }
+    return encoded;
+}
+
+namespace {
+
+// Decodes the fields that file-level and root-level scalar statistics
+// payloads have in common, in payload order: total_events, min/max
+// timestamp, duration sum/min/max/count/m2, duration sketch blob, duration
+// histogram JSON, num_chunks, timestamp histogram blob. Empty blobs/strings
+// leave the corresponding member of `result.stats` untouched. On return,
+// `cursor` is positioned immediately after the timestamp histogram blob.
+//
+// `Result` must expose `stats` and `num_chunks` members (both
+// MergedStatisticsResult and RootStatisticsResult do).
+template <typename Result>
+void decode_shared_scalar_stats(Cursor& cursor, Result& result) {
+    auto& stats = result.stats;
+    stats.total_events = cursor.u64();
+    stats.min_timestamp_us = cursor.u64();
+    stats.max_timestamp_us = cursor.u64();
+    stats.duration_sum_us = cursor.i64();
+    stats.duration_min_us = cursor.u64();
+    stats.duration_max_us = cursor.u64();
+    stats.duration_count = cursor.u64();
+    stats.duration_m2 = cursor.f64();
+
+    auto duration_sketch = cursor.blob();
+    if (!duration_sketch.empty()) {
+        stats.duration_sketch = common::statistics::DDSketch::deserialize(
+            duration_sketch.data(), duration_sketch.size());
+    }
+
+    auto duration_histogram = cursor.str();
+    if (!duration_histogram.empty()) {
+        stats.duration_histogram =
+            common::statistics::Log2Histogram::from_json(duration_histogram);
+    }
+
+    result.num_chunks = cursor.u64();
+
+    auto ts_hist_blob = cursor.blob();
+    if (!ts_hist_blob.empty()) {
+        stats.timestamp_histogram =
+            common::statistics::TimestampHistogram::deserialize(
+                ts_hist_blob.data(), ts_hist_blob.size());
+    }
+}
+
+}  // namespace
+
+// Decodes a payload produced by encode_file_scalar_stats_value().
+// Throws std::runtime_error when the payload is shorter than eight bytes;
+// any other decoding failure comes from Cursor or the nested deserializers.
+MergedStatisticsResult decode_file_scalar_stats_value(std::string_view value) {
+    if (value.size() < 8) {
+        throw std::runtime_error("Corrupt file scalar statistics value");
+    }
+    Cursor cursor(value);
+    MergedStatisticsResult result;
+    decode_shared_scalar_stats(cursor, result);
+    return result;
+}
+
+// Decodes a payload produced by encode_root_scalar_stats_value(): the shared
+// scalar statistics followed by num_files, total_lines and
+// total_uncompressed_bytes.
+RootStatisticsResult decode_root_scalar_stats_value(std::string_view value) {
+    Cursor cursor(value);
+    RootStatisticsResult result;
+    decode_shared_scalar_stats(cursor, result);
+    result.num_files = cursor.u64();
+    result.total_lines = cursor.u64();
+    result.total_uncompressed_bytes = cursor.u64();
+    return result;
+}
+
+}  // namespace internal
+
+// Takes shared ownership of `db` and opens a write batch on it via
+// begin_batch(). `db` is dereferenced here, so it must not be null.
+// NOTE(review): batch_ is initialized through db_, which relies on db_ being
+// declared before batch_ in the class — confirm against the header.
+IndexDatabaseWriterContext::IndexDatabaseWriterContext(
+    std::shared_ptr<dftracer::utils::rocksdb::RocksDatabase> db)
+    : db_(std::move(db)), batch_(db_->begin_batch()) {}
+
+// Move construction is the compiler-generated member-wise move.
+IndexDatabaseWriterContext::IndexDatabaseWriterContext(
+    IndexDatabaseWriterContext&&) noexcept = default;
+
+// Move assignment is the compiler-generated member-wise move.
+IndexDatabaseWriterContext& IndexDatabaseWriterContext::operator=(
+    IndexDatabaseWriterContext&&) noexcept = default;
+
+// Defaulted destructor: it does not call commit(), so a batch that was never
+// committed explicitly is not written by this class on destruction.
+IndexDatabaseWriterContext::~IndexDatabaseWriterContext() = default;
+
+// Commits the pending write batch to the database. Calling it again after
+// the first attempt is a no-op: `committed_` is set as soon as the commit
+// has been attempted, whether or not it succeeded. Throws
+// std::runtime_error when the commit reports a non-ok status.
+void IndexDatabaseWriterContext::commit() {
+    if (committed_) {
+        return;
+    }
+    const auto status = db_->commit_batch(batch_);
+    committed_ = true;
+    if (status.ok()) {
+        return;
+    }
+    std::string message = "Failed to commit WriteBatch: ";
+    message += status.ToString();
+    throw std::runtime_error(message);
+}
+
+// Reports whether a file-level scalar statistics record can be read for
+// `file_id`. Any non-ok read status (not only "not found") yields false.
+bool IndexDatabaseWriterContext::has_file_scalar_stats(int file_id) const {
+    std::string discarded;
+    return db_
+        ->get(file_scalar_stats_key(file_id), &discarded,
+              cf::FILE_SCALAR_STATS)
+        .ok();
+}
+
+// Ensures a schema version record exists. If the key is already readable
+// nothing is written; if it is missing, SCHEMA_VERSION is queued in the
+// write batch. Any other read failure, or a failed put, is raised through
+// throw_db_error.
+void IndexDatabaseWriterContext::init_schema() {
+    std::string stored_version;
+    const auto read_status = db_->get(schema_version_key(), &stored_version);
+    if (read_status.ok()) {
+        return;
+    }
+    if (!read_status.IsNotFound()) {
+        throw_db_error("Failed to read schema version", read_status);
+    }
+
+    const auto write_status =
+        db_->put(batch_, cf::DEFAULT, schema_version_key(),
+                 rocks::KeyCodec::encode_be32(SCHEMA_VERSION));
+    if (!write_status.ok()) {
+        throw_db_error("Failed to initialize schema version", write_status);
+    }
+}
+
+// Resolves `file_id` to its logical path through the reverse registry and
+// delegates to set_file_capabilities_by_path(). Does nothing when the
+// reverse entry cannot be read.
+void IndexDatabaseWriterContext::set_file_capabilities(
+    int file_id, IndexFileEntryCapability caps) {
+    std::string logical_path;
+    if (db_->get(file_reverse_key(file_id), &logical_path).ok()) {
+        set_file_capabilities_by_path(logical_path, caps);
+    }
+}
+
+// Overwrites the capability byte of the registry record stored for
+// `logical_path` and queues the updated record in the write batch.
+//
+// Best-effort on lookup: returns without writing when the record cannot be
+// read or is too short to contain a capability byte. A failure to queue the
+// updated record is reported through throw_db_error, matching the other
+// writers in this file (previously the put status was silently dropped).
+void IndexDatabaseWriterContext::set_file_capabilities_by_path(
+    std::string_view logical_path, IndexFileEntryCapability caps) {
+    auto key = file_lookup_key(logical_path);
+    std::string record;
+    auto status = db_->get(key, &record);
+    if (!status.ok() || record.size() < 5) return;
+
+    // Byte 4 of the record holds the capability flags.
+    record[4] = static_cast<char>(static_cast<std::uint8_t>(caps));
+    status = db_->put(batch_, cf::DEFAULT, key, record);
+    if (!status.ok()) {
+        throw_db_error("Failed to update file capabilities", status);
+    }
+}
+
+// ORs `cap` into the capabilities currently stored for `file_id`.
+//
+// The file id is resolved to its logical name once; if that reverse entry
+// cannot be read there is nothing to update and the call returns. If the
+// registry record itself cannot be read, the existing capabilities are
+// treated as NONE and the write is left to set_file_capabilities_by_path(),
+// which skips unreadable records.
+//
+// Both lookups read from the database rather than the pending batch.
+// NOTE(review): a record queued earlier in the same uncommitted batch is
+// presumably not visible here — confirm db_->get() semantics.
+void IndexDatabaseWriterContext::add_file_capability(
+    int file_id, IndexFileEntryCapability cap) {
+    std::string name;
+    if (!db_->get(file_reverse_key(file_id), &name).ok()) {
+        return;
+    }
+
+    IndexFileEntryCapability existing = IndexFileEntryCapability::NONE;
+    std::string record;
+    if (db_->get(file_lookup_key(name), &record).ok()) {
+        existing = decode_file_capabilities(record);
+    }
+
+    // The name is already resolved, so go straight to the path-based setter
+    // instead of repeating the reverse lookup via set_file_capabilities().
+    set_file_capabilities_by_path(name, existing | cap);
+}
+
+// Returns the file id registered for `path`, creating or refreshing the
+// registry entry as needed. All writes are queued in the pending batch.
+//
+//  * Path known, hash unchanged: the existing id is returned. If `caps` is
+//    not NONE and differs from the stored capabilities, the capability byte
+//    of the record is updated.
+//  * Path known, hash changed: the file's contents are deleted via
+//    delete_file_contents(), its registry entries and all root summary
+//    records are deleted, and the registry is rewritten with the same id and
+//    the new hash/capabilities.
+//  * Path unknown: a new id is allocated (from the cached counter when set,
+//    otherwise from the stored next-file-id, defaulting to 1), and the
+//    forward entry, reverse entry and incremented counter are queued.
+//
+// Throws via throw_db_error on database failures, and std::runtime_error
+// (from the record decoders) when an existing record is malformed.
+int IndexDatabaseWriterContext::get_or_create_file_info(
+    std::string_view path, std::uint64_t file_hash,
+    IndexFileEntryCapability caps) {
+    const auto logical_name = std::string(path);
+    const auto lookup = file_lookup_key(logical_name);
+    std::string existing;
+    auto status = db_->get(lookup, &existing);
+    if (status.ok()) {
+        const auto file_id = decode_file_id(existing);
+        if (decode_file_hash(existing) == file_hash) {
+            if (caps != IndexFileEntryCapability::NONE &&
+                decode_file_capabilities(existing) != caps) {
+                // decode_file_hash() above already guaranteed the record is
+                // long enough for the capability byte at index 4.
+                existing[4] =
+                    static_cast<char>(static_cast<std::uint8_t>(caps));
+                status = db_->put(batch_, cf::DEFAULT, lookup, existing);
+                if (!status.ok()) {
+                    throw_db_error("Failed to update file capabilities",
+                                   status);
+                }
+            }
+            return file_id;
+        }
+        delete_file_contents(file_id);
+        // Also delete the registry entries for this file
+        {
+            const auto logical_name_key = file_reverse_key(file_id);
+            std::string old_name;
+            auto rev_status = db_->get(logical_name_key, &old_name);
+            if (rev_status.ok()) {
+                db_->del(batch_, cf::DEFAULT, file_lookup_key(old_name));
+                db_->del(batch_, cf::DEFAULT, logical_name_key);
+            }
+            // Delete root summaries
+            auto delete_prefix_fn = [&](std::string_view cf_name,
+                                        std::string_view prefix) {
+                // Collect the keys first, then queue the deletions.
+                std::vector<std::string> keys;
+                scan_prefix(*db_, cf_name, prefix,
+                            [&](::rocksdb::Iterator& it) {
+                                keys.push_back(iterator_key(it));
+                            });
+                for (const auto& k : keys) {
+                    db_->del(batch_, cf_name, k);
+                }
+            };
+            delete_prefix_fn(cf::ROOT_SCALAR_STATS, root_scalar_stats_key());
+            delete_prefix_fn(cf::ROOT_CAT_COUNTS, root_category_counts_key());
+            delete_prefix_fn(cf::ROOT_NAME_COUNTS, root_name_counts_key());
+            delete_prefix_fn(cf::ROOT_PID_TID_COUNTS,
+                             root_pid_tid_counts_key());
+        }
+        // Re-register the path under the same id with the new hash.
+        auto registry = encode_file_record(file_id, file_hash, caps);
+        status = db_->put(batch_, cf::DEFAULT, lookup, registry);
+        if (!status.ok()) {
+            throw_db_error("Failed to update file registry", status);
+        }
+        status = db_->put(batch_, cf::DEFAULT, file_reverse_key(file_id),
+                          logical_name);
+        if (!status.ok()) {
+            throw_db_error("Failed to update reverse file registry", status);
+        }
+        return file_id;
+    }
+    if (!status.IsNotFound()) {
+        throw_db_error("Failed to query file registry", status);
+    }
+
+    // Allocate a fresh id. The cached counter takes precedence so that
+    // several files created through this context get distinct ids even
+    // though the stored counter has not been committed yet.
+    std::uint32_t next_id;
+    if (cached_next_file_id_ >= 0) {
+        next_id = static_cast<std::uint32_t>(cached_next_file_id_);
+    } else {
+        next_id = 1;
+        std::string next_value;
+        status = db_->get(next_file_id_key(), &next_value);
+        if (status.ok()) {
+            next_id = rocks::KeyCodec::decode_be32(next_value);
+        } else if (!status.IsNotFound()) {
+            throw_db_error("Failed to read next file id", status);
+        }
+    }
+    cached_next_file_id_ = static_cast<std::int64_t>(next_id + 1);
+
+    const auto file_id = static_cast<int>(next_id);
+    const auto new_registry = encode_file_record(file_id, file_hash, caps);
+    const auto next_registry = rocks::KeyCodec::encode_be32(next_id + 1);
+
+    status = db_->put(batch_, cf::DEFAULT, lookup, new_registry);
+    if (!status.ok()) {
+        throw_db_error("Failed to insert file registry", status);
+    }
+    status =
+        db_->put(batch_, cf::DEFAULT, file_reverse_key(file_id), logical_name);
+    if (!status.ok()) {
+        throw_db_error("Failed to insert reverse file registry", status);
+    }
+    status = db_->put(batch_, cf::DEFAULT, next_file_id_key(), next_registry);
+    if (!status.ok()) {
+        throw_db_error("Failed to update next file id", status);
+    }
+
+    return file_id;
+}
+
+// Queues the metadata record (checkpoint size, total lines, total
+// uncompressed size) for `file_id` in the METADATA column family.
+// Throws via throw_db_error when the put fails.
+void IndexDatabaseWriterContext::insert_file_metadata(
+    int file_id, std::uint64_t checkpoint_size, std::uint64_t total_lines,
+    std::uint64_t total_uc_size) {
+    const auto record =
+        encode_metadata_record(checkpoint_size, total_lines, total_uc_size);
+    if (const auto status =
+            db_->put(batch_, cf::METADATA, metadata_key(file_id), record);
+        !status.ok()) {
+        throw_db_error("Failed to insert metadata", status);
+    }
+}
+
+// Queues the bloom filter of one (file, dimension, checkpoint) chunk in the
+// CHUNK_BLOOM column family. Throws via throw_db_error when the put fails.
+void IndexDatabaseWriterContext::insert_chunk_bloom_filter(
+    int file_id, std::uint64_t checkpoint_idx, std::string_view dimension,
+    std::span<const unsigned char> blob_data, std::uint64_t num_entries) {
+    const auto bloom_value = encode_bloom_value(blob_data, num_entries);
+    const auto bloom_key = chunk_bloom_key(file_id, dimension, checkpoint_idx);
+    if (const auto status =
+            db_->put(batch_, cf::CHUNK_BLOOM, bloom_key, bloom_value);
+        !status.ok()) {
+        throw_db_error("Failed to insert chunk bloom filter", status);
+    }
+}
+
+// Raw-pointer convenience overload: wraps `blob_size` bytes starting at
+// `blob_data` in a span and forwards to the span overload.
+//
+// Throws std::invalid_argument when `blob_size` is negative or when
+// `blob_data` is null while `blob_size` is positive; without this check a
+// negative size would be converted to a huge unsigned length.
+void IndexDatabaseWriterContext::insert_chunk_bloom_filter(
+    int file_id, std::uint64_t checkpoint_idx, std::string_view dimension,
+    const void* blob_data, int blob_size, std::uint64_t num_entries) {
+    if (blob_size < 0 || (blob_data == nullptr && blob_size > 0)) {
+        throw std::invalid_argument(
+            "Invalid chunk bloom filter blob: null data or negative size");
+    }
+    auto* bytes = static_cast<const unsigned char*>(blob_data);
+    insert_chunk_bloom_filter(file_id, checkpoint_idx, dimension,
+                              std::span<const unsigned char>(
+                                  bytes, static_cast<std::size_t>(blob_size)),
+                              num_entries);
+}
+
+// Queues the file-level bloom filter of one (file, dimension) pair in the
+// FILE_BLOOM column family. Throws via throw_db_error when the put fails.
+void IndexDatabaseWriterContext::insert_file_bloom_filter(
+    int file_id, std::string_view dimension,
+    std::span<const unsigned char> blob_data, std::uint64_t num_entries) {
+    const auto bloom_value = encode_bloom_value(blob_data, num_entries);
+    const auto bloom_key = file_bloom_key(file_id, dimension);
+    if (const auto status =
+            db_->put(batch_, cf::FILE_BLOOM, bloom_key, bloom_value);
+        !status.ok()) {
+        throw_db_error("Failed to insert file bloom filter", status);
+    }
+}
+
+// Raw-pointer convenience overload: wraps `blob_size` bytes starting at
+// `blob_data` in a span and forwards to the span overload.
+//
+// Throws std::invalid_argument when `blob_size` is negative or when
+// `blob_data` is null while `blob_size` is positive; without this check a
+// negative size would be converted to a huge unsigned length.
+void IndexDatabaseWriterContext::insert_file_bloom_filter(
+    int file_id, std::string_view dimension, const void* blob_data,
+    int blob_size, std::uint64_t num_entries) {
+    if (blob_size < 0 || (blob_data == nullptr && blob_size > 0)) {
+        throw std::invalid_argument(
+            "Invalid file bloom filter blob: null data or negative size");
+    }
+    auto* bytes = static_cast<const unsigned char*>(blob_data);
+    insert_file_bloom_filter(file_id, dimension,
+                             std::span<const unsigned char>(
+                                 bytes, static_cast<std::size_t>(blob_size)),
+                             num_entries);
+}
+
+// Queues the statistics of one (file, checkpoint) chunk in the CHUNK_STATS
+// column family. Throws via throw_db_error when the put fails.
+void IndexDatabaseWriterContext::insert_chunk_statistics(
+    int file_id, std::uint64_t checkpoint_idx, const ChunkStatistics& stats) {
+    const auto encoded_stats = encode_chunk_statistics_value(stats);
+    if (const auto status =
+            db_->put(batch_, cf::CHUNK_STATS,
+                     chunk_stats_key(file_id, checkpoint_idx), encoded_stats);
+        !status.ok()) {
+        throw_db_error("Failed to insert chunk statistics", status);
+    }
+}
+
+// Queues the file-level scalar statistics for `file_id` in the
+// FILE_SCALAR_STATS column family. Throws via throw_db_error when the put
+// fails.
+void IndexDatabaseWriterContext::insert_file_scalar_stats(
+    int file_id, const ChunkStatistics& stats, std::uint64_t num_chunks) {
+    const auto encoded_stats =
+        encode_file_scalar_stats_value(stats, num_chunks);
+    if (const auto status =
+            db_->put(batch_, cf::FILE_SCALAR_STATS,
+                     file_scalar_stats_key(file_id), encoded_stats);
+        !status.ok()) {
+        throw_db_error("Failed to insert file scalar statistics", status);
+    }
+}
+
+// Queues the per-category event counts of `file_id` in the FILE_CAT_COUNTS
+// column family, encoded as a count map. Throws via throw_db_error when the
+// put fails.
+void IndexDatabaseWriterContext::insert_file_category_counts(
+    int file_id, const StringViewMap<std::uint64_t>& counts) {
+    const auto encoded_counts = encode_count_map_value(counts);
+    if (const auto status =
+            db_->put(batch_, cf::FILE_CAT_COUNTS,
+                     file_category_counts_key(file_id), encoded_counts);
+        !status.ok()) {
+        throw_db_error("Failed to insert file category counts", status);
+    }
+}
+
+// Queues the per-pid/tid event counts of `file_id` in the
+// FILE_PID_TID_COUNTS column family, encoded as a count map. Throws via
+// throw_db_error when the put fails.
+void IndexDatabaseWriterContext::insert_file_pid_tid_counts(
+    int file_id, const StringViewMap<std::uint64_t>& counts) {
+    const auto encoded_counts = encode_count_map_value(counts);
+    if (const auto status =
+            db_->put(batch_, cf::FILE_PID_TID_COUNTS,
+                     file_pid_tid_counts_key(file_id), encoded_counts);
+        !status.ok()) {
+        throw_db_error("Failed to insert file pid_tid counts", status);
+    }
+}
+
+// Queues the per-name event counts of `file_id` in the FILE_NAME_COUNTS
+// column family. Unlike the category and pid/tid counts this uses the
+// name-summary encoding, called here with 0 and counts.size() as its two
+// extra arguments. Throws via throw_db_error when the put fails.
+void IndexDatabaseWriterContext::insert_file_name_counts(
+    int file_id, const StringViewMap<std::uint64_t>& counts) {
+    const auto encoded_summary =
+        encode_name_summary_value(counts, 0, counts.size());
+    if (const auto status =
+            db_->put(batch_, cf::FILE_NAME_COUNTS,
+                     file_name_counts_key(file_id), encoded_summary);
+        !status.ok()) {
+        throw_db_error("Failed to insert file name counts", status);
+    }
+}
+
+// Derives the id of `name` as its FNV-1a hash and queues the dictionary
+// entries for it. No lookup is performed: the entries are (re)written on
+// every call.
+std::uint64_t IndexDatabaseWriterContext::get_or_create_name_id(
+    std::string_view name) {
+    const std::uint64_t hashed_id = hash::fnv1a_hash(name);
+    insert_name_dictionary_entry(hashed_id, name);
+    return hashed_id;
+}
+
+// Queues both directions of a name dictionary entry in the NAME_DICTIONARY
+// column family: name -> encoded id, then id -> name. Throws via
+// throw_db_error as soon as either put fails.
+void IndexDatabaseWriterContext::insert_name_dictionary_entry(
+    std::uint64_t name_id, std::string_view name) {
+    const auto forward_status =
+        db_->put(batch_, cf::NAME_DICTIONARY, name_lookup_key(name),
+                 rocks::KeyCodec::encode_be64(name_id));
+    if (!forward_status.ok()) {
+        throw_db_error("Failed to insert name dictionary lookup",
+                       forward_status);
+    }
+
+    const auto reverse_status =
+        db_->put(batch_, cf::NAME_DICTIONARY, name_reverse_key(name_id),
+                 std::string(name));
+    if (!reverse_status.ok()) {
+        throw_db_error("Failed to insert name dictionary reverse",
+                       reverse_status);
+    }
+}
+
+// Records that `name_id` occurs in `file_id`. Two empty-valued keys are
+// queued in NAME_FILE_POSTINGS: the posting key (name, file) and the owner
+// key (file, name). Throws via throw_db_error as soon as either put fails.
+void IndexDatabaseWriterContext::insert_name_file_posting(std::uint64_t name_id,
+                                                          int file_id) {
+    const auto posting = name_file_posting_key(name_id, file_id);
+    const auto owner = name_file_owner_key(file_id, name_id);
+
+    if (const auto status = db_->put(batch_, cf::NAME_FILE_POSTINGS, posting, "");
+        !status.ok()) {
+        throw_db_error("Failed to insert name file posting", status);
+    }
+    if (const auto status = db_->put(batch_, cf::NAME_FILE_POSTINGS, owner, "");
+        !status.ok()) {
+        throw_db_error("Failed to insert name file owner posting", status);
+    }
+}
+
+// Records that `name_id` occurs in chunk `checkpoint_idx` of `file_id`. Two
+// empty-valued keys are queued in NAME_CHUNK_POSTINGS: the posting key
+// (name, file, checkpoint) and the owner key (file, name, checkpoint).
+// Throws via throw_db_error as soon as either put fails.
+void IndexDatabaseWriterContext::insert_name_chunk_posting(
+    std::uint64_t name_id, int file_id, std::uint64_t checkpoint_idx) {
+    const auto posting =
+        name_chunk_posting_key(name_id, file_id, checkpoint_idx);
+    const auto owner = name_chunk_owner_key(file_id, name_id, checkpoint_idx);
+
+    if (const auto status =
+            db_->put(batch_, cf::NAME_CHUNK_POSTINGS, posting, "");
+        !status.ok()) {
+        throw_db_error("Failed to insert name chunk posting", status);
+    }
+    if (const auto status =
+            db_->put(batch_, cf::NAME_CHUNK_POSTINGS, owner, "");
+        !status.ok()) {
+        throw_db_error("Failed to insert name chunk owner posting", status);
+    }
+}
+
+// Updates the root-level summary records after one file has been written.
+//
+// When the file already had a summary (`had_existing_file_summary`), or when
+// no root scalar record exists yet, the root summaries are rebuilt from
+// scratch via rebuild_root_summaries(). Otherwise the file's contribution is
+// merged incrementally: the root scalar statistics are merged with `stats`
+// and the chunk/file/line/byte totals are increased, then the category, name
+// and pid/tid count maps are each read, incremented and written back.
+//
+// `file_id` is unused. Existing root records are read with db_->get() and
+// the updated records are queued in the write batch.
+// NOTE(review): whether db_->get() observes writes already queued in the
+// uncommitted batch is not visible here — confirm if several files can be
+// written through one batch.
+//
+// Throws via throw_db_error on read/write failures and std::runtime_error
+// when a stored root payload cannot be decoded.
+void IndexDatabaseWriterContext::refresh_root_summaries_after_file_write(
+    [[maybe_unused]] int file_id, const ChunkStatistics& stats,
+    std::uint64_t num_chunks, bool had_existing_file_summary,
+    std::uint64_t file_lines, std::uint64_t file_uncompressed_bytes) {
+    // Encodes and queues the root scalar statistics record.
+    auto put_root_scalar = [&](const RootStatisticsResult& root) {
+        auto value = encode_root_scalar_stats_value(
+            root.stats, root.num_chunks, root.num_files, root.total_lines,
+            root.total_uncompressed_bytes);
+        auto status = db_->put(batch_, cf::ROOT_SCALAR_STATS,
+                               root_scalar_stats_key(), value);
+        if (!status.ok()) {
+            throw_db_error("Failed to write root scalar statistics", status);
+        }
+    };
+
+    // Encodes `counts` as a count map and queues it under `key` in `cf_name`.
+    auto put_root_counts = [&](std::string_view cf_name, std::string_view key,
+                               const auto& counts,
+                               std::string_view error_message) {
+        auto value = encode_count_map_value(counts);
+        auto status = db_->put(batch_, cf_name, key, value);
+        if (!status.ok()) {
+            throw_db_error(error_message, status);
+        }
+    };
+
+    // A re-indexed file cannot be merged incrementally (its previous
+    // contribution is already part of the root), so recompute everything.
+    if (had_existing_file_summary) {
+        rebuild_root_summaries();
+        return;
+    }
+
+    // Inline query_root_scalar_stats
+    std::optional<RootStatisticsResult> root_scalar;
+    {
+        std::string value;
+        auto status =
+            db_->get(root_scalar_stats_key(), &value, cf::ROOT_SCALAR_STATS);
+        // No root record yet: fall back to a full rebuild.
+        if (status.IsNotFound()) {
+            rebuild_root_summaries();
+            return;
+        }
+        if (!status.ok()) {
+            throw_db_error("Failed to read root scalar statistics", status);
+        }
+        try {
+            DecodeContextGuard ctx("root_scalar_stats size=%zu", value.size());
+            root_scalar = decode_root_scalar_stats_value(value);
+        } catch (const std::exception& e) {
+            throw std::runtime_error("Corrupt root_scalar_stats payload size=" +
+                                     std::to_string(value.size()) + ": " +
+                                     e.what());
+        }
+    }
+
+    // Fold this file's scalar statistics and totals into the root record.
+    root_scalar->stats.merge_from(stats);
+    root_scalar->num_chunks += num_chunks;
+    root_scalar->num_files += 1;
+    root_scalar->total_lines += file_lines;
+    root_scalar->total_uncompressed_bytes += file_uncompressed_bytes;
+    put_root_scalar(*root_scalar);
+
+    // Inline query_root_category_counts
+    // A missing record is treated as an empty map.
+    std::unordered_map<std::string, std::uint64_t> category_counts;
+    {
+        std::string value;
+        auto status =
+            db_->get(root_category_counts_key(), &value, cf::ROOT_CAT_COUNTS);
+        if (status.ok()) {
+            try {
+                DecodeContextGuard ctx("root_cat_counts size=%zu",
+                                       value.size());
+                category_counts = decode_count_map_value(value);
+            } catch (const std::exception& e) {
+                throw std::runtime_error(
+                    "Corrupt root_cat_counts payload size=" +
+                    std::to_string(value.size()) + ": " + e.what());
+            }
+        } else if (!status.IsNotFound()) {
+            throw_db_error("Failed to read root category counts", status);
+        }
+    }
+    for (const auto& [key, count] : stats.category_counts) {
+        category_counts[key] += count;
+    }
+    put_root_counts(cf::ROOT_CAT_COUNTS, root_category_counts_key(),
+                    category_counts, "Failed to write root category counts");
+
+    // Inline query_root_name_counts
+    // A missing record is treated as an empty map.
+    std::unordered_map<std::string, std::uint64_t> name_counts;
+    {
+        std::string value;
+        auto status =
+            db_->get(root_name_counts_key(), &value, cf::ROOT_NAME_COUNTS);
+        if (status.ok()) {
+            try {
+                DecodeContextGuard ctx("root_name_counts size=%zu",
+                                       value.size());
+                name_counts = decode_count_map_value(value);
+            } catch (const std::exception& e) {
+                throw std::runtime_error(
+                    "Corrupt root_name_counts payload size=" +
+                    std::to_string(value.size()) + ": " + e.what());
+            }
+        } else if (!status.IsNotFound()) {
+            throw_db_error("Failed to read root name counts", status);
+        }
+    }
+    for (const auto& [key, count] : stats.name_counts) {
+        name_counts[key] += count;
+    }
+    put_root_counts(cf::ROOT_NAME_COUNTS, root_name_counts_key(), name_counts,
+                    "Failed to write root name counts");
+
+    // Inline query_root_pid_tid_counts
+    // A missing record is treated as an empty map.
+    std::unordered_map<std::string, std::uint64_t> pid_tid_counts;
+    {
+        std::string value;
+        auto status = db_->get(root_pid_tid_counts_key(), &value,
+                               cf::ROOT_PID_TID_COUNTS);
+        if (status.ok()) {
+            try {
+                DecodeContextGuard ctx("root_pid_tid_counts size=%zu",
+                                       value.size());
+                pid_tid_counts = decode_count_map_value(value);
+            } catch (const std::exception& e) {
+                throw std::runtime_error(
+                    "Corrupt root_pid_tid_counts payload size=" +
+                    std::to_string(value.size()) + ": " + e.what());
+            }
+        } else if (!status.IsNotFound()) {
+            throw_db_error("Failed to read root pid_tid counts", status);
+        }
+    }
+    for (const auto& [key, count] : stats.pid_tid_counts) {
+        pid_tid_counts[key] += count;
+    }
+    put_root_counts(cf::ROOT_PID_TID_COUNTS, root_pid_tid_counts_key(),
+                    pid_tid_counts, "Failed to write root pid_tid counts");
+}
+
+// Recomputes every root-level summary row from the per-file column families
+// and queues the results into batch_.
+//
+// Sources, all restricted to the file ids found in the "f|" registry:
+//   - FILE_SCALAR_STATS -> merged scalar stats and the chunk total
+//   - METADATA          -> total lines and total uncompressed bytes
+//   - FILE_CAT_COUNTS, FILE_PID_TID_COUNTS, FILE_NAME_COUNTS
+//                       -> root category / pid_tid / name count maps
+//
+// The root rows are written even when the registry is empty; they are then
+// encoded from a default-constructed RootStatisticsResult with num_files = 0.
+//
+// Iterator and put failures are reported through throw_db_error or
+// IndexerError; an undecodable file_scalar_stats payload is rethrown as
+// std::runtime_error. Other decode errors propagate unchanged.
+//
+// NOTE(review): the scans read through db_ iterators while the results are
+// queued in batch_; presumably per-file rows are flushed first - confirm.
+void IndexDatabaseWriterContext::rebuild_root_summaries() {
+    // Queues the encoded scalar part of `root` under the root scalar key.
+    auto put_root_scalar = [&](const RootStatisticsResult& root) {
+        auto value = encode_root_scalar_stats_value(
+            root.stats, root.num_chunks, root.num_files, root.total_lines,
+            root.total_uncompressed_bytes);
+        auto status = db_->put(batch_, cf::ROOT_SCALAR_STATS,
+                               root_scalar_stats_key(), value);
+        if (!status.ok()) {
+            throw_db_error("Failed to write root scalar statistics", status);
+        }
+    };
+
+    // Queues one encoded count map under `key` in column family `cf_name`.
+    auto put_root_counts = [&](std::string_view cf_name, std::string_view key,
+                               const auto& counts,
+                               std::string_view error_message) {
+        auto value = encode_count_map_value(counts);
+        auto status = db_->put(batch_, cf_name, key, value);
+        if (!status.ok()) {
+            throw_db_error(error_message, status);
+        }
+    };
+
+    RootStatisticsResult rebuilt;
+
+    // Inline query_all_file_info_ids. Only the ids are needed here, so the
+    // registry keys are not copied.
+    std::vector<int> file_ids;
+    internal::scan_prefix_iterator(
+        "Failed to scan file registry", "f|",
+        [this] { return db_->new_iterator(); },
+        [&](::rocksdb::Iterator& it) {
+            file_ids.push_back(decode_file_id(iterator_value(it)));
+        });
+    rebuilt.num_files = static_cast<std::uint64_t>(file_ids.size());
+
+    if (!file_ids.empty()) {
+        // Every scan below shares the same id filter and key range, so they
+        // are computed once instead of once per column family.
+        const std::unordered_set<int> wanted(file_ids.begin(), file_ids.end());
+        const auto [min_it, max_it] =
+            std::minmax_element(file_ids.begin(), file_ids.end());
+        const auto min_prefix = prefix_for_file(*min_it);
+        const int max_file_id = *max_it;
+
+        // Walks `cf_name` starting at the smallest wanted id and calls
+        // on_row(fid, value) for each row whose file id is in `wanted`. The
+        // walk ends at the first key whose id exceeds the largest wanted id.
+        // Returns the iterator status so each caller raises its own error.
+        auto for_each_wanted_row = [&](std::string_view cf_name,
+                                       auto&& on_row) {
+            auto it = db_->new_iterator(cf_name);
+            for (it->Seek(::rocksdb::Slice(min_prefix.data(),
+                                           min_prefix.size()));
+                 it->Valid(); it->Next()) {
+                const int fid = decode_prefixed_file_id(iterator_key(*it));
+                if (fid > max_file_id) {
+                    break;
+                }
+                if (!wanted.contains(fid)) {
+                    continue;
+                }
+                on_row(fid, iterator_value(*it));
+            }
+            return it->status();
+        };
+
+        // File scalar statistics: merge stats and accumulate the chunk total.
+        const auto scalar_status = for_each_wanted_row(
+            cf::FILE_SCALAR_STATS, [&](int fid, const auto& value) {
+                try {
+                    DecodeContextGuard ctx(
+                        "file_scalar_stats file_id=%d size=%zu", fid,
+                        value.size());
+                    auto row = decode_file_scalar_stats_value(value);
+                    rebuilt.stats.merge_from(row.stats);
+                    rebuilt.num_chunks += row.num_chunks;
+                } catch (const std::exception& e) {
+                    throw std::runtime_error(
+                        "Corrupt file_scalar_stats payload file_id=" +
+                        std::to_string(fid) +
+                        " size=" + std::to_string(value.size()) + ": " +
+                        e.what());
+                }
+            });
+        if (!scalar_status.ok()) {
+            throw_db_error("Failed to scan file scalar statistics",
+                           scalar_status);
+        }
+
+        // Inline query_file_metadata_batch: sum lines and uncompressed bytes.
+        const auto metadata_status = for_each_wanted_row(
+            cf::METADATA, [&](int fid, const auto& value) {
+                DecodeContextGuard ctx("metadata file_id=%d size=%zu", fid,
+                                       value.size());
+                auto decoded = decode_metadata_record(value);
+                rebuilt.total_lines += decoded[1];
+                rebuilt.total_uncompressed_bytes += decoded[2];
+            });
+        if (!metadata_status.ok()) {
+            throw IndexerError(IndexerError::Type::DATABASE_ERROR,
+                               "Failed to batch read file metadata: " +
+                                   metadata_status.ToString());
+        }
+
+        // Adds every (key, count) entry of each wanted row in `cf_name` into
+        // `target_map`. `for_each_entry_fn` selects the payload decoder.
+        auto scan_counts = [&](std::string_view cf_name,
+                               std::string_view error_message, auto& target_map,
+                               auto for_each_entry_fn) {
+            const auto counts_status = for_each_wanted_row(
+                cf_name, [&](int fid, const auto& value) {
+                    DecodeContextGuard ctx("%.*s merge file_id=%d size=%zu",
+                                           static_cast<int>(cf_name.size()),
+                                           cf_name.data(), fid, value.size());
+                    for_each_entry_fn(
+                        value, [&target_map](std::string_view k,
+                                             std::uint64_t count) {
+                            auto entry =
+                                target_map.try_emplace(std::string(k), 0);
+                            entry.first->second += count;
+                        });
+                });
+            if (!counts_status.ok()) {
+                throw_db_error(std::string(error_message), counts_status);
+            }
+        };
+
+        // Root count maps: one scan per per-file count column family.
+        scan_counts(cf::FILE_CAT_COUNTS, "Failed to scan file category counts",
+                    rebuilt.stats.category_counts,
+                    [](std::string_view v, auto cb) {
+                        for_each_count_map_entry(v, cb);
+                    });
+        scan_counts(
+            cf::FILE_PID_TID_COUNTS, "Failed to scan file pid_tid counts",
+            rebuilt.stats.pid_tid_counts, [](std::string_view v, auto cb) {
+                for_each_count_map_entry(v, cb);
+            });
+        scan_counts(cf::FILE_NAME_COUNTS, "Failed to scan file name counts",
+                    rebuilt.stats.name_counts, [](std::string_view v, auto cb) {
+                        for_each_name_summary_entry(v, cb);
+                    });
+    }
+
+    // Queue the rebuilt rows only after every scan has completed.
+    put_root_scalar(rebuilt);
+    put_root_counts(cf::ROOT_CAT_COUNTS, root_category_counts_key(),
+                    rebuilt.stats.category_counts,
+                    "Failed to write root category counts");
+    put_root_counts(cf::ROOT_NAME_COUNTS, root_name_counts_key(),
+                    rebuilt.stats.name_counts,
+                    "Failed to write root name counts");
+    put_root_counts(cf::ROOT_PID_TID_COUNTS, root_pid_tid_counts_key(),
+                    rebuilt.stats.pid_tid_counts,
+                    "Failed to write root pid_tid counts");
+}
+
+// Queues the encoded checkpoint under its (file, uc_offset, index) key.
+void IndexDatabaseWriterContext::insert_checkpoint(
+    int file_id, const IndexerCheckpoint& checkpoint) {
+    const auto encoded = encode_checkpoint_value(checkpoint);
+    const auto row_key = checkpoint_key(file_id, checkpoint.uc_offset,
+                                        checkpoint.checkpoint_idx);
+    auto outcome = db_->put(batch_, rocks::cf::CHECKPOINTS, row_key, encoded);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to insert checkpoint", outcome);
+}
+
+// Records `dimension` for `file_id` as a key-only row (empty value).
+void IndexDatabaseWriterContext::insert_index_dimension(
+    int file_id, std::string_view dimension) {
+    auto outcome = db_->put(batch_, cf::DIMENSIONS,
+                            make_dimension_key(file_id, dimension), "");
+    if (outcome.ok()) return;
+    throw_db_error("Failed to insert index dimension", outcome);
+}
+
+void IndexDatabaseWriterContext::insert_hash_table_entry(
+    std::uint8_t type, std::string_view hash, std::string_view name) {
+    const auto hash_to_name_key = encoding::hash_table_forward_key(type, hash);
+    const auto name_to_hash_key = encoding::hash_table_reverse_key(type, name);
+    auto st = db_->put(batch_, cf::HASH_TABLES, hash_to_name_key, name);
+    if (st.ok()) st = db_->put(batch_, cf::HASH_TABLES, name_to_hash_key, hash);
+    if (!st.ok()) throw_db_error("Failed to insert hash table entry", st);
+}
+// Queues a merge of `operand` into the AGGREGATION row at `key`.
+void IndexDatabaseWriterContext::insert_aggregation_merge(
+    std::string_view key, std::string_view operand) {
+    auto outcome = db_->merge(batch_, cf::AGGREGATION, key, operand);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to merge aggregation operand", outcome);
+}
+
+// Queues a put of `value` at `key` in the AGGREGATION column family.
+void IndexDatabaseWriterContext::insert_aggregation_put(
+    std::string_view key, std::string_view value) {
+    auto outcome = db_->put(batch_, cf::AGGREGATION, key, value);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to put aggregation value", outcome);
+}
+
+// Queues a merge of `operand` into the SYSTEM_METRICS row at `key`.
+void IndexDatabaseWriterContext::insert_system_metrics_merge(
+    std::string_view key, std::string_view operand) {
+    auto outcome = db_->merge(batch_, cf::SYSTEM_METRICS, key, operand);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to merge system metrics operand", outcome);
+}
+
+// Queues the encoded stats of one (file, checkpoint, dimension) chunk row.
+void IndexDatabaseWriterContext::insert_chunk_dimension_stats(
+    int file_id, std::uint64_t checkpoint_idx, const ChunkDimensionStats& stats,
+    std::size_t value_counts_cap) {
+    const auto row_key =
+        chunk_dim_stats_key(file_id, checkpoint_idx, stats.dimension);
+    auto outcome = db_->put(
+        batch_, cf::CHUNK_DIM_STATS, row_key,
+        encode_chunk_dimension_stats_value(stats, value_counts_cap));
+    if (outcome.ok()) return;
+    throw_db_error("Failed to insert chunk dimension stats", outcome);
+}
+
+// Queues the archive-level summary row for the tar archive `file_id`.
+void IndexDatabaseWriterContext::insert_tar_archive_metadata(
+    int file_id, std::string_view archive_name, std::uint64_t checkpoint_size,
+    std::uint64_t total_lines, std::uint64_t total_uc_size,
+    std::uint64_t total_files) {
+    const auto encoded = encode_tar_archive_value(
+        archive_name, checkpoint_size, total_lines, total_uc_size, total_files);
+    auto outcome =
+        db_->put(batch_, cf::ARCHIVES, tar_archive_key(file_id), encoded);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to insert tar archive metadata", outcome);
+}
+
+// Queues one tar member row keyed by (file, uncompressed offset, file name).
+void IndexDatabaseWriterContext::insert_tar_file(int file_id,
+                                                 const TarFileRecord& record) {
+    const auto encoded = encode_tar_file_value(record);
+    const auto row_key =
+        tar_file_key(file_id, record.uncompressed_offset, record.file_name);
+    auto outcome = db_->put(batch_, cf::TAR_FILES, row_key, encoded);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to insert tar file metadata", outcome);
+}
+
+// Queues deletes for all CHUNK_BLOOM keys under file prefix + dimension + NUL.
+void IndexDatabaseWriterContext::delete_chunk_bloom_filters(
+    int file_id, std::string_view dimension) {
+    std::string scan_from = prefix_for_file(file_id);
+    scan_from.append(dimension).push_back('\0');
+    std::vector<std::string> doomed;
+    scan_prefix(
+        *db_, cf::CHUNK_BLOOM, scan_from,
+        [&](::rocksdb::Iterator& it) { doomed.push_back(iterator_key(it)); });
+    for (const auto& victim : doomed) {
+        auto outcome = db_->del(batch_, cf::CHUNK_BLOOM, victim);
+        if (outcome.ok()) continue;
+        throw_db_error("Failed to delete chunk bloom", outcome);
+    }
+}
+
+// Queues the delete of one FILE_BLOOM row; a NotFound status is tolerated.
+void IndexDatabaseWriterContext::delete_file_bloom_filter(
+    int file_id, std::string_view dimension) {
+    const auto row_key = file_bloom_key(file_id, dimension);
+    auto outcome = db_->del(batch_, cf::FILE_BLOOM, row_key);
+    if (outcome.ok() || outcome.IsNotFound()) return;
+    throw_db_error("Failed to delete file bloom", outcome);
+}
+
+// Queues a delete for every CHUNK_STATS key under the prefix of `file_id`.
+void IndexDatabaseWriterContext::delete_chunk_statistics(int file_id) {
+    std::vector<std::string> doomed;
+    scan_prefix(
+        *db_, cf::CHUNK_STATS, prefix_for_file(file_id),
+        [&](::rocksdb::Iterator& it) { doomed.push_back(iterator_key(it)); });
+    for (const auto& victim : doomed) {
+        auto outcome = db_->del(batch_, cf::CHUNK_STATS, victim);
+        if (outcome.ok()) continue;
+        throw_db_error("Failed to delete chunk statistics", outcome);
+    }
+}
+
+// Queues a delete for every CHUNK_DIM_STATS key under the prefix of `file_id`.
+void IndexDatabaseWriterContext::delete_chunk_dimension_stats(int file_id) {
+    std::vector<std::string> doomed;
+    scan_prefix(
+        *db_, cf::CHUNK_DIM_STATS, prefix_for_file(file_id),
+        [&](::rocksdb::Iterator& it) { doomed.push_back(iterator_key(it)); });
+    for (const auto& victim : doomed) {
+        auto outcome = db_->del(batch_, cf::CHUNK_DIM_STATS, victim);
+        if (outcome.ok()) continue;
+        throw_db_error("Failed to delete chunk dimension stats", outcome);
+    }
+}
+
+// Queues deletes for every row that belongs to `file_id`:
+//   - all keys under the file prefix in the file-scoped column families,
+//   - the name postings owned by the file (owner row and its "n|" row), and
+//   - the "d|" dimension rows and the "E|" / "M|" / "P|" manifest rows.
+// A NotFound status from an individual delete is tolerated.
+void IndexDatabaseWriterContext::delete_file_contents(int file_id) {
+    const std::string file_prefix = prefix_for_file(file_id);
+
+    // Shared delete step: NotFound is tolerated, anything else is reported.
+    auto queue_delete = [&](std::string_view cf_name, std::string_view key,
+                            std::string_view error_message) {
+        auto outcome = db_->del(batch_, cf_name, key);
+        if (!outcome.ok() && !outcome.IsNotFound()) {
+            throw_db_error(error_message, outcome);
+        }
+    };
+
+    // Collects the keys under `prefix` first, then queues their deletes.
+    auto delete_prefix = [&](std::string_view cf_name,
+                             std::string_view prefix) {
+        std::vector<std::string> doomed;
+        scan_prefix(*db_, cf_name, prefix, [&](::rocksdb::Iterator& it) {
+            doomed.push_back(iterator_key(it));
+        });
+        for (const auto& victim : doomed) {
+            queue_delete(cf_name, victim,
+                         "Failed to delete file-scoped RocksDB data");
+        }
+    };
+
+    // Owner keys are `prefix` + 4 payload bytes (file level) or + 12 payload
+    // bytes (chunk level); keys of any other length are skipped. The primary
+    // key is "n|" + payload[0..4) + be32(owner_fid) + the remaining payload.
+    auto delete_name_postings_by_owner = [&](std::string_view cf_name,
+                                             int owner_fid,
+                                             std::string_view prefix,
+                                             bool chunk_level) {
+        const std::size_t payload_len = chunk_level ? 12 : 4;
+        std::vector<std::string> owner_keys;
+        scan_prefix(*db_, cf_name, prefix, [&](::rocksdb::Iterator& it) {
+            owner_keys.push_back(iterator_key(it));
+        });
+        for (const auto& owner_key : owner_keys) {
+            if (owner_key.size() != prefix.size() + payload_len) {
+                continue;
+            }
+            const std::string_view payload =
+                std::string_view(owner_key).substr(prefix.size());
+            std::string primary_key("n|");
+            primary_key.append(payload.substr(0, 4));
+            rocks::KeyCodec::append_be32(primary_key,
+                                         static_cast<std::uint32_t>(owner_fid));
+            primary_key.append(payload.substr(4));
+            queue_delete(cf_name, primary_key,
+                         "Failed to delete exact name posting");
+            queue_delete(cf_name, owner_key,
+                         "Failed to delete exact name posting");
+        }
+    };
+
+    const std::string_view file_scoped_cfs[] = {
+        rocks::cf::CHECKPOINTS, cf::METADATA, cf::ARCHIVES, cf::TAR_FILES,
+        cf::CHUNK_BLOOM, cf::FILE_BLOOM, cf::CHUNK_STATS, cf::CHUNK_DIM_STATS,
+        cf::FILE_SCALAR_STATS, cf::FILE_CAT_COUNTS, cf::FILE_NAME_COUNTS,
+        cf::FILE_PID_TID_COUNTS};
+    for (const std::string_view cf_name : file_scoped_cfs) {
+        delete_prefix(cf_name, file_prefix);
+    }
+    delete_name_postings_by_owner(cf::NAME_FILE_POSTINGS, file_id,
+                                  name_file_owner_prefix(file_id), false);
+    delete_name_postings_by_owner(cf::NAME_CHUNK_POSTINGS, file_id,
+                                  name_chunk_owner_prefix(file_id), true);
+    delete_prefix(cf::DIMENSIONS, "d|" + file_prefix);
+    for (const char* tag : {"E|", "M|", "P|"}) {
+        delete_prefix(cf::MANIFEST, tag + file_prefix);
+    }
+}
+
+// Queues the encoded line numbers of one (checkpoint, cat, name) event row.
+void IndexDatabaseWriterContext::insert_event_range(
+    int file_id, std::uint64_t checkpoint_idx, std::string_view cat,
+    std::string_view name, std::span<const std::uint32_t> line_numbers) {
+    const auto encoded = encode_event_range_value(line_numbers);
+    const auto row_key = manifest_event_key(file_id, checkpoint_idx, cat, name);
+    auto outcome = db_->put(batch_, cf::MANIFEST, row_key, encoded);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to insert event range", outcome);
+}
+
+// Queues the encoded line numbers for one metadata manifest row.
+void IndexDatabaseWriterContext::insert_metadata_lines(
+    int file_id, std::uint64_t checkpoint_idx, std::string_view meta_type,
+    std::span<const std::uint32_t> line_numbers) {
+    const auto value = encode_metadata_value(line_numbers);
+    const auto key = manifest_metadata_key(file_id, checkpoint_idx, meta_type);
+    auto outcome = db_->put(batch_, cf::MANIFEST, key, value);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to insert metadata lines", outcome);
+}
+
+// Queues the encoded PID set of `file_id` into the MANIFEST column family.
+void IndexDatabaseWriterContext::insert_file_pids(
+    int file_id, const std::unordered_set<std::uint64_t>& pids) {
+    const auto encoded = encoding::encode_file_pids_value(pids);
+    auto outcome = db_->put(batch_, cf::MANIFEST,
+                            encoding::file_pids_key(file_id), encoded);
+    if (outcome.ok()) return;
+    throw_db_error("Failed to insert file PIDs", outcome);
+}
+
+// Queues a delete for every MANIFEST key under "E|" + append_be32(file_id).
+void IndexDatabaseWriterContext::delete_event_ranges(int file_id) {
+    std::string scope("E|");
+    rocks::KeyCodec::append_be32(scope, static_cast<std::uint32_t>(file_id));
+    std::vector<std::string> doomed;
+    scan_prefix(*db_, cf::MANIFEST, scope, [&](::rocksdb::Iterator& it) {
+        doomed.push_back(iterator_key(it));
+    });
+    for (const auto& victim : doomed) {
+        auto outcome = db_->del(batch_, cf::MANIFEST, victim);
+        if (outcome.ok()) continue;
+        throw_db_error("Failed to delete manifest event ranges", outcome);
+    }
+}
+
+// Queues a delete for every MANIFEST key under "M|" + append_be32(file_id).
+void IndexDatabaseWriterContext::delete_metadata_lines(int file_id) {
+    std::string scope("M|");
+    rocks::KeyCodec::append_be32(scope, static_cast<std::uint32_t>(file_id));
+    std::vector<std::string> doomed;
+    scan_prefix(*db_, cf::MANIFEST, scope, [&](::rocksdb::Iterator& it) {
+        doomed.push_back(iterator_key(it));
+    });
+    for (const auto& victim : doomed) {
+        auto outcome = db_->del(batch_, cf::MANIFEST, victim);
+        if (outcome.ok()) continue;
+        throw_db_error("Failed to delete metadata lines", outcome);
+    }
+}
+
+}  // namespace dftracer::utils::utilities::indexer
diff --git a/src/dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h b/src/dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h
index 3c991f9a..d2842d10 100644
--- a/src/dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h
+++ b/src/dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h
@@ -72,18 +72,43 @@ class GzipInflater : public Inflater {
      * Read and analyze data for indexing purposes.
      * Uses Z_BLOCK to detect deflate boundaries and counts lines.
      */
-    coro::CoroTask<bool> read(int fd, off_t& offset,
-                              GzipInflaterResult& result) {
+    coro::CoroTask<bool> read(int fd, off_t& offset, GzipInflaterResult& result,
+                              std::size_t max_input_bytes = 0) {  // 0 = no cap
+        co_return co_await read_into(fd, offset, out_buffer(), BUFFER_SIZE,
+                                     result, max_input_bytes);
+    }
+
+    /**
+     * Like read() but writes uncompressed output directly into the
+     * caller-provided buffer. Enables zero-copy hand-off to downstream
+     * consumers that own their own memory (e.g. parallel-inflate worker
+     * pools that cycle buffers through a channel without memcpy).
+     *
+     * The caller must keep `out_buf` alive for the duration of this call
+     * and not read it until the coroutine resumes with a successful
+     * result.
+     */
+    coro::CoroTask<bool> read_into(int fd, off_t& offset,
+                                   unsigned char* out_buf, std::size_t out_cap,
+                                   GzipInflaterResult& result,
+                                   std::size_t max_input_bytes = 0) {
         result = {0, 0, false, 0};
 
-        stream.next_out = out_buffer;
-        stream.avail_out = sizeof(out_buffer);
+        stream.next_out = out_buf;
+        stream.avail_out = static_cast<uInt>(out_cap);
 
         while (stream.avail_out > 0) {
             // Read input if needed
             if (stream.avail_in == 0) {
+                std::size_t to_read = BUFFER_SIZE;
+                if (max_input_bytes != 0) {
+                    if (total_input_bytes_ >= max_input_bytes) break;
+                    const std::size_t remaining =
+                        max_input_bytes - total_input_bytes_;
+                    if (remaining < to_read) to_read = remaining;
+                }
                 ssize_t n = co_await ::dftracer::utils::io::pread(
-                    fd, in_buffer, sizeof(in_buffer), offset);
+                    fd, in_buffer(), to_read, offset);
                 if (n == 0) {
                     break;  // EOF
                 }
@@ -94,7 +119,7 @@ class GzipInflater : public Inflater {
                     co_return false;  // Return error
                 }
                 offset += n;
-                stream.next_in = in_buffer;
+                stream.next_in = in_buffer();
                 stream.avail_in = static_cast<uInt>(n);
                 total_input_bytes_ += static_cast<std::size_t>(n);
             }
@@ -108,17 +133,15 @@ class GzipInflater : public Inflater {
                         stream.msg ? stream.msg : "no message");
                     break;
                 }
-                // NOTE: inflateReset clears the zlib
-                // sliding window (state->whave = 0). If we continued filling
-                // the output buffer, the next deflate-block boundary would
-                // pass the "avail_out < sizeof" check (using pre-reset
-                // output) while inflateGetDictionary returns an empty window.
-                // Breaking here lets the caller consume the output produced
-                // so far, and the NEXT read() call starts with fresh output
-                // accounting so block-boundary checks only reflect post-reset
-                // output where the window is valid.
+                // If the member produced no output (e.g. an FEXTRA-only
+                // padding member emitted by the padded-striped writer),
+                // keep inflating so the caller never sees a spurious 0-byte
+                // read (which it would treat as EOF). Otherwise break so
+                // the caller can consume the output and we re-enter with
+                // fresh window-accounting post-reset.
                 result.at_block_boundary = false;
-                break;
+                if (stream.avail_out < out_cap) break;
+                continue;
             }
             if (ret != Z_OK) {
                 DFTRACER_UTILS_LOG_DEBUG(
@@ -135,14 +158,14 @@ class GzipInflater : public Inflater {
             // before any data is decompressed.
             if ((stream.data_type & 0xc0) == 0x80) {
                 result.at_block_boundary = true;
-                if (stream.avail_out < sizeof(out_buffer)) {
+                if (stream.avail_out < out_cap) {
                     break;
                 }
             }
         }
 
-        result.bytes_read = sizeof(out_buffer) - stream.avail_out;
-        result.lines_found = count_lines(out_buffer, result.bytes_read);
+        result.bytes_read = out_cap - stream.avail_out;
+        result.lines_found = count_lines(out_buf, result.bytes_read);
         result.input_bytes_consumed = total_input_bytes_ - stream.avail_in;
 
         co_return true;
diff --git a/src/dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h b/src/dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h
new file mode 100644
index 00000000..2df746a1
--- /dev/null
+++ b/src/dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h
@@ -0,0 +1,107 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_COMMON_GZIP_MEMBER_SCANNER_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_COMMON_GZIP_MEMBER_SCANNER_H
+
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/io/io.h>
+#include <sys/stat.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer::internal {
+
+struct GzipMember {
+    std::uint64_t c_offset;  // file offset of the candidate 1F 8B header
+    std::uint64_t c_size;    // gap to next candidate; last runs to EOF
+};
+
+/// Cheap plausibility check of a 10-byte gzip member header at `buf`.
+/// A candidate is accepted only when all of the following hold:
+///   - ID1, ID2, CM are 1F 8B 08;
+///   - the three reserved FLG bits (mask 0xE0) are clear;
+///   - XFL is 0, 2 or 4, and OS is in 0..13 or equal to 255.
+/// The caller must guarantee that buf[0..9] is readable.
+inline bool gzip_header_looks_valid(const unsigned char* buf) noexcept {
+    const bool magic_ok = buf[0] == 0x1F && buf[1] == 0x8B && buf[2] == 0x08;
+    if (!magic_ok) return false;
+    const bool flags_ok = (buf[3] & 0xE0) == 0;  // reserved FLG bits
+    const bool xfl_ok = buf[8] == 0 || buf[8] == 2 || buf[8] == 4;
+    const bool os_ok = buf[9] <= 13 || buf[9] == 255;
+    return flags_ok && xfl_ok && os_ok;
+}
+
+/// Scan `fd` in buffered pread windows and collect, in ascending order, the
+/// compressed byte offset of every position that passes
+/// gzip_header_looks_valid(). Each position is tested once, so `out` holds no
+/// duplicate offsets. `c_size` is the gap to the next candidate's offset; the
+/// last candidate's `c_size` extends to `file_size`.
+///
+/// Returns false with `out` empty when `file_size` is below 18 bytes, when a
+/// pread returns <= 0 before any candidate was found, or when no candidate is
+/// found. A pread that returns <= 0 later ends the scan and returns true.
+/// False positives (the byte pattern appearing inside compressed data
+/// with a plausible header) are possible; callers must treat a returned
+/// list as "candidate" and validate at inflate time.
+inline coro::CoroTask<bool> enumerate_gzip_member_candidates(
+    int fd, std::uint64_t file_size, std::vector<GzipMember>& out) {
+    out.clear();
+    if (file_size < 18)
+        co_return false;  // min gzip: 10 header + 2 deflate + 8 trailer
+
+    constexpr std::size_t WIN = 1 << 20;    // 1 MiB read window
+    constexpr std::size_t HEADER_LEN = 10;  // bytes the validator inspects
+    // Only the last HEADER_LEN - 1 bytes of a window cannot be tested yet.
+    // Carrying more than that would re-test positions that were already
+    // scanned and push the same header twice (a zero-sized member).
+    constexpr std::size_t OVERLAP = HEADER_LEN - 1;
+
+    std::vector<unsigned char> buf(WIN);
+    std::uint64_t pos = 0;  // file offset of the next pread
+    std::size_t carry = 0;  // untested bytes kept at buf[0..carry)
+
+    while (pos < file_size) {
+        const std::uint64_t left = file_size - pos;
+        const std::size_t want =
+            left < WIN - carry ? static_cast<std::size_t>(left) : WIN - carry;
+        ssize_t n = co_await dftracer::utils::io::pread(
+            fd, buf.data() + carry, want, static_cast<off_t>(pos));
+        if (n <= 0) {
+            if (out.empty()) co_return false;
+            break;
+        }
+        const std::size_t avail = carry + static_cast<std::size_t>(n);
+        const std::uint64_t base = pos - carry;
+
+        const std::size_t scan_end =
+            (avail >= HEADER_LEN) ? (avail - OVERLAP) : 0;
+        for (std::size_t i = 0; i < scan_end; ++i) {
+            if (buf[i] != 0x1F) continue;
+            if (gzip_header_looks_valid(buf.data() + i)) {
+                out.push_back({base + i, 0});
+            }
+        }
+
+        pos += static_cast<std::uint64_t>(n);
+        // Move the untested tail to the front so a header straddling the
+        // next read is still caught.
+        carry = avail - scan_end;
+        std::memmove(buf.data(), buf.data() + scan_end, carry);
+    }
+
+    if (out.empty()) co_return false;
+
+    // Fill c_size: gap between consecutive candidates; last one extends to EOF.
+    for (std::size_t i = 0; i + 1 < out.size(); ++i) {
+        out[i].c_size = out[i + 1].c_offset - out[i].c_offset;
+    }
+    out.back().c_size = file_size - out.back().c_offset;
+
+    co_return true;
+}
+
+}  // namespace dftracer::utils::utilities::indexer::internal
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_COMMON_GZIP_MEMBER_SCANNER_H
diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp
index 68f102b5..83628452 100644
--- a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp
+++ b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.cpp
@@ -2,28 +2,35 @@
 #include <dftracer/utils/core/common/constants.h>
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/coro/channel.h>
 #include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/core/rocksdb/async.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/index_visitor.h>
 #include <dftracer/utils/utilities/indexer/internal/checkpoint_size.h>
 #include <dftracer/utils/utilities/indexer/internal/common/gzip_checkpointer.h>
 #include <dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h>
+#include <dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h>
 #include <dftracer/utils/utilities/indexer/internal/error.h>
 #include <dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <dftracer/utils/utilities/indexer/internal/transaction_scope.h>
 #include <fcntl.h>
+#include <sys/stat.h>
 #include <unistd.h>
 
 #include <cstdio>
+#include <cstring>
+#include <memory>
 #include <mutex>
 #include <string>
+#include <utility>
+#include <vector>
 
 namespace dftracer::utils::utilities::indexer::internal::gzip {
 
 using dftracer::utils::utilities::indexer::IndexDatabase;
-namespace rocks = dftracer::utils::rocksdb;
 
 namespace {
 
@@ -50,7 +57,7 @@ void finalize_checkpoints(std::vector<IndexerCheckpoint>& checkpoints,
     }
 }
 
-static dftracer::utils::coro::CoroTask<bool> process_chunks(
+static dftracer::utils::coro::CoroTask<bool> process_chunks_serial(
     int fd, std::uint64_t ckpt_size, std::uint64_t& total_lines,
     std::uint64_t& total_uc_size, std::uint64_t& tail_line_count,
     std::vector<IndexerCheckpoint>& checkpoints,
@@ -67,7 +74,6 @@ static dftracer::utils::coro::CoroTask<bool> process_chunks(
     std::uint64_t line_count_in_chunk = 0;
     std::uint64_t first_line_in_chunk = total_lines + 1;
 
-    std::string line_buf;
     const bool has_visitors = !visitors.empty();
 
     while (true) {
@@ -88,25 +94,16 @@ static dftracer::utils::coro::CoroTask<bool> process_chunks(
         line_count_in_chunk += result.lines_found;
 
         if (has_visitors) {
-            const auto* data = inflater.out_buffer;
-            const std::size_t n = result.bytes_read;
-            std::size_t seg_start = 0;
-            for (std::size_t i = 0; i < n; ++i) {
-                if (data[i] == '\n') {
-                    line_buf.append(
-                        reinterpret_cast<const char*>(data + seg_start),
-                        i - seg_start);
-                    std::string_view line_sv(line_buf);
-                    for (auto& visitor : visitors) {
-                        visitor.get().on_line(line_sv, checkpoint_idx);
-                    }
-                    line_buf.clear();
-                    seg_start = i + 1;
-                }
+            const char* data =
+                reinterpret_cast<const char*>(inflater.out_buffer());
+            for (auto& visitor : visitors) {
+                co_await visitor.get().on_chunk(data, result.bytes_read,
+                                                checkpoint_idx);
             }
-            if (seg_start < n) {
-                line_buf.append(reinterpret_cast<const char*>(data + seg_start),
-                                n - seg_start);
+            for (auto& visitor : visitors) {
+                if (visitor.get().wants_drain()) {
+                    co_await visitor.get().drain_pending();
+                }
             }
         }
 
@@ -135,7 +132,8 @@ static dftracer::utils::coro::CoroTask<bool> process_chunks(
 
                     if (has_visitors) {
                         for (auto& visitor : visitors) {
-                            visitor.get().on_checkpoint(checkpoint_idx - 1);
+                            co_await visitor.get().on_checkpoint(
+                                checkpoint_idx - 1);
                         }
                     }
 
@@ -147,17 +145,445 @@ static dftracer::utils::coro::CoroTask<bool> process_chunks(
         }
     }
 
+    if (has_visitors) {
+        for (auto& visitor : visitors) {
+            co_await visitor.get().flush();
+        }
+    }
+
     total_uc_size = current_uc_offset;
     tail_line_count = line_count_in_chunk;
     co_return true;
 }
 
-static dftracer::utils::coro::CoroTask<bool> build_index(
-    IndexDatabase& db, int file_id, const std::string& gz_path,
-    std::uint64_t ckpt_size, const Indexer::VisitorList& visitors) {
+// -- Parallel path ---------------------------------------------------------
+//
+// When the file is multi-member gzip (the dftracer runtime format), divide
+// the members across N worker coroutines and stream inflated chunks through
+// per-worker channels to a single dispatcher.
+//
+// Checkpoints are emitted by workers at mid-range deflate-block boundaries
+// using GzipCheckpointer (identical semantics to the serial path). The
+// dispatcher finalises each checkpoint with global uc_offset / line numbers
+// and pushes it into the shared `checkpoints` vector in order.
+
+struct ParallelInflateMsg {
+    std::unique_ptr<std::vector<unsigned char>> data;  // inflated chunk bytes
+    // Per-worker monotonic sequence. Load-bearing: moodycamel (our channel
+    // backend) does not guarantee strict FIFO without producer tokens, so
+    // the dispatcher reorders by this before handing chunks to visitors.
+    std::uint64_t seq = 0;
+    std::uint64_t lines = 0;                     // added to dispatcher totals
+    bool has_checkpoint = false;                 // fields below are valid
+    std::vector<unsigned char> dict_compressed;  // checkpoint dictionary
+    int bits = 0;                                // GzipCheckpointer::bits
+    std::uint64_t ckpt_c_offset = 0;             // absolute compressed offset
+};
+
+using ParallelChan = dftracer::utils::coro::Channel<ParallelInflateMsg>;
+
+// Inflates compressed bytes [range_c_start, range_c_end) of `fd` (plus, in
+// extend mode, the tail of a line split across range_c_end) and streams the
+// chunks to `producer`. Returns false only if the inflater init/read fails.
+static dftracer::utils::coro::CoroTask<bool> parallel_worker(
+    int fd, std::uint64_t range_c_start, std::uint64_t range_c_end,
+    std::uint64_t ckpt_size, bool strip_leading_partial,
+    bool extend_to_newline_past_end,
+    dftracer::utils::coro::ChannelProducer<ParallelInflateMsg> producer) {
+    auto guard = producer.guard();
+
+    GzipInflater inflater;
+    if (!(co_await inflater.initialize(fd, range_c_start))) {
+        co_return false;
+    }
+
+    off_t offset = static_cast<off_t>(range_c_start);
+    const std::uint64_t range_c_size = range_c_end - range_c_start;
+    std::uint64_t local_uc = 0;
+    std::uint64_t last_ckpt_uc = 0;
+    std::uint64_t seq = 0;
+    bool leading_partial_done = !strip_leading_partial;
+    // Extend mode (extend_to_newline_past_end): once the input consumed
+    // reaches range_c_size we keep reading, uncapped, until the output
+    // contains a `\n`, and truncate that final chunk right after it. This
+    // emits the one line straddling the slice boundary exactly once (the
+    // next slice strips it as its own leading partial prefix).
+    bool extending = false;
+    bool emit_done = false;
+
+    while (true) {
+        GzipInflaterResult result;
+        // Cap the read at range_c_size normally. While extending, the cap
+        // is lifted (0) so we can read into the next slice's bytes far
+        // enough to find a `\n`.
+        const std::size_t input_cap = extending ? 0 : range_c_size;
+        if (!(co_await inflater.read(fd, offset, result, input_cap))) {
+            co_return false;
+        }
+        if (result.bytes_read == 0) {
+            // `bytes_read==0` means either true EOF or that the input cap
+            // was hit mid-inflate with no new output. In the latter case,
+            // a worker that still has to swallow the straddling split line
+            // flips into extend mode and retries with an uncapped read.
+            if (extend_to_newline_past_end && !emit_done && !extending) {
+                extending = true;
+                continue;
+            }
+            break;
+        }
+
+        // First-chunk leading-partial-line strip: when this worker is
+        // the first of a mid-file slice (range_c_start at a non-initial
+        // member's header), the first inflated bytes continue a line
+        // from the previous member (not owned by this slice). Skip up
+        // to and including the first `\n` so the dispatcher sees a
+        // clean line-boundary start.
+        std::size_t skip_prefix = 0;
+        if (!leading_partial_done) {
+            const unsigned char* out = inflater.out_buffer();
+            for (std::size_t i = 0; i < result.bytes_read; ++i) {
+                if (out[i] == '\n') {
+                    skip_prefix = i + 1;
+                    leading_partial_done = true;
+                    break;
+                }
+            }
+            if (!leading_partial_done) {
+                // No newline in this chunk; entire chunk is continuation
+                // of the previous slice's line. Count bytes but emit
+                // nothing and loop for more.
+                local_uc += result.bytes_read;
+                if (inflater.get_total_input_consumed() >= range_c_size) break;
+                continue;
+            }
+        }
+
+        std::size_t emit_len = result.bytes_read - skip_prefix;
+        local_uc += result.bytes_read;
+
+        // Extending past end: truncate the emit buffer at the first `\n`
+        // at/after the boundary. Everything after that belongs to the
+        // next slice.
+        if (extending && !emit_done) {
+            const unsigned char* p = inflater.out_buffer() + skip_prefix;
+            for (std::size_t i = 0; i < emit_len; ++i) {
+                if (p[i] == '\n') {
+                    emit_len = i + 1;
+                    emit_done = true;
+                    break;
+                }
+            }
+        }
+
+        ParallelInflateMsg msg;
+        msg.seq = seq++;
+        // The dispatcher derives checkpoint line numbers from `lines`, so it
+        // must match the emitted bytes exactly: recount whenever a leading
+        // prefix was stripped or the chunk was truncated past the slice end.
+        if (skip_prefix == 0 && emit_len == result.bytes_read) {
+            msg.lines = result.lines_found;
+        } else {
+            std::uint64_t lines = 0;
+            const unsigned char* p = inflater.out_buffer() + skip_prefix;
+            for (std::size_t i = 0; i < emit_len; ++i) {
+                if (p[i] == '\n') ++lines;
+            }
+            msg.lines = lines;
+        }
+        msg.data = std::make_unique<std::vector<unsigned char>>(
+            inflater.out_buffer() + skip_prefix,
+            inflater.out_buffer() + skip_prefix + emit_len);
+
+        // No checkpoint on a truncated chunk: inflater state is at the end of
+        // the full chunk, beyond the bytes the dispatcher will account for.
+        if (result.at_block_boundary && ckpt_size > 0 && !emit_done &&
+            local_uc - last_ckpt_uc >= ckpt_size) {
+            const std::uint64_t absolute_c =
+                range_c_start + inflater.get_total_input_consumed();
+            GzipCheckpointer cp(inflater, static_cast<std::size_t>(local_uc));
+            if (cp.create(static_cast<std::size_t>(absolute_c))) {
+                std::vector<unsigned char> dict;
+                if (cp.compress(dict)) {
+                    msg.has_checkpoint = true;
+                    msg.dict_compressed = std::move(dict);
+                    msg.bits = cp.bits;
+                    msg.ckpt_c_offset = absolute_c;
+                    last_ckpt_uc = local_uc;
+                }
+            }
+        }
+
+        if (!(co_await producer.send(std::move(msg)))) break;
+
+        if (inflater.get_total_input_consumed() >= range_c_size) {
+            // Normal worker: stop at slice boundary.
+            // Extending worker: stop after we've emitted up to and
+            // including the first `\n` past the boundary.
+            if (!extend_to_newline_past_end || emit_done) break;
+            extending = true;
+        }
+    }
+
+    co_return true;
+}
+
+// Consumes the worker channels one after another in `chans` order, replays
+// each worker's chunks in `seq` order to the visitors, and finalises the
+// workers' checkpoints with dispatcher-wide offsets and line numbers.
+// Returns false if a channel closed with a gap in its sequence numbers.
+static dftracer::utils::coro::CoroTask<bool> parallel_dispatcher(
+    const std::vector<std::shared_ptr<ParallelChan>>& chans,
+    std::uint64_t checkpoint_idx_base, std::uint64_t& total_lines,
+    std::uint64_t& total_uc_size, std::uint64_t& tail_line_count,
+    std::vector<IndexerCheckpoint>& checkpoints,
+    const Indexer::VisitorList& visitors) {
+    const bool has_visitors = !visitors.empty();
+    std::uint64_t checkpoint_idx = checkpoint_idx_base;
+    std::uint64_t global_uc = 0;
+    std::uint64_t line_count_in_chunk = 0;
+    std::uint64_t first_line_in_chunk = total_lines + 1;
+    bool in_order = true;
+
+    // Hands one chunk to the visitors, advances the running totals and, when
+    // the worker attached a checkpoint, records it and starts a new line span.
+    auto process_msg =
+        [&](ParallelInflateMsg& msg) -> dftracer::utils::coro::CoroTask<void> {
+        const std::size_t data_len = msg.data ? msg.data->size() : 0;
+
+        if (has_visitors && data_len > 0) {
+            const char* data = reinterpret_cast<const char*>(msg.data->data());
+            for (auto& v : visitors) {
+                co_await v.get().on_chunk(data, data_len, checkpoint_idx);
+            }
+        }
+
+        global_uc += data_len;
+        total_lines += msg.lines;
+        line_count_in_chunk += msg.lines;
+
+        if (msg.has_checkpoint) {
+            IndexerCheckpoint checkpoint{
+                .checkpoint_idx = checkpoint_idx++,
+                .uc_offset = global_uc,
+                .uc_size = 0,
+                .c_offset = msg.ckpt_c_offset,
+                .c_size = 0,
+                .bits = msg.bits,
+                .dict_compressed = std::move(msg.dict_compressed),
+                .num_lines = line_count_in_chunk,
+                .first_line_num = first_line_in_chunk,
+                .last_line_num = total_lines,
+            };
+            checkpoints.push_back(std::move(checkpoint));
+
+            if (has_visitors) {
+                for (auto& v : visitors) {
+                    co_await v.get().on_checkpoint(checkpoint_idx - 1);
+                }
+            }
+
+            line_count_in_chunk = 0;
+            first_line_in_chunk = total_lines + 1;
+        }
+        co_return;
+    };
+
+    // Gives visitors that report wants_drain() a chance to drain_pending().
+    auto drain_visitors = [&]() -> dftracer::utils::coro::CoroTask<void> {
+        for (auto& v : visitors) {
+            if (v.get().wants_drain()) {
+                co_await v.get().drain_pending();
+            }
+        }
+    };
+
+    // Per-worker reorder buffer: moodycamel::ConcurrentQueue (backing our
+    // coro::Channel) does not guarantee strict FIFO without explicit
+    // producer tokens, so we re-sort by msg.seq here. Channel capacity is
+    // bounded so the buffer is also bounded (~channel capacity entries).
+    for (auto& chan : chans) {
+        std::uint64_t expected_seq = 0;
+        std::map<std::uint64_t, ParallelInflateMsg> pending;
+        while (auto msg_opt = co_await chan->receive()) {
+            auto& incoming = *msg_opt;
+            if (incoming.seq == expected_seq) {
+                co_await process_msg(incoming);
+                co_await drain_visitors();
+                ++expected_seq;
+                auto it = pending.find(expected_seq);
+                while (it != pending.end()) {
+                    co_await process_msg(it->second);
+                    co_await drain_visitors();
+                    pending.erase(it);
+                    ++expected_seq;
+                    it = pending.find(expected_seq);
+                }
+            } else {
+                pending.emplace(incoming.seq, std::move(incoming));
+            }
+        }
+        // The loop above never leaves `expected_seq` buffered, so leftovers
+        // mean a chunk never arrived: fail instead of dropping them silently.
+        if (!pending.empty()) in_order = false;
+    }
+
+    if (has_visitors) {
+        for (auto& v : visitors) co_await v.get().flush();
+    }
+    total_uc_size = global_uc;
+    tail_line_count = line_count_in_chunk;
+    co_return in_order;
+}
+
+// Inflate `members` with up to DEFAULT_MAX_WORKERS concurrent workers feeding
+// one in-order dispatcher. Returns false if the dispatcher or a worker fails.
+static dftracer::utils::coro::CoroTask<bool> process_chunks_parallel(
+    CoroScope* scope, int fd, std::uint64_t slice_c_end,
+    std::uint64_t file_size, std::vector<GzipMember> members,
+    std::uint64_t ckpt_size, std::uint64_t checkpoint_idx_base,
+    bool strip_slice_leading_partial, std::uint64_t& total_lines,
+    std::uint64_t& total_uc_size, std::uint64_t& tail_line_count,
+    std::vector<IndexerCheckpoint>& checkpoints,
+    const Indexer::VisitorList& visitors) {
+    constexpr std::size_t DEFAULT_MAX_WORKERS = 16;
+    constexpr std::size_t CHAN_CAP = 4;
+    const std::size_t num_workers =
+        std::min<std::size_t>(DEFAULT_MAX_WORKERS, members.size());
+
+    std::vector<std::shared_ptr<ParallelChan>> chans;
+    chans.reserve(num_workers);
+    for (std::size_t i = 0; i < num_workers; ++i) {
+        chans.push_back(
+            dftracer::utils::coro::make_channel<ParallelInflateMsg>(CHAN_CAP));
+    }
+
+    // Partition members contiguously, remainder spread over the first few
+    // workers so range counts differ by at most 1.
+    std::vector<std::pair<std::size_t, std::size_t>> ranges(num_workers);
+    const std::size_t per = members.size() / num_workers;
+    const std::size_t rem = members.size() % num_workers;
+    std::size_t cursor = 0;
+    for (std::size_t w = 0; w < num_workers; ++w) {
+        const std::size_t count = per + (w < rem ? 1 : 0);
+        ranges[w] = {cursor, cursor + count};
+        cursor += count;
+    }
+
+    bool dispatcher_ok = true;
+    // One result slot per worker; each worker writes only its own slot.
+    std::vector<unsigned char> worker_ok(num_workers, 0);
+
+    co_await scope->scope([&](CoroScope& child)
+                              -> dftracer::utils::coro::CoroTask<void> {
+        for (std::size_t w = 0; w < num_workers; ++w) {
+            const auto [rs, re] = ranges[w];
+            const std::uint64_t c_start = members[rs].c_offset;
+            const std::uint64_t c_end =
+                (re < members.size()) ? members[re].c_offset : slice_c_end;
+            auto producer = chans[w]->producer();
+            // Only worker 0 of a mid-file slice strips the leading partial
+            // line; later workers continue their predecessor's stream.
+            const bool strip_this = strip_slice_leading_partial && (w == 0);
+            // The last worker of a non-final slice reads past `slice_c_end`
+            // to finish the line straddling the boundary; a slice that ends
+            // at EOF has nothing to extend into.
+            const bool extend_this =
+                (w + 1 == num_workers) && (slice_c_end < file_size);
+            child.spawn([fd, c_start, c_end, ckpt_size, strip_this, extend_this,
+                         ok_slot = &worker_ok[w],
+                         producer = std::move(producer)](CoroScope&) mutable
+                            -> dftracer::utils::coro::CoroTask<void> {
+                *ok_slot = co_await parallel_worker(
+                    fd, c_start, c_end, ckpt_size, strip_this, extend_this,
+                    std::move(producer));
+            });
+        }
+
+        child.spawn([&chans, checkpoint_idx_base, &total_lines, &total_uc_size,
+                     &tail_line_count, &checkpoints, &visitors, &dispatcher_ok](
+                        CoroScope&) -> dftracer::utils::coro::CoroTask<void> {
+            dispatcher_ok = co_await parallel_dispatcher(
+                chans, checkpoint_idx_base, total_lines, total_uc_size,
+                tail_line_count, checkpoints, visitors);
+        });
+        co_return;
+    });
+
+    // A failed worker only closes its channel, which looks like a clean end
+    // of stream to the dispatcher, so check every worker's result here.
+    for (const unsigned char ok : worker_ok) {
+        if (ok == 0) co_return false;
+    }
+    co_return dispatcher_ok;
+}
+
+static dftracer::utils::coro::CoroTask<bool> process_chunks(
+    CoroScope* scope, int fd, std::uint64_t ckpt_size,
+    const GzipMemberSlice* slice, std::uint64_t& total_lines,
+    std::uint64_t& total_uc_size, std::uint64_t& tail_line_count,
+    std::vector<IndexerCheckpoint>& checkpoints,
+    const Indexer::VisitorList& visitors) {
+    // Pre-scanned slice path: caller supplied the member map and a range.
+    // Used by the MPI/distributed indexer to split one file across ranks
+    // without re-scanning. `checkpoint_idx_base` disambiguates keys so
+    // multiple slices of the same file_id produce disjoint SST entries.
+    if (scope != nullptr && slice != nullptr && slice->members != nullptr &&
+        slice->member_end > slice->member_begin) {
+        struct stat st;
+        if (::fstat(fd, &st) != 0) co_return false;
+        const std::uint64_t file_size = static_cast<std::uint64_t>(st.st_size);
+        const auto& all = *slice->members;
+        const std::size_t mb = slice->member_begin;
+        const std::size_t me = slice->member_end;
+        if (me > all.size() || mb >= me) co_return false;
+        // Slice end: next-member offset if this isn't the last slice of
+        // the file, else EOF. Crucial: a non-last slice's workers must
+        // not inflate past this boundary into another slice's bytes.
+        const std::uint64_t slice_c_end =
+            (me < all.size()) ? all[me].c_offset : file_size;
+        std::vector<GzipMember> sliced(all.begin() + mb, all.begin() + me);
+        const bool strip_leading = (mb > 0);
+        co_return co_await process_chunks_parallel(
+            scope, fd, slice_c_end, file_size, std::move(sliced), ckpt_size,
+            slice->checkpoint_idx_base, strip_leading, total_lines,
+            total_uc_size, tail_line_count, checkpoints, visitors);
+    }
+
+    // No usable slice (a slice is ignored when `scope` is null): scan for
+    // member boundaries so we can parallelise. The scan is sequential and
+    // fast relative to inflate. Single-member or tiny (<18 byte) files, and
+    // scan/fstat failures, fall through to the serial inflate loop below.
+    if (scope != nullptr) {
+        struct stat st;
+        if (::fstat(fd, &st) == 0 && st.st_size >= 18) {
+            std::vector<GzipMember> members;
+            const bool scan_ok = co_await enumerate_gzip_member_candidates(
+                fd, static_cast<std::uint64_t>(st.st_size), members);
+            const std::uint64_t sz = static_cast<std::uint64_t>(st.st_size);
+            if (scan_ok && members.size() >= 2) {
+                co_return co_await process_chunks_parallel(
+                    scope, fd, sz, sz, std::move(members), ckpt_size,
+                    /*checkpoint_idx_base=*/0,
+                    /*strip_slice_leading_partial=*/false, total_lines,
+                    total_uc_size, tail_line_count, checkpoints, visitors);
+            }
+        }
+    }
+
+    co_return co_await process_chunks_serial(fd, ckpt_size, total_lines,
+                                             total_uc_size, tail_line_count,
+                                             checkpoints, visitors);
+}
+
+}  // namespace
+
+dftracer::utils::coro::CoroTask<std::optional<GzipBuildArtifacts>>
+build_gzip_index_artifacts(const std::string& gz_path, std::uint64_t ckpt_size,
+                           const Indexer::VisitorList& visitors,
+                           CoroScope* scope, const GzipMemberSlice* slice) {
     int fd = ::open(gz_path.c_str(), O_RDONLY);
     if (fd < 0) {
-        co_return false;
+        co_return std::nullopt;
     }
 
     if (!visitors.empty()) {
@@ -174,35 +600,34 @@ static dftracer::utils::coro::CoroTask<bool> build_index(
     std::uint64_t tail_line_count = 0;
     std::vector<IndexerCheckpoint> checkpoints;
 
-    const bool success =
-        co_await process_chunks(fd, ckpt_size, total_lines, total_uc_size,
-                                tail_line_count, checkpoints, visitors);
+    const bool success = co_await process_chunks(
+        scope, fd, ckpt_size, slice, total_lines, total_uc_size,
+        tail_line_count, checkpoints, visitors);
     ::close(fd);
 
     if (!success) {
-        co_return false;
+        co_return std::nullopt;
     }
 
     finalize_checkpoints(checkpoints, total_uc_size, total_lines,
                          tail_line_count);
 
-    auto* db_ptr = &db;
-    auto* checkpoints_ptr = &checkpoints;
-    co_await rocks::run([db_ptr, file_id, ckpt_size, total_lines, total_uc_size,
-                         checkpoints_ptr] {
-        internal::TransactionScope txn(*db_ptr);
-        for (const auto& checkpoint : *checkpoints_ptr) {
-            db_ptr->insert_checkpoint(file_id, checkpoint);
-        }
-        db_ptr->insert_file_metadata(file_id, ckpt_size, total_lines,
-                                     total_uc_size);
-        txn.commit();
-    });
-
-    co_return true;
+    GzipBuildArtifacts artifacts;
+    artifacts.checkpoint_size = ckpt_size;
+    artifacts.total_lines = total_lines;
+    artifacts.total_uc_size = total_uc_size;
+    artifacts.checkpoints = std::move(checkpoints);
+    co_return artifacts;
 }
 
-}  // namespace
+void persist_gzip_index_artifacts(IndexDatabaseWriterContext& db, int file_id,
+                                  const GzipBuildArtifacts& artifacts) {
+    for (const auto& checkpoint : artifacts.checkpoints) {
+        db.insert_checkpoint(file_id, checkpoint);
+    }
+    db.insert_file_metadata(file_id, artifacts.checkpoint_size,
+                            artifacts.total_lines, artifacts.total_uc_size);
+}
 
 GzipIndexer::GzipIndexer(const std::string& gz_path_,
                          const std::string& idx_path_, std::uint64_t ckpt_size_,
@@ -299,42 +724,36 @@ dftracer::utils::coro::CoroTask<void> GzipIndexer::build_async() const {
     const std::uint64_t final_ckpt_size =
         determine_checkpoint_size(ckpt_size, gz_path);
     const std::string logical = gz_path_logical_path;
-    const auto* logical_ptr = &logical;
-    const int file_id = co_await rocks::run([db_ptr = &db, logical_ptr, hash] {
-        return db_ptr->get_or_create_file_info(*logical_ptr, hash);
-    });
+    auto writer = db.begin_write();
+    const int file_id = writer->get_or_create_file_info(logical, hash);
+    writer->commit();
 
-    if (!(co_await build_index(db, file_id, gz_path, final_ckpt_size,
-                               visitors_))) {
+    auto artifacts = co_await build_gzip_index_artifacts(
+        gz_path, final_ckpt_size, visitors_, nullptr);
+    if (!artifacts) {
         throw IndexerError(IndexerError::Type::BUILD_ERROR,
                            "Failed to build index for " + gz_path);
     }
 
+    {
+        auto w = db.begin_write();
+        persist_gzip_index_artifacts(*w, file_id, *artifacts);
+        w->commit();
+    }
+
     (void)mtime;
     (void)bytes;
-    struct CacheSnapshot {
-        std::uint64_t num_lines = 0;
-        std::uint64_t max_bytes = 0;
-        std::vector<IndexerCheckpoint> checkpoints;
-    };
-    auto snapshot = co_await rocks::run([db_ptr = &db, file_id] {
-        CacheSnapshot cache;
-        cache.num_lines = db_ptr->get_num_lines(file_id);
-        cache.max_bytes = db_ptr->get_max_bytes(file_id);
-        cache.checkpoints = db_ptr->query_checkpoints(file_id);
-        return cache;
-    });
 
     cached_is_valid = true;
     cached_file_id = file_id;
     cached_checkpoint_size = final_ckpt_size;
     cached_checkpoint_size_ready = true;
-    cached_num_lines = snapshot.num_lines;
+    cached_num_lines = db.get_num_lines(file_id);
     cached_num_lines_ready = true;
-    cached_max_bytes = snapshot.max_bytes;
+    cached_max_bytes = db.get_max_bytes(file_id);
     cached_max_bytes_ready = true;
     std::lock_guard<std::mutex> lock(cached_checkpoints_mutex);
-    cached_checkpoints = std::move(snapshot.checkpoints);
+    cached_checkpoints = db.query_checkpoints(file_id);
     co_return;
 }
 
diff --git a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h
index 94c0f04b..21f73e82 100644
--- a/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h
+++ b/src/dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h
@@ -4,20 +4,64 @@
 #include <dftracer/utils/core/common/archive_format.h>
 #include <dftracer/utils/core/common/constants.h>
 #include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/index_visitor.h>
 #include <dftracer/utils/utilities/indexer/internal/checkpoint.h>
+#include <dftracer/utils/utilities/indexer/internal/common/gzip_member_scanner.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
 
 #include <atomic>
 #include <cstddef>
 #include <cstdint>
 #include <mutex>
+#include <optional>
 #include <string>
 #include <vector>
 
 namespace dftracer::utils::utilities::indexer::internal::gzip {
 
+struct GzipBuildArtifacts {
+    std::uint64_t checkpoint_size = 0;           // ckpt_size used for the build
+    std::uint64_t total_lines = 0;               // total lines seen
+    std::uint64_t total_uc_size = 0;             // total uncompressed bytes
+    std::vector<IndexerCheckpoint> checkpoints;  // after finalize_checkpoints()
+};
+
+/// Optional slice of a multi-member gzip file. When set, the indexer
+/// processes only members `[member_begin, member_end)` of the file
+/// (byte range `[members[member_begin].c_offset, members[member_end-1]
+/// .c_offset + members[member_end-1].c_size)`). Used for cross-rank
+/// splitting of large files; uc_offsets/line numbers in emitted
+/// checkpoints are slice-local and `checkpoint_idx` is offset by
+/// `checkpoint_idx_base` so multiple ranks writing the same file_id
+/// produce disjoint keys.
+struct GzipMemberSlice {
+    const std::vector<internal::GzipMember> *members = nullptr;
+    std::size_t member_begin = 0;
+    std::size_t member_end = 0;  // exclusive
+    std::uint64_t checkpoint_idx_base = 0;
+};
+
+/// Build gzip index artifacts (checkpoints, dispatched visitor events).
+///
+/// When `scope` is non-null and the input is multi-member gzip (the
+/// dftracer runtime format), the inflate pass is parallelised across the
+/// scope's executor. On single-member files or when `scope` is null,
+/// falls back to the serial inflate loop with identical semantics.
+///
+/// When `slice` is non-null, only the specified member range is
+/// processed. The caller is responsible for ensuring `slice->members`
+/// outlives this coroutine.
+coro::CoroTask<std::optional<GzipBuildArtifacts>> build_gzip_index_artifacts(
+    const std::string &gz_path, std::uint64_t ckpt_size,
+    const Indexer::VisitorList &visitors, CoroScope *scope = nullptr,
+    const GzipMemberSlice *slice = nullptr);
+
+void persist_gzip_index_artifacts(IndexDatabaseWriterContext &db, int file_id,
+                                  const GzipBuildArtifacts &artifacts);
+
 class GzipIndexer : public Indexer {
    public:
     static constexpr std::uint64_t DEFAULT_CHECKPOINT_SIZE =
diff --git a/src/dftracer/utils/utilities/indexer/internal/helpers.cpp b/src/dftracer/utils/utilities/indexer/internal/helpers.cpp
index e16da3ee..bbd1dd4f 100644
--- a/src/dftracer/utils/utilities/indexer/internal/helpers.cpp
+++ b/src/dftracer/utils/utilities/indexer/internal/helpers.cpp
@@ -40,7 +40,6 @@ std::string normalize_index_root(std::string_view path) {
 
 time_t get_file_modification_time(const std::string &file_path) {
 #if defined(DFTRACER_UTILS_USE_STD_FS)
-    // Use std::filesystem when available and working
     auto ftime = fs::last_write_time(file_path);
     auto sctp =
         std::chrono::time_point_cast<std::chrono::system_clock::duration>(
diff --git a/src/dftracer/utils/utilities/indexer/internal/index_batch_writer.h b/src/dftracer/utils/utilities/indexer/internal/index_batch_writer.h
new file mode 100644
index 00000000..8b36b804
--- /dev/null
+++ b/src/dftracer/utils/utilities/indexer/internal/index_batch_writer.h
@@ -0,0 +1,120 @@
+#ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_BATCH_WRITER_H
+#define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_BATCH_WRITER_H
+
+#include <dftracer/utils/core/coro/channel.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/hash_table_visitor.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/manifest_visitor.h>
+#include <dftracer/utils/utilities/indexer/index_batch_sink.h>
+#include <dftracer/utils/utilities/indexer/internal/gzip/gzip_indexer.h>
+
+#include <atomic>
+#include <chrono>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer::internal {
+
+using composites::dft::visitors::BloomVisitor;
+using composites::dft::visitors::HashTableVisitor;
+using composites::dft::visitors::ManifestVisitor;
+
+/// One file's parsed indexing output, handed through a channel to
+/// `index_batch_write_worker`, which writes it into an `IndexBatchSink`.
+struct ParsedIndexJob {
+    /// Id passed to every sink insert/finalize call made for this job.
+    int file_id = 0;
+    /// NOTE(review): presumably the path of the indexed file; the batch
+    /// writer in this header does not read it.
+    std::string file_path;
+    /// Source of the checkpoints and of the checkpoint_size / total_lines /
+    /// total_uc_size values the batch writer inserts.
+    gzip::GzipBuildArtifacts artifacts;
+    /// Optional visitors: the batch writer finalizes each non-null one into
+    /// the sink and ignores null ones.
+    std::unique_ptr<BloomVisitor> bloom_visitor;
+    std::unique_ptr<HashTableVisitor> hash_table_visitor;
+    std::unique_ptr<ManifestVisitor> manifest_visitor;
+    /// Jobs that arrive with `success == false` are skipped by the batch
+    /// writer; it also clears the flag when writing the job throws.
+    bool success = true;
+    /// Set by the batch writer to the exception's `what()` on write failure.
+    std::string error_message;
+};
+
+/// Counters shared between batch write workers and whoever reports on them.
+/// All updates made by `index_batch_write_worker` use relaxed atomics.
+struct BatchWriterMetrics {
+    /// Nanoseconds spent in flushes, measured from just before `make_sink()`
+    /// to just after `commit_sink()` returns.
+    std::atomic<std::uint64_t> write_ns{0};
+    /// Jobs still marked successful once their batch was committed.
+    std::atomic<std::size_t> files_written{0};
+    /// Number of flushes that returned from `commit_sink()`.
+    std::atomic<std::size_t> batches_committed{0};
+    /// Jobs whose sink writes threw. Their `error_message` is dropped when
+    /// the batch is cleared, so this counter is the only surviving trace.
+    std::atomic<std::size_t> files_failed{0};
+};
+
+/// Drain `channel`, group `ParsedIndexJob`s into batches of `batch_size`,
+/// and commit each batch through a fresh `IndexBatchSink` produced by
+/// `make_sink()`. The caller-provided `commit_sink(sink)` finalises the
+/// batch: for RocksDB-backed sinks it calls `.commit()`; for SST-backed
+/// sinks it flushes to disk and routes `Artifacts` to a registry.
+///
+/// `MakeSink` must be invocable as `() -> std::unique_ptr<IndexBatchSink>`
+/// (or any subclass thereof). `CommitSink` must be invocable as
+/// `(IndexBatchSink&) -> void`.
+///
+/// Failure handling:
+///  - Jobs that arrive with `success == false` are skipped.
+///  - A `std::exception` thrown while writing one job marks only that job as
+///    failed and bumps `metrics->files_failed`; the remaining jobs of the
+///    batch are still written and the batch is still committed. Records the
+///    failing job inserted before it threw remain in the sink.
+///  - Exceptions from `make_sink()` / `commit_sink()` are not caught here.
+///
+/// `metrics` may be null, in which case nothing is recorded. Any jobs left
+/// over when `channel->receive()` stops yielding items are flushed as a final
+/// (possibly short) batch.
+template <typename MakeSink, typename CommitSink>
+inline coro::CoroTask<void> index_batch_write_worker(
+    coro::Channel<ParsedIndexJob>* channel, std::size_t batch_size,
+    BatchWriterMetrics* metrics, MakeSink make_sink, CommitSink commit_sink) {
+    std::vector<ParsedIndexJob> batch;
+    batch.reserve(batch_size);
+
+    // Invoked synchronously from this coroutine only, so capturing the
+    // coroutine's locals and parameters by reference is safe.
+    auto flush = [&]() {
+        if (batch.empty()) return;
+        const auto start = std::chrono::steady_clock::now();
+
+        auto sink_owned = make_sink();
+        IndexBatchSink& sink = *sink_owned;
+        std::size_t written = 0;
+        std::size_t failed = 0;
+        for (auto& job : batch) {
+            if (!job.success) continue;  // failed before reaching the writer
+            try {
+                for (const auto& checkpoint : job.artifacts.checkpoints) {
+                    sink.insert_checkpoint(job.file_id, checkpoint);
+                }
+                sink.insert_file_metadata(
+                    job.file_id, job.artifacts.checkpoint_size,
+                    job.artifacts.total_lines, job.artifacts.total_uc_size);
+                if (job.bloom_visitor) {
+                    job.bloom_visitor->finalize_sink_only(sink, job.file_id);
+                }
+                if (job.hash_table_visitor) {
+                    job.hash_table_visitor->finalize(sink, job.file_id);
+                }
+                if (job.manifest_visitor) {
+                    job.manifest_visitor->finalize(sink, job.file_id);
+                }
+                ++written;
+            } catch (const std::exception& e) {
+                job.success = false;
+                job.error_message = e.what();
+                // The job is discarded by batch.clear() below; count the
+                // failure so it is not lost silently.
+                ++failed;
+            }
+        }
+        commit_sink(sink);
+
+        const auto end = std::chrono::steady_clock::now();
+        if (metrics) {
+            metrics->write_ns.fetch_add(
+                static_cast<std::uint64_t>(
+                    std::chrono::duration_cast<std::chrono::nanoseconds>(end -
+                                                                         start)
+                        .count()),
+                std::memory_order_relaxed);
+            metrics->files_written.fetch_add(written,
+                                             std::memory_order_relaxed);
+            metrics->files_failed.fetch_add(failed,
+                                            std::memory_order_relaxed);
+            metrics->batches_committed.fetch_add(1, std::memory_order_relaxed);
+        }
+        batch.clear();
+    };
+
+    while (auto item = co_await channel->receive()) {
+        batch.push_back(std::move(*item));
+        if (batch.size() >= batch_size) {
+            flush();
+        }
+    }
+    flush();  // trailing partial batch
+    co_return;
+}
+
+}  // namespace dftracer::utils::utilities::indexer::internal
+
+#endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_INDEX_BATCH_WRITER_H
diff --git a/src/dftracer/utils/utilities/indexer/internal/index_encoding.cpp b/src/dftracer/utils/utilities/indexer/internal/index_encoding.cpp
new file mode 100644
index 00000000..2b535767
--- /dev/null
+++ b/src/dftracer/utils/utilities/indexer/internal/index_encoding.cpp
@@ -0,0 +1,309 @@
+#include <dftracer/utils/core/rocksdb/key_codec.h>
+#include <dftracer/utils/utilities/indexer/internal/index_encoding.h>
+#include <dftracer/utils/utilities/indexer/internal/payload_codec.h>
+
+#include <algorithm>
+#include <vector>
+
+namespace dftracer::utils::utilities::indexer::internal::encoding {
+
+namespace {
+namespace rocks = dftracer::utils::rocksdb;
+}  // namespace
+
+// Per-file key prefix: `file_id` reinterpreted as uint32 and encoded by
+// `KeyCodec::encode_be32`.
+std::string prefix_for_file(int file_id) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    return rocks::KeyCodec::encode_be32(unsigned_id);
+}
+
+// Key of a file's metadata record; it is exactly the bare file prefix.
+std::string metadata_key(int file_id) {
+    std::string key = prefix_for_file(file_id);
+    return key;
+}
+
+// Checkpoint key: the file prefix, then `uc_offset`, then `checkpoint_idx`,
+// each appended with `append_u64`.
+std::string checkpoint_key(int file_id, std::uint64_t uc_offset,
+                           std::uint64_t checkpoint_idx) {
+    auto out = prefix_for_file(file_id);
+    for (const std::uint64_t field : {uc_offset, checkpoint_idx}) {
+        append_u64(out, field);
+    }
+    return out;
+}
+
+// Manifest event key: "E|", the file id (via `KeyCodec::append_be32`), the
+// checkpoint index (via `append_u64`), then `cat`, a NUL byte, and `name`.
+std::string manifest_event_key(int file_id, std::uint64_t checkpoint_idx,
+                               std::string_view cat, std::string_view name) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+
+    std::string out = "E|";
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    append_u64(out, checkpoint_idx);
+    out += cat;
+    out += '\0';  // separates category from name
+    out += name;
+    return out;
+}
+
+// Manifest metadata key: "M|", the file id (via `KeyCodec::append_be32`), the
+// checkpoint index (via `append_u64`), then the raw `meta_type` bytes.
+std::string manifest_metadata_key(int file_id, std::uint64_t checkpoint_idx,
+                                  std::string_view meta_type) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+
+    std::string out = "M|";
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    append_u64(out, checkpoint_idx);
+    out += meta_type;
+    return out;
+}
+
+// File metadata payload: `checkpoint_size`, `total_lines` and
+// `total_uc_size`, in that order, each appended with `append_u64`.
+std::string encode_metadata_record(std::uint64_t checkpoint_size,
+                                   std::uint64_t total_lines,
+                                   std::uint64_t total_uc_size) {
+    std::string payload;
+    for (const std::uint64_t field :
+         {checkpoint_size, total_lines, total_uc_size}) {
+        append_u64(payload, field);
+    }
+    return payload;
+}
+
+// Serializes one checkpoint. The field order below is the stored layout:
+// uc_size, c_offset, c_size, bits, dict_compressed (as a blob), num_lines,
+// first_line_num, last_line_num.
+std::string encode_checkpoint_value(const IndexerCheckpoint& checkpoint) {
+    const IndexerCheckpoint& cp = checkpoint;
+
+    std::string payload;
+    append_u64(payload, cp.uc_size);
+    append_u64(payload, cp.c_offset);
+    append_u64(payload, cp.c_size);
+    append_i64(payload, cp.bits);
+    append_blob(payload, cp.dict_compressed);
+    append_u64(payload, cp.num_lines);
+    append_u64(payload, cp.first_line_num);
+    append_u64(payload, cp.last_line_num);
+    return payload;
+}
+
+namespace {
+
+// Appends `lines` to `out` as a length-prefixed blob: the byte length written
+// by `KeyCodec::append_be32`, followed by the uint32 values copied verbatim
+// from memory (host byte order, i.e. little-endian on little-endian hosts).
+// Intended to match the layout produced by `append_blob`, which is defined
+// elsewhere.
+void append_line_numbers_blob(std::string& out,
+                              std::span<const std::uint32_t> lines) {
+    const auto byte_count =
+        static_cast<std::uint32_t>(lines.size() * sizeof(std::uint32_t));
+    rocks::KeyCodec::append_be32(out, byte_count);
+    if (lines.empty()) {
+        return;  // length prefix only; nothing to copy
+    }
+    const auto* raw = reinterpret_cast<const char*>(lines.data());
+    out.append(raw, byte_count);
+}
+
+}  // namespace
+
+// Event range payload: the number of line entries (via `append_u64`)
+// followed by the line-number blob.
+std::string encode_event_range_value(std::span<const std::uint32_t> lines) {
+    constexpr std::size_t kHeaderBytes =
+        sizeof(std::uint64_t) + sizeof(std::uint32_t);
+    const std::size_t body_bytes = lines.size() * sizeof(std::uint32_t);
+
+    std::string payload;
+    payload.reserve(kHeaderBytes + body_bytes);
+    append_u64(payload, lines.size());
+    append_line_numbers_blob(payload, lines);
+    return payload;
+}
+
+// Manifest metadata payload: just the line-number blob, with no leading
+// entry count.
+std::string encode_metadata_value(std::span<const std::uint32_t> lines) {
+    const std::size_t body_bytes = lines.size() * sizeof(std::uint32_t);
+
+    std::string payload;
+    payload.reserve(sizeof(std::uint32_t) + body_bytes);
+    append_line_numbers_blob(payload, lines);
+    return payload;
+}
+
+// Key of a file's PID set: "P|" followed by the file id as written by
+// `KeyCodec::append_be32`.
+std::string file_pids_key(int file_id) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    std::string out = "P|";
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    return out;
+}
+
+// Dimension key: "d|", the file id (via `KeyCodec::append_be32`), then the
+// raw `dimension` bytes.
+std::string make_dimension_key(int file_id, std::string_view dimension) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    std::string out = "d|";
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    out += dimension;
+    return out;
+}
+
+// Per-chunk bloom key: file prefix, `dimension`, a NUL byte, then the
+// checkpoint index (via `append_u64`).
+std::string chunk_bloom_key(int file_id, std::string_view dimension,
+                            std::uint64_t checkpoint_idx) {
+    auto out = prefix_for_file(file_id);
+    out += dimension;
+    out += '\0';  // terminates the variable-length dimension name
+    append_u64(out, checkpoint_idx);
+    return out;
+}
+
+// Per-file bloom key: file prefix followed by the raw `dimension` bytes.
+std::string file_bloom_key(int file_id, std::string_view dimension) {
+    auto out = prefix_for_file(file_id);
+    out += dimension;
+    return out;
+}
+
+// Per-chunk statistics key: file prefix followed by the checkpoint index
+// (via `append_u64`).
+std::string chunk_stats_key(int file_id, std::uint64_t checkpoint_idx) {
+    auto out = prefix_for_file(file_id);
+    append_u64(out, checkpoint_idx);
+    return out;
+}
+
+// Key of a file's scalar statistics; identical to the bare file prefix.
+// NOTE(review): presumably kept apart from the other prefix-only keys by the
+// column family it is stored in - confirm at the call sites.
+std::string file_scalar_stats_key(int file_id) {
+    std::string key = prefix_for_file(file_id);
+    return key;
+}
+
+// Key of a file's category counts; identical to the bare file prefix.
+// NOTE(review): presumably kept apart from the other prefix-only keys by the
+// column family it is stored in - confirm at the call sites.
+std::string file_category_counts_key(int file_id) {
+    std::string key = prefix_for_file(file_id);
+    return key;
+}
+
+// Key of a file's pid/tid counts; identical to the bare file prefix.
+// NOTE(review): presumably kept apart from the other prefix-only keys by the
+// column family it is stored in - confirm at the call sites.
+std::string file_pid_tid_counts_key(int file_id) {
+    std::string key = prefix_for_file(file_id);
+    return key;
+}
+
+// Key of a file's name counts; identical to the bare file prefix.
+// NOTE(review): presumably kept apart from the other prefix-only keys by the
+// column family it is stored in - confirm at the call sites.
+std::string file_name_counts_key(int file_id) {
+    std::string key = prefix_for_file(file_id);
+    return key;
+}
+
+// Per-chunk dimension statistics key: file prefix, checkpoint index (via
+// `append_u64`), then the raw `dimension` bytes.
+std::string chunk_dim_stats_key(int file_id, std::uint64_t checkpoint_idx,
+                                std::string_view dimension) {
+    auto out = prefix_for_file(file_id);
+    append_u64(out, checkpoint_idx);
+    out += dimension;
+    return out;
+}
+
+// Bloom payload: `num_entries` (via `append_u64`) followed by the filter
+// bytes exactly as given in `blob`, with no length prefix.
+std::string encode_bloom_value(std::span<const unsigned char> blob,
+                               std::uint64_t num_entries) {
+    std::string payload;
+    append_u64(payload, num_entries);
+    const auto* raw = reinterpret_cast<const char*>(blob.data());
+    payload.append(raw, blob.size());
+    return payload;
+}
+
+// Serializes per-chunk statistics. The order of the appends below is the
+// stored layout and must not be rearranged:
+//   scalar counters -> duration sketch -> duration histogram JSON ->
+//   per-name duration sketches -> per-name JSON sections -> timestamp
+//   histogram.
+std::string encode_chunk_statistics_value(
+    const composites::dft::indexing::ChunkStatistics& stats) {
+    std::string payload;
+
+    // Scalar counters.
+    append_u64(payload, stats.total_events);
+    append_u64(payload, stats.min_timestamp_us);
+    append_u64(payload, stats.max_timestamp_us);
+    append_i64(payload, stats.duration_sum_us);
+    append_u64(payload, stats.duration_min_us);
+    append_u64(payload, stats.duration_max_us);
+    append_u64(payload, stats.duration_count);
+    append_double(payload, stats.duration_m2);
+
+    // Overall duration distribution.
+    auto sketch_bytes = stats.duration_sketch.serialize();
+    append_blob(payload, sketch_bytes);
+
+    auto histogram_json = stats.duration_histogram.to_json();
+    append_string(payload, histogram_json);
+
+    // Per-name duration data.
+    auto per_name_sketches = stats.serialize_name_duration_sketches();
+    append_blob(payload, per_name_sketches);
+
+    const auto per_name_histograms = stats.name_duration_histograms_json();
+    append_string(payload, per_name_histograms);
+    const auto per_name_sums = stats.name_duration_sums_json();
+    append_string(payload, per_name_sums);
+    const auto per_name_sum_sqs = stats.name_duration_sum_sqs_json();
+    append_string(payload, per_name_sum_sqs);
+    const auto name_categories = stats.name_category_json();
+    append_string(payload, name_categories);
+
+    // Timestamp histogram goes last.
+    auto timestamp_bytes = stats.timestamp_histogram.serialize();
+    append_blob(payload, timestamp_bytes);
+
+    return payload;
+}
+
+// Serializes per-chunk statistics for one dimension: distinct count, min
+// value, max value, value type, then a one-byte presence flag. When
+// `compress_value_counts(value_counts_cap)` yields a value the flag is 1 and
+// the compressed blob follows; otherwise the flag is 0 and nothing follows.
+std::string encode_chunk_dimension_stats_value(
+    const composites::dft::indexing::ChunkDimensionStats& stats,
+    std::size_t value_counts_cap) {
+    std::string payload;
+    append_u64(payload, stats.distinct_count);
+    append_string(payload, stats.min_value);
+    append_string(payload, stats.max_value);
+    append_string(payload, stats.value_type);
+
+    auto packed_counts = stats.compress_value_counts(value_counts_cap);
+    if (packed_counts.has_value()) {
+        append_u8(payload, 1);
+        append_blob(payload, *packed_counts);
+    } else {
+        append_u8(payload, 0);
+    }
+    return payload;
+}
+
+// Name -> id lookup key: "s|" followed by the raw `name` bytes.
+std::string name_lookup_key(std::string_view name) {
+    std::string out = "s|";
+    out += name;
+    return out;
+}
+
+// Id -> name reverse key: "i|" followed by `name_id` (via `append_u64`).
+std::string name_reverse_key(std::uint64_t name_id) {
+    std::string out = "i|";
+    append_u64(out, name_id);
+    return out;
+}
+
+// File-level posting key: "n|", `name_id` (via `append_u64`), then the file
+// id (via `KeyCodec::append_be32`).
+std::string name_file_posting_key(std::uint64_t name_id, int file_id) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    std::string out = "n|";
+    append_u64(out, name_id);
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    return out;
+}
+
+// File-level owner key: "o|", the file id (via `KeyCodec::append_be32`), then
+// `name_id` (via `append_u64`).
+std::string name_file_owner_key(int file_id, std::uint64_t name_id) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    std::string out = "o|";
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    append_u64(out, name_id);
+    return out;
+}
+
+// Prefix shared by all file-level owner keys of one file: "o|" followed by
+// the file id (via `KeyCodec::append_be32`).
+std::string name_file_owner_prefix(int file_id) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    std::string out = "o|";
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    return out;
+}
+
+// Chunk-level posting key: "n|", `name_id` (via `append_u64`), the file id
+// (via `KeyCodec::append_be32`), then the checkpoint index (via
+// `append_u64`).
+std::string name_chunk_posting_key(std::uint64_t name_id, int file_id,
+                                   std::uint64_t checkpoint_idx) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    std::string out = "n|";
+    append_u64(out, name_id);
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    append_u64(out, checkpoint_idx);
+    return out;
+}
+
+// Chunk-level owner key: "o|", the file id (via `KeyCodec::append_be32`),
+// `name_id`, then the checkpoint index (both via `append_u64`).
+std::string name_chunk_owner_key(int file_id, std::uint64_t name_id,
+                                 std::uint64_t checkpoint_idx) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    std::string out = "o|";
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    append_u64(out, name_id);
+    append_u64(out, checkpoint_idx);
+    return out;
+}
+
+// Prefix shared by all chunk-level owner keys of one file: "o|" followed by
+// the file id (via `KeyCodec::append_be32`).
+std::string name_chunk_owner_prefix(int file_id) {
+    const auto unsigned_id = static_cast<std::uint32_t>(file_id);
+    std::string out = "o|";
+    rocks::KeyCodec::append_be32(out, unsigned_id);
+    return out;
+}
+
+// Forward hash-table key: one tag byte holding `type`, followed by the raw
+// `hash` bytes.
+std::string hash_table_forward_key(std::uint8_t type, std::string_view hash) {
+    const char tag = static_cast<char>(type);
+
+    std::string out;
+    out.reserve(hash.size() + 1);
+    out += tag;
+    out += hash;
+    return out;
+}
+
+// Reverse hash-table key: one tag byte holding `type + 4` (narrowed to
+// `char`), followed by the raw `name` bytes.
+std::string hash_table_reverse_key(std::uint8_t type, std::string_view name) {
+    const int shifted = type + 4;  // promoted to int before narrowing
+    const char tag = static_cast<char>(shifted);
+
+    std::string out;
+    out.reserve(name.size() + 1);
+    out += tag;
+    out += name;
+    return out;
+}
+
+// Serializes a PID set as base-128 varints (7 payload bits per byte, low
+// bits first, high bit set on every byte except the last): first the number
+// of PIDs, then each PID in ascending order so the output does not depend on
+// the set's iteration order.
+std::string encode_file_pids_value(
+    const std::unordered_set<std::uint64_t>& pids) {
+    std::vector<std::uint64_t> ordered(pids.begin(), pids.end());
+    std::sort(ordered.begin(), ordered.end());
+
+    std::string out;
+    const auto put_varint = [&out](std::uint64_t remaining) {
+        for (; remaining >= 0x80; remaining >>= 7) {
+            out.push_back(static_cast<char>((remaining & 0x7F) | 0x80));
+        }
+        out.push_back(static_cast<char>(remaining));
+    };
+
+    put_varint(ordered.size());
+    for (const std::uint64_t pid : ordered) {
+        put_varint(pid);
+    }
+    return out;
+}
+
+}  // namespace dftracer::utils::utilities::indexer::internal::encoding
diff --git a/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp b/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp
index 157553ed..3620c580 100644
--- a/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp
+++ b/src/dftracer/utils/utilities/indexer/internal/tar/tar_indexer.cpp
@@ -1,8 +1,8 @@
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
 #include <dftracer/utils/core/coro/task.h>
-#include <dftracer/utils/core/rocksdb/async.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/common/gzip_inflater.h>
 #include <dftracer/utils/utilities/indexer/internal/error.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
@@ -18,7 +18,7 @@
 namespace dftracer::utils::utilities::indexer::internal::tar {
 
 using dftracer::utils::utilities::indexer::IndexDatabase;
-namespace rocks = dftracer::utils::rocksdb;
+using dftracer::utils::utilities::indexer::IndexDatabaseWriterContext;
 
 namespace {
 
@@ -37,8 +37,8 @@ std::string normalize_idx_path(const std::string& path) {
 }
 
 dftracer::utils::coro::CoroTask<bool> build_tar_index(
-    IndexDatabase& db, int file_id, const std::string& tar_gz_path,
-    std::uint64_t ckpt_size) {
+    IndexDatabaseWriterContext& writer, int file_id,
+    const std::string& tar_gz_path, std::uint64_t ckpt_size) {
     int fd = ::open(tar_gz_path.c_str(), O_RDONLY);
     if (fd < 0) {
         co_return false;
@@ -73,8 +73,8 @@ dftracer::utils::coro::CoroTask<bool> build_tar_index(
             break;
         }
 
-        accumulated_data.insert(accumulated_data.end(), inflater.out_buffer,
-                                inflater.out_buffer + result.bytes_read);
+        accumulated_data.insert(accumulated_data.end(), inflater.out_buffer(),
+                                inflater.out_buffer() + result.bytes_read);
         current_uc_offset += result.bytes_read;
         total_lines += result.lines_found;
     }
@@ -87,38 +87,29 @@ dftracer::utils::coro::CoroTask<bool> build_tar_index(
 
     total_uc_size = current_uc_offset;
 
-    auto* db_ptr = &db;
-    auto* tar_entries_ptr = &tar_entries;
     const std::string archive_name = fs::path(tar_gz_path).filename().string();
-    const auto* archive_name_ptr = &archive_name;
-    co_await rocks::run([db_ptr, file_id, ckpt_size, total_lines, total_uc_size,
-                         tar_entries_ptr, archive_name_ptr] {
-        internal::TransactionScope txn(*db_ptr);
-        std::uint64_t regular_files = 0;
-        for (const auto& entry : *tar_entries_ptr) {
-            if (!entry.is_regular_file()) {
-                continue;
-            }
-
-            ++regular_files;
-            db_ptr->insert_tar_file(
-                file_id, IndexDatabase::TarFileRecord{
-                             .file_name = entry.name,
-                             .file_size = entry.size,
-                             .file_mtime = entry.mtime,
-                             .typeflag = entry.typeflag,
-                             .data_offset = entry.data_offset,
-                             .uncompressed_offset = entry.uncompressed_offset,
-                         });
+    std::uint64_t regular_files = 0;
+    for (const auto& entry : tar_entries) {
+        if (!entry.is_regular_file()) {
+            continue;
         }
 
-        db_ptr->insert_file_metadata(file_id, ckpt_size, total_lines,
-                                     total_uc_size);
-        db_ptr->insert_tar_archive_metadata(file_id, *archive_name_ptr,
-                                            ckpt_size, total_lines,
-                                            total_uc_size, regular_files);
-        txn.commit();
-    });
+        ++regular_files;
+        writer.insert_tar_file(
+            file_id, IndexDatabaseWriterContext::TarFileRecord{
+                         .file_name = entry.name,
+                         .file_size = entry.size,
+                         .file_mtime = entry.mtime,
+                         .typeflag = entry.typeflag,
+                         .data_offset = entry.data_offset,
+                         .uncompressed_offset = entry.uncompressed_offset,
+                     });
+    }
+
+    writer.insert_file_metadata(file_id, ckpt_size, total_lines, total_uc_size);
+    writer.insert_tar_archive_metadata(file_id, archive_name, ckpt_size,
+                                       total_lines, total_uc_size,
+                                       regular_files);
 
     ::close(fd);
     co_return true;
@@ -198,18 +189,18 @@ dftracer::utils::coro::CoroTask<void> TarIndexer::build_async() const {
     }
 
     IndexDatabase db(index_path);
+    auto writer = db.begin_write();
     const auto hash = calculate_file_hash(tar_gz_path);
     const std::string logical = tar_gz_path_logical_path;
-    const auto* logical_ptr = &logical;
-    const int file_id = co_await rocks::run([db_ptr = &db, logical_ptr, hash] {
-        return db_ptr->get_or_create_file_info(*logical_ptr, hash);
-    });
+    const int file_id = writer->get_or_create_file_info(logical, hash);
 
-    if (!(co_await build_tar_index(db, file_id, tar_gz_path, ckpt_size))) {
+    if (!(co_await build_tar_index(*writer, file_id, tar_gz_path, ckpt_size))) {
         throw IndexerError(IndexerError::Type::BUILD_ERROR,
                            "Failed to build TAR index for " + tar_gz_path);
     }
 
+    writer->commit();
+
     struct CacheSnapshot {
         std::uint64_t checkpoint_size = 0;
         std::uint64_t num_lines = 0;
@@ -220,22 +211,18 @@ dftracer::utils::coro::CoroTask<void> TarIndexer::build_async() const {
     };
     const std::string fallback_archive_name =
         fs::path(tar_gz_path).filename().string();
-    const auto* fallback_archive_name_ptr = &fallback_archive_name;
-    auto snapshot =
-        co_await rocks::run([db_ptr = &db, file_id, fallback_archive_name_ptr] {
-            CacheSnapshot cache;
-            cache.checkpoint_size = db_ptr->get_checkpoint_size(file_id);
-            cache.num_lines = db_ptr->get_num_lines(file_id);
-            cache.max_bytes = db_ptr->get_max_bytes(file_id);
-            if (auto metadata = db_ptr->query_tar_archive_metadata(file_id)) {
-                cache.num_files = metadata->total_files;
-                cache.archive_name = metadata->archive_name;
-            } else {
-                cache.archive_name = *fallback_archive_name_ptr;
-            }
-            cache.checkpoints = db_ptr->query_checkpoints(file_id);
-            return cache;
-        });
+
+    CacheSnapshot snapshot;
+    snapshot.checkpoint_size = db.get_checkpoint_size(file_id);
+    snapshot.num_lines = db.get_num_lines(file_id);
+    snapshot.max_bytes = db.get_max_bytes(file_id);
+    if (auto metadata = db.query_tar_archive_metadata(file_id)) {
+        snapshot.num_files = metadata->total_files;
+        snapshot.archive_name = metadata->archive_name;
+    } else {
+        snapshot.archive_name = fallback_archive_name;
+    }
+    snapshot.checkpoints = db.query_checkpoints(file_id);
 
     std::lock_guard<std::mutex> lock(cache_mutex);
     cached_is_valid = true;
@@ -508,7 +495,7 @@ bool TarIndexer::find_file(const std::string& file_name,
     IndexDatabase db(
         index_path,
         dftracer::utils::rocksdb::RocksDatabase::OpenMode::ReadOnly);
-    IndexDatabase::TarFileRecord record;
+    TarFileRecord record;
     if (!db.find_tar_file(archive_id, file_name, record)) {
         return false;
     }
diff --git a/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h b/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h
index b23a23e1..a3e56fdd 100644
--- a/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h
+++ b/src/dftracer/utils/utilities/indexer/internal/transaction_scope.h
@@ -1,39 +1,10 @@
 #ifndef DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H
 #define DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H
 
-namespace dftracer::utils::utilities::indexer::internal {
-
-template <typename Database>
-class TransactionScope {
-   public:
-    explicit TransactionScope(Database& db) : db_(db) {
-        db_.begin_transaction();
-    }
-
-    TransactionScope(const TransactionScope&) = delete;
-    TransactionScope& operator=(const TransactionScope&) = delete;
-
-    TransactionScope(TransactionScope&& other) noexcept
-        : db_(other.db_), committed_(other.committed_) {
-        other.committed_ = true;
-    }
-
-    ~TransactionScope() {
-        if (!committed_) {
-            db_.rollback_transaction();
-        }
-    }
-
-    void commit() {
-        db_.commit_transaction();
-        committed_ = true;
-    }
-
-   private:
-    Database& db_;
-    bool committed_ = false;
-};
-
-}  // namespace dftracer::utils::utilities::indexer::internal
+// TransactionScope has been removed.
+// IndexDatabase no longer has begin_transaction/commit_transaction.
+// Use the IndexBatchSink API on IndexDatabaseWriterContext (obtained via
+// IndexDatabase::begin_write()) or individual insert methods followed by
+// commit().
 
 #endif  // DFTRACER_UTILS_UTILITIES_INDEXER_INTERNAL_TRANSACTION_SCOPE_H
diff --git a/src/dftracer/utils/utilities/indexer/provenance_database.cpp b/src/dftracer/utils/utilities/indexer/provenance_database.cpp
index 4896a54e..947cb0a0 100644
--- a/src/dftracer/utils/utilities/indexer/provenance_database.cpp
+++ b/src/dftracer/utils/utilities/indexer/provenance_database.cpp
@@ -2,6 +2,7 @@
 #include <dftracer/utils/core/rocksdb/key_codec.h>
 #include <dftracer/utils/utilities/indexer/internal/error.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
+#include <dftracer/utils/utilities/indexer/internal/payload_codec.h>
 #include <dftracer/utils/utilities/indexer/internal/scan_prefix.h>
 #include <dftracer/utils/utilities/indexer/provenance_database.h>
 
@@ -11,8 +12,9 @@
 namespace dftracer::utils::utilities::indexer {
 
 namespace rocks = dftracer::utils::rocksdb;
+namespace cf = rocks::cf;
 
-using internal::IndexerError;
+using namespace internal;
 
 namespace {
 
@@ -82,60 +84,23 @@ std::string group_key(int file_info_id, std::string_view name) {
     return key;
 }
 
-std::string segment_key(int file_info_id, int source_idx,
-                        int source_checkpoint) {
+std::string segment_key(int file_info_id, int source_idx, int source_checkpoint,
+                        int segment_seq) {
     std::string key("px|");
     rocks::KeyCodec::append_be32(key, static_cast<std::uint32_t>(file_info_id));
     rocks::KeyCodec::append_be32(key, static_cast<std::uint32_t>(source_idx));
     rocks::KeyCodec::append_be32(key,
                                  static_cast<std::uint32_t>(source_checkpoint));
+    rocks::KeyCodec::append_be32(key, static_cast<std::uint32_t>(segment_seq));
     return key;
 }
 
-void append_string(std::string& out, std::string_view value) {
-    rocks::KeyCodec::append_be32(out, static_cast<std::uint32_t>(value.size()));
-    out.append(value.data(), value.size());
-}
-
-void append_u32(std::string& out, std::uint32_t value) {
-    rocks::KeyCodec::append_be32(out, value);
-}
-
-class Cursor {
-   public:
-    explicit Cursor(std::string_view data) : data_(data) {}
-
-    std::uint32_t u32() {
-        auto part = take(4);
-        return rocks::KeyCodec::decode_be32(part);
-    }
-
-    std::string str() {
-        const auto len = static_cast<std::size_t>(u32());
-        auto bytes = take(len);
-        return std::string(bytes.data(), bytes.size());
-    }
-
-   private:
-    std::string_view take(std::size_t len) {
-        if (offset_ + len > data_.size()) {
-            throw std::runtime_error("Corrupt provenance payload");
-        }
-        auto part = data_.substr(offset_, len);
-        offset_ += len;
-        return part;
-    }
-
-    std::string_view data_;
-    std::size_t offset_ = 0;
-};
-
 template <typename Fn>
 void scan_prefix(const rocks::RocksDatabase& db, std::string_view prefix,
                  Fn&& fn) {
     internal::scan_prefix_iterator(
         "Failed to scan provenance prefix", prefix,
-        [&] { return db.new_iterator("provenance"); }, std::forward<Fn>(fn));
+        [&] { return db.new_iterator(cf::PROVENANCE); }, std::forward<Fn>(fn));
 }
 
 }  // namespace
@@ -156,22 +121,23 @@ int ProvenanceDatabase::get_or_create_file_info(const std::string& path,
                                                 std::uint64_t file_hash) {
     const auto key = file_key(path);
     std::string value;
-    auto status = db_->get(key, &value, "provenance");
+    auto status = db_->get(key, &value, cf::PROVENANCE);
     if (status.ok()) {
         const auto id = decode_file_id(value);
         if (decode_hash(value) == file_hash) {
             return id;
         }
         const auto encoded = encode_file_record(id, file_hash);
-        status = txn_batch_ ? db_->put(*txn_batch_, "provenance", key, encoded)
-                            : db_->put(key, encoded, "provenance");
+        status = txn_batch_
+                     ? db_->put(*txn_batch_, cf::PROVENANCE, key, encoded)
+                     : db_->put(key, encoded, cf::PROVENANCE);
         if (!status.ok()) {
             throw_db_error("Failed to update provenance file info", status);
         }
         status = txn_batch_
-                     ? db_->put(*txn_batch_, "provenance", file_reverse_key(id),
-                                path)
-                     : db_->put(file_reverse_key(id), path, "provenance");
+                     ? db_->put(*txn_batch_, cf::PROVENANCE,
+                                file_reverse_key(id), path)
+                     : db_->put(file_reverse_key(id), path, cf::PROVENANCE);
         if (!status.ok()) {
             throw_db_error("Failed to update provenance reverse file info",
                            status);
@@ -184,7 +150,7 @@ int ProvenanceDatabase::get_or_create_file_info(const std::string& path,
 
     std::uint32_t next_id = 1;
     std::string next_value;
-    status = db_->get(next_file_id_key(), &next_value, "provenance");
+    status = db_->get(next_file_id_key(), &next_value, cf::PROVENANCE);
     if (status.ok()) {
         next_id = rocks::KeyCodec::decode_be32(next_value);
     } else if (!status.IsNotFound()) {
@@ -195,26 +161,26 @@ int ProvenanceDatabase::get_or_create_file_info(const std::string& path,
         encode_file_record(static_cast<int>(next_id), file_hash);
     const auto next_encoded = rocks::KeyCodec::encode_be32(next_id + 1);
     if (txn_batch_) {
-        status = db_->put(*txn_batch_, "provenance", key, encoded);
+        status = db_->put(*txn_batch_, cf::PROVENANCE, key, encoded);
         if (!status.ok()) throw_db_error("Failed to insert file info", status);
-        status = db_->put(*txn_batch_, "provenance", file_reverse_key(next_id),
-                          path);
+        status = db_->put(*txn_batch_, cf::PROVENANCE,
+                          file_reverse_key(next_id), path);
         if (!status.ok()) {
             throw_db_error("Failed to insert reverse file info", status);
         }
-        status = db_->put(*txn_batch_, "provenance", next_file_id_key(),
+        status = db_->put(*txn_batch_, cf::PROVENANCE, next_file_id_key(),
                           next_encoded);
         if (!status.ok()) {
             throw_db_error("Failed to update next provenance file id", status);
         }
     } else {
-        status = db_->put(key, encoded, "provenance");
+        status = db_->put(key, encoded, cf::PROVENANCE);
         if (!status.ok()) throw_db_error("Failed to insert file info", status);
-        status = db_->put(file_reverse_key(next_id), path, "provenance");
+        status = db_->put(file_reverse_key(next_id), path, cf::PROVENANCE);
         if (!status.ok()) {
             throw_db_error("Failed to insert reverse file info", status);
         }
-        status = db_->put(next_file_id_key(), next_encoded, "provenance");
+        status = db_->put(next_file_id_key(), next_encoded, cf::PROVENANCE);
         if (!status.ok()) {
             throw_db_error("Failed to update next provenance file id", status);
         }
@@ -224,7 +190,7 @@ int ProvenanceDatabase::get_or_create_file_info(const std::string& path,
 
 int ProvenanceDatabase::get_file_info_id(const std::string& path) const {
     std::string value;
-    auto status = db_->get(file_key(path), &value, "provenance");
+    auto status = db_->get(file_key(path), &value, cf::PROVENANCE);
     if (status.IsNotFound()) {
         return -1;
     }
@@ -263,8 +229,8 @@ void ProvenanceDatabase::insert_info(int file_info_id, std::string_view key,
                                      std::string_view value) {
     const auto db_key = info_key(file_info_id, key);
     auto status = txn_batch_
-                      ? db_->put(*txn_batch_, "provenance", db_key, value)
-                      : db_->put(db_key, value, "provenance");
+                      ? db_->put(*txn_batch_, cf::PROVENANCE, db_key, value)
+                      : db_->put(db_key, value, cf::PROVENANCE);
     if (!status.ok()) {
         throw_db_error("Failed to insert provenance info", status);
     }
@@ -279,10 +245,10 @@ void ProvenanceDatabase::insert_source(int file_info_id, int source_idx,
     append_u32(value, static_cast<std::uint32_t>(num_checkpoints));
     append_string(value, event_hash);
     auto status = txn_batch_
-                      ? db_->put(*txn_batch_, "provenance",
+                      ? db_->put(*txn_batch_, cf::PROVENANCE,
                                  source_key(file_info_id, source_idx), value)
                       : db_->put(source_key(file_info_id, source_idx), value,
-                                 "provenance");
+                                 cf::PROVENANCE);
     if (!status.ok()) {
         throw_db_error("Failed to insert provenance source", status);
     }
@@ -291,30 +257,27 @@ void ProvenanceDatabase::insert_source(int file_info_id, int source_idx,
 void ProvenanceDatabase::insert_group(int file_info_id, std::string_view name,
                                       std::string_view predicate) {
     const auto db_key = group_key(file_info_id, name);
-    auto status = txn_batch_
-                      ? db_->put(*txn_batch_, "provenance", db_key,
-                                 std::string(predicate))
-                      : db_->put(db_key, std::string(predicate), "provenance");
+    // Forward the view as insert_info() does; no std::string copy needed.
+    auto status =
+        txn_batch_ ? db_->put(*txn_batch_, cf::PROVENANCE, db_key, predicate)
+                   : db_->put(db_key, predicate, cf::PROVENANCE);
     if (!status.ok()) {
         throw_db_error("Failed to insert provenance group", status);
     }
 }
 
 void ProvenanceDatabase::insert_segment(int file_info_id, int source_idx,
-                                        int source_checkpoint,
+                                        int source_checkpoint, int segment_seq,
                                         int output_line_start,
                                         int output_line_end, int event_count) {
     std::string value;
     append_u32(value, static_cast<std::uint32_t>(output_line_start));
     append_u32(value, static_cast<std::uint32_t>(output_line_end));
     append_u32(value, static_cast<std::uint32_t>(event_count));
-    auto status =
-        txn_batch_
-            ? db_->put(*txn_batch_, "provenance",
-                       segment_key(file_info_id, source_idx, source_checkpoint),
-                       value)
-            : db_->put(segment_key(file_info_id, source_idx, source_checkpoint),
-                       value, "provenance");
+    auto key =
+        segment_key(file_info_id, source_idx, source_checkpoint, segment_seq);
+    auto status = txn_batch_ ? db_->put(*txn_batch_, cf::PROVENANCE, key, value)
+                             : db_->put(key, value, cf::PROVENANCE);
     if (!status.ok()) {
         throw_db_error("Failed to insert provenance segment", status);
     }
@@ -391,7 +354,7 @@ ProvenanceDatabase::query_all_segments(int file_info_id) const {
 std::string ProvenanceDatabase::query_info(int file_info_id,
                                            std::string_view key) const {
     std::string value;
-    auto status = db_->get(info_key(file_info_id, key), &value, "provenance");
+    auto status = db_->get(info_key(file_info_id, key), &value, cf::PROVENANCE);
     if (status.IsNotFound()) {
         return {};
     }
diff --git a/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp b/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp
deleted file mode 100644
index 931d8188..00000000
--- a/src/dftracer/utils/utilities/indexer/visitors/bloom_visitor.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-#include <dftracer/utils/utilities/common/json/json_value.h>
-#include <dftracer/utils/utilities/composites/dft/event.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
-#include <dftracer/utils/utilities/indexer/index_database.h>
-#include <dftracer/utils/utilities/indexer/visitors/bloom_visitor.h>
-#include <yyjson.h>
-
-#include <charconv>
-#include <cstring>
-#include <string>
-
-using dftracer::utils::utilities::common::json::JsonValue;
-using dftracer::utils::utilities::composites::dft::DFTracerEvent;
-using dftracer::utils::utilities::composites::dft::indexing::BloomFilter;
-namespace dftracer::utils::utilities::indexer {
-
-namespace {
-
-static const std::string DIM_NAME = "name";
-static const std::string DIM_CAT = "cat";
-static const std::string DIM_PID = "pid";
-static const std::string DIM_TID = "tid";
-static const std::string DIM_PID_TID = "pid_tid";
-static const std::string DIM_HHASH = "hhash";
-static const std::string DIM_FHASH = "fhash";
-static const std::string DIM_SHASH = "shash";
-
-std::string json_value_to_string(const JsonValue& val) {
-    if (val.is_string()) return val.get<std::string>();
-    if (val.is_uint()) return std::to_string(val.get<std::uint64_t>());
-    if (val.is_int()) return std::to_string(val.get<std::int64_t>());
-    if (val.is_number()) return std::to_string(val.get<double>());
-    if (val.is_bool()) return val.get<bool>() ? "true" : "false";
-    return {};
-}
-
-}  // namespace
-
-BloomVisitor::BloomVisitor(ChunkIndexerConfig config,
-                           std::vector<std::string> dimensions)
-    : config_(std::move(config)), dimensions_(std::move(dimensions)) {}
-
-void BloomVisitor::begin(std::size_t /*num_checkpoints*/) { chunks_.clear(); }
-
-void BloomVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {}
-
-void BloomVisitor::ensure_chunk(std::size_t checkpoint_idx) {
-    if (checkpoint_idx < chunks_.size()) return;
-    auto old_size = chunks_.size();
-    chunks_.resize(checkpoint_idx + 1);
-    for (std::size_t i = old_size; i < chunks_.size(); ++i) {
-        auto& chunk = chunks_[i];
-        for (const auto& dim : dimensions_) {
-            chunk.bloom_filters.emplace(
-                dim, BloomFilter(config_.expected_entries_per_chunk,
-                                 config_.false_positive_rate));
-        }
-        for (const auto& dim : dimensions_) {
-            auto& ds = chunk.dimension_stats[dim];
-            ds.dimension = dim;
-            if (dim == DIM_PID || dim == DIM_TID) {
-                ds.value_type = "uint";
-            } else {
-                ds.value_type = "string";
-            }
-        }
-        auto& pt = chunk.dimension_stats[DIM_PID_TID];
-        pt.dimension = DIM_PID_TID;
-        pt.value_type = "string";
-    }
-}
-
-void BloomVisitor::on_line(std::string_view line, std::size_t checkpoint_idx) {
-    if (line.empty()) return;
-    ensure_chunk(checkpoint_idx);
-
-    ChunkState& chunk = chunks_[checkpoint_idx];
-
-    if (!yy_alc_initialized_) {
-        yyjson_alc_pool_init(&yy_alc_, yy_buf_.data(), yy_buf_.size());
-        yy_alc_initialized_ = true;
-    }
-
-    yyjson_doc* doc =
-        yyjson_read_opts(const_cast<char*>(line.data()), line.size(),
-                         YYJSON_READ_NOFLAG, &yy_alc_, nullptr);
-    if (!doc) return;
-
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root || !yyjson_is_obj(root)) {
-        yyjson_doc_free(doc);
-        return;
-    }
-
-    JsonValue json(root);
-    DFTracerEvent ev;
-    if (!DFTracerEvent::parse(json, ev)) {
-        yyjson_doc_free(doc);
-        return;
-    }
-
-    if (ev.is_metadata()) {
-        if (ev.args.exists()) {
-            std::string hash_val = ev.args["value"].get<std::string>();
-            std::string resolved = ev.args["name"].get<std::string>();
-
-            if (!hash_val.empty() && !resolved.empty()) {
-                if (ev.name == "HH") {
-                    chunk.hash_resolutions[DIM_HHASH][hash_val] = resolved;
-                } else if (ev.name == "FH") {
-                    chunk.hash_resolutions[DIM_FHASH][hash_val] = resolved;
-                } else if (ev.name == "SH") {
-                    chunk.hash_resolutions[DIM_SHASH][hash_val] = resolved;
-                }
-            }
-        }
-    } else {
-        chunk.statistics.update_from_event(ev.name, ev.cat, ev.pid, ev.tid,
-                                           ev.ts, ev.dur);
-
-        // Helper: add to bloom filter and observe dimension stats
-        auto observe = [&chunk](const std::string& dim, std::string_view val) {
-            if (val.empty()) return;
-            auto bf_it = chunk.bloom_filters.find(dim);
-            if (bf_it != chunk.bloom_filters.end()) {
-                bf_it->second.add(val);
-            }
-            auto ds_it = chunk.dimension_stats.find(dim);
-            if (ds_it != chunk.dimension_stats.end()) {
-                ds_it->second.observe(val);
-            }
-        };
-
-        observe(DIM_NAME, ev.name);
-        observe(DIM_CAT, ev.cat);
-
-        char pid_buf[24], tid_buf[24], pt_buf[52];
-        auto [pp, _1] =
-            std::to_chars(pid_buf, pid_buf + sizeof(pid_buf), ev.pid);
-        std::string_view pid_sv(pid_buf, pp - pid_buf);
-        auto [tp, _2] =
-            std::to_chars(tid_buf, tid_buf + sizeof(tid_buf), ev.tid);
-        std::string_view tid_sv(tid_buf, tp - tid_buf);
-
-        observe(DIM_PID, pid_sv);
-        observe(DIM_TID, tid_sv);
-
-        auto len = pp - pid_buf;
-        std::memcpy(pt_buf, pid_buf, len);
-        pt_buf[len] = ':';
-        std::memcpy(pt_buf + len + 1, tid_buf, tp - tid_buf);
-        std::string_view pt_sv(pt_buf, len + 1 + (tp - tid_buf));
-        observe(DIM_PID_TID, pt_sv);
-
-        if (ev.args.exists()) {
-            std::string_view hhash = ev.args["hhash"].get<std::string_view>();
-            observe(DIM_HHASH, hhash);
-
-            std::string_view fhash = ev.args["fhash"].get<std::string_view>();
-            observe(DIM_FHASH, fhash);
-
-            std::string_view shash =
-                ev.args["cmd_hash"].get<std::string_view>();
-            if (shash.empty()) {
-                shash = ev.args["exec_hash"].get<std::string_view>();
-            }
-            observe(DIM_SHASH, shash);
-
-            for (const auto& dim : config_.extra_dimensions) {
-                JsonValue val = ev.args.at(dim.c_str());
-                if (val.exists()) {
-                    std::string str_val = json_value_to_string(val);
-                    observe(dim, str_val);
-                }
-            }
-        }
-
-        chunk.events_processed++;
-    }
-
-    yyjson_doc_free(doc);
-}
-
-void BloomVisitor::finalize(IndexDatabase& db, int file_id) {
-    std::unordered_map<std::string, BloomFilter> file_blooms;
-    for (const auto& dim : dimensions_) {
-        file_blooms.emplace(dim, BloomFilter(config_.expected_entries_per_chunk,
-                                             config_.false_positive_rate));
-    }
-
-    std::vector<unsigned char> blob;
-
-    for (std::size_t i = 0; i < chunks_.size(); ++i) {
-        ChunkState& chunk = chunks_[i];
-        auto checkpoint_idx = static_cast<std::uint64_t>(i);
-
-        for (const auto& dim : dimensions_) {
-            auto it = chunk.bloom_filters.find(dim);
-            if (it == chunk.bloom_filters.end()) continue;
-
-            const BloomFilter& bf = it->second;
-            bf.serialize_into(blob);
-            db.insert_chunk_bloom_filter(
-                file_id, checkpoint_idx, dim,
-                std::span<const unsigned char>(blob.data(), blob.size()),
-                static_cast<std::uint64_t>(bf.num_entries()));
-
-            file_blooms.at(dim).merge_from(bf);
-        }
-
-        db.insert_chunk_statistics(file_id, checkpoint_idx, chunk.statistics);
-
-        for (const auto& [dim, ds] : chunk.dimension_stats) {
-            db.insert_chunk_dimension_stats(file_id, checkpoint_idx, ds,
-                                            config_.value_counts_cap);
-        }
-
-        for (const auto& [dim, resolutions] : chunk.hash_resolutions) {
-            for (const auto& [hash_val, resolved] : resolutions) {
-                db.insert_hash_resolution(file_id, dim, hash_val, resolved);
-            }
-        }
-    }
-
-    for (const auto& dim : dimensions_) {
-        const BloomFilter& bf = file_blooms.at(dim);
-        bf.serialize_into(blob);
-        db.insert_file_bloom_filter(
-            file_id, dim,
-            std::span<const unsigned char>(blob.data(), blob.size()),
-            static_cast<std::uint64_t>(bf.num_entries()));
-    }
-
-    for (const auto& dim : dimensions_) {
-        db.insert_index_dimension(file_id, dim);
-    }
-}
-
-}  // namespace dftracer::utils::utilities::indexer
diff --git a/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp b/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp
deleted file mode 100644
index ec388b51..00000000
--- a/src/dftracer/utils/utilities/indexer/visitors/manifest_visitor.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-#include <dftracer/utils/utilities/common/json/json_value.h>
-#include <dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h>
-#include <dftracer/utils/utilities/indexer/index_database.h>
-#include <dftracer/utils/utilities/indexer/visitors/manifest_visitor.h>
-#include <yyjson.h>
-
-using dftracer::utils::utilities::common::json::JsonValue;
-namespace queries =
-    dftracer::utils::utilities::composites::dft::indexing::queries;
-
-namespace dftracer::utils::utilities::indexer {
-
-void ManifestVisitor::begin(std::size_t /*num_checkpoints*/) {
-    event_lines_.clear();
-    metadata_lines_.clear();
-    chunk_line_ = 0;
-}
-
-void ManifestVisitor::on_checkpoint(std::size_t /*checkpoint_idx*/) {
-    chunk_line_ = 0;
-}
-
-void ManifestVisitor::ensure_chunk(std::size_t checkpoint_idx) {
-    if (checkpoint_idx < event_lines_.size()) return;
-    event_lines_.resize(checkpoint_idx + 1);
-    metadata_lines_.resize(checkpoint_idx + 1);
-}
-
-void ManifestVisitor::on_line(std::string_view line,
-                              std::size_t checkpoint_idx) {
-    std::uint32_t ln = chunk_line_++;
-
-    if (line.empty()) return;
-    ensure_chunk(checkpoint_idx);
-
-    yyjson_doc* doc = yyjson_read(line.data(), line.size(), YYJSON_READ_NOFLAG);
-    if (!doc) return;
-
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (root && yyjson_is_obj(root)) {
-        JsonValue json(root);
-        std::string_view ph = json["ph"].get<std::string_view>();
-
-        if (ph == "M") {
-            std::string name = json["name"].get<std::string>();
-            if (!name.empty()) {
-                metadata_lines_[checkpoint_idx][name].push_back(ln);
-            }
-        } else {
-            std::string cat = json["cat"].get<std::string>();
-            std::string name = json["name"].get<std::string>();
-            event_lines_[checkpoint_idx][{cat, name}].push_back(ln);
-        }
-    }
-
-    yyjson_doc_free(doc);
-}
-
-void ManifestVisitor::finalize(IndexDatabase& db, int file_id) {
-    for (std::size_t ci = 0; ci < event_lines_.size(); ++ci) {
-        for (auto& [key, lines] : event_lines_[ci]) {
-            db.insert_event_range(file_id, static_cast<std::uint64_t>(ci),
-                                  key.first, key.second, lines);
-        }
-
-        for (auto& [meta_type, lines] : metadata_lines_[ci]) {
-            db.insert_metadata_lines(file_id, static_cast<std::uint64_t>(ci),
-                                     meta_type, lines);
-        }
-    }
-}
-
-}  // namespace dftracer::utils::utilities::indexer
diff --git a/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp b/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp
index 86449551..d60aecd8 100644
--- a/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp
+++ b/src/dftracer/utils/utilities/reader/internal/gzip_reader.cpp
@@ -368,6 +368,7 @@ std::unique_ptr<ReaderStream> GzipReader::stream(const StreamConfig &config) {
     std::size_t start = config.start();
     std::size_t end = config.end();
     std::size_t buffer_size = config.buffer_size();
+    bool extend_to_line_boundary = config.extend_to_line_boundary();
 
     // Convert line range to byte range if needed
     std::size_t start_bytes = start;
@@ -472,6 +473,8 @@ std::unique_ptr<ReaderStream> GzipReader::stream(const StreamConfig &config) {
             // Single line-aligned bytes at a time
             auto line_byte_stream =
                 std::make_unique<GzipLineByteStream>(buffer_size);
+            line_byte_stream->set_extend_to_line_boundary(
+                extend_to_line_boundary);
             line_byte_stream->initialize(gz_path, start_bytes, end_bytes,
                                          *indexer);
 
@@ -488,6 +491,8 @@ std::unique_ptr<ReaderStream> GzipReader::stream(const StreamConfig &config) {
             // Multiple line-aligned bytes per read
             auto line_byte_stream =
                 std::make_unique<GzipLineByteStream>(buffer_size);
+            line_byte_stream->set_extend_to_line_boundary(
+                extend_to_line_boundary);
             line_byte_stream->initialize(gz_path, start_bytes, end_bytes,
                                          *indexer);
             return line_byte_stream;
@@ -496,6 +501,8 @@ std::unique_ptr<ReaderStream> GzipReader::stream(const StreamConfig &config) {
             // Single parsed line per read
             auto line_byte_stream =
                 std::make_unique<GzipLineByteStream>(buffer_size);
+            line_byte_stream->set_extend_to_line_boundary(
+                extend_to_line_boundary);
             line_byte_stream->initialize(gz_path, start_bytes, end_bytes,
                                          *indexer);
 
@@ -511,6 +518,8 @@ std::unique_ptr<ReaderStream> GzipReader::stream(const StreamConfig &config) {
             // Multiple parsed lines per read
             auto line_byte_stream =
                 std::make_unique<GzipLineByteStream>(buffer_size);
+            line_byte_stream->set_extend_to_line_boundary(
+                extend_to_line_boundary);
             line_byte_stream->initialize(gz_path, start_bytes, end_bytes,
                                          *indexer);
 
diff --git a/src/dftracer/utils/utilities/reader/internal/inflater.h b/src/dftracer/utils/utilities/reader/internal/inflater.h
index 997fa1a6..94d08ca3 100644
--- a/src/dftracer/utils/utilities/reader/internal/inflater.h
+++ b/src/dftracer/utils/utilities/reader/internal/inflater.h
@@ -232,7 +232,7 @@ class ReaderInflater : public Inflater {
      * Check if the stream has reached the end
      */
     bool is_at_end() const {
-        return stream.avail_in == 0 && stream.avail_out == sizeof(out_buffer);
+        return stream.avail_in == 0 && stream.avail_out == BUFFER_SIZE;
     }
 };
 
diff --git a/src/dftracer/utils/utilities/reader/internal/streams/gzip_line_byte_stream.h b/src/dftracer/utils/utilities/reader/internal/streams/gzip_line_byte_stream.h
index 63ab0047..92f7df0f 100644
--- a/src/dftracer/utils/utilities/reader/internal/streams/gzip_line_byte_stream.h
+++ b/src/dftracer/utils/utilities/reader/internal/streams/gzip_line_byte_stream.h
@@ -22,6 +22,7 @@ class GzipLineByteStream : public GzipStream {
     std::vector<char> partial_line_buffer_;
     std::size_t actual_start_bytes_;
     std::size_t bytes_returned_;  // Track how many bytes we've returned to user
+    bool extend_to_line_boundary_ = false;
 
     // Buffer for zero-copy reads
     std::vector<char> buffer_;
@@ -39,6 +40,8 @@ class GzipLineByteStream : public GzipStream {
         partial_line_buffer_.reserve(1 * 1024 * 1024);
     }
 
+    void set_extend_to_line_boundary(bool v) { extend_to_line_boundary_ = v; }
+
     void initialize(const std::string &gz_path, std::size_t start_bytes,
                     std::size_t end_bytes,
                     dftracer::utils::utilities::indexer::internal::Indexer
@@ -145,10 +148,42 @@ class GzipLineByteStream : public GzipStream {
         }
 
         if (is_at_target_end()) {
-            DFTRACER_UTILS_LOG_DEBUG(
-                "GzipLineByteStream: at target end, current_position=%zu, "
-                "target_end_bytes=%zu",
-                current_position_, target_end_bytes_);
+            // With extend_to_line_boundary_ set and a partial line still held
+            // at the target end: copy it to the front of buffer_, inflate more
+            // bytes right behind it, and return the length up to and including
+            // the first '\n' found in the newly inflated bytes.
+            const std::size_t held = partial_line_buffer_.size();
+            const bool can_extend =
+                extend_to_line_boundary_ && held > 0 &&
+                current_position_ < max_file_bytes_ && held <= buffer_.size();
+            if (can_extend) {
+                char *const tail = buffer_.data() + held;
+                std::memcpy(buffer_.data(), partial_line_buffer_.data(), held);
+                const std::size_t room = buffer_.size() - held;
+                const std::size_t left = static_cast<std::size_t>(
+                    max_file_bytes_ - current_position_);
+                std::size_t to_read = std::min(room, left);
+                std::size_t got = 0;
+                bool ok = co_await inflater_.read(
+                    fd_, file_offset_, reinterpret_cast<unsigned char *>(tail),
+                    to_read, got);
+                if (ok && got > 0) {
+                    current_position_ += got;
+                    const char *const stop = tail + got;
+                    const char *const nl = std::find(tail, stop, '\n');
+                    partial_line_buffer_.clear();
+                    is_finished_ = true;
+                    if (nl == stop) {
+                        // NOTE(review): no '\n' in this read -> returns 0 and
+                        // the held partial line is cleared; confirm intent.
+                        co_return 0;
+                    }
+                    const auto emit =
+                        static_cast<std::size_t>(nl - buffer_.data()) + 1;
+                    bytes_returned_ += emit;
+                    co_return emit;
+                }
+            }
             is_finished_ = true;
             co_return 0;
         }
diff --git a/src/dftracer/utils/utilities/reader/internal/streams/line_stream.h b/src/dftracer/utils/utilities/reader/internal/streams/line_stream.h
index 3ddcdbc7..2b8612d6 100644
--- a/src/dftracer/utils/utilities/reader/internal/streams/line_stream.h
+++ b/src/dftracer/utils/utilities/reader/internal/streams/line_stream.h
@@ -12,14 +12,6 @@
 
 namespace dftracer::utils::utilities::reader::internal {
 
-/**
- * @brief Stream that returns one single line at a time from a LINE_BYTES
- * stream.
- *
- * Wraps a LINE_BYTES stream and provides single-line reading.
- * Each call to read() returns exactly one complete line (with newline).
- * Can optionally filter by line range when line numbers are specified.
- */
 class LineStream : public ReaderStream {
    private:
     std::unique_ptr<internal::ReaderStream> underlying_stream_;
@@ -35,6 +27,14 @@ class LineStream : public ReaderStream {
     std::size_t output_position_;
     std::size_t span_pos_;
 
+    enum class ParseResult { HAS_LINE, NEED_MORE_DATA, FINISHED };
+
+    struct DirectOutputResult {
+        std::size_t bytes_written;
+        bool need_more_data;
+        bool finished;
+    };
+
    public:
     explicit LineStream(std::unique_ptr<ReaderStream> underlying_stream,
                         std::size_t start_line = 0, std::size_t end_line = 0,
@@ -56,22 +56,37 @@ class LineStream : public ReaderStream {
     ~LineStream() override { reset(); }
 
     coro::CoroTask<std::span<const char>> read_async() override {
-        if (!underlying_stream_) {
+        if (!underlying_stream_ || is_finished_) {
             co_return {};
         }
 
-        if (is_finished_) {
-            co_return {};
-        }
+        while (true) {
+            auto result = try_parse_next_line();
 
-        // Parse next line into current_line_
-        if (!co_await parse_next_line()) {
-            co_return {};
-        }
+            if (result == ParseResult::HAS_LINE) {
+                co_return std::span<const char>(current_line_.data(),
+                                                current_line_.size());
+            }
+            if (result == ParseResult::FINISHED) {
+                co_return {};
+            }
 
-        // Return view to current_line_
-        co_return std::span<const char>(current_line_.data(),
-                                        current_line_.size());
+            if (!underlying_stream_->done()) {
+                current_span_ = co_await underlying_stream_->read_async();
+                span_pos_ = 0;
+                if (!current_span_.empty()) {
+                    continue;
+                }
+            }
+
+            // Stream exhausted: flush a final line lacking a trailing newline.
+            if (handle_eof_line()) {
+                co_return std::span<const char>(current_line_.data(),
+                                                current_line_.size());
+            }
+            is_finished_ = true;
+            co_return {};
+        }
     }
 
     coro::CoroTask<std::size_t> read_async(char* buffer,
@@ -80,7 +95,6 @@ class LineStream : public ReaderStream {
             co_return 0;
         }
 
-        // Handle any pending line from previous call
         if (has_pending_line_) {
             co_return output_pending_line(buffer, buffer_size);
         }
@@ -89,19 +103,40 @@ class LineStream : public ReaderStream {
             co_return 0;
         }
 
-        // Try fast path: direct output from read_buffer_ to output buffer
-        std::size_t written = co_await try_direct_output(buffer, buffer_size);
+        while (true) {
+            auto direct = try_direct_output(buffer, buffer_size);
+            if (direct.bytes_written > 0) {
+                co_return direct.bytes_written;
+            }
+            if (direct.finished) {
+                co_return 0;
+            }
 
-        if (written > 0) {
-            co_return written;
-        }
+            if (!direct.need_more_data) {
+                auto result = try_parse_next_line();
+                if (result == ParseResult::HAS_LINE) {
+                    co_return output_pending_line(buffer, buffer_size);
+                }
+                if (result == ParseResult::FINISHED) {
+                    co_return 0;
+                }
+            }
 
-        // Slow path: need to use intermediate storage
-        if (!co_await parse_next_line()) {
-            co_return 0;
-        }
+            if (!underlying_stream_->done()) {
+                current_span_ = co_await underlying_stream_->read_async();
+                span_pos_ = 0;
+                if (!current_span_.empty()) {
+                    continue;
+                }
+            }
 
-        co_return output_pending_line(buffer, buffer_size);
+            // Stream exhausted: flush a final line lacking a trailing newline.
+            if (handle_eof_line()) {
+                co_return output_pending_line(buffer, buffer_size);
+            }
+            is_finished_ = true;
+            co_return 0;
+        }
     }
 
     bool done() const override { return is_finished_ && !has_pending_line_; }
@@ -121,10 +156,6 @@ class LineStream : public ReaderStream {
     }
 
    private:
-    // ========================================================================
-    // Range Checking Helpers
-    // ========================================================================
-
     bool is_beyond_range() const {
         return end_line_ > 0 && current_line_number_ > end_line_;
     }
@@ -133,32 +164,13 @@ class LineStream : public ReaderStream {
         if (start_line_ == 0 && end_line_ == 0) {
             return true;
         }
-
         bool after_start =
             (start_line_ == 0 || current_line_number_ >= start_line_);
         bool before_end = (end_line_ == 0 || current_line_number_ <= end_line_);
-
         return after_start && before_end;
     }
 
-    // ========================================================================
-    // Buffer Management
-    // ========================================================================
-
-    coro::CoroTask<bool> refill_span_if_needed() {
-        if (span_pos_ < current_span_.size()) {
-            co_return true;
-        }
-
-        if (underlying_stream_->done()) {
-            co_return false;
-        }
-
-        // Get new span from underlying stream (zero-copy)
-        current_span_ = co_await underlying_stream_->read_async();
-        span_pos_ = 0;
-        co_return !current_span_.empty();
-    }
+    bool has_data_in_span() const { return span_pos_ < current_span_.size(); }
 
     const char* find_next_newline() const {
         return static_cast<const char*>(
@@ -172,80 +184,91 @@ class LineStream : public ReaderStream {
         span_pos_ = current_span_.size();
     }
 
-    // ========================================================================
-    // Fast Path: Direct Output (No Intermediate Storage)
-    // ========================================================================
-
-    /**
-     * @brief Attempt to write a line directly from read_buffer_ to output
-     * buffer.
-     *
-     * This fast path avoids intermediate string copies when:
-     * - No accumulated data exists
-     * - A complete line fits in the output buffer
-     *
-     * Uses a loop to skip filtered lines efficiently.
-     *
-     * @return Number of bytes written, or 0 if fast path unavailable
-     */
-    coro::CoroTask<std::size_t> try_direct_output(char* buffer,
-                                                  std::size_t buffer_size) {
-        // Fast path requires no accumulated data
+    // Non-blocking scan of the current span for the next line to emit.
+    // HAS_LINE: process_complete_line() accepted a line; FINISHED: already
+    // past end_line_; NEED_MORE_DATA: the span ran out before such a line.
+    ParseResult try_parse_next_line() {
+        if (is_beyond_range()) {
+            is_finished_ = true;
+            return ParseResult::FINISHED;
+        }
+        while (has_data_in_span()) {
+            const char* const nl = find_next_newline();
+            if (nl == nullptr) {
+                // No newline in the rest of the span: wait for more input.
+                accumulate_remaining_span();
+                return ParseResult::NEED_MORE_DATA;
+            }
+            const std::size_t nl_pos = nl - current_span_.data();
+            if (process_complete_line(nl_pos)) {
+                return ParseResult::HAS_LINE;
+            }
+        }
+        return ParseResult::NEED_MORE_DATA;
+    }
+
+    DirectOutputResult try_direct_output(char* buffer,
+                                         std::size_t buffer_size) {
         if (!line_accumulator_.empty()) {
-            co_return 0;
+            return {0, false, false};
         }
 
-        // Loop to skip filtered lines efficiently
         while (true) {
             if (is_beyond_range()) {
                 is_finished_ = true;
-                co_return 0;
+                return {0, false, true};
             }
 
-            if (!co_await refill_span_if_needed()) {
-                co_return 0;
+            if (!has_data_in_span()) {
+                return {0, true, false};
             }
 
             const char* newline_ptr = find_next_newline();
             if (!newline_ptr) {
-                // No complete line available, must use slow path
-                co_return 0;
+                return {0, false, false};
             }
 
             std::size_t newline_pos = newline_ptr - current_span_.data();
             std::size_t line_length = newline_pos - span_pos_ + 1;
 
-            // Line must fit in output buffer for fast path
             if (line_length > buffer_size) {
-                co_return 0;
+                return {0, false, false};
             }
 
             bool should_output = should_output_current_line();
 
             if (is_beyond_range()) {
                 is_finished_ = true;
-                co_return 0;
+                return {0, false, true};
             }
 
             current_line_number_++;
 
             if (should_output) {
-                // Direct copy: span -> output buffer (zero-copy from underlying
-                // stream!)
                 std::memcpy(buffer, current_span_.data() + span_pos_,
                             line_length);
                 span_pos_ = newline_pos + 1;
-                co_return line_length;
+                return {line_length, false, false};
             }
 
-            // Line filtered out, skip and continue to next
             span_pos_ = newline_pos + 1;
         }
     }
 
-    // ========================================================================
-    // Slow Path: Parse and Store Line
-    // ========================================================================
+    // Moves what is left in line_accumulator_ into current_line_. Returns
+    // true (and marks the line pending) only if it passes the range filter.
+    bool handle_eof_line() {
+        if (line_accumulator_.empty()) return false;
+        current_line_ = std::move(line_accumulator_);
+        line_accumulator_.clear();
+        const bool in_range =
+            should_output_current_line() && !is_beyond_range();
+        if (!in_range) return false;
+        has_pending_line_ = true;
+        output_position_ = 0;
+        ++current_line_number_;
+        return true;
+    }
 
     void finalize_line_with_accumulator(std::size_t line_length) {
         line_accumulator_.append(current_span_.data() + span_pos_, line_length);
@@ -262,7 +285,6 @@ class LineStream : public ReaderStream {
     bool process_complete_line(std::size_t newline_pos) {
         std::size_t line_length = newline_pos - span_pos_;
 
-        // Build complete line with or without accumulated data
         if (!line_accumulator_.empty()) {
             finalize_line_with_accumulator(line_length);
         } else {
@@ -286,60 +308,10 @@ class LineStream : public ReaderStream {
             return true;
         }
 
-        // Line filtered out, continue parsing
         current_line_.clear();
         return false;
     }
 
-    coro::CoroTask<bool> parse_next_line() {
-        if (is_beyond_range()) {
-            is_finished_ = true;
-            co_return false;
-        }
-
-        while (true) {
-            if (!co_await refill_span_if_needed()) {
-                break;
-            }
-
-            // Process all complete lines in current span
-            while (span_pos_ < current_span_.size()) {
-                const char* newline_ptr = find_next_newline();
-
-                if (!newline_ptr) {
-                    accumulate_remaining_span();
-                    break;
-                }
-
-                std::size_t newline_pos = newline_ptr - current_span_.data();
-
-                if (process_complete_line(newline_pos)) {
-                    co_return true;
-                }
-            }
-        }
-
-        // Handle final line at EOF without trailing newline
-        if (underlying_stream_->done() && !line_accumulator_.empty()) {
-            current_line_ = std::move(line_accumulator_);
-            line_accumulator_.clear();
-
-            if (should_output_current_line() && !is_beyond_range()) {
-                has_pending_line_ = true;
-                output_position_ = 0;
-                current_line_number_++;
-                co_return true;
-            }
-        }
-
-        is_finished_ = true;
-        co_return false;
-    }
-
-    // ========================================================================
-    // Output Helpers
-    // ========================================================================
-
     std::size_t output_pending_line(char* buffer, std::size_t buffer_size) {
         if (output_position_ >= current_line_.size()) {
             has_pending_line_ = false;
diff --git a/src/dftracer/utils/utilities/reader/trace_reader.cpp b/src/dftracer/utils/utilities/reader/trace_reader.cpp
index d3ddef20..858581bf 100644
--- a/src/dftracer/utils/utilities/reader/trace_reader.cpp
+++ b/src/dftracer/utils/utilities/reader/trace_reader.cpp
@@ -1,5 +1,6 @@
 #include <dftracer/utils/core/common/archive_format.h>
 #include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/utils/string.h>
 #include <dftracer/utils/utilities/common/json/json_value.h>
 #include <dftracer/utils/utilities/common/query/query.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h>
@@ -7,6 +8,8 @@
 #include <dftracer/utils/utilities/fileio/lines/sources/async_plain_file_bytes_generator.h>
 #include <dftracer/utils/utilities/fileio/lines/sources/async_plain_file_line_generator.h>
 #include <dftracer/utils/utilities/fileio/lines/sources/async_streaming_gz_line_generator.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer_factory.h>
 #include <dftracer/utils/utilities/reader/internal/reader.h>
 #include <dftracer/utils/utilities/reader/internal/reader_factory.h>
@@ -14,11 +17,17 @@
 #include <dftracer/utils/utilities/reader/internal/stream_config.h>
 #include <dftracer/utils/utilities/reader/internal/stream_type.h>
 #include <dftracer/utils/utilities/reader/trace_reader.h>
-#include <yyjson.h>
+#include <simdjson.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+#include <dftracer/utils/utilities/common/arrow/column_builder.h>
+#endif
 
+#include <algorithm>
 #include <cstring>
 #include <optional>
 #include <span>
+#include <type_traits>
+#include <unordered_map>
 
 namespace dftracer::utils::utilities::reader {
 
@@ -31,17 +40,675 @@ using indexer::internal::IndexerFactory;
 
 namespace {
 
+thread_local simdjson::dom::parser tl_parser;  // one DOM parser per thread
+
 bool line_matches_query(const Query& q, std::string_view content) {
-    yyjson_doc* doc = yyjson_read(content.data(), content.size(), 0);
-    if (!doc) return false;
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    bool result = false;
-    if (root && yyjson_is_obj(root)) {
-        JsonValue json(root);
-        result = q.evaluate(json);
+    auto result = tl_parser.parse(content.data(), content.size());
+    if (result.error()) return false;  // unparsable lines never match
+    auto root = result.value_unsafe();  // error() was checked above
+    if (!root.is_object()) return false;  // non-object roots never match
+    JsonValue json(root);
+    return q.evaluate(json);
+}
+
+struct LineRange {
+    std::size_t start_line;  // passed to StreamConfig::from()
+    std::size_t end_line;    // passed to StreamConfig::to()
+};
+
+// Cheap byte-level pre-filter derived from a query AST.
+//
+// The filter holds a list of literal substrings that MUST appear (verbatim) in
+// any line matching the query. Currently populated only for ASTs of the form
+// "AND of field == literal"; the common shape of dftindex equality queries.
+// For unsupported shapes (range ops, OR, NOT, IN/NOT IN, non-equality compares)
+// `required` is left empty and `may_match` trivially returns true.
+//
+// False positives are harmless: accepted lines are re-checked against the real
+// query downstream. Rejections are only sound while events are serialized
+// compactly (`"key":value`, no whitespace around ':') -- NOTE(review): confirm.
+struct LinePrefilter {
+    std::vector<std::string> required;  // every entry must occur in a match
+    // True when nothing is required, i.e. may_match() accepts everything.
+    bool empty() const { return required.empty(); }
+    // False as soon as one required literal is absent from `bytes`.
+    bool may_match(std::string_view bytes) const {
+        for (const auto& lit : required) {
+            if (::memmem(bytes.data(), bytes.size(), lit.data(), lit.size()) ==
+                nullptr)
+                return false;
+        }
+        return true;
+    }
+};
+
+// AND-of-EQ only: appends one `"key":value` literal per leaf, else false.
+bool collect_and_eq_literals(const common::query::QueryNode& node,
+                             std::vector<std::string>& out) {
+    return std::visit(
+        [&out](const auto& n) -> bool {
+            using T = std::decay_t<decltype(n)>;
+            if constexpr (std::is_same_v<T, common::query::CompareNode>) {
+                if (n.op != common::query::CompareOp::EQ) return false;
+                // Dotted paths (`args.fhash`) name nested fields: only the last
+                // segment sits next to the value, so anchor on it (no quote).
+                const auto dot = n.field.path.rfind('.');
+                std::string lit(dot == std::string::npos ? "\"" : "");
+                lit.append(n.field.path, dot + 1).append("\":");  // npos+1==0
+                const auto& val = n.value.value;
+                if (const auto* s = std::get_if<std::string>(&val)) {
+                    // Escaped on disk, raw text never occurs: add no literal.
+                    if (s->find_first_of("\"\\") != s->npos) return true;
+                    lit.append(1, '"').append(*s).append(1, '"');
+                } else if (const auto* i = std::get_if<int64_t>(&val)) {
+                    lit += std::to_string(*i);
+                } else if (const auto* u = std::get_if<uint64_t>(&val)) {
+                    lit += std::to_string(*u);
+                } else if (const auto* b = std::get_if<bool>(&val)) {
+                    lit += *b ? "true" : "false";
+                } else {
+                    return false;  // double or other: skip pre-filter
+                }
+                out.push_back(std::move(lit));
+                return true;
+            } else if constexpr (std::is_same_v<T, common::query::AndNode>) {
+                return collect_and_eq_literals(*n.left, out) &&
+                       collect_and_eq_literals(*n.right, out);
+            }
+            return false;  // OR/NOT/IN/NOT IN nodes: conservative skip
+        },
+        node.data);
+}
+
+// Strip a leading `[` and trailing `]` (plus surrounding whitespace) from a
+// chunk buffer. These bookends appear in `.pfw.gz` files to keep them
+// Perfetto-viewable as JSON arrays, but break simdjson iterate_many which
+// expects whitespace-separated NDJSON. Safe to call on any chunk: without
+// bookends only the surrounding whitespace is trimmed.
+std::string_view strip_ndjson_bookends(std::string_view bytes) {
+    static constexpr std::string_view kBlank = " \t\n\r";
+    // find_*_not_of yields npos for an all-blank view; both helpers then
+    // collapse to an empty view (npos clamps to size(), npos + 1 wraps to 0).
+    const auto ltrim = [](std::string_view v) {
+        return v.substr(std::min(v.find_first_not_of(kBlank), v.size()));
+    };
+    const auto rtrim = [](std::string_view v) {
+        return v.substr(0, v.find_last_not_of(kBlank) + 1);
+    };
+    bytes = ltrim(bytes);
+    if (!bytes.empty() && bytes.front() == '[') bytes = ltrim(bytes.substr(1));
+    bytes = rtrim(bytes);
+    if (!bytes.empty() && bytes.back() == ']') {
+        bytes = rtrim(bytes.substr(0, bytes.size() - 1));
+    }
+    return bytes;
+}
+
+// AND-of-EQ predicates with concrete typed literals can be evaluated
+// directly against simdjson without going through ValueMap (which costs
+// wyhash + per-field std::string allocation per row). Anything more
+// complex (OR/NOT/IN/range) falls back to the generic visitor.
+struct CompiledEqProbe {
+    std::string top_key;     // "pid", "args", "name", etc.
+    std::string nested_key;  // "" for top-level, else e.g. "fhash"
+    enum class Kind { String, Int64, UInt64, Double, Bool };
+    Kind kind = Kind::String;  // selects which *_val member is compared
+    std::string s_val;         // literal when kind == String
+    std::int64_t i64_val = 0;  // literal when kind == Int64
+    std::uint64_t u64_val = 0;  // literal when kind == UInt64
+    double d_val = 0.0;         // literal when kind == Double
+    bool b_val = false;         // literal when kind == Bool
+};
+
+// True for the top-level JSON keys of a dftracer event. Any other bare name
+// in the query DSL (e.g. `epoch`, `fhash`) refers to a field nested under
+// "args"; collect_query_fields relies on the same convention.
+bool is_top_level_event_key(std::string_view k) {
+    static constexpr std::string_view kKeys[] = {"id",  "name", "cat", "pid",
+                                                 "tid", "ts",   "dur", "ph"};
+    return std::find(std::begin(kKeys), std::end(kKeys), k) != std::end(kKeys);
+}
+
+// Walk a CompareNode-with-EQ leaf into a probe. Returns false on
+// unsupported shapes (more than one '.', an empty path segment, or a
+// literal type the simdjson get_X path can't compare directly).
+bool compile_eq_leaf(const common::query::CompareNode& n,
+                     CompiledEqProbe& out) {
+    if (n.op != common::query::CompareOp::EQ) return false;
+    const std::string& path = n.field.path;
+    const auto dot = path.find('.');
+    if (dot == std::string::npos) {
+        // Bare arg-style key: foo -> args.foo.
+        const bool top = is_top_level_event_key(path);
+        out.top_key = top ? path : "args";
+        out.nested_key = top ? "" : path;
+    } else {
+        // An empty segment would alias the "top-level" sentinel (empty
+        // nested_key), and deeper nesting is not supported here.
+        if (dot == 0 || dot + 1 == path.size()) return false;
+        if (path.find('.', dot + 1) != std::string::npos) return false;
+        out.top_key = path.substr(0, dot);
+        out.nested_key = path.substr(dot + 1);
+    }
+    return std::visit(
+        [&out](auto&& v) -> bool {
+            using T = std::decay_t<decltype(v)>;
+            if constexpr (std::is_same_v<T, std::string>) {
+                out.kind = CompiledEqProbe::Kind::String;
+                out.s_val = v;
+                return true;
+            } else if constexpr (std::is_same_v<T, std::int64_t>) {
+                out.kind = CompiledEqProbe::Kind::Int64;
+                out.i64_val = v;
+                return true;
+            } else if constexpr (std::is_same_v<T, std::uint64_t>) {
+                out.kind = CompiledEqProbe::Kind::UInt64;
+                out.u64_val = v;
+                return true;
+            } else if constexpr (std::is_same_v<T, double>) {
+                out.kind = CompiledEqProbe::Kind::Double;
+                out.d_val = v;
+                return true;
+            } else if constexpr (std::is_same_v<T, bool>) {
+                out.kind = CompiledEqProbe::Kind::Bool;
+                out.b_val = v;
+                return true;
+            } else {
+                return false;
+            }
+        },
+        n.value.value);
+}
+
+// Try to compile the query AST as an AND of EQ leaves. nullopt on
+// unsupported shapes; the ValueMap path handles those.
+std::optional<std::vector<CompiledEqProbe>> try_compile_eq_probes(
+    const common::query::QueryNode& node) {
+    using namespace common::query;
+    using Probes = std::vector<CompiledEqProbe>;
+    return std::visit(
+        [](const auto& n) -> std::optional<Probes> {
+            using T = std::decay_t<decltype(n)>;
+            if constexpr (std::is_same_v<T, CompareNode>) {
+                CompiledEqProbe leaf;
+                if (!compile_eq_leaf(n, leaf)) return std::nullopt;
+                return Probes{std::move(leaf)};
+            } else if constexpr (std::is_same_v<T, AndNode>) {
+                auto lhs = try_compile_eq_probes(*n.left);
+                if (!lhs) return std::nullopt;  // right side is not visited
+                auto rhs = try_compile_eq_probes(*n.right);
+                if (!rhs) return std::nullopt;
+                for (auto& probe : *rhs) lhs->push_back(std::move(probe));
+                return lhs;
+            } else {
+                return std::nullopt;
+            }
+        },
+        node.data);
+}
+
+// Compares one JSON value with the probe's literal; type mismatches and
+// simdjson errors count as "no match".
+bool probe_matches_value(const CompiledEqProbe& p,
+                         simdjson::ondemand::value val) {
+    using Kind = CompiledEqProbe::Kind;
+    using simdjson::ondemand::json_type;
+    switch (p.kind) {
+        case Kind::String: {
+            auto text = val.get_string();
+            if (text.error()) return false;
+            return text.value_unsafe() == std::string_view(p.s_val);
+        }
+        case Kind::Int64: {
+            auto type = val.type();
+            if (type.error()) return false;
+            if (type.value_unsafe() != json_type::number) return false;
+            auto parsed = val.get_number();
+            if (parsed.error()) return false;
+            auto num = parsed.value_unsafe();
+            if (num.is_int64()) return num.get_int64() == p.i64_val;
+            if (num.is_uint64()) {
+                if (p.i64_val < 0) return false;
+                return num.get_uint64() ==
+                       static_cast<std::uint64_t>(p.i64_val);
+            }
+            return num.get_double() == static_cast<double>(p.i64_val);
+        }
+        case Kind::UInt64: {
+            auto parsed = val.get_number();
+            if (parsed.error()) return false;
+            auto num = parsed.value_unsafe();
+            if (num.is_uint64()) return num.get_uint64() == p.u64_val;
+            if (num.is_int64()) {
+                const auto signed_val = num.get_int64();
+                if (signed_val < 0) return false;
+                return static_cast<std::uint64_t>(signed_val) == p.u64_val;
+            }
+            return num.get_double() == static_cast<double>(p.u64_val);
+        }
+        case Kind::Double: {
+            auto real = val.get_double();
+            if (real.error()) return false;
+            return real.value_unsafe() == p.d_val;
+        }
+        case Kind::Bool: {
+            auto flag = val.get_bool();
+            if (flag.error()) return false;
+            return flag.value_unsafe() == p.b_val;
+        }
+    }
+    return false;
+}
+
+// Evaluate compiled AND-of-EQ probes by directly probing simdjson fields.
+bool eval_compiled_eq(const std::vector<CompiledEqProbe>& probes,
+                      simdjson::ondemand::document_reference doc) {
+    for (const auto& eq : probes) {
+        doc.rewind();  // every probe searches from the document root
+        auto top = doc.find_field_unordered(std::string_view(eq.top_key));
+        if (top.error()) return false;
+        auto top_val = top.value();
+        if (eq.nested_key.empty()) {
+            if (!probe_matches_value(eq, top_val)) return false;
+            continue;
+        }
+        // Nested probe: descend one level into the object under top_key.
+        auto obj = top_val.get_object();
+        if (obj.error()) return false;
+        auto inner =
+            obj.value().find_field_unordered(std::string_view(eq.nested_key));
+        if (inner.error()) return false;
+        if (!probe_matches_value(eq, inner.value())) return false;
+    }
+    return true;
+}
+
+// Derives the byte-level pre-filter for `q`; it stays empty when `q` is not
+// an AND-of-EQ query or none of its literals is long enough to be kept.
+LinePrefilter build_prefilter(const Query& q) {
+    // Short literals like `"pid":1000` or `"epoch":0` are common enough in
+    // practice that memmem on every line costs more than it saves on the
+    // parse side. Only keep literals long enough that rarity is plausible
+    // (hashes, filenames, host names).
+    constexpr std::size_t kMinLiteralLen = 16;
+
+    LinePrefilter filter;
+    std::vector<std::string> literals;
+    if (!collect_and_eq_literals(q.root(), literals)) return filter;
+    for (auto& literal : literals) {
+        if (literal.size() < kMinLiteralLen) continue;
+        filter.required.push_back(std::move(literal));
+    }
+    return filter;
+}
+
+// Splits every buffer read from `stream` into lines and yields the ones that
+// pass the byte pre-filter and, unless chunk_prune_only, the parsed query.
+// Numbering starts at start_line_num; empty lines are counted, not yielded.
+coro::AsyncGenerator<Line> yield_lines_from_stream(
+    std::unique_ptr<internal::ReaderStream> stream, std::size_t start_line_num,
+    const Query* query, bool chunk_prune_only = false,
+    const LinePrefilter* prefilter = nullptr) {
+    const bool use_prefilter = prefilter && !prefilter->empty();
+    std::size_t line_num = start_line_num;
+    while (!stream->done()) {
+        auto chunk = co_await stream->read_async();
+        if (chunk.empty()) break;
+        const char* data = chunk.data();
+        std::size_t len = chunk.size();
+
+        // Chunk-level pre-filter: if any required literal is absent from this
+        // entire buffer, no line within it can match. Skip without splitting.
+        // Line numbers must stay correct for subsequent chunks.
+        if (use_prefilter &&
+            !prefilter->may_match(std::string_view(data, len))) {
+            line_num += std::count(data, data + len, '\n');
+            continue;
+        }
+
+        std::size_t pos = 0;
+        while (pos < len) {
+            const void* nl_ptr = std::memchr(data + pos, '\n', len - pos);
+            std::size_t end_pos =
+                nl_ptr ? static_cast<const char*>(nl_ptr) - data : len;
+            if (end_pos > pos) {
+                auto line_sv = std::string_view(data + pos, end_pos - pos);
+                // Cheap byte scan first so rejected lines skip the JSON parse.
+                bool accept = !use_prefilter || prefilter->may_match(line_sv);
+                accept = accept && (chunk_prune_only || !query ||
+                                    line_matches_query(*query, line_sv));
+                if (accept) {
+                    co_yield Line(line_sv, line_num);
+                }
+            }
+            ++line_num;
+            pos = end_pos + 1;
+        }
+    }
+}
+
+coro::AsyncGenerator<Line> yield_lines_from_ranges(
+    std::shared_ptr<internal::Reader> reader, std::vector<LineRange> ranges,
+    std::size_t buffer_size, Query query, bool chunk_prune_only = false,
+    LinePrefilter prefilter = {}) {
+    for (const auto& range : ranges) {  // one fresh stream per range, in order
+        auto stream =
+            reader->stream(internal::StreamConfig()
+                               .stream_type(internal::StreamType::MULTI_LINES)
+                               .range_type(internal::RangeType::LINE_RANGE)
+                               .from(range.start_line)
+                               .to(range.end_line)
+                               .buffer_size(buffer_size));
+        auto gen =  // numbering restarts at this range's first line
+            yield_lines_from_stream(std::move(stream), range.start_line, &query,
+                                    chunk_prune_only, &prefilter);
+        while (auto line = co_await gen.next()) {
+            co_yield *line;  // forward every accepted line unchanged
+        }
+    }
+}
+
+// Raw-chunk variants of the yield/read helpers. Same pruning logic as the
+// line-yielding flavors but emit std::span<const char> buffers untouched
+// (multi-line boundary respected by stream type). Used by read_json to run
+// simdjson iterate_many over each chunk instead of parsing line by line.
+coro::AsyncGenerator<std::span<const char>> yield_chunks_from_stream(
+    std::unique_ptr<internal::ReaderStream> stream,
+    const LinePrefilter* prefilter = nullptr) {
+    const bool filtering = prefilter && !prefilter->empty();
+    while (!stream->done()) {
+        auto buffer = co_await stream->read_async();
+        if (buffer.empty()) break;
+        // A buffer missing any required literal cannot hold a matching
+        // line, so it is dropped without being yielded.
+        const std::string_view bytes(buffer.data(), buffer.size());
+        if (filtering && !prefilter->may_match(bytes)) continue;
+        co_yield buffer;
+    }
+}
+
+coro::AsyncGenerator<std::span<const char>> yield_chunks_from_ranges(
+    std::shared_ptr<internal::Reader> reader, std::vector<LineRange> ranges,
+    std::size_t buffer_size, LinePrefilter prefilter = {}) {
+    for (const auto& range : ranges) {  // one fresh stream per range, in order
+        auto stream =
+            reader->stream(internal::StreamConfig()
+                               .stream_type(internal::StreamType::MULTI_LINES)
+                               .range_type(internal::RangeType::LINE_RANGE)
+                               .from(range.start_line)
+                               .to(range.end_line)
+                               .buffer_size(buffer_size));
+        auto gen = yield_chunks_from_stream(std::move(stream), &prefilter);
+        while (auto chunk = co_await gen.next()) {
+            co_yield *chunk;  // forward every surviving buffer unchanged
+        }
+    }
+}
+
+// Yields raw buffers of `file_path` for the byte or line window in `config`.
+// When pruning applies, only the pruner's candidate checkpoints are read.
+coro::AsyncGenerator<std::span<const char>> read_chunks_indexed(
+    std::shared_ptr<internal::Reader> reader, std::string index_path,
+    std::string file_path, ReadConfig config, std::optional<Query> query,
+    bool extend_to_line_boundary = false) {
+    // Keep RocksDB alive for the generator's lifetime so per-method opens
+    // in GzipIndexer reuse DBManager's cached handle.
+    std::optional<indexer::IndexDatabase> db_keep_alive;
+    if (!index_path.empty()) {
+        try {
+            db_keep_alive.emplace(index_path,
+                                  rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        } catch (...) {
+        }
+    }
+
+    LinePrefilter prefilter = query ? build_prefilter(*query) : LinePrefilter{};
+    auto range_type = config.has_line_range() ? internal::RangeType::LINE_RANGE
+                                              : internal::RangeType::BYTE_RANGE;
+    std::size_t start =
+        config.has_line_range() ? config.start_line : config.start_byte;
+    std::size_t end =
+        config.has_line_range() ? config.end_line : config.end_byte;
+
+    if (range_type == internal::RangeType::LINE_RANGE) {
+        auto total_lines = reader->get_num_lines();
+        if (start == 0) start = 1;
+        if (end == 0 || end > total_lines) end = total_lines;
+        if (start > total_lines) co_return;
+    } else {
+        auto max_bytes = reader->get_max_bytes();
+        if (end == 0 || end > max_bytes) end = max_bytes;
+        if (start >= max_bytes) co_return;
+    }
+
+    if (query && !index_path.empty() && !config.skip_pruning) {
+        ChunkPrunerInput pruner_input{index_path, file_path, *query, nullptr};
+        ChunkPrunerUtility pruner;
+        auto pruner_out = co_await pruner.process(pruner_input);
+        if (pruner_out.success && !pruner_out.file_may_match) {
+            co_return;
+        }
+
+        if (pruner_out.success && !pruner_out.candidate_checkpoints.empty() &&
+            pruner_out.candidate_checkpoints.size() <
+                pruner_out.total_checkpoints) {
+            indexer::IndexDatabase idx_db(
+                index_path, rocksdb::RocksDatabase::OpenMode::ReadOnly);
+            auto logical = indexer::internal::get_logical_path(file_path);
+            int fid = idx_db.get_file_info_id(logical);
+
+            if (fid >= 0) {
+                auto all_ckpts = idx_db.query_checkpoints(fid);
+                std::unordered_map<std::uint64_t, indexer::IndexerCheckpoint>
+                    ckpt_map;
+                for (auto& ckpt : all_ckpts) {
+                    ckpt_map.emplace(ckpt.checkpoint_idx, std::move(ckpt));
+                }
+                std::vector<LineRange> ranges;
+                std::uint64_t prev_idx = UINT64_MAX;
+                for (auto ckpt_idx : pruner_out.candidate_checkpoints) {
+                    auto it = ckpt_map.find(ckpt_idx);
+                    if (it == ckpt_map.end()) continue;
+                    const auto& ckpt = it->second;
+                    std::size_t first = ckpt.first_line_num;
+                    std::size_t last = ckpt.last_line_num;
+                    // Intersect with the caller's window (byte or line) so
+                    // checkpoint-level parallel work items stay disjoint.
+                    if (range_type == internal::RangeType::BYTE_RANGE) {
+                        std::size_t ckpt_start = ckpt.uc_offset;
+                        std::size_t ckpt_end = ckpt.uc_offset + ckpt.uc_size;
+                        if (ckpt_end <= start || ckpt_start >= end) continue;
+                    } else {
+                        if (last < start || first > end) continue;
+                        // Clamp so lines outside the window are not re-read.
+                        first = std::max(first, start);
+                        last = std::min(last, end);
+                    }
+                    if (ranges.empty() || ckpt_idx != prev_idx + 1) {
+                        ranges.push_back({first, last});
+                    } else {
+                        ranges.back().end_line = last;
+                    }
+                    prev_idx = ckpt_idx;
+                }
+                if (ranges.empty()) co_return;
+
+                auto gen = yield_chunks_from_ranges(
+                    reader, std::move(ranges), config.buffer_size, prefilter);
+                while (auto chunk = co_await gen.next()) {
+                    co_yield *chunk;
+                }
+                co_return;
+            }
+        }
+    }
+
+    auto stream_type = (range_type == internal::RangeType::BYTE_RANGE)
+                           ? internal::StreamType::MULTI_LINES_BYTES
+                           : internal::StreamType::MULTI_LINES;
+    auto stream =
+        reader->stream(internal::StreamConfig()
+                           .stream_type(stream_type)
+                           .range_type(range_type)
+                           .from(start)
+                           .to(end)
+                           .buffer_size(config.buffer_size)
+                           .extend_to_line_boundary(
+                               extend_to_line_boundary &&
+                               range_type == internal::RangeType::BYTE_RANGE));
+
+    auto gen = yield_chunks_from_stream(std::move(stream), &prefilter);
+    while (auto chunk = co_await gen.next()) {
+        co_yield *chunk;
+    }
+}
+
+// Yields lines of `file_path` within `config`'s window, pruned by `query`.
+coro::AsyncGenerator<Line> read_lines_indexed(
+    std::shared_ptr<internal::Reader> reader, std::string index_path,
+    std::string file_path, ReadConfig config, std::optional<Query> query,
+    bool chunk_prune_only = false) {
+    // Keep RocksDB alive for the generator's lifetime so per-method opens
+    // in GzipIndexer reuse DBManager's cached handle.
+    std::optional<indexer::IndexDatabase> db_keep_alive;
+    if (!index_path.empty()) {
+        try {
+            db_keep_alive.emplace(index_path,
+                                  rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        } catch (...) {
+        }
+    }
+
+    LinePrefilter prefilter = query ? build_prefilter(*query) : LinePrefilter{};
+    auto range_type = config.has_line_range() ? internal::RangeType::LINE_RANGE
+                                              : internal::RangeType::BYTE_RANGE;
+    std::size_t start =
+        config.has_line_range() ? config.start_line : config.start_byte;
+    std::size_t end =
+        config.has_line_range() ? config.end_line : config.end_byte;
+
+    if (range_type == internal::RangeType::LINE_RANGE) {
+        auto total_lines = reader->get_num_lines();
+        if (start == 0) start = 1;
+        if (end == 0 || end > total_lines) end = total_lines;
+        if (start > total_lines) co_return;
+    } else {
+        auto max_bytes = reader->get_max_bytes();
+        if (end == 0 || end > max_bytes) end = max_bytes;
+        if (start >= max_bytes) co_return;
+    }
+
+    if (query && !index_path.empty() &&
+        range_type == internal::RangeType::BYTE_RANGE) {
+        ChunkPrunerInput pruner_input{index_path, file_path, *query, nullptr};
+        ChunkPrunerUtility pruner;
+        auto pruner_out = co_await pruner.process(pruner_input);
+        if (pruner_out.success && !pruner_out.file_may_match) co_return;
+
+        if (pruner_out.success && !pruner_out.candidate_checkpoints.empty() &&
+            pruner_out.candidate_checkpoints.size() <
+                pruner_out.total_checkpoints) {
+            indexer::IndexDatabase idx_db(
+                index_path, rocksdb::RocksDatabase::OpenMode::ReadOnly);
+            auto logical = indexer::internal::get_logical_path(file_path);
+            int fid = idx_db.get_file_info_id(logical);
+            if (fid >= 0) {
+                auto all_ckpts = idx_db.query_checkpoints(fid);
+                std::unordered_map<std::uint64_t, indexer::IndexerCheckpoint>
+                    ckpt_map;
+                for (auto& ckpt : all_ckpts) {
+                    ckpt_map.emplace(ckpt.checkpoint_idx, std::move(ckpt));
+                }
+
+                std::vector<LineRange> ranges;
+                std::uint64_t prev_idx = UINT64_MAX;
+                for (auto ckpt_idx : pruner_out.candidate_checkpoints) {
+                    auto it = ckpt_map.find(ckpt_idx);
+                    if (it == ckpt_map.end()) continue;
+                    const auto& ckpt = it->second;
+                    // Skip checkpoints outside the caller's byte window.
+                    std::size_t ckpt_start = ckpt.uc_offset;
+                    std::size_t ckpt_end = ckpt.uc_offset + ckpt.uc_size;
+                    if (ckpt_end <= start || ckpt_start >= end) continue;
+                    if (ranges.empty() || ckpt_idx != prev_idx + 1) {
+                        ranges.push_back(
+                            {ckpt.first_line_num, ckpt.last_line_num});
+                    } else {
+                        ranges.back().end_line = ckpt.last_line_num;
+                    }
+                    prev_idx = ckpt_idx;
+                }
+
+                auto gen = yield_lines_from_ranges(reader, std::move(ranges),
+                                                   config.buffer_size, *query,
+                                                   chunk_prune_only, prefilter);
+                while (auto line = co_await gen.next()) {
+                    co_yield *line;
+                }
+                co_return;
+            }
+        }
+    }
+
+    auto stream =
+        reader->stream(internal::StreamConfig()
+                           .stream_type(internal::StreamType::MULTI_LINES)
+                           .range_type(range_type)
+                           .from(start)
+                           .to(end)
+                           .buffer_size(config.buffer_size));
+    // NOTE(review): in byte mode `start` (bytes) seeds line numbers - confirm.
+    auto gen = yield_lines_from_stream(std::move(stream), start,
+                                       query ? &*query : nullptr,
+                                       chunk_prune_only, &prefilter);
+    while (auto line = co_await gen.next()) {
+        co_yield *line;
+    }
+}
+
+// Unindexed gzip path: stream lines, dropping those that fail `query`.
+coro::AsyncGenerator<Line> read_lines_gz(std::string file_path,
+                                         ReadConfig config,
+                                         std::optional<Query> query,
+                                         bool chunk_prune_only = false) {
+    std::size_t start = config.has_line_range() ? config.start_line : 0;
+    std::size_t end = config.has_line_range() ? config.end_line : 0;
+    auto gen =
+        fileio::lines::sources::async_streaming_gz_lines(file_path, start, end);
+    const bool filter = query && !chunk_prune_only;
+    while (auto item = co_await gen.next()) {
+        if (filter && !line_matches_query(*query, item->content)) continue;
+        co_yield *item;
+    }
+}
+
+// Unindexed plain-file path for the byte window given in `config`.
+coro::AsyncGenerator<Line> read_lines_plain_bytes(
+    std::string file_path, ReadConfig config, std::optional<Query> query,
+    bool chunk_prune_only = false) {
+    auto gen = fileio::lines::sources::async_plain_file_bytes(
+        file_path, config.start_byte, config.end_byte, config.buffer_size);
+    const bool filter = query && !chunk_prune_only;
+    while (auto item = co_await gen.next()) {
+        if (filter && !line_matches_query(*query, item->content)) continue;
+        co_yield *item;
+    }
+}
+
+// Unindexed plain-file path used when no byte range is requested.
+coro::AsyncGenerator<Line> read_lines_plain(std::string file_path,
+                                            ReadConfig config,
+                                            std::optional<Query> query,
+                                            bool chunk_prune_only = false) {
+    std::size_t start = config.has_line_range() ? config.start_line : 0;
+    std::size_t end = config.has_line_range() ? config.end_line : 0;
+    auto gen =
+        fileio::lines::sources::async_plain_file_lines(file_path, start, end);
+    const bool filter = query && !chunk_prune_only;
+    while (auto item = co_await gen.next()) {
+        if (filter && !line_matches_query(*query, item->content)) continue;
+        co_yield *item;
     }
-    yyjson_doc_free(doc);
-    return result;
 }
 
 }  // namespace
@@ -119,104 +786,223 @@ coro::AsyncGenerator<Line> TraceReader::read_lines(ReadConfig config) {
         query = std::move(*parsed);
     }
 
+    bool cpo = config.chunk_prune_only;
+
     if (has_index_) {
-        auto reader = create_indexed_reader();
-        auto range_type = resolve_range_type(config);
-        std::size_t start =
-            config.has_line_range() ? config.start_line : config.start_byte;
-        std::size_t end =
-            config.has_line_range() ? config.end_line : config.end_byte;
+        return read_lines_indexed(create_indexed_reader(), index_path_,
+                                  config_.file_path, std::move(config),
+                                  std::move(query), cpo);
+    }
+    if (format_ == ArchiveFormat::GZIP || format_ == ArchiveFormat::TAR_GZ) {
+        return read_lines_gz(config_.file_path, std::move(config),
+                             std::move(query), cpo);
+    }
+    if (config.has_byte_range()) {
+        return read_lines_plain_bytes(config_.file_path, std::move(config),
+                                      std::move(query), cpo);
+    }
+    return read_lines_plain(config_.file_path, std::move(config),
+                            std::move(query), cpo);
+}
 
-        if (range_type == internal::RangeType::LINE_RANGE) {
-            auto total_lines = reader->get_num_lines();
-            if (start == 0) start = 1;
-            if (end == 0 || end > total_lines) end = total_lines;
-            if (start > total_lines) co_return;
-        } else {
-            auto max_bytes = reader->get_max_bytes();
-            if (end == 0 || end > max_bytes) end = max_bytes;
-            if (start >= max_bytes) co_return;
-        }
+namespace {
 
-        if (has_index_ && query && !index_path_.empty() &&
-            range_type == internal::RangeType::BYTE_RANGE) {
-            ChunkPrunerInput pruner_input{index_path_, config_.file_path,
-                                          *query, nullptr};
-            ChunkPrunerUtility pruner;
-            auto pruner_out = co_await pruner.process(pruner_input);
-            if (pruner_out.success && !pruner_out.file_may_match) {
-                co_return;
+common::query::LiteralValue ondemand_to_literal(simdjson::ondemand::value val) {
+    auto type = val.type().value_unsafe();  // NOTE(review): error unchecked
+    switch (type) {
+        case simdjson::ondemand::json_type::string: {
+            auto r = val.get_string();
+            if (!r.error()) return std::string(r.value_unsafe());  // owned copy
+            break;
+        }
+        case simdjson::ondemand::json_type::number: {
+            auto num = val.get_number();
+            if (!num.error()) {
+                auto n = num.value_unsafe();
+                if (n.is_int64()) return n.get_int64();  // int64 tried first
+                if (n.is_uint64()) return n.get_uint64();
+                return n.get_double();  // neither integer form matched
             }
+            break;
+        }
+        case simdjson::ondemand::json_type::boolean: {
+            auto r = val.get_bool();
+            if (!r.error()) return r.value_unsafe();
+            break;
         }
+        default:  // every other json_type falls through to the empty string
+            break;
+    }
+    return std::string{};  // unhandled type or failed decode -> empty string
+}
 
-        auto stream =
-            reader->stream(internal::StreamConfig()
-                               .stream_type(internal::StreamType::MULTI_LINES)
-                               .range_type(range_type)
-                               .from(start)
-                               .to(end)
-                               .buffer_size(config.buffer_size));
+}  // namespace
 
-        std::size_t line_num = start;
-        while (!stream->done()) {
-            auto chunk = co_await stream->read_async();
-            if (chunk.empty()) break;
-            const char* data = chunk.data();
-            std::size_t len = chunk.size();
-            std::size_t pos = 0;
-            while (pos < len) {
-                const void* nl_ptr = std::memchr(data + pos, '\n', len - pos);
-                std::size_t end_pos =
-                    nl_ptr ? static_cast<const char*>(nl_ptr) - data : len;
-                if (end_pos > pos) {
-                    auto line_sv = std::string_view(data + pos, end_pos - pos);
-                    if (!query || line_matches_query(*query, line_sv)) {
-                        co_yield Line(line_sv, line_num);
+coro::AsyncGenerator<JsonLine> TraceReader::read_json(ReadConfig config) {
+    std::optional<Query> query;
+    if (!config.query.empty()) {
+        auto parsed = Query::from_string(config.query);
+        if (!parsed) throw common::query::QueryParseError(parsed.error());
+        query = std::move(*parsed);
+    }
+
+    // chunk_prune_only path: dim_stats already proved every event with the
+    // predicate field matches; we still need to skip events lacking the
+    // field (e.g., metadata "ph":"M" events). Field-presence probe is
+    // cheaper than full ValueMap eval.
+    std::vector<std::string> presence_check_paths;
+    if (query && config.chunk_prune_only) {
+        const auto& fset = query->fields();
+        presence_check_paths.assign(fset.begin(), fset.end());
+    }
+
+    // Fast path: indexed gz files go through a chunk generator with
+    // simdjson iterate_many. Query is evaluated on the ondemand document
+    // directly, so non-matching docs never hit the yield_parser.
+    if (has_index_) {
+        auto reader = create_indexed_reader();
+        auto chunk_gen = read_chunks_indexed(reader, index_path_,
+                                             config_.file_path, config, query);
+
+        simdjson::ondemand::parser bulk_parser;
+        common::json::JsonParser yield_parser;
+
+        while (auto chunk_opt = co_await chunk_gen.next()) {
+            auto chunk = *chunk_opt;
+            if (chunk.empty()) continue;
+            auto trimmed = strip_ndjson_bookends(
+                std::string_view(chunk.data(), chunk.size()));
+            if (trimmed.empty()) continue;
+            simdjson::padded_string padded(trimmed);
+
+            auto docs_r = bulk_parser.iterate_many(
+                padded, 1 << 20, /*allow_comma_separated=*/false);
+            if (docs_r.error()) continue;
+            auto& docs = docs_r.value();
+
+            for (auto it = docs.begin(); it != docs.end(); ++it) {
+                auto doc_result = *it;
+                if (doc_result.error()) continue;
+                auto& doc = doc_result.value();
+
+                std::string_view src(it.source().data(), it.source().size());
+
+                if (query && config.chunk_prune_only) {
+                    bool all_present = true;
+                    for (const auto& path : presence_check_paths) {
+                        auto fld = doc.find_field_unordered(path);
+                        if (fld.error()) {
+                            all_present = false;
+                            break;
+                        }
                     }
-                    ++line_num;
-                } else {
-                    ++line_num;
+                    if (!all_present) continue;
+                    doc.rewind();
+                } else if (query) {
+                    common::query::ValueMap fields;
+                    auto obj = doc.get_object();
+                    if (obj.error()) continue;
+                    for (auto field : obj.value()) {
+                        if (field.error()) continue;
+                        auto key_r = field.unescaped_key();
+                        if (key_r.error()) continue;
+                        auto val_r = field.value();
+                        if (val_r.error()) continue;
+                        auto key = key_r.value();
+                        auto val = val_r.value();
+                        auto type_r = val.type();
+                        if (type_r.error()) continue;
+                        auto type = type_r.value();
+                        if (type == simdjson::ondemand::json_type::object) {
+                            auto nested = val.get_object();
+                            if (nested.error()) continue;
+                            for (auto nf : nested.value()) {
+                                if (nf.error()) continue;
+                                auto nk_r = nf.unescaped_key();
+                                if (nk_r.error()) continue;
+                                auto nv_r = nf.value();
+                                if (nv_r.error()) continue;
+                                auto nk = nk_r.value();
+                                if (!query->references(nk)) continue;
+                                fields[std::string(nk)] =
+                                    ondemand_to_literal(nv_r.value());
+                            }
+                        } else if (query->references(key)) {
+                            fields[std::string(key)] = ondemand_to_literal(val);
+                        }
+                    }
+                    if (!query->evaluate(fields)) continue;
                 }
-                pos = end_pos + 1;
-            }
-        }
-    } else if (format_ == ArchiveFormat::GZIP ||
-               format_ == ArchiveFormat::TAR_GZ) {
-        std::size_t start = config.has_line_range() ? config.start_line : 0;
-        std::size_t end = config.has_line_range() ? config.end_line : 0;
-        auto gen = fileio::lines::sources::async_streaming_gz_lines(
-            config_.file_path, start, end);
-        while (auto opt = co_await gen.next()) {
-            if (!query || line_matches_query(*query, opt->content)) {
-                co_yield *opt;
-            }
-        }
-    } else if (config.has_byte_range()) {
-        // Plain file with byte range
-        auto gen = fileio::lines::sources::async_plain_file_bytes(
-            config_.file_path, config.start_byte, config.end_byte,
-            config.buffer_size);
-        while (auto opt = co_await gen.next()) {
-            if (!query || line_matches_query(*query, opt->content)) {
-                co_yield *opt;
+
+                // Matched (or no query): lend the iterate_many doc to
+                // yield_parser without re-parsing. Consumers like
+                // build_arrow_row call parser.for_each_field which now
+                // iterates the borrowed doc_reference.
+                doc.rewind();
+                yield_parser.set_borrowed_document(
+                    simdjson::ondemand::document_reference(doc));
+                co_yield JsonLine{src, 0, &yield_parser};
             }
         }
-    } else {
-        std::size_t start = config.has_line_range() ? config.start_line : 0;
-        std::size_t end = config.has_line_range() ? config.end_line : 0;
-        auto gen = fileio::lines::sources::async_plain_file_lines(
-            config_.file_path, start, end);
-        while (auto opt = co_await gen.next()) {
-            if (!query || line_matches_query(*query, opt->content)) {
-                co_yield *opt;
+        co_return;
+    }
+
+    // Fallback: non-indexed paths use the per-line pipeline unchanged.
+    config.chunk_prune_only = true;
+    auto line_gen = read_lines(config);
+
+    common::json::JsonParser parser;
+
+    while (auto opt = co_await line_gen.next()) {
+        const char* trimmed;
+        std::size_t trimmed_len;
+        if (!dftracer::utils::json_trim_and_validate_with_comma(
+                opt->content.data(), opt->content.size(), trimmed, trimmed_len))
+            continue;
+        if (!parser.parse(std::string_view(trimmed, trimmed_len))) continue;
+
+        if (query) {
+            common::query::ValueMap fields;
+            std::vector<std::string> nested_keys;
+            parser.for_each_field(
+                [&](std::string_view key, simdjson::ondemand::value val) {
+                    auto type = val.type().value_unsafe();
+                    if (type == simdjson::ondemand::json_type::object) {
+                        nested_keys.emplace_back(key);
+                    } else if (query->references(key)) {
+                        fields[std::string(key)] = ondemand_to_literal(val);
+                    }
+                });
+            for (auto& nk : nested_keys) {
+                parser.rewind();
+                parser.for_each_field(nk, [&](std::string_view key,
+                                              simdjson::ondemand::value val) {
+                    if (query->references(key)) {
+                        fields[std::string(key)] = ondemand_to_literal(val);
+                    }
+                });
             }
+            if (!query->evaluate(fields)) continue;
+            parser.rewind();
         }
+
+        co_yield JsonLine{opt->content, opt->line_number, &parser};
     }
 }
 
 coro::AsyncGenerator<std::span<const char>> TraceReader::read_raw(
     ReadConfig config) {
     if (has_index_) {
+        // Keep RocksDB alive for the generator's lifetime so per-method
+        // opens in GzipIndexer reuse DBManager's cached handle.
+        std::optional<indexer::IndexDatabase> db_keep_alive;
+        if (!index_path_.empty()) {
+            try {
+                db_keep_alive.emplace(
+                    index_path_, rocksdb::RocksDatabase::OpenMode::ReadOnly);
+            } catch (...) {
+            }
+        }
         auto reader = create_indexed_reader();
         auto stream_type = resolve_raw_stream_type(config);
         auto range_type = resolve_range_type(config);
@@ -236,7 +1022,7 @@ coro::AsyncGenerator<std::span<const char>> TraceReader::read_raw(
             if (start >= max_bytes) co_return;
         }
 
-        if (has_index_ && !config.query.empty() && !index_path_.empty() &&
+        if (!config.query.empty() && !index_path_.empty() &&
             range_type == internal::RangeType::BYTE_RANGE) {
             auto parsed = Query::from_string(config.query);
             if (!parsed) throw common::query::QueryParseError(parsed.error());
@@ -293,4 +1079,510 @@ coro::AsyncGenerator<std::span<const char>> TraceReader::read_raw(
     }
 }
 
+#ifdef DFTRACER_UTILS_ENABLE_ARROW
+
+namespace {
+
+using common::arrow::ArrowExportResult;
+using common::arrow::ColumnType;
+using common::arrow::RecordBatchBuilder;
+
+// Bump arena for string_views that must survive until builder.finish().
+struct ArrowStringArena {
+    static constexpr std::size_t BLOCK_SIZE = 64 * 1024;
+    std::vector<std::vector<char>> blocks;  // only blocks.back() is written
+    std::size_t pos = 0;                    // next free offset in back()
+
+    ArrowStringArena() { blocks.emplace_back(BLOCK_SIZE); }
+
+    // Copies len bytes from data into the arena; returns a view of the copy.
+    std::string_view push(const char* data, std::size_t len) {
+        if (pos + len > blocks.back().size()) {
+            blocks.emplace_back(std::max(BLOCK_SIZE, len));
+            pos = 0;
+        }
+        char* dst = blocks.back().data() + pos;
+        if (len != 0) std::memcpy(dst, data, len);  // null src is UB
+        pos += len;
+        return {dst, len};
+    }
+
+    void clear() {  // keeps only the first block; invalidates all views
+        if (blocks.size() > 1) blocks.resize(1);
+        pos = 0;
+    }
+};
+
+struct ArrowKeyHint {
+    std::string key;                      // field name cached in this slot
+    std::size_t col_idx = 0;              // column index cached for `key`
+    ColumnType type = ColumnType::INT64;  // column type cached with `key`
+    bool valid = false;                   // false until the slot is filled
+};
+
+// Maps (key, type) to a builder column index. `hints` caches earlier
+// answers by field position within a row, so the common case is a single
+// compare; on a position miss the remaining hints are scanned before the
+// builder itself is consulted. The slot at `pos` is refreshed whenever it
+// exists or a new column lookup was needed.
+inline std::size_t resolve_col_idx(RecordBatchBuilder& builder,
+                                   std::vector<ArrowKeyHint>& hints,
+                                   std::size_t pos, std::string_view key_sv,
+                                   ColumnType type) {
+    // A slot hits when it is filled and agrees on both type and key bytes.
+    const auto hit = [&](const ArrowKeyHint& h) {
+        return h.valid && h.type == type && h.key.size() == key_sv.size() &&
+               std::memcmp(h.key.data(), key_sv.data(), key_sv.size()) == 0;
+    };
+    // Overwrites the slot at `pos`, which must already exist.
+    const auto remember = [&](std::size_t col) {
+        ArrowKeyHint& slot = hints[pos];
+        slot.key.assign(key_sv);
+        slot.type = type;
+        slot.col_idx = col;
+        slot.valid = true;
+    };
+    const bool in_range = pos < hints.size();
+    if (in_range && hit(hints[pos])) return hints[pos].col_idx;
+    // Position-keyed miss. Rows with different shapes (e.g., open vs read
+    // events with different args fields) put the same key at different
+    // positions, so this path is taken often even though the schema is
+    // small (~15 keys). Scanning the hint vector with memcmp is preferred
+    // here over RecordBatchBuilder's name_to_index_ hash lookup for a
+    // schema of that size.
+    for (std::size_t i = 0; i < hints.size(); ++i) {
+        if (i == pos || !hit(hints[i])) continue;
+        const std::size_t col = hints[i].col_idx;
+        if (in_range) remember(col);
+        return col;
+    }
+    const std::size_t col = builder.add_or_get_column(key_sv, type);
+    if (!in_range) hints.resize(pos + 1);
+    remember(col);
+    return col;
+}
+
+// Append `val` under `key_sv` as a typed column value; objects/arrays are
+// stored as raw JSON strings. `pos` is the hint slot for resolve_col_idx.
+void append_scalar_or_json(RecordBatchBuilder& builder,
+                           std::vector<ArrowKeyHint>& hints, std::size_t& pos,
+                           std::string_view key_sv,
+                           simdjson::ondemand::value val,
+                           simdjson::ondemand::json_type type) {
+    switch (type) {
+        case simdjson::ondemand::json_type::number: {
+            auto num_r = val.get_number();
+            if (num_r.error()) break;  // value dropped; pos unchanged
+            auto num = num_r.value();
+            if (num.is_int64()) {
+                auto idx = resolve_col_idx(builder, hints, pos++, key_sv,
+                                           ColumnType::INT64);
+                builder.append_int64(idx, num.get_int64());
+            } else if (num.is_uint64()) {
+                auto idx = resolve_col_idx(builder, hints, pos++, key_sv,
+                                           ColumnType::UINT64);
+                builder.append_uint64(idx, num.get_uint64());
+            } else {
+                auto idx = resolve_col_idx(builder, hints, pos++, key_sv,
+                                           ColumnType::DOUBLE);
+                builder.append_double(idx, num.get_double());
+            }
+            break;
+        }
+        case simdjson::ondemand::json_type::string: {
+            auto str_r = val.get_string();
+            if (str_r.error()) break;  // value dropped; pos unchanged
+            auto idx = resolve_col_idx(builder, hints, pos++, key_sv,
+                                       ColumnType::STRING);
+            builder.append_string(idx, str_r.value());
+            break;
+        }
+        case simdjson::ondemand::json_type::boolean: {
+            auto b_r = val.get_bool();
+            if (b_r.error()) break;  // value dropped; pos unchanged
+            auto idx = resolve_col_idx(builder, hints, pos++, key_sv,
+                                       ColumnType::BOOL);
+            builder.append_bool(idx, b_r.value());
+            break;
+        }
+        case simdjson::ondemand::json_type::null: {
+            auto existing = builder.find_column(key_sv);
+            if (existing) builder.append_null(*existing);  // no new column
+            ++pos;  // slot still consumed
+            break;
+        }
+        case simdjson::ondemand::json_type::object:
+        case simdjson::ondemand::json_type::array: {
+            auto raw_r = val.raw_json();  // unparsed JSON text of the value
+            auto idx = resolve_col_idx(builder, hints, pos++, key_sv,
+                                       ColumnType::STRING);
+            if (!raw_r.error()) {
+                auto sv = raw_r.value();
+                builder.append_string(idx, sv);
+            } else {
+                builder.append_null(idx);  // raw_json() failed
+            }
+            break;
+        }
+        default:
+            ++pos;  // unhandled json_type: nothing appended
+            break;
+    }
+}
+
+// Append one Arrow row from an already-parsed simdjson document.
+// Dynamic schema: new columns appended as they appear. When flatten_objects
+// is true, top-level object values are expanded one level into `parent.child`
+// columns; deeper nesting still lands as a JSON string under the flattened
+// key. Returns false only if `doc` is not an object; bad fields are skipped.
+bool arrow_row_from_doc(RecordBatchBuilder& builder,
+                        std::vector<ArrowKeyHint>& hints,
+                        simdjson::ondemand::document_reference doc,
+                        bool flatten_objects = false) {
+    auto obj_result = doc.get_object();
+    if (obj_result.error()) return false;
+    char key_buf[512];    // stack scratch for typical `parent.child` keys
+    std::string big_key;  // heap scratch so longer keys are not dropped
+    std::size_t pos = 0;  // hint slot of the next field in this row
+    for (auto field : obj_result.value()) {
+        if (field.error()) continue;
+        auto key_r = field.unescaped_key();
+        if (key_r.error()) continue;
+        auto key_sv = key_r.value();
+        auto val_r = field.value();
+        if (val_r.error()) continue;
+        auto val = val_r.value();
+        auto type_r = val.type();
+        if (type_r.error()) continue;
+        auto type = type_r.value();
+        if (flatten_objects && type == simdjson::ondemand::json_type::object) {
+            auto nested = val.get_object();
+            if (nested.error()) continue;
+            for (auto nf : nested.value()) {
+                if (nf.error()) continue;
+                auto nk_r = nf.unescaped_key();
+                if (nk_r.error()) continue;
+                auto nk = nk_r.value();
+                auto nv_r = nf.value();
+                if (nv_r.error()) continue;
+                auto nv = nv_r.value();
+                auto nt_r = nv.type();
+                if (nt_r.error()) continue;
+                const std::size_t needed = key_sv.size() + 1 + nk.size();
+                if (needed > sizeof(key_buf)) big_key.resize(needed);
+                char* dst = needed > sizeof(key_buf) ? big_key.data() : key_buf;
+                std::memcpy(dst, key_sv.data(), key_sv.size());
+                dst[key_sv.size()] = '.';
+                std::memcpy(dst + key_sv.size() + 1, nk.data(), nk.size());
+                append_scalar_or_json(builder, hints, pos,
+                                      std::string_view(dst, needed), nv,
+                                      nt_r.value());
+            }
+            continue;
+        }
+        append_scalar_or_json(builder, hints, pos, key_sv, val, type);
+    }
+    builder.end_row();
+    return true;
+}
+
+void collect_query_fields(simdjson::ondemand::document_reference doc,
+                          const Query& query, common::query::ValueMap& out);
+
+// Parse `padded` with iterate_many, append one arrow row per document that
+// passes `query`, and hand each full batch to `yield_one`. When `carry` is
+// non-null it receives the unparsed tail for the caller to prepend next time.
+template <typename Yield>
+void parse_padded_into_arrow(simdjson::ondemand::parser& bulk_parser,
+                             simdjson::padded_string& padded,
+                             const std::optional<Query>& query, bool flatten,
+                             RecordBatchBuilder& builder,
+                             ArrowStringArena& arena,
+                             std::vector<ArrowKeyHint>& hints,
+                             std::size_t batch_size, std::string* carry,
+                             Yield&& yield_one) {
+    auto stream_r = bulk_parser.iterate_many(padded, 1 << 20, false);
+    if (stream_r.error()) {
+        // Nothing could be parsed, so there is no tail to carry over.
+        if (carry) carry->clear();
+        return;
+    }
+    auto& stream = stream_r.value();
+    for (auto parsed : stream) {
+        if (parsed.error()) continue;
+        auto& doc = parsed.value();
+        if (query) {
+            // Evaluate on the referenced fields, then rewind for row build.
+            common::query::ValueMap referenced;
+            collect_query_fields(doc, *query, referenced);
+            if (!query->evaluate(referenced)) continue;
+            doc.rewind();
+        }
+        if (!arrow_row_from_doc(builder, hints, doc, flatten)) continue;
+        if (builder.num_rows() < batch_size) continue;
+        // Batch is full: finish it, recycle the builder, then hand it off.
+        auto batch = builder.finish();
+        arena.clear();
+        if (!builder.is_schema_locked()) builder.lock_schema();
+        builder.reset(true);
+        builder.reserve(batch_size);
+        yield_one(std::move(batch));
+    }
+    if (!carry) return;
+    // Bytes iterate_many reported as truncated at the end of `padded`.
+    const std::size_t total = padded.size();
+    const std::size_t tail = stream.truncated_bytes();
+    if (tail > 0 && tail <= total) {
+        carry->assign(padded.data() + total - tail, padded.data() + total);
+    } else {
+        carry->clear();
+    }
+}
+
+// Return the non-empty lines of `chunk` accepted by `prefilter.may_match`,
+// each terminated by '\n'; the caller turns the result into a padded buffer.
+// For queries with no useful prefilter, the caller should skip this and feed
+// the raw chunk directly.
+std::string collect_matching_lines(std::span<const char> chunk,
+                                   const LinePrefilter& prefilter) {
+    std::string kept;
+    kept.reserve(chunk.size());
+    // `rest` is the unscanned suffix of the chunk.
+    std::string_view rest(chunk.data(), chunk.size());
+    while (!rest.empty()) {
+        // substr clamps at the end, so a missing '\n' yields the whole tail.
+        const std::size_t nl = rest.find('\n');
+        const std::string_view line = rest.substr(0, nl);
+        if (!line.empty() && prefilter.may_match(line)) {
+            kept.append(line);
+            kept.push_back('\n');
+        }
+        if (nl == std::string_view::npos) break;
+        // Step past the newline that ended this line.
+        rest.remove_prefix(nl + 1);
+    }
+    return kept;
+}
+
+// Copy the fields `query` references into `out`, descending one level into
+// top-level objects (children are keyed by their own name, not parent.child).
+void collect_query_fields(simdjson::ondemand::document_reference doc,
+                          const Query& query, common::query::ValueMap& out) {
+    const auto store_if_referenced = [&](std::string_view k,
+                                         simdjson::ondemand::value v) {
+        if (query.references(k)) out[std::string(k)] = ondemand_to_literal(v);
+    };
+    auto root = doc.get_object();
+    if (root.error()) return;
+    for (auto member : root.value()) {
+        if (member.error()) continue;
+        auto name_r = member.unescaped_key();
+        if (name_r.error()) continue;
+        auto value_r = member.value();
+        if (value_r.error()) continue;
+        auto value = value_r.value();
+        auto kind_r = value.type();
+        if (kind_r.error()) continue;
+        if (kind_r.value() != simdjson::ondemand::json_type::object) {
+            store_if_referenced(name_r.value(), value);
+            continue;
+        }
+        auto children = value.get_object();
+        if (children.error()) continue;
+        for (auto child : children.value()) {
+            if (child.error()) continue;
+            auto child_name_r = child.unescaped_key();
+            if (child_name_r.error()) continue;
+            auto child_value_r = child.value();
+            if (child_value_r.error()) continue;
+            store_if_referenced(child_name_r.value(), child_value_r.value());
+        }
+    }
+}
+
+}  // namespace
+
+coro::AsyncGenerator<ArrowExportResult> TraceReader::read_arrow(
+    ReadConfig config, std::size_t batch_size) {  // max rows per batch
+    std::optional<Query> query;
+    if (!config.query.empty()) {  // throws QueryParseError on bad input
+        auto parsed = Query::from_string(config.query);
+        if (!parsed) throw common::query::QueryParseError(parsed.error());
+        query = std::move(*parsed);
+    }
+
+    // When chunk_prune_only is set, dim_stats already proved every event in
+    // the chunk that has the predicate field matches the literal. We still
+    // need to skip events that lack the field (e.g., metadata "ph":"M"
+    // events lack pid), since the original predicate would reject them.
+    std::vector<std::string> presence_check_paths;
+    if (query && config.chunk_prune_only) {
+        const auto& fset = query->fields();
+        presence_check_paths.assign(fset.begin(), fset.end());
+    }
+
+    // For AND-of-EQ predicates, evaluate directly against simdjson without
+    // ValueMap (avoids wyhash + per-field std::string allocation per row).
+    // Falls back to the ValueMap path on unsupported AST shapes.
+    std::vector<CompiledEqProbe> compiled_probes;
+    bool use_compiled = false;
+    if (query && !config.chunk_prune_only) {
+        if (auto p = try_compile_eq_probes(query->root())) {
+            compiled_probes = std::move(*p);
+            use_compiled = !compiled_probes.empty();
+        }
+    }
+
+    bool flatten = config.flatten_objects;
+
+    if (!has_index_) {
+        // Fallback: drive the per-line read_json path and build rows.
+        auto json_gen = read_json(config);  // query is applied by read_json
+        RecordBatchBuilder builder;
+        ArrowStringArena arena;  // NOTE(review): only clear()ed below
+        std::vector<ArrowKeyHint> hints;
+        builder.reserve(batch_size);
+        while (auto opt = co_await json_gen.next()) {
+            if (!arrow_row_from_doc(builder, hints,
+                                    simdjson::ondemand::document_reference(
+                                        opt->parser->raw_document()),
+                                    flatten))
+                continue;
+            if (builder.num_rows() >= batch_size) {
+                co_yield builder.finish();
+                arena.clear();
+                if (!builder.is_schema_locked()) builder.lock_schema();
+                builder.reset(true);
+                builder.reserve(batch_size);
+            }
+        }
+        if (builder.num_rows() > 0) {  // flush the final partial batch
+            co_yield builder.finish();
+        }
+        co_return;
+    }
+
+    // Keep RocksDB alive for the generator's lifetime so per-method opens
+    // in GzipIndexer reuse DBManager's cached handle.
+    std::optional<indexer::IndexDatabase> db_keep_alive;
+    if (has_index_ && !index_path_.empty()) {  // has_index_ is true here
+        try {
+            db_keep_alive.emplace(index_path_,
+                                  rocksdb::RocksDatabase::OpenMode::ReadOnly);
+        } catch (...) {  // best effort: continue without the handle
+        }
+    }
+
+    auto reader = create_indexed_reader();
+    auto chunk_gen = read_chunks_indexed(
+        reader, index_path_, config_.file_path, config, query,
+        /*extend_to_line_boundary=*/config.end_at_checkpoint);
+
+    LinePrefilter prefilter = (query && !config.chunk_prune_only)
+                                  ? build_prefilter(*query)
+                                  : LinePrefilter{};
+    bool have_line_prefilter = !prefilter.empty();
+
+    simdjson::ondemand::parser bulk_parser;
+    RecordBatchBuilder builder;
+    ArrowStringArena arena;  // NOTE(review): only clear()ed in maybe_flush
+    std::vector<ArrowKeyHint> hints;
+    builder.reserve(batch_size);
+
+    auto maybe_flush = [&builder, &arena, batch_size](
+                           bool final) -> std::optional<ArrowExportResult> {
+        if (builder.num_rows() == 0) return std::nullopt;  // nothing to emit
+        if (!final && builder.num_rows() < batch_size) return std::nullopt;
+        auto result = builder.finish();
+        arena.clear();
+        if (!builder.is_schema_locked()) builder.lock_schema();
+        builder.reset(true);
+        builder.reserve(batch_size);
+        return result;
+    };
+
+    bool first_chunk = true;
+    while (auto chunk_opt = co_await chunk_gen.next()) {
+        auto chunk = *chunk_opt;
+        if (chunk.empty()) continue;
+
+        // Work items with start_byte > 0 begin at a deflate-block boundary
+        // that is typically mid-line; the previous worker emitted that
+        // spanning line via its tail-flush, so drop bytes up to (and
+        // including) the first newline in our first chunk.
+        if (first_chunk && config.start_byte > 0 &&
+            config.start_at_checkpoint) {
+            const char* nl = static_cast<const char*>(
+                std::memchr(chunk.data(), '\n', chunk.size()));
+            if (nl) {  // NOTE(review): a chunk without '\n' is parsed as-is
+                std::size_t skip =
+                    static_cast<std::size_t>(nl - chunk.data()) + 1;
+                if (skip < chunk.size()) {
+                    chunk = chunk.subspan(skip);
+                } else {
+                    first_chunk = false;
+                    continue;  // chunk ended exactly at the first newline
+                }
+            }
+        }
+        first_chunk = false;
+
+        simdjson::padded_string padded;
+        if (have_line_prefilter) {
+            auto collected = collect_matching_lines(chunk, prefilter);
+            if (collected.empty()) continue;  // no line passed the prefilter
+            padded = simdjson::padded_string(std::move(collected));
+        } else {
+            auto trimmed = strip_ndjson_bookends(
+                std::string_view(chunk.data(), chunk.size()));
+            if (trimmed.empty()) continue;
+            padded = simdjson::padded_string(trimmed);
+        }
+
+        auto docs_r = bulk_parser.iterate_many(padded, 1 << 20,
+                                               /*allow_comma_separated=*/false);
+        if (docs_r.error()) continue;  // skip chunk on iterate_many error
+        auto& docs = docs_r.value();
+
+        for (auto it = docs.begin(); it != docs.end(); ++it) {
+            auto doc_result = *it;
+            if (doc_result.error()) continue;  // skip undecodable document
+            auto& doc = doc_result.value();
+
+            if (query && !config.chunk_prune_only) {
+                if (use_compiled) {
+                    if (!eval_compiled_eq(compiled_probes, doc)) continue;
+                } else {
+                    common::query::ValueMap fields;
+                    collect_query_fields(doc, *query, fields);
+                    if (!query->evaluate(fields)) continue;
+                }
+                doc.rewind();  // restart iteration for row building
+            } else if (!presence_check_paths.empty()) {
+                bool all_present = true;
+                for (const auto& path : presence_check_paths) {
+                    auto fld = doc.find_field_unordered(path);
+                    if (fld.error()) {
+                        all_present = false;
+                        break;
+                    }
+                }
+                if (!all_present) continue;
+                doc.rewind();  // restart iteration for row building
+            }
+
+            if (!arrow_row_from_doc(builder, hints, doc, flatten)) continue;
+
+            if (auto flushed = maybe_flush(/*final=*/false)) {
+                co_yield std::move(*flushed);
+            }
+        }
+    }
+
+    if (auto flushed = maybe_flush(/*final=*/true)) {
+        co_yield std::move(*flushed);
+    }
+}
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW
+
 }  // namespace dftracer::utils::utilities::reader
diff --git a/src/dftracer/utils/utilities/replay/replay.cpp b/src/dftracer/utils/utilities/replay/replay.cpp
index 2b34e22b..753af3e4 100644
--- a/src/dftracer/utils/utilities/replay/replay.cpp
+++ b/src/dftracer/utils/utilities/replay/replay.cpp
@@ -1,22 +1,21 @@
 #include <dftracer/utils/call_tree/call_tree.h>
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/common/string_intern.h>
+#include <dftracer/utils/utilities/common/json/parser.h>
+#include <dftracer/utils/utilities/composites/dft/event.h>
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/indexer/internal/indexer.h>
-#include <dftracer/utils/utilities/reader/internal/reader_factory.h>
+#include <dftracer/utils/utilities/reader/trace_reader.h>
 #include <dftracer/utils/utilities/replay/replay.h>
-#include <errno.h>
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <unistd.h>
-#include <yyjson.h>
 
 #include <algorithm>
 #include <chrono>
+#include <cstdio>
 #include <cstring>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
 #include <random>
 #include <thread>
 
@@ -24,117 +23,16 @@ namespace dftracer::utils::utilities::replay {
 
 namespace {
 
-/**
- * Trim whitespace and validate JSON string
- */
-bool json_trim_and_validate(const char* input, std::size_t input_length,
-                            const char*& trimmed, std::size_t& trimmed_length) {
-    if (!input || input_length == 0) {
-        return false;
-    }
-
-    // Trim leading whitespace
-    std::size_t start = 0;
-    while (start < input_length &&
-           (input[start] == ' ' || input[start] == '\t' ||
-            input[start] == '\n' || input[start] == '\r')) {
-        start++;
-    }
-
-    // Trim trailing whitespace and comma
-    std::size_t end = input_length;
-    while (end > start && (input[end - 1] == ' ' || input[end - 1] == '\t' ||
-                           input[end - 1] == '\n' || input[end - 1] == '\r' ||
-                           input[end - 1] == ',')) {
-        end--;
-    }
-
-    if (start >= end) {
-        return false;
-    }
-
-    trimmed = input + start;
-    trimmed_length = end - start;
-
-    // Basic validation: must start with '{' and end with '}'
-    if (trimmed[0] != '{' || trimmed[trimmed_length - 1] != '}') {
-        return false;
-    }
-
-    return true;
+// Process-wide intern pool for replay strings, created on first use. Interned
+// values are function names and categories (small bounded cardinality, ~tens)
+// and the fhash/hhash args, whose identity is stable per file.
+dftracer::utils::StringIntern& replay_intern() {
+    static dftracer::utils::StringIntern instance;
+    return instance;
 }
 
-/**
- * Parse a JSON string value from yyjson
- */
-std::string get_json_string(yyjson_val* val, const char* key,
-                            const std::string& default_value = "") {
-    yyjson_val* field = yyjson_obj_get(val, key);
-    if (field && yyjson_is_str(field)) {
-        return yyjson_get_str(field);
-    }
-    return default_value;
-}
-
-/**
- * Parse a JSON uint64 value from yyjson
- */
-std::uint64_t get_json_uint64(yyjson_val* val, const char* key,
-                              std::uint64_t default_value = 0) {
-    yyjson_val* field = yyjson_obj_get(val, key);
-    if (field && yyjson_is_uint(field)) {
-        return yyjson_get_uint(field);
-    } else if (field && yyjson_is_int(field)) {
-        std::int64_t int_val = yyjson_get_int(field);
-        return int_val >= 0 ? static_cast<std::uint64_t>(int_val)
-                            : default_value;
-    }
-    return default_value;
-}
-
-/**
- * Parse a JSON double value from yyjson
- */
-double get_json_double(yyjson_val* val, const char* key,
-                       double default_value = 0.0) {
-    yyjson_val* field = yyjson_obj_get(val, key);
-    if (field && yyjson_is_real(field)) {
-        return yyjson_get_real(field);
-    } else if (field && yyjson_is_uint(field)) {
-        return static_cast<double>(yyjson_get_uint(field));
-    } else if (field && yyjson_is_int(field)) {
-        return static_cast<double>(yyjson_get_int(field));
-    }
-    return default_value;
-}
-
-/**
- * Get a string value from args object
- */
-std::string get_args_string(yyjson_val* root, const char* key,
-                            const std::string& default_value = "") {
-    yyjson_val* args = yyjson_obj_get(root, "args");
-    if (args && yyjson_is_obj(args)) {
-        return get_json_string(args, key, default_value);
-    }
-    return default_value;
-}
-
-/**
- * Get an int64 value from args object
- */
-std::int64_t get_args_int64(yyjson_val* root, const char* key,
-                            std::int64_t default_value = 0) {
-    yyjson_val* args = yyjson_obj_get(root, "args");
-    if (args && yyjson_is_obj(args)) {
-        yyjson_val* field = yyjson_obj_get(args, key);
-        if (field && yyjson_is_int(field)) {
-            return yyjson_get_int(field);
-        } else if (field && yyjson_is_uint(field)) {
-            return static_cast<std::int64_t>(yyjson_get_uint(field));
-        }
-    }
-    return default_value;
+std::string_view intern_sv(std::string_view sv) {
+    return replay_intern().intern(sv);  // view comes from the replay pool
 }
 
 /**
@@ -161,11 +59,12 @@ bool ensure_directory_exists(const std::string& path) {
 // =============================================================================
 
 bool PosixExecutor::execute(const Trace& trace, const ReplayConfig& config) {
-    const std::string& func_name = trace.func_name;
+    std::string_view func_name = trace.func_name;
 
     if (config.dry_run) {
-        DFTRACER_UTILS_LOG_DEBUG("DRY RUN: Would execute POSIX %s",
-                                 func_name.c_str());
+        DFTRACER_UTILS_LOG_DEBUG("DRY RUN: Would execute POSIX %.*s",
+                                 static_cast<int>(func_name.size()),
+                                 func_name.data());
         return true;
     }
 
@@ -186,8 +85,9 @@ bool PosixExecutor::execute(const Trace& trace, const ReplayConfig& config) {
         return execute_stat(trace, config);
     }
 
-    DFTRACER_UTILS_LOG_DEBUG("Unsupported POSIX function: %s",
-                             func_name.c_str());
+    DFTRACER_UTILS_LOG_DEBUG("Unsupported POSIX function: %.*s",
+                             static_cast<int>(func_name.size()),
+                             func_name.data());
     return false;
 }
 
@@ -200,10 +100,17 @@ bool PosixExecutor::execute_open(const Trace& trace,
     DFTRACER_UTILS_LOG_DEBUG("Executing POSIX open");
 
     if (!trace.fhash.empty()) {
-        std::string file_path =
-            config.output_directory.empty()
-                ? ("replay_file_" + trace.fhash)
-                : (config.output_directory + "/replay_file_" + trace.fhash);
+        std::string file_path;
+        if (config.output_directory.empty()) {
+            file_path.reserve(12 + trace.fhash.size());
+            file_path = "replay_file_";
+        } else {
+            file_path.reserve(config.output_directory.size() + 13 +
+                              trace.fhash.size());
+            file_path = config.output_directory;
+            file_path += "/replay_file_";
+        }
+        file_path.append(trace.fhash.data(), trace.fhash.size());
 
         ensure_directory_exists(file_path);
 
@@ -231,13 +138,20 @@ bool PosixExecutor::execute_close(const Trace& trace,
     if (it != open_files_.end()) {
         close(it->second);
         open_files_.erase(it);
-        DFTRACER_UTILS_LOG_DEBUG("Closed file with hash %s",
-                                 trace.fhash.c_str());
+        DFTRACER_UTILS_LOG_DEBUG("Closed file with hash %.*s",
+                                 static_cast<int>(trace.fhash.size()),
+                                 trace.fhash.data());
     }
 
     return true;
 }
 
+// Grows the reusable I/O scratch buffer to at least `size` bytes; bytes added
+// by growth are 'A'. The buffer is never shrunk here.
+void PosixExecutor::ensure_io_buffer(std::size_t size) {
+    if (size > io_buffer_.size()) io_buffer_.resize(size, 'A');
+}
+
 bool PosixExecutor::execute_read(const Trace& trace,
                                  const ReplayConfig& config) {
     DFTRACER_UTILS_LOG_DEBUG("Executing POSIX read (size: %lld)",
@@ -245,10 +159,11 @@ bool PosixExecutor::execute_read(const Trace& trace,
 
     auto it = open_files_.find(trace.fhash);
     if (it != open_files_.end() && trace.size > 0) {
-        std::vector<char> buffer(std::min(static_cast<std::size_t>(trace.size),
-                                          config.max_file_size));
+        std::size_t n = std::min(static_cast<std::size_t>(trace.size),
+                                 config.max_file_size);
+        ensure_io_buffer(n);
         [[maybe_unused]] ssize_t bytes_read =
-            read(it->second, buffer.data(), buffer.size());
+            read(it->second, io_buffer_.data(), n);
         DFTRACER_UTILS_LOG_DEBUG("Read %zd bytes", bytes_read);
     }
 
@@ -264,9 +179,9 @@ bool PosixExecutor::execute_write(const Trace& trace,
     if (it != open_files_.end() && trace.size > 0) {
         std::size_t write_size = std::min(static_cast<std::size_t>(trace.size),
                                           config.max_file_size);
-        std::vector<char> buffer(write_size, 'A');
+        ensure_io_buffer(write_size);
         [[maybe_unused]] ssize_t bytes_written =
-            write(it->second, buffer.data(), buffer.size());
+            write(it->second, io_buffer_.data(), write_size);
         DFTRACER_UTILS_LOG_DEBUG("Wrote %zd bytes", bytes_written);
     }
 
@@ -295,8 +210,9 @@ bool PosixExecutor::execute_stat([[maybe_unused]] const Trace& trace,
     DFTRACER_UTILS_LOG_DEBUG("Executing POSIX stat");
 
     if (!trace.fhash.empty()) {
-        DFTRACER_UTILS_LOG_DEBUG("Would stat file with hash %s",
-                                 trace.fhash.c_str());
+        DFTRACER_UTILS_LOG_DEBUG("Would stat file with hash %.*s",
+                                 static_cast<int>(trace.fhash.size()),
+                                 trace.fhash.data());
     }
 
     return true;
@@ -322,16 +238,17 @@ bool DFTracerExecutor::execute(const Trace& trace, const ReplayConfig& config) {
 
     if (config.no_sleep) {
         if (config.verbose && duration_us >= 100000.0) {
-            std::cout << "DFTracer would sleep for " << std::fixed
-                      << std::setprecision(3) << duration_us / 1000.0
-                      << " ms for " << trace.func_name << " (skipped)"
-                      << std::endl;
+            std::printf("DFTracer would sleep for %.3f ms for %.*s (skipped)\n",
+                        duration_us / 1000.0,
+                        static_cast<int>(trace.func_name.size()),
+                        trace.func_name.data());
         }
     } else {
         if (config.verbose && duration_us >= 100.0) {
-            std::cout << "DFTracer sleeping for " << std::fixed
-                      << std::setprecision(3) << duration_us / 1000.0
-                      << " ms for " << trace.func_name << std::endl;
+            std::printf("DFTracer sleeping for %.3f ms for %.*s\n",
+                        duration_us / 1000.0,
+                        static_cast<int>(trace.func_name.size()),
+                        trace.func_name.data());
         }
         sleep_for_duration(duration_us);
     }
@@ -385,69 +302,106 @@ void ReplayEngine::add_executor(std::unique_ptr<TraceExecutor> executor) {
     executors_.push_back(std::move(executor));
 }
 
-ReplayResult ReplayEngine::replay(const std::string& trace_file,
-                                  const std::string& index_file) {
-    ReplayResult result;
+// Yields one Trace per parseable record, visiting `files` in order. Each
+// file gets its own TraceReader with auto_build_index enabled; records that
+// lack a parser, or that parse_trace_json rejects, are skipped.
+coro::AsyncGenerator<Trace> ReplayEngine::stream_traces(
+    const std::vector<std::string>& files) {
+    for (const auto& path : files) {
+        reader::TraceReaderConfig reader_cfg;
+        reader_cfg.file_path = path;
+        reader_cfg.auto_build_index = true;
+        reader::TraceReader trace_reader(std::move(reader_cfg));
+        auto records = trace_reader.read_json(reader::ReadConfig{});
+        while (auto record = co_await records.next()) {
+            if (record->parser) {
+                Trace parsed;
+                if (parse_trace_json(*record->parser, parsed)) {
+                    co_yield std::move(parsed);
+                }
+            }
+        }
+    }
+}
 
-    DFTRACER_UTILS_LOG_DEBUG("Starting replay of file: %s", trace_file.c_str());
+coro::CoroTask<void> ReplayEngine::run_pipelined(
+    dftracer::utils::CoroScope& scope, const std::vector<std::string>& files,
+    ReplayResult& result, std::size_t channel_capacity) {
+    coro::Channel<Trace> ch_instance(channel_capacity);
+    auto* channel = &ch_instance;
+
+    co_await scope.scope([this, channel, &files,
+                          &result](dftracer::utils::CoroScope& child)
+                             -> coro::CoroTask<void> {
+        // Producer: sends parsed traces; returns early if send() yields false.
+        child.spawn([this, channel, &files](
+                        dftracer::utils::CoroScope&) -> coro::CoroTask<void> {
+            auto producer = channel->producer();
+            auto guard = producer.guard();
+            auto gen = stream_traces(files);
+            while (auto trace = co_await gen.next()) {
+                if (!co_await producer.send(std::move(*trace))) {
+                    co_return;
+                }
+            }
+            co_return;
+        });
+
+        // Consumer: passes each received trace to dispatch_trace with `result`.
+        child.spawn([this, channel, &result](
+                        dftracer::utils::CoroScope&) -> coro::CoroTask<void> {
+            auto consumer = channel->consumer();
+            while (auto item = co_await consumer.receive()) {
+                dispatch_trace(*item, result);
+            }
+            co_return;
+        });
+        co_return;
+    });
 
-    auto start_time = std::chrono::steady_clock::now();
+    co_return;
+}
 
-    try {
-        // Check if the file is compressed
-        bool is_compressed =
-            (trace_file.size() >= 3 &&
-             trace_file.substr(trace_file.size() - 3) == ".gz") ||
-            (trace_file.size() >= 7 &&
-             trace_file.substr(trace_file.size() - 7) == ".tar.gz");
-
-        if (is_compressed) {
-            // Handle compressed files with ReaderFactory
-            std::string index_path =
-                index_file.empty() ? utilities::composites::dft::internal::
-                                         determine_index_path(trace_file, "")
-                                   : index_file;
-
-            auto reader =
-                reader::internal::ReaderFactory::create(trace_file, index_path);
-
-            if (!reader) {
-                result.error_messages.push_back(
-                    "Failed to create reader for file: " + trace_file);
-                return result;
-            }
+namespace {
 
-            // Create line processor for handling trace lines
-            ReplayLineProcessor processor(*this, result);
+// Synchronous driver used by the existing replay(file)/replay(vector) entry
+// points. Pipeline-driven callers use ReplayEngine::run_pipelined instead.
+// Hands every record that carries a parser to engine->process_trace_line; a
+// non-empty index_file is passed to the reader config as its index_dir.
+coro::CoroTask<void> replay_file_async(ReplayEngine* engine,
+                                       std::string trace_file,
+                                       std::string index_file,
+                                       ReplayResult* result) {
+    reader::TraceReaderConfig reader_cfg;
+    reader_cfg.file_path = std::move(trace_file);
+    reader_cfg.auto_build_index = true;
+    if (!index_file.empty()) {
+        reader_cfg.index_dir = std::move(index_file);
+    }
+
+    reader::TraceReader trace_reader(std::move(reader_cfg));
+    auto records = trace_reader.read_json(reader::ReadConfig{});
+    // process_trace_line's return value is not inspected; reading continues.
+    while (auto record = co_await records.next()) {
+        if (record->parser) {
+            engine->process_trace_line(*record->parser, *result);
+        }
+    }
+    co_return;
+}
 
-            // Read all lines using the line processor
-            reader->read_lines_with_processor(0, reader->get_num_lines(),
-                                              processor);
-        } else {
-            // Handle plain text files directly
-            std::ifstream file(trace_file);
-            if (!file.is_open()) {
-                result.error_messages.push_back(
-                    "Failed to open plain text file: " + trace_file);
-                return result;
-            }
+}  // namespace
 
-            std::string line;
-            while (std::getline(file, line)) {
-                // Skip empty lines and bracket lines
-                if (line.empty() || line == "[" || line == "]") {
-                    continue;
-                }
+ReplayResult ReplayEngine::replay(const std::string& trace_file,
+                                  const std::string& index_file) {
+    ReplayResult result;
 
-                // Remove trailing comma if present
-                if (!line.empty() && line.back() == ',') {
-                    line.pop_back();
-                }
+    DFTRACER_UTILS_LOG_DEBUG("Starting replay of file: %s", trace_file.c_str());
 
-                process_trace_line(line, result);
-            }
-        }
+    auto start_time = std::chrono::steady_clock::now();
 
+    try {
+        replay_file_async(this, trace_file, index_file, &result).get();
     } catch (const std::exception& e) {
         result.error_messages.push_back("Exception during replay: " +
                                         std::string(e.what()));
@@ -497,14 +451,17 @@ ReplayResult ReplayEngine::replay(const std::vector<std::string>& trace_files) {
     return aggregated_result;
 }
 
-bool ReplayEngine::process_trace_line(const std::string& line,
+bool ReplayEngine::process_trace_line(common::json::JsonParser& parser,
                                       ReplayResult& result) {
     Trace trace;
-
-    if (!parse_trace_json(line, trace)) {
+    if (!parse_trace_json(parser, trace)) {  // rejected: result untouched
         return false;
     }
+    dispatch_trace(trace, result);  // updates counters, may execute the event
+    return true;  // parsed, even if dispatch_trace filtered or skipped it
+}
 
+void ReplayEngine::dispatch_trace(const Trace& trace, ReplayResult& result) {
     result.total_events++;
     result.function_counts[trace.func_name]++;
     result.category_counts[trace.cat]++;
@@ -536,12 +493,12 @@ bool ReplayEngine::process_trace_line(const std::string& line,
     if (config_.max_events > 0 &&
         result.executed_events >= config_.max_events) {
         // Silently skip - limit already reached
-        return false;
+        return;
     }
 
     if (!should_execute_trace(trace)) {
         result.filtered_events++;
-        return true;
+        return;
     }
 
     // Apply timing logic (skip during dry-run or dftracer-mode)
@@ -550,6 +507,13 @@ bool ReplayEngine::process_trace_line(const std::string& line,
         apply_timing(trace);
     }
 
+    // Fidelity-observation point: callers can hook here to capture the
+    // wall-clock time at which each event is about to be dispatched and
+    // compare it against the trace timeline. No production paths set this.
+    if (config_.on_dispatch) {
+        config_.on_dispatch(trace, std::chrono::steady_clock::now());
+    }
+
     // Find and execute with appropriate executor
     TraceExecutor* executor = find_executor(trace);
     if (executor) {
@@ -565,62 +529,57 @@ bool ReplayEngine::process_trace_line(const std::string& line,
             result.executed_events++;
         } else {
             result.failed_events++;
-            result.error_messages.push_back("Failed to execute " +
-                                            trace.func_name + " with " +
-                                            executor->get_name());
+            std::string msg = "Failed to execute ";
+            msg.append(trace.func_name);
+            msg += " with ";
+            msg += executor->get_name();
+            result.error_messages.push_back(std::move(msg));
         }
     } else {
         result.failed_events++;
         if (config_.verbose) {
             DFTRACER_UTILS_LOG_DEBUG(
-                "No executor found for function: %s (category: %s)",
-                trace.func_name.c_str(), trace.cat.c_str());
+                "No executor found for function: %.*s (category: %.*s)",
+                static_cast<int>(trace.func_name.size()),
+                trace.func_name.data(), static_cast<int>(trace.cat.size()),
+                trace.cat.data());
         }
     }
-
-    return true;
 }
 
-bool ReplayEngine::parse_trace_json(const std::string& json_line,
+bool ReplayEngine::parse_trace_json(common::json::JsonParser& parser,
                                     Trace& trace) {
-    const char* trimmed;
-    std::size_t trimmed_length;
-    if (!json_trim_and_validate(json_line.c_str(), json_line.length(), trimmed,
-                                trimmed_length)) {
-        return false;
-    }
-
-    yyjson_doc* doc = yyjson_read(trimmed, trimmed_length, 0);
-    if (!doc) {
-        return false;
-    }
+    composites::dft::DFTracerEvent ev;
+    // parse_ondemand returns false only when no "ph" was found; other fields
+    // are still populated. Match the legacy DOM-based behavior, which keyed
+    // validity on a non-empty name and treated missing ph as Regular.
+    composites::dft::DFTracerEvent::parse_ondemand(parser, ev);
 
-    yyjson_val* root = yyjson_doc_get_root(doc);
-    if (!root || !yyjson_is_obj(root)) {
-        yyjson_doc_free(doc);
+    if (ev.name.empty()) {
         return false;
     }
 
-    // Parse basic fields
-    trace.func_name = get_json_string(root, "name");
-    trace.cat = get_json_string(root, "cat");
-    std::string phase = get_json_string(root, "ph");
-
-    trace.pid = get_json_uint64(root, "pid");
-    trace.tid = get_json_uint64(root, "tid");
-    trace.time_start = get_json_uint64(root, "ts");
-    trace.duration = get_json_double(root, "dur");
-    trace.time_end =
-        trace.time_start + static_cast<std::uint64_t>(trace.duration);
-
-    // Parse arguments
-    trace.fhash = get_args_string(root, "fhash");
-    trace.hhash = get_args_string(root, "hhash");
-    trace.size = get_args_int64(root, "size", -1);
-    trace.offset = get_args_int64(root, "offset", -1);
-
-    // Determine trace type
-    if (phase == "M") {
+    trace.func_name = intern_sv(ev.name);
+    trace.cat = intern_sv(ev.cat);
+    trace.pid = ev.pid;
+    trace.tid = ev.tid;
+    trace.time_start = ev.ts;
+    trace.duration = static_cast<double>(ev.dur);
+    trace.time_end = trace.time_start + ev.dur;
+
+    // ArgsValueProxy::get<string_view> returns a view directly into the
+    // variant's owned string without copying; we then intern so the view
+    // outlives ev/ArgsMap (which die at the end of this function).
+    auto fhash_sv = ev.args["fhash"].get<std::string_view>(std::string_view{});
+    auto hhash_sv = ev.args["hhash"].get<std::string_view>(std::string_view{});
+    trace.fhash = fhash_sv.empty() ? std::string_view{} : intern_sv(fhash_sv);
+    trace.hhash = hhash_sv.empty() ? std::string_view{} : intern_sv(hhash_sv);
+    trace.size =
+        ev.args["size"].get<std::int64_t>(static_cast<std::int64_t>(-1));
+    trace.offset =
+        ev.args["offset"].get<std::int64_t>(static_cast<std::int64_t>(-1));
+
+    if (ev.ph == "M") {
         if (trace.func_name == "FH") {
             trace.type = TraceType::FileHash;
         } else if (trace.func_name == "HH") {
@@ -632,10 +591,8 @@ bool ReplayEngine::parse_trace_json(const std::string& json_line,
         trace.type = TraceType::Regular;
     }
 
-    trace.is_valid = !trace.func_name.empty();
-
-    yyjson_doc_free(doc);
-    return trace.is_valid;
+    trace.is_valid = true;
+    return true;
 }
 
 void ReplayEngine::apply_timing(const Trace& trace) {
@@ -644,7 +601,16 @@ void ReplayEngine::apply_timing(const Trace& trace) {
     }
 
     if (!first_timestamp_set_) {
+        // Anchor BOTH clocks on the first event. The wall-clock anchor was
+        // initialized at engine construction time, but for any consumer
+        // path with warmup (e.g. Pipeline producer fills, channel hops),
+        // that anchor is "behind" by the warmup gap. Without resetting it
+        // here, the next event sees replay_elapsed >> trace_elapsed and
+        // we never sleep, collapsing the timing model. The trace-time
+        // anchor is set on first event regardless, so co-locating the
+        // wall-clock reset here keeps the two in lockstep.
         first_trace_timestamp_ = trace.time_start;
+        replay_start_time_ = std::chrono::steady_clock::now();
         first_timestamp_set_ = true;
         return;
     }
@@ -669,17 +635,16 @@ void ReplayEngine::apply_timing(const Trace& trace) {
         const std::uint64_t MAX_SLEEP_US = 10 * 1000 * 1000;
         if (sleep_us > MAX_SLEEP_US) {
             if (config_.verbose) {
-                std::cout << "Warning: Capping sleep from "
-                          << static_cast<double>(sleep_us) / 1000.0 << " ms to "
-                          << MAX_SLEEP_US / 1000.0 << " ms" << std::endl;
+                std::printf("Warning: Capping sleep from %.3f ms to %.3f ms\n",
+                            static_cast<double>(sleep_us) / 1000.0,
+                            static_cast<double>(MAX_SLEEP_US) / 1000.0);
             }
             sleep_us = MAX_SLEEP_US;
         }
 
         if (config_.verbose && sleep_us > 1000) {
-            std::cout << "Timing sleep: "
-                      << static_cast<double>(sleep_us) / 1000.0 << " ms"
-                      << std::endl;
+            std::printf("Timing sleep: %.3f ms\n",
+                        static_cast<double>(sleep_us) / 1000.0);
         }
 
         std::this_thread::sleep_for(std::chrono::microseconds(sleep_us));
@@ -735,15 +700,16 @@ bool ReplayEngine::should_execute_trace(const Trace& trace) const {
         return false;
     }
 
-    // Check function filters
     if (!config_.filter_functions.empty()) {
-        if (config_.filter_functions.find(trace.func_name) ==
+        std::string key(trace.func_name);
+        if (config_.filter_functions.find(key) ==
             config_.filter_functions.end()) {
             return false;
         }
     }
     if (!config_.exclude_functions.empty()) {
-        if (config_.exclude_functions.find(trace.func_name) !=
+        std::string key(trace.func_name);
+        if (config_.exclude_functions.find(key) !=
             config_.exclude_functions.end()) {
             return false;
         }
@@ -751,13 +717,15 @@ bool ReplayEngine::should_execute_trace(const Trace& trace) const {
 
     // Check category filters
     if (!config_.filter_categories.empty()) {
-        if (config_.filter_categories.find(trace.cat) ==
+        std::string key(trace.cat);
+        if (config_.filter_categories.find(key) ==
             config_.filter_categories.end()) {
             return false;
         }
     }
     if (!config_.exclude_categories.empty()) {
-        if (config_.exclude_categories.find(trace.cat) !=
+        std::string key(trace.cat);
+        if (config_.exclude_categories.find(key) !=
             config_.exclude_categories.end()) {
             return false;
         }
@@ -945,8 +913,8 @@ void ReplayEngine::replay_call_tree_node(
     ReplayResult& result) {
     // Convert CallTreeNodeInfo to Trace structure
     Trace trace;
-    trace.func_name = node.name;
-    trace.cat = node.category;
+    trace.func_name = intern_sv(node.name);
+    trace.cat = intern_sv(node.category);
     trace.time_start = node.start_time_us;
     trace.duration = static_cast<double>(node.duration_us);
     trace.time_end = trace.time_start + node.duration_us;
@@ -975,13 +943,13 @@ void ReplayEngine::replay_call_tree_node(
     }
 
     auto fhash_it = args.find("fhash");
-    if (fhash_it != args.end()) {
-        trace.fhash = fhash_it->second;
+    if (fhash_it != args.end() && !fhash_it->second.empty()) {
+        trace.fhash = intern_sv(fhash_it->second);
     }
 
     auto hhash_it = args.find("hhash");
-    if (hhash_it != args.end()) {
-        trace.hhash = hhash_it->second;
+    if (hhash_it != args.end() && !hhash_it->second.empty()) {
+        trace.hhash = intern_sv(hhash_it->second);
     }
 
     auto size_it = args.find("size");
@@ -1064,99 +1032,85 @@ void ReplayEngine::replay_call_tree_node(
             result.executed_events++;
         } else {
             result.failed_events++;
-            result.error_messages.push_back("Failed to execute " +
-                                            trace.func_name + " with " +
-                                            executor->get_name());
+            std::string msg = "Failed to execute ";
+            msg.append(trace.func_name);
+            msg += " with ";
+            msg += executor->get_name();
+            result.error_messages.push_back(std::move(msg));
         }
     } else {
         result.failed_events++;
         if (config_.verbose) {
             DFTRACER_UTILS_LOG_DEBUG(
-                "No executor found for function: %s (category: %s)",
-                trace.func_name.c_str(), trace.cat.c_str());
+                "No executor found for function: %.*s (category: %.*s)",
+                static_cast<int>(trace.func_name.size()),
+                trace.func_name.data(), static_cast<int>(trace.cat.size()),
+                trace.cat.data());
         }
     }
 }
 
-// =============================================================================
-// ReplayLineProcessor Implementation
-// =============================================================================
-
-ReplayLineProcessor::ReplayLineProcessor(ReplayEngine& engine,
-                                         ReplayResult& result)
-    : engine_(engine), result_(result) {}
-
-coro::CoroTask<bool> ReplayLineProcessor::process(const char* data,
-                                                  std::size_t length) {
-    std::string line(data, length);
-    co_return engine_.process_trace_line(line, result_);
-}
-
 // =============================================================================
 // ReplayResult::print_summary Implementation
 // =============================================================================
 
 void ReplayResult::print_summary(bool verbose) const {
-    std::cout << "\n=== Replay Summary ===" << std::endl;
-    std::cout << "Total events: " << total_events << std::endl;
-    std::cout << "Executed: " << executed_events << std::endl;
-    std::cout << "Filtered: " << filtered_events << std::endl;
-    std::cout << "Failed: " << failed_events << std::endl;
+    std::printf("\n=== Replay Summary ===\n");
+    std::printf("Total events: %zu\n", total_events);
+    std::printf("Executed: %zu\n", executed_events);
+    std::printf("Filtered: %zu\n", filtered_events);
+    std::printf("Failed: %zu\n", failed_events);
 
     double success_rate = total_events > 0
                               ? (static_cast<double>(executed_events) /
                                  static_cast<double>(total_events) * 100.0)
                               : 0.0;
-    std::cout << "Success rate: " << std::fixed << std::setprecision(2)
-              << success_rate << "%" << std::endl;
+    std::printf("Success rate: %.2f%%\n", success_rate);
 
-    std::cout << "\nTiming:" << std::endl;
-    std::cout << "  Total duration: "
-              << static_cast<double>(total_duration.count()) / 1000.0 << " ms"
-              << std::endl;
-    std::cout << "  Execution duration: "
-              << static_cast<double>(execution_duration.count()) / 1000.0
-              << " ms" << std::endl;
+    std::printf("\nTiming:\n");
+    std::printf("  Total duration: %.3f ms\n",
+                static_cast<double>(total_duration.count()) / 1000.0);
+    std::printf("  Execution duration: %.3f ms\n",
+                static_cast<double>(execution_duration.count()) / 1000.0);
 
     if (first_timestamp != UINT64_MAX && last_timestamp > 0) {
-        std::cout << "  Trace timespan: "
-                  << static_cast<double>(last_timestamp - first_timestamp) /
-                         1000000.0
-                  << " seconds" << std::endl;
+        std::printf(
+            "  Trace timespan: %.6f seconds\n",
+            static_cast<double>(last_timestamp - first_timestamp) / 1000000.0);
     }
 
-    std::cout << "\nI/O Statistics:" << std::endl;
-    std::cout << "  Bytes read: " << total_bytes_read << " ("
-              << static_cast<double>(total_bytes_read) / (1024.0 * 1024.0)
-              << " MB)" << std::endl;
-    std::cout << "  Bytes written: " << total_bytes_written << " ("
-              << static_cast<double>(total_bytes_written) / (1024.0 * 1024.0)
-              << " MB)" << std::endl;
+    std::printf("\nI/O Statistics:\n");
+    std::printf("  Bytes read: %zu (%.2f MB)\n", total_bytes_read,
+                static_cast<double>(total_bytes_read) / (1024.0 * 1024.0));
+    std::printf("  Bytes written: %zu (%.2f MB)\n", total_bytes_written,
+                static_cast<double>(total_bytes_written) / (1024.0 * 1024.0));
 
-    std::cout << "\nProcess/Thread Statistics:" << std::endl;
-    std::cout << "  Unique PIDs: " << pid_counts.size() << std::endl;
-    std::cout << "  Unique TIDs: " << tid_counts.size() << std::endl;
+    std::printf("\nProcess/Thread Statistics:\n");
+    std::printf("  Unique PIDs: %zu\n", pid_counts.size());
+    std::printf("  Unique TIDs: %zu\n", tid_counts.size());
 
     if (verbose) {
         if (!pid_counts.empty()) {
-            std::cout << "\n  Events per PID:" << std::endl;
+            std::printf("\n  Events per PID:\n");
             for (const auto& [pid, count] : pid_counts) {
-                std::cout << "    PID " << pid << ": " << count << " events"
-                          << std::endl;
+                std::printf("    PID %u: %zu events\n", pid, count);
             }
         }
 
         if (!tid_counts.empty() && tid_counts.size() > 1) {
-            std::cout << "\n  Events per TID:" << std::endl;
+            std::printf("\n  Events per TID:\n");
             for (const auto& [tid, count] : tid_counts) {
-                std::cout << "    TID " << tid << ": " << count << " events"
-                          << std::endl;
+                std::printf("    TID %u: %zu events\n", tid, count);
             }
         }
 
         if (!function_counts.empty()) {
-            std::cout << "\n  Top functions by count:" << std::endl;
-            std::vector<std::pair<std::string, std::size_t>> sorted_funcs(
+            std::printf("\n  Top functions by count:\n");
+            // function_counts keys are string_views into the replay intern
+            // pool; sorting needs an indexable copy. Keep the views to avoid
+            // re-allocating strings for the dictionary entries (read,
+            // write, ...).
+            std::vector<std::pair<std::string_view, std::size_t>> sorted_funcs(
                 function_counts.begin(), function_counts.end());
             std::sort(sorted_funcs.begin(), sorted_funcs.end(),
                       [](const auto& a, const auto& b) {
@@ -1166,36 +1120,36 @@ void ReplayResult::print_summary(bool verbose) const {
             std::size_t max_display =
                 std::min(sorted_funcs.size(), std::size_t(10));
             for (std::size_t i = 0; i < max_display; i++) {
-                std::cout << "    " << std::setw(30) << std::left
-                          << sorted_funcs[i].first << ": "
-                          << sorted_funcs[i].second << std::endl;
+                std::printf("    %-30.*s: %zu\n",
+                            static_cast<int>(sorted_funcs[i].first.size()),
+                            sorted_funcs[i].first.data(),
+                            sorted_funcs[i].second);
             }
         }
 
         if (!category_counts.empty()) {
-            std::cout << "\n  Events per category:" << std::endl;
+            std::printf("\n  Events per category:\n");
             for (const auto& [cat, count] : category_counts) {
-                std::cout << "    " << std::setw(20) << std::left << cat << ": "
-                          << count << std::endl;
+                std::printf("    %-20.*s: %zu\n", static_cast<int>(cat.size()),
+                            cat.data(), count);
             }
         }
     }
 
     if (!error_messages.empty()) {
-        std::cout << "\n=== Errors (" << error_messages.size()
-                  << " total) ===" << std::endl;
+        std::printf("\n=== Errors (%zu total) ===\n", error_messages.size());
         std::size_t max_errors =
             std::min(error_messages.size(), std::size_t(10));
         for (std::size_t i = 0; i < max_errors; i++) {
-            std::cout << "  " << error_messages[i] << std::endl;
+            std::printf("  %s\n", error_messages[i].c_str());
         }
         if (error_messages.size() > 10) {
-            std::cout << "  ... and " << (error_messages.size() - 10)
-                      << " more errors" << std::endl;
+            std::printf("  ... and %zu more errors\n",
+                        error_messages.size() - 10);
         }
     }
 
-    std::cout << "=====================" << std::endl;
+    std::printf("=====================\n");
 }
 
 }  // namespace dftracer::utils::utilities::replay
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e23f12fb..f6c2ea51 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -57,6 +57,7 @@ set(TEST_CPP_SOURCES
     # reader/test_reader_tar_comprehensive.cpp
     # Replay tests
     replay/test_replay.cpp
+    replay/test_replay_fidelity.cpp
     # Server unit tests
     server/test_http_parser.cpp
     server/test_http_response.cpp
@@ -115,7 +116,8 @@ foreach(test_file ${TEST_CPP_SOURCES})
   if(bin_exec STREQUAL "pipeline/test_task_scope"
      OR bin_exec STREQUAL "coro/test_channel")
     set(heavy_thread_timeout 180)
-    if(DFTRACER_UTILS_ENABLE_TSAN OR DFTRACER_UTILS_ENABLE_COVERAGE)
+    if(DFTRACER_UTILS_ENABLE_TSAN OR DFTRACER_UTILS_ENABLE_COVERAGE
+       OR DFTRACER_UTILS_ENABLE_ASAN)
       set(heavy_thread_timeout 600)
     endif()
     set_tests_properties(${bin_exec} PROPERTIES
@@ -206,8 +208,15 @@ set(TEST_BINARY_SOURCES
     binaries/test_dftracer_tar.cpp
     binaries/test_dftracer_replay.cpp
     binaries/test_dftracer_comparator.cpp
+    binaries/test_dftracer_call_tree.cpp
     )
 
+if(DFTRACER_UTILS_ENABLE_MPI)
+  list(APPEND TEST_BINARY_SOURCES
+    binaries/test_dftracer_aggregator_mpi.cpp
+    binaries/test_dftracer_call_tree_mpi.cpp)
+endif()
+
 foreach(test_file ${TEST_BINARY_SOURCES})
   string(REPLACE ".cpp" "" bin_exec ${test_file})
   string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" bin_exec ${bin_exec})
@@ -244,8 +253,25 @@ foreach(test_file ${TEST_BINARY_SOURCES})
   if(DFTRACER_UTILS_ENABLE_TSAN)
     set(integration_test_timeout 180)
   endif()
+  # The MPI aggregator spawns mpiexec several times per test case and
+  # each child runs under ASan, which is ~10x slower than release. The
+  # default 120s is not enough.
+  if(bin_exec STREQUAL "binaries/test_dftracer_aggregator_mpi" OR
+     bin_exec STREQUAL "binaries/test_dftracer_call_tree_mpi")
+    set(integration_test_timeout 600)
+  endif()
   set_tests_properties(${bin_exec} PROPERTIES TIMEOUT ${integration_test_timeout})
 
+  # dftracer_tar wraps the gzip indexer's parallel pipeline and has shown
+  # intermittent deadlocks on CI (tar is a secondary path; real workflows
+  # use directories of .pfw.gz). CTest has no REPEAT test property, so
+  # retries must come from the runner: `ctest --repeat until-pass:3`.
+  if(bin_exec STREQUAL "binaries/test_dftracer_tar")
+    set_tests_properties(${bin_exec} PROPERTIES
+      TIMEOUT 240
+      RUN_SERIAL TRUE)
+  endif()
+
   # Pass binary paths so tests can find them
   if(bin_exec STREQUAL "binaries/test_dftracer_server")
     set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
@@ -291,10 +317,19 @@ foreach(test_file ${TEST_BINARY_SOURCES})
       "DFTRACER_TAR_PATH=$<TARGET_FILE:dftracer_tar>")
   elseif(bin_exec STREQUAL "binaries/test_dftracer_replay")
     set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
-      "DFTRACER_REPLAY_PATH=$<TARGET_FILE:dftracer_replay>")
+      "DFTRACER_REPLAY_PATH=$<TARGET_FILE:dftracer_replay>;ASAN_OPTIONS=detect_leaks=0;LSAN_OPTIONS=detect_leaks=0")
   elseif(bin_exec STREQUAL "binaries/test_dftracer_comparator")
     set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
       "DFTRACER_COMPARATOR_PATH=$<TARGET_FILE:dftracer_comparator>")
+  elseif(bin_exec STREQUAL "binaries/test_dftracer_call_tree")
+    set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
+      "DFTRACER_CALL_TREE_PATH=$<TARGET_FILE:dftracer_call_tree>")
+  elseif(bin_exec STREQUAL "binaries/test_dftracer_call_tree_mpi")
+    set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
+      "DFTRACER_CALL_TREE_PATH=$<TARGET_FILE:dftracer_call_tree>;DFTRACER_CALL_TREE_MPI_PATH=$<TARGET_FILE:dftracer_call_tree_mpi>;MPIEXEC_EXECUTABLE=${MPIEXEC_EXECUTABLE};ASAN_OPTIONS=detect_leaks=0;LSAN_OPTIONS=detect_leaks=0")
+  elseif(bin_exec STREQUAL "binaries/test_dftracer_aggregator_mpi")
+    set_tests_properties(${bin_exec} PROPERTIES ENVIRONMENT
+      "DFTRACER_AGGREGATOR_PATH=$<TARGET_FILE:dftracer_aggregator>;DFTRACER_AGGREGATOR_MPI_PATH=$<TARGET_FILE:dftracer_aggregator_mpi>;MPIEXEC_EXECUTABLE=${MPIEXEC_EXECUTABLE};ASAN_OPTIONS=detect_leaks=0;LSAN_OPTIONS=detect_leaks=0")
   endif()
 
 endforeach()
diff --git a/tests/binaries/test_dftracer_aggregator_mpi.cpp b/tests/binaries/test_dftracer_aggregator_mpi.cpp
new file mode 100644
index 00000000..cac9a571
--- /dev/null
+++ b/tests/binaries/test_dftracer_aggregator_mpi.cpp
@@ -0,0 +1,391 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/filesystem.h>
+#include <doctest/doctest.h>
+#include <sys/wait.h>
+#include <testing_utilities.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// ============================================================================
+// Helpers
+// ============================================================================
+
+namespace {
+
+// Rename the env-generated gzip trace to trace_<id>.pfw.gz; "" if none made.
+std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events,
+                          int id) {
+    const auto generated = env.create_dft_test_gzip_file(num_events);
+    if (generated.empty()) return {};
+    std::string target = env.get_dir();
+    target.append("/trace_").append(std::to_string(id)).append(".pfw.gz");
+    fs::rename(generated, target);
+    return target;
+}
+
+// Resolve an executable: the path in $env_name wins when it is set and
+// executable, otherwise the first executable entry of `candidates`; else "".
+std::string find_binary(const char* env_name,
+                        const std::vector<std::string>& candidates) {
+    const auto runnable = [](const char* p) { return ::access(p, X_OK) == 0; };
+    const char* from_env = std::getenv(env_name);
+    if (from_env && runnable(from_env)) return from_env;
+    for (const std::string& candidate : candidates)
+        if (runnable(candidate.c_str())) return candidate;
+    return {};
+}
+
+std::string find_serial_binary() {
+    return find_binary("DFTRACER_AGGREGATOR_PATH",
+                       {
+                           "./dftracer_aggregator",
+                           "../dftracer_aggregator",
+                           "../../dftracer_aggregator",
+                           "../bin/dftracer_aggregator",
+                           "../../bin/dftracer_aggregator",
+                       });
+}
+
+std::string find_mpi_binary() {
+    return find_binary("DFTRACER_AGGREGATOR_MPI_PATH",
+                       {
+                           "./dftracer_aggregator_mpi",
+                           "../dftracer_aggregator_mpi",
+                           "../../dftracer_aggregator_mpi",
+                           "../bin/dftracer_aggregator_mpi",
+                           "../../bin/dftracer_aggregator_mpi",
+                       });
+}
+
+// Locate an MPI launcher: $MPIEXEC_EXECUTABLE when set and executable, else
+// the first of mpiexec / mpirun that `command -v` resolves; "" if neither.
+std::string find_mpi_launcher() {
+    const char* env_path = std::getenv("MPIEXEC_EXECUTABLE");
+    if (env_path != nullptr && ::access(env_path, X_OK) == 0) return env_path;
+    for (const auto& name : {"mpiexec", "mpirun"}) {
+        std::string cmd = std::string("command -v ") + name + " 2>/dev/null";
+        FILE* p = ::popen(cmd.c_str(), "r");
+        if (!p) continue;
+        char buf[4096];
+        std::string out;
+        while (std::fgets(buf, sizeof(buf), p)) out += buf;
+        ::pclose(p);
+        while (!out.empty() && (out.back() == '\n' || out.back() == ' '))
+            out.pop_back();
+        if (!out.empty() && ::access(out.c_str(), X_OK) == 0) return out;
+    }
+    return "";
+}
+
+// Fork/exec `binary` with `args`; returns its exit code (127 if exec failed),
+// or -1 on fork/waitpid failure or abnormal exit. argv is built before fork.
+int run_process(const std::string& binary,
+                const std::vector<std::string>& args) {
+    std::vector<const char*> argv{binary.c_str()};
+    for (const auto& arg : args) argv.push_back(arg.c_str());
+    argv.push_back(nullptr);
+    const pid_t pid = ::fork();
+    if (pid < 0) return -1;
+    if (pid == 0) {
+        ::execv(binary.c_str(), const_cast<char* const*>(argv.data()));
+        ::_exit(127);
+    }
+    int status = 0;
+    if (::waitpid(pid, &status, 0) != pid) return -1;  // never report 0 here
+    return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
+}
+
+int run_mpi(const std::string& launcher, int np, const std::string& binary,
+            const std::vector<std::string>& binary_args) {
+    std::vector<std::string> args = {"--allow-run-as-root", "-n",
+                                     std::to_string(np), binary};
+    for (const auto& a : binary_args) args.push_back(a);
+    return run_process(launcher, args);
+}
+
+// Read a gzip file fully into memory; returns "" on open/read/decode error.
+std::string read_gz_to_string(const std::string& path) {
+    gzFile gz = gzopen(path.c_str(), "rb");
+    if (!gz) return {};
+    std::string out;
+    char buf[1 << 16];
+    int n = 0;
+    while ((n = gzread(gz, buf, sizeof(buf))) > 0)
+        out.append(buf, static_cast<std::size_t>(n));
+    // gzread < 0 is a hard error; gzclose flags a stream cut off mid-member.
+    if (gzclose(gz) != Z_OK || n < 0) out.clear();
+    return out;
+}
+
+// Return `content` with its '\n'-separated lines sorted bytewise, each one
+// '\n'-terminated. Makes comparison independent of row order, which varies
+// with rank count (parallel scan traverses shard ranges in different orders).
+std::string sort_lines(const std::string& content) {
+    std::vector<std::string> rows;
+    for (std::size_t pos = 0; pos < content.size();) {
+        std::size_t eol = content.find('\n', pos);
+        if (eol == std::string::npos) eol = content.size();
+        rows.emplace_back(content, pos, eol - pos);
+        pos = eol + 1;
+    }
+    std::sort(rows.begin(), rows.end());
+    std::string joined;
+    joined.reserve(content.size() + 1);
+    for (const std::string& row : rows) joined.append(row).push_back('\n');
+    return joined;
+}
+
+// Load an aggregator output file and return its lines sorted. Lookup order:
+//   1. `path` ends with ".gz" and exists   -> decompress `path`
+//   2. a sibling `path` + ".gz" exists     -> decompress the sibling
+//   3. otherwise                           -> read `path` as plain text
+// The serial aggregator writes gzip for `-o foo.json.gz` but plain text for
+// `-o foo.json` (without `--compress`); the MPI binary always gzips its
+// final output. Returns "" when the plain-text fallback cannot be opened.
+std::string read_output_sorted(const std::string& path) {
+    const bool named_gz =
+        path.size() >= 3 && path.rfind(".gz") == path.size() - 3;
+    std::string gz_path;
+    if (named_gz && fs::exists(path)) {
+        gz_path = path;
+    } else if (fs::exists(path + ".gz")) {
+        gz_path = path + ".gz";
+    }
+    if (!gz_path.empty()) return sort_lines(read_gz_to_string(gz_path));
+    std::ifstream plain(path, std::ios::binary);
+    if (!plain.is_open()) return {};
+    std::ostringstream raw;
+    raw << plain.rdbuf();
+    return sort_lines(raw.str());
+}
+
+// Tool paths resolved once per test case. `ready` is true only when the
+// serial binary, the MPI binary and an MPI launcher were all found;
+// otherwise `skip_reason` names the first missing piece, checked in that
+// order.
+struct Env {
+    std::string serial_bin;
+    std::string mpi_bin;
+    std::string launcher;
+    bool ready = false;
+    std::string skip_reason;
+
+    Env()
+        : serial_bin(find_serial_binary()),
+          mpi_bin(find_mpi_binary()),
+          launcher(find_mpi_launcher()) {
+        if (serial_bin.empty()) {
+            skip_reason = "dftracer_aggregator binary not found";
+        } else if (mpi_bin.empty()) {
+            skip_reason =
+                "dftracer_aggregator_mpi binary not found (set "
+                "DFTRACER_AGGREGATOR_MPI_PATH or build with "
+                "DFTRACER_UTILS_ENABLE_MPI=ON)";
+        } else if (launcher.empty()) {
+            skip_reason = "no mpiexec/mpirun on PATH";
+        } else {
+            ready = true;
+        }
+    }
+};
+
+// Byte-copy `src` to `dst`. Inputs must be byte-identical between the serial
+// and MPI runs (TestEnvironment seeds randomness internally and does not
+// promise cross-instance reproducibility). False on open or write failure.
+bool copy_file(const std::string& src, const std::string& dst) {
+    std::ifstream in(src, std::ios::binary);
+    std::ofstream out(dst, std::ios::binary);
+    if (!in.is_open() || !out.is_open()) return false;
+    // operator<<(streambuf*) sets failbit when it inserts zero characters.
+    if (in.peek() != std::ifstream::traits_type::eof()) out << in.rdbuf();
+    return out.good();
+}
+
+// Compare two sorted blobs without blowing up doctest output: CHECK
+// gets a bare bool so failure dumps just "false", and we print a
+// compact summary (sizes + first-mismatch line) via MESSAGE.
+void check_outputs_equal(const std::string& ser, const std::string& mpi) {
+    const bool equal = ser == mpi;
+    if (!equal) {
+        std::size_t diff_pos = 0;
+        const std::size_t n = std::min(ser.size(), mpi.size());
+        while (diff_pos < n && ser[diff_pos] == mpi[diff_pos]) ++diff_pos;
+        auto snippet = [&](const std::string& s) -> std::string {
+            // Show up to 120 chars around the first differing byte.
+            if (s.empty()) return "<empty>";
+            std::size_t start = diff_pos > 60 ? diff_pos - 60 : 0;
+            std::size_t len = std::min<std::size_t>(120, s.size() - start);
+            return s.substr(start, len);
+        };
+        MESSAGE("serial bytes=" << ser.size() << " mpi bytes=" << mpi.size()
+                                << " first_diff_offset=" << diff_pos);
+        MESSAGE("serial near diff: " << snippet(ser));
+        MESSAGE("mpi    near diff: " << snippet(mpi));
+    }
+    CHECK(equal);
+}
+
+// Drive one parity test: generate fixtures, clone into sibling dirs,
+// run serial vs MPI, return both sorted outputs. Empty pair on setup
+// failure.
+std::pair<std::string, std::string> run_and_compare(
+    const Env& e, int mpi_ranks, int num_events, int num_files,
+    bool use_shared_staging = false) {
+    dft_utils_test::TestEnvironment src_env(100);
+    if (!src_env.is_valid()) return {};
+    std::vector<std::string> src_files;
+    for (int i = 0; i < num_files; ++i) {
+        auto f = create_pfw_gz(src_env, num_events, i);
+        if (f.empty()) return {};
+        src_files.push_back(f);
+    }
+
+    std::string ser_in = src_env.get_dir() + "/_ser_in";
+    std::string mpi_in = src_env.get_dir() + "/_mpi_in";
+    fs::create_directories(ser_in);
+    fs::create_directories(mpi_in);
+    for (const auto& f : src_files) {
+        std::string name = fs::path(f).filename().string();
+        if (!copy_file(f, ser_in + "/" + name)) return {};
+        if (!copy_file(f, mpi_in + "/" + name)) return {};
+    }
+
+    std::string ser_out = src_env.get_dir() + "/ser.json";
+    std::string ser_idx = src_env.get_dir() + "/ser_idx";
+    int rser = run_process(e.serial_bin, {"-d", ser_in, "--index-dir", ser_idx,
+                                          "-o", ser_out, "--force"});
+    if (rser != 0) return {};
+
+    std::string mpi_out = src_env.get_dir() + "/mpi.json.gz";
+    std::string mpi_idx = src_env.get_dir() + "/mpi_idx";
+    std::string mpi_stg = src_env.get_dir() + "/mpi_stg";
+    std::vector<std::string> mpi_args = {
+        "-d",    mpi_in, "--index-dir", mpi_idx,  "--staging-dir",
+        mpi_stg, "-o",   mpi_out,       "--force"};
+    if (use_shared_staging) {
+        // Force a distinct shared dir so Artifacts::move_to actually runs
+        // (proves aggregation_sst / system_metrics_sst survive the move).
+        mpi_args.push_back("--shared-staging");
+        mpi_args.push_back(src_env.get_dir() + "/mpi_shared_stg");
+    }
+    int rmpi = run_mpi(e.launcher, mpi_ranks, e.mpi_bin, mpi_args);
+    if (rmpi != 0) return {};
+
+    return {read_output_sorted(ser_out), read_output_sorted(mpi_out)};
+}
+
+}  // namespace
+
+// ============================================================================
+// Integration tests
+// ============================================================================
+
+TEST_SUITE("DFTracerAggregatorMpi") {
+    TEST_CASE("binary exists") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        CHECK(!e.mpi_bin.empty());
+        CHECK(!e.launcher.empty());
+    }
+
+    TEST_CASE("basic aggregation (n=1)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        dft_utils_test::TestEnvironment env(100);
+        REQUIRE(env.is_valid());
+        REQUIRE(!create_pfw_gz(env, 100, 0).empty());
+
+        std::string out = env.get_dir() + "/mpi_basic.json.gz";
+        std::string idx = env.get_dir() + "/mpi_basic_idx";
+        std::string stg = env.get_dir() + "/mpi_basic_stg";
+        int rc = run_mpi(e.launcher, 1, e.mpi_bin,
+                         {"-d", env.get_dir(), "--index-dir", idx,
+                          "--staging-dir", stg, "-o", out, "--force"});
+        CHECK(rc == 0);
+        CHECK(fs::exists(out));
+    }
+
+    // Serial vs MPI (n=1): bit-for-bit identical. No cross-rank splitting
+    // exercised; this is the canonical correctness guarantee for the
+    // MPI binary's single-rank mode.
+    TEST_CASE("serial parity (n=1)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        auto [ser, mpi] = run_and_compare(e, /*mpi_ranks=*/1,
+                                          /*num_events=*/200,
+                                          /*num_files=*/1);
+        REQUIRE(!ser.empty());
+        REQUIRE(!mpi.empty());
+        check_outputs_equal(ser, mpi);
+    }
+
+    // Serial vs MPI (n=4): bit-for-bit identical, including cross-rank
+    // splitting of a multi-member .pfw.gz. The power-sum MetricStats
+    // representation makes the merge order-independent.
+    TEST_CASE("serial parity (n=4, cross-rank splitting)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        auto [ser, mpi] = run_and_compare(e, /*mpi_ranks=*/4,
+                                          /*num_events=*/5000,
+                                          /*num_files=*/1);
+        REQUIRE(!ser.empty());
+        REQUIRE(!mpi.empty());
+        check_outputs_equal(ser, mpi);
+    }
+
+    // Multi-file parity: events spread across several input files with
+    // per-file LPT (no cross-rank splitting needed). Serial and MPI
+    // should still produce byte-identical output.
+    TEST_CASE("serial parity (n=2, multiple files)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        auto [ser, mpi] = run_and_compare(e, /*mpi_ranks=*/2,
+                                          /*num_events=*/500,
+                                          /*num_files=*/3);
+        REQUIRE(!ser.empty());
+        REQUIRE(!mpi.empty());
+        check_outputs_equal(ser, mpi);
+    }
+
+    // Shared-staging parity: forces the node-local -> shared-FS relocation
+    // path via Artifacts::move_to. Regression guard for the bug where
+    // aggregation_sst / system_metrics_sst were not listed in move_to and
+    // silently dropped during the move, leaving only root_process records
+    // in the final output.
+    TEST_CASE("serial parity (n=4, shared-staging move path)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        auto [ser, mpi] = run_and_compare(e, /*mpi_ranks=*/4,
+                                          /*num_events=*/5000,
+                                          /*num_files=*/1,
+                                          /*use_shared_staging=*/true);
+        REQUIRE(!ser.empty());
+        REQUIRE(!mpi.empty());
+        check_outputs_equal(ser, mpi);
+    }
+}
diff --git a/tests/binaries/test_dftracer_call_tree.cpp b/tests/binaries/test_dftracer_call_tree.cpp
new file mode 100644
index 00000000..7562acc9
--- /dev/null
+++ b/tests/binaries/test_dftracer_call_tree.cpp
@@ -0,0 +1,124 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/filesystem.h>
+#include <doctest/doctest.h>
+#include <sys/wait.h>
+#include <testing_utilities.h>
+#include <unistd.h>
+
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace {
+
+// $DFTRACER_CALL_TREE_PATH if executable, else first build-tree hit, else "".
+std::string find_binary() {
+    if (const char* from_env = std::getenv("DFTRACER_CALL_TREE_PATH"))
+        if (::access(from_env, X_OK) == 0) return from_env;
+    for (const char* candidate :
+         {"./dftracer_call_tree", "../dftracer_call_tree",
+          "../../dftracer_call_tree", "../bin/dftracer_call_tree",
+          "../../bin/dftracer_call_tree"})
+        if (::access(candidate, X_OK) == 0) return candidate;
+    return {};
+}
+
+// Run `binary args...`; returns exit code (127: exec failed), -1 on error.
+int run_process(const std::string& binary,
+                const std::vector<std::string>& args) {
+    std::vector<const char*> argv{binary.c_str()};
+    for (const auto& a : args) argv.push_back(a.c_str());
+    argv.push_back(nullptr);
+    const pid_t pid = ::fork();
+    if (pid < 0) return -1;
+    if (pid == 0) {
+        ::execv(binary.c_str(), const_cast<char* const*>(argv.data()));
+        ::_exit(127);
+    }
+    int status = 0;
+    if (::waitpid(pid, &status, 0) != pid) return -1;  // never report 0 here
+    return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
+}
+
+// Rename the env-generated gzip trace to trace_<id>.pfw.gz; "" if none made.
+std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events,
+                          int id) {
+    const auto generated = env.create_dft_test_gzip_file(num_events);
+    if (generated.empty()) return {};
+    auto target = env.get_dir() + "/trace_" + std::to_string(id) + ".pfw.gz";
+    fs::rename(generated, target);
+    return target;
+}
+
+// Counts non-empty lines other than "[" / "]" in `path`. Returns -1 if the
+// file cannot be opened or lacks a "[" line or a "]" line (order unchecked).
+int count_events_basic(const std::string& path) {
+    std::ifstream f(path);
+    if (!f.is_open()) return -1;
+    std::string line;
+    bool saw_open = false, saw_close = false;
+    int events = 0;
+    while (std::getline(f, line)) {
+        if (line == "[") {
+            saw_open = true;
+            continue;
+        }
+        if (line == "]") {
+            saw_close = true;
+            continue;
+        }
+        if (!line.empty()) events++;
+    }
+    return (saw_open && saw_close) ? events : -1;
+}
+
+}  // namespace
+
+TEST_SUITE("DFTracerCallTree") {
+    TEST_CASE("binary exists") {
+        std::string bin = find_binary();
+        if (bin.empty()) {
+            MESSAGE("skipping: dftracer_call_tree binary not found");
+            return;
+        }
+        CHECK(!bin.empty());
+    }
+
+    TEST_CASE("basic run produces valid JSON") {
+        std::string bin = find_binary();
+        if (bin.empty()) {
+            MESSAGE("skipping: binary not found");
+            return;
+        }
+        dft_utils_test::TestEnvironment env(100);
+        REQUIRE(env.is_valid());
+        REQUIRE(!create_pfw_gz(env, 100, 0).empty());
+
+        std::string out = env.get_dir() + "/ct.pfw";
+        int rc = run_process(bin, {env.get_dir(), "-o", out});
+        CHECK(rc == 0);
+        REQUIRE(fs::exists(out));
+        CHECK(count_events_basic(out) > 0);
+    }
+
+    TEST_CASE("multi-file input") {
+        std::string bin = find_binary();
+        if (bin.empty()) {
+            MESSAGE("skipping: binary not found");
+            return;
+        }
+        dft_utils_test::TestEnvironment env(100);
+        REQUIRE(env.is_valid());
+        for (int i = 0; i < 3; ++i) {
+            REQUIRE(!create_pfw_gz(env, 200, i).empty());
+        }
+
+        std::string out = env.get_dir() + "/ct.pfw";
+        int rc = run_process(bin, {env.get_dir(), "-o", out});
+        CHECK(rc == 0);
+        REQUIRE(fs::exists(out));
+        CHECK(count_events_basic(out) > 0);
+    }
+}
diff --git a/tests/binaries/test_dftracer_call_tree_mpi.cpp b/tests/binaries/test_dftracer_call_tree_mpi.cpp
new file mode 100644
index 00000000..99ab7f8d
--- /dev/null
+++ b/tests/binaries/test_dftracer_call_tree_mpi.cpp
@@ -0,0 +1,281 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/filesystem.h>
+#include <doctest/doctest.h>
+#include <sys/wait.h>
+#include <testing_utilities.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace {
+
+// Path from $env_name if executable, else first executable candidate, else "".
+std::string find_binary(const char* env_name,
+                        const std::vector<std::string>& candidates) {
+    const char* from_env = std::getenv(env_name);
+    if (from_env != nullptr && ::access(from_env, X_OK) == 0) return from_env;
+    for (const std::string& candidate : candidates)
+        if (::access(candidate.c_str(), X_OK) == 0) return candidate;
+    return {};
+}
+
+std::string find_serial_binary() {
+    return find_binary("DFTRACER_CALL_TREE_PATH",
+                       {"./dftracer_call_tree", "../dftracer_call_tree",
+                        "../../dftracer_call_tree", "../bin/dftracer_call_tree",
+                        "../../bin/dftracer_call_tree"});
+}
+
+std::string find_mpi_binary() {
+    return find_binary(
+        "DFTRACER_CALL_TREE_MPI_PATH",
+        {"./dftracer_call_tree_mpi", "../dftracer_call_tree_mpi",
+         "../../dftracer_call_tree_mpi", "../bin/dftracer_call_tree_mpi",
+         "../../bin/dftracer_call_tree_mpi"});
+}
+
+std::string find_launcher() {
+    const char* env_path = std::getenv("MPIEXEC_EXECUTABLE");
+    if (env_path && ::access(env_path, X_OK) == 0) return env_path;
+    for (const auto& name : {"mpiexec", "mpirun"}) {
+        std::string cmd = std::string("command -v ") + name + " 2>/dev/null";
+        FILE* p = ::popen(cmd.c_str(), "r");
+        if (!p) continue;
+        char buf[4096];
+        std::string out;
+        while (std::fgets(buf, sizeof(buf), p)) out += buf;
+        ::pclose(p);
+        while (!out.empty() && (out.back() == '\n' || out.back() == ' '))
+            out.pop_back();
+        if (!out.empty() && ::access(out.c_str(), X_OK) == 0) return out;
+    }
+    return "";
+}
+
+// Run `binary args...`; returns exit code (127: exec failed), -1 on error.
+int run_process(const std::string& binary,
+                const std::vector<std::string>& args) {
+    std::vector<const char*> argv{binary.c_str()};
+    for (const auto& a : args) argv.push_back(a.c_str());
+    argv.push_back(nullptr);
+    const pid_t pid = ::fork();
+    if (pid < 0) return -1;
+    if (pid == 0) {
+        ::execv(binary.c_str(), const_cast<char* const*>(argv.data()));
+        ::_exit(127);
+    }
+    int status = 0;
+    if (::waitpid(pid, &status, 0) != pid) return -1;  // never report 0 here
+    return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
+}
+
+int run_mpi(const std::string& launcher, int np, const std::string& binary,
+            const std::vector<std::string>& binary_args) {
+    std::vector<std::string> args = {"--allow-run-as-root", "-n",
+                                     std::to_string(np), binary};
+    for (const auto& a : binary_args) args.push_back(a);
+    return run_process(launcher, args);
+}
+
+// Rename the env-generated gzip trace to trace_<id>.pfw.gz; "" if none made.
+std::string create_pfw_gz(dft_utils_test::TestEnvironment& env, int num_events,
+                          int id) {
+    const auto generated = env.create_dft_test_gzip_file(num_events);
+    if (generated.empty()) return {};
+    auto target = env.get_dir() + "/trace_" + std::to_string(id) + ".pfw.gz";
+    fs::rename(generated, target);
+    return target;
+}
+
+// Byte-copy src to dst; false on open/write failure (an empty source is OK).
+bool copy_file(const std::string& src, const std::string& dst) {
+    std::ifstream in(src, std::ios::binary);
+    std::ofstream out(dst, std::ios::binary);
+    if (!in.is_open() || !out.is_open()) return false;
+    return in.peek() == EOF || (out << in.rdbuf()).good();
+}
+
+// Remove a leading "id":<N>, member from an event line. Serial ids are
+// sequential while MPI ids are rank-base + slice stride, so parity is
+// judged on the remaining fields. Lines that do not start with {"id": or
+// have no comma after it (e.g. the header brackets) are returned unchanged.
+std::string strip_event_id(const std::string& line) {
+    constexpr char kIdKey[] = "{\"id\":";
+    constexpr std::size_t kIdKeyLen = sizeof(kIdKey) - 1;
+    if (line.compare(0, kIdKeyLen, kIdKey) != 0) return line;
+    const std::size_t sep = line.find(',', kIdKeyLen);
+    return sep == std::string::npos ? line : "{" + line.substr(sep + 1);
+}
+
+// Read `path` and return its event lines, normalized and sorted. Blank
+// lines and the "[" / "]" brackets are dropped, one trailing comma is
+// removed, and lines starting with {"name":"M" are skipped: metadata events
+// embed a wall-clock timestamp that changes between independent runs.
+// The event id is stripped last. An unreadable file yields an empty vector.
+std::vector<std::string> read_event_lines_sorted(const std::string& path) {
+    static const std::string kMetaPrefix = "{\"name\":\"M\"";
+    std::vector<std::string> events;
+    std::ifstream input(path);
+    for (std::string row; input.is_open() && std::getline(input, row);) {
+        if (row.empty() || row == "[" || row == "]") continue;
+        // Drop the trailing comma so equivalent events match.
+        if (row.back() == ',') row.pop_back();
+        if (row.compare(0, kMetaPrefix.size(), kMetaPrefix) == 0) continue;
+        events.push_back(strip_event_id(row));
+    }
+    std::sort(events.begin(), events.end());
+    return events;
+}
+
+// Tool paths resolved once per test case. `ready` is true only when the
+// serial binary, the MPI binary and an MPI launcher were all found;
+// otherwise `skip_reason` names the first missing piece, checked in that
+// order.
+struct Env {
+    std::string serial_bin, mpi_bin, launcher;
+    bool ready = false;
+    std::string skip_reason;
+
+    Env()
+        : serial_bin(find_serial_binary()),
+          mpi_bin(find_mpi_binary()),
+          launcher(find_launcher()) {
+        if (serial_bin.empty()) {
+            skip_reason = "dftracer_call_tree binary not found";
+        } else if (mpi_bin.empty()) {
+            skip_reason = "dftracer_call_tree_mpi binary not found";
+        } else if (launcher.empty()) {
+            skip_reason = "no mpiexec/mpirun on PATH";
+        } else {
+            ready = true;
+        }
+    }
+};
+
+std::pair<std::vector<std::string>, std::vector<std::string>> run_and_compare(
+    const Env& e, int mpi_ranks, int num_events, int num_files) {
+    dft_utils_test::TestEnvironment src(100);
+    if (!src.is_valid()) return {};
+    std::vector<std::string> srcs;
+    for (int i = 0; i < num_files; ++i) {
+        auto p = create_pfw_gz(src, num_events, i);
+        if (p.empty()) return {};
+        srcs.push_back(p);
+    }
+
+    // Identical input dirs for serial and MPI runs.
+    std::string ser_in = src.get_dir() + "/_ser_in";
+    std::string mpi_in = src.get_dir() + "/_mpi_in";
+    fs::create_directories(ser_in);
+    fs::create_directories(mpi_in);
+    for (const auto& f : srcs) {
+        auto name = fs::path(f).filename().string();
+        if (!copy_file(f, ser_in + "/" + name)) return {};
+        if (!copy_file(f, mpi_in + "/" + name)) return {};
+    }
+
+    std::string ser_out = src.get_dir() + "/ser.pfw";
+    int rs = run_process(e.serial_bin, {ser_in, "-o", ser_out});
+    if (rs != 0) return {};
+
+    std::string mpi_out = src.get_dir() + "/mpi.pfw";
+    std::string mpi_stg = src.get_dir() + "/mpi_stg";
+    int rm = run_mpi(e.launcher, mpi_ranks, e.mpi_bin,
+                     {mpi_in, "-o", mpi_out, "--staging-dir", mpi_stg});
+    if (rm != 0) return {};
+
+    return {read_event_lines_sorted(ser_out), read_event_lines_sorted(mpi_out)};
+}
+
+// Asserts serial/MPI parity; on mismatch logs sizes and up to three diffs.
+void check_parity(const std::vector<std::string>& ser,
+                  const std::vector<std::string>& mpi) {
+    const bool equal = ser == mpi;
+    if (!equal) {
+        MESSAGE("serial events=" << ser.size() << " mpi events=" << mpi.size());
+        const std::size_t common = std::min(ser.size(), mpi.size());
+        std::size_t shown = 0;
+        for (std::size_t i = 0; i < common && shown < 3; ++i) {
+            if (ser[i] == mpi[i]) continue;
+            MESSAGE("diff at index " << i);
+            MESSAGE("  ser: " << ser[i]);
+            MESSAGE("  mpi: " << mpi[i]);
+            ++shown;
+        }
+    }
+    CHECK(equal);
+}
+
+}  // namespace
+
+TEST_SUITE("DFTracerCallTreeMpi") {
+    TEST_CASE("binary exists") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        CHECK(!e.mpi_bin.empty());
+        CHECK(!e.launcher.empty());
+    }
+
+    TEST_CASE("basic MPI run (n=1)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        dft_utils_test::TestEnvironment env(100);
+        REQUIRE(env.is_valid());
+        REQUIRE(!create_pfw_gz(env, 100, 0).empty());
+        std::string out = env.get_dir() + "/mpi.pfw";
+        std::string stg = env.get_dir() + "/stg";
+        int rc = run_mpi(e.launcher, 1, e.mpi_bin,
+                         {env.get_dir(), "-o", out, "--staging-dir", stg});
+        CHECK(rc == 0);
+        CHECK(fs::exists(out));
+    }
+
+    TEST_CASE("serial parity (n=1)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        auto [s, m] = run_and_compare(e, 1, 200, 1);
+        REQUIRE(!s.empty());
+        REQUIRE(!m.empty());
+        check_parity(s, m);
+    }
+
+    TEST_CASE("serial parity (n=2 multi-file)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        auto [s, m] = run_and_compare(e, 2, 200, 4);
+        REQUIRE(!s.empty());
+        REQUIRE(!m.empty());
+        check_parity(s, m);
+    }
+
+    TEST_CASE("serial parity (n=4 multi-file)") {
+        Env e;
+        if (!e.ready) {
+            MESSAGE("skipping: " << e.skip_reason);
+            return;
+        }
+        auto [s, m] = run_and_compare(e, 4, 300, 8);
+        REQUIRE(!s.empty());
+        REQUIRE(!m.empty());
+        check_parity(s, m);
+    }
+}
diff --git a/tests/binaries/test_dftracer_comparator.cpp b/tests/binaries/test_dftracer_comparator.cpp
index 0f599fc0..57a9c2b3 100644
--- a/tests/binaries/test_dftracer_comparator.cpp
+++ b/tests/binaries/test_dftracer_comparator.cpp
@@ -1,10 +1,10 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/core/common/filesystem.h>
 #include <doctest/doctest.h>
+#include <simdjson.h>
 #include <sys/wait.h>
 #include <testing_utilities.h>
 #include <unistd.h>
-#include <yyjson.h>
 
 #include <cstdio>
 #include <cstdlib>
@@ -264,72 +264,75 @@ TEST_SUITE("DFTracerComparator") {
         REQUIRE(!content.empty());
 
         // Parse JSON
-        yyjson_doc* doc = yyjson_read(content.c_str(), content.size(), 0);
-        REQUIRE(doc != nullptr);
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        REQUIRE(root != nullptr);
-        REQUIRE(yyjson_is_obj(root));
+        simdjson::dom::parser parser;
+        auto result = parser.parse(content);
+        REQUIRE(!result.error());
+        auto root = result.value_unsafe();
+        REQUIRE(root.is_object());
 
         // Top-level fields
-        CHECK(yyjson_is_str(yyjson_obj_get(root, "baseline")));
-        CHECK(yyjson_is_str(yyjson_obj_get(root, "variant")));
-        CHECK(yyjson_is_obj(yyjson_obj_get(root, "baseline_meta")));
-        CHECK(yyjson_is_obj(yyjson_obj_get(root, "variant_meta")));
-        CHECK(yyjson_is_num(yyjson_obj_get(root, "execution_time_ms")));
+        CHECK(root["baseline"].is_string());
+        CHECK(root["variant"].is_string());
+        CHECK(root["baseline_meta"].is_object());
+        CHECK(root["variant_meta"].is_object());
+        CHECK(root["execution_time_ms"].is_number());
 
         // Nodes array
-        yyjson_val* nodes = yyjson_obj_get(root, "nodes");
-        REQUIRE(yyjson_is_arr(nodes));
-        REQUIRE(yyjson_arr_size(nodes) > 0);
+        auto nodes = root["nodes"];
+        REQUIRE(!nodes.error());
+        REQUIRE(nodes.is_array());
+        auto nodes_arr = nodes.get_array().value_unsafe();
+        REQUIRE(nodes_arr.size() > 0);
 
         // First node structure
-        yyjson_val* node0 = yyjson_arr_get_first(nodes);
-        REQUIRE(yyjson_is_obj(node0));
-        CHECK(yyjson_is_str(yyjson_obj_get(node0, "name")));
-        CHECK(yyjson_is_str(yyjson_obj_get(node0, "query")));
+        auto node0 = nodes_arr.at(0);
+        REQUIRE(node0.is_object());
+        CHECK(node0["name"].is_string());
+        CHECK(node0["query"].is_string());
 
         // Summary
-        yyjson_val* summary = yyjson_obj_get(node0, "summary");
-        REQUIRE(yyjson_is_obj(summary));
-        yyjson_val* sum_metrics = yyjson_obj_get(summary, "metrics");
-        REQUIRE(yyjson_is_arr(sum_metrics));
-        REQUIRE(yyjson_arr_size(sum_metrics) > 0);
+        auto summary = node0["summary"];
+        REQUIRE(!summary.error());
+        REQUIRE(summary.is_object());
+        auto sum_metrics = summary["metrics"];
+        REQUIRE(!sum_metrics.error());
+        REQUIRE(sum_metrics.is_array());
+        auto sum_metrics_arr = sum_metrics.get_array().value_unsafe();
+        REQUIRE(sum_metrics_arr.size() > 0);
 
         // First metric structure
-        yyjson_val* metric0 = yyjson_arr_get_first(sum_metrics);
-        REQUIRE(yyjson_is_obj(metric0));
-        CHECK(yyjson_is_str(yyjson_obj_get(metric0, "name")));
-        CHECK(yyjson_is_num(yyjson_obj_get(metric0, "baseline")));
-        CHECK(yyjson_is_num(yyjson_obj_get(metric0, "variant")));
-        CHECK(yyjson_is_num(yyjson_obj_get(metric0, "delta")));
-        CHECK(yyjson_is_num(yyjson_obj_get(metric0, "pct_change")));
-        CHECK(yyjson_is_num(yyjson_obj_get(metric0, "cohens_d")));
-        CHECK(yyjson_is_str(yyjson_obj_get(metric0, "significance")));
-        CHECK(yyjson_is_bool(yyjson_obj_get(metric0, "is_regression")));
+        auto metric0 = sum_metrics_arr.at(0);
+        REQUIRE(metric0.is_object());
+        CHECK(metric0["name"].is_string());
+        CHECK(metric0["baseline"].is_number());
+        CHECK(metric0["variant"].is_number());
+        CHECK(metric0["delta"].is_number());
+        CHECK(metric0["pct_change"].is_number());
+        CHECK(metric0["cohens_d"].is_number());
+        CHECK(metric0["significance"].is_string());
+        CHECK(metric0["is_regression"].is_bool());
 
         // Groups array exists
-        yyjson_val* groups = yyjson_obj_get(node0, "groups");
-        CHECK(yyjson_is_arr(groups));
+        CHECK(node0["groups"].is_array());
 
         // Children array exists
-        yyjson_val* children = yyjson_obj_get(node0, "children");
-        CHECK(yyjson_is_arr(children));
+        CHECK(node0["children"].is_array());
 
         // Metadata objects
-        yyjson_val* base_meta = yyjson_obj_get(root, "baseline_meta");
-        REQUIRE(yyjson_is_obj(base_meta));
-        CHECK(yyjson_is_int(yyjson_obj_get(base_meta, "files")));
-        CHECK(yyjson_is_int(yyjson_obj_get(base_meta, "processes")));
-        CHECK(yyjson_is_int(yyjson_obj_get(base_meta, "threads")));
-        CHECK(yyjson_is_num(yyjson_obj_get(base_meta, "total_bytes")));
-        CHECK(yyjson_is_num(yyjson_obj_get(base_meta, "total_io_time_us")));
-        CHECK(yyjson_is_num(yyjson_obj_get(base_meta, "makespan_us")));
-
-        yyjson_val* var_meta = yyjson_obj_get(root, "variant_meta");
-        REQUIRE(yyjson_is_obj(var_meta));
-        CHECK(yyjson_is_int(yyjson_obj_get(var_meta, "files")));
-
-        yyjson_doc_free(doc);
+        auto base_meta = root["baseline_meta"];
+        REQUIRE(!base_meta.error());
+        REQUIRE(base_meta.is_object());
+        CHECK(base_meta["files"].is_number());
+        CHECK(base_meta["processes"].is_number());
+        CHECK(base_meta["threads"].is_number());
+        CHECK(base_meta["total_bytes"].is_number());
+        CHECK(base_meta["total_io_time_us"].is_number());
+        CHECK(base_meta["makespan_us"].is_number());
+
+        auto var_meta = root["variant_meta"];
+        REQUIRE(!var_meta.error());
+        REQUIRE(var_meta.is_object());
+        CHECK(var_meta["files"].is_number());
     }
 
     TEST_CASE("json output - same file deltas are zero") {
@@ -355,24 +358,21 @@ TEST_SUITE("DFTracerComparator") {
         auto content = read_file(output);
         REQUIRE(!content.empty());
 
-        yyjson_doc* doc = yyjson_read(content.c_str(), content.size(), 0);
-        REQUIRE(doc != nullptr);
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        yyjson_val* nodes = yyjson_obj_get(root, "nodes");
-        yyjson_val* node0 = yyjson_arr_get_first(nodes);
-        yyjson_val* summary = yyjson_obj_get(node0, "summary");
-        yyjson_val* metrics = yyjson_obj_get(summary, "metrics");
+        // Checked lookups: a missing or mistyped key must fail the test
+        // rather than leave an invalid array for the loop below.
+        simdjson::dom::parser parser;
+        simdjson::dom::element root;
+        REQUIRE(!parser.parse(content).get(root));
+        simdjson::dom::array metrics_arr;
+        REQUIRE(!root["nodes"].at(0)["summary"]["metrics"].get(metrics_arr));
+        REQUIRE(metrics_arr.size() > 0);
 
         // All deltas should be ~0 when comparing same file
-        std::size_t idx, max;
-        yyjson_val* m;
-        yyjson_arr_foreach(metrics, idx, max, m) {
-            double baseline = yyjson_get_real(yyjson_obj_get(m, "baseline"));
-            double variant = yyjson_get_real(yyjson_obj_get(m, "variant"));
+        for (auto m : metrics_arr) {
+            double baseline = m["baseline"].get_double().value();
+            double variant = m["variant"].get_double().value();
             CHECK(baseline == doctest::Approx(variant).epsilon(0.01));
         }
-
-        yyjson_doc_free(doc);
     }
 
     TEST_CASE("custom time interval") {
diff --git a/tests/binaries/test_dftracer_gen_fake_trace.cpp b/tests/binaries/test_dftracer_gen_fake_trace.cpp
index b8aaa429..6eab51c0 100644
--- a/tests/binaries/test_dftracer_gen_fake_trace.cpp
+++ b/tests/binaries/test_dftracer_gen_fake_trace.cpp
@@ -151,10 +151,10 @@ TEST_SUITE("DFTracerGenFakeTrace") {
         std::string rank0 = out_dir + "/rank_0.pfw.gz";
         REQUIRE(fs::exists(rank0));
 
-        // DFTracer events are JSON objects; first line starts with '{'.
+        // First line is the opening JSON array bracket.
         auto first = gz_first_line(rank0);
         REQUIRE(!first.empty());
-        CHECK(first.front() == '{');
+        CHECK(first.front() == '[');
     }
 
     TEST_CASE("deterministic output with fixed seed") {
diff --git a/tests/binaries/test_dftracer_organize.cpp b/tests/binaries/test_dftracer_organize.cpp
index e45afdf6..642475b2 100644
--- a/tests/binaries/test_dftracer_organize.cpp
+++ b/tests/binaries/test_dftracer_organize.cpp
@@ -289,9 +289,10 @@ TEST_SUITE("DFTracerOrganize") {
         fs::create_directories(org_dir);
         fs::create_directories(rec_dir);
 
-        int rc_org =
-            run_binary(org_binary, {"-d", env.get_dir(), "-o", org_dir,
-                                    "--groups", R"(io:cat == "POSIX")"});
+        // Route all events (POSIX and STDIO) to properly test round-trip
+        int rc_org = run_binary(org_binary,
+                                {"-d", env.get_dir(), "-o", org_dir, "--groups",
+                                 R"(io:cat == "POSIX" || cat == "STDIO")"});
         REQUIRE(rc_org == 0);
 
         int rc_rec = run_binary(
diff --git a/tests/binaries/test_dftracer_server.cpp b/tests/binaries/test_dftracer_server.cpp
index 292f4097..fe17c9dd 100644
--- a/tests/binaries/test_dftracer_server.cpp
+++ b/tests/binaries/test_dftracer_server.cpp
@@ -112,7 +112,7 @@ bool wait_for_port(int port, int timeout_s = 10) {
 
 /// Send a raw HTTP request and receive the response.
 std::string http_request(int port, const std::string& request,
-                         int recv_timeout_s = 2) {
+                         int recv_timeout_s = 15) {
     int sock = ::socket(AF_INET, SOCK_STREAM, 0);
     if (sock < 0) return "";
 
diff --git a/tests/pipeline/test_coro_scope.cpp b/tests/pipeline/test_coro_scope.cpp
index 9f1ae876..f95f5e53 100644
--- a/tests/pipeline/test_coro_scope.cpp
+++ b/tests/pipeline/test_coro_scope.cpp
@@ -123,7 +123,7 @@ TEST_CASE("CoroScope - Producer-consumer with channel (shared_ptr)") {
             auto channel = coro::make_channel<int>(16);
 
             co_await ctx.coro_scope(
-                [&sum, channel](CoroScope& scope) -> coro::CoroTask<void> {
+                [&sum, &channel](CoroScope& scope) -> coro::CoroTask<void> {
                     scope.spawn_producer(
                         channel, [](CoroScope&) -> coro::Generator<int> {
                             for (int i = 1; i <= 10; ++i) {
@@ -164,8 +164,8 @@ TEST_CASE("CoroScope - Transform pipeline") {
             auto output = coro::make_channel<int>(16);
 
             co_await ctx.coro_scope(
-                [&sum, input,
-                 output](CoroScope& scope) -> coro::CoroTask<void> {
+                [&sum, &input,
+                 &output](CoroScope& scope) -> coro::CoroTask<void> {
                     // Producer: 1..5
                     scope.spawn_producer(
                         input, [](CoroScope&) -> coro::Generator<int> {
@@ -242,12 +242,12 @@ TEST_CASE("CoroScope - spawn_producers (N producers, shared_ptr)") {
             auto channel = coro::make_channel<int>(32);
 
             co_await ctx.coro_scope(
-                [&sum, channel](CoroScope& scope) -> coro::CoroTask<void> {
+                [&sum, &channel](CoroScope& scope) -> coro::CoroTask<void> {
                     // 3 producers, each sends its index
                     scope.spawn_producers(
                         channel, 3,
-                        [channel](CoroScope&,
-                                  std::size_t idx) -> coro::CoroTask<void> {
+                        [&channel](CoroScope&,
+                                   std::size_t idx) -> coro::CoroTask<void> {
                             int val = static_cast<int>(idx + 1);
                             co_await channel->send(val);
                             co_return;
diff --git a/tests/python/common.py b/tests/python/common.py
index 82f4aa15..8f1dad55 100644
--- a/tests/python/common.py
+++ b/tests/python/common.py
@@ -11,7 +11,7 @@
 
 import pytest
 
-import dftracer.utils as dft_utils
+from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer
 
 
 def determine_index_path(file_path: str, index_dir: str = "") -> str:
@@ -208,6 +208,106 @@ def create_test_gzip_file_with_nested_json(self):
         self.test_files.append(file_path)
         return file_path
 
+    def create_varying_schema_file(self, filename="varying_schema.pfw.gz", num_events=500):
+        """Create events with varying schemas to test elastic Arrow schema.
+
+        Only some events carry the optional args (offset, size, flags, mode, path,
+        extra_field, rare_field), so the Arrow writer must handle schema evolution.
+        """
+        import json
+
+        file_path = os.path.join(self.temp_dir, filename)
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+        with gzip.open(file_path, "wt", encoding="utf-8") as f:
+            f.write("[\n")
+            for i in range(num_events):
+                name = ["read", "write", "open", "close", "stat"][i % 5]
+                cat = "POSIX"
+
+                event = {
+                    "name": name,
+                    "cat": cat,
+                    "pid": 1000 + i % 4,
+                    "tid": 2000 + i % 8,
+                    "ts": 1000000 + i * 1000,
+                    "dur": (i * 123) % 10000,
+                    "ph": "X",
+                    "args": {"ret": 1024 * i, "hhash": f"hash_{i}"},
+                }
+
+                if name == "read" or name == "write":
+                    event["args"]["offset"] = i * 4096
+                    event["args"]["size"] = 4096
+
+                if name == "open":
+                    event["args"]["flags"] = "O_RDONLY"
+                    event["args"]["mode"] = 0o644
+
+                if name == "stat":
+                    event["args"]["path"] = f"/tmp/file_{i}.txt"
+
+                if i % 7 == 0:
+                    event["args"]["extra_field"] = f"extra_{i}"
+
+                if i % 11 == 0:
+                    event["args"]["rare_field"] = i * 1000
+
+                f.write(json.dumps(event, separators=(",", ":")) + "\n")
+            f.write("]\n")
+
+        self.test_files.append(file_path)
+        return file_path
+
+    def create_dft_trace_file_with_pid(self, filename, pid, num_events=None):
+        """Create a DFTracer trace with a specific PID, hash metadata, and proper aggregation fields."""
+        file_path = os.path.join(self.temp_dir, filename)
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        n = num_events if num_events is not None else self.lines
+        io_names = ["read", "write", "open", "close", "pread", "pwrite", "fread", "fwrite"]
+        cats = ["POSIX", "POSIX", "POSIX", "POSIX", "POSIX", "POSIX", "STDIO", "STDIO"]
+        hhash = f"h{pid}"
+        fhash = f"f{pid}"
+        with gzip.open(file_path, "wt", encoding="utf-8") as f:
+            f.write(
+                f'{{"name":"HH","ph":"M","pid":{pid},"tid":1,"args":{{"name":"host{pid}","value":"{hhash}"}}}}\n'
+            )
+            f.write(
+                f'{{"name":"FH","ph":"M","pid":{pid},"tid":1,"args":{{"name":"/data/file{pid}.dat","value":"{fhash}"}}}}\n'
+            )
+            for i in range(n):
+                name = io_names[i % len(io_names)]
+                cat = cats[i % len(cats)]
+                f.write(
+                    f'{{"name":"{name}","cat":"{cat}","pid":{pid},"tid":{1 + i % 3},'
+                    f'"ts":{1000000 + i * 1000},"dur":{100 + i * 10},'
+                    f'"ph":"X","args":{{"ret":{1024 * (i + 1)},"hhash":"{hhash}","fhash":"{fhash}"}}}}\n'
+                )
+        self.test_files.append(file_path)
+        return file_path
+
+    def create_indexed_traces(self, pids=None, num_events=None):
+        """Create trace files and build full index with aggregation.
+
+        Returns the temp directory path (use as directory= for Indexer).
+        """
+        from dftracer.utils import AggregationConfig, Indexer
+
+        if pids is None:
+            pids = [1]
+        files = []
+        for pid in pids:
+            files.append(
+                self.create_dft_trace_file_with_pid(f"trace_p{pid}.pfw.gz", pid, num_events)
+            )
+        indexer = Indexer(
+            files=files,
+            require_aggregation=AggregationConfig(time_interval_ms=5000),
+            force_rebuild=True,
+        )
+        indexer.ensure_indexed()
+        return self.temp_dir
+
     def get_index_path(self, gz_file_path):
         """Get the `.dftindex` path for a gzip file."""
         return determine_index_path(gz_file_path, "")
@@ -220,7 +320,7 @@ def build_index(self, gz_file_path, checkpoint_size_bytes=None):
         index_path = self.get_index_path(gz_file_path)
 
         try:
-            with dft_utils.Indexer(gz_file_path, index_path, checkpoint_size_bytes) as indexer:
+            with NativeIndexer(gz_file_path, index_path, checkpoint_size_bytes) as indexer:
                 if indexer.need_rebuild():
                     indexer.build()
 
@@ -236,7 +336,7 @@ def create_indexer(self, gz_file_path, checkpoint_size_bytes=None):
             checkpoint_size_bytes = 32 * 1024 * 1024  # 32MB default
 
         try:
-            indexer = dft_utils.Indexer(gz_file_path, checkpoint_size=checkpoint_size_bytes)
+            indexer = NativeIndexer(gz_file_path, checkpoint_size=checkpoint_size_bytes)
             if indexer.need_rebuild():
                 indexer.build()
             return indexer
diff --git a/tests/python/test_aggregator.py b/tests/python/test_aggregator.py
index 1bcd9c4b..3e3a957e 100644
--- a/tests/python/test_aggregator.py
+++ b/tests/python/test_aggregator.py
@@ -1,5 +1,6 @@
 """Tests for AggregatorUtility."""
 
+import gzip
 from pathlib import Path
 from typing import Dict, Tuple
 
@@ -38,8 +39,8 @@ class TestAggregatorUtility:
 
     @staticmethod
     def _write_mixed_counter_trace(env: Environment) -> str:
-        path = Path(env.temp_dir) / "mixed_trace.pfw"
-        path.write_text(
+        path = Path(env.temp_dir) / "mixed_trace.pfw.gz"
+        content = (
             "\n".join(
                 [
                     '{"name":"read","cat":"POSIX","pid":7,"tid":3,"ts":1000,"dur":50,"ph":"X","args":{"ret":64,"bytes":64,"hhash":"event_h","fhash":"event_f"}}',
@@ -47,9 +48,10 @@ def _write_mixed_counter_trace(env: Environment) -> str:
                     '{"name":"mem_bw","cat":"sys","pid":7,"tid":3,"ts":2500,"dur":0,"ph":"C","args":{"count":2,"dur_sum":40,"dur_min":15,"dur_max":25,"ret_sum":600,"ret_min":250,"ret_max":350,"bytes_sum":1200,"bytes_min":500,"bytes_max":700,"hhash":"system_h","fhash":"system_f"}}',
                 ]
             )
-            + "\n",
-            encoding="utf-8",
+            + "\n"
         )
+        with gzip.open(path, "wt", encoding="utf-8") as f:
+            f.write(content)
         env.test_files.append(str(path))
         return str(path)
 
@@ -198,17 +200,18 @@ def test_iter_arrow_emits_separate_event_profile_and_system_batches(self):
 
     def test_process_unions_group_keys_and_custom_metric_columns(self):
         with Environment(lines=0) as env:
-            path = Path(env.temp_dir) / "mixed_schema_trace.pfw"
-            path.write_text(
+            path = Path(env.temp_dir) / "mixed_schema_trace.pfw.gz"
+            content = (
                 "\n".join(
                     [
                         '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"epoch":"1","bytes":4,"hhash":"h1"}}',
                         '{"name":"write","cat":"POSIX","pid":1,"tid":1,"ts":2000,"dur":20,"ph":"X","args":{"ret":8,"step":"2","ops":3,"hhash":"h2"}}',
                     ]
                 )
-                + "\n",
-                encoding="utf-8",
+                + "\n"
             )
+            with gzip.open(path, "wt", encoding="utf-8") as f:
+                f.write(content)
             env.test_files.append(str(path))
 
             table = AggregatorUtility().process(
@@ -235,3 +238,279 @@ def test_process_unions_group_keys_and_custom_metric_columns(self):
             assert by_name["write"]["step"] == "2"
             assert by_name["write"]["bytes_total"] is None
             assert by_name["write"]["ops_total"] == 3
+
+    def test_time_interval_expansion_adds_ci_columns(self):
+        """When querying with smaller interval than stored, CI columns appear."""
+        with Environment(lines=0) as env:
+            # Create trace with events spread across time
+            path = Path(env.temp_dir) / "time_interval_trace.pfw.gz"
+            content = (
+                "\n".join(
+                    [
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":2000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":3000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":4000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                    ]
+                )
+                + "\n"
+            )
+            with gzip.open(path, "wt", encoding="utf-8") as f:
+                f.write(content)
+            env.test_files.append(str(path))
+
+            # First, build index with large time interval (5000ms)
+            _ = AggregatorUtility().process(
+                env.temp_dir,
+                index_dir=env.temp_dir,
+                force_rebuild=True,
+                time_interval_ms=5000.0,
+            )
+
+            # Query with smaller interval (1000ms) - should trigger expansion
+            result = AggregatorUtility().process(
+                env.temp_dir,
+                index_dir=env.temp_dir,
+                force_rebuild=False,
+                time_interval_ms=1000.0,
+            )
+
+            pa = pytest.importorskip("pyarrow")
+            batches = [pa.record_batch(batch) for batch in result.batches()]
+            assert len(batches) > 0
+
+            # CI columns should be present when expansion happened
+            schema_names = set(batches[0].schema.names)
+            assert "count_ci_lower" in schema_names, (
+                f"Missing count_ci_lower. Schema: {schema_names}"
+            )
+            assert "count_ci_upper" in schema_names, (
+                f"Missing count_ci_upper. Schema: {schema_names}"
+            )
+
+            # Verify CI values are sensible (upper >= count >= lower)
+            rows = pa.Table.from_batches(batches).to_pylist()
+            for row in rows:
+                assert row["count_ci_lower"] <= row["count"]
+                assert row["count_ci_upper"] >= row["count"]
+
+    def test_time_interval_shrink_no_ci_columns(self):
+        """When querying with larger interval than stored, no CI columns (lossless)."""
+        with Environment(lines=0) as env:
+            path = Path(env.temp_dir) / "shrink_trace.pfw.gz"
+            content = (
+                "\n".join(
+                    [
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":6000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                    ]
+                )
+                + "\n"
+            )
+            with gzip.open(path, "wt", encoding="utf-8") as f:
+                f.write(content)
+            env.test_files.append(str(path))
+
+            # Build with small interval (1000ms)
+            _ = AggregatorUtility().process(
+                env.temp_dir,
+                index_dir=env.temp_dir,
+                force_rebuild=True,
+                time_interval_ms=1000.0,
+            )
+
+            # Query with larger interval (5000ms) - shrinking is lossless
+            result = AggregatorUtility().process(
+                env.temp_dir,
+                index_dir=env.temp_dir,
+                force_rebuild=False,
+                time_interval_ms=5000.0,
+            )
+
+            pa = pytest.importorskip("pyarrow")
+            batches = [pa.record_batch(batch) for batch in result.batches()]
+            assert len(batches) > 0
+
+            # CI columns should NOT be present for shrinking (lossless)
+            schema_names = set(batches[0].schema.names)
+            assert "count_ci_lower" not in schema_names
+            assert "count_ci_upper" not in schema_names
+
+    def test_process_with_query_filter(self):
+        """Query parameter filters aggregation results."""
+        with Environment(lines=0) as env:
+            path = Path(env.temp_dir) / "query_trace.pfw.gz"
+            content = (
+                "\n".join(
+                    [
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                        '{"name":"compute","cat":"APP","pid":1,"tid":1,"ts":2000,"dur":20,"ph":"X","args":{"ret":8,"hhash":"h2","fhash":"f2"}}',
+                        '{"name":"write","cat":"POSIX","pid":1,"tid":1,"ts":3000,"dur":15,"ph":"X","args":{"ret":6,"hhash":"h3","fhash":"f3"}}',
+                    ]
+                )
+                + "\n"
+            )
+            with gzip.open(path, "wt", encoding="utf-8") as f:
+                f.write(content)
+            env.test_files.append(str(path))
+
+            # Query for POSIX only
+            result = AggregatorUtility().process(
+                env.temp_dir,
+                index_dir=env.temp_dir,
+                force_rebuild=True,
+                query='cat == "POSIX"',
+            )
+
+            pa = pytest.importorskip("pyarrow")
+            batches = [pa.record_batch(batch) for batch in result.batches()]
+            rows = pa.Table.from_batches(batches).to_pylist()
+
+            # Should only have POSIX entries
+            assert len(rows) == 2
+            assert all(row["cat"] == "POSIX" for row in rows)
+
+    def test_iter_arrow_with_query_filter(self):
+        """Query parameter filters streaming results."""
+        with Environment(lines=0) as env:
+            path = Path(env.temp_dir) / "iter_query_trace.pfw.gz"
+            content = (
+                "\n".join(
+                    [
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                        '{"name":"compute","cat":"APP","pid":1,"tid":1,"ts":2000,"dur":20,"ph":"X","args":{"ret":8,"hhash":"h2","fhash":"f2"}}',
+                    ]
+                )
+                + "\n"
+            )
+            with gzip.open(path, "wt", encoding="utf-8") as f:
+                f.write(content)
+            env.test_files.append(str(path))
+
+            util = AggregatorUtility()
+            batches = list(
+                util.iter_arrow(
+                    env.temp_dir,
+                    index_dir=env.temp_dir,
+                    force_rebuild=True,
+                    query='cat == "APP"',
+                )
+            )
+
+            pa = pytest.importorskip("pyarrow")
+            pa_batches = [pa.record_batch(b) for b in batches]
+            rows = pa.Table.from_batches(pa_batches).to_pylist()
+
+            # Should only have APP entries
+            assert len(rows) == 1
+            assert rows[0]["cat"] == "APP"
+
+    def test_write_arrow_creates_files(self):
+        """write_arrow creates Arrow IPC files."""
+        with Environment(lines=20) as env:
+            env.create_test_gzip_file()
+            output_dir = Path(env.temp_dir) / "arrow_output"
+
+            result = AggregatorUtility().write_arrow(
+                env.temp_dir,
+                str(output_dir),
+                index_dir=env.temp_dir,
+                force_rebuild=True,
+            )
+
+            assert "views" in result
+            assert "all" in result["views"]
+            assert result["views"]["all"]["rows"] > 0
+            assert len(result["views"]["all"]["files"]) > 0
+
+            # Verify files are readable
+            ipc = pytest.importorskip("pyarrow.ipc")
+            first_file = result["views"]["all"]["files"][0]
+            with ipc.open_file(first_file) as f:
+                assert f.num_record_batches > 0
+
+    def test_write_arrow_with_views(self):
+        """write_arrow with views creates filtered outputs."""
+        with Environment(lines=0) as env:
+            path = Path(env.temp_dir) / "views_trace.pfw.gz"
+            content = (
+                "\n".join(
+                    [
+                        '{"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":1000,"dur":10,"ph":"X","args":{"ret":4,"hhash":"h1","fhash":"f1"}}',
+                        '{"name":"compute","cat":"APP","pid":1,"tid":1,"ts":2000,"dur":20,"ph":"X","args":{"ret":8,"hhash":"h2","fhash":"f2"}}',
+                        '{"name":"write","cat":"POSIX","pid":1,"tid":1,"ts":3000,"dur":15,"ph":"X","args":{"ret":6,"hhash":"h3","fhash":"f3"}}',
+                    ]
+                )
+                + "\n"
+            )
+            with gzip.open(path, "wt", encoding="utf-8") as f:
+                f.write(content)
+            env.test_files.append(str(path))
+
+            output_dir = Path(env.temp_dir) / "views_output"
+
+            result = AggregatorUtility().write_arrow(
+                env.temp_dir,
+                str(output_dir),
+                index_dir=env.temp_dir,
+                force_rebuild=True,
+                views=[
+                    {"name": "io", "query": 'cat == "POSIX"'},
+                    {"name": "compute", "query": 'cat == "APP"'},
+                ],
+            )
+
+            assert "io" in result["views"]
+            assert "compute" in result["views"]
+            assert result["views"]["io"]["rows"] == 2
+            assert result["views"]["compute"]["rows"] == 1
+
+            # Verify view directories exist
+            assert (output_dir / "io").exists()
+            assert (output_dir / "compute").exists()
+
+            # Verify content is filtered correctly
+            pa = pytest.importorskip("pyarrow")
+            ipc = pytest.importorskip("pyarrow.ipc")
+
+            io_files = list((output_dir / "io").glob("*.arrow"))
+            assert len(io_files) > 0
+            with ipc.open_file(str(io_files[0])) as f:
+                table = pa.Table.from_batches([f.get_batch(i) for i in range(f.num_record_batches)])
+                assert all(row["cat"] == "POSIX" for row in table.to_pylist())
+
+    def test_write_arrow_compression(self):
+        """write_arrow respects compression setting."""
+        with Environment(lines=20) as env:
+            env.create_test_gzip_file()
+
+            # Write with no compression
+            output_none = Path(env.temp_dir) / "arrow_none"
+            result_none = AggregatorUtility().write_arrow(
+                env.temp_dir,
+                str(output_none),
+                index_dir=env.temp_dir,
+                force_rebuild=True,
+                compression="none",
+            )
+
+            # Write with zstd compression
+            output_zstd = Path(env.temp_dir) / "arrow_zstd"
+            result_zstd = AggregatorUtility().write_arrow(
+                env.temp_dir,
+                str(output_zstd),
+                index_dir=env.temp_dir,
+                force_rebuild=False,
+                compression="zstd",
+            )
+
+            # Both should have same row count
+            assert result_none["total_rows"] == result_zstd["total_rows"]
+
+            # Both outputs must be readable; file sizes are not compared here
+            ipc = pytest.importorskip("pyarrow.ipc")
+
+            for result in [result_none, result_zstd]:
+                first_file = result["views"]["all"]["files"][0]
+                with ipc.open_file(first_file) as f:
+                    assert f.num_record_batches > 0
diff --git a/tests/python/test_dask.py b/tests/python/test_dask.py
index 4ebd6a9f..9d3f0108 100644
--- a/tests/python/test_dask.py
+++ b/tests/python/test_dask.py
@@ -18,6 +18,7 @@
     DASK_AVAILABLE = False
 
 import dftracer.utils as dft_utils
+from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer
 
 from .common import Environment
 
@@ -44,7 +45,7 @@ def test_parallel_indexer_creation(self):
             def create_and_build_indexer(gz_file):
                 """Helper function to create and build an indexer"""
                 try:
-                    with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) as indexer:
+                    with NativeIndexer(gz_file, checkpoint_size=256 * 1024) as indexer:
                         if indexer.need_rebuild():
                             indexer.build()
                         return {
@@ -107,7 +108,7 @@ def read_chunk(gz_file_path, start_bytes, end_bytes, reader_type):
                     return {"type": reader_type, "error": str(e), "success": False}
 
             # Get file info from a temporary indexer
-            with dft_utils.Indexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer:
+            with NativeIndexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer:
                 max_bytes = temp_indexer.get_max_bytes()
             chunk_size = max_bytes // 4
 
@@ -184,7 +185,7 @@ def extract_json_data(gz_file_path, start_bytes, end_bytes):
                     return []
 
             # Get file info and create chunks
-            with dft_utils.Indexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer:
+            with NativeIndexer(gz_file, checkpoint_size=512 * 1024) as temp_indexer:
                 max_bytes = temp_indexer.get_max_bytes()
             chunk_size = max_bytes // 4
 
@@ -229,7 +230,7 @@ def test_multiple_batch_sizes_no_duplication(self):
             gz_file = env.create_test_gzip_file(bytes_per_line=512)
             env.build_index(gz_file, checkpoint_size_bytes=256 * 1024)
 
-            with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer:
+            with NativeIndexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer:
                 max_bytes = temp_indexer.get_max_bytes()
 
             # Test various batch sizes including boundary-critical ones
@@ -366,7 +367,7 @@ def test_boundary_edge_cases(self):
             gz_file = env.create_test_gzip_file(bytes_per_line=512)
             env.build_index(gz_file, checkpoint_size_bytes=256 * 1024)
 
-            with dft_utils.Indexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer:
+            with NativeIndexer(gz_file, checkpoint_size=256 * 1024) as temp_indexer:
                 max_bytes = temp_indexer.get_max_bytes()
 
             def process_batch(batch_info):
@@ -446,5 +447,82 @@ def process_batch(batch_info):
             print("Boundary edge case test passed: Complete data recovery, no duplicates")
 
 
+@pytest.mark.skipif(not DASK_AVAILABLE, reason="Dask not available")
+class TestDirectoryIndexerWithDask:
+    """Tests for the directory-level Indexer API with Dask."""
+
+    def test_directory_indexer_indexes_all_files(self):
+        """Test that directory-level Indexer indexes all files in a directory."""
+        with Environment(lines=100) as env:
+            # Create multiple test files in the same directory
+            gz_files = []
+            for i in range(3):
+                gz_file = env.create_test_gzip_file(f"test_{i}.pfw.gz", bytes_per_line=256)
+                gz_files.append(gz_file)
+
+            # Use directory-level Indexer
+            indexer = dft_utils.Indexer(env.temp_dir)
+
+            # Check status before build
+            before = indexer.resolve()
+            assert before.total_files == 3
+            assert len(before.needs_work) == 3
+            assert len(before.ready) == 0
+
+            # Build indexes
+            indexer.build()
+
+            # Check status after build
+            after = indexer.resolve()
+            assert after.total_files == 3
+            assert len(after.ready) == 3
+            assert len(after.needs_work) == 0
+
+    def test_directory_indexer_with_dask_parallel_reading(self):
+        """Test directory-level Indexer followed by parallel reading with Dask."""
+        with Environment(lines=500) as env:
+            # Create test files
+            gz_files = []
+            for i in range(3):
+                gz_file = env.create_test_gzip_file(f"test_{i}.pfw.gz", bytes_per_line=512)
+                gz_files.append(gz_file)
+
+            # Use directory-level Indexer to build all indexes at once
+            indexer = dft_utils.Indexer(env.temp_dir)
+            indexer.ensure_indexed()
+
+            # Verify all files are indexed
+            status = indexer.resolve()
+            assert len(status.ready) == 3
+
+            # Now use Dask for parallel reading
+            def read_file_lines(gz_file):
+                with dft_utils.TraceReader(gz_file) as reader:
+                    return len(reader.read_lines())
+
+            delayed_tasks = [dask.delayed(read_file_lines)(f) for f in gz_files]
+            results = dask.compute(*delayed_tasks)
+
+            # Each file should have 500 events + 2 JSON wrapper lines ([ and ])
+            for line_count in results:
+                assert line_count == 502
+
+    def test_directory_indexer_ensure_indexed_idempotent(self):
+        """Test that ensure_indexed is idempotent - calling multiple times is safe."""
+        with Environment(lines=50) as env:
+            env.create_test_gzip_file()
+
+            indexer = dft_utils.Indexer(env.temp_dir)
+
+            # First call builds the index
+            status1 = indexer.ensure_indexed()
+            assert len(status1.ready) == 1
+
+            # Second call should find everything already indexed
+            status2 = indexer.ensure_indexed()
+            assert len(status2.ready) == 1
+            assert len(status2.needs_work) == 0
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/python/test_distributed_manifest.py b/tests/python/test_distributed_manifest.py
new file mode 100644
index 00000000..c24d183d
--- /dev/null
+++ b/tests/python/test_distributed_manifest.py
@@ -0,0 +1,204 @@
+"""Tests for the distributed-build path producing a unified-shape index.
+
+Covers per-file AGG markers, ensure_indexed no-op after build, and end-to-end
+correctness vs a serial single-node build.
+"""
+
+import os
+
+import pytest
+
+try:
+    import dask  # noqa: F401
+
+    DASK_AVAILABLE = True
+except ImportError:
+    DASK_AVAILABLE = False
+
+from dftracer.utils import AggregationConfig, Indexer
+from dftracer.utils.dask import distributed_index
+
+from .common import Environment
+
+AGG_CFG = AggregationConfig(time_interval_ms=5000)
+
+
+def _build_distributed(env, pids, num_events=100, rebuild_root=True):
+    files = [env.create_dft_trace_file_with_pid(f"trace_p{p}.pfw.gz", p, num_events) for p in pids]
+    index_dir = os.path.join(env.temp_dir, "idx")
+    os.makedirs(index_dir, exist_ok=True)
+    index_path = os.path.join(index_dir, ".dftindex")
+    staging = os.path.join(env.temp_dir, "stage")
+    os.makedirs(staging, exist_ok=True)
+    result = distributed_index(
+        files=files,
+        index_path=index_path,
+        local_staging=staging,
+        lustre_staging=staging,
+        client=None,
+        aggregation_config=AGG_CFG,
+        rebuild_root_summaries=rebuild_root,
+    )
+    return files, index_path, result
+
+
+@pytest.mark.skipif(not DASK_AVAILABLE, reason="Dask not available")
+class TestDistributedIndexUnified:
+    def test_no_manifest_left_behind(self):
+        with Environment(lines=50) as env:
+            _, index_path, _ = _build_distributed(env, pids=[1, 2])
+            manifest_path = os.path.join(index_path, "agg_manifest.json")
+            assert not os.path.exists(manifest_path), (
+                "distributed_index should produce a unified-shape index"
+            )
+
+    def test_aggregation_matches_serial(self):
+        """Distributed build's aggregation data must equal a serial build."""
+        with Environment(lines=200) as env:
+            files_dist, dist_index_path, _ = _build_distributed(env, pids=[1, 2])
+
+            uni_index_dir = os.path.join(env.temp_dir, "idx_uni")
+            os.makedirs(uni_index_dir, exist_ok=True)
+            uni_indexer = Indexer(
+                files=files_dist,
+                index_dir=uni_index_dir,
+                require_aggregation=AGG_CFG,
+                force_rebuild=True,
+            )
+            uni_indexer.ensure_indexed()
+            uni_batches = uni_indexer.iter_arrow_dfanalyzer_all(
+                time_granularity=5.0,
+                time_resolution=1_000_000.0,
+            )
+
+            dist_indexer = Indexer(
+                files=files_dist,
+                index_dir=os.path.dirname(dist_index_path),
+                require_aggregation=AGG_CFG,
+                force_rebuild=False,
+            )
+            dist_batches = dist_indexer.iter_arrow_dfanalyzer_all(
+                time_granularity=5.0,
+                time_resolution=1_000_000.0,
+            )
+
+            import pyarrow as pa
+
+            def _total_count(batches_dict, key):
+                batches = [pa.record_batch(b) for b in batches_dict.get(key, [])]
+                if not batches:
+                    return 0
+                table = pa.Table.from_batches(batches)
+                if "count" in table.column_names:  # pa.compute needs its own import
+                    return int(sum(v or 0 for v in table["count"].to_pylist()))
+                return table.num_rows
+
+            uni_count = _total_count(uni_batches, "events")
+            dist_count = _total_count(dist_batches, "events")
+            assert uni_count == dist_count, (
+                f"event count mismatch: unified={uni_count} distributed={dist_count}"
+            )
+            assert uni_count > 0
+
+    def test_move_artifacts_preserves_per_file_agg_ssts(self):
+        """With cross-FS staging, per-file `aggregation.sst` must not collapse."""
+        with Environment(lines=120) as env:
+            files = [
+                env.create_dft_trace_file_with_pid(f"trace_p{p}.pfw.gz", p, 120)
+                for p in [1, 2, 3, 4]
+            ]
+            local_staging = os.path.join(env.temp_dir, "local_stage")
+            lustre_staging = os.path.join(env.temp_dir, "lustre_stage")
+            os.makedirs(local_staging, exist_ok=True)
+            os.makedirs(lustre_staging, exist_ok=True)
+            index_dir = os.path.join(env.temp_dir, "idx")
+            os.makedirs(index_dir, exist_ok=True)
+            index_path = os.path.join(index_dir, ".dftindex")
+
+            distributed_index(
+                files=files,
+                index_path=index_path,
+                local_staging=local_staging,
+                lustre_staging=lustre_staging,
+                client=None,
+                aggregation_config=AGG_CFG,
+            )
+
+            indexer = Indexer(
+                files=files,
+                index_dir=os.path.dirname(index_path),
+                require_aggregation=AGG_CFG,
+                force_rebuild=False,
+            )
+            batches = indexer.iter_arrow_dfanalyzer_all(
+                time_granularity=5.0,
+                time_resolution=1_000_000.0,
+            )
+            import pyarrow as pa
+
+            event_batches = [pa.record_batch(b) for b in batches.get("events", [])]
+            assert event_batches, "no events: per-file SSTs likely clobbered each other"
+            table = pa.Table.from_batches(event_batches)
+            total = int(sum(v or 0 for v in table["count"].to_pylist()))  # no pa.compute needed
+            assert total > 0
+
+    def test_ensure_indexed_is_noop_after_distributed_build(self):
+        import time as _time
+
+        with Environment(lines=150) as env:
+            files, index_path, _ = _build_distributed(env, pids=[1, 2, 3])
+
+            t0 = _time.monotonic()
+            indexer = Indexer(
+                files=files,
+                index_dir=os.path.dirname(index_path),
+                require_checkpoint=True,
+                require_bloom=True,
+                require_manifest=True,
+                require_aggregation=AGG_CFG,
+                force_rebuild=False,
+            )
+            status = indexer.ensure_indexed()
+            elapsed = _time.monotonic() - t0
+
+            assert status.total_files == len(files)
+            assert len(status.ready) == len(files), (
+                f"post-distributed ensure_indexed wants to rebuild "
+                f"{len(status.needs_work)} files (markers missing?)"
+            )
+            assert len(status.needs_work) == 0
+            assert elapsed < 5.0, (
+                f"ensure_indexed took {elapsed:.2f}s on a {len(files)}-file "
+                "distributed index; likely re-running the build"
+            )
+
+
+@pytest.mark.skipif(not DASK_AVAILABLE, reason="Dask not available")
+class TestDistributedWithDask:
+    def test_multi_worker_with_local_cluster(self):
+        from dask.distributed import Client, LocalCluster
+
+        with LocalCluster(
+            n_workers=2, threads_per_worker=1, dashboard_address=None, processes=True
+        ) as cluster, Client(cluster) as client:
+            with Environment(lines=40) as env:
+                pids = [1, 2, 3, 4]
+                files = [
+                    env.create_dft_trace_file_with_pid(f"trace_p{p}.pfw.gz", p, 40) for p in pids
+                ]
+                index_dir = os.path.join(env.temp_dir, "idx")
+                os.makedirs(index_dir, exist_ok=True)
+                index_path = os.path.join(index_dir, ".dftindex")
+                staging = os.path.join(env.temp_dir, "stage")
+                os.makedirs(staging, exist_ok=True)
+                result = distributed_index(
+                    files=files,
+                    index_path=index_path,
+                    local_staging=staging,
+                    lustre_staging=staging,
+                    client=client,
+                    aggregation_config=AGG_CFG,
+                )
+                assert result["total_files"] == len(files)
+                assert result["artifact_batches"] > 0
+                assert not os.path.exists(os.path.join(index_path, "agg_manifest.json"))
diff --git a/tests/python/test_indexer.py b/tests/python/test_indexer.py
index 8079c833..9951a636 100644
--- a/tests/python/test_indexer.py
+++ b/tests/python/test_indexer.py
@@ -8,129 +8,67 @@
 import pytest
 
 import dftracer.utils as dft_utils
+from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer
 
 from .common import Environment
 
 
-class TestIndexer:
-    """Test cases for Indexer"""
+class TestCheckpointIndexer:
+    """Test cases for checkpoint-level indexer operations via get_checkpoint_indexer"""
 
-    def test_indexer_creation(self):
-        """Test indexer creation"""
-        with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
-
-            # Test basic creation using context manager
-            with dft_utils.Indexer(gz_file, index_path) as indexer:
-                assert indexer.gz_path == gz_file
-                assert indexer.index_path == index_path
-                assert indexer.checkpoint_size > 0
-
-    def test_indexer_creation_with_defaults(self):
-        """Test indexer creation with default parameters"""
-        with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-
-            # Test creation with defaults using context manager
-            with dft_utils.Indexer(gz_file) as indexer:
-                assert indexer.gz_path == gz_file
-                assert indexer.index_path == env.get_index_path(gz_file)
-                assert indexer.checkpoint_size <= 33554432  # Should be <= 32MB default
-
-    def test_indexer_custom_checkpoint_size(self):
-        """Test indexer with custom checkpoint size"""
+    def test_checkpoint_indexer_creation(self):
+        """Test checkpoint indexer creation via Indexer.get_checkpoint_indexer"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
-            checkpoint_size = 1024 * 1024  # 1MB
 
-            with dft_utils.Indexer(gz_file, checkpoint_size=checkpoint_size) as indexer:
-                assert indexer.checkpoint_size <= checkpoint_size
-
-    def test_indexer_nonexistent_file(self):
-        """Test indexer creation with non-existent file"""
-        # Indexer creation doesn't fail, but building should fail
-        with pytest.raises(RuntimeError):
-            dft_utils.Indexer("nonexistent_file.gz")
-
-    def test_indexer_build_and_rebuild(self):
-        """Test indexer build and rebuild functionality"""
-        with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
-
-            with dft_utils.Indexer(gz_file, index_path) as indexer:
-                # Should need rebuild initially
-                assert indexer.need_rebuild()
-
-                # Build the index
-                indexer.build()
-
-                # Index file should exist
-                assert os.path.exists(index_path)
-
-                # Should not need rebuild after building
-                assert not indexer.need_rebuild()
+            with dft_utils.Indexer(files=[gz_file]) as indexer:
+                indexer.ensure_indexed()
+                cp_indexer = indexer.get_checkpoint_indexer(gz_file)
 
-            # Test force rebuild with a new indexer
-            # Note: force_rebuild affects the build process, not need_rebuild() check
-            # The need_rebuild() method checks file consistency, not force_rebuild flag
-            with dft_utils.Indexer(gz_file, index_path, force_rebuild=True) as indexer_force:
-                # Since the index already exists and file hasn't changed, need_rebuild should be False
-                # But force_rebuild will cause a rebuild when build() is called
-                assert not indexer_force.need_rebuild()
-                # The force_rebuild behavior is tested by calling build() which should succeed
-                indexer_force.build()  # This should rebuild due to force_rebuild=True
+                assert cp_indexer.gz_path == gz_file
+                assert cp_indexer.checkpoint_size > 0
 
-    def test_indexer_file_info(self):
-        """Test indexer file information methods"""
+    def test_checkpoint_indexer_file_info(self):
+        """Test checkpoint indexer file information methods"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
 
-            with dft_utils.Indexer(gz_file) as indexer:
-                if indexer.need_rebuild():
-                    indexer.build()
+            with dft_utils.Indexer(files=[gz_file]) as indexer:
+                indexer.ensure_indexed()
+                cp_indexer = indexer.get_checkpoint_indexer(gz_file)
 
-                # Test file info methods
-                max_bytes = indexer.get_max_bytes()
-                num_lines = indexer.get_num_lines()
+                max_bytes = cp_indexer.get_max_bytes()
+                num_lines = cp_indexer.get_num_lines()
 
                 assert isinstance(max_bytes, int)
                 assert isinstance(num_lines, int)
                 assert max_bytes > 0
                 assert num_lines > 0
 
-    def test_indexer_checkpoints(self):
-        """Test indexer checkpoint functionality"""
-        with Environment(lines=100000) as env:  # Larger file for checkpoints
+    def test_checkpoint_indexer_checkpoints(self):
+        """Test checkpoint indexer checkpoint functionality"""
+        with Environment(lines=100000) as env:
             gz_file = env.create_test_gzip_file()
-            checkpoint_size = 256 * 1024  # 256KB checkpoint size
+            checkpoint_size = 256 * 1024  # 256KB
 
-            with dft_utils.Indexer(gz_file, checkpoint_size=checkpoint_size) as indexer:
-                if indexer.need_rebuild():
-                    indexer.build()
+            with dft_utils.Indexer(
+                files=[gz_file],
+                checkpoint_size=checkpoint_size,
+            ) as indexer:
+                indexer.ensure_indexed()
+                cp_indexer = indexer.get_checkpoint_indexer(gz_file)
 
-                # Debug: Check file size and checkpoint configuration
-                max_bytes = indexer.get_max_bytes()
-                num_lines = indexer.get_num_lines()
+                max_bytes = cp_indexer.get_max_bytes()
+                num_lines = cp_indexer.get_num_lines()
                 print(
-                    f"File stats: {max_bytes} bytes, {num_lines} lines, checkpoint_size={checkpoint_size}"
+                    f"File stats: {max_bytes} bytes, {num_lines} lines, "
+                    f"checkpoint_size={checkpoint_size}"
                 )
 
-                # Test get_checkpoints
-                checkpoints = indexer.get_checkpoints()
+                checkpoints = cp_indexer.get_checkpoints()
                 assert isinstance(checkpoints, list)
                 print(f"Number of checkpoints created: {len(checkpoints)}")
 
-                # NOTE: Checkpoint creation depends on deflate block boundaries in the compressed stream,
-                # not just uncompressed file size. This is correct behavior for zlib-based random access.
-                # The indexer may create 0, 1, or multiple checkpoints depending on how gzip compressed
-                # the data and where deflate block boundaries fall relative to the checkpoint size.
-
-                # Test that the API works correctly regardless of checkpoint count
-                assert isinstance(checkpoints, list)
-
-                # Test checkpoint properties if any exist
                 for checkpoint in checkpoints:
                     assert hasattr(checkpoint, "checkpoint_idx")
                     assert hasattr(checkpoint, "uc_offset")
@@ -147,28 +85,28 @@ def test_indexer_checkpoints(self):
                     assert checkpoint.uc_offset >= 0
                     assert checkpoint.num_lines >= 0
 
-    def test_indexer_find_checkpoint(self):
-        """Test indexer single checkpoint search"""
-        with Environment(lines=2000) as env:  # Large file for testing
-            gz_file = env.create_test_gzip_file(bytes_per_line=2048)  # Larger lines
-            checkpoint_size = 512 * 1024  # 512KB checkpoint size
+    def test_checkpoint_indexer_find_checkpoint(self):
+        """Test checkpoint indexer single checkpoint search"""
+        with Environment(lines=2000) as env:
+            gz_file = env.create_test_gzip_file(bytes_per_line=2048)
+            checkpoint_size = 512 * 1024  # 512KB
 
-            with dft_utils.Indexer(gz_file, checkpoint_size=checkpoint_size) as indexer:
-                if indexer.need_rebuild():
-                    indexer.build()
+            with dft_utils.Indexer(
+                files=[gz_file],
+                checkpoint_size=checkpoint_size,
+            ) as indexer:
+                indexer.ensure_indexed()
+                cp_indexer = indexer.get_checkpoint_indexer(gz_file)
 
-                max_bytes = indexer.get_max_bytes()
-                checkpoints = indexer.get_checkpoints()
+                max_bytes = cp_indexer.get_max_bytes()
+                checkpoints = cp_indexer.get_checkpoints()
 
                 print(f"File has {max_bytes} bytes and {len(checkpoints)} checkpoints")
 
-                # Test find_checkpoint API regardless of whether checkpoints exist
                 target_offset = max_bytes // 2 if max_bytes > 0 else 0
-                checkpoint = indexer.find_checkpoint(target_offset)
+                checkpoint = cp_indexer.find_checkpoint(target_offset)
 
-                # The find_checkpoint method should always return either a CheckpointInfo or None
                 if checkpoint is not None:
-                    # If a checkpoint is found, verify its properties
                     assert hasattr(checkpoint, "uc_offset")
                     assert hasattr(checkpoint, "uc_size")
                     assert hasattr(checkpoint, "num_lines")
@@ -177,78 +115,106 @@ def test_indexer_find_checkpoint(self):
                     assert isinstance(checkpoint.uc_size, int)
                     assert isinstance(checkpoint.num_lines, int)
 
-                # Test with offset 0 (per the C++ code, this should return None as a special case)
-                checkpoint_0 = indexer.find_checkpoint(0)
-                # According to indexer.cpp line 1104-1106, target_offset 0 always returns false
-                assert checkpoint_0 is None, (
-                    "find_checkpoint(0) should return None per implementation"
-                )
+                # find_checkpoint(0) should return None per implementation
+                checkpoint_0 = cp_indexer.find_checkpoint(0)
+                assert checkpoint_0 is None
 
-                # Test with offset beyond file size
                 if max_bytes > 0:
-                    checkpoint_beyond = indexer.find_checkpoint(max_bytes + 1000)
-                    # This might return None or the last checkpoint, both are valid
+                    checkpoint_beyond = cp_indexer.find_checkpoint(max_bytes + 1000)
                     if checkpoint_beyond is not None:
                         assert checkpoint_beyond.uc_offset <= max_bytes
 
 
-class TestIndexerIntegration:
-    """Integration tests for indexer with reader"""
+class TestNativeIndexerDirect:
+    """Test native Indexer class directly for low-level operations"""
 
-    def test_indexer_with_reader_creation(self):
-        """Test creating readers from indexer"""
+    def test_native_indexer_creation(self):
+        """Test native indexer creation"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
+            index_path = env.get_index_path(gz_file)
 
-            # Create and build indexer using context manager
-            with dft_utils.Indexer(gz_file) as indexer:
-                if indexer.need_rebuild():
-                    indexer.build()
+            with NativeIndexer(gz_file, index_path) as indexer:
+                assert indexer.gz_path == gz_file
+                assert indexer.index_path == index_path
+                assert indexer.checkpoint_size > 0
 
-                # Test creating reader after indexer builds the shared index store
-                reader = dft_utils.TraceReader(gz_file)
-                assert reader.get_max_bytes() > 0
-                assert reader.file_path == gz_file
+    def test_native_indexer_build_and_rebuild(self):
+        """Test native indexer build and rebuild functionality"""
+        with Environment() as env:
+            gz_file = env.create_test_gzip_file()
+            index_path = env.get_index_path(gz_file)
 
-    def test_indexer_with_reader_creation_context_manager(self):
-        """Test using indexer with reader creation via context manager"""
+            with NativeIndexer(gz_file, index_path) as indexer:
+                assert indexer.need_rebuild()
+                indexer.build()
+                assert os.path.exists(index_path)
+                assert not indexer.need_rebuild()
+
+            with NativeIndexer(gz_file, index_path, force_rebuild=True) as indexer_force:
+                assert not indexer_force.need_rebuild()
+                indexer_force.build()
+
+    def test_native_indexer_nonexistent_file(self):
+        """Test native indexer creation with non-existent file"""
+        with pytest.raises(RuntimeError):
+            NativeIndexer("nonexistent_file.gz")
+
+    def test_native_indexer_build_bloom(self):
+        """Test building with build_bloom=True"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
+            index_path = env.get_index_path(gz_file)
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
+                indexer.build()
+                assert indexer.has_bloom
 
-            # Create and build indexer using context manager
-            with dft_utils.Indexer(gz_file) as indexer:
-                if indexer.need_rebuild():
-                    indexer.build()
+    def test_native_indexer_build_manifest(self):
+        """Test building with build_manifest=True"""
+        with Environment() as env:
+            gz_file = env.create_test_gzip_file()
+            index_path = env.get_index_path(gz_file)
+            with NativeIndexer(gz_file, index_path, build_manifest=True) as indexer:
+                indexer.build()
+                assert indexer.has_manifest
+
+
+class TestCheckpointIndexerIntegration:
+    """Integration tests for checkpoint indexer with reader"""
+
+    def test_checkpoint_indexer_with_reader_creation(self):
+        """Test creating readers from checkpoint indexer"""
+        with Environment() as env:
+            gz_file = env.create_test_gzip_file()
+
+            with dft_utils.Indexer(files=[gz_file]) as indexer:
+                indexer.ensure_indexed()
 
-                # Test creating reader after indexer builds the shared index store
                 reader = dft_utils.TraceReader(gz_file)
                 assert reader.get_max_bytes() > 0
+                assert reader.path == gz_file
 
-    def test_multiple_readers_same_indexer(self):
-        """Test creating multiple readers from the same indexer"""
+    def test_multiple_readers_same_index(self):
+        """Test creating multiple readers from the same index"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
 
-            # Create and build indexer using context manager
-            with dft_utils.Indexer(gz_file) as indexer:
-                if indexer.need_rebuild():
-                    indexer.build()
+            with dft_utils.Indexer(files=[gz_file]) as indexer:
+                indexer.ensure_indexed()
 
-                # Create multiple readers (all use the same shared index store)
                 readers = []
                 for i in range(3):
                     reader = dft_utils.TraceReader(gz_file)
                     assert reader.get_max_bytes() > 0
                     readers.append(reader)
 
-                # All should have same file info
                 max_bytes = readers[0].get_max_bytes()
                 for reader in readers[1:]:
                     assert reader.get_max_bytes() == max_bytes
 
 
-class TestIndexerLifetime:
-    """Python wrapper lifetime should not own the shared index store."""
+class TestCheckpointIndexerLifetime:
+    """Test checkpoint indexer lifetime management"""
 
     def test_indexer_close_releases_wrapper_not_index_store(self):
         """close() should release the Python handle without deleting .dftindex."""
@@ -256,7 +222,7 @@ def test_indexer_close_releases_wrapper_not_index_store(self):
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
 
-            indexer = dft_utils.Indexer(gz_file, index_path)
+            indexer = NativeIndexer(gz_file, index_path)
             assert indexer.need_rebuild()
             indexer.build()
             assert os.path.exists(index_path)
@@ -264,7 +230,7 @@ def test_indexer_close_releases_wrapper_not_index_store(self):
             indexer.close()
             assert os.path.exists(index_path)
 
-            with dft_utils.Indexer(gz_file, index_path) as reopened:
+            with NativeIndexer(gz_file, index_path) as reopened:
                 assert not reopened.need_rebuild()
                 assert reopened.get_num_lines() > 0
 
@@ -274,7 +240,7 @@ def test_indexer_context_exit_keeps_shared_index_store(self):
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
 
-            with dft_utils.Indexer(gz_file, index_path) as indexer:
+            with NativeIndexer(gz_file, index_path) as indexer:
                 if indexer.need_rebuild():
                     indexer.build()
                 assert indexer.get_num_lines() > 0
@@ -285,212 +251,484 @@ def test_indexer_context_exit_keeps_shared_index_store(self):
             assert reader.get_num_lines() > 0
 
 
-class TestIndexerUnified:
-    """Test unified IndexBuilder features via Python Indexer"""
+class TestDirectoryIndexer:
+    """Test cases for the directory-level Indexer API"""
 
-    def test_indexer_build_bloom(self):
-        """Test building with bloom=True"""
+    def test_indexer_creation(self):
+        """Test directory indexer creation"""
+        with Environment() as env:
+            env.create_test_gzip_file()
+            env.create_test_gzip_file()
+
+            indexer = dft_utils.Indexer(env.temp_dir)
+            assert indexer is not None
+
+    def test_indexer_context_manager(self):
+        """Test directory indexer as context manager"""
+        with Environment() as env:
+            env.create_test_gzip_file()
+
+            with dft_utils.Indexer(env.temp_dir) as indexer:
+                assert indexer is not None
+
+    def test_indexer_resolve(self):
+        """Test resolve() returns IndexStatus"""
+        with Environment() as env:
+            env.create_test_gzip_file()
+
+            with dft_utils.Indexer(env.temp_dir) as indexer:
+                status = indexer.resolve()
+                assert isinstance(status, dft_utils.IndexStatus)
+                assert status.total_files >= 1
+                assert len(status.needs_work) >= 1
+
+    def test_indexer_build(self):
+        """Test build() creates indexes"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            print(f"\nCreated test file: {gz_file}")
+            print(f"Directory: {env.temp_dir}")
+            print(f"Files in dir: {os.listdir(env.temp_dir)}")
+
+            with dft_utils.Indexer(env.temp_dir) as indexer:
+                status_before = indexer.resolve()
+                print(f"Before build: {status_before}")
+                assert len(status_before.needs_work) >= 1
+                assert status_before.index_path != ""
+
                 indexer.build()
-                assert indexer.has_bloom
 
-    def test_indexer_build_manifest(self):
-        """Test building with manifest=True"""
+                assert os.path.isdir(status_before.index_path), (
+                    f"Index dir not created: {status_before.index_path}"
+                )
+                print(f"Index dir contents: {os.listdir(status_before.index_path)}")
+
+                status_after = indexer.resolve()
+                print(f"After build: {status_after}")
+                assert len(status_after.ready) >= 1, (
+                    f"Expected ready>=1, got {len(status_after.ready)}"
+                )
+
+    def test_indexer_ensure_indexed(self):
+        """Test ensure_indexed() builds if needed"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+            env.create_test_gzip_file()
+
+            with dft_utils.Indexer(env.temp_dir) as indexer:
+                status = indexer.ensure_indexed()
+                assert isinstance(status, dft_utils.IndexStatus)
+                assert len(status.ready) >= 1
+
+    def test_indexer_with_require_bloom(self):
+        """Test indexer with bloom filter requirement"""
+        with Environment() as env:
+            env.create_test_gzip_file()
+
+            with dft_utils.Indexer(env.temp_dir, require_bloom=True) as indexer:
+                status = indexer.ensure_indexed()
+                assert len(status.ready) >= 1
+
+    def test_indexer_with_require_manifest(self):
+        """Test indexer with manifest requirement"""
+        with Environment() as env:
+            env.create_test_gzip_file()
+
+            with dft_utils.Indexer(env.temp_dir, require_manifest=True) as indexer:
+                status = indexer.ensure_indexed()
+                assert len(status.ready) >= 1
+
+    def test_indexer_with_aggregation_config(self):
+        """Test indexer with aggregation config"""
+        with Environment() as env:
+            env.create_test_gzip_file()
+
+            agg_config = dft_utils.AggregationConfig(
+                time_interval_ms=1000.0,
+                compute_percentiles=False,
+            )
             with dft_utils.Indexer(
-                gz_file, index_path, build_manifest=True, index_threshold=0
+                env.temp_dir,
+                require_aggregation=agg_config,
             ) as indexer:
-                indexer.build()
-                assert indexer.has_manifest
+                assert indexer.aggregation_config is not None
+                assert indexer.aggregation_config.time_interval_ms == 1000.0
 
-    def test_indexer_build_bloom_and_manifest(self):
-        """Test building with both bloom and manifest"""
+    def test_indexer_aggregation_true(self):
+        """Test indexer with require_aggregation=True uses defaults"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+            env.create_test_gzip_file()
+
             with dft_utils.Indexer(
-                gz_file,
-                index_path,
-                build_bloom=True,
-                build_manifest=True,
-                index_threshold=0,
+                env.temp_dir,
+                require_aggregation=True,
             ) as indexer:
-                indexer.build()
-                assert indexer.has_bloom
-                assert indexer.has_manifest
+                assert indexer.aggregation_config is not None
+                assert indexer.aggregation_config.time_interval_ms == 5000.0
+
+    def test_index_status_dataclass(self):
+        """Test IndexStatus dataclass"""
+        status = dft_utils.IndexStatus(
+            total_files=5,
+            ready=["a.pfw.gz", "b.pfw.gz"],
+            needs_work=["c.pfw.gz"],
+            index_path="/tmp/index",
+        )
+        assert status.total_files == 5
+        assert len(status.ready) == 2
+        assert len(status.needs_work) == 1
+        assert status.index_path == "/tmp/index"
+
+    def test_aggregation_config_dataclass(self):
+        """Test AggregationConfig dataclass"""
+        config = dft_utils.AggregationConfig(
+            time_interval_ms=2000.0,
+            group_keys=["host", "rank"],
+            custom_metric_fields=["bytes"],
+            compute_percentiles=True,
+        )
+        assert config.time_interval_ms == 2000.0
+        assert config.group_keys == ["host", "rank"]
+        assert config.custom_metric_fields == ["bytes"]
+        assert config.compute_percentiles is True
+
+    def test_indexer_with_files_list(self):
+        """Test indexer with explicit file list instead of directory"""
+        with Environment() as env:
+            file_path = env.create_test_gzip_file()
+
+            with dft_utils.Indexer(
+                files=[file_path],
+                index_dir=env.temp_dir,
+            ) as indexer:
+                status = indexer.resolve()
+                assert status.total_files == 1
 
-    def test_indexer_no_bloom_by_default(self):
-        """Test that bloom is not built when build_bloom is omitted"""
+    def test_indexer_files_and_directory(self):
+        """Test indexer with both files and directory (files take precedence)"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(gz_file, index_path, index_threshold=0) as indexer:
-                indexer.build()
-                assert not indexer.has_bloom
+            file_path = env.create_test_gzip_file()
 
-    def test_indexer_no_manifest_by_default(self):
-        """Test that manifest is not built when build_manifest is omitted"""
+            with dft_utils.Indexer(
+                directory=env.temp_dir,
+                files=[file_path],
+            ) as indexer:
+                status = indexer.resolve()
+                assert status.total_files >= 1
+
+    def test_indexer_requires_directory_or_files(self):
+        """Test that indexer requires at least directory or files"""
+        with pytest.raises(ValueError, match="directory.*files"):
+            dft_utils.Indexer()
+
+    def test_indexer_get_checkpoint_indexer(self):
+        """Test get_checkpoint_indexer returns working checkpoint indexer"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(gz_file, index_path, index_threshold=0) as indexer:
-                indexer.build()
-                assert not indexer.has_manifest
 
-    def test_indexer_has_bloom_is_bool(self):
-        """Test that has_bloom returns a bool"""
+            with dft_utils.Indexer(env.temp_dir) as indexer:
+                indexer.ensure_indexed()
+
+                cp_indexer = indexer.get_checkpoint_indexer(gz_file)
+
+                assert cp_indexer.gz_path == gz_file
+                assert cp_indexer.get_max_bytes() > 0
+                assert cp_indexer.get_num_lines() > 0
+                checkpoints = cp_indexer.get_checkpoints()
+                assert isinstance(checkpoints, list)
+
+    def test_indexer_get_checkpoint_indexer_uses_index_dir(self):
+        """Test that get_checkpoint_indexer uses the same index_dir"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+            custom_index_dir = os.path.join(env.temp_dir, "custom_index")
+            os.makedirs(custom_index_dir, exist_ok=True)
+
             with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
+                env.temp_dir,
+                index_dir=custom_index_dir,
             ) as indexer:
-                indexer.build()
-                assert isinstance(indexer.has_bloom, bool)
+                indexer.ensure_indexed()
+
+                cp_indexer = indexer.get_checkpoint_indexer(gz_file)
+                assert custom_index_dir in cp_indexer.index_path
 
-    def test_indexer_has_manifest_is_bool(self):
-        """Test that has_manifest returns a bool"""
+
+class TestIndexerDfanalyzerAPIs:
+    """Test cases for dfanalyzer integration APIs (hash tables, PID manifest)"""
+
+    def test_get_hash_table_file(self):
+        """Test get_hash_table returns file hash mappings"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+            gz_file = env.create_dft_trace_file()
+
             with dft_utils.Indexer(
-                gz_file, index_path, build_manifest=True, index_threshold=0
+                files=[gz_file],
+                require_bloom=True,
+                require_manifest=True,
             ) as indexer:
-                indexer.build()
-                assert isinstance(indexer.has_manifest, bool)
+                indexer.ensure_indexed()
+
+                file_hashes = indexer.get_hash_table("file")
+                assert isinstance(file_hashes, dict)
 
-    def test_indexer_custom_index_threshold(self):
-        """Test that index_threshold is accepted without error"""
+    def test_get_hash_table_host(self):
+        """Test get_hash_table returns host hash mappings"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
-            # A very large threshold skips bloom for small files
+            gz_file = env.create_dft_trace_file()
+
             with dft_utils.Indexer(
-                gz_file,
-                index_path,
-                build_bloom=True,
-                index_threshold=1024 * 1024 * 1024,
+                files=[gz_file],
+                require_bloom=True,
+                require_manifest=True,
             ) as indexer:
-                indexer.build()
-                assert isinstance(indexer.has_bloom, bool)
-                assert not indexer.has_bloom
+                indexer.ensure_indexed()
 
-    def test_indexer_bloom_persists_across_instances(self):
-        """Bloom data written to the index store is visible from a new Indexer"""
+                host_hashes = indexer.get_hash_table("host")
+                assert isinstance(host_hashes, dict)
+
+    def test_get_hash_table_string(self):
+        """Test get_hash_table returns string hash mappings"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+            gz_file = env.create_dft_trace_file()
+
             with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
+                files=[gz_file],
+                require_bloom=True,
+                require_manifest=True,
             ) as indexer:
-                indexer.build()
+                indexer.ensure_indexed()
 
-            with dft_utils.Indexer(gz_file, index_path) as indexer2:
-                assert indexer2.has_bloom
+                string_hashes = indexer.get_hash_table("string")
+                assert isinstance(string_hashes, dict)
 
-    def test_indexer_manifest_persists_across_instances(self):
-        """Manifest data written to the index store is visible from a new Indexer"""
+    def test_get_hash_table_invalid_type(self):
+        """Test get_hash_table raises error for invalid type"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+            gz_file = env.create_dft_trace_file()
+
             with dft_utils.Indexer(
-                gz_file, index_path, build_manifest=True, index_threshold=0
+                files=[gz_file],
+                require_bloom=True,
+                require_manifest=True,
             ) as indexer:
-                indexer.build()
-
-            with dft_utils.Indexer(gz_file, index_path) as indexer2:
-                assert indexer2.has_manifest
+                indexer.ensure_indexed()
 
+                with pytest.raises((ValueError, RuntimeError)):
+                    indexer.get_hash_table("invalid_type")
 
-class TestIndexerThreshold:
-    """Test that index_threshold skips bloom/manifest for small files"""
+    def test_query_file_pids(self):
+        """Test query_file_pids returns set of PIDs for a file"""
+        with Environment() as env:
+            gz_file = env.create_dft_trace_file()
 
-    def test_threshold_skips_bloom_for_small_file(self):
-        """Explicit large threshold should skip bloom for small files"""
-        with Environment(lines=5) as env:
-            gz_file = env.create_test_gzip_file(bytes_per_line=128)
-            index_path = env.get_index_path(gz_file)
             with dft_utils.Indexer(
-                gz_file,
-                index_path,
-                build_bloom=True,
-                index_threshold=10 * 1024 * 1024,
+                files=[gz_file],
+                require_manifest=True,
             ) as indexer:
-                indexer.build()
-                assert not indexer.has_bloom
+                indexer.ensure_indexed()
 
-    def test_threshold_skips_manifest_for_small_file(self):
-        """Explicit large threshold should skip manifest for small files"""
-        with Environment(lines=5) as env:
-            gz_file = env.create_test_gzip_file(bytes_per_line=128)
-            index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file,
-                index_path,
-                build_manifest=True,
-                index_threshold=10 * 1024 * 1024,
-            ) as indexer:
-                indexer.build()
-                assert not indexer.has_manifest
+                # File ID 1 is typically the first indexed file
+                pids = indexer.query_file_pids(1)
+                assert isinstance(pids, set)
+                # PIDs should be integers
+                for pid in pids:
+                    assert isinstance(pid, int)
+
+    def test_query_file_pids_nonexistent(self):
+        """Test query_file_pids returns empty set for nonexistent file"""
+        with Environment() as env:
+            gz_file = env.create_dft_trace_file()
 
-    def test_threshold_skips_bloom_and_manifest_for_small_file(self):
-        """Explicit large threshold should skip bloom and manifest for small files"""
-        with Environment(lines=5) as env:
-            gz_file = env.create_test_gzip_file(bytes_per_line=128)
-            index_path = env.get_index_path(gz_file)
             with dft_utils.Indexer(
-                gz_file,
-                index_path,
-                build_bloom=True,
-                build_manifest=True,
-                index_threshold=10 * 1024 * 1024,
+                files=[gz_file],
+                require_manifest=True,
             ) as indexer:
-                indexer.build()
-                assert not indexer.has_bloom
-                assert not indexer.has_manifest
+                indexer.ensure_indexed()
 
-    def test_explicit_large_threshold_skips_bloom(self):
-        """Explicit large threshold should skip bloom for small files"""
+                pids = indexer.query_file_pids(9999)
+                assert isinstance(pids, set)
+                assert len(pids) == 0
+
+    def test_query_all_file_pids(self):
+        """Test query_all_file_pids returns dict mapping file_id to PID sets"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+            gz_file1 = env.create_dft_trace_file(filename="trace1.pfw.gz")
+            gz_file2 = env.create_dft_trace_file(filename="trace2.pfw.gz")
+
             with dft_utils.Indexer(
-                gz_file,
-                index_path,
-                build_bloom=True,
-                index_threshold=1024 * 1024 * 1024,
+                files=[gz_file1, gz_file2],
+                require_manifest=True,
             ) as indexer:
-                indexer.build()
-                assert not indexer.has_bloom
+                indexer.ensure_indexed()
+
+                all_pids = indexer.query_all_file_pids()
+                assert isinstance(all_pids, dict)
+
+                for file_id, pid_set in all_pids.items():
+                    assert isinstance(file_id, int)
+                    assert isinstance(pid_set, set)
+                    for pid in pid_set:
+                        assert isinstance(pid, int)
 
-    def test_zero_threshold_forces_bloom(self):
-        """index_threshold=0 disables threshold, bloom should be built"""
+    def test_query_all_file_pids_empty_index(self):
+        """Test query_all_file_pids returns a dict when indexed without a manifest"""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+
             with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
+                files=[gz_file],
+                require_manifest=False,
             ) as indexer:
-                indexer.build()
-                assert indexer.has_bloom
+                # Only checkpoint tier, no manifest
+                indexer.ensure_indexed()
+
+                all_pids = indexer.query_all_file_pids()
+                assert isinstance(all_pids, dict)
 
-    def test_zero_threshold_forces_manifest(self):
-        """index_threshold=0 disables threshold, manifest should be built"""
+    def test_integration_hash_tables_and_pids(self):
+        """Integration test: hash tables and PIDs work together"""
         with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            index_path = env.get_index_path(gz_file)
+            gz_file = env.create_dft_trace_file()
+
             with dft_utils.Indexer(
-                gz_file, index_path, build_manifest=True, index_threshold=0
+                files=[gz_file],
+                require_bloom=True,
+                require_manifest=True,
             ) as indexer:
-                indexer.build()
-                assert indexer.has_manifest
+                indexer.ensure_indexed()
+
+                # Get hash tables
+                file_hashes = indexer.get_hash_table("file")
+                host_hashes = indexer.get_hash_table("host")
+
+                # Get PIDs
+                all_pids = indexer.query_all_file_pids()
+
+                # Only the return types are asserted; contents depend on the fixture
+                assert isinstance(file_hashes, dict)
+                assert isinstance(host_hashes, dict)
+                assert isinstance(all_pids, dict)
+
+
+class TestQueryFilter:
+    """Test cases for query filter parameter in iter_arrow_dfanalyzer APIs"""
+
+    def _make_indexer(self, directory):
+        return dft_utils.Indexer(
+            directory=directory,
+            require_aggregation=dft_utils.AggregationConfig(time_interval_ms=5000),
+        )
+
+    def test_iter_arrow_dfanalyzer_all_no_query(self):
+        pa = pytest.importorskip("pyarrow")
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[1])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+                result = indexer.iter_arrow_dfanalyzer_all()
+                rows = sum(pa.record_batch(b).num_rows for b in result.get("events", []))
+                assert rows > 0
+
+    def test_iter_arrow_dfanalyzer_all_pid_filter(self):
+        pa = pytest.importorskip("pyarrow")
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[1])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+                result = indexer.iter_arrow_dfanalyzer_all(query="pid == 1")
+                rows = sum(pa.record_batch(b).num_rows for b in result.get("events", []))
+                assert rows > 0
+
+    def test_iter_arrow_dfanalyzer_all_pid_filter_reduces_rows(self):
+        pa = pytest.importorskip("pyarrow")
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[1, 2])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+
+                all_rows = sum(
+                    pa.record_batch(b).num_rows
+                    for b in indexer.iter_arrow_dfanalyzer_all().get("events", [])
+                )
+                filtered_rows = sum(
+                    pa.record_batch(b).num_rows
+                    for b in indexer.iter_arrow_dfanalyzer_all(query="pid == 1").get("events", [])
+                )
+                assert 0 < filtered_rows < all_rows
+
+    def test_iter_arrow_dfanalyzer_all_invalid_query(self):
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[1])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+                with pytest.raises((ValueError, RuntimeError)):
+                    indexer.iter_arrow_dfanalyzer_all(query="invalid ==")
+
+    def test_iter_arrow_dfanalyzer_query_param(self):
+        pa = pytest.importorskip("pyarrow")
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[1])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+                batches = list(indexer.iter_arrow_dfanalyzer("events", query="pid == 1"))
+                rows = sum(pa.record_batch(b).num_rows for b in batches)
+                assert rows > 0
+
+    def test_iter_arrow_dfanalyzer_query_matches_all(self):
+        pa = pytest.importorskip("pyarrow")
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[1])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+
+                single_rows = sum(
+                    pa.record_batch(b).num_rows
+                    for b in indexer.iter_arrow_dfanalyzer("events", query="pid == 1")
+                )
+                all_rows = sum(
+                    pa.record_batch(b).num_rows
+                    for b in indexer.iter_arrow_dfanalyzer_all(query="pid == 1").get("events", [])
+                )
+                assert single_rows == all_rows
+
+    def test_iter_arrow_dfanalyzer_all_multi_pid_filter(self):
+        pa = pytest.importorskip("pyarrow")
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[10, 20, 30])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+
+                filtered_rows = sum(
+                    pa.record_batch(b).num_rows
+                    for b in indexer.iter_arrow_dfanalyzer_all(query="pid == 10 or pid == 20").get(
+                        "events", []
+                    )
+                )
+                all_rows = sum(
+                    pa.record_batch(b).num_rows
+                    for b in indexer.iter_arrow_dfanalyzer_all().get("events", [])
+                )
+                assert 0 < filtered_rows < all_rows
+
+    def test_iter_arrow_dfanalyzer_all_string_filter(self):
+        pa = pytest.importorskip("pyarrow")
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[1])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+                result = indexer.iter_arrow_dfanalyzer_all(query='cat == "POSIX"')
+                rows = sum(pa.record_batch(b).num_rows for b in result.get("events", []))
+                assert rows > 0
+
+    def test_iter_arrow_dfanalyzer_all_no_match(self):
+        pa = pytest.importorskip("pyarrow")
+        with Environment() as env:
+            directory = env.create_indexed_traces(pids=[1])
+            with self._make_indexer(directory) as indexer:
+                indexer.ensure_indexed()
+                result = indexer.iter_arrow_dfanalyzer_all(query="pid == 999999")
+                rows = sum(pa.record_batch(b).num_rows for b in result.get("events", []))
+                assert rows == 0
 
 
 if __name__ == "__main__":
diff --git a/tests/python/test_reorganization_planner.py b/tests/python/test_reorganization_planner.py
index b5518fe6..c67ee297 100644
--- a/tests/python/test_reorganization_planner.py
+++ b/tests/python/test_reorganization_planner.py
@@ -1,8 +1,6 @@
 """Tests for ReorganizationPlannerUtility."""
 
-import sys
-
-import dftracer.utils as dft_utils
+from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer
 from dftracer.utils.dftracer_utils_ext import ReorganizationPlannerUtility
 
 from .common import Environment
@@ -10,7 +8,6 @@
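-# Threshold large enough to guarantee bloom/manifest are skipped for any
-# test fixture, making WithoutIndex tests deterministic regardless of
-# fixture size.
-_SKIP_INDEX_THRESHOLD = sys.maxsize
+# The without-manifest tests below pass build_manifest=False, so the
+# manifest is skipped explicitly and the outcome does not depend on
+# fixture size.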
 
 
 class TestReorganizationPlannerUtility:
@@ -18,12 +15,11 @@ def test_plan_returns_dict(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
                 build_bloom=True,
                 build_manifest=True,
-                index_threshold=0,
             ) as indexer:
                 indexer.build()
             groups = [{"name": "posix", "query": 'cat == "POSIX"'}]
@@ -38,12 +34,11 @@ def test_call_delegates_to_process(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
                 build_bloom=True,
                 build_manifest=True,
-                index_threshold=0,
             ) as indexer:
                 indexer.build()
             util = ReorganizationPlannerUtility()
@@ -61,12 +56,11 @@ def test_plan_succeeds_without_manifest(self):
         with Environment(lines=5) as env:
             gz_file = env.create_test_gzip_file(bytes_per_line=128)
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
                 build_bloom=True,
-                build_manifest=True,
-                index_threshold=_SKIP_INDEX_THRESHOLD,
+                build_manifest=False,
             ) as indexer:
                 indexer.build()
                 assert not indexer.has_manifest
@@ -82,12 +76,11 @@ def test_plan_has_tasks_without_manifest(self):
         with Environment(lines=5) as env:
             gz_file = env.create_test_gzip_file(bytes_per_line=128)
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
                 build_bloom=True,
-                build_manifest=True,
-                index_threshold=_SKIP_INDEX_THRESHOLD,
+                build_manifest=False,
             ) as indexer:
                 indexer.build()
                 assert not indexer.has_manifest
diff --git a/tests/python/test_statistics_aggregator.py b/tests/python/test_statistics_aggregator.py
index 3be995d5..44fb3ad6 100644
--- a/tests/python/test_statistics_aggregator.py
+++ b/tests/python/test_statistics_aggregator.py
@@ -1,8 +1,6 @@
 """Tests for StatisticsAggregatorUtility."""
 
-import sys
-
-import dftracer.utils as dft_utils
+from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer
 from dftracer.utils.dftracer_utils_ext import StatisticsAggregatorUtility
 
 from .common import Environment
@@ -10,7 +8,6 @@
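-# Threshold large enough to guarantee bloom/manifest are skipped for any
-# test fixture, making WithoutIndex tests deterministic regardless of
-# fixture size.
-_SKIP_INDEX_THRESHOLD = sys.maxsize
+# The without-bloom tests below pass build_bloom=False, so bloom
+# construction is skipped explicitly and the outcome does not depend on
+# fixture size.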
 
 
 class TestStatisticsAggregatorUtility:
@@ -18,9 +15,7 @@ def test_compute_returns_dict(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             result = StatisticsAggregatorUtility().process(gz_file)
             assert isinstance(result, dict)
@@ -31,9 +26,7 @@ def test_compute_correct_event_count(self):
         with Environment(lines=30) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             result = StatisticsAggregatorUtility().process(gz_file)
             assert result["success"] is True
@@ -43,9 +36,7 @@ def test_compute_has_statistics_fields(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             result = StatisticsAggregatorUtility().process(gz_file)
             assert "num_categories" in result
@@ -58,9 +49,7 @@ def test_call_delegates_to_process(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             util = StatisticsAggregatorUtility()
             result = util(gz_file)
@@ -76,11 +65,10 @@ def test_returns_dict_without_bloom(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
-                build_bloom=True,
-                index_threshold=_SKIP_INDEX_THRESHOLD,
+                build_bloom=False,
             ) as indexer:
                 indexer.build()
                 assert not indexer.has_bloom
@@ -95,11 +83,10 @@ def test_correct_event_count_without_bloom(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
-                build_bloom=True,
-                index_threshold=_SKIP_INDEX_THRESHOLD,
+                build_bloom=False,
             ) as indexer:
                 indexer.build()
                 assert not indexer.has_bloom
@@ -112,11 +99,10 @@ def test_has_statistics_fields_without_bloom(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
-                build_bloom=True,
-                index_threshold=_SKIP_INDEX_THRESHOLD,
+                build_bloom=False,
             ) as indexer:
                 indexer.build()
                 assert not indexer.has_bloom
diff --git a/tests/python/test_statistics_query.py b/tests/python/test_statistics_query.py
index fa2cb123..55d6d00d 100644
--- a/tests/python/test_statistics_query.py
+++ b/tests/python/test_statistics_query.py
@@ -1,8 +1,6 @@
 """Tests for StatisticsQueryUtility."""
 
-import sys
-
-import dftracer.utils as dft_utils
+from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer
 from dftracer.utils.dftracer_utils_ext import StatisticsQueryUtility
 
 from .common import Environment
@@ -10,7 +8,6 @@
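-# Threshold large enough to guarantee bloom/manifest are skipped for any
-# test fixture, making WithoutIndex tests deterministic regardless of
-# fixture size.
-_SKIP_INDEX_THRESHOLD = sys.maxsize
+# The without-bloom tests below pass build_bloom=False, so bloom
+# construction is skipped explicitly and the outcome does not depend on
+# fixture size.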
 
 
 class TestStatisticsQueryUtility:
@@ -18,9 +15,7 @@ def test_query_summary(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             result = StatisticsQueryUtility().process(gz_file, query_type="summary")
             assert isinstance(result, dict)
@@ -31,9 +26,7 @@ def test_query_categories(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             result = StatisticsQueryUtility().process(gz_file, query_type="categories")
             assert "results" in result
@@ -43,9 +36,7 @@ def test_query_names(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             result = StatisticsQueryUtility().process(gz_file, query_type="names")
             assert "results" in result
@@ -54,9 +45,7 @@ def test_query_top_n_names(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             result = StatisticsQueryUtility().process(gz_file, query_type="top_n_names", top_n=5)
             assert "results" in result
@@ -66,9 +55,7 @@ def test_query_duration_stats(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             result = StatisticsQueryUtility().process(gz_file, query_type="duration_stats")
             assert "duration_mean_us" in result
@@ -78,9 +65,7 @@ def test_call_delegates_to_process(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
-                gz_file, index_path, build_bloom=True, index_threshold=0
-            ) as indexer:
+            with NativeIndexer(gz_file, index_path, build_bloom=True) as indexer:
                 indexer.build()
             util = StatisticsQueryUtility()
             result = util(gz_file, query_type="summary")
@@ -96,11 +81,10 @@ def test_summary_correct_events_without_bloom(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
-                build_bloom=True,
-                index_threshold=_SKIP_INDEX_THRESHOLD,
+                build_bloom=False,
             ) as indexer:
                 indexer.build()
                 assert not indexer.has_bloom
@@ -113,11 +97,10 @@ def test_categories_populated_without_bloom(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(
+            with NativeIndexer(
                 gz_file,
                 index_path,
-                build_bloom=True,
-                index_threshold=_SKIP_INDEX_THRESHOLD,
+                build_bloom=False,
             ) as indexer:
                 indexer.build()
                 assert not indexer.has_bloom
diff --git a/tests/python/test_trace_reader.py b/tests/python/test_trace_reader.py
index 586fbd7a..8f627886 100644
--- a/tests/python/test_trace_reader.py
+++ b/tests/python/test_trace_reader.py
@@ -4,6 +4,7 @@
 import pytest
 
 import dftracer.utils as dft_utils
+from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer
 
 from .common import Environment
 
@@ -12,11 +13,11 @@ class TestTraceReaderCreation:
     """Construction and property tests."""
 
     def test_creation_basic(self):
-        """TraceReader accepts a valid file path and exposes file_path."""
+        """TraceReader accepts a valid file path and exposes path."""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
-            assert reader.file_path == gz_file
+            assert reader.path == gz_file
 
     def test_creation_nonexistent_file(self):
         """TraceReader with nonexistent file creates but read_lines fails."""
@@ -37,7 +38,7 @@ def test_has_index_true_after_indexer_build(self):
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(gz_file, index_path) as indexer:
+            with NativeIndexer(gz_file, index_path) as indexer:
                 indexer.build()
             # TraceReader probes for the index store at __init__ time
             reader = dft_utils.TraceReader(gz_file)
@@ -65,12 +66,12 @@ def test_has_index_is_bool(self):
             reader = dft_utils.TraceReader(gz_file)
             assert isinstance(reader.has_index, bool)
 
-    def test_file_path_is_str(self):
-        """file_path property returns a str."""
+    def test_path_is_str(self):
+        """path property returns a str."""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
-            assert isinstance(reader.file_path, str)
+            assert isinstance(reader.path, str)
 
 
 class TestTraceReaderReadLines:
@@ -85,13 +86,13 @@ def test_read_all_lines_default_args(self):
             assert isinstance(lines, list)
             assert len(lines) == 22
 
-    def test_read_lines_returns_strings(self):
-        """Every element returned by read_lines() is a str."""
+    def test_read_lines_returns_memoryviews(self):
+        """Every element returned by read_lines() is a memoryview."""
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
             lines = reader.read_lines()
-            assert all(isinstance(line, str) for line in lines)
+            assert all(isinstance(line, memoryview) for line in lines)
 
     def test_read_lines_content_is_json(self):
         """Lines contain the JSON fields written by Environment."""
@@ -100,17 +101,19 @@ def test_read_lines_content_is_json(self):
             reader = dft_utils.TraceReader(gz_file)
             lines = reader.read_lines()
             for line in lines:
-                stripped = line.strip()
-                if stripped in ("[", "]"):
+                text = bytes(line).decode("utf-8").strip()
+                if text in ("[", "]"):
                     continue
-                assert '"name"' in line
+                assert b'"name"' in bytes(line)
 
     def test_read_lines_explicit_zero_zero(self):
         """read_lines(0, 0) is equivalent to read_lines()."""
         with Environment(lines=15) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
-            assert reader.read_lines(0, 0) == reader.read_lines()
+            a = [bytes(m) for m in reader.read_lines(0, 0)]
+            b = [bytes(m) for m in reader.read_lines()]
+            assert a == b
 
     def test_read_lines_with_range(self):
         """read_lines(start, end) returns a subset of lines."""
@@ -127,11 +130,10 @@ def test_read_lines_range_is_subset_of_all(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
-            all_lines = reader.read_lines()
+            all_bytes = [bytes(m) for m in reader.read_lines()]
             partial = reader.read_lines(start_line=3, end_line=8)
-            # Every line in the partial result must appear in all_lines
             for line in partial:
-                assert line in all_lines
+                assert bytes(line) in all_bytes
 
     def test_read_lines_negative_start_raises(self):
         """read_lines raises ValueError for a negative start_line."""
@@ -154,7 +156,7 @@ def test_read_lines_with_index(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(gz_file, index_path) as indexer:
+            with NativeIndexer(gz_file, index_path) as indexer:
                 indexer.build()
             reader = dft_utils.TraceReader(gz_file)
             assert reader.has_index
@@ -166,13 +168,13 @@ def test_read_lines_indexed_matches_sequential(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             # Sequential (no index)
-            sequential = dft_utils.TraceReader(gz_file).read_lines()
+            sequential = [bytes(m) for m in dft_utils.TraceReader(gz_file).read_lines()]
 
             # Build index, then read again
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(gz_file, index_path) as indexer:
+            with NativeIndexer(gz_file, index_path) as indexer:
                 indexer.build()
-            indexed = dft_utils.TraceReader(gz_file).read_lines()
+            indexed = [bytes(m) for m in dft_utils.TraceReader(gz_file).read_lines()]
 
             assert sequential == indexed
 
@@ -226,7 +228,7 @@ def test_with_statement_properties_accessible(self):
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
             with dft_utils.TraceReader(gz_file) as reader:
-                assert reader.file_path == gz_file
+                assert reader.path == gz_file
                 assert isinstance(reader.has_index, bool)
 
     def test_with_statement_exit_does_not_raise(self):
@@ -245,21 +247,14 @@ def test_custom_checkpoint_size_accepted(self):
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file, checkpoint_size=1024 * 1024)
-            assert reader.file_path == gz_file
+            assert reader.path == gz_file
 
     def test_auto_build_index_accepted(self):
         """auto_build_index kwarg is accepted without error."""
         with Environment() as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file, auto_build_index=False)
-            assert reader.file_path == gz_file
-
-    def test_index_threshold_accepted(self):
-        """index_threshold kwarg is accepted without error."""
-        with Environment() as env:
-            gz_file = env.create_test_gzip_file()
-            reader = dft_utils.TraceReader(gz_file, index_threshold=16 * 1024 * 1024)
-            assert reader.file_path == gz_file
+            assert reader.path == gz_file
 
     def test_all_optional_params_together(self):
         """All optional constructor params can be supplied simultaneously."""
@@ -271,9 +266,8 @@ def test_all_optional_params_together(self):
                 index_dir=env.temp_dir,
                 checkpoint_size=512 * 1024,
                 auto_build_index=False,
-                index_threshold=4 * 1024 * 1024,
             )
-            assert reader.file_path == gz_file
+            assert reader.path == gz_file
             assert reader.index_dir == env.temp_dir
 
 
@@ -288,12 +282,12 @@ def test_iter_lines_returns_iterator(self):
             assert hasattr(it, "__iter__")
             assert hasattr(it, "__next__")
 
-    def test_iter_lines_yields_strings(self):
+    def test_iter_lines_yields_memoryviews(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
             for line in reader.iter_lines():
-                assert isinstance(line, str)
+                assert isinstance(line, memoryview)
 
     def test_iter_lines_count(self):
         with Environment(lines=20) as env:
@@ -306,8 +300,8 @@ def test_iter_lines_matches_read_lines(self):
         with Environment(lines=15) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
-            from_iter = list(reader.iter_lines())
-            from_read = reader.read_lines()
+            from_iter = [bytes(m) for m in reader.iter_lines()]
+            from_read = [bytes(m) for m in reader.read_lines()]
             assert from_iter == from_read
 
     def test_iter_lines_with_range(self):
@@ -356,12 +350,12 @@ def test_iter_raw_returns_iterator(self):
             assert hasattr(it, "__iter__")
             assert hasattr(it, "__next__")
 
-    def test_iter_raw_yields_bytes(self):
+    def test_iter_raw_yields_memoryviews(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
             for chunk in reader.iter_raw():
-                assert isinstance(chunk, bytes)
+                assert isinstance(chunk, memoryview)
 
     def test_iter_raw_single_line_mode(self):
         """multi_line=False yields one chunk per line."""
@@ -402,20 +396,20 @@ def test_iter_raw_negative_raises(self):
 class TestTraceReaderReadRaw:
     """read_raw() materialized list tests."""
 
-    def test_read_raw_returns_list_of_bytes(self):
+    def test_read_raw_returns_list_of_memoryviews(self):
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
             chunks = reader.read_raw()
             assert isinstance(chunks, list)
-            assert all(isinstance(c, bytes) for c in chunks)
+            assert all(isinstance(c, memoryview) for c in chunks)
 
     def test_read_raw_matches_iter_raw(self):
         with Environment(lines=15) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
-            from_read = reader.read_raw()
-            from_iter = list(reader.iter_raw())
+            from_read = [bytes(m) for m in reader.read_raw()]
+            from_iter = [bytes(m) for m in reader.iter_raw()]
             assert from_read == from_iter
 
     def test_read_raw_single_line_count(self):
@@ -457,10 +451,10 @@ def test_default_runtime_works(self):
 
 
 class TestTraceReaderJSON:
-    """JSON reading tests."""
+    """JSON reading tests (shimmed via Arrow)."""
 
     def test_read_lines_json_returns_list(self):
-        """read_lines_json returns a list of JSON objects."""
+        """read_lines_json returns a list of dicts."""
         with Environment(lines=32) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
@@ -469,25 +463,25 @@ def test_read_lines_json_returns_list(self):
             assert len(result) == 32
 
     def test_read_lines_json_objects_have_keys(self):
-        """Each JSON object has expected keys."""
+        """Each dict has expected keys."""
         with Environment(lines=10) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
             result = reader.read_lines_json()
             for obj in result:
+                assert isinstance(obj, dict)
                 assert "name" in obj
                 assert "cat" in obj
                 assert "dur" in obj
 
     def test_read_lines_json_values_correct(self):
-        """JSON values match what was written."""
+        """Dict values match what was written."""
         with Environment(lines=5) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
             result = reader.read_lines_json()
             assert result[0]["name"] == "write"
             assert result[0]["cat"] == "POSIX"
-            assert result[0]["ph"] == "X"
 
     def test_iter_lines_json_is_lazy(self):
         """iter_lines_json returns an iterator, not a list."""
@@ -508,7 +502,6 @@ def test_iter_lines_json_partial_iteration(self):
             it = reader.iter_lines_json()
             first = next(it)
             assert "name" in first
-            # Don't exhaust the iterator
 
     def test_read_lines_json_with_line_range(self):
         """read_lines_json respects start_line/end_line."""
@@ -516,8 +509,6 @@ def test_read_lines_json_with_line_range(self):
             gz_file = env.create_test_gzip_file()
             env.build_index(gz_file)
             reader = dft_utils.TraceReader(gz_file)
-            # Lines are 1-indexed; line 1 is "[", line 2 is first JSON, etc.
-            # But iter_lines_json skips non-JSON lines, so we get JSON objects
             all_json = reader.read_lines_json()
             subset = reader.read_lines_json(start_line=1, end_line=10)
             assert len(subset) <= len(all_json)
@@ -579,7 +570,6 @@ def test_num_lines_property_still_works(self):
         with Environment(lines=20) as env:
             gz_file = env.create_test_gzip_file()
             reader = dft_utils.TraceReader(gz_file)
-            # Property should still work (falls back to reading all lines)
             assert reader.num_lines > 0
 
 
@@ -592,7 +582,6 @@ def test_end_line_beyond_total_clamped(self):
             gz_file = env.create_test_gzip_file()
             env.build_index(gz_file)
             reader = dft_utils.TraceReader(gz_file)
-            # Request way more lines than exist
             result = reader.read_lines(start_line=1, end_line=99999)
             assert len(result) > 0
 
@@ -690,7 +679,7 @@ def test_query_by_name(self):
             filtered = reader.read_lines(query='name == "read"')
             assert len(filtered) > 0
             for line in filtered:
-                assert '"name":"read"' in line
+                assert b'"name":"read"' in bytes(line)
 
     def test_query_and(self):
         with Environment() as env:
@@ -732,7 +721,7 @@ def test_iter_lines_with_query(self):
             lines = list(reader.iter_lines(query='name == "write"'))
             assert len(lines) > 0
             for line in lines:
-                assert '"name":"write"' in line
+                assert b'"name":"write"' in bytes(line)
 
     def test_iter_lines_json_with_query(self):
         with Environment() as env:
@@ -756,8 +745,8 @@ def test_query_with_field_class(self):
             filtered = reader.read_lines(query=str(q))
             assert len(filtered) > 0
             for line in filtered:
-                assert '"cat":"IO"' in line
-                assert '"name":"read"' in line
+                assert b'"cat":"IO"' in bytes(line)
+                assert b'"name":"read"' in bytes(line)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/test_trace_reader_arrow.py b/tests/python/test_trace_reader_arrow.py
index 822554c5..b6bcb21a 100644
--- a/tests/python/test_trace_reader_arrow.py
+++ b/tests/python/test_trace_reader_arrow.py
@@ -3,6 +3,7 @@
 
 import dftracer.utils as dft_utils
 from dftracer.utils.arrow import ArrowBatch, ArrowTable
+from dftracer.utils.dftracer_utils_ext import CheckpointIndexer as NativeIndexer
 
 from .common import Environment
 
@@ -69,7 +70,7 @@ def test_iter_arrow_with_line_range(self):
             gz_file = env.create_test_gzip_file()
             # Build index for line-based access
             index_path = env.get_index_path(gz_file)
-            with dft_utils.Indexer(gz_file, index_path) as indexer:
+            with NativeIndexer(gz_file, index_path) as indexer:
                 indexer.build()
             with dft_utils.TraceReader(gz_file) as reader:
                 batches = list(reader.iter_arrow(start_line=10, end_line=20, batch_size=100))
@@ -120,6 +121,265 @@ def test_read_arrow_properties(self):
             assert not table.empty
 
 
+class TestIterArrowStream:
+    """Tests for TraceReader.iter_arrow_stream()."""
+
+    def test_iter_arrow_stream_exposes_c_stream(self):
+        """iter_arrow_stream returns an object with __arrow_c_stream__."""
+        with Environment(lines=20) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=100)
+                assert hasattr(stream, "__arrow_c_stream__")
+
+    def test_iter_arrow_stream_row_count_matches_iter_arrow(self):
+        """Stream drains the same row count as the per-batch iterator."""
+        import pyarrow as pa
+
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                batches_expected = list(reader.iter_arrow(batch_size=20))
+            expected_rows = sum(b.num_rows for b in batches_expected)
+
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=20)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                total = sum(b.num_rows for b in rbr)
+            assert total == expected_rows
+            assert total == 50
+
+    def test_iter_arrow_stream_schema_matches_iter_arrow(self):
+        """Stream columns equal iter_arrow's (in any order) plus a trailing _extra column."""
+        import pyarrow as pa
+
+        with Environment(lines=20) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                expected_batch = pa.record_batch(next(iter(reader.iter_arrow(batch_size=100))))
+                expected_names = list(expected_batch.schema.names)
+
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=100)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                stream_names = list(rbr.schema.names)
+            assert stream_names[-1] == "_extra"
+            assert set(stream_names[:-1]) == set(expected_names)
+
+    def test_iter_arrow_stream_pa_table(self):
+        """pa.table(stream) materializes a full Table."""
+        import pyarrow as pa
+
+        with Environment(lines=40) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=10)
+                table = pa.table(stream)
+            assert isinstance(table, pa.Table)
+            assert table.num_rows == 40
+
+    def test_iter_arrow_stream_single_use(self):
+        """__arrow_c_stream__ can only be consumed once."""
+        with Environment(lines=5) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=100)
+                stream.__arrow_c_stream__()
+                try:
+                    stream.__arrow_c_stream__()
+                    raise AssertionError("expected RuntimeError on second consume")
+                except RuntimeError:
+                    pass
+
+    def test_iter_arrow_stream_survives_early_drop(self):
+        """Dropping an in-flight batch must not double-free the stream or lose later rows."""
+        import gc
+
+        import pyarrow as pa
+
+        with Environment(lines=30) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=5)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                first = next(iter(rbr))
+                first_rows = first.num_rows
+                assert first_rows > 0
+                del first  # Drop the first batch while the stream is still live.
+                gc.collect()
+                total = sum(b.num_rows for b in rbr)
+            assert first_rows + total == 30
+
+    def test_read_arrow_uses_stream_path(self):
+        """read_arrow still produces a correct ArrowTable via the stream."""
+        with Environment(lines=30) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                table = reader.read_arrow(batch_size=10)
+            assert isinstance(table, ArrowTable)
+            assert table.num_rows == 30
+
+
+class TestIterArrowStreamReconciliation:
+    """Stream emits a single locked schema across batches with diverging columns."""
+
+    @staticmethod
+    def _write_trace(path, rows):
+        import gzip
+        import json
+        import os
+
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        with gzip.open(path, "wt", encoding="utf-8") as f:
+            for r in rows:
+                f.write(json.dumps(r) + "\n")
+
+    def _make_divergent_dir(self, env):
+        """Directory whose top-level keys differ across files. `args` is
+        serialized as a single JSON string column by the Arrow builder, so
+        divergence has to be at the top level to reach the reconciler."""
+        import os
+
+        base = os.path.join(env.temp_dir, "divergent")
+        common = {"name": "read", "cat": "POSIX", "pid": 1, "tid": 1, "dur": 10, "ph": "X"}
+        rows_a = [{**common, "ts": i, "only_a_int": i * 3} for i in range(30)]
+        rows_b = [{**common, "ts": i, "only_b_str": f"b-{i}"} for i in range(30)]
+        rows_c = [{**common, "ts": i, "only_c_dbl": float(i) * 0.5} for i in range(30)]
+        self._write_trace(os.path.join(base, "a.pfw.gz"), rows_a)
+        self._write_trace(os.path.join(base, "b.pfw.gz"), rows_b)
+        self._write_trace(os.path.join(base, "c.pfw.gz"), rows_c)
+        for f in ("a.pfw.gz", "b.pfw.gz", "c.pfw.gz"):
+            gz = os.path.join(base, f)
+            env.test_files.append(gz)
+            idx = env.get_index_path(gz)
+            with NativeIndexer(gz, idx) as indexer:
+                indexer.build()
+        return base
+
+    def test_stream_schema_has_extra_column(self):
+        import pyarrow as pa
+
+        with Environment() as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=100)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                assert "_extra" in rbr.schema.names
+
+    def test_stream_survives_divergent_schemas(self):
+        """Directory-mode stream whose files differ in top-level keys must not error."""
+        import pyarrow as pa
+
+        with Environment() as env:
+            data_dir = self._make_divergent_dir(env)
+            with dft_utils.TraceReader(data_dir) as reader:
+                stream = reader.iter_arrow_stream(batch_size=25)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                table = rbr.read_all()
+            assert table.num_rows == 90
+
+    def test_stream_preserves_all_column_data(self):
+        """Each file's unique column ends up either as a native column (with
+        nulls for the other files) or in _extra JSON. No data is lost."""
+        import pyarrow as pa
+
+        with Environment() as env:
+            data_dir = self._make_divergent_dir(env)
+            with dft_utils.TraceReader(data_dir) as reader:
+                stream = reader.iter_arrow_stream(batch_size=25)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                table = rbr.read_all()
+
+            names = set(table.schema.names)
+            assert "_extra" in names
+
+            def hits_for(colname):
+                if colname in names:
+                    return sum(1 for v in table.column(colname).to_pylist() if v is not None)
+                extras = table.column("_extra").to_pylist()
+                return sum(1 for e in extras if e and colname in e)
+
+            # Each file's unique column must appear 30 times, either natively
+            # or via _extra — the reconciler preserves every value.
+            assert hits_for("only_a_int") == 30
+            assert hits_for("only_b_str") == 30
+            assert hits_for("only_c_dbl") == 30
+
+    def test_stream_matches_pa_table_from_stream(self):
+        """pa.table(stream) drains all 90 rows (3 files x 30) of the divergent directory."""
+        import pyarrow as pa
+
+        with Environment() as env:
+            data_dir = self._make_divergent_dir(env)
+            with dft_utils.TraceReader(data_dir) as reader:
+                stream = reader.iter_arrow_stream(batch_size=25)
+                table = pa.table(stream)
+            assert table.num_rows == 90
+
+    def test_stream_empty_result_has_schema(self):
+        """Empty stream still exposes a schema with _extra so callers don't crash."""
+        import pyarrow as pa
+
+        with Environment(lines=10) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=100, query="pid == 99999999")
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                assert "_extra" in rbr.schema.names
+                total = sum(b.num_rows for b in rbr)
+            assert total == 0
+
+    def test_stream_flatten_promotes_nested_keys(self):
+        """flatten_objects=True expands top-level object values one level."""
+        import pyarrow as pa
+
+        with Environment(lines=20) as env:
+            gz_file = env.create_dft_trace_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=100, flatten_objects=True)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                table = rbr.read_all()
+            names = set(table.schema.names)
+            # args.ret and args.hhash should be promoted to native typed columns.
+            assert "args.ret" in names
+            assert "args.hhash" in names
+            # Native type survives the reconciler; values must round-trip.
+            rets = table.column("args.ret").to_pylist()
+            assert all(isinstance(v, int) for v in rets if v is not None)
+            assert rets[0] == 1024
+
+    def test_stream_no_flatten_keeps_args_as_json(self):
+        """flatten_objects=False leaves `args` as a single JSON string column."""
+        import pyarrow as pa
+
+        with Environment(lines=10) as env:
+            gz_file = env.create_dft_trace_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=100, flatten_objects=False)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                table = rbr.read_all()
+            names = set(table.schema.names)
+            assert "args" in names
+            assert "args.ret" not in names
+            first = table.column("args").to_pylist()[0]
+            assert isinstance(first, str)
+            assert first.startswith("{") and "ret" in first
+
+    def test_stream_extra_is_null_when_no_divergence(self):
+        """_extra should be all-null when every batch matches the discovered schema."""
+        import pyarrow as pa
+
+        with Environment(lines=40) as env:
+            gz_file = env.create_test_gzip_file()
+            with dft_utils.TraceReader(gz_file) as reader:
+                stream = reader.iter_arrow_stream(batch_size=10)
+                rbr = pa.RecordBatchReader.from_stream(stream)
+                table = rbr.read_all()
+            assert table.num_rows == 40
+            extra = table.column("_extra")
+            assert extra.null_count == extra.length()
+
+
 class TestArrowBatchWrapper:
     """Tests for the ArrowBatch Python wrapper."""
 
diff --git a/tests/python/test_trace_reader_directory.py b/tests/python/test_trace_reader_directory.py
new file mode 100644
index 00000000..3481d6fb
--- /dev/null
+++ b/tests/python/test_trace_reader_directory.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+"""Test cases for directory-level parallel TraceReader (iter_arrow / read_arrow)."""
+
+import os
+
+import dftracer.utils as dft_utils
+from dftracer.utils.arrow import ArrowTable
+
+from .common import Environment
+
+
+def _create_directory_with_files(env, num_files=3, lines_per_file=20, nested=False):
+    """Create multiple .pfw.gz files in env.temp_dir, optionally in subdirectories."""
+    files = []
+    for i in range(num_files):
+        if nested:
+            subdir = f"rank_{i}"
+            filename = os.path.join(subdir, f"trace_{i}.pfw.gz")
+        else:
+            filename = f"trace_{i}.pfw.gz"
+        f = env.create_dft_trace_file(filename=filename, num_events=lines_per_file)
+        files.append(f)
+    return files
+
+
+class TestDirectoryIterArrow:
+    """Tests for TraceReader.iter_arrow() with a directory path."""
+
+    def test_iter_arrow_directory_returns_batches(self):
+        """iter_arrow on a directory yields Arrow batches from all files."""
+        with Environment(lines=20) as env:
+            _create_directory_with_files(env, num_files=3, lines_per_file=20)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                batches = list(reader.iter_arrow(batch_size=100))
+            rt.shutdown()
+            assert len(batches) >= 1
+            total_rows = sum(b.num_rows for b in batches)
+            assert total_rows == 60
+
+    def test_iter_arrow_directory_single_file(self):
+        """Directory with one file produces same results as single-file path."""
+        with Environment(lines=30) as env:
+            files = _create_directory_with_files(env, num_files=1, lines_per_file=30)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as dir_reader:
+                dir_batches = list(dir_reader.iter_arrow(batch_size=100))
+            with dft_utils.TraceReader(files[0], runtime=rt) as file_reader:
+                file_batches = list(file_reader.iter_arrow(batch_size=100))
+            rt.shutdown()
+            dir_rows = sum(b.num_rows for b in dir_batches)
+            file_rows = sum(b.num_rows for b in file_batches)
+            assert dir_rows == file_rows == 30
+
+    def test_iter_arrow_directory_nested_subdirs(self):
+        """iter_arrow discovers .pfw.gz files in nested subdirectories."""
+        with Environment(lines=10) as env:
+            _create_directory_with_files(env, num_files=4, lines_per_file=10, nested=True)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                batches = list(reader.iter_arrow(batch_size=100))
+            rt.shutdown()
+            total_rows = sum(b.num_rows for b in batches)
+            assert total_rows == 40
+
+    def test_iter_arrow_directory_batch_size(self):
+        """Batch size is respected when reading from a directory."""
+        with Environment(lines=25) as env:
+            _create_directory_with_files(env, num_files=3, lines_per_file=25)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                batches = list(reader.iter_arrow(batch_size=10))
+            rt.shutdown()
+            for b in batches:
+                assert b.num_rows <= 10
+            total_rows = sum(b.num_rows for b in batches)
+            assert total_rows == 75
+
+    def test_iter_arrow_directory_empty(self):
+        """Directory with no .pfw.gz files yields no batches."""
+        with Environment(lines=10) as env:
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                batches = list(reader.iter_arrow(batch_size=100))
+            rt.shutdown()
+            assert len(batches) == 0
+
+
+class TestDirectoryReadArrow:
+    """Tests for TraceReader.read_arrow() with a directory path."""
+
+    def test_read_arrow_directory(self):
+        """read_arrow on a directory returns an ArrowTable with all rows."""
+        with Environment(lines=15) as env:
+            _create_directory_with_files(env, num_files=4, lines_per_file=15)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                table = reader.read_arrow(batch_size=100)
+            rt.shutdown()
+            assert isinstance(table, ArrowTable)
+            assert table.num_rows == 60
+
+    def test_read_arrow_directory_properties(self):
+        """ArrowTable from directory has correct properties."""
+        with Environment(lines=10) as env:
+            _create_directory_with_files(env, num_files=2, lines_per_file=10)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                table = reader.read_arrow(batch_size=100)
+            rt.shutdown()
+            assert table.num_rows == 20
+            assert table.num_batches >= 1
+            assert not table.empty
+
+
+class TestDirectoryWithQuery:
+    """Tests for directory reading with query filtering."""
+
+    def test_directory_query_filters_events(self):
+        """Query filtering works across directory files."""
+        with Environment(lines=50) as env:
+            _create_directory_with_files(env, num_files=3, lines_per_file=50)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                table_all = reader.read_arrow(batch_size=1000)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                table_filtered = reader.read_arrow(batch_size=1000, query='name == "read"')
+            rt.shutdown()
+            assert table_all.num_rows == 150
+            assert table_filtered.num_rows > 0
+            assert table_filtered.num_rows < table_all.num_rows
+
+    def test_directory_query_no_match(self):
+        """Query that matches nothing returns empty table."""
+        with Environment(lines=20) as env:
+            _create_directory_with_files(env, num_files=2, lines_per_file=20)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                table = reader.read_arrow(batch_size=100, query='name == "nonexistent_op"')
+            rt.shutdown()
+            assert table.num_rows == 0
+
+
+class TestDirectoryIterJson:
+    """Tests for TraceReader.iter_json() with a directory path."""
+
+    def test_iter_json_directory_returns_events(self):
+        """iter_json on a directory yields JsonDictValue events from all files."""
+        with Environment(lines=20) as env:
+            _create_directory_with_files(env, num_files=3, lines_per_file=20)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                events = list(reader.iter_json())
+            rt.shutdown()
+            assert len(events) == 60
+            for ev in events:
+                assert "name" in ev
+
+    def test_iter_json_directory_single_file(self):
+        """Directory with one file matches single-file iter_json."""
+        with Environment(lines=30) as env:
+            files = _create_directory_with_files(env, num_files=1, lines_per_file=30)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as dir_reader:
+                dir_events = list(dir_reader.iter_json())
+            with dft_utils.TraceReader(files[0], runtime=rt) as file_reader:
+                file_events = list(file_reader.iter_json())
+            rt.shutdown()
+            assert len(dir_events) == len(file_events) == 30
+
+    def test_iter_json_directory_nested_subdirs(self):
+        """iter_json discovers .pfw.gz files in nested subdirectories."""
+        with Environment(lines=10) as env:
+            _create_directory_with_files(env, num_files=4, lines_per_file=10, nested=True)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                events = list(reader.iter_json())
+            rt.shutdown()
+            assert len(events) == 40
+
+    def test_iter_json_directory_empty(self):
+        """Directory with no .pfw.gz files yields no events."""
+        with Environment(lines=10) as env:
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                events = list(reader.iter_json())
+            rt.shutdown()
+            assert len(events) == 0
+
+    def test_read_json_directory(self):
+        """read_json on a directory returns all events."""
+        with Environment(lines=15) as env:
+            _create_directory_with_files(env, num_files=4, lines_per_file=15)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                events = reader.read_json()
+            rt.shutdown()
+            assert len(events) == 60
+
+    def test_iter_json_directory_to_dict(self):
+        """JsonDictValue.to_dict() works for directory-sourced events."""
+        with Environment(lines=10) as env:
+            _create_directory_with_files(env, num_files=2, lines_per_file=10)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                events = list(reader.iter_json())
+            rt.shutdown()
+            for ev in events:
+                d = ev.to_dict()
+                assert isinstance(d, dict)
+                assert "name" in d
+
+
+class TestDirectoryIterLines:
+    """Tests for TraceReader.iter_lines() with a directory path."""
+
+    def test_iter_lines_directory_returns_lines(self):
+        """iter_lines on a directory yields memoryview lines from all files."""
+        with Environment(lines=20) as env:
+            _create_directory_with_files(env, num_files=3, lines_per_file=20)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                lines = list(reader.iter_lines())
+            rt.shutdown()
+            assert len(lines) == 60
+            for line in lines:
+                assert isinstance(line, memoryview)
+
+    def test_read_lines_directory(self):
+        """read_lines on a directory returns all lines."""
+        with Environment(lines=15) as env:
+            _create_directory_with_files(env, num_files=4, lines_per_file=15)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                lines = reader.read_lines()
+            rt.shutdown()
+            assert len(lines) == 60
+
+    def test_iter_lines_directory_empty(self):
+        """Directory with no .pfw.gz files yields no lines."""
+        with Environment(lines=10) as env:
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                lines = list(reader.iter_lines())
+            rt.shutdown()
+            assert len(lines) == 0
+
+
+class TestDirectoryIterRaw:
+    """Tests for TraceReader.iter_raw() with a directory path."""
+
+    def test_iter_raw_directory_returns_chunks(self):
+        """iter_raw on a directory yields memoryview chunks from all files."""
+        with Environment(lines=20) as env:
+            _create_directory_with_files(env, num_files=3, lines_per_file=20)
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                chunks = list(reader.iter_raw())
+            rt.shutdown()
+            assert len(chunks) >= 1
+            for chunk in chunks:
+                assert isinstance(chunk, memoryview)
+                assert len(chunk) > 0
+
+    def test_iter_raw_directory_empty(self):
+        """Directory with no .pfw.gz files yields no chunks."""
+        with Environment(lines=10) as env:
+            rt = dft_utils.Runtime(threads=2)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                chunks = list(reader.iter_raw())
+            rt.shutdown()
+            assert len(chunks) == 0
+
+
+class TestDirectoryMultiThreaded:
+    """Tests for directory reading with various thread counts."""
+
+    def test_directory_single_thread(self):
+        """Directory reading works with single thread."""
+        with Environment(lines=20) as env:
+            _create_directory_with_files(env, num_files=3, lines_per_file=20)
+            rt = dft_utils.Runtime(threads=1)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                table = reader.read_arrow(batch_size=100)
+            rt.shutdown()
+            assert table.num_rows == 60
+
+    def test_directory_many_threads(self):
+        """Directory reading works with more threads than files."""
+        with Environment(lines=10) as env:
+            _create_directory_with_files(env, num_files=2, lines_per_file=10)
+            rt = dft_utils.Runtime(threads=8)
+            with dft_utils.TraceReader(env.temp_dir, runtime=rt) as reader:
+                table = reader.read_arrow(batch_size=100)
+            rt.shutdown()
+            assert table.num_rows == 20
diff --git a/tests/python/test_trace_reader_write_arrow.py b/tests/python/test_trace_reader_write_arrow.py
new file mode 100644
index 00000000..5e8645ba
--- /dev/null
+++ b/tests/python/test_trace_reader_write_arrow.py
@@ -0,0 +1,490 @@
+"""Tests for TraceReader.write_arrow with bloom filter pruning."""
+
+import os
+import tempfile
+
+import pyarrow as pa
+import pyarrow.ipc as ipc
+import pytest
+
+import dftracer.utils as dft_utils
+
+from .common import Environment
+
+
+class TestTraceReaderWriteArrow:
+    """Test TraceReader.write_arrow functionality."""
+
+    def test_write_arrow_basic(self):
+        """write_arrow reports summary stats and emits readable Arrow IPC files."""
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_arrow(output_dir)
+
+                assert "partitions" in result
+                assert "total_rows" in result
+                assert "total_bytes" in result
+                assert "chunks_scanned" in result
+                assert "chunks_skipped" in result
+
+                assert result["total_rows"] > 0
+
+                for stats in result["partitions"].values():
+                    assert "files" in stats
+                    assert "rows" in stats
+                    assert len(stats["files"]) > 0
+
+                    # Each reported file must exist on disk and contain rows.
+                    for arrow_file in stats["files"]:
+                        assert os.path.exists(arrow_file)
+                        table = ipc.open_file(arrow_file).read_all()
+                        assert table.num_rows > 0
+
+    def test_write_arrow_predefined_views(self):
+        """The predefined "io" view yields only POSIX/STDIO rows, if it yields any."""
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_arrow(output_dir, views=["io"])
+
+                assert "io" in result["partitions"]
+                io_stats = result["partitions"]["io"]
+
+                allowed_cats = ("POSIX", "STDIO")
+                if io_stats["rows"] > 0:
+                    for arrow_file in io_stats["files"]:
+                        table = ipc.open_file(arrow_file).read_all()
+                        assert table.num_rows > 0
+                        cats = table.column("cat").to_pylist()
+                        for cat in cats:
+                            assert cat in allowed_cats
+
+    def test_write_arrow_custom_query(self):
+        """A custom query view keeps only rows whose name satisfies the query."""
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                read_view = {"name": "reads", "query": 'name == "read"'}
+                result = reader.write_arrow(output_dir, views=[read_view])
+
+                assert "reads" in result["partitions"]
+                read_stats = result["partitions"]["reads"]
+
+                # Rows are only validated when the view reported any.
+                if read_stats["rows"] > 0:
+                    for arrow_file in read_stats["files"]:
+                        table = ipc.open_file(arrow_file).read_all()
+                        names = table.column("name").to_pylist()
+                        for name in names:
+                            assert name == "read"
+
+    def test_write_arrow_bloom_filter_pruning(self):
+        """write_arrow reports non-negative scanned/skipped chunk counters."""
+        with Environment(lines=100) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                posix_view = {"name": "posix_only", "query": 'cat == "POSIX"'}
+                result = reader.write_arrow(output_dir, views=[posix_view])
+
+                # Only presence and sign are checked, not the actual pruning rate.
+                for counter in ("chunks_scanned", "chunks_skipped"):
+                    assert counter in result
+                    assert result[counter] >= 0
+
+    def test_write_arrow_multiple_views(self):
+        """Several views in one call are each reported and written separately."""
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                views = [
+                    {"name": "reads", "query": 'name == "read"'},
+                    {"name": "writes", "query": 'name == "write"'},
+                ]
+                result = reader.write_arrow(output_dir, views=views)
+
+                partitions = result["partitions"]
+                for view in views:
+                    assert view["name"] in partitions
+
+                # Each view gets its own subdirectory under output_dir.
+                for view in views:
+                    assert os.path.isdir(os.path.join(output_dir, view["name"]))
+
+    def test_write_arrow_compression(self):
+        """zstd output has the same row count as uncompressed output but is smaller."""
+        with Environment(lines=30) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result_zstd = reader.write_arrow(
+                    os.path.join(output_dir, "zstd"), compression="zstd"
+                )
+                result_none = reader.write_arrow(
+                    os.path.join(output_dir, "none"), compression="none"
+                )
+
+                assert result_zstd["total_rows"] == result_none["total_rows"]
+
+                # Total on-disk size of the "all" partition for each codec.
+                sizes = {}
+                for codec, result in (("zstd", result_zstd), ("none", result_none)):
+                    files = result["partitions"]["all"]["files"]
+                    sizes[codec] = sum(os.path.getsize(path) for path in files)
+
+                assert sizes["zstd"] < sizes["none"]
+
+    def test_write_arrow_chunk_size(self):
+        """chunk_size_mb=0 leaves the "all" partition in exactly one file."""
+        with Environment(lines=100) as env:
+            gz_file = env.create_test_gzip_file(bytes_per_line=4096)
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_arrow(output_dir, chunk_size_mb=0)
+
+                all_files = result["partitions"]["all"]["files"]
+                assert len(all_files) == 1
+
+    def test_write_arrow_no_metadata(self):
+        """A view with include_metadata=False is accepted and reported by name."""
+        with Environment(lines=30) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                query = 'cat == "POSIX"'
+                view = {"name": "no_meta", "query": query, "include_metadata": False}
+                result = reader.write_arrow(output_dir, views=[view])
+                assert "no_meta" in result["partitions"]
+
+
+class TestElasticArrowSchema:
+    """Test elastic Arrow schema with varying event fields."""
+
+    def test_varying_schema_single_file(self):
+        """All IPC files written from a varying-schema trace share one schema."""
+        with Environment(lines=500) as env:
+            gz_file = env.create_varying_schema_file()
+            env.build_index(gz_file, checkpoint_size_bytes=4 * 1024)
+            reader = dft_utils.TraceReader(gz_file, checkpoint_size=4 * 1024)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_arrow(output_dir)
+
+                assert result["total_rows"] > 0
+                arrow_files = result["partitions"]["all"]["files"]
+                assert len(arrow_files) >= 1
+
+                file_schemas = []
+                for arrow_file in arrow_files:
+                    ipc_reader = ipc.open_file(arrow_file)
+                    file_schemas.append(ipc_reader.schema)
+
+                # At least one file exists (asserted above), so index 0 is safe.
+                first_schema = file_schemas[0]
+                for i, schema in enumerate(file_schemas[1:], 1):
+                    assert schema.equals(first_schema), (
+                        f"Schema mismatch between file 0 and file {i}"
+                    )
+
+    def test_varying_schema_column_order_stable(self):
+        """Every output table lists its columns in the same order as the first."""
+        with Environment(lines=1000) as env:
+            gz_file = env.create_varying_schema_file(num_events=1000)
+            env.build_index(gz_file, checkpoint_size_bytes=2 * 1024)
+            reader = dft_utils.TraceReader(gz_file, checkpoint_size=2 * 1024)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_arrow(output_dir, batch_size=100)
+
+                assert result["total_rows"] > 0
+                stats = result["partitions"]["all"]
+
+                all_tables = [
+                    ipc.open_file(arrow_file).read_all()
+                    for arrow_file in stats["files"]
+                ]
+
+                if len(all_tables) > 1:
+                    reference_columns = all_tables[0].column_names
+                    for i, table in enumerate(all_tables[1:], 1):
+                        assert table.column_names == reference_columns, (
+                            f"Column order mismatch between table 0 and table {i}"
+                        )
+
+    def test_varying_schema_null_backfill(self):
+        """A "rare_field" column, when present, mixes null and non-null values."""
+        with Environment(lines=500) as env:
+            gz_file = env.create_varying_schema_file()
+            env.build_index(gz_file, checkpoint_size_bytes=4 * 1024)
+            reader = dft_utils.TraceReader(gz_file, checkpoint_size=4 * 1024)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_arrow(output_dir)
+
+                arrow_files = result["partitions"]["all"]["files"]
+                tables = [ipc.open_file(path).read_all() for path in arrow_files]
+                combined = pa.concat_tables(tables)
+
+                if "rare_field" in combined.column_names:
+                    rare_col = combined.column("rare_field")
+                    nulls = rare_col.null_count
+                    assert nulls > 0, "rare_field should have null values"
+                    assert nulls < len(rare_col), "rare_field should have some non-null values"
+
+    def test_varying_schema_pyarrow_concat(self):
+        """Concatenating several IPC files with pyarrow keeps the total row count."""
+        with Environment(lines=1000) as env:
+            gz_file = env.create_varying_schema_file(num_events=1000)
+            env.build_index(gz_file, checkpoint_size_bytes=2 * 1024)
+            reader = dft_utils.TraceReader(gz_file, checkpoint_size=2 * 1024)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_arrow(output_dir)
+
+                arrow_files = result["partitions"]["all"]["files"]
+                if len(arrow_files) > 1:
+                    tables = [ipc.open_file(path).read_all() for path in arrow_files]
+                    concatenated_rows = pa.concat_tables(tables).num_rows
+                    assert concatenated_rows == result["total_rows"]
+
+
+class TestTraceReaderWriteArrowDask:
+    """Test write_arrow integration with Dask."""
+
+    def test_write_arrow_dask_read(self):
+        """Rows written by write_arrow survive a pandas -> Dask conversion."""
+        pytest.importorskip("dask")
+        pytest.importorskip("dask.dataframe")
+        import dask.dataframe as dd
+
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_arrow(output_dir)
+
+                arrow_files = result["partitions"]["all"]["files"]
+                assert len(arrow_files) > 0
+
+                tables = [
+                    ipc.open_file(arrow_file).read_all()
+                    for arrow_file in arrow_files
+                ]
+                combined = pa.concat_tables(tables)
+                pdf = combined.to_pandas()
+
+                # The Dask frame is built from pandas, not from the IPC files.
+                ddf = dd.from_pandas(pdf, npartitions=2)
+                assert len(ddf) == result["total_rows"]
+
+    def test_write_arrow_parallel_views_dask(self):
+        """Per-view row counts match after loading each view's files into Dask."""
+        pytest.importorskip("dask")
+        pytest.importorskip("dask.dataframe")
+        import dask.dataframe as dd
+
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                views = [
+                    {"name": "posix", "query": 'cat == "POSIX"'},
+                    {"name": "stdio", "query": 'cat == "STDIO"'},
+                ]
+                result = reader.write_arrow(output_dir, views=views)
+
+                for view_name in ["posix", "stdio"]:
+                    view_stats = result["partitions"][view_name]
+                    if view_stats["rows"] > 0:
+                        tables = [ipc.open_file(f).read_all() for f in view_stats["files"]]
+                        pdf = pa.concat_tables(tables).to_pandas()
+                        ddf = dd.from_pandas(pdf, npartitions=1)
+                        assert len(ddf) == view_stats["rows"]
+
+
+class TestTraceReaderViewChunks:
+    """Test get_view_chunks and write_view_chunk APIs."""
+
+    def test_get_view_chunks_basic(self):
+        """get_view_chunks returns chunk and checkpoint metadata for a view."""
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            view = {"name": "all", "query": 'cat == "POSIX"'}
+            result = reader.get_view_chunks(view=view)
+
+            keys = ("chunks", "total_checkpoints", "skipped_checkpoints", "file_may_match")
+            for key in keys:
+                assert key in result
+            assert result["total_checkpoints"] >= 0
+
+    def test_write_view_chunk_basic(self):
+        """write_view_chunk writes one chunk to the requested Arrow IPC file."""
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+            reader = dft_utils.TraceReader(gz_file)
+
+            chunks = reader.get_view_chunks()["chunks"]
+            if not chunks:
+                pytest.skip("No chunks to process")
+
+            first_chunk = chunks[0]
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_view_chunk(
+                    output_file=os.path.join(output_dir, "chunk-00000.arrow"),
+                    checkpoint_idx=first_chunk["checkpoint_idx"],
+                    start_byte=first_chunk["start_byte"],
+                    end_byte=first_chunk["end_byte"],
+                )
+
+                assert "output_file" in result
+                assert "rows_written" in result
+                written_path = result["output_file"]
+                assert os.path.exists(written_path)
+
+                # Row counts are cross-checked only when rows were written.
+                if result["rows_written"] > 0:
+                    table = ipc.open_file(written_path).read_all()
+                    assert table.num_rows == result["rows_written"]
+
+    def test_write_view_chunks_parallel(self):
+        """write_view_chunks returns one result per chunk with consistent totals."""
+        with Environment(lines=5000) as env:
+            gz_file = env.create_test_gzip_file(bytes_per_line=512)
+            env.build_index(gz_file, checkpoint_size_bytes=4 * 1024)
+            reader = dft_utils.TraceReader(gz_file, checkpoint_size=4 * 1024)
+
+            available = reader.get_view_chunks()["chunks"]
+            if len(available) < 2:
+                pytest.skip("Need at least 2 chunks for parallel test")
+
+            chunks = available[:4]
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = reader.write_view_chunks(
+                    chunks=chunks,
+                    output_dir=output_dir,
+                )
+
+                assert "results" in result
+                assert "total_rows" in result
+                assert "total_events_matched" in result
+
+                assert len(result["results"]) == len(chunks)
+
+                rows_seen = 0
+                for chunk_result in result["results"]:
+                    assert "output_file" in chunk_result
+                    assert "rows_written" in chunk_result
+                    rows_written = chunk_result["rows_written"]
+                    if rows_written > 0:
+                        assert os.path.exists(chunk_result["output_file"])
+                        table = ipc.open_file(chunk_result["output_file"]).read_all()
+                        assert table.num_rows == rows_written
+                        rows_seen += rows_written
+
+                assert result["total_rows"] == rows_seen
+
+
+class TestDistributedWriteArrow:
+    """Test distributed_write_arrow with Dask."""
+
+    def test_distributed_write_arrow_basic(self):
+        """distributed_write_arrow with a POSIX query view writes readable files."""
+        pytest.importorskip("dask")
+        from dftracer.utils.arrow import read_arrow
+        from dftracer.utils.dask import distributed_write_arrow
+
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                posix_view = {"name": "all", "query": 'cat == "POSIX"'}
+                result = distributed_write_arrow(gz_file, output_dir, view=posix_view)
+
+                assert "files" in result
+                assert "total_chunks" in result
+                assert "skipped_chunks" in result
+                assert "total_rows" in result
+
+                # Read-back is only checked when files were reported.
+                if result["files"]:
+                    table = read_arrow(result["files"])
+                    assert table is not None
+                    assert table.num_rows == result["total_rows"]
+
+    def test_distributed_write_arrow_with_view(self):
+        """Test distributed_write_arrow with no view argument (NOTE(review): name says with_view)."""
+        pytest.importorskip("dask")
+        from dftracer.utils.arrow import read_arrow
+        from dftracer.utils.dask import distributed_write_arrow
+
+        with Environment(lines=50) as env:
+            gz_file = env.create_test_gzip_file()
+            env.build_index(gz_file)
+
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = distributed_write_arrow(gz_file, output_dir)
+
+                assert "files" in result
+                if result["files"]:
+                    table = read_arrow(result["files"])
+                    assert table is not None
+
+    def test_distributed_write_arrow_batched(self):
+        """distributed_write_arrow with chunks_per_task=2 reports consistent rows."""
+        pytest.importorskip("dask")
+        from dftracer.utils.arrow import read_arrow
+        from dftracer.utils.dask import distributed_write_arrow
+
+        with Environment(lines=5000) as env:
+            checkpoint_size = 4 * 1024
+            gz_file = env.create_test_gzip_file(bytes_per_line=512)
+            env.build_index(gz_file, checkpoint_size_bytes=checkpoint_size)
+            with tempfile.TemporaryDirectory() as output_dir:
+                result = distributed_write_arrow(
+                    gz_file,
+                    output_dir,
+                    view={"name": "all", "query": 'cat == "POSIX"'},
+                    checkpoint_size=checkpoint_size,
+                    chunks_per_task=2,
+                )
+
+                for key in ("files", "total_chunks", "total_rows"):
+                    assert key in result
+
+                # Read-back is only checked when files were reported.
+                if result["files"]:
+                    table = read_arrow(result["files"])
+                    assert table is not None
+                    assert table.num_rows == result["total_rows"]
diff --git a/tests/replay/test_replay_fidelity.cpp b/tests/replay/test_replay_fidelity.cpp
new file mode 100644
index 00000000..1bd7c4b8
--- /dev/null
+++ b/tests/replay/test_replay_fidelity.cpp
@@ -0,0 +1,271 @@
+// Fidelity tests for ReplayEngine.
+//
+// "Fidelity" here means: when maintain_timing is on, each event is dispatched
+// at wall-clock time close to its scheduled position on the trace timeline.
+// Excessive lateness compounds into wrong inter-event gaps, defeating the
+// point of timing-preserved replay. Two failure modes we want to catch:
+//
+//   1. Per-event lateness: an individual event fires more than a few ms
+//      late vs. when apply_timing should have woken up.
+//   2. End-to-end drift: total wall-clock duration diverges from the trace's
+//      timespan. Sensitive to the apply_timing anchor bug (where
+//      replay_start_time_ wasn't reset on the first event, making every
+//      subsequent sleep be skipped).
+
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/common/logging.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
+#include <dftracer/utils/utilities/replay/replay.h>
+#include <doctest/doctest.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <fstream>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::utilities::replay;
+
+namespace {
+
+// Generate a `.pfw` trace with `n` events spaced `step_us` microseconds
+// apart, starting at ts=1'000'000. Every event is a POSIX "read" with empty
+// args. NOTE(review): this comment used to say the tests rely on dry_run=true
+// to skip real I/O, but every test case below sets dry_run = false - confirm.
+void write_evenly_spaced_trace(const std::string& path, std::size_t n,
+                               std::uint64_t step_us) {
+    std::ofstream f(path);
+    REQUIRE(f.is_open());
+    const std::uint64_t base_ts = 1'000'000;
+    f << "[\n";
+    for (std::size_t i = 0; i < n; ++i) {
+        f << R"({"id":)" << i
+          << R"(,"name":"read","cat":"POSIX","pid":12345,"tid":12345,"ts":)"
+          << (base_ts + i * step_us) << R"(,"dur":10,"ph":"X","args":{}})";
+        if (i + 1 < n) f << ",";
+        f << "\n";
+    }
+    f << "]";
+}
+
+struct DispatchSample {
+    std::uint64_t trace_ts;  // microseconds since arbitrary trace epoch
+    std::chrono::steady_clock::time_point wall;  // `now` handed to on_dispatch
+};
+
+// Log of (trace_ts, wall_now) pairs in dispatch order. on_dispatch currently
+// runs on the consumer thread only; the mutex guards the vector in case a
+// future executor change dispatches from several workers.
+struct DispatchRecorder {
+    std::mutex m;
+    std::vector<DispatchSample> samples;
+
+    void record(const Trace& t, std::chrono::steady_clock::time_point now) {
+        const std::lock_guard<std::mutex> guard(m);
+        samples.push_back(DispatchSample{t.time_start, now});
+    }
+};
+
+struct FidelityStats {
+    std::int64_t max_lateness_us = 0;         // worst per-event lateness
+    std::int64_t p99_lateness_us = 0;         // 99th-percentile lateness
+    std::int64_t total_wall_span_us = 0;      // first-to-last sample, wall
+    std::int64_t expected_trace_span_us = 0;  // first-to-last sample, trace
+};
+
+FidelityStats analyze(const std::vector<DispatchSample>& samples) {
+    REQUIRE(samples.size() >= 2);
+
+    const DispatchSample& head = samples.front();
+    const DispatchSample& tail = samples.back();
+
+    // Lateness = actual dispatch time minus where the event should have
+    // landed had the trace timeline been replayed exactly from `head`.
+    std::vector<std::int64_t> lateness;
+    lateness.reserve(samples.size());
+    for (const DispatchSample& s : samples) {
+        const auto scheduled =
+            head.wall + std::chrono::microseconds(s.trace_ts - head.trace_ts);
+        lateness.push_back(
+            std::chrono::duration_cast<std::chrono::microseconds>(s.wall -
+                                                                  scheduled)
+                .count());
+    }
+    // Sorting once yields both the maximum (last entry) and the p99 entry.
+    std::sort(lateness.begin(), lateness.end());
+    FidelityStats out;
+    out.max_lateness_us = lateness.back();
+    out.p99_lateness_us = lateness[(lateness.size() * 99) / 100];
+    out.total_wall_span_us =
+        std::chrono::duration_cast<std::chrono::microseconds>(tail.wall -
+                                                              head.wall)
+            .count();
+    out.expected_trace_span_us =
+        static_cast<std::int64_t>(tail.trace_ts - head.trace_ts);
+    return out;
+}
+
+// Fidelity tolerances
+bool is_ci_env() {  // true when CI or GITHUB_ACTIONS is set, to any value
+    if (std::getenv("CI") != nullptr) return true;
+    return std::getenv("GITHUB_ACTIONS") != nullptr;
+}
+
+struct Tolerances {
+    // Microsecond limits; a negative value skips the corresponding check.
+    std::int64_t max_per_event_us;
+    std::int64_t max_p99_us;
+    double wall_span;  // allowed relative deviation of wall span (0.25 = 25%)
+};
+
+Tolerances tolerances() {
+    // Local dev: tight bounds (10 ms max, 5 ms p99, +/-25% span) catch
+    // regressions early. CI: negative limits skip the per-event checks.
+    const Tolerances local{/*max_per_event_us=*/10'000, /*max_p99_us=*/5'000,
+                           /*wall_span=*/0.25};
+    const Tolerances ci{/*max_per_event_us=*/-1, /*max_p99_us=*/-1,
+                        /*wall_span=*/1.0};
+    return is_ci_env() ? ci : local;
+}
+
+void check_fidelity(const FidelityStats& s, const char* label) {
+    const bool in_ci = is_ci_env();
+    const Tolerances t = tolerances();
+    INFO("[" << label << " ci=" << in_ci << "] max_lateness="
+             << s.max_lateness_us << "us p99=" << s.p99_lateness_us
+             << "us wall=" << s.total_wall_span_us
+             << "us trace=" << s.expected_trace_span_us << "us");
+    if (t.max_per_event_us >= 0) {
+        CHECK(s.max_lateness_us <= t.max_per_event_us);
+    }
+    if (t.max_p99_us >= 0) {
+        CHECK(s.p99_lateness_us <= t.max_p99_us);
+    }
+    // Wall span must stay within trace span * (1 -/+ wall_span); no cap in CI.
+    const auto span = static_cast<double>(s.expected_trace_span_us);
+    const auto low = static_cast<std::int64_t>(span * (1.0 - t.wall_span));
+    CHECK(s.total_wall_span_us >= low);
+
+    if (!in_ci) {
+        const auto high = static_cast<std::int64_t>(span * (1.0 + t.wall_span));
+        CHECK(s.total_wall_span_us <= high);
+    }
+}
+
+}  // namespace
+
+TEST_CASE("Replay fidelity - sync path") {
+    DFTRACER_UTILS_LOGGER_INIT();
+    // Scratch directory under the system temp path; removed at the end of
+    fs::path temp_dir = fs::temp_directory_path() / "dftracer_replay_fid_sync";
+    fs::create_directories(temp_dir);
+    std::string trace_file = (temp_dir / "fid.pfw").string();
+    // this test body. Fixture: 40 events, 5 ms apart (195 ms trace span).
+    constexpr std::size_t N = 40;
+    constexpr std::uint64_t STEP_US = 5'000;
+    write_evenly_spaced_trace(trace_file, N, STEP_US);
+    // Record (trace timestamp, wall time) for every on_dispatch callback.
+    DispatchRecorder rec;
+    ReplayConfig config;
+    config.maintain_timing = true;
+    config.dry_run = false;
+    config.on_dispatch = [&rec](const Trace& t,
+                                std::chrono::steady_clock::time_point now) {
+        rec.record(t, now);
+    };
+
+    ReplayEngine engine(config);
+    auto result = engine.replay(trace_file);
+    CHECK(result.total_events == N);
+
+    auto stats = analyze(rec.samples);
+    check_fidelity(stats, "sync");
+    // Best-effort cleanup: any error reported through `ec` is ignored.
+    std::error_code ec;
+    fs::remove_all(temp_dir, ec);
+}
+
+TEST_CASE("Replay fidelity - pipelined path") {
+    DFTRACER_UTILS_LOGGER_INIT();
+    // Scratch directory under the system temp path for the fixture trace.
+    fs::path temp_dir =
+        fs::temp_directory_path() / "dftracer_replay_fid_pipelined";
+    fs::create_directories(temp_dir);
+    std::string trace_file = (temp_dir / "fid.pfw").string();
+    // Fixture: 40 events, 5 ms apart (195 ms trace span).
+    constexpr std::size_t N = 40;
+    constexpr std::uint64_t STEP_US = 5'000;
+    write_evenly_spaced_trace(trace_file, N, STEP_US);
+
+    DispatchRecorder rec;
+    ReplayConfig config;
+    config.maintain_timing = true;
+    config.dry_run = false;
+    config.on_dispatch = [&rec](const Trace& t,
+                                std::chrono::steady_clock::time_point now) {
+        rec.record(t, now);
+    };
+
+    ReplayEngine engine(config);
+    ReplayResult result;
+    std::vector<std::string> files = {trace_file};
+    // Drive run_pipelined as the source task of a parallel(4) pipeline.
+    Pipeline pipeline(PipelineConfig::parallel(4));
+    auto root = make_task(
+        [&engine, &files, &result](CoroScope& scope) -> coro::CoroTask<void> {
+            co_await engine.run_pipelined(scope, files, result, /*cap=*/64);
+        },
+        "replay_pipelined");
+    pipeline.set_source(root);
+    pipeline.execute();
+    // NOTE(review): assumes execute() returns only after `root` completed.
+    CHECK(result.total_events == N);
+    auto stats = analyze(rec.samples);
+    check_fidelity(stats, "pipelined");
+    // Best-effort cleanup: any error reported through `ec` is ignored.
+    std::error_code ec;
+    fs::remove_all(temp_dir, ec);
+}
+
+TEST_CASE("Replay fidelity - first-event anchor reset survives warmup gap") {
+    DFTRACER_UTILS_LOGGER_INIT();
+    // Scratch directory under the system temp path for the fixture trace.
+    fs::path temp_dir =
+        fs::temp_directory_path() / "dftracer_replay_fid_anchor";
+    fs::create_directories(temp_dir);
+    std::string trace_file = (temp_dir / "fid.pfw").string();
+
+    constexpr std::size_t N = 20;
+    constexpr std::uint64_t STEP_US = 5'000;  // 95ms span
+    write_evenly_spaced_trace(trace_file, N, STEP_US);
+
+    DispatchRecorder rec;
+    ReplayConfig config;
+    config.maintain_timing = true;
+    config.dry_run = false;
+    config.on_dispatch = [&rec](const Trace& t,
+                                std::chrono::steady_clock::time_point now) {
+        rec.record(t, now);
+    };
+
+    ReplayEngine engine(config);
+    // Idle 100 ms (> 95 ms trace span) between construction and replay().
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+    auto result = engine.replay(trace_file);
+    CHECK(result.total_events == N);
+    // With every sleep skipped the wall span would collapse; require >= 70 ms.
+    auto stats = analyze(rec.samples);
+    CHECK(stats.total_wall_span_us >= 70'000);
+    check_fidelity(stats, "anchor-reset");
+    // Best-effort cleanup: any error reported through `ec` is ignored.
+    std::error_code ec;
+    fs::remove_all(temp_dir, ec);
+}
diff --git a/tests/utilities/CMakeLists.txt b/tests/utilities/CMakeLists.txt
index 1818992b..c3ab418f 100644
--- a/tests/utilities/CMakeLists.txt
+++ b/tests/utilities/CMakeLists.txt
@@ -55,6 +55,7 @@ set(UTILITIES_TEST_SOURCES
     composites/dft/statistics/test_statistics_aggregator.cpp
     composites/dft/statistics/test_statistics_query.cpp
     common/statistics/test_log2_histogram.cpp
+    common/statistics/test_timestamp_histogram.cpp
     composites/dft/statistics/test_detailed_statistics.cpp
 
     # Query language
@@ -67,9 +68,13 @@ set(UTILITIES_TEST_SOURCES
     composites/dft/aggregators/test_aggregation_metrics.cpp
     composites/dft/aggregators/test_aggregation_key.cpp
     composites/dft/aggregators/test_aggregation_config.cpp
+    composites/dft/aggregators/test_aggregation_serialization.cpp
     composites/dft/aggregators/test_aggregator_utility.cpp
     composites/dft/aggregators/test_chunk_aggregator_utility.cpp
     composites/dft/aggregators/test_event_aggregator_utility.cpp
+    composites/dft/aggregators/test_aggregation_augmentation.cpp
+    composites/dft/aggregators/test_system_metrics.cpp
+    composites/dft/aggregators/test_system_metrics_merge_operator.cpp
 
     # DFT Comparator Composites
     composites/dft/comparator/test_comparison_result.cpp
@@ -82,6 +87,7 @@ set(UTILITIES_TEST_SOURCES
     indexer/test_index_database.cpp
     indexer/test_provenance_database.cpp
     indexer/test_index_builder.cpp
+    indexer/test_sst_ingest_spike.cpp
 
     # Compression
     compression/zlib/test_streaming_compressor.cpp
@@ -101,6 +107,12 @@ set(UTILITIES_TEST_SOURCES
     fileio/test_streaming_file_reader.cpp
     fileio/test_streaming_file_writer.cpp
 
+    # I/O Parallel
+    fileio/parallel/test_layout_sizing.cpp
+    fileio/parallel/test_striped_writer.cpp
+    fileio/parallel/test_sharded_writer.cpp
+    fileio/parallel/test_padded_striped_writer.cpp
+
     # I/O Lines
     fileio/lines/test_streaming_line_reader.cpp
     fileio/lines/sources/test_indexed_file_line_iterator.cpp
@@ -160,6 +172,7 @@ endif()
 # +++++++++++++++++++++++++++++++++++++++++
 
 if(DFTRACER_UTILS_ENABLE_ARROW_IPC)
+  # IPC Writer tests
   set(ARROW_IPC_TEST_SOURCE common/arrow/test_arrow_ipc_writer.cpp)
   string(REPLACE ".cpp" "" arrow_ipc_bin_exec ${ARROW_IPC_TEST_SOURCE})
   string(REPLACE "/" "_" arrow_ipc_target "utilities_${arrow_ipc_bin_exec}")
@@ -188,6 +201,37 @@ if(DFTRACER_UTILS_ENABLE_ARROW_IPC)
   add_test(NAME utilities/${arrow_ipc_bin_exec} COMMAND ${arrow_ipc_target})
   set_tests_properties(utilities/${arrow_ipc_bin_exec} PROPERTIES
                        WORKING_DIRECTORY "${arrow_ipc_workdir}")
+
+  # IPC Reader tests
+  set(ARROW_IPC_READER_SOURCE common/arrow/test_arrow_ipc_reader.cpp)
+  string(REPLACE ".cpp" "" arrow_ipc_reader_bin ${ARROW_IPC_READER_SOURCE})
+  string(REPLACE "/" "_" arrow_ipc_reader_target "utilities_${arrow_ipc_reader_bin}")
+
+  add_executable(${arrow_ipc_reader_target} ${ARROW_IPC_READER_SOURCE})
+  target_include_directories(${arrow_ipc_reader_target} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)
+  target_link_libraries(${arrow_ipc_reader_target} PRIVATE doctest::doctest dftracer_utils testing_utilities)
+  link_nanoarrow(${arrow_ipc_reader_target} STATIC)
+  target_set_warnings(${arrow_ipc_reader_target})
+  target_enable_coroutine(${arrow_ipc_reader_target})
+
+  get_filename_component(arrow_ipc_reader_dir ${arrow_ipc_reader_bin} DIRECTORY)
+  get_filename_component(arrow_ipc_reader_name ${arrow_ipc_reader_bin} NAME)
+  if(arrow_ipc_reader_dir)
+    set_target_properties(${arrow_ipc_reader_target} PROPERTIES
+      RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${arrow_ipc_reader_dir}")
+  endif()
+  set_target_properties(${arrow_ipc_reader_target} PROPERTIES OUTPUT_NAME ${arrow_ipc_reader_name})
+
+  if(DFTRACER_UTILS_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+    target_compile_options(${arrow_ipc_reader_target} PRIVATE --coverage -fprofile-arcs -ftest-coverage)
+    target_link_libraries(${arrow_ipc_reader_target} PRIVATE --coverage)
+  endif()
+
+  set(arrow_ipc_reader_workdir "${CMAKE_CURRENT_BINARY_DIR}/workdirs/${arrow_ipc_reader_target}")
+  file(MAKE_DIRECTORY "${arrow_ipc_reader_workdir}")
+  add_test(NAME utilities/${arrow_ipc_reader_bin} COMMAND ${arrow_ipc_reader_target})
+  set_tests_properties(utilities/${arrow_ipc_reader_bin} PROPERTIES
+                       WORKING_DIRECTORY "${arrow_ipc_reader_workdir}")
 endif()
 
 # +++++++++++++++++++++++++++++++++++++++++
diff --git a/tests/utilities/call_tree/test_call_tree_internal.cpp b/tests/utilities/call_tree/test_call_tree_internal.cpp
index 1aea5b54..04f6a4dc 100644
--- a/tests/utilities/call_tree/test_call_tree_internal.cpp
+++ b/tests/utilities/call_tree/test_call_tree_internal.cpp
@@ -4,6 +4,11 @@
 #include <dftracer/utils/call_tree/internal/node.h>
 #include <dftracer/utils/call_tree/internal/process_call_tree.h>
 #include <dftracer/utils/call_tree/internal/process_key.h>
+#include <dftracer/utils/call_tree/mpi/serializable.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/pipeline/pipeline.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
 #include <doctest/doctest.h>
 
 #include <memory>
@@ -91,17 +96,18 @@ TEST_CASE("CallTreeFactory - Create nodes") {
     }
 
     SUBCASE("Create node with arguments") {
-        std::unordered_map<std::string, std::string> args;
-        args["arg1"] = "value1";
-        args["arg2"] = "value2";
+        dftracer::utils::call_tree::internal::ArgsMap args;
+        args.set_valid(true);
+        args.insert("arg1", std::string("value1"));
+        args.insert("arg2", std::string("value2"));
 
         auto node = factory.create_node(2, "test_func", "category", 2000, 1000,
-                                        1, args);
+                                        1, std::move(args));
 
         CHECK(node != nullptr);
-        CHECK(node->get_args().size() == 2);
-        CHECK(node->get_args().at("arg1") == "value1");
-        CHECK(node->get_args().at("arg2") == "value2");
+        CHECK(node->get_args().raw().size() == 2);
+        CHECK(node->get_args()["arg1"].get<std::string>() == "value1");
+        CHECK(node->get_args()["arg2"].get<std::string>() == "value2");
     }
 
     SUBCASE("Multiple nodes with unique IDs") {
@@ -246,3 +252,154 @@ TEST_CASE("CallTree - Integration test with nodes") {
 
     tree.cleanup();
 }
+
+// ============================================================================
+// Save / Load round-trips
+// ============================================================================
+
+namespace {
+
+using dftracer::utils::CoroScope;
+using dftracer::utils::make_task;
+using dftracer::utils::Pipeline;
+namespace coro = dftracer::utils::coro;
+using dftracer::utils::call_tree::load_arrow;
+using dftracer::utils::call_tree::load_binary;
+using dftracer::utils::call_tree::save_arrow;
+using dftracer::utils::call_tree::save_binary;
+
+// Two-process fixture: each has a root "main" (id 1) with one "child" (id 2).
+std::unique_ptr<CallTree> make_fixture() {
+    auto fixture = std::make_unique<CallTree>();
+    fixture->initialize();
+    auto populate = [&](std::uint32_t pid, std::uint32_t tid,
+                        std::uint32_t pkid) {
+        ProcessKey proc(pid, tid, pkid);
+        dftracer::utils::utilities::composites::dft::ArgsMap root_args;
+        root_args.set_valid(true);
+        root_args.insert("level", static_cast<std::uint64_t>(0));
+        root_args.insert("tid", static_cast<std::uint64_t>(tid));
+        root_args.insert("fhash", std::string("abc123"));
+        auto root = fixture->get_factory().create_node(
+            1, "main", "function", 0, 1000, 0, std::move(root_args));
+        dftracer::utils::utilities::composites::dft::ArgsMap child_args;
+        child_args.set_valid(true);
+        child_args.insert("level", static_cast<std::uint64_t>(1));
+        child_args.insert("tid", static_cast<std::uint64_t>(tid));
+        auto child = fixture->get_factory().create_node(
+            2, "child", "function", 100, 500, 1, std::move(child_args));
+        child->set_parent_id(1);
+        root->add_child(2);
+        fixture->add_call(proc, root);
+        fixture->add_call(proc, child);
+        auto* graph = fixture->get(proc);
+        graph->root_calls.push_back(1);
+        graph->call_sequence = {1, 2};
+    };
+    populate(100, 200, 0);
+    populate(101, 201, 0);
+    return fixture;
+}
+
+// Saves src to path, reloads it in the same pipeline task, reports both steps.
+template <typename SaveFn, typename LoadFn>
+std::unique_ptr<CallTree> roundtrip(const CallTree& src,
+                                    const std::string& path, SaveFn save_fn,
+                                    LoadFn load_fn, bool* save_ok_out,
+                                    bool* load_ok_out) {
+    struct Ctx {
+        const CallTree* src;
+        std::string path;
+        std::unique_ptr<CallTree> loaded;
+        bool save_ok = false;
+        bool load_ok = false;
+    };
+    Ctx ctx{&src, path, nullptr, false, false};
+    Pipeline pipeline;
+    auto run = make_task(
+        [&ctx, save_fn, load_fn](CoroScope& scope) -> coro::CoroTask<void> {
+            ctx.save_ok = co_await save_fn(&scope, *ctx.src, ctx.path);
+            if (ctx.save_ok) {  // the load step is skipped when saving failed
+                ctx.loaded = co_await load_fn(&scope, ctx.path);
+                ctx.load_ok = (ctx.loaded != nullptr);
+            }
+        },
+        "save_load");
+    pipeline.set_source(run);
+    pipeline.set_destination(run);  // the one task is both source and sink
+    pipeline.execute();
+    *save_ok_out = ctx.save_ok;
+    *load_ok_out = ctx.load_ok;
+    return std::move(ctx.loaded);  // nullptr unless the load step succeeded
+}
+
+// Checks graph sizes and per-node fields match; args are compared by count.
+void check_structure_matches(const CallTree& src, const CallTree& loaded) {
+    auto src_keys = const_cast<CallTree&>(src).keys();
+    auto loaded_keys = const_cast<CallTree&>(loaded).keys();
+    CHECK(src_keys.size() == loaded_keys.size());
+    for (const auto& key : src_keys) {
+        auto* sg = const_cast<CallTree&>(src).get(key);  // source graph
+        auto* lg = const_cast<CallTree&>(loaded).get(key);  // loaded graph
+        REQUIRE(sg != nullptr);
+        REQUIRE(lg != nullptr);
+        CHECK(sg->calls.size() == lg->calls.size());
+        CHECK(sg->root_calls.size() == lg->root_calls.size());
+        CHECK(sg->call_sequence.size() == lg->call_sequence.size());
+        for (const auto& [id, sn] : sg->calls) {
+            auto it = lg->calls.find(id);
+            REQUIRE(it != lg->calls.end());
+            const auto& ln = it->second;
+            CHECK(sn->get_name() == ln->get_name());
+            CHECK(sn->get_category() == ln->get_category());
+            CHECK(sn->get_start_time() == ln->get_start_time());
+            CHECK(sn->get_duration() == ln->get_duration());
+            CHECK(sn->get_level() == ln->get_level());
+            CHECK(sn->get_parent_id() == ln->get_parent_id());
+            CHECK(sn->get_children().size() == ln->get_children().size());
+            CHECK(sn->get_args().raw().size() == ln->get_args().raw().size());
+        }
+    }
+}
+
+}  // namespace
+
+TEST_CASE("CallTree - custom binary save/load round-trip") {
+    auto tmp = fs::temp_directory_path() /
+               ("ct_binary_test_" + std::to_string(::getpid()));
+    fs::remove_all(tmp);
+    fs::create_directories(tmp);
+    auto path = (tmp / "tree.bin").string();
+    // Round-trip the fixture through save_binary/load_binary via `path`.
+    auto src = make_fixture();
+    bool save_ok = false, load_ok = false;
+    auto loaded =
+        roundtrip(*src, path, save_binary, load_binary, &save_ok, &load_ok);
+    REQUIRE(save_ok);
+    REQUIRE(load_ok);
+    REQUIRE(loaded != nullptr);
+    check_structure_matches(*src, *loaded);
+    // Cleanup; not reached when a REQUIRE above aborts the test case.
+    fs::remove_all(tmp);
+}
+
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+TEST_CASE("CallTree - arrow IPC save/load round-trip") {
+    auto tmp = fs::temp_directory_path() /
+               ("ct_arrow_test_" + std::to_string(::getpid()));
+    fs::remove_all(tmp);
+    fs::create_directories(tmp);
+    auto path = (tmp / "tree.arrow").string();
+    // Round-trip the fixture through save_arrow/load_arrow via `path`.
+    auto src = make_fixture();
+    bool save_ok = false, load_ok = false;
+    auto loaded =
+        roundtrip(*src, path, save_arrow, load_arrow, &save_ok, &load_ok);
+    REQUIRE(save_ok);
+    REQUIRE(load_ok);
+    REQUIRE(loaded != nullptr);
+    check_structure_matches(*src, *loaded);
+    // Cleanup; not reached when a REQUIRE above aborts the test case.
+    fs::remove_all(tmp);
+}
+#endif
diff --git a/tests/utilities/common/arrow/test_arrow_column_builder.cpp b/tests/utilities/common/arrow/test_arrow_column_builder.cpp
index 1a98462e..0eb25bc2 100644
--- a/tests/utilities/common/arrow/test_arrow_column_builder.cpp
+++ b/tests/utilities/common/arrow/test_arrow_column_builder.cpp
@@ -1,3 +1,4 @@
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW
 
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
diff --git a/tests/utilities/common/arrow/test_arrow_ipc_reader.cpp b/tests/utilities/common/arrow/test_arrow_ipc_reader.cpp
new file mode 100644
index 00000000..435cb504
--- /dev/null
+++ b/tests/utilities/common/arrow/test_arrow_ipc_reader.cpp
@@ -0,0 +1,528 @@
+#include <dftracer/utils/core/common/config.h>
+#ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
+
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/runtime.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/common/arrow/arrow.h>
+#include <doctest/doctest.h>
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::coro;
+using namespace dftracer::utils::utilities::common::arrow;
+
+static std::string tmp_path(const char* name) {
+    return fs::path(fs::temp_directory_path()).append(name).string();
+}
+
+// Writes num_batches batches of rows_per_batch (id, name, value) rows to path.
+// Any failing writer step is surfaced as a doctest CHECK failure.
+static void write_test_file(const std::string& path, int num_batches,
+                            int rows_per_batch,
+                            IpcCompression compression = IpcCompression::NONE) {
+    Runtime runtime(2);
+    auto task = [&]() -> CoroTask<int> {
+        IpcWriter writer;
+        if (co_await writer.open(path, compression) != 0) co_return 1;
+
+        RecordBatchBuilder builder;
+        builder.declare_schema({{"id", ColumnType::INT64},
+                                {"name", ColumnType::STRING},
+                                {"value", ColumnType::DOUBLE}});
+
+        for (int b = 0; b < num_batches; ++b) {
+            builder.reserve(rows_per_batch);
+            for (int i = 0; i < rows_per_batch; ++i) {
+                int row_id = b * rows_per_batch + i;
+                builder.append_int64(0, row_id);
+                std::string name = "item_" + std::to_string(row_id);
+                builder.append_string(1, name);
+                builder.append_double(2, row_id * 1.5);
+                builder.end_row();
+            }
+            auto batch = builder.finish();
+            if (co_await writer.write_batch(batch) != 0) co_return 2;
+            builder.reset(true);
+        }
+        if (co_await writer.close() != 0) co_return 3;
+        co_return 0;
+    };
+    // Step codes: 1 = open, 2 = write_batch, 3 = close; 0 means success.
+    CHECK(runtime.submit(task(), "write_test_file").get() == 0);
+    runtime.shutdown();
+}
+
+// ---------------------------------------------------------------------------
+// IpcReader Tests
+// ---------------------------------------------------------------------------
+
+TEST_CASE("IpcReader - basic read single batch") {
+    std::string path = tmp_path("test_ipc_reader_basic.arrow");
+    std::remove(path.c_str());
+
+    write_test_file(path, 1, 10);
+
+    IpcReader reader;
+    CHECK_FALSE(reader.is_open());
+    CHECK(reader.open(path) == 0);
+    CHECK(reader.is_open());
+    CHECK(reader.num_batches() == 1);
+
+    auto batch = reader.read_batch(0);
+    CHECK(batch.valid());
+    CHECK(batch.num_rows() == 10);
+    CHECK(batch.num_columns() == 3);
+
+    reader.close();
+    CHECK_FALSE(reader.is_open());
+
+    fs::remove(path);
+}
+
+TEST_CASE("IpcReader - read multiple batches") {
+    std::string path = tmp_path("test_ipc_reader_multi.arrow");
+    std::remove(path.c_str());
+
+    write_test_file(path, 5, 20);
+
+    IpcReader reader;
+    CHECK(reader.open(path) == 0);
+    CHECK(reader.num_batches() == 5);
+
+    std::int64_t total_rows = 0;
+    for (std::size_t i = 0; i < reader.num_batches(); ++i) {
+        auto batch = reader.read_batch(i);
+        CHECK(batch.valid());
+        CHECK(batch.num_rows() == 20);
+        total_rows += batch.num_rows();
+    }
+    CHECK(total_rows == 100);
+
+    reader.close();
+    fs::remove(path);
+}
+
+TEST_CASE("IpcReader - read_all") {
+    std::string path = tmp_path("test_ipc_reader_all.arrow");
+    std::remove(path.c_str());
+
+    write_test_file(path, 3, 15);
+
+    IpcReader reader;
+    CHECK(reader.open(path) == 0);
+
+    auto batches = reader.read_all();
+    CHECK(batches.size() == 3);
+
+    std::int64_t total_rows = 0;
+    for (const auto& batch : batches) {
+        CHECK(batch.valid());
+        total_rows += batch.num_rows();
+    }
+    CHECK(total_rows == 45);
+
+    reader.close();
+    fs::remove(path);
+}
+
+TEST_CASE("IpcReader - for_each_batch") {
+    std::string path = tmp_path("test_ipc_reader_foreach.arrow");
+    std::remove(path.c_str());
+
+    write_test_file(path, 4, 25);
+
+    IpcReader reader;
+    CHECK(reader.open(path) == 0);
+
+    std::int64_t total_rows = 0;
+    int batch_count = 0;
+    int rc = reader.for_each_batch([&](ArrowExportResult& batch) {
+        CHECK(batch.valid());
+        total_rows += batch.num_rows();
+        batch_count++;
+        return 0;
+    });
+
+    CHECK(rc == 0);
+    CHECK(batch_count == 4);
+    CHECK(total_rows == 100);
+
+    reader.close();
+    fs::remove(path);
+}
+
+TEST_CASE("IpcReader - open fails on non-existent file") {
+    IpcReader reader;
+    CHECK(reader.open("/nonexistent/path/file.arrow") != 0);
+    CHECK_FALSE(reader.is_open());
+}
+
+TEST_CASE("IpcReader - open fails on invalid file") {
+    std::string path = tmp_path("test_ipc_reader_invalid.arrow");
+    std::remove(path.c_str());
+
+    // Write garbage data; stop early if the fixture file cannot be created.
+    const std::string garbage = "this is not an arrow file";
+    std::FILE* f = std::fopen(path.c_str(), "wb");
+    REQUIRE(f != nullptr);
+    std::fwrite(garbage.data(), 1, garbage.size(), f);
+    std::fclose(f);
+
+    IpcReader reader;
+    CHECK(reader.open(path) != 0);
+    CHECK_FALSE(reader.is_open());
+    fs::remove(path);
+}
+
+TEST_CASE("IpcReader - move semantics") {
+    std::string path = tmp_path("test_ipc_reader_move.arrow");
+    std::remove(path.c_str());
+
+    write_test_file(path, 2, 10);
+
+    IpcReader r1;
+    CHECK(r1.open(path) == 0);
+    CHECK(r1.is_open());
+    CHECK(r1.num_batches() == 2);
+    // After the move, r1 must report closed and r2 must hold the open file.
+    IpcReader r2 = std::move(r1);
+    CHECK_FALSE(r1.is_open());
+    CHECK(r2.is_open());
+    CHECK(r2.num_batches() == 2);
+    // The moved-to reader must still be able to read batches.
+    auto batch = r2.read_batch(0);
+    CHECK(batch.valid());
+
+    r2.close();
+    fs::remove(path);
+}
+
+TEST_CASE("IpcReader - read batch out of range") {
+    std::string path = tmp_path("test_ipc_reader_range.arrow");
+    std::remove(path.c_str());
+
+    write_test_file(path, 2, 10);
+
+    IpcReader reader;
+    CHECK(reader.open(path) == 0);
+    CHECK(reader.num_batches() == 2);
+
+    // Valid indices
+    CHECK(reader.read_batch(0).valid());
+    CHECK(reader.read_batch(1).valid());
+
+    // Invalid index
+    CHECK_FALSE(reader.read_batch(2).valid());
+    CHECK_FALSE(reader.read_batch(100).valid());
+
+    reader.close();
+    fs::remove(path);
+}
+
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+TEST_CASE("IpcReader - read ZSTD compressed file") {
+    std::string path = tmp_path("test_ipc_reader_zstd.arrow");
+    std::remove(path.c_str());
+
+    write_test_file(path, 3, 50, IpcCompression::ZSTD);
+
+    IpcReader reader;
+    CHECK(reader.open(path) == 0);
+    CHECK(reader.num_batches() == 3);
+
+    auto batches = reader.read_all();
+    CHECK(batches.size() == 3);
+
+    std::int64_t total_rows = 0;
+    for (const auto& batch : batches) {
+        CHECK(batch.valid());
+        total_rows += batch.num_rows();
+    }
+    CHECK(total_rows == 150);
+
+    reader.close();
+    fs::remove(path);
+}
+#endif
+
+TEST_CASE("IpcReader - roundtrip with all column types") {
+    std::string path = tmp_path("test_ipc_reader_types.arrow");
+    std::remove(path.c_str());
+    // Write one 3-row batch (int64/uint64/double/string/bool); fail on errors.
+    {
+        Runtime runtime(2);
+        auto task = [&]() -> CoroTask<int> {
+            IpcWriter writer;
+            if (co_await writer.open(path, IpcCompression::NONE) != 0)
+                co_return 1;
+
+            RecordBatchBuilder builder;
+            builder.declare_schema({{"i64", ColumnType::INT64},
+                                    {"u64", ColumnType::UINT64},
+                                    {"f64", ColumnType::DOUBLE},
+                                    {"str", ColumnType::STRING},
+                                    {"boo", ColumnType::BOOL}});
+
+            builder.reserve(3);
+            for (int i = 0; i < 3; ++i) {
+                builder.append_int64(0, -i);
+                builder.append_uint64(1, i * 100);
+                builder.append_double(2, i * 1.5);
+                std::string s = "row_" + std::to_string(i);
+                builder.append_string(3, s);
+                builder.append_bool(4, i % 2 == 0);
+                builder.end_row();
+            }
+            auto batch = builder.finish();
+            if (co_await writer.write_batch(batch) != 0) co_return 2;
+            if (co_await writer.close() != 0) co_return 3;
+            co_return 0;
+        };
+        CHECK(runtime.submit(task(), "write_types").get() == 0);
+        runtime.shutdown();
+    }
+
+    // Read and verify
+    {
+        IpcReader reader;
+        CHECK(reader.open(path) == 0);
+        CHECK(reader.num_batches() == 1);
+
+        auto batch = reader.read_batch(0);
+        CHECK(batch.valid());
+        CHECK(batch.num_rows() == 3);
+        CHECK(batch.num_columns() == 5);
+
+        reader.close();
+    }
+
+    fs::remove(path);
+}
+
+// ---------------------------------------------------------------------------
+// Parallel Reader Tests
+// ---------------------------------------------------------------------------
+
+// Synchronously drives read_arrow_files_parallel on `runtime` for the tests.
+static ParallelReadResult run_parallel_read(Runtime& runtime,
+                                            std::vector<std::string> paths) {
+    return runtime.submit(read_arrow_files_parallel(std::move(paths)),
+                          "read_arrow_files").get();
+}
+
+TEST_CASE("read_arrow_files_parallel - single file") {
+    std::string path = tmp_path("test_parallel_single.arrow");
+    std::remove(path.c_str());
+
+    write_test_file(path, 2, 50);
+
+    Runtime runtime(2);
+
+    std::vector<std::string> paths = {path};
+    auto result = run_parallel_read(runtime, paths);
+
+    CHECK(result.files_read == 1);
+    CHECK(result.files_failed == 0);
+    CHECK(result.total_rows == 100);
+    CHECK(result.total_batches == 2);
+    CHECK(result.file_results.size() == 1);
+    CHECK(result.file_results[0].success);
+    CHECK(result.file_results[0].batches->size() == 2);
+
+    runtime.shutdown();
+    fs::remove(path);
+}
+
+TEST_CASE("read_arrow_files_parallel - multiple files") {
+    std::string dir = tmp_path("test_parallel_multi");
+    fs::remove_all(dir);
+    fs::create_directories(dir);
+
+    std::vector<std::string> paths;
+    for (int i = 0; i < 4; ++i) {
+        std::string path = dir + "/file_" + std::to_string(i) + ".arrow";
+        write_test_file(path, 2, 25);
+        paths.push_back(path);
+    }
+
+    Runtime runtime(4);
+
+    auto result = run_parallel_read(runtime, paths);
+
+    CHECK(result.files_read == 4);
+    CHECK(result.files_failed == 0);
+    CHECK(result.total_rows == 200);   // 4 files * 2 batches * 25 rows
+    CHECK(result.total_batches == 8);  // 4 files * 2 batches
+    CHECK(result.file_results.size() == 4);
+
+    for (const auto& fr : result.file_results) {
+        CHECK(fr.success);
+        CHECK(fr.total_rows == 50);
+        CHECK(fr.batches->size() == 2);
+    }
+
+    runtime.shutdown();
+    fs::remove_all(dir);
+}
+
+TEST_CASE("read_arrow_files_parallel - handles non-existent files") {
+    std::string path = tmp_path("test_parallel_exists.arrow");
+    std::remove(path.c_str());
+    write_test_file(path, 1, 10);
+
+    Runtime runtime(2);
+
+    std::vector<std::string> paths = {path, "/nonexistent/file.arrow"};
+
+    auto result = run_parallel_read(runtime, paths);
+
+    CHECK(result.files_read == 1);
+    CHECK(result.files_failed == 1);
+    CHECK(result.total_rows == 10);
+
+    runtime.shutdown();
+    fs::remove(path);
+}
+
+TEST_CASE("read_arrow_files_parallel - empty list") {
+    Runtime runtime(2);
+
+    std::vector<std::string> paths;
+    auto result = run_parallel_read(runtime, paths);
+
+    CHECK(result.files_read == 0);
+    CHECK(result.files_failed == 0);
+    CHECK(result.total_rows == 0);
+    CHECK(result.total_batches == 0);
+    CHECK(result.file_results.empty());
+
+    runtime.shutdown();
+}
+
+TEST_CASE("read_arrow_files_streaming - completion order callback") {
+    std::string dir = tmp_path("test_streaming");
+    fs::remove_all(dir);
+    fs::create_directories(dir);
+    // Four single-batch files of 25 rows each (100 rows in total).
+    std::vector<std::string> paths;
+    for (int i = 0; i < 4; ++i) {
+        std::string path = dir + "/file_" + std::to_string(i) + ".arrow";
+        write_test_file(path, 1, 25);
+        paths.push_back(path);
+    }
+
+    Runtime runtime(4);
+    // NOTE(review): assumes the callback is never run concurrently - confirm.
+    std::vector<std::string> received_paths;
+    std::int64_t total_rows = 0;
+    ParallelReadResult result;
+    // Collect every successfully read file; returning true keeps streaming.
+    auto task = run_coro_scope(
+        runtime.executor(),
+        [&result, &received_paths, &total_rows](
+            CoroScope& scope,
+            std::vector<std::string> file_paths) -> CoroTask<void> {
+            result = co_await read_arrow_files_streaming(
+                scope, std::move(file_paths), [&](ArrowFileReadResult&& fr) {
+                    if (fr.success) {
+                        received_paths.push_back(fr.path);
+                        total_rows += fr.total_rows;
+                    }
+                    return true;  // continue
+                });
+        },
+        paths);
+
+    runtime.submit(std::move(task), "test_streaming").get();
+
+    CHECK(result.files_read == 4);
+    CHECK(result.files_failed == 0);
+    CHECK(result.total_rows == 100);
+    CHECK(received_paths.size() == 4);
+    CHECK(total_rows == 100);
+
+    runtime.shutdown();
+    fs::remove_all(dir);
+}
+
+TEST_CASE("read_arrow_files_streaming - early cancel") {
+    std::string dir = tmp_path("test_streaming_cancel");
+    fs::remove_all(dir);
+    fs::create_directories(dir);
+    // Four single-batch files of 25 rows each.
+    std::vector<std::string> paths;
+    for (int i = 0; i < 4; ++i) {
+        std::string path = dir + "/file_" + std::to_string(i) + ".arrow";
+        write_test_file(path, 1, 25);
+        paths.push_back(path);
+    }
+
+    Runtime runtime(4);
+
+    int callback_count = 0;
+    ParallelReadResult result;
+    // The callback returns false on its second call to request cancellation.
+    auto task = run_coro_scope(
+        runtime.executor(),
+        [&result, &callback_count](
+            CoroScope& scope,
+            std::vector<std::string> file_paths) -> CoroTask<void> {
+            result = co_await read_arrow_files_streaming(
+                scope, std::move(file_paths), [&](ArrowFileReadResult&&) {
+                    callback_count++;
+                    return callback_count < 2;  // cancel after 2
+                });
+        },
+        paths);
+
+    runtime.submit(std::move(task), "test_streaming_cancel").get();
+
+    // All files still processed (for stats), but callback cancelled early
+    CHECK(result.files_read == 4);
+    CHECK(callback_count == 2);  // Only 2 callbacks before cancel
+
+    runtime.shutdown();
+    fs::remove_all(dir);
+}
+
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+TEST_CASE("read_arrow_files_parallel - mixed compression") {
+    std::string dir = tmp_path("test_parallel_mixed");
+    fs::remove_all(dir);
+    fs::create_directories(dir);
+
+    std::string path_none = dir + "/none.arrow";
+    std::string path_zstd = dir + "/zstd.arrow";
+
+    write_test_file(path_none, 2, 30, IpcCompression::NONE);
+    write_test_file(path_zstd, 2, 30, IpcCompression::ZSTD);
+
+    Runtime runtime(2);
+
+    std::vector<std::string> paths = {path_none, path_zstd};
+    auto result = run_parallel_read(runtime, paths);
+
+    CHECK(result.files_read == 2);
+    CHECK(result.files_failed == 0);
+    CHECK(result.total_rows == 120);
+    CHECK(result.total_batches == 4);
+
+    runtime.shutdown();
+    fs::remove_all(dir);
+}
+#endif
+
+#else
+
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <doctest/doctest.h>
+
+TEST_CASE("IpcReader - disabled") { CHECK(true); }
+
+#endif  // DFTRACER_UTILS_ENABLE_ARROW_IPC
diff --git a/tests/utilities/common/arrow/test_arrow_ipc_writer.cpp b/tests/utilities/common/arrow/test_arrow_ipc_writer.cpp
index 7ceb0e3e..7160257e 100644
--- a/tests/utilities/common/arrow/test_arrow_ipc_writer.cpp
+++ b/tests/utilities/common/arrow/test_arrow_ipc_writer.cpp
@@ -1,57 +1,59 @@
+#include <dftracer/utils/core/common/config.h>
 #ifdef DFTRACER_UTILS_ENABLE_ARROW_IPC
 
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/runtime.h>
 #include <dftracer/utils/utilities/common/arrow/arrow.h>
 #include <doctest/doctest.h>
 
 #include <cstdio>
 #include <string>
 
+using namespace dftracer::utils;
+using namespace dftracer::utils::coro;
 using namespace dftracer::utils::utilities::common::arrow;
 
-// ---------------------------------------------------------------------------
-// Helpers
-// ---------------------------------------------------------------------------
-
 static std::string tmp_path(const char* name) {
     return (fs::temp_directory_path() / name).string();
 }
 
-// ---------------------------------------------------------------------------
-// Tests
-// ---------------------------------------------------------------------------
-
 TEST_CASE("IpcWriter - basic write and close") {
-    RecordBatchBuilder builder;
-    builder.declare_schema({{"id", ColumnType::INT64},
-                            {"name", ColumnType::STRING},
-                            {"value", ColumnType::DOUBLE}});
-    builder.reserve(2);
-
-    std::string s0 = "hello", s1 = "world";
-    builder.append_int64(0, 1);
-    builder.append_string(1, s0);
-    builder.append_double(2, 3.14);
-    builder.end_row();
-    builder.append_int64(0, 2);
-    builder.append_string(1, s1);
-    builder.append_double(2, 2.72);
-    builder.end_row();
-
-    auto batch = builder.finish();
-
     std::string path = tmp_path("test_ipc_basic.arrows");
     std::remove(path.c_str());
 
-    IpcWriter writer;
-    CHECK_FALSE(writer.is_open());
-    CHECK(writer.open(path) == 0);
-    CHECK(writer.is_open());
-    CHECK(writer.write_batch(batch) == 0);
-    CHECK(writer.close() == 0);
-    CHECK_FALSE(writer.is_open());
+    Runtime runtime(2);
+    auto task = [&]() -> CoroTask<int> {
+        RecordBatchBuilder builder;
+        builder.declare_schema({{"id", ColumnType::INT64},
+                                {"name", ColumnType::STRING},
+                                {"value", ColumnType::DOUBLE}});
+        builder.reserve(2);
+
+        std::string s0 = "hello", s1 = "world";
+        builder.append_int64(0, 1);
+        builder.append_string(1, s0);
+        builder.append_double(2, 3.14);
+        builder.end_row();
+        builder.append_int64(0, 2);
+        builder.append_string(1, s1);
+        builder.append_double(2, 2.72);
+        builder.end_row();
 
+        auto batch = builder.finish();
+
+        IpcWriter writer;
+        if (co_await writer.open(path) != 0) co_return 1;
+        if (!writer.is_open()) co_return 2;
+        if (co_await writer.write_batch(batch) != 0) co_return 3;
+        if (co_await writer.close() != 0) co_return 4;
+        if (writer.is_open()) co_return 5;
+        co_return 0;
+    };
+    auto result = runtime.submit(task(), "test").get();
+
+    CHECK(result == 0);
     CHECK(fs::exists(path));
     CHECK(fs::file_size(path) > 0);
     fs::remove(path);
@@ -61,24 +63,31 @@ TEST_CASE("IpcWriter - multiple batches") {
     std::string path = tmp_path("test_ipc_multi.arrows");
     std::remove(path.c_str());
 
-    IpcWriter writer;
-    CHECK(writer.open(path) == 0);
-
-    RecordBatchBuilder builder;
-    builder.declare_schema({{"x", ColumnType::INT64}});
-
-    for (int b = 0; b < 3; ++b) {
-        builder.reserve(10);
-        for (int i = 0; i < 10; ++i) {
-            builder.append_int64(0, b * 10 + i);
-            builder.end_row();
+    Runtime runtime(2);
+    auto task = [&]() -> CoroTask<int> {
+        IpcWriter writer;
+        if (co_await writer.open(path) != 0) co_return 1;
+
+        RecordBatchBuilder builder;
+        builder.declare_schema({{"x", ColumnType::INT64}});
+
+        for (int b = 0; b < 3; ++b) {
+            builder.reserve(10);
+            for (int i = 0; i < 10; ++i) {
+                builder.append_int64(0, b * 10 + i);
+                builder.end_row();
+            }
+            auto batch = builder.finish();
+            if (co_await writer.write_batch(batch) != 0) co_return 2;
+            builder.reset(true);
         }
-        auto batch = builder.finish();
-        CHECK(writer.write_batch(batch) == 0);
-        builder.reset(true);
-    }
 
-    CHECK(writer.close() == 0);
+        if (co_await writer.close() != 0) co_return 3;
+        co_return 0;
+    };
+    auto result = runtime.submit(task(), "test").get();
+
+    CHECK(result == 0);
     CHECK(fs::exists(path));
     CHECK(fs::file_size(path) > 0);
     fs::remove(path);
@@ -88,91 +97,184 @@ TEST_CASE("IpcWriter - close without writing batches") {
     std::string path = tmp_path("test_ipc_empty.arrows");
     std::remove(path.c_str());
 
-    IpcWriter writer;
-    CHECK(writer.open(path) == 0);
-    // No write_batch calls — close should still succeed (no footer needed).
-    CHECK(writer.close() == 0);
-    CHECK_FALSE(writer.is_open());
+    Runtime runtime(2);
+    auto task = [&]() -> CoroTask<int> {
+        IpcWriter writer;
+        if (co_await writer.open(path) != 0) co_return 1;
+        if (co_await writer.close() != 0) co_return 2;
+        if (writer.is_open()) co_return 3;
+        co_return 0;
+    };
+    auto result = runtime.submit(task(), "test").get();
+
+    CHECK(result == 0);
     fs::remove(path);
 }
 
-TEST_CASE("IpcWriter - double close is safe") {
-    std::string path = tmp_path("test_ipc_dblclose.arrows");
+#ifdef DFTRACER_UTILS_ENABLE_ZSTD
+TEST_CASE("IpcWriter - explicit ZSTD compression") {
+    std::string path = tmp_path("test_ipc_zstd_compression.arrows");
     std::remove(path.c_str());
 
-    IpcWriter writer;
-    CHECK(writer.open(path) == 0);
-    CHECK(writer.close() == 0);
-    CHECK(writer.close() == 0);  // idempotent
+    Runtime runtime(2);
+    auto task = [&]() -> CoroTask<int> {
+        RecordBatchBuilder builder;
+        builder.declare_schema(
+            {{"x", ColumnType::INT64}, {"y", ColumnType::DOUBLE}});
+        builder.reserve(100);
+        for (int i = 0; i < 100; ++i) {
+            builder.append_int64(0, i);
+            builder.append_double(1, i * 1.5);
+            builder.end_row();
+        }
+        auto batch = builder.finish();
+
+        IpcWriter writer;
+        if (co_await writer.open(path, IpcCompression::ZSTD) != 0) co_return 1;
+        if (co_await writer.write_batch(batch) != 0) co_return 2;
+        if (co_await writer.close() != 0) co_return 3;
+        co_return 0;
+    };
+    auto result = runtime.submit(task(), "test").get();
+
+    CHECK(result == 0);
+    CHECK(fs::exists(path));
+    CHECK(fs::file_size(path) > 0);
     fs::remove(path);
 }
+#endif
 
-TEST_CASE("IpcWriter - move semantics") {
-    std::string path = tmp_path("test_ipc_move.arrows");
-    std::remove(path.c_str());
+TEST_CASE("PartitionWriter - basic single file") {
+    std::string dir = tmp_path("test_partition_basic");
+    fs::remove_all(dir);
 
-    IpcWriter w1;
-    CHECK(w1.open(path) == 0);
-    CHECK(w1.is_open());
+    Runtime runtime(2);
+    PartitionWriteStats stats;
 
-    IpcWriter w2 = std::move(w1);
-    CHECK_FALSE(w1.is_open());
-    CHECK(w2.is_open());
+    auto task = [&]() -> CoroTask<int> {  // non-zero result = failed step
+        PartitionWriter writer;
+        if (co_await writer.open(dir, 0) != 0) co_return 1;
+        if (!writer.is_open()) co_return 2;
 
-    RecordBatchBuilder builder;
-    builder.declare_schema({{"v", ColumnType::UINT64}});
-    builder.append_uint64(0, 42);
-    builder.end_row();
-    auto batch = builder.finish();
-
-    CHECK(w2.write_batch(batch) == 0);
-    CHECK(w2.close() == 0);
+        RecordBatchBuilder builder;
+        builder.declare_schema({{"id", ColumnType::INT64}});
+        builder.reserve(100);
+        for (int i = 0; i < 100; ++i) {
+            builder.append_int64(0, i);
+            builder.end_row();
+        }
+        auto batch = builder.finish();
 
-    CHECK(fs::exists(path));
-    CHECK(fs::file_size(path) > 0);
-    fs::remove(path);
+        if (co_await writer.write_batch(batch) != 0) co_return 3;
+        stats = co_await writer.close();
+        if (writer.is_open()) co_return 4;
+        co_return 0;
+    };
+    auto result = runtime.submit(task(), "test").get();
+
+    CHECK(result == 0);
+    REQUIRE(stats.files.size() == 1);  // files[0] is dereferenced below
+    CHECK(stats.total_rows == 100);
+    REQUIRE(stats.row_counts.size() == 1);  // row_counts[0] is read below
+    CHECK(stats.row_counts[0] == 100);
+    CHECK(fs::exists(stats.files[0]));
+
+    fs::remove_all(dir);
 }
 
-TEST_CASE("IpcWriter - open fails on bad path") {
-    IpcWriter writer;
-    CHECK(writer.open("/nonexistent_dir/no_such_file.arrows") != 0);
-    CHECK_FALSE(writer.is_open());
+TEST_CASE("PartitionRouter - NONE mode pass-through") {
+    std::string dir = tmp_path("test_router_none");
+    fs::remove_all(dir);
+
+    PartitionConfig config;
+    config.mode = PartitionConfig::Mode::NONE;
+
+    Runtime runtime(2);
+    RouterWriteStats stats;
+
+    auto task = [&]() -> CoroTask<int> {
+        PartitionRouter router;
+        if (router.open(dir, config, 0) != 0) co_return 1;
+
+        RecordBatchBuilder builder;
+        builder.declare_schema(
+            {{"id", ColumnType::INT64}, {"cat", ColumnType::STRING}});
+        builder.reserve(10);
+        for (int i = 0; i < 10; ++i) {
+            builder.append_int64(0, i);
+            builder.append_string(1, "POSIX");
+            builder.end_row();
+        }
+        auto batch = builder.finish();
+
+        if (co_await router.write_batch(batch) != 0) co_return 2;
+        stats = co_await router.close();
+        co_return 0;
+    };
+    auto result = runtime.submit(task(), "test").get();
+
+    CHECK(result == 0);
+    CHECK(stats.total_rows == 10);
+    CHECK(stats.partitions.size() == 1);
+    CHECK(stats.partitions.count("") == 1);
+
+    fs::remove_all(dir);
 }
 
-TEST_CASE("IpcWriter - all column types") {
-    std::string path = tmp_path("test_ipc_types.arrows");
-    std::remove(path.c_str());
+TEST_CASE("PartitionRouter - COLUMN mode single column") {
+    std::string dir = tmp_path("test_router_column");
+    fs::remove_all(dir);
 
-    RecordBatchBuilder builder;
-    builder.declare_schema({{"i64", ColumnType::INT64},
-                            {"u64", ColumnType::UINT64},
-                            {"f64", ColumnType::DOUBLE},
-                            {"str", ColumnType::STRING},
-                            {"boo", ColumnType::BOOL}});
+    PartitionConfig config;
+    config.mode = PartitionConfig::Mode::COLUMN;
+    config.partition_columns = {"cat"};
 
-    std::string sv = "test";
-    builder.append_int64(0, -1);
-    builder.append_uint64(1, 1);
-    builder.append_double(2, 1.0);
-    builder.append_string(3, sv);
-    builder.append_bool(4, true);
-    builder.end_row();
+    Runtime runtime(2);
+    RouterWriteStats stats;
 
-    auto batch = builder.finish();
+    auto task = [&]() -> CoroTask<int> {
+        PartitionRouter router;
+        if (router.open(dir, config, 0) != 0) co_return 1;
 
-    IpcWriter writer;
-    CHECK(writer.open(path) == 0);
-    CHECK(writer.write_batch(batch) == 0);
-    CHECK(writer.close() == 0);
+        RecordBatchBuilder builder;
+        builder.declare_schema(
+            {{"id", ColumnType::INT64}, {"cat", ColumnType::STRING}});
+        builder.reserve(6);
 
-    CHECK(fs::exists(path));
-    CHECK(fs::file_size(path) > 0);
-    fs::remove(path);
+        for (int i = 0; i < 3; ++i) {
+            builder.append_int64(0, i);
+            builder.append_string(1, "POSIX");
+            builder.end_row();
+        }
+        for (int i = 3; i < 6; ++i) {
+            builder.append_int64(0, i);
+            builder.append_string(1, "APP");
+            builder.end_row();
+        }
+        auto batch = builder.finish();
+
+        if (co_await router.write_batch(batch) != 0) co_return 2;
+        stats = co_await router.close();
+        co_return 0;
+    };
+    auto result = runtime.submit(task(), "test").get();
+
+    CHECK(result == 0);
+    CHECK(stats.total_rows == 6);
+    CHECK(stats.partitions.size() == 2);
+    CHECK(stats.partitions.count("cat=POSIX") == 1);
+    CHECK(stats.partitions.count("cat=APP") == 1);
+    CHECK(stats.partitions["cat=POSIX"].total_rows == 3);
+    CHECK(stats.partitions["cat=APP"].total_rows == 3);
+
+    CHECK(fs::exists(dir + "/cat=POSIX"));
+    CHECK(fs::exists(dir + "/cat=APP"));
+
+    fs::remove_all(dir);
 }
 
 #else
 
-// Provide main when IPC is disabled so the binary still links.
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <doctest/doctest.h>
 
diff --git a/tests/utilities/common/query/test_evaluator.cpp b/tests/utilities/common/query/test_evaluator.cpp
index 74c4639f..fb6cda58 100644
--- a/tests/utilities/common/query/test_evaluator.cpp
+++ b/tests/utilities/common/query/test_evaluator.cpp
@@ -2,10 +2,9 @@
 #include <dftracer/utils/utilities/common/query/evaluator.h>
 #include <dftracer/utils/utilities/common/query/parser.h>
 #include <doctest/doctest.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
-#include <memory>
-#include <string>
+#include <cstring>
 
 using namespace dftracer::utils::utilities::common::query;
 using dftracer::utils::utilities::common::json::JsonValue;
@@ -13,19 +12,25 @@ using dftracer::utils::utilities::common::json::JsonValue;
 namespace {
 
 struct JsonDoc {
-    yyjson_doc* doc;
-    JsonDoc(const char* json) : doc(yyjson_read(json, std::strlen(json), 0)) {}
-    ~JsonDoc() {
-        if (doc) yyjson_doc_free(doc);
+    simdjson::dom::parser parser;
+    simdjson::dom::element elem;
+    bool valid = false;
+
+    JsonDoc(const char* json) {
+        auto result = parser.parse(json, std::strlen(json));
+        if (!result.error()) {
+            elem = result.value_unsafe();
+            valid = true;
+        }
     }
-    JsonValue root() { return JsonValue(yyjson_doc_get_root(doc)); }
+    JsonValue root() { return valid ? JsonValue(elem) : JsonValue(); }
 };
 
 bool eval(const char* query_str, const char* json_str) {
     auto ast = parse(query_str);
     REQUIRE(ast.has_value());
     JsonDoc doc(json_str);
-    REQUIRE(doc.doc != nullptr);
+    REQUIRE(doc.valid);
     return evaluate(**ast, doc.root());
 }
 
diff --git a/tests/utilities/common/query/test_query.cpp b/tests/utilities/common/query/test_query.cpp
index 9f86f2be..e0f627db 100644
--- a/tests/utilities/common/query/test_query.cpp
+++ b/tests/utilities/common/query/test_query.cpp
@@ -1,7 +1,7 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/utilities/common/query/query.h>
 #include <doctest/doctest.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <cstring>
 
@@ -11,12 +11,18 @@ using dftracer::utils::utilities::common::json::JsonValue;
 namespace {
 
 struct JsonDoc {
-    yyjson_doc* doc;
-    JsonDoc(const char* json) : doc(yyjson_read(json, std::strlen(json), 0)) {}
-    ~JsonDoc() {
-        if (doc) yyjson_doc_free(doc);
+    simdjson::dom::parser parser;
+    simdjson::dom::element elem;
+    bool valid = false;
+
+    JsonDoc(const char* json) {
+        auto result = parser.parse(json, std::strlen(json));
+        if (!result.error()) {
+            elem = result.value_unsafe();
+            valid = true;
+        }
     }
-    JsonValue root() { return JsonValue(yyjson_doc_get_root(doc)); }
+    JsonValue root() { return valid ? JsonValue(elem) : JsonValue(); }
 };
 
 }  // namespace
@@ -85,3 +91,62 @@ TEST_CASE("Query with NOT IN") {
     JsonDoc no_match(R"({"cat":"STDIO"})");
     CHECK_FALSE(q->evaluate(no_match.root()));
 }
+
+TEST_CASE("Query::fields - simple equality") {
+    auto q = Query::from_string(R"(cat == "POSIX")");
+    REQUIRE(q.has_value());
+    auto& f = q->fields();
+    CHECK(f.size() == 1);
+    CHECK(f.count("cat") == 1);
+}
+
+TEST_CASE("Query::fields - compound OR") {
+    auto q = Query::from_string(R"(pid == 1 or tid == 2)");
+    REQUIRE(q.has_value());
+    auto& f = q->fields();
+    CHECK(f.size() == 2);
+    CHECK(f.count("pid") == 1);
+    CHECK(f.count("tid") == 1);
+}
+
+TEST_CASE("Query::fields - compound AND") {
+    auto q = Query::from_string(R"(cat == "POSIX" and dur > 100)");
+    REQUIRE(q.has_value());
+    auto& f = q->fields();
+    CHECK(f.size() == 2);
+    CHECK(f.count("cat") == 1);
+    CHECK(f.count("dur") == 1);
+}
+
+TEST_CASE("Query::fields - NOT query") {
+    auto q = Query::from_string(R"(not cat == "STDIO")");
+    REQUIRE(q.has_value());
+    auto& f = q->fields();
+    CHECK(f.size() == 1);
+    CHECK(f.count("cat") == 1);
+}
+
+TEST_CASE("Query::fields - IN query") {
+    auto q = Query::from_string(R"(cat in ["POSIX", "STDIO"])");
+    REQUIRE(q.has_value());
+    auto& f = q->fields();
+    CHECK(f.size() == 1);
+    CHECK(f.count("cat") == 1);
+}
+
+TEST_CASE("Query::references") {
+    auto q = Query::from_string(R"(pid == 1 and dur > 50)");
+    REQUIRE(q.has_value());
+    CHECK(q->references("pid"));
+    CHECK(q->references("dur"));
+    CHECK_FALSE(q->references("cat"));
+    CHECK_FALSE(q->references("tid"));
+}
+
+TEST_CASE("Query::fields - no duplicates for repeated field") {
+    auto q = Query::from_string(R"(pid == 1 or pid == 2)");
+    REQUIRE(q.has_value());
+    auto& f = q->fields();
+    CHECK(f.size() == 1);
+    CHECK(f.count("pid") == 1);
+}
diff --git a/tests/utilities/common/statistics/test_timestamp_histogram.cpp b/tests/utilities/common/statistics/test_timestamp_histogram.cpp
new file mode 100644
index 00000000..a5cf5ecc
--- /dev/null
+++ b/tests/utilities/common/statistics/test_timestamp_histogram.cpp
@@ -0,0 +1,240 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/common/statistics/timestamp_histogram.h>
+#include <doctest/doctest.h>
+
+#include <cstdint>
+
+using namespace dftracer::utils::utilities::common::statistics;
+
+TEST_SUITE("TimestampHistogram") {
+    TEST_CASE("empty histogram") {
+        TimestampHistogram h;
+        CHECK(h.empty());
+        CHECK(h.total_count() == 0);
+        CHECK(h.num_bins() == 0);
+        CHECK(h.count_in_range(0, 1'000'000) == 0);
+        CHECK(h.selectivity(0, 1'000'000) == 0.0);
+    }
+
+    TEST_CASE("single event") {
+        TimestampHistogram h;
+        h.add(500'000);  // 500ms -> bin 5
+
+        CHECK(h.total_count() == 1);
+        REQUIRE(h.num_bins() == 1);  // fatal: bins()[0] is indexed below
+        CHECK(h.bins()[0].first == 5);
+        CHECK(h.bins()[0].second == 1);
+    }
+
+    TEST_CASE("bin_index static") {
+        CHECK(TimestampHistogram::bin_index(0) == 0);
+        CHECK(TimestampHistogram::bin_index(99'999) == 0);
+        CHECK(TimestampHistogram::bin_index(100'000) == 1);
+        CHECK(TimestampHistogram::bin_index(199'999) == 1);
+        CHECK(TimestampHistogram::bin_index(200'000) == 2);
+        CHECK(TimestampHistogram::bin_index(1'000'000) == 10);
+    }
+
+    TEST_CASE("bin_start_us and bin_end_us") {
+        CHECK(TimestampHistogram::bin_start_us(0) == 0);
+        CHECK(TimestampHistogram::bin_end_us(0) == 100'000);
+        CHECK(TimestampHistogram::bin_start_us(10) == 1'000'000);
+        CHECK(TimestampHistogram::bin_end_us(10) == 1'100'000);
+    }
+
+    TEST_CASE("multiple events in same bin") {
+        TimestampHistogram h;
+        h.add(150'000);
+        h.add(160'000);
+        h.add(190'000);
+
+        CHECK(h.total_count() == 3);
+        REQUIRE(h.num_bins() == 1);  // fatal: bins()[0] is indexed below
+        CHECK(h.bins()[0].first == 1);
+        CHECK(h.bins()[0].second == 3);
+    }
+
+    TEST_CASE("events across bins") {
+        TimestampHistogram h;
+        h.add(50'000);     // bin 0
+        h.add(150'000);    // bin 1
+        h.add(250'000);    // bin 2
+        h.add(1'050'000);  // bin 10
+
+        CHECK(h.total_count() == 4);
+        REQUIRE(h.num_bins() == 4);  // fatal: bins()[0..3] are indexed below
+        CHECK(h.bins()[0] ==
+              std::make_pair(std::uint64_t{0}, std::uint64_t{1}));
+        CHECK(h.bins()[1] ==
+              std::make_pair(std::uint64_t{1}, std::uint64_t{1}));
+        CHECK(h.bins()[2] ==
+              std::make_pair(std::uint64_t{2}, std::uint64_t{1}));
+        CHECK(h.bins()[3] ==
+              std::make_pair(std::uint64_t{10}, std::uint64_t{1}));
+    }
+
+    TEST_CASE("count_in_range") {
+        TimestampHistogram h;
+        // 10 events at 0.0-0.1s, 20 at 0.5-0.6s, 5 at 1.0-1.1s
+        for (int i = 0; i < 10; ++i) h.add(50'000);
+        for (int i = 0; i < 20; ++i) h.add(550'000);
+        for (int i = 0; i < 5; ++i) h.add(1'050'000);
+
+        CHECK(h.count_in_range(0, 100'000) == 10);          // bin 0 only
+        CHECK(h.count_in_range(0, 600'000) == 30);          // bins 0 and 5
+        CHECK(h.count_in_range(0, 2'000'000) == 35);        // bins 0, 5 and 10
+        CHECK(h.count_in_range(500'000, 600'000) == 20);    // bin 5 only
+        CHECK(h.count_in_range(500'000, 1'100'000) == 25);  // bins 5 and 10
+        CHECK(h.count_in_range(200'000, 400'000) == 0);     // empty gap
+    }
+
+    TEST_CASE("selectivity") {
+        TimestampHistogram h;
+        for (int i = 0; i < 100; ++i) h.add(50'000);
+        for (int i = 0; i < 100; ++i) h.add(550'000);
+
+        CHECK(h.selectivity(0, 100'000) == doctest::Approx(0.5));
+        CHECK(h.selectivity(500'000, 600'000) == doctest::Approx(0.5));
+        CHECK(h.selectivity(0, 600'000) == doctest::Approx(1.0));
+        CHECK(h.selectivity(200'000, 400'000) == doctest::Approx(0.0));
+    }
+
+    TEST_CASE("merge") {
+        TimestampHistogram a;
+        a.add(50'000);   // bin 0
+        a.add(150'000);  // bin 1
+
+        TimestampHistogram b;
+        b.add(50'000);   // bin 0
+        b.add(250'000);  // bin 2
+
+        a.merge(b);
+
+        CHECK(a.total_count() == 4);
+        REQUIRE(a.num_bins() == 3);  // fatal: bins()[0..2] are indexed below
+        CHECK(a.bins()[0] ==
+              std::make_pair(std::uint64_t{0}, std::uint64_t{2}));
+        CHECK(a.bins()[1] ==
+              std::make_pair(std::uint64_t{1}, std::uint64_t{1}));
+        CHECK(a.bins()[2] ==
+              std::make_pair(std::uint64_t{2}, std::uint64_t{1}));
+    }
+
+    TEST_CASE("merge with empty") {
+        TimestampHistogram a;
+        a.add(50'000);
+
+        TimestampHistogram empty;
+        a.merge(empty);
+
+        CHECK(a.total_count() == 1);
+        CHECK(a.num_bins() == 1);
+    }
+
+    TEST_CASE("expansion_weights - uniform") {
+        TimestampHistogram h;
+        for (int i = 0; i < 100; ++i) h.add(i * 10'000);  // 0-1s uniform
+
+        auto weights = h.expansion_weights(0, 1'000'000, 10);
+        CHECK(weights.size() == 10);
+        for (auto w : weights) {
+            CHECK(w == doctest::Approx(0.1).epsilon(0.01));
+        }
+    }
+
+    TEST_CASE("expansion_weights - bursty") {
+        TimestampHistogram h;
+        // 800 events in 0.2-0.4s, 200 events elsewhere
+        for (int i = 0; i < 100; ++i) h.add(50'000);   // bin 0
+        for (int i = 0; i < 400; ++i) h.add(250'000);  // bin 2
+        for (int i = 0; i < 400; ++i) h.add(350'000);  // bin 3
+        for (int i = 0; i < 100; ++i) h.add(950'000);  // bin 9
+
+        auto weights = h.expansion_weights(0, 1'000'000, 5);
+        CHECK(weights.size() == 5);
+        // sub 0 [0-200ms]: bin 0 = 100
+        // sub 1 [200-400ms]: bins 2+3 = 800
+        // sub 2 [400-600ms]: 0
+        // sub 3 [600-800ms]: 0
+        // sub 4 [800-1000ms]: bin 9 = 100
+        CHECK(weights[0] == doctest::Approx(0.1).epsilon(0.01));
+        CHECK(weights[1] == doctest::Approx(0.8).epsilon(0.01));
+        CHECK(weights[2] == doctest::Approx(0.0));
+        CHECK(weights[3] == doctest::Approx(0.0));
+        CHECK(weights[4] == doctest::Approx(0.1).epsilon(0.01));
+    }
+
+    TEST_CASE("expansion_weights - no data in range falls back to uniform") {
+        TimestampHistogram h;
+        h.add(5'000'000);  // 5s, outside query range
+
+        auto weights = h.expansion_weights(0, 1'000'000, 5);
+        CHECK(weights.size() == 5);
+        for (auto w : weights) {
+            CHECK(w == doctest::Approx(0.2));
+        }
+    }
+
+    TEST_CASE("serialize and deserialize roundtrip") {
+        TimestampHistogram h;
+        h.add(50'000);
+        h.add(150'000);
+        h.add(150'000);
+        h.add(1'000'050'000);
+
+        auto data = h.serialize();
+        auto h2 = TimestampHistogram::deserialize(data.data(), data.size());
+
+        CHECK(h2.total_count() == h.total_count());
+        CHECK(h2.num_bins() == h.num_bins());
+        REQUIRE(h2.bins().size() == h.bins().size());
+        for (std::size_t i = 0; i < h.bins().size(); ++i) {
+            CHECK(h2.bins()[i].first == h.bins()[i].first);
+            CHECK(h2.bins()[i].second == h.bins()[i].second);
+        }
+    }
+
+    TEST_CASE("serialize empty") {
+        TimestampHistogram h;
+        auto data = h.serialize();
+        auto h2 = TimestampHistogram::deserialize(data.data(), data.size());
+        CHECK(h2.empty());
+        CHECK(h2.total_count() == 0);
+    }
+
+    TEST_CASE("deserialize null/empty") {
+        auto h = TimestampHistogram::deserialize(nullptr, 0);
+        CHECK(h.empty());
+    }
+
+    TEST_CASE("varint encoding handles large timestamps") {
+        TimestampHistogram h;
+        // Typical 2026 timestamp: ~1.77e15 us
+        h.add(1'773'074'570'000'000ULL);
+        h.add(1'773'074'570'100'000ULL);
+
+        auto data = h.serialize();
+        auto h2 = TimestampHistogram::deserialize(data.data(), data.size());
+
+        CHECK(h2.total_count() == 2);
+        REQUIRE(h2.num_bins() == 2);  // fatal: bins()[0..1] are indexed below
+        CHECK(h2.bins()[0].first == h.bins()[0].first);
+        CHECK(h2.bins()[1].first == h.bins()[1].first);
+    }
+
+    TEST_CASE("serialization is compact with delta encoding") {
+        TimestampHistogram h;
+        // 100 consecutive bins (10s of data)
+        std::uint64_t base = 17'730'745'700ULL;  // ~2026 timestamp / 100ms
+        for (std::uint64_t i = 0; i < 100; ++i) {
+            for (int j = 0; j < 50; ++j) {
+                h.add((base + i) * 100'000 + j * 1000);
+            }
+        }
+
+        auto data = h.serialize();
+        // 100 bins with delta=1 each = ~1 byte per delta + ~1 byte per count
+        // Plus header. Should be well under 500 bytes.
+        CHECK(data.size() < 500);
+    }
+}
diff --git a/tests/utilities/composites/dft/aggregators/test_aggregation_augmentation.cpp b/tests/utilities/composites/dft/aggregators/test_aggregation_augmentation.cpp
new file mode 100644
index 00000000..f155bff5
--- /dev/null
+++ b/tests/utilities/composites/dft/aggregators/test_aggregation_augmentation.cpp
@@ -0,0 +1,128 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_augmentation.h>
+#include <doctest/doctest.h>
+
+using namespace dftracer::utils::utilities::composites::dft::aggregators;
+
+namespace {
+
+AggregationBatch create_test_batch(std::uint64_t time_bucket,
+                                   std::uint64_t count, std::uint64_t ts,
+                                   std::uint64_t te) {
+    AggregationBatch batch;  // will hold exactly one (key, metrics) entry
+    AggregationKey key;
+    key.cat_id = 1;
+    key.name_id = 1;
+    key.pid = 100;
+    key.tid = 1;
+    key.time_bucket = time_bucket;
+
+    AggregationMetrics metrics;
+    metrics.count = count;
+    metrics.ts = ts;
+    metrics.te = te;
+    metrics.duration.count = count;
+    metrics.duration.total = count * 1000;  // 1000 per event, matching mean
+    metrics.duration.mean = 1000.0;
+
+    batch.entries.emplace_back(key, metrics);
+    return batch;
+}
+
+}  // namespace
+
+TEST_SUITE("AggregationAugmentation") {
+    TEST_CASE("PassThrough - Same interval") {
+        auto batch = create_test_batch(0, 100, 0, 5000);
+
+        AugmentationConfig config{5000, 5000};
+        auto result = augment_batch(batch, config);
+
+        CHECK_FALSE(result.has_approximated_entries);
+        REQUIRE(result.entries.size() == 1);
+        CHECK(result.entries[0].metrics.count == 100);
+        CHECK_FALSE(result.entries[0].is_approximated);
+    }
+
+    TEST_CASE("Shrink - Merge buckets") {
+        AggregationBatch batch;
+
+        for (std::uint64_t i = 0; i < 5; ++i) {
+            AggregationKey key;
+            key.cat_id = 1;
+            key.name_id = 1;
+            key.pid = 100;
+            key.tid = 1;
+            key.time_bucket = i;
+
+            AggregationMetrics metrics;
+            metrics.count = 20;
+            metrics.ts = i * 1000;
+            metrics.te = (i + 1) * 1000;
+            metrics.duration.count = 20;
+            metrics.duration.total = 20000;
+
+            batch.entries.emplace_back(key, metrics);
+        }
+
+        AugmentationConfig config{1000, 5000};  // shrink 5x
+        auto result = augment_batch(batch, config);
+
+        CHECK_FALSE(result.has_approximated_entries);
+        REQUIRE(result.entries.size() == 1);
+        CHECK(result.entries[0].metrics.count == 100);  // 5 * 20
+        CHECK(result.entries[0].key.time_bucket == 0);
+    }
+
+    TEST_CASE("Expand - Split bucket") {
+        auto batch = create_test_batch(0, 100, 1000, 4000);
+
+        AugmentationConfig config{5000, 1000};  // expand 5x
+        auto result = augment_batch(batch, config);
+
+        CHECK(result.has_approximated_entries);
+
+        std::uint64_t total_count = 0;
+        for (const auto& entry : result.entries) {
+            CHECK(entry.is_approximated);
+            CHECK(entry.key.time_bucket >= 1);
+            CHECK(entry.key.time_bucket <= 3);
+            total_count += entry.metrics.count;
+            CHECK(entry.count_ci.upper > 0);
+        }
+
+        CHECK(total_count == 100);
+    }
+
+    TEST_CASE("Expand - All events at same time") {
+        auto batch = create_test_batch(0, 100, 2500, 2500);
+
+        AugmentationConfig config{5000, 1000};
+        auto result = augment_batch(batch, config);
+
+        CHECK(result.has_approximated_entries);
+        REQUIRE(result.entries.size() == 1);
+        CHECK(result.entries[0].key.time_bucket == 2);
+        CHECK(result.entries[0].metrics.count == 100);
+    }
+
+    TEST_CASE("Poisson CI calculation") {
+        SUBCASE("Count = 100") {
+            auto ci = compute_poisson_ci(100.0);
+            CHECK(ci.lower == doctest::Approx(80.4).epsilon(0.01));
+            CHECK(ci.upper == doctest::Approx(119.6).epsilon(0.01));
+        }
+
+        SUBCASE("Count = 4") {
+            auto ci = compute_poisson_ci(4.0);
+            CHECK(ci.lower == doctest::Approx(0.08).epsilon(0.1));
+            CHECK(ci.upper == doctest::Approx(7.92).epsilon(0.1));
+        }
+
+        SUBCASE("Count = 0") {
+            auto ci = compute_poisson_ci(0.0);
+            CHECK(ci.lower == 0.0);
+            CHECK(ci.upper == 0.0);
+        }
+    }
+}
diff --git a/tests/utilities/composites/dft/aggregators/test_aggregation_metrics.cpp b/tests/utilities/composites/dft/aggregators/test_aggregation_metrics.cpp
index 4cd826eb..eb00e16b 100644
--- a/tests/utilities/composites/dft/aggregators/test_aggregation_metrics.cpp
+++ b/tests/utilities/composites/dft/aggregators/test_aggregation_metrics.cpp
@@ -9,11 +9,11 @@ using namespace dftracer::utils::utilities::composites::dft::aggregators;
 TEST_SUITE("MetricStats") {
     TEST_CASE("MetricStats - Single value") {
         MetricStats stats;
-        std::uint64_t count = 1;
-        stats.update(42, count);
+        stats.update(42);
 
+        CHECK(stats.count == 1);
         CHECK(stats.mean == doctest::Approx(42.0));
-        CHECK(stats.get_stddev(count) == 0.0);
+        CHECK(stats.get_stddev() == 0.0);
         CHECK(stats.total == 42);
         CHECK(stats.min == 42);
         CHECK(stats.max == 42);
@@ -21,16 +21,17 @@ TEST_SUITE("MetricStats") {
 
     TEST_CASE("MetricStats - Two values") {
         MetricStats stats;
-        stats.update(10, 1);
-        stats.update(20, 2);
+        stats.update(10);
+        stats.update(20);
 
+        CHECK(stats.count == 2);
         CHECK(stats.mean == doctest::Approx(15.0));
         CHECK(stats.total == 30);
         CHECK(stats.min == 10);
         CHECK(stats.max == 20);
 
         // stddev = sqrt(((10-15)^2 + (20-15)^2) / 1) = sqrt(50) ~ 7.071
-        double stddev = stats.get_stddev(2);
+        double stddev = stats.get_stddev();
         CHECK(stddev == doctest::Approx(std::sqrt(50.0)).epsilon(0.001));
     }
 
@@ -39,10 +40,11 @@ TEST_SUITE("MetricStats") {
         // Mean = 40/8 = 5.0
         MetricStats stats;
         std::vector<std::uint64_t> values = {2, 4, 4, 4, 5, 5, 7, 9};
-        for (std::uint64_t i = 0; i < values.size(); ++i) {
-            stats.update(values[i], i + 1);
+        for (auto v : values) {
+            stats.update(v);
         }
 
+        CHECK(stats.count == 8);
         CHECK(stats.mean == doctest::Approx(5.0));
         CHECK(stats.total == 40);
         CHECK(stats.min == 2);
@@ -50,64 +52,67 @@ TEST_SUITE("MetricStats") {
 
         // Sample stddev = sqrt(sum((x-mean)^2) / (n-1))
         // = sqrt((9+1+1+1+0+0+4+16)/7) = sqrt(32/7) ~ 2.138
-        double stddev = stats.get_stddev(8);
+        double stddev = stats.get_stddev();
         CHECK(stddev == doctest::Approx(std::sqrt(32.0 / 7.0)).epsilon(0.01));
     }
 
     TEST_CASE("MetricStats - Identical values") {
         MetricStats stats;
         for (std::uint64_t i = 0; i < 10; ++i) {
-            stats.update(5, i + 1);
+            stats.update(5);
         }
 
+        CHECK(stats.count == 10);
         CHECK(stats.mean == doctest::Approx(5.0));
-        CHECK(stats.get_stddev(10) == doctest::Approx(0.0).epsilon(1e-10));
-        CHECK(stats.get_skewness(10) == doctest::Approx(0.0).epsilon(1e-10));
-        CHECK(stats.get_kurtosis(10) == doctest::Approx(0.0).epsilon(1e-10));
+        CHECK(stats.get_stddev() == doctest::Approx(0.0).epsilon(1e-10));
+        CHECK(stats.get_skewness() == doctest::Approx(0.0).epsilon(1e-10));
+        CHECK(stats.get_kurtosis() == doctest::Approx(0.0).epsilon(1e-10));
     }
 
     TEST_CASE("MetricStats - Merge equivalence") {
         // Single-pass
         MetricStats single;
         std::vector<std::uint64_t> all_values = {2, 4, 6, 8, 10, 12, 14, 16};
-        for (std::uint64_t i = 0; i < all_values.size(); ++i) {
-            single.update(all_values[i], i + 1);
+        for (auto v : all_values) {
+            single.update(v);
         }
 
         // Split into two halves
         MetricStats first_half;
         for (std::uint64_t i = 0; i < 4; ++i) {
-            first_half.update(all_values[i], i + 1);
+            first_half.update(all_values[i]);
         }
 
         MetricStats second_half;
         for (std::uint64_t i = 0; i < 4; ++i) {
-            second_half.update(all_values[i + 4], i + 1);
+            second_half.update(all_values[i + 4]);
         }
 
-        std::uint64_t n1 = 4, n2 = 4, n = 8;
-        first_half.merge_from(second_half, n1, n2, n);
+        first_half.merge_from(second_half);
 
+        CHECK(first_half.count == single.count);
         CHECK(first_half.mean == doctest::Approx(single.mean).epsilon(0.001));
         CHECK(first_half.total == single.total);
         CHECK(first_half.min == single.min);
         CHECK(first_half.max == single.max);
-        CHECK(first_half.get_stddev(n) ==
-              doctest::Approx(single.get_stddev(n)).epsilon(0.01));
+        CHECK(first_half.get_stddev() ==
+              doctest::Approx(single.get_stddev()).epsilon(0.01));
     }
 
     TEST_CASE("MetricStats - Merge with empty") {
         MetricStats stats;
-        stats.update(10, 1);
-        stats.update(20, 2);
+        stats.update(10);
+        stats.update(20);
 
         MetricStats empty_stats;
 
         double mean_before = stats.mean;
         std::uint64_t total_before = stats.total;
+        std::uint64_t count_before = stats.count;
 
-        stats.merge_from(empty_stats, 2, 0, 2);
+        stats.merge_from(empty_stats);
 
+        CHECK(stats.count == count_before);
         CHECK(stats.mean == doctest::Approx(mean_before));
         CHECK(stats.total == total_before);
     }
@@ -115,9 +120,10 @@ TEST_SUITE("MetricStats") {
     TEST_CASE("MetricStats - Percentile integration") {
         MetricStats stats;
         for (std::uint64_t i = 1; i <= 100; ++i) {
-            stats.update(i, i, true);  // compute_percentiles = true
+            stats.update(i, true);  // compute_percentiles = true
         }
 
+        CHECK(stats.count == 100);
         CHECK(stats.sketch != nullptr);
         CHECK_FALSE(stats.sketch->empty());
         REQUIRE(stats.sketch != nullptr);
@@ -133,20 +139,24 @@ TEST_SUITE("AggregationMetrics") {
 
         metrics.update_duration(100);
         CHECK(metrics.count == 1);
+        CHECK(metrics.duration.count == 1);
         CHECK(metrics.duration.total == 100);
         CHECK(metrics.duration.min == 100);
         CHECK(metrics.duration.max == 100);
 
         metrics.update_duration(200);
         CHECK(metrics.count == 2);
+        CHECK(metrics.duration.count == 2);
         CHECK(metrics.duration.total == 300);
 
         metrics.update_size(50);
+        CHECK(metrics.size.count == 1);
         CHECK(metrics.size.total == 50);
         CHECK(metrics.size.min == 50);
         CHECK(metrics.size.max == 50);
 
         metrics.update_size(150);
+        CHECK(metrics.size.count == 2);
         CHECK(metrics.size.total == 200);
     }
 
@@ -191,19 +201,39 @@ TEST_SUITE("AggregationMetrics") {
     TEST_CASE("AggregationMetrics - update_custom_metric") {
         AggregationMetrics metrics;
 
-        // First call creates the metric
         metrics.update_duration(100);  // increment count to 1
         metrics.update_custom_metric("bytes_read", 1024);
         REQUIRE(metrics.custom_metrics != nullptr);
         CHECK(metrics.custom_metrics->count("bytes_read") == 1);
+        CHECK((*metrics.custom_metrics)["bytes_read"].count == 1);
         CHECK((*metrics.custom_metrics)["bytes_read"].total == 1024);
 
-        // Subsequent call updates it
         metrics.update_duration(200);  // count = 2
         metrics.update_custom_metric("bytes_read", 2048);
+        CHECK((*metrics.custom_metrics)["bytes_read"].count == 2);
         CHECK((*metrics.custom_metrics)["bytes_read"].total == 3072);
     }
 
+    TEST_CASE("AggregationMetrics - sparse custom metrics have correct count") {
+        AggregationMetrics metrics;
+
+        // 3 events, but only 2 have the custom field
+        metrics.update_duration(100);
+        metrics.update_custom_metric("bytes_read", 1024);
+
+        metrics.update_duration(200);  // no bytes_read for this event
+
+        metrics.update_duration(300);
+        metrics.update_custom_metric("bytes_read", 2048);
+
+        CHECK(metrics.count == 3);
+        CHECK(metrics.duration.count == 3);
+        REQUIRE(metrics.custom_metrics != nullptr);  // dereferenced below
+        CHECK((*metrics.custom_metrics)["bytes_read"].count == 2);
+        CHECK((*metrics.custom_metrics)["bytes_read"].mean ==
+              doctest::Approx(1536.0));  // (1024+2048)/2
+    }
+
     TEST_CASE("AggregationMetrics - merge_from") {
         AggregationMetrics a, b;
 
@@ -223,25 +253,48 @@ TEST_SUITE("AggregationMetrics") {
         a.merge_from(b);
 
         CHECK(a.count == 3);
+        CHECK(a.duration.count == 3);
         CHECK(a.duration.total == 600);  // 100+200+300
+        CHECK(a.size.count == 3);
         CHECK(a.size.total == 450);      // 50+150+250
         CHECK(a.ts == 500);              // min of 1000, 500
         CHECK(a.te == 1100);             // max of 1100, 700
         REQUIRE(a.custom_metrics != nullptr);
+        CHECK((*a.custom_metrics)["io_ops"].count == 3);
         CHECK((*a.custom_metrics)["io_ops"].total == 60);
     }
 
-    TEST_CASE("AggregationMetrics - get_stddev delegates") {
+    TEST_CASE("AggregationMetrics - merge sparse custom metrics") {
+        AggregationMetrics a, b;
+
+        // a has 2 events, 1 with custom metric
+        a.update_duration(100);
+        a.update_custom_metric("bytes", 500);
+        a.update_duration(200);
+
+        // b has 1 event with custom metric
+        b.update_duration(300);
+        b.update_custom_metric("bytes", 1000);
+
+        a.merge_from(b);
+        CHECK(a.count == 3);
+        REQUIRE(a.custom_metrics != nullptr);  // dereferenced below
+        auto& bytes = (*a.custom_metrics)["bytes"];
+        CHECK(bytes.count == 2);  // only 2 events had bytes, not 3
+        CHECK(bytes.total == 1500);
+        CHECK(bytes.mean == doctest::Approx(750.0));
+    }
+
+    TEST_CASE("AggregationMetrics - get_stddev via MetricStats") {
         AggregationMetrics metrics;
         metrics.update_duration(10);
         metrics.update_duration(20);
         metrics.update_size(30);
         metrics.update_size(40);
 
-        double dur_stddev = metrics.get_stddev_duration();
-        CHECK(dur_stddev == doctest::Approx(metrics.duration.get_stddev(2)));
-
-        double size_stddev = metrics.get_stddev_size();
-        CHECK(size_stddev == doctest::Approx(metrics.size.get_stddev(2)));
+        CHECK(metrics.duration.get_stddev() ==
+              doctest::Approx(std::sqrt(50.0)).epsilon(0.01));
+        CHECK(metrics.size.get_stddev() ==
+              doctest::Approx(std::sqrt(50.0)).epsilon(0.01));
     }
 }
diff --git a/tests/utilities/composites/dft/aggregators/test_aggregation_serialization.cpp b/tests/utilities/composites/dft/aggregators/test_aggregation_serialization.cpp
new file mode 100644
index 00000000..dfb074da
--- /dev/null
+++ b/tests/utilities/composites/dft/aggregators/test_aggregation_serialization.cpp
@@ -0,0 +1,205 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/composites/dft/aggregators/aggregation_serialization.h>
+#include <doctest/doctest.h>
+
+using namespace dftracer::utils::utilities::composites::dft::aggregators;
+
+TEST_SUITE("AggregationSerialization") {
+    TEST_CASE("key roundtrip - basic") {
+        auto& strings = aggregation_intern();
+        AggregationKey src;
+        src.pid = 12345;
+        src.tid = 67890;
+        src.time_bucket = 5000000;
+        src.cat_id = strings.get_or_insert("POSIX");
+        src.name_id = strings.get_or_insert("read");
+        src.hhash_id = strings.get_or_insert("abc123");
+        src.fhash_id = strings.get_or_insert("def456");
+        // Serialize as an EVENT key, then decode the result.
+        auto encoded = serialize_agg_key(42, AggMapType::EVENT, src);
+        auto decoded = deserialize_agg_key(encoded);
+
+        CHECK(decoded.map_type == AggMapType::EVENT);
+        CHECK(decoded.key.pid == src.pid);
+        CHECK(decoded.key.tid == src.tid);
+        CHECK(decoded.key.time_bucket == src.time_bucket);
+        CHECK(decoded.key.cat() == "POSIX");
+        CHECK(decoded.key.name() == "read");
+        CHECK(decoded.key.hhash() == "abc123");
+        CHECK(decoded.key.fhash() == "def456");
+        CHECK(decoded.key.extra_keys == nullptr);
+    }
+
+    TEST_CASE("key roundtrip - with extra keys") {
+        auto& strings = aggregation_intern();
+        AggregationKey src;
+        src.pid = 100;
+        src.tid = 200;
+        src.time_bucket = 1000000;
+        src.cat_id = strings.get_or_insert("MPI");
+        src.name_id = strings.get_or_insert("send");
+        const auto epoch_key = strings.get_or_insert("epoch");
+        const auto epoch_val = strings.get_or_insert("1");
+        const auto step_key = strings.get_or_insert("step");
+        const auto step_val = strings.get_or_insert("42");
+        src.extra_keys = std::make_unique<
+            std::vector<std::pair<std::uint32_t, std::uint32_t>>>();
+        src.extra_keys->emplace_back(epoch_key, epoch_val);
+        src.extra_keys->emplace_back(step_key, step_val);
+        // Serialize as a PROFILE key, then decode the result.
+        auto encoded = serialize_agg_key(99, AggMapType::PROFILE, src);
+        auto decoded = deserialize_agg_key(encoded);
+
+        CHECK(decoded.map_type == AggMapType::PROFILE);
+        CHECK(decoded.key.cat() == "MPI");
+        REQUIRE(decoded.key.extra_keys != nullptr);
+        REQUIRE(decoded.key.extra_keys->size() == 2);
+        CHECK(strings.resolve((*decoded.key.extra_keys)[0].first) == "epoch");
+        CHECK(strings.resolve((*decoded.key.extra_keys)[0].second) == "1");
+        CHECK(strings.resolve((*decoded.key.extra_keys)[1].first) == "step");
+        CHECK(strings.resolve((*decoded.key.extra_keys)[1].second) == "42");
+    }
+
+    TEST_CASE("key roundtrip - map type preserved") {
+        auto& strings = aggregation_intern();
+        AggregationKey src;
+        src.pid = 1;
+        src.tid = 1;
+        src.time_bucket = 1000000;
+        src.cat_id = strings.get_or_insert("CAT");
+        src.name_id = strings.get_or_insert("NAME");
+        // The same key is serialized once per map type and decoded again.
+        for (const AggMapType expected :
+             {AggMapType::EVENT, AggMapType::PROFILE, AggMapType::SYSTEM}) {
+            auto encoded = serialize_agg_key(0, expected, src);
+            auto decoded = deserialize_agg_key(encoded);
+            CHECK(decoded.map_type == expected);
+        }
+    }
+
+    TEST_CASE("key sort order - shard prefix") {
+        auto& intern = aggregation_intern();
+        AggregationKey a, b;
+        a.cat_id = intern.get_or_insert("AAA");
+        a.name_id = intern.get_or_insert("aaa");
+        a.pid = 1;
+        a.tid = 1;
+        a.time_bucket = 1000000;
+        // Same first argument (0) and map type for both keys; only cat_id differs.
+        b = a;
+        b.cat_id = intern.get_or_insert("BBB");
+        auto ka = serialize_agg_key(0, AggMapType::EVENT, a);
+        auto kb = serialize_agg_key(0, AggMapType::EVENT, b);
+        CHECK(ka < kb);  // NOTE(review): shard ordering itself is not varied
+    }
+
+    TEST_CASE("key uniqueness - different time_bucket") {
+        auto& strings = aggregation_intern();
+        AggregationKey earlier, later;
+        earlier.pid = 1;
+        earlier.tid = 1;
+        earlier.time_bucket = 1000000;
+        earlier.cat_id = strings.get_or_insert("AAA");
+        earlier.name_id = strings.get_or_insert("aaa");
+        // Copy the key and change nothing but its time bucket.
+        later = earlier;
+        later.time_bucket = 2000000;
+
+        auto first = serialize_agg_key(0, AggMapType::EVENT, earlier);
+        auto second = serialize_agg_key(0, AggMapType::EVENT, later);
+        CHECK(first != second);
+    }
+
+    TEST_CASE("value roundtrip - basic") {
+        AggregationMetrics input;
+        input.count = 100;
+        input.ts = 1000000;
+        input.te = 2000000;
+        input.parent_pid = 42;
+        input.size.count = 50;
+        input.size.total = 2000;
+        input.size.min = 5;
+        input.size.max = 100;
+        input.size.mean = 40.0;
+        input.duration.count = 100;
+        input.duration.total = 5000;
+        input.duration.min = 10;
+        input.duration.max = 200;
+        input.duration.mean = 50.0;
+        input.duration.m2 = 1234.5;
+
+        auto blob = serialize_agg_value(input);
+        auto output = deserialize_agg_value(blob);
+
+        CHECK(output.count == 100);
+        CHECK(output.ts == 1000000);
+        CHECK(output.te == 2000000);
+        CHECK(output.parent_pid == 42);
+        CHECK(output.size.count == 50);
+        CHECK(output.size.total == 2000);
+        CHECK(output.duration.count == 100);
+        CHECK(output.duration.total == 5000);
+        CHECK(output.duration.min == 10);
+        CHECK(output.duration.max == 200);
+        CHECK(output.duration.mean == doctest::Approx(50.0));
+        CHECK(output.duration.m2 == doctest::Approx(1234.5));
+        CHECK(output.custom_metrics == nullptr);
+    }
+
+    TEST_CASE("value roundtrip - with custom metrics") {
+        MetricStats offset_stats;
+        offset_stats.count = 5;
+        offset_stats.total = 250;
+        offset_stats.min = 20;
+        offset_stats.max = 80;
+        offset_stats.mean = 50.0;
+        offset_stats.m2 = 100.0;
+
+        AggregationMetrics input;
+        input.count = 10;
+        input.ts = 100;
+        input.te = 200;
+        input.duration.count = 10;
+        input.duration.total = 500;
+        input.duration.min = 10;
+        input.duration.max = 100;
+        input.duration.mean = 50.0;
+        input.custom_metrics = std::make_unique<CustomMetricsMap>();
+        input.custom_metrics->emplace("offset", std::move(offset_stats));
+
+        auto blob = serialize_agg_value(input);
+        auto output = deserialize_agg_value(blob);
+        REQUIRE(output.custom_metrics != nullptr);
+        REQUIRE(output.custom_metrics->count("offset") == 1);
+        auto& restored = output.custom_metrics->at("offset");
+        CHECK(restored.count == 5);
+        CHECK(restored.total == 250);
+        CHECK(restored.min == 20);
+        CHECK(restored.max == 80);
+        CHECK(restored.mean == doctest::Approx(50.0));
+    }
+
+    TEST_CASE("value roundtrip - with sketch") {
+        AggregationMetrics m;
+        m.count = 3;
+        m.ts = 100;
+        m.te = 200;
+
+        // Let update() derive the duration scalars so they agree with the
+        // three samples fed to the sketch (presetting them double-counts).
+        m.duration.update(50, true);
+        m.duration.update(100, true);
+        m.duration.update(150, true);
+
+        REQUIRE(m.duration.sketch != nullptr);
+
+        auto data = serialize_agg_value(m);
+        auto m2 = deserialize_agg_value(data);
+
+        CHECK(m2.duration.count == 3);
+        CHECK(m2.duration.total == 300);
+        CHECK(m2.duration.mean == doctest::Approx(100.0));
+        REQUIRE(m2.duration.sketch != nullptr);
+        CHECK(m2.duration.sketch->count() == m.duration.sketch->count());
+    }
+}
diff --git a/tests/utilities/composites/dft/aggregators/test_aggregator_utility.cpp b/tests/utilities/composites/dft/aggregators/test_aggregator_utility.cpp
index ab59b514..e6df7b44 100644
--- a/tests/utilities/composites/dft/aggregators/test_aggregator_utility.cpp
+++ b/tests/utilities/composites/dft/aggregators/test_aggregator_utility.cpp
@@ -1,6 +1,10 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/pipeline/executor.h>
+#include <dftracer/utils/core/pipeline/scheduler.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/core/tasks/task.h>
 #include <dftracer/utils/utilities/composites/dft/aggregators/aggregator_utility.h>
 #include <doctest/doctest.h>
 #include <testing_utilities.h>
@@ -8,31 +12,20 @@
 #include <fstream>
 #include <vector>
 
+using namespace dftracer::utils;
 using namespace dftracer::utils::utilities::composites::dft::aggregators;
 using namespace dftracer::utils::coro;
 using namespace dft_utils_test;
 
-namespace {
-
-static CoroTask<std::vector<AggregationBatch>> collect_batches(
-    AsyncGenerator<AggregationBatch> gen) {
-    std::vector<AggregationBatch> batches;
-    while (auto batch = co_await gen.next()) {
-        batches.push_back(std::move(*batch));
-    }
-    co_return batches;
-}
-
-}  // namespace
-
 TEST_SUITE("AggregatorUtility") {
     TEST_CASE("Collects event profile and system counter batches end-to-end") {
         TestEnvironment env(0);
         REQUIRE(env.is_valid());
 
-        auto trace = fs::path(env.get_dir()) / "mixed_trace.pfw";
+        auto trace_plain = fs::path(env.get_dir()) / "mixed_trace.pfw";
+        auto trace = fs::path(env.get_dir()) / "mixed_trace.pfw.gz";
         {
-            std::ofstream out(trace);
+            std::ofstream out(trace_plain);
             out << R"({"name":"read","cat":"POSIX","pid":7,"tid":3,"ts":1000,"dur":50,"ph":"X","args":{"ret":64,"bytes":64,"hhash":"event_h","fhash":"event_f"}})"
                 << "\n";
             out << R"({"name":"cpu_usage","cat":"PROFILE","pid":7,"tid":3,"ts":1500,"dur":0,"ph":"C","args":{"count":4,"dur_sum":80,"dur_min":10,"dur_max":30,"ret_sum":400,"ret_min":50,"ret_max":150,"bytes_sum":1000,"bytes_min":100,"bytes_max":400,"hhash":"profile_h","fhash":"profile_f"}})"
@@ -40,6 +33,8 @@ TEST_SUITE("AggregatorUtility") {
             out << R"({"name":"mem_bw","cat":"sys","pid":7,"tid":3,"ts":2500,"dur":0,"ph":"C","args":{"count":2,"dur_sum":40,"dur_min":15,"dur_max":25,"ret_sum":600,"ret_min":250,"ret_max":350,"bytes_sum":1200,"bytes_min":500,"bytes_max":700,"hhash":"system_h","fhash":"system_f"}})"
                 << "\n";
         }
+        REQUIRE(compress_file_to_gzip(trace_plain.string(), trace.string()));
+        fs::remove(trace_plain);
 
         AggregatorInput input;
         input.directory = env.get_dir();
@@ -49,8 +44,25 @@ TEST_SUITE("AggregatorUtility") {
         input.config.custom_metric_fields = {"bytes"};
         input.config.track_process_parents = false;
 
-        auto batches =
-            collect_batches(AggregatorUtility{}.process(input)).get();
+        Executor executor(ExecutorConfig{.num_threads = 2});
+        Scheduler scheduler(&executor);
+
+        std::vector<AggregationBatch> batches;
+        auto task = make_task(
+            [&](CoroScope& ctx) -> coro::CoroTask<void> {
+                AggregatorUtility agg;
+                agg.bind_context(ctx);
+                auto gen = agg.process(input);
+                while (auto batch = co_await gen.next()) {
+                    batches.push_back(std::move(*batch));
+                }
+                agg.unbind_context();
+            },
+            "AggregatorTest");
+
+        scheduler.schedule(task);
+        task->wait();
+        executor.shutdown();
 
         REQUIRE(batches.size() == 3);
 
@@ -77,32 +89,30 @@ TEST_SUITE("AggregatorUtility") {
         CHECK(profile_batch->entries.size() == 1);
         CHECK(system_batch->entries.size() == 1);
 
-        const auto& [event_key, event_metrics] = event_batch->entries.front();
-        CHECK(event_key.cat() == "POSIX");
-        CHECK(event_key.name() == "read");
-        CHECK(event_metrics.count == 1);
-        CHECK(event_metrics.duration.total == 50);
-        CHECK(event_metrics.size.total == 64);
-
-        const auto& [profile_key, profile_metrics] =
-            profile_batch->entries.front();
-        CHECK(profile_key.cat() == "PROFILE");
-        CHECK(profile_key.name() == "cpu_usage");
-        CHECK(profile_metrics.count == 4);
-        CHECK(profile_metrics.duration.total == 80);
-        CHECK(profile_metrics.size.total == 400);
-        REQUIRE(profile_metrics.custom_metrics != nullptr);
-        CHECK((*profile_metrics.custom_metrics)["bytes"].total == 1000);
-
-        const auto& [system_key, system_metrics] =
-            system_batch->entries.front();
-        CHECK(system_key.cat() == "sys");
-        CHECK(system_key.name() == "mem_bw");
-        CHECK(system_metrics.count == 2);
-        CHECK(system_metrics.duration.total == 40);
-        CHECK(system_metrics.size.total == 600);
-        REQUIRE(system_metrics.custom_metrics != nullptr);
-        CHECK((*system_metrics.custom_metrics)["bytes"].total == 1200);
+        const auto& event_entry = event_batch->entries.front();
+        CHECK(event_entry.key.cat() == "POSIX");
+        CHECK(event_entry.key.name() == "read");
+        CHECK(event_entry.metrics.count == 1);
+        CHECK(event_entry.metrics.duration.total == 50);
+        CHECK(event_entry.metrics.size.total == 64);
+
+        const auto& profile_entry = profile_batch->entries.front();
+        CHECK(profile_entry.key.cat() == "PROFILE");
+        CHECK(profile_entry.key.name() == "cpu_usage");
+        CHECK(profile_entry.metrics.count == 4);
+        CHECK(profile_entry.metrics.duration.total == 80);
+        CHECK(profile_entry.metrics.size.total == 400);
+        REQUIRE(profile_entry.metrics.custom_metrics != nullptr);
+        CHECK((*profile_entry.metrics.custom_metrics)["bytes"].total == 1000);
+
+        const auto& system_entry = system_batch->entries.front();
+        CHECK(system_entry.key.cat() == "sys");
+        CHECK(system_entry.key.name() == "mem_bw");
+        CHECK(system_entry.metrics.count == 2);
+        CHECK(system_entry.metrics.duration.total == 40);
+        CHECK(system_entry.metrics.size.total == 600);
+        REQUIRE(system_entry.metrics.custom_metrics != nullptr);
+        CHECK((*system_entry.metrics.custom_metrics)["bytes"].total == 1200);
 
         CHECK(event_batch->total_events_processed == 3);
         CHECK(profile_batch->total_events_processed == 3);
diff --git a/tests/utilities/composites/dft/aggregators/test_event_aggregator_utility.cpp b/tests/utilities/composites/dft/aggregators/test_event_aggregator_utility.cpp
index 490bc219..9ab786c9 100644
--- a/tests/utilities/composites/dft/aggregators/test_event_aggregator_utility.cpp
+++ b/tests/utilities/composites/dft/aggregators/test_event_aggregator_utility.cpp
@@ -1,5 +1,5 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
-#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator_utility.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/event_aggregator.h>
 #include <doctest/doctest.h>
 
 using namespace dftracer::utils::utilities::composites::dft::aggregators;
@@ -28,7 +28,7 @@ AggregationMetrics make_metrics(std::uint64_t count, std::uint64_t dur_total) {
 
 }  // namespace
 
-TEST_SUITE("EventAggregatorUtility") {
+TEST_SUITE("EventAggregator") {
     TEST_CASE("Merges event profile and system maps independently") {
         ChunkAggregationOutput first;
         first.success = true;
@@ -50,7 +50,7 @@ TEST_SUITE("EventAggregatorUtility") {
         second.profile_aggregations.emplace(make_key("PROFILE", "cpu"),
                                             make_metrics(1, 30));
 
-        EventAggregatorUtility utility;
+        EventAggregator utility;
         utility.merge_chunk(std::move(first));
         utility.merge_chunk(std::move(second));
         auto output = utility.finalize();
diff --git a/tests/utilities/composites/dft/aggregators/test_system_metrics.cpp b/tests/utilities/composites/dft/aggregators/test_system_metrics.cpp
new file mode 100644
index 00000000..a30880ad
--- /dev/null
+++ b/tests/utilities/composites/dft/aggregators/test_system_metrics.cpp
@@ -0,0 +1,309 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h>
+#include <doctest/doctest.h>
+
+#include <cmath>
+#include <limits>
+
+using namespace dftracer::utils::utilities::composites::dft::aggregators;
+
+TEST_SUITE("FloatMetricStats") {
+    TEST_CASE("default construction") {
+        FloatMetricStats fresh;
+        CHECK(fresh.sketch == nullptr);
+        CHECK(fresh.count == 0);
+        CHECK(fresh.total == 0.0);
+        CHECK(fresh.mean == 0.0);
+        CHECK(fresh.m2 == 0.0);
+        CHECK(fresh.min == std::numeric_limits<double>::max());
+        CHECK(fresh.max == std::numeric_limits<double>::lowest());
+    }
+
+    TEST_CASE("single value update") {
+        constexpr double kSample = 42.5;
+        FloatMetricStats stats;
+        stats.update(kSample);
+        CHECK(stats.count == 1);
+        CHECK(stats.min == doctest::Approx(kSample));
+        CHECK(stats.max == doctest::Approx(kSample));
+        CHECK(stats.mean == doctest::Approx(kSample));
+        CHECK(stats.total == doctest::Approx(kSample));
+        CHECK(stats.get_stddev() == doctest::Approx(0.0));
+    }
+
+    TEST_CASE("multiple values update") {
+        FloatMetricStats series;
+        series.update(10.0);
+        series.update(20.0);
+        series.update(30.0);
+        // Samples 10, 20, 30: mean 20, and get_stddev() is expected to be 10.
+        CHECK(series.count == 3);
+        CHECK(series.min == doctest::Approx(10.0));
+        CHECK(series.max == doctest::Approx(30.0));
+        CHECK(series.total == doctest::Approx(60.0));
+        CHECK(series.mean == doctest::Approx(20.0));
+        CHECK(series.get_stddev() == doctest::Approx(10.0));
+    }
+
+    TEST_CASE("update with percentiles") {
+        FloatMetricStats stats;
+        stats.update(10.0, true);
+        stats.update(20.0, true);
+        stats.update(30.0, true);
+
+        REQUIRE(stats.sketch != nullptr);  // dereferenced on the next line
+        CHECK(stats.sketch->quantile(0.5) ==
+              doctest::Approx(20.0).epsilon(0.1));
+    }
+
+    TEST_CASE("merge_from empty into empty") {
+        FloatMetricStats target, source;
+        target.merge_from(source);
+        // Nothing was recorded on either side, so count and total stay zero.
+        CHECK(target.total == 0.0);
+        CHECK(target.count == 0);
+    }
+
+    TEST_CASE("merge_from populated into empty") {
+        FloatMetricStats target, source;
+        source.update(10.0);
+        source.update(20.0);
+        // target has no samples yet, so it should take on source's summary.
+        target.merge_from(source);
+
+        CHECK(target.count == 2);
+        CHECK(target.min == doctest::Approx(10.0));
+        CHECK(target.max == doctest::Approx(20.0));
+        CHECK(target.mean == doctest::Approx(15.0));
+        CHECK(target.total == doctest::Approx(30.0));
+    }
+
+    TEST_CASE("merge_from two populated stats") {
+        FloatMetricStats low, high;
+        low.update(10.0);
+        low.update(20.0);
+        high.update(30.0);
+        high.update(40.0);
+        // After the merge, low is expected to summarize all four samples.
+        low.merge_from(high);
+
+        CHECK(low.count == 4);
+        CHECK(low.min == doctest::Approx(10.0));
+        CHECK(low.max == doctest::Approx(40.0));
+        CHECK(low.mean == doctest::Approx(25.0));
+        CHECK(low.total == doctest::Approx(100.0));
+    }
+
+    TEST_CASE("merge_from with sketches") {
+        FloatMetricStats low, high;
+        low.update(10.0, true);
+        low.update(20.0, true);
+        high.update(30.0, true);
+        high.update(40.0, true);
+        // Both sides were updated with percentiles enabled before merging.
+        low.merge_from(high);
+
+        CHECK(low.count == 4);
+        CHECK(low.sketch != nullptr);
+    }
+
+    TEST_CASE("copy construction") {
+        FloatMetricStats source;
+        source.update(10.0, true);
+        source.update(20.0, true);
+
+        FloatMetricStats clone(source);
+        // Scalars must match; the sketch pointer must differ from the source's.
+        CHECK(clone.count == source.count);
+        CHECK(clone.total == source.total);
+        CHECK(clone.mean == source.mean);
+        CHECK(clone.min == source.min);
+        CHECK(clone.max == source.max);
+        CHECK(clone.sketch != nullptr);
+        CHECK(clone.sketch != source.sketch);
+    }
+}
+
+TEST_SUITE("SystemAggregationMetrics") {
+    TEST_CASE("default construction") {
+        SystemAggregationMetrics fresh;
+        CHECK(fresh.metrics == nullptr);
+        CHECK(fresh.count == 0);
+        CHECK(fresh.te == 0);
+        CHECK(fresh.ts == std::numeric_limits<std::uint64_t>::max());
+    }
+
+    TEST_CASE("update_metric creates metrics map") {
+        SystemAggregationMetrics metrics;
+        metrics.update_metric("cpu_usage", 50.0);
+
+        REQUIRE(metrics.metrics != nullptr);  // dereferenced below
+        CHECK(metrics.metrics->size() == 1);
+        CHECK(metrics.metrics->at("cpu_usage").count == 1);
+        CHECK(metrics.metrics->at("cpu_usage").mean == doctest::Approx(50.0));
+    }
+
+    TEST_CASE("update_metric multiple metrics") {
+        SystemAggregationMetrics metrics;
+        metrics.update_metric("cpu_usage", 50.0);
+        metrics.update_metric("memory_usage", 70.0);
+        metrics.update_metric("cpu_usage", 60.0);
+        REQUIRE(metrics.metrics != nullptr);  // dereferenced below
+        CHECK(metrics.metrics->size() == 2);
+        CHECK(metrics.metrics->at("cpu_usage").count == 2);
+        CHECK(metrics.metrics->at("cpu_usage").mean == doctest::Approx(55.0));
+        CHECK(metrics.metrics->at("memory_usage").count == 1);
+    }
+
+    TEST_CASE("update_timestamp") {
+        SystemAggregationMetrics window;
+        window.update_timestamp(1000);
+        window.update_timestamp(500);
+        window.update_timestamp(1500);
+        // Expect ts to hold the smallest stamp and te the largest.
+        CHECK(window.te == 1500);
+        CHECK(window.ts == 500);
+    }
+
+    TEST_CASE("merge_from empty into empty") {
+        SystemAggregationMetrics target, source;
+        target.merge_from(source);
+        // Neither side recorded anything, so no metrics map is expected.
+        CHECK(target.metrics == nullptr);
+        CHECK(target.count == 0);
+    }
+
+    TEST_CASE("merge_from populated into empty") {
+        SystemAggregationMetrics a, b;
+        b.count = 2;
+        b.ts = 100;
+        b.te = 200;
+        b.update_metric("cpu", 50.0);
+
+        a.merge_from(b);
+
+        CHECK(a.count == 2);
+        CHECK(a.ts == 100);
+        CHECK(a.te == 200);
+        REQUIRE(a.metrics != nullptr);  // dereferenced below
+        CHECK(a.metrics->at("cpu").count == 1);
+    }
+
+    TEST_CASE("merge_from two populated metrics") {
+        SystemAggregationMetrics a, b;
+        a.count = 2;
+        a.ts = 100;
+        a.te = 200;
+        a.update_metric("cpu", 40.0);
+        a.update_metric("cpu", 60.0);
+
+        b.count = 2;
+        b.ts = 50;
+        b.te = 250;
+        b.update_metric("cpu", 50.0);
+        b.update_metric("memory", 80.0);
+        a.merge_from(b);
+
+        CHECK(a.count == 4);
+        CHECK(a.ts == 50);
+        CHECK(a.te == 250);
+        REQUIRE(a.metrics != nullptr);  // dereferenced below
+        CHECK(a.metrics->size() == 2);
+        CHECK(a.metrics->at("cpu").count == 3);
+        CHECK(a.metrics->at("memory").count == 1);
+    }
+
+    TEST_CASE("copy construction") {
+        SystemAggregationMetrics original;
+        original.count = 5;
+        original.ts = 100;
+        original.te = 500;
+        original.update_metric("cpu", 50.0);
+
+        SystemAggregationMetrics copy(original);
+
+        CHECK(copy.count == original.count);
+        CHECK(copy.ts == original.ts);
+        CHECK(copy.te == original.te);
+        REQUIRE(copy.metrics != nullptr);  // dereferenced below
+        CHECK(copy.metrics != original.metrics);
+        CHECK(copy.metrics->at("cpu").count == 1);
+    }
+}
+
+TEST_SUITE("SystemMetricsSerialization") {
+    TEST_CASE("key serialization round-trip") {
+        std::string host_hash = "host123";
+        std::uint64_t bucket = 42;
+        // Encode the (hhash, time_bucket) pair, then decode it again.
+        std::string encoded = serialize_system_key(host_hash, bucket);
+        auto decoded = deserialize_system_key(encoded);
+
+        CHECK(decoded.key.time_bucket == bucket);
+        CHECK(decoded.key.hhash == host_hash);
+    }
+
+    TEST_CASE("value serialization round-trip - empty metrics") {
+        SystemAggregationMetrics input;
+        input.te = 2000;
+        input.ts = 1000;
+        input.count = 10;
+        // update_metric() is never called, so input.metrics is left unset.
+        std::string encoded = serialize_system_value(input);
+        auto output = deserialize_system_value(encoded);
+
+        CHECK(output.metrics == nullptr);
+        CHECK(output.count == input.count);
+        CHECK(output.ts == input.ts);
+        CHECK(output.te == input.te);
+    }
+
+    TEST_CASE("value serialization round-trip - with metrics") {
+        SystemAggregationMetrics input;
+        input.count = 10;
+        input.ts = 1000;
+        input.te = 2000;
+        input.update_metric("cpu_user", 25.5);
+        input.update_metric("cpu_user", 30.0);
+        input.update_metric("cpu_system", 5.0);
+        input.update_metric("memory_available", 8000000.0);
+
+        std::string encoded = serialize_system_value(input);
+        auto output = deserialize_system_value(encoded);
+
+        CHECK(output.count == input.count);
+        CHECK(output.ts == input.ts);
+        CHECK(output.te == input.te);
+        REQUIRE(output.metrics != nullptr);
+        CHECK(output.metrics->size() == 3);
+        // Single-sample metrics: the mean is the sample itself.
+        auto& mem_avail = output.metrics->at("memory_available");
+        CHECK(mem_avail.count == 1);
+        CHECK(mem_avail.mean == doctest::Approx(8000000.0));
+
+        auto& sys_cpu = output.metrics->at("cpu_system");
+        CHECK(sys_cpu.count == 1);
+        CHECK(sys_cpu.mean == doctest::Approx(5.0));
+        // Two samples (25.5 and 30.0) were recorded for cpu_user.
+        auto& user_cpu = output.metrics->at("cpu_user");
+        CHECK(user_cpu.count == 2);
+        CHECK(user_cpu.min == doctest::Approx(25.5));
+        CHECK(user_cpu.max == doctest::Approx(30.0));
+        CHECK(user_cpu.mean == doctest::Approx(27.75));
+    }
+
+    TEST_CASE("value serialization preserves variance") {
+        SystemAggregationMetrics original;
+        original.count = 3;
+        original.update_metric("test", 10.0);
+        original.update_metric("test", 20.0);
+        original.update_metric("test", 30.0);
+
+        std::string serialized = serialize_system_value(original);
+        auto deserialized = deserialize_system_value(serialized);
+        REQUIRE(deserialized.metrics != nullptr);  // dereferenced below
+        auto& test_stats = deserialized.metrics->at("test");
+        CHECK(test_stats.get_stddev() == doctest::Approx(10.0));
+    }
+}
diff --git a/tests/utilities/composites/dft/aggregators/test_system_metrics_merge_operator.cpp b/tests/utilities/composites/dft/aggregators/test_system_metrics_merge_operator.cpp
new file mode 100644
index 00000000..b96008e8
--- /dev/null
+++ b/tests/utilities/composites/dft/aggregators/test_system_metrics_merge_operator.cpp
@@ -0,0 +1,183 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_merge_operator.h>
+#include <dftracer/utils/utilities/composites/dft/aggregators/system_metrics_serialization.h>
+#include <doctest/doctest.h>
+#include <rocksdb/slice.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace dftracer::utils::utilities::composites::dft::aggregators;
+
+TEST_SUITE("SystemMetricsMergeOperator") {
+    TEST_CASE("Name returns correct identifier") {
+        SystemMetricsMergeOperator op;
+        CHECK(std::string(op.Name()) == "SystemMetricsMergeOperator");
+    }
+
+    TEST_CASE("PartialMerge combines two operands") {
+        SystemMetricsMergeOperator op;
+
+        // Two overlapping operands: counts add, [ts, te] widens to the union.
+        SystemAggregationMetrics left;
+        left.count = 2;
+        left.ts = 100;
+        left.te = 200;
+        left.update_metric("cpu_user", 40.0);
+        left.update_metric("cpu_user", 50.0);
+
+        SystemAggregationMetrics right;
+        right.count = 1;
+        right.ts = 150;
+        right.te = 250;
+        right.update_metric("cpu_user", 60.0);
+        right.update_metric("memory", 1000.0);
+
+        std::string left_serialized = serialize_system_value(left);
+        std::string right_serialized = serialize_system_value(right);
+
+        std::string new_value;
+        ::rocksdb::Slice key("test_key");
+        ::rocksdb::Slice left_slice(left_serialized);
+        ::rocksdb::Slice right_slice(right_serialized);
+
+        bool result =
+            op.PartialMerge(key, left_slice, right_slice, &new_value, nullptr);
+        REQUIRE(result);  // don't parse new_value if the merge failed
+
+        auto merged = deserialize_system_value(new_value);
+        CHECK(merged.count == 3);
+        CHECK(merged.ts == 100);
+        CHECK(merged.te == 250);
+        REQUIRE(merged.metrics != nullptr);
+        CHECK(merged.metrics->size() == 2);
+        CHECK(merged.metrics->at("cpu_user").count == 3);
+        CHECK(merged.metrics->at("memory").count == 1);
+    }
+
+    TEST_CASE("FullMergeV2 merges existing value with operands") {
+        SystemMetricsMergeOperator op;
+
+        // Existing value, handed to FullMergeV2 as the stored base value
+        SystemAggregationMetrics existing;
+        existing.count = 1;
+        existing.ts = 50;
+        existing.te = 100;
+        existing.update_metric("cpu", 30.0);
+        std::string existing_serialized = serialize_system_value(existing);
+
+        // First operand
+        SystemAggregationMetrics op1;
+        op1.count = 1;
+        op1.ts = 100;
+        op1.te = 150;
+        op1.update_metric("cpu", 40.0);
+        std::string op1_serialized = serialize_system_value(op1);
+
+        // Second operand
+        SystemAggregationMetrics op2;
+        op2.count = 1;
+        op2.ts = 150;
+        op2.te = 200;
+        op2.update_metric("cpu", 50.0);
+        std::string op2_serialized = serialize_system_value(op2);
+
+        ::rocksdb::Slice key("test_key");
+        ::rocksdb::Slice existing_slice(existing_serialized);
+        std::vector<::rocksdb::Slice> operands = {
+            ::rocksdb::Slice(op1_serialized), ::rocksdb::Slice(op2_serialized)};
+
+        ::rocksdb::MergeOperator::MergeOperationInput merge_in(
+            key, &existing_slice, operands, nullptr);
+        std::string new_value;
+        ::rocksdb::Slice existing_operand;
+        ::rocksdb::MergeOperator::MergeOperationOutput merge_out(
+            new_value, existing_operand);
+
+        bool result = op.FullMergeV2(merge_in, &merge_out);
+        REQUIRE(result);  // don't parse new_value if the merge failed
+
+        auto merged = deserialize_system_value(new_value);
+        CHECK(merged.count == 3);
+        CHECK(merged.ts == 50);
+        CHECK(merged.te == 200);
+        REQUIRE(merged.metrics != nullptr);
+        CHECK(merged.metrics->at("cpu").count == 3);
+        CHECK(merged.metrics->at("cpu").mean == doctest::Approx(40.0));
+    }
+
+    TEST_CASE("FullMergeV2 handles null existing value") {
+        SystemMetricsMergeOperator op;
+
+        // First operand
+        SystemAggregationMetrics op1;
+        op1.count = 2;
+        op1.ts = 100;
+        op1.te = 200;
+        op1.update_metric("memory", 1000.0);
+        std::string op1_serialized = serialize_system_value(op1);
+
+        // Second operand
+        SystemAggregationMetrics op2;
+        op2.count = 3;
+        op2.ts = 200;
+        op2.te = 300;
+        op2.update_metric("memory", 2000.0);
+        std::string op2_serialized = serialize_system_value(op2);
+
+        ::rocksdb::Slice key("test_key");
+        std::vector<::rocksdb::Slice> operands = {
+            ::rocksdb::Slice(op1_serialized), ::rocksdb::Slice(op2_serialized)};
+
+        ::rocksdb::MergeOperator::MergeOperationInput merge_in(
+            key, nullptr, operands, nullptr);
+        std::string new_value;
+        ::rocksdb::Slice existing_operand;
+        ::rocksdb::MergeOperator::MergeOperationOutput merge_out(
+            new_value, existing_operand);
+
+        bool result = op.FullMergeV2(merge_in, &merge_out);
+        REQUIRE(result);  // don't parse new_value if the merge failed
+
+        auto merged = deserialize_system_value(new_value);
+        CHECK(merged.count == 5);
+        CHECK(merged.ts == 100);
+        CHECK(merged.te == 300);
+        REQUIRE(merged.metrics != nullptr);
+        CHECK(merged.metrics->at("memory").count == 2);
+    }
+
+    TEST_CASE("FullMergeV2 handles single operand") {
+        SystemMetricsMergeOperator op;
+
+        SystemAggregationMetrics op1;
+        op1.count = 5;
+        op1.ts = 100;
+        op1.te = 500;
+        op1.update_metric("disk_io", 100.0);
+        std::string op1_serialized = serialize_system_value(op1);
+
+        ::rocksdb::Slice key("test_key");
+        std::vector<::rocksdb::Slice> operands = {
+            ::rocksdb::Slice(op1_serialized)};
+
+        ::rocksdb::MergeOperator::MergeOperationInput merge_in(
+            key, nullptr, operands, nullptr);
+        std::string new_value;
+        ::rocksdb::Slice existing_operand;
+        ::rocksdb::MergeOperator::MergeOperationOutput merge_out(
+            new_value, existing_operand);
+
+        bool result = op.FullMergeV2(merge_in, &merge_out);
+        REQUIRE(result);  // don't parse new_value if the merge failed
+
+        auto merged = deserialize_system_value(new_value);
+        CHECK(merged.count == 5);
+        CHECK(merged.ts == 100);
+        CHECK(merged.te == 500);
+        REQUIRE(merged.metrics != nullptr);
+        CHECK(merged.metrics->at("disk_io").count == 1);
+    }
+}
diff --git a/tests/utilities/composites/dft/comparator/test_comparison_result.cpp b/tests/utilities/composites/dft/comparator/test_comparison_result.cpp
index 32737a80..585ee2e0 100644
--- a/tests/utilities/composites/dft/comparator/test_comparison_result.cpp
+++ b/tests/utilities/composites/dft/comparator/test_comparison_result.cpp
@@ -5,19 +5,29 @@
 
 #include <algorithm>
 #include <cmath>
-#include <string>
 
 using namespace dftracer::utils::utilities::composites::dft::comparator;
 using namespace dftracer::utils::utilities::composites::dft::aggregators;
 
-static MetricStats make_stats(double mean, double m2, uint64_t total,
-                              uint64_t min_val, uint64_t max_val) {
+// MetricStats representation change: `m2` now holds the raw power sum
+// `sum_x^2` (not Welford central M2). The caller passes `central_m2`
+// (central moment, = sum((x-mean)^2)); we translate it to raw via
+//   raw_sum_x^2 = central_m2 + n * mean^2
+// so callers keep Welford semantics but we store the new canonical form.
+static MetricStats make_stats(double mean, double central_m2, uint64_t total,
+                              uint64_t min_val, uint64_t max_val,
+                              uint64_t count = 0) {
     MetricStats s;
     s.mean = mean;
-    s.m2 = m2;
     s.total = total;
     s.min = min_val;
     s.max = max_val;
+    s.count = count;
+    // If count is not provided, estimate n as total / mean (no rounding).
+    const double n =
+        count > 0 ? static_cast<double>(count)
+                  : (mean != 0.0 ? static_cast<double>(total) / mean : 0.0);
+    s.m2 = central_m2 + n * mean * mean;
     return s;
 }
 
diff --git a/tests/utilities/composites/dft/indexing/test_bloom_query.cpp b/tests/utilities/composites/dft/indexing/test_bloom_query.cpp
index c27c9b96..abea71f1 100644
--- a/tests/utilities/composites/dft/indexing/test_bloom_query.cpp
+++ b/tests/utilities/composites/dft/indexing/test_bloom_query.cpp
@@ -3,6 +3,7 @@
 #include <dftracer/utils/utilities/composites/dft/indexing/bloom_filter.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/bloom_query_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <doctest/doctest.h>
 
@@ -19,17 +20,13 @@ using dftracer::utils::utilities::indexer::internal::get_logical_path;
 static void populate_test_idx(const std::string& index_path,
                               const std::string& file_path) {
     IndexDatabase idx_db(index_path);
-    idx_db.init_base_schema();
-    idx_db.init_bloom_schema();
+    auto writer = idx_db.begin_write();
+    writer->init_schema();
 
     int fid =
-        idx_db.get_or_create_file_info(get_logical_path(file_path), 12345);
+        writer->get_or_create_file_info(get_logical_path(file_path), 12345);
 
-    idx_db.begin_transaction();
-
-    // Create chunk bloom filters for 3 checkpoints
     for (int ckpt = 0; ckpt < 3; ++ckpt) {
-        // name dimension
         BloomFilter name_bloom(100, 0.01);
         if (ckpt == 0) {
             name_bloom.add("read");
@@ -43,11 +40,10 @@ static void populate_test_idx(const std::string& index_path,
         }
 
         auto blob = name_bloom.serialize();
-        idx_db.insert_chunk_bloom_filter(
+        writer->insert_chunk_bloom_filter(
             fid, static_cast<std::uint64_t>(ckpt), "name", blob.data(),
             static_cast<int>(blob.size()), name_bloom.num_entries());
 
-        // cat dimension
         BloomFilter cat_bloom(100, 0.01);
         if (ckpt == 0 || ckpt == 2) {
             cat_bloom.add("POSIX");
@@ -56,12 +52,11 @@ static void populate_test_idx(const std::string& index_path,
         }
 
         auto cat_blob = cat_bloom.serialize();
-        idx_db.insert_chunk_bloom_filter(
+        writer->insert_chunk_bloom_filter(
             fid, static_cast<std::uint64_t>(ckpt), "cat", cat_blob.data(),
             static_cast<int>(cat_blob.size()), cat_bloom.num_entries());
     }
 
-    // Create file-level bloom filters (merged from all chunks)
     BloomFilter file_name_bloom(100, 0.01);
     file_name_bloom.add("read");
     file_name_bloom.add("write");
@@ -69,42 +64,38 @@ static void populate_test_idx(const std::string& index_path,
     file_name_bloom.add("close");
     file_name_bloom.add("stat");
     auto name_blob = file_name_bloom.serialize();
-    idx_db.insert_file_bloom_filter(fid, "name", name_blob.data(),
-                                    static_cast<int>(name_blob.size()),
-                                    file_name_bloom.num_entries());
+    writer->insert_file_bloom_filter(fid, "name", name_blob.data(),
+                                     static_cast<int>(name_blob.size()),
+                                     file_name_bloom.num_entries());
 
     BloomFilter file_cat_bloom(100, 0.01);
     file_cat_bloom.add("POSIX");
     file_cat_bloom.add("storage");
     auto cat_blob = file_cat_bloom.serialize();
-    idx_db.insert_file_bloom_filter(fid, "cat", cat_blob.data(),
-                                    static_cast<int>(cat_blob.size()),
-                                    file_cat_bloom.num_entries());
+    writer->insert_file_bloom_filter(fid, "cat", cat_blob.data(),
+                                     static_cast<int>(cat_blob.size()),
+                                     file_cat_bloom.num_entries());
 
-    // Add fhash with resolution
     BloomFilter fhash_bloom(100, 0.01);
     fhash_bloom.add("abc123");
     auto fhash_blob = fhash_bloom.serialize();
-    idx_db.insert_file_bloom_filter(fid, "fhash", fhash_blob.data(),
-                                    static_cast<int>(fhash_blob.size()),
-                                    fhash_bloom.num_entries());
+    writer->insert_file_bloom_filter(fid, "fhash", fhash_blob.data(),
+                                     static_cast<int>(fhash_blob.size()),
+                                     fhash_bloom.num_entries());
 
     for (int ckpt = 0; ckpt < 3; ++ckpt) {
         auto blob = fhash_bloom.serialize();
-        idx_db.insert_chunk_bloom_filter(
+        writer->insert_chunk_bloom_filter(
             fid, static_cast<std::uint64_t>(ckpt), "fhash", blob.data(),
             static_cast<int>(blob.size()), fhash_bloom.num_entries());
     }
 
-    // Hash resolutions
-    idx_db.insert_hash_resolution(fid, "fhash", "abc123", "./data/file.h5");
-
-    // Record dimensions
-    idx_db.insert_index_dimension(fid, "name");
-    idx_db.insert_index_dimension(fid, "cat");
-    idx_db.insert_index_dimension(fid, "fhash");
+    writer->insert_hash_table_entry(0, "abc123", "./data/file.h5");
 
-    idx_db.commit_transaction();
+    writer->insert_index_dimension(fid, "name");
+    writer->insert_index_dimension(fid, "cat");
+    writer->insert_index_dimension(fid, "fhash");
+    writer->commit();
 }
 
 TEST_SUITE("BloomQueryUtility") {
diff --git a/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp b/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp
index 5a606ad8..e4dbad4b 100644
--- a/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp
+++ b/tests/utilities/composites/dft/indexing/test_chunk_pruner.cpp
@@ -4,6 +4,7 @@
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_pruner_utility.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <doctest/doctest.h>
 
@@ -20,89 +21,84 @@ using dftracer::utils::utilities::indexer::internal::get_logical_path;
 static void populate_test_idx(const std::string& index_path,
                               const std::string& file_path) {
     IndexDatabase idx_db(index_path);
-    idx_db.init_base_schema();
-    idx_db.init_bloom_schema();
+    auto writer = idx_db.begin_write();
+    writer->init_schema();
 
     int fid =
-        idx_db.get_or_create_file_info(get_logical_path(file_path), 12345);
+        writer->get_or_create_file_info(get_logical_path(file_path), 12345);
 
-    idx_db.begin_transaction();
-
-    // Chunk 0: POSIX reads, dur 100-200
     {
         ChunkDimensionStats cat_ds;
         cat_ds.dimension = "cat";
         cat_ds.value_type = "string";
         cat_ds.observe("POSIX");
         cat_ds.observe("POSIX");
-        idx_db.insert_chunk_dimension_stats(fid, 0, cat_ds);
+        writer->insert_chunk_dimension_stats(fid, 0, cat_ds);
 
         ChunkDimensionStats name_ds;
         name_ds.dimension = "name";
         name_ds.value_type = "string";
         name_ds.observe("read");
         name_ds.observe("read");
-        idx_db.insert_chunk_dimension_stats(fid, 0, name_ds);
+        writer->insert_chunk_dimension_stats(fid, 0, name_ds);
 
         ChunkDimensionStats dur_ds;
         dur_ds.dimension = "dur";
         dur_ds.value_type = "uint";
         dur_ds.observe("100");
         dur_ds.observe("200");
-        idx_db.insert_chunk_dimension_stats(fid, 0, dur_ds);
+        writer->insert_chunk_dimension_stats(fid, 0, dur_ds);
 
-        idx_db.insert_index_dimension(fid, "cat");
-        idx_db.insert_index_dimension(fid, "name");
-        idx_db.insert_index_dimension(fid, "dur");
+        writer->insert_index_dimension(fid, "cat");
+        writer->insert_index_dimension(fid, "name");
+        writer->insert_index_dimension(fid, "dur");
     }
 
-    // Chunk 1: STDIO writes, dur 500-600
     {
         ChunkDimensionStats cat_ds;
         cat_ds.dimension = "cat";
         cat_ds.value_type = "string";
         cat_ds.observe("STDIO");
-        idx_db.insert_chunk_dimension_stats(fid, 1, cat_ds);
+        writer->insert_chunk_dimension_stats(fid, 1, cat_ds);
 
         ChunkDimensionStats name_ds;
         name_ds.dimension = "name";
         name_ds.value_type = "string";
         name_ds.observe("write");
-        idx_db.insert_chunk_dimension_stats(fid, 1, name_ds);
+        writer->insert_chunk_dimension_stats(fid, 1, name_ds);
 
         ChunkDimensionStats dur_ds;
         dur_ds.dimension = "dur";
         dur_ds.value_type = "uint";
         dur_ds.observe("500");
         dur_ds.observe("600");
-        idx_db.insert_chunk_dimension_stats(fid, 1, dur_ds);
+        writer->insert_chunk_dimension_stats(fid, 1, dur_ds);
     }
 
-    // Chunk 2: POSIX + MPI mixed, dur 50-1000
     {
         ChunkDimensionStats cat_ds;
         cat_ds.dimension = "cat";
         cat_ds.value_type = "string";
         cat_ds.observe("POSIX");
         cat_ds.observe("MPI");
-        idx_db.insert_chunk_dimension_stats(fid, 2, cat_ds);
+        writer->insert_chunk_dimension_stats(fid, 2, cat_ds);
 
         ChunkDimensionStats name_ds;
         name_ds.dimension = "name";
         name_ds.value_type = "string";
         name_ds.observe("read");
         name_ds.observe("send");
-        idx_db.insert_chunk_dimension_stats(fid, 2, name_ds);
+        writer->insert_chunk_dimension_stats(fid, 2, name_ds);
 
         ChunkDimensionStats dur_ds;
         dur_ds.dimension = "dur";
         dur_ds.value_type = "uint";
         dur_ds.observe("50");
         dur_ds.observe("1000");
-        idx_db.insert_chunk_dimension_stats(fid, 2, dur_ds);
+        writer->insert_chunk_dimension_stats(fid, 2, dur_ds);
     }
 
-    idx_db.commit_transaction();
+    writer->commit();
 }
 
 static ChunkPrunerOutput run_pruner(const std::string& index_path,
diff --git a/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp b/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp
index 3761d6a2..02d17a08 100644
--- a/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp
+++ b/tests/utilities/composites/dft/indexing/test_manifest_index_builder.cpp
@@ -86,8 +86,7 @@ TEST_SUITE("ManifestIndexBuilder") {
 
         auto config = IndexBuildConfig::for_file(trace_file)
                           .with_index_dir(test_dir)
-                          .with_manifest(true)
-                          .with_index_threshold(0);
+                          .with_manifest(true);
 
         auto result = run_index_build(config);
 
@@ -97,8 +96,7 @@ TEST_SUITE("ManifestIndexBuilder") {
         CHECK(fs::exists(result.index_path));
 
         IndexDatabase idx_db(result.index_path);
-        idx_db.init_base_schema();
-        idx_db.init_manifest_schema();
+        idx_db.init_schema();
         int fid = idx_db.get_file_info_id(get_logical_path(trace_file));
         REQUIRE(fid >= 0);
 
@@ -136,7 +134,6 @@ TEST_SUITE("ManifestIndexBuilder") {
             auto config = IndexBuildConfig::for_file(trace_file)
                               .with_index_dir(test_dir)
                               .with_manifest(true)
-                              .with_index_threshold(0)
                               .with_force_rebuild(false);
 
             auto result = run_index_build(config);
@@ -149,7 +146,6 @@ TEST_SUITE("ManifestIndexBuilder") {
             auto config = IndexBuildConfig::for_file(trace_file)
                               .with_index_dir(test_dir)
                               .with_manifest(true)
-                              .with_index_threshold(0)
                               .with_force_rebuild(false);
 
             auto result = run_index_build(config);
diff --git a/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp b/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp
index b7450e18..d50c5a8b 100644
--- a/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp
+++ b/tests/utilities/composites/dft/indexing/test_manifest_queries.cpp
@@ -2,6 +2,7 @@
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/utilities/composites/dft/indexing/queries/manifest_queries.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <doctest/doctest.h>
 
@@ -33,19 +34,19 @@ TEST_SUITE("ManifestQueries") {
         std::string index_path = test_dir + "/test.pfw.gz.idx";
 
         IndexDatabase idx_db(index_path);
-        idx_db.init_base_schema();
-        idx_db.init_manifest_schema();
-        int fid =
-            idx_db.get_or_create_file_info(get_logical_path("test.pfw.gz"), 0);
-
-        idx_db.begin_transaction();
-
-        idx_db.insert_event_range(fid, 0, "POSIX", "read", {0, 2, 5});
-        idx_db.insert_event_range(fid, 0, "POSIX", "write", {1});
-        idx_db.insert_event_range(fid, 0, "APP", "compute", {3, 4});
-        idx_db.insert_event_range(fid, 1, "POSIX", "read", {0, 1});
-
-        idx_db.commit_transaction();
+        int fid;
+        {
+            auto writer = idx_db.begin_write();
+            writer->init_schema();
+            fid = writer->get_or_create_file_info(
+                get_logical_path("test.pfw.gz"), 0);
+
+            writer->insert_event_range(fid, 0, "POSIX", "read", {0, 2, 5});
+            writer->insert_event_range(fid, 0, "POSIX", "write", {1});
+            writer->insert_event_range(fid, 0, "APP", "compute", {3, 4});
+            writer->insert_event_range(fid, 1, "POSIX", "read", {0, 1});
+            writer->commit();
+        }
 
         auto all = idx_db.query_event_ranges(fid);
         CHECK(all.size() == 4);
@@ -71,18 +72,18 @@ TEST_SUITE("ManifestQueries") {
         std::string index_path = test_dir + "/test.pfw.gz.idx";
 
         IndexDatabase idx_db(index_path);
-        idx_db.init_base_schema();
-        idx_db.init_manifest_schema();
-        int fid =
-            idx_db.get_or_create_file_info(get_logical_path("test.pfw.gz"), 0);
-
-        idx_db.begin_transaction();
-
-        idx_db.insert_metadata_lines(fid, 0, "HH", {0, 3});
-        idx_db.insert_metadata_lines(fid, 0, "FH", {1});
-        idx_db.insert_metadata_lines(fid, 1, "HH", {0});
-
-        idx_db.commit_transaction();
+        int fid;
+        {
+            auto writer = idx_db.begin_write();
+            writer->init_schema();
+            fid = writer->get_or_create_file_info(
+                get_logical_path("test.pfw.gz"), 0);
+
+            writer->insert_metadata_lines(fid, 0, "HH", {0, 3});
+            writer->insert_metadata_lines(fid, 0, "FH", {1});
+            writer->insert_metadata_lines(fid, 1, "HH", {0});
+            writer->commit();
+        }
 
         auto all = idx_db.query_metadata_lines(fid);
         CHECK(all.size() == 3);
@@ -104,23 +105,33 @@ TEST_SUITE("ManifestQueries") {
         std::string index_path = test_dir + "/test.pfw.gz.idx";
 
         IndexDatabase idx_db(index_path);
-        idx_db.init_base_schema();
-        idx_db.init_manifest_schema();
-        int fid =
-            idx_db.get_or_create_file_info(get_logical_path("test.pfw.gz"), 0);
-
-        idx_db.begin_transaction();
-        idx_db.insert_event_range(fid, 0, "POSIX", "read", {0, 1});
-        idx_db.insert_metadata_lines(fid, 0, "HH", {2});
-        idx_db.commit_transaction();
+        int fid;
+        {
+            auto writer = idx_db.begin_write();
+            writer->init_schema();
+            fid = writer->get_or_create_file_info(
+                get_logical_path("test.pfw.gz"), 0);
+
+            writer->insert_event_range(fid, 0, "POSIX", "read", {0, 1});
+            writer->insert_metadata_lines(fid, 0, "HH", {2});
+            writer->commit();
+        }
 
         CHECK(idx_db.query_event_ranges(fid).size() == 1);
         CHECK(idx_db.query_metadata_lines(fid).size() == 1);
 
-        idx_db.delete_event_ranges(fid);
+        {
+            auto writer = idx_db.begin_write();
+            writer->delete_event_ranges(fid);
+            writer->commit();
+        }
         CHECK(idx_db.query_event_ranges(fid).empty());
 
-        idx_db.delete_metadata_lines(fid);
+        {
+            auto writer = idx_db.begin_write();
+            writer->delete_metadata_lines(fid);
+            writer->commit();
+        }
         CHECK(idx_db.query_metadata_lines(fid).empty());
 
         fs::remove_all(test_dir);
diff --git a/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp b/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp
index f4efdc68..f0a7c48a 100644
--- a/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp
+++ b/tests/utilities/composites/dft/reorganize/test_reconstruct_integration.cpp
@@ -39,6 +39,27 @@ using dftracer::utils::utilities::indexer::IndexBuilderUtility;
 using dftracer::utils::utilities::indexer::ProvenanceDatabase;
 namespace tags = dftracer::utils::utilities::tags;
 
+static ExtractionPlan run_planner(const ReorganizationPlannerInput& input) {
+    Runtime rt(4);
+    ExtractionPlan result;
+    auto* result_ptr = &result;
+    // The coroutine below stores the plan through result_ptr.
+    auto task = run_coro_scope(
+        rt.executor(),
+        [input, result_ptr](CoroScope& scope) -> coro::CoroTask<void> {
+            auto planner = std::make_shared<ReorganizationPlannerUtility>();
+            UtilityExecutor<ReorganizationPlannerInput, ExtractionPlan,
+                            tags::NeedsContext>
+                exec(planner, BehaviorChain<ReorganizationPlannerInput,
+                                            ExtractionPlan>{});
+            *result_ptr = co_await exec.execute_with_context(scope, input);
+        });
+    // Wait for the submitted task before shutting down and reading result.
+    rt.submit(std::move(task), "run_planner").wait();
+    rt.shutdown();
+    return result;
+}
+
 // Test trace layout:
 // Line 0: HH metadata
 // Line 1: FH metadata
@@ -103,8 +124,7 @@ static void build_idx(const std::string& trace_file,
                                             indexer::IndexBuildResult>{});
             auto config = IndexBuildConfig::for_file(trace_file)
                               .with_index_dir(index_dir)
-                              .with_manifest(true)
-                              .with_index_threshold(0);
+                              .with_manifest(true);
             *result_ptr = co_await exec.execute_with_context(scope, config);
         });
 
@@ -260,8 +280,6 @@ static void write_group_provenance(
         int fid = pdb.get_or_create_file_info(gz_path, 0);
         REQUIRE(fid >= 0);
 
-        pdb.begin_transaction();
-
         pdb.insert_info(fid, "version", "1.0");
         pdb.insert_info(fid, "tool", "dftracer_organize");
         pdb.insert_group(fid, g.name, g.query);
@@ -285,14 +303,13 @@ static void write_group_provenance(
         for (const auto& [src_idx, ckpts] : segment_events) {
             for (const auto& [ckpt, count] : ckpts) {
                 pdb.insert_segment(fid, static_cast<int>(src_idx),
-                                   static_cast<int>(ckpt), output_line,
+                                   static_cast<int>(ckpt), /*seq=*/0,
+                                   output_line,
                                    output_line + static_cast<int>(count),
                                    static_cast<int>(count));
                 output_line += static_cast<int>(count);
             }
         }
-
-        pdb.commit_transaction();
     }
 }
 
@@ -312,14 +329,13 @@ TEST_SUITE("ReconstructIntegration") {
         build_idx(trace_file, input_dir);
 
         // Step 2: Plan reorganization
-        ReorganizationPlannerUtility planner;
         ReorganizationPlannerInput planner_input;
         planner_input.source_files = {trace_file};
         planner_input.groups = {{"io", R"(cat == "POSIX")"},
                                 {"compute", R"(cat == "APP")"}};
         planner_input.index_dir = input_dir;
 
-        auto plan = planner.process(planner_input).get();
+        auto plan = run_planner(planner_input);
         REQUIRE(plan.tasks.size() > 0);
 
         // Step 3: Execute extraction
@@ -518,14 +534,13 @@ TEST_SUITE("ReconstructIntegration") {
         std::string trace_file = create_test_trace(input_dir);
         build_idx(trace_file, input_dir);
 
-        ReorganizationPlannerUtility planner;
         ReorganizationPlannerInput planner_input;
         planner_input.source_files = {trace_file};
         planner_input.groups = {{"io", R"(cat == "POSIX")"},
                                 {"compute", R"(cat == "APP")"}};
         planner_input.index_dir = input_dir;
 
-        auto plan = planner.process(planner_input).get();
+        auto plan = run_planner(planner_input);
         REQUIRE(plan.tasks.size() > 0);
 
         std::map<std::string, FILE*> group_files;
diff --git a/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp b/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp
index d0470119..72ecb760 100644
--- a/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp
+++ b/tests/utilities/composites/dft/reorganize/test_reconstruction_planner.cpp
@@ -50,8 +50,6 @@ TEST_SUITE("ReconstructionPlanner") {
             pdb.init_schema();
             int fid = pdb.get_or_create_file_info(reorg_file, 0);
 
-            pdb.begin_transaction();
-
             // Provenance info
             pdb.insert_info(fid, "version", "1.0");
             pdb.insert_info(fid, "tool", "dftracer_organize");
@@ -63,11 +61,9 @@ TEST_SUITE("ReconstructionPlanner") {
             pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 3, "abc123");
 
             // Provenance segments (3 checkpoints)
-            pdb.insert_segment(fid, 0, 0, 0, 100, 100);
-            pdb.insert_segment(fid, 0, 1, 100, 250, 150);
-            pdb.insert_segment(fid, 0, 2, 250, 400, 150);
-
-            pdb.commit_transaction();
+            pdb.insert_segment(fid, 0, 0, 0, 0, 100, 100);
+            pdb.insert_segment(fid, 0, 1, 0, 100, 250, 150);
+            pdb.insert_segment(fid, 0, 2, 0, 250, 400, 150);
         }
 
         // Run planner
@@ -134,17 +130,14 @@ TEST_SUITE("ReconstructionPlanner") {
             pdb.init_schema();
             int fid = pdb.get_or_create_file_info(io_file, 0);
 
-            pdb.begin_transaction();
             pdb.insert_info(fid, "version", "1.0");
             pdb.insert_info(fid, "tool", "dftracer_organize");
             pdb.insert_group(fid, "io", "cat=POSIX");
             pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 2, "hash1");
 
             // Segments for checkpoints 0 and 1
-            pdb.insert_segment(fid, 0, 0, 0, 50, 50);
-            pdb.insert_segment(fid, 0, 1, 50, 120, 70);
-
-            pdb.commit_transaction();
+            pdb.insert_segment(fid, 0, 0, 0, 0, 50, 50);
+            pdb.insert_segment(fid, 0, 1, 0, 50, 120, 70);
         }
 
         // Create .pidx for compute.pfw.gz
@@ -155,17 +148,14 @@ TEST_SUITE("ReconstructionPlanner") {
             pdb.init_schema();
             int fid = pdb.get_or_create_file_info(compute_file, 0);
 
-            pdb.begin_transaction();
             pdb.insert_info(fid, "version", "1.0");
             pdb.insert_info(fid, "tool", "dftracer_organize");
             pdb.insert_group(fid, "compute", "cat=APP");
             pdb.insert_source(fid, 0, "/original/trace.pfw.gz", 2, "hash1");
 
             // Segments for checkpoints 0 and 1
-            pdb.insert_segment(fid, 0, 0, 0, 30, 30);
-            pdb.insert_segment(fid, 0, 1, 30, 80, 50);
-
-            pdb.commit_transaction();
+            pdb.insert_segment(fid, 0, 0, 0, 0, 30, 30);
+            pdb.insert_segment(fid, 0, 1, 0, 30, 80, 50);
         }
 
         // Run planner with both files
diff --git a/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp b/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp
index d566ffa6..a06d1001 100644
--- a/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp
+++ b/tests/utilities/composites/dft/reorganize/test_reorganization_planner.cpp
@@ -27,6 +27,27 @@ using dftracer::utils::utilities::indexer::IndexBuilderUtility;
 using dftracer::utils::utilities::indexer::ProvenanceDatabase;
 namespace tags = dftracer::utils::utilities::tags;
 
+static ExtractionPlan run_planner(const ReorganizationPlannerInput& input) {
+    Runtime rt(4);  // NOTE(review): 4 is presumably the worker count
+    ExtractionPlan result;  // filled in by the coroutine, then returned
+    auto* result_ptr = &result;  // captured so the coroutine can write it
+
+    auto task = run_coro_scope(
+        rt.executor(),
+        [input, result_ptr](CoroScope& scope) -> coro::CoroTask<void> {
+            auto planner = std::make_shared<ReorganizationPlannerUtility>();
+            UtilityExecutor<ReorganizationPlannerInput, ExtractionPlan,
+                            tags::NeedsContext>
+                exec(planner, BehaviorChain<ReorganizationPlannerInput,
+                                            ExtractionPlan>{});
+            *result_ptr = co_await exec.execute_with_context(scope, input);
+        });  // NOTE(review): confirm closure outlives the coroutine
+
+    rt.submit(std::move(task), "run_planner").wait();  // wait before reading
+    rt.shutdown();
+    return result;
+}
+
 // Create a test trace with known events:
 // Line 0: HH metadata
 // Line 1: FH metadata
@@ -77,8 +98,7 @@ static void build_idx(const std::string& trace_file,
                                             indexer::IndexBuildResult>{});
             auto config = IndexBuildConfig::for_file(trace_file)
                               .with_index_dir(index_dir)
-                              .with_manifest(true)
-                              .with_index_threshold(0);
+                              .with_manifest(true);
             *result_ptr = co_await exec.execute_with_context(scope, config);
         });
 
@@ -133,13 +153,12 @@ TEST_SUITE("ReorganizationPlanner") {
         std::string trace_file = create_planner_test_trace(test_dir);
         build_idx(trace_file, test_dir);
 
-        ReorganizationPlannerUtility planner;
         ReorganizationPlannerInput input;
         input.source_files = {trace_file};
         input.groups = {{"io", R"(cat == "POSIX")"}};
         input.index_dir = test_dir;
 
-        auto plan = planner.process(input).get();
+        auto plan = run_planner(input);
 
         // Should have 2 groups: "io" + auto-created "remainder"
         CHECK(plan.groups.size() == 2);
@@ -197,14 +216,13 @@ TEST_SUITE("ReorganizationPlanner") {
         std::string trace_file = create_planner_test_trace(test_dir);
         build_idx(trace_file, test_dir);
 
-        ReorganizationPlannerUtility planner;
         ReorganizationPlannerInput input;
         input.source_files = {trace_file};
         input.groups = {{"io", R"(cat == "POSIX")"},
                         {"compute", R"(cat == "APP")"}};
         input.index_dir = test_dir;
 
-        auto plan = planner.process(input).get();
+        auto plan = run_planner(input);
 
         CHECK(plan.groups.size() == 3);
 
@@ -235,14 +253,13 @@ TEST_SUITE("ReorganizationPlanner") {
         std::string trace_file = create_planner_test_trace(test_dir);
         build_idx(trace_file, test_dir);
 
-        ReorganizationPlannerUtility planner;
         ReorganizationPlannerInput input;
         input.source_files = {trace_file};
         input.groups = {{"io", R"(cat == "POSIX")"},
                         {"compute", R"(cat == "APP")"}};
         input.index_dir = test_dir;
 
-        auto plan = planner.process(input).get();
+        auto plan = run_planner(input);
 
         for (const auto& t : plan.tasks) {
             if (t.target_group == "remainder") {
@@ -271,16 +288,12 @@ TEST_SUITE("ReorganizationPlanner") {
         pdb.init_schema();
         int fid = pdb.get_or_create_file_info("test.pfw.gz", 0);
 
-        pdb.begin_transaction();
-
         pdb.insert_info(fid, "version", "1.0");
         pdb.insert_info(fid, "created_at", "2026-02-17");
         pdb.insert_source(fid, 0, "/data/trace.pfw.gz", 9, "abc123");
         pdb.insert_group(fid, "io", R"(cat == "POSIX")");
-        pdb.insert_segment(fid, 0, 0, 0, 100, 50);
-        pdb.insert_segment(fid, 0, 1, 100, 200, 45);
-
-        pdb.commit_transaction();
+        pdb.insert_segment(fid, 0, 0, 0, 0, 100, 50);
+        pdb.insert_segment(fid, 0, 1, 0, 100, 200, 45);
 
         CHECK(pdb.query_info(fid, "version") == "1.0");
         CHECK(pdb.query_info(fid, "created_at") == "2026-02-17");
diff --git a/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp b/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp
index 29c8e6ee..eb4d6a67 100644
--- a/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp
+++ b/tests/utilities/composites/dft/reorganize/test_reorganize_integration.cpp
@@ -36,6 +36,27 @@ using dftracer::utils::utilities::indexer::IndexBuilderUtility;
 using dftracer::utils::utilities::indexer::ProvenanceDatabase;
 namespace tags = dftracer::utils::utilities::tags;
 
+static ExtractionPlan run_planner(const ReorganizationPlannerInput& input) {
+    Runtime rt(4);  // NOTE(review): 4 is presumably the worker count
+    ExtractionPlan result;  // filled in by the coroutine, then returned
+    auto* result_ptr = &result;  // captured so the coroutine can write it
+
+    auto task = run_coro_scope(
+        rt.executor(),
+        [input, result_ptr](CoroScope& scope) -> coro::CoroTask<void> {
+            auto planner = std::make_shared<ReorganizationPlannerUtility>();
+            UtilityExecutor<ReorganizationPlannerInput, ExtractionPlan,
+                            tags::NeedsContext>
+                exec(planner, BehaviorChain<ReorganizationPlannerInput,
+                                            ExtractionPlan>{});
+            *result_ptr = co_await exec.execute_with_context(scope, input);
+        });  // NOTE(review): confirm closure outlives the coroutine
+
+    rt.submit(std::move(task), "run_planner").wait();  // wait before reading
+    rt.shutdown();
+    return result;
+}
+
 // Test trace layout:
 // Line 0: HH metadata
 // Line 1: FH metadata
@@ -100,8 +121,7 @@ static void build_idx_for_file(const std::string& trace_file,
                                             indexer::IndexBuildResult>{});
             auto config = IndexBuildConfig::for_file(trace_file)
                               .with_index_dir(index_dir)
-                              .with_manifest(true)
-                              .with_index_threshold(0);
+                              .with_manifest(true);
             *result_ptr = co_await exec.execute_with_context(scope, config);
         });
 
@@ -231,14 +251,13 @@ TEST_SUITE("ReorganizeIntegration") {
         build_idx_for_file(trace_file, input_dir);
 
         // Step 2: Plan extraction
-        ReorganizationPlannerUtility planner;
         ReorganizationPlannerInput planner_input;
         planner_input.source_files = {trace_file};
         planner_input.groups = {{"io", R"(cat == "POSIX")"},
                                 {"compute", R"(cat == "APP")"}};
         planner_input.index_dir = input_dir;
 
-        auto plan = planner.process(planner_input).get();
+        auto plan = run_planner(planner_input);
         REQUIRE(plan.tasks.size() > 0);
 
         // Step 3: Execute extraction
@@ -318,13 +337,12 @@ TEST_SUITE("ReorganizeIntegration") {
         build_idx_for_file(trace_file, input_dir);
 
         // Plan for io group only
-        ReorganizationPlannerUtility planner;
         ReorganizationPlannerInput planner_input;
         planner_input.source_files = {trace_file};
         planner_input.groups = {{"io", R"(cat == "POSIX")"}};
         planner_input.index_dir = input_dir;
 
-        auto plan = planner.process(planner_input).get();
+        auto plan = run_planner(planner_input);
 
         // Extract io group
         std::string io_pfw = output_dir + "/io.pfw";
@@ -379,8 +397,7 @@ TEST_SUITE("ReorganizeIntegration") {
                                            indexer::IndexBuildResult>{});
                     auto config = IndexBuildConfig::for_file(io_gz)
                                       .with_index_dir(output_dir)
-                                      .with_manifest(true)
-                                      .with_index_threshold(0);
+                                      .with_manifest(true);
                     *idx_result_ptr =
                         co_await exec.execute_with_context(scope, config);
                 });
@@ -404,13 +421,11 @@ TEST_SUITE("ReorganizeIntegration") {
             int fid = pdb.get_or_create_file_info(io_gz, 0);
             REQUIRE(fid >= 0);
 
-            pdb.begin_transaction();
             pdb.insert_info(fid, "version", "1.0");
             pdb.insert_info(fid, "tool", "dftracer_organize");
             pdb.insert_group(fid, "io", R"(cat == "POSIX")");
             pdb.insert_source(fid, 0, trace_file, 1, "");
-            pdb.insert_segment(fid, 0, 0, 0, 5, 3);
-            pdb.commit_transaction();
+            pdb.insert_segment(fid, 0, 0, 0, 0, 5, 3);
         }
 
         // Verify provenance
diff --git a/tests/utilities/composites/dft/statistics/test_detailed_statistics.cpp b/tests/utilities/composites/dft/statistics/test_detailed_statistics.cpp
index 7c3d0318..3995c5ee 100644
--- a/tests/utilities/composites/dft/statistics/test_detailed_statistics.cpp
+++ b/tests/utilities/composites/dft/statistics/test_detailed_statistics.cpp
@@ -1,7 +1,7 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/utilities/composites/dft/statistics/detailed_statistics.h>
 #include <doctest/doctest.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <cmath>
 #include <string>
@@ -192,33 +192,36 @@ TEST_SUITE("DetailedStatistics") {
 
         std::string json = stats.to_json();
 
-        yyjson_doc* doc =
-            yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-        REQUIRE(doc != nullptr);
-
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        REQUIRE(yyjson_is_obj(root));
-
-        CHECK(yyjson_get_uint(yyjson_obj_get(root, "events_scanned")) == 2);
-        CHECK(yyjson_get_uint(yyjson_obj_get(root, "chunks_scanned")) == 1);
-        CHECK(yyjson_get_uint(yyjson_obj_get(root, "chunks_skipped")) == 3);
-
-        yyjson_val* dur = yyjson_obj_get(root, "duration");
-        REQUIRE(yyjson_is_obj(dur));
-        CHECK(yyjson_get_uint(yyjson_obj_get(dur, "count")) == 2);
-
-        yyjson_val* gd = yyjson_obj_get(root, "grouped_duration");
-        REQUIRE(yyjson_is_obj(gd));
-        yyjson_val* gd_read = yyjson_obj_get(gd, "read");
-        REQUIRE(yyjson_is_obj(gd_read));
-        CHECK(yyjson_get_uint(yyjson_obj_get(gd_read, "count")) == 1);
-
-        yyjson_val* gio = yyjson_obj_get(root, "grouped_io");
-        REQUIRE(yyjson_is_obj(gio));
-        yyjson_val* gio_read = yyjson_obj_get(gio, "read");
-        REQUIRE(yyjson_is_obj(gio_read));
-
-        yyjson_doc_free(doc);
+        simdjson::dom::parser parser;
+        auto result = parser.parse(json);
+        REQUIRE(!result.error());
+
+        auto root = result.value_unsafe();
+        REQUIRE(root.is_object());
+
+        CHECK(root["events_scanned"].get_uint64().value() == 2);
+        CHECK(root["chunks_scanned"].get_uint64().value() == 1);
+        CHECK(root["chunks_skipped"].get_uint64().value() == 3);
+
+        auto dur = root["duration"];
+        REQUIRE(!dur.error());
+        REQUIRE(dur.is_object());
+        CHECK(dur["count"].get_uint64().value() == 2);
+
+        auto gd = root["grouped_duration"];
+        REQUIRE(!gd.error());
+        REQUIRE(gd.is_object());
+        auto gd_read = gd["read"];
+        REQUIRE(!gd_read.error());
+        REQUIRE(gd_read.is_object());
+        CHECK(gd_read["count"].get_uint64().value() == 1);
+
+        auto gio = root["grouped_io"];
+        REQUIRE(!gio.error());
+        REQUIRE(gio.is_object());
+        auto gio_read = gio["read"];
+        REQUIRE(!gio_read.error());
+        REQUIRE(gio_read.is_object());
     }
 
     TEST_CASE("to_json - no grouped when empty") {
@@ -227,15 +230,13 @@ TEST_SUITE("DetailedStatistics") {
 
         std::string json = stats.to_json();
 
-        yyjson_doc* doc =
-            yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-        REQUIRE(doc != nullptr);
+        simdjson::dom::parser parser;
+        auto result = parser.parse(json);
+        REQUIRE(!result.error());
 
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        CHECK(yyjson_obj_get(root, "grouped_duration") == nullptr);
-        CHECK(yyjson_obj_get(root, "grouped_io") == nullptr);
-
-        yyjson_doc_free(doc);
+        auto root = result.value_unsafe();
+        CHECK(root["grouped_duration"].error());
+        CHECK(root["grouped_io"].error());
     }
 
     TEST_CASE("to_json - global duration always present") {
@@ -243,15 +244,14 @@ TEST_SUITE("DetailedStatistics") {
         // Even with no events, duration section should be present
         std::string json = stats.to_json();
 
-        yyjson_doc* doc =
-            yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-        REQUIRE(doc != nullptr);
-
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        yyjson_val* dur = yyjson_obj_get(root, "duration");
-        REQUIRE(yyjson_is_obj(dur));
-        CHECK(yyjson_get_uint(yyjson_obj_get(dur, "count")) == 0);
+        simdjson::dom::parser parser;
+        auto result = parser.parse(json);
+        REQUIRE(!result.error());
 
-        yyjson_doc_free(doc);
+        auto root = result.value_unsafe();
+        auto dur = root["duration"];
+        REQUIRE(!dur.error());
+        REQUIRE(dur.is_object());
+        CHECK(dur["count"].get_uint64().value() == 0);
     }
 }
diff --git a/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp b/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp
index 205c85fd..f4e9e1ae 100644
--- a/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp
+++ b/tests/utilities/composites/dft/statistics/test_statistics_aggregator.cpp
@@ -3,6 +3,7 @@
 #include <dftracer/utils/utilities/composites/dft/internal/utils.h>
 #include <dftracer/utils/utilities/composites/dft/statistics/statistics_aggregator_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <doctest/doctest.h>
 
@@ -16,13 +17,14 @@ using namespace dftracer::utils::utilities::composites::dft::internal;
 using namespace dftracer::utils::utilities::composites::dft::indexing;
 using namespace dftracer::utils::utilities::composites::dft::statistics;
 using dftracer::utils::utilities::indexer::IndexDatabase;
+using dftracer::utils::utilities::indexer::IndexDatabaseWriterContext;
 using dftracer::utils::utilities::indexer::internal::get_logical_path;
 
 static void write_chunk(
-    IndexDatabase& db, int fid, std::uint64_t checkpoint_idx,
+    IndexDatabaseWriterContext& writer, int fid, std::uint64_t checkpoint_idx,
     ChunkStatistics& stats,
     const std::vector<std::pair<std::string, std::string>>& dim_values) {
-    db.insert_chunk_statistics(fid, checkpoint_idx, stats);
+    writer.insert_chunk_statistics(fid, checkpoint_idx, stats);
 
     std::unordered_map<std::string, ChunkDimensionStats> dim_stats;
     for (const auto& [dim, val] : dim_values) {
@@ -32,27 +34,24 @@ static void write_chunk(
         ds.observe(val);
     }
     for (const auto& [dim, ds] : dim_stats) {
-        db.insert_chunk_dimension_stats(fid, checkpoint_idx, ds);
+        writer.insert_chunk_dimension_stats(fid, checkpoint_idx, ds);
     }
 }
 
 static void populate_test_db(const std::string& db_root,
                              const std::string& file_path) {
     IndexDatabase idx_db(db_root);
-    idx_db.init_base_schema();
-    idx_db.init_bloom_schema();
+    auto writer = idx_db.begin_write();
+    writer->init_schema();
 
     int fid =
-        idx_db.get_or_create_file_info(get_logical_path(file_path), 12345);
+        writer->get_or_create_file_info(get_logical_path(file_path), 12345);
 
-    idx_db.begin_transaction();
-
-    // Chunk 0: 2 events
     {
         ChunkStatistics stats;
         stats.update_from_event("read", "POSIX", 1, 1, 1000, 100);
         stats.update_from_event("write", "POSIX", 1, 2, 2000, 200);
-        write_chunk(idx_db, fid, 0, stats,
+        write_chunk(*writer, fid, 0, stats,
                     {{"cat", "POSIX"},
                      {"cat", "POSIX"},
                      {"name", "read"},
@@ -61,20 +60,18 @@ static void populate_test_db(const std::string& db_root,
                      {"pid_tid", "1:2"}});
     }
 
-    // Chunk 1: 1 event
     {
         ChunkStatistics stats;
         stats.update_from_event("open", "storage", 2, 1, 5000, 50);
-        write_chunk(idx_db, fid, 1, stats,
+        write_chunk(*writer, fid, 1, stats,
                     {{"cat", "storage"}, {"name", "open"}, {"pid_tid", "2:1"}});
     }
 
-    // Chunk 2: 2 events
     {
         ChunkStatistics stats;
         stats.update_from_event("read", "POSIX", 1, 1, 8000, 300);
         stats.update_from_event("stat", "POSIX", 3, 1, 9000, 10);
-        write_chunk(idx_db, fid, 2, stats,
+        write_chunk(*writer, fid, 2, stats,
                     {{"cat", "POSIX"},
                      {"cat", "POSIX"},
                      {"name", "read"},
@@ -83,7 +80,7 @@ static void populate_test_db(const std::string& db_root,
                      {"pid_tid", "3:1"}});
     }
 
-    idx_db.commit_transaction();
+    writer->commit();
 }
 
 TEST_SUITE("StatisticsAggregatorUtility") {
@@ -179,11 +176,13 @@ TEST_SUITE("StatisticsAggregatorUtility") {
             determine_index_path(test_dir + "/test.pfw.gz", "");
         std::string file_path = "/fake/test.pfw.gz";
 
-        // Create idx with file_info but no chunk_statistics
         IndexDatabase idx_db(db_root);
-        idx_db.init_base_schema();
-        idx_db.init_bloom_schema();
-        idx_db.get_or_create_file_info(get_logical_path(file_path), 12345);
+        {
+            auto writer = idx_db.begin_write();
+            writer->init_schema();
+            writer->get_or_create_file_info(get_logical_path(file_path), 12345);
+            writer->commit();
+        }
 
         StatisticsAggregatorUtility aggregator;
         StatisticsAggregatorInput input;
@@ -211,32 +210,31 @@ TEST_SUITE("StatisticsAggregatorUtility") {
         std::string file_path = "/fake/test.pfw.gz";
 
         IndexDatabase idx_db(db_root);
-        idx_db.init_base_schema();
-        idx_db.init_bloom_schema();
-        int fid =
-            idx_db.get_or_create_file_info(get_logical_path(file_path), 12345);
-
-        idx_db.begin_transaction();
-
-        // Chunk 0: durations 10, 20
+        int fid;
         {
-            ChunkStatistics stats;
-            stats.update_from_event("op", "cat", 1, 1, 1000, 10);
-            stats.update_from_event("op", "cat", 1, 1, 2000, 20);
-            idx_db.insert_chunk_statistics(fid, 0, stats);
+            auto writer = idx_db.begin_write();
+            writer->init_schema();
+            fid = writer->get_or_create_file_info(get_logical_path(file_path),
+                                                  12345);
+
+            {
+                ChunkStatistics stats;
+                stats.update_from_event("op", "cat", 1, 1, 1000, 10);
+                stats.update_from_event("op", "cat", 1, 1, 2000, 20);
+                writer->insert_chunk_statistics(fid, 0, stats);
+            }
+
+            {
+                ChunkStatistics stats;
+                stats.update_from_event("op", "cat", 1, 1, 3000, 30);
+                stats.update_from_event("op", "cat", 1, 1, 4000, 40);
+                stats.update_from_event("op", "cat", 1, 1, 5000, 50);
+                writer->insert_chunk_statistics(fid, 1, stats);
+            }
+
+            writer->commit();
         }
 
-        // Chunk 1: durations 30, 40, 50
-        {
-            ChunkStatistics stats;
-            stats.update_from_event("op", "cat", 1, 1, 3000, 30);
-            stats.update_from_event("op", "cat", 1, 1, 4000, 40);
-            stats.update_from_event("op", "cat", 1, 1, 5000, 50);
-            idx_db.insert_chunk_statistics(fid, 1, stats);
-        }
-
-        idx_db.commit_transaction();
-
         StatisticsAggregatorUtility aggregator;
         StatisticsAggregatorInput input;
         input.file_path = file_path;
diff --git a/tests/utilities/composites/dft/statistics/test_statistics_query.cpp b/tests/utilities/composites/dft/statistics/test_statistics_query.cpp
index 125fc6a2..18f3aa36 100644
--- a/tests/utilities/composites/dft/statistics/test_statistics_query.cpp
+++ b/tests/utilities/composites/dft/statistics/test_statistics_query.cpp
@@ -1,7 +1,7 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/utilities/composites/dft/statistics/statistics_query_utility.h>
 #include <doctest/doctest.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <string>
 
@@ -189,18 +189,16 @@ TEST_SUITE("StatisticsQueryUtility") {
             auto output = query.process(input).get();
             std::string json = output.to_json();
 
-            yyjson_doc* doc =
-                yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-            REQUIRE(doc != nullptr);
+            simdjson::dom::parser parser;
+            auto result = parser.parse(json);
+            REQUIRE(!result.error());
 
-            yyjson_val* root = yyjson_doc_get_root(doc);
-            REQUIRE(yyjson_is_obj(root));
+            auto root = result.value_unsafe();
+            REQUIRE(root.is_object());
 
             // query_type field should always be present
-            CHECK(yyjson_obj_get(root, "query_type") != nullptr);
-            CHECK(yyjson_obj_get(root, "total_events") != nullptr);
-
-            yyjson_doc_free(doc);
+            CHECK(!root["query_type"].error());
+            CHECK(!root["total_events"].error());
         }
     }
 }
diff --git a/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp b/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp
index 6af11bcb..2f3f0424 100644
--- a/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp
+++ b/tests/utilities/composites/dft/statistics/test_trace_statistics.cpp
@@ -1,7 +1,7 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/utilities/composites/dft/statistics/trace_statistics.h>
 #include <doctest/doctest.h>
-#include <yyjson.h>
+#include <simdjson.h>
 
 #include <cmath>
 #include <limits>
@@ -63,35 +63,36 @@ TEST_SUITE("TraceStatistics") {
         std::string json = ts.to_json();
 
         // Parse and validate the JSON
-        yyjson_doc* doc =
-            yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-        REQUIRE(doc != nullptr);
+        simdjson::dom::parser parser;
+        auto result = parser.parse(json);
+        REQUIRE(!result.error());
 
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        REQUIRE(yyjson_is_obj(root));
+        auto root = result.value_unsafe();
+        REQUIRE(root.is_object());
 
-        CHECK(std::string(yyjson_get_str(yyjson_obj_get(root, "file_path"))) ==
+        CHECK(std::string(root["file_path"].get_string().value()) ==
               "/test/file.pfw.gz");
-        CHECK(yyjson_get_bool(yyjson_obj_get(root, "success")) == true);
-        CHECK(yyjson_get_uint(yyjson_obj_get(root, "total_events")) == 2);
-        CHECK(yyjson_get_uint(yyjson_obj_get(root, "num_chunks")) == 2);
-        CHECK(yyjson_get_uint(yyjson_obj_get(root, "num_categories")) == 2);
-        CHECK(yyjson_get_uint(yyjson_obj_get(root, "num_unique_names")) == 2);
+        CHECK(root["success"].get_bool().value() == true);
+        CHECK(root["total_events"].get_uint64().value() == 2);
+        CHECK(root["num_chunks"].get_uint64().value() == 2);
+        CHECK(root["num_categories"].get_uint64().value() == 2);
+        CHECK(root["num_unique_names"].get_uint64().value() == 2);
 
         // Check time_range object exists
-        yyjson_val* time_range = yyjson_obj_get(root, "time_range");
-        REQUIRE(yyjson_is_obj(time_range));
+        auto time_range = root["time_range"];
+        REQUIRE(!time_range.error());
+        REQUIRE(time_range.is_object());
 
         // Check duration object exists
-        yyjson_val* duration = yyjson_obj_get(root, "duration");
-        REQUIRE(yyjson_is_obj(duration));
-        CHECK(yyjson_get_uint(yyjson_obj_get(duration, "count")) == 2);
+        auto duration = root["duration"];
+        REQUIRE(!duration.error());
+        REQUIRE(duration.is_object());
+        CHECK(duration["count"].get_uint64().value() == 2);
 
         // Check category_counts object exists
-        yyjson_val* cats = yyjson_obj_get(root, "category_counts");
-        REQUIRE(yyjson_is_obj(cats));
-
-        yyjson_doc_free(doc);
+        auto cats = root["category_counts"];
+        REQUIRE(!cats.error());
+        REQUIRE(cats.is_object());
     }
 
     TEST_CASE("TraceStatistics - to_json with error") {
@@ -103,15 +104,13 @@ TEST_SUITE("TraceStatistics") {
 
         std::string json = ts.to_json();
 
-        yyjson_doc* doc =
-            yyjson_read(json.c_str(), json.size(), YYJSON_READ_NOFLAG);
-        REQUIRE(doc != nullptr);
+        simdjson::dom::parser parser;
+        auto result = parser.parse(json);
+        REQUIRE(!result.error());
 
-        yyjson_val* root = yyjson_doc_get_root(doc);
-        CHECK(yyjson_get_bool(yyjson_obj_get(root, "success")) == false);
-        CHECK(std::string(yyjson_get_str(yyjson_obj_get(root, "error"))) ==
+        auto root = result.value_unsafe();
+        CHECK(root["success"].get_bool().value() == false);
+        CHECK(std::string(root["error"].get_string().value()) ==
               "File not found");
-
-        yyjson_doc_free(doc);
     }
 }
diff --git a/tests/utilities/composites/dft/test_index_builder.cpp b/tests/utilities/composites/dft/test_index_builder.cpp
index 19ae5f69..a927eb23 100644
--- a/tests/utilities/composites/dft/test_index_builder.cpp
+++ b/tests/utilities/composites/dft/test_index_builder.cpp
@@ -52,8 +52,7 @@ TEST_SUITE("IndexBuilder") {
 
             auto input = IndexBuildConfig::for_file(gz_file)
                              .with_index_dir("")
-                             .with_checkpoint_size(10)
-                             .with_index_threshold(0);
+                             .with_checkpoint_size(10);
 
             auto output = run_builder(input);
 
@@ -68,9 +67,8 @@ TEST_SUITE("IndexBuilder") {
         SUBCASE("Use existing index without force rebuild") {
             std::string gz_file = env.create_dft_test_gzip_file(20);
 
-            auto input1 = IndexBuildConfig::for_file(gz_file)
-                              .with_index_dir("")
-                              .with_index_threshold(0);
+            auto input1 =
+                IndexBuildConfig::for_file(gz_file).with_index_dir("");
 
             auto output1 = run_builder(input1);
             CHECK(output1.success == true);
@@ -89,8 +87,7 @@ TEST_SUITE("IndexBuilder") {
 
         auto input = IndexBuildConfig::for_file(gz_file)
                          .with_index_dir("")
-                         .with_force_rebuild(true)
-                         .with_index_threshold(0);
+                         .with_force_rebuild(true);
 
         auto output1 = run_builder(input);
         CHECK(output1.success == true);
diff --git a/tests/utilities/composites/dft/views/test_view_builder.cpp b/tests/utilities/composites/dft/views/test_view_builder.cpp
index ca2c86f3..92e9b814 100644
--- a/tests/utilities/composites/dft/views/test_view_builder.cpp
+++ b/tests/utilities/composites/dft/views/test_view_builder.cpp
@@ -4,6 +4,7 @@
 #include <dftracer/utils/utilities/composites/dft/views/view_builder_utility.h>
 #include <dftracer/utils/utilities/composites/dft/views/view_definition.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
 #include <doctest/doctest.h>
 
@@ -26,13 +27,11 @@ using dftracer::utils::utilities::indexer::internal::get_logical_path;
 static void populate_test_idx(const std::string& index_path,
                               const std::string& file_path) {
     IndexDatabase idx_db(index_path);
-    idx_db.init_base_schema();
-    idx_db.init_bloom_schema();
+    auto writer = idx_db.begin_write();
+    writer->init_schema();
 
     int fid =
-        idx_db.get_or_create_file_info(get_logical_path(file_path), 40000);
-
-    idx_db.begin_transaction();
+        writer->get_or_create_file_info(get_logical_path(file_path), 40000);
 
     struct ChunkDims {
         std::vector<std::string> names;
@@ -46,7 +45,6 @@ static void populate_test_idx(const std::string& index_path,
         {{"forward"}, {"compute", "ai_framework"}},
     };
 
-    // File-level blooms (union of all chunks)
     BloomFilter file_name_bloom(100, 0.01);
     BloomFilter file_cat_bloom(100, 0.01);
 
@@ -57,7 +55,7 @@ static void populate_test_idx(const std::string& index_path,
             file_name_bloom.add(n);
         }
         auto name_blob = name_bloom.serialize();
-        idx_db.insert_chunk_bloom_filter(
+        writer->insert_chunk_bloom_filter(
             fid, static_cast<std::uint64_t>(ckpt), "name", name_blob.data(),
             static_cast<int>(name_blob.size()), name_bloom.num_entries());
 
@@ -67,26 +65,24 @@ static void populate_test_idx(const std::string& index_path,
             file_cat_bloom.add(c);
         }
         auto cat_blob = cat_bloom.serialize();
-        idx_db.insert_chunk_bloom_filter(
+        writer->insert_chunk_bloom_filter(
             fid, static_cast<std::uint64_t>(ckpt), "cat", cat_blob.data(),
             static_cast<int>(cat_blob.size()), cat_bloom.num_entries());
     }
 
-    // File-level bloom filters
     auto name_blob = file_name_bloom.serialize();
-    idx_db.insert_file_bloom_filter(fid, "name", name_blob.data(),
-                                    static_cast<int>(name_blob.size()),
-                                    file_name_bloom.num_entries());
+    writer->insert_file_bloom_filter(fid, "name", name_blob.data(),
+                                     static_cast<int>(name_blob.size()),
+                                     file_name_bloom.num_entries());
 
     auto cat_blob = file_cat_bloom.serialize();
-    idx_db.insert_file_bloom_filter(fid, "cat", cat_blob.data(),
-                                    static_cast<int>(cat_blob.size()),
-                                    file_cat_bloom.num_entries());
-
-    idx_db.insert_index_dimension(fid, "name");
-    idx_db.insert_index_dimension(fid, "cat");
+    writer->insert_file_bloom_filter(fid, "cat", cat_blob.data(),
+                                     static_cast<int>(cat_blob.size()),
+                                     file_cat_bloom.num_entries());
 
-    idx_db.commit_transaction();
+    writer->insert_index_dimension(fid, "name");
+    writer->insert_index_dimension(fid, "cat");
+    writer->commit();
 }
 
 TEST_SUITE("ViewBuilderUtility") {
@@ -301,27 +297,27 @@ TEST_SUITE("ViewBuilderUtility") {
         std::string index_path = test_dir + "/test.pfw.gz.idx";
         std::string file_path = "/fake/test.pfw.gz";
 
-        // Create idx with fhash dimension
         IndexDatabase idx_db(index_path);
-        idx_db.init_base_schema();
-        idx_db.init_bloom_schema();
-        int fid =
-            idx_db.get_or_create_file_info(get_logical_path(file_path), 10000);
-        idx_db.begin_transaction();
-
-        BloomFilter fhash_bloom(100, 0.01);
-        fhash_bloom.add("hash123");
-        auto blob = fhash_bloom.serialize();
-
-        idx_db.insert_file_bloom_filter(fid, "fhash", blob.data(),
-                                        static_cast<int>(blob.size()),
-                                        fhash_bloom.num_entries());
-        idx_db.insert_chunk_bloom_filter(fid, 0, "fhash", blob.data(),
-                                         static_cast<int>(blob.size()),
-                                         fhash_bloom.num_entries());
-        idx_db.insert_index_dimension(fid, "fhash");
-        idx_db.insert_hash_resolution(fid, "fhash", "hash123", "/data/file.h5");
-        idx_db.commit_transaction();
+        {
+            auto writer = idx_db.begin_write();
+            writer->init_schema();
+            int fid = writer->get_or_create_file_info(
+                get_logical_path(file_path), 10000);
+
+            BloomFilter fhash_bloom(100, 0.01);
+            fhash_bloom.add("hash123");
+            auto blob = fhash_bloom.serialize();
+
+            writer->insert_file_bloom_filter(fid, "fhash", blob.data(),
+                                             static_cast<int>(blob.size()),
+                                             fhash_bloom.num_entries());
+            writer->insert_chunk_bloom_filter(fid, 0, "fhash", blob.data(),
+                                              static_cast<int>(blob.size()),
+                                              fhash_bloom.num_entries());
+            writer->insert_index_dimension(fid, "fhash");
+            writer->insert_hash_table_entry(0, "hash123", "/data/file.h5");
+            writer->commit();
+        }
 
         // Use "file" alias which should resolve to "fhash"
         ViewDefinition view;
diff --git a/tests/utilities/composites/test_file_merger.cpp b/tests/utilities/composites/test_file_merger.cpp
index aa2b46a2..28e5f869 100644
--- a/tests/utilities/composites/test_file_merger.cpp
+++ b/tests/utilities/composites/test_file_merger.cpp
@@ -2,9 +2,9 @@
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/utilities/composites/file_merger_utility.h>
 #include <doctest/doctest.h>
+#include <simdjson.h>
 #include <testing_utilities.h>
 #include <unistd.h>
-#include <yyjson.h>
 
 #include <fstream>
 #include <sstream>
@@ -156,13 +156,13 @@ TEST_SUITE("FileMerger") {
                 }
 
                 // Parse each JSON line
-                yyjson_doc *doc = yyjson_read(line.c_str(), line.size(), 0);
-                if (doc != nullptr) {
-                    yyjson_val *root = yyjson_doc_get_root(doc);
-                    if (yyjson_is_obj(root)) {
+                simdjson::dom::parser parser;
+                auto result = parser.parse(line);
+                if (!result.error()) {
+                    auto root = result.value_unsafe();
+                    if (root.is_object()) {
                         event_count++;
                     }
-                    yyjson_doc_free(doc);
                 }
             }
             ifs.close();
diff --git a/tests/utilities/fileio/parallel/test_layout_sizing.cpp b/tests/utilities/fileio/parallel/test_layout_sizing.cpp
new file mode 100644
index 00000000..5d96fd73
--- /dev/null
+++ b/tests/utilities/fileio/parallel/test_layout_sizing.cpp
@@ -0,0 +1,93 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/utilities/fileio/parallel/layout.h>
+#include <doctest/doctest.h>
+
+using dftracer::utils::utilities::fileio::parallel::compute_writer_sizing;
+using dftracer::utils::utilities::fileio::parallel::FileLayout;
+using dftracer::utils::utilities::fileio::parallel::FilesystemKind;
+using dftracer::utils::utilities::fileio::parallel::LayoutInfo;
+
+namespace {
+constexpr std::size_t MB = std::size_t{1} << 20;  // 1 MiB in bytes
+}  // namespace
+
+TEST_CASE("compute_writer_sizing - local FS uses defaults") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::LOCAL, 0, 0};
+    const auto sizing = compute_writer_sizing(layout, 8, 12 * MB, 4 * MB);
+    CHECK(sizing.num_workers == 8);
+    CHECK(sizing.flush_threshold == 12 * MB);
+    CHECK(sizing.buffer_capacity == 16 * MB);  // 12 MB flush + 4 MB headroom
+}
+
+TEST_CASE("compute_writer_sizing - NFS keeps defaults (no stripe info)") {
+    LayoutInfo layout{FileLayout::SHARDED, FilesystemKind::NFS, 0, 0};
+    const auto sizing = compute_writer_sizing(layout, 8, 12 * MB, 4 * MB);
+    CHECK(sizing.num_workers == 8);
+    CHECK(sizing.flush_threshold == 12 * MB);
+    CHECK(sizing.buffer_capacity == 16 * MB);  // 12 MB flush + 4 MB headroom
+}
+
+TEST_CASE("compute_writer_sizing - Lustre caps workers at stripe_count") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::LUSTRE, 1 * MB, 4};
+    const auto sizing = compute_writer_sizing(layout, 8, 12 * MB, 4 * MB);
+    CHECK(sizing.num_workers == 4);  // baseline 8 reduced to stripe_count
+    CHECK(sizing.flush_threshold == 12 * MB);  // stripe 1MB < default 12MB
+    CHECK(sizing.buffer_capacity == 16 * MB);
+}
+
+TEST_CASE("compute_writer_sizing - Lustre grows flush to stripe_size") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::LUSTRE, 32 * MB, 2};
+    const auto sizing = compute_writer_sizing(layout, 8, 12 * MB, 4 * MB);
+    CHECK(sizing.num_workers == 2);
+    CHECK(sizing.flush_threshold == 32 * MB);  // stripe 32MB > default 12MB
+    CHECK(sizing.buffer_capacity == 36 * MB);  // 32 MB flush + 4 MB headroom
+}
+
+TEST_CASE("compute_writer_sizing - baseline smaller than stripe_count wins") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::LUSTRE, 4 * MB, 16};
+    const auto sizing = compute_writer_sizing(layout, 4, 12 * MB, 4 * MB);
+    CHECK(sizing.num_workers == 4);  // baseline 4 < stripe_count 16
+}
+
+TEST_CASE("compute_writer_sizing - zero baseline coerced to one worker") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::LOCAL, 0, 0};
+    const auto sizing = compute_writer_sizing(layout, 0, 12 * MB, 4 * MB);
+    CHECK(sizing.num_workers == 1);  // a baseline of 0 still yields 1 worker
+}
+
+TEST_CASE(
+    "compute_writer_sizing - GPFS treated like Lustre when stripe given") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::GPFS, 8 * MB, 3};
+    const auto sizing = compute_writer_sizing(layout, 8, 12 * MB, 4 * MB);
+    CHECK(sizing.num_workers == 3);  // baseline 8 reduced to stripe_count
+    CHECK(sizing.flush_threshold == 12 * MB);  // stripe 8MB < default 12MB
+}
+
+TEST_CASE("compute_writer_sizing - padded layout clamps flush to stripe") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::LUSTRE, 4 * MB, 4};
+    const auto sizing = compute_writer_sizing(layout, 8, 12 * MB, 4 * MB,
+                                              /*padded_layout=*/true);
+    // In padded mode the worker count is not capped by stripe_count: the
+    // packer serializes stripe assembly, so extra compression workers help.
+    CHECK(sizing.num_workers == 8);
+    CHECK(sizing.flush_threshold == 4 * MB);  // stripe size, not the default
+    CHECK(sizing.buffer_capacity == 8 * MB);  // flush + headroom
+}
+
+TEST_CASE(
+    "compute_writer_sizing - padded keeps baseline regardless of stripe") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::LUSTRE, 32 * MB, 8};
+    const auto sizing = compute_writer_sizing(layout, 16, 12 * MB, 4 * MB,
+                                              /*padded_layout=*/true);
+    CHECK(sizing.num_workers == 16);  // stripe_count of 8 does not cap it
+    CHECK(sizing.flush_threshold == 32 * MB);
+    CHECK(sizing.buffer_capacity == 36 * MB);
+}
+
+TEST_CASE("compute_writer_sizing - padded_layout without stripe is a no-op") {
+    LayoutInfo layout{FileLayout::STRIPED, FilesystemKind::LOCAL, 0, 0};
+    const auto sizing = compute_writer_sizing(layout, 4, 12 * MB, 4 * MB,
+                                              /*padded_layout=*/true);
+    // With no stripe size reported the default flush threshold is kept.
+    CHECK(sizing.flush_threshold == 12 * MB);
+}
diff --git a/tests/utilities/fileio/parallel/test_padded_striped_writer.cpp b/tests/utilities/fileio/parallel/test_padded_striped_writer.cpp
new file mode 100644
index 00000000..4639b408
--- /dev/null
+++ b/tests/utilities/fileio/parallel/test_padded_striped_writer.cpp
@@ -0,0 +1,219 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/byte_view.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/runtime.h>
+#include <dftracer/utils/core/tasks/coro_scope.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <doctest/doctest.h>
+#include <zlib.h>
+
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::coro;
+using namespace dftracer::utils::utilities::fileio::parallel;
+
+namespace {
+
+constexpr std::size_t STRIPE = 1 * 1024 * 1024;  // 1 MB stripe for the test
+
+std::string tmp_path(const char* name) {
+    return (fs::temp_directory_path() / name).string();
+}
+
+// Compress `data` into a standalone gzip member with zlib.
+std::vector<std::uint8_t> gzip_member(const std::string& data) {
+    std::vector<std::uint8_t> out(compressBound(data.size()) + 64);
+    z_stream s{};
+    REQUIRE(deflateInit2(&s, Z_BEST_SPEED, Z_DEFLATED, 15 | 16, 8,
+                         Z_DEFAULT_STRATEGY) == Z_OK);
+    s.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(data.data()));
+    s.avail_in = static_cast<uInt>(data.size());
+    s.next_out = out.data();
+    s.avail_out = static_cast<uInt>(out.size());
+    const int rc = deflate(&s, Z_FINISH);
+    deflateEnd(&s);  // free state before REQUIRE so a failure cannot leak it
+    REQUIRE(rc == Z_STREAM_END);
+    out.resize(s.total_out);
+    return out;
+}
+
+ByteView as_bv(const std::vector<std::uint8_t>& v) {
+    return ByteView(reinterpret_cast<const std::byte*>(v.data()), v.size());
+}
+
+// Decompress the whole file via gzread (spans concatenated members natively).
+std::string gunzip_all(const std::string& path) {
+    gzFile g = gzopen(path.c_str(), "rb");
+    REQUIRE(g != nullptr);
+    std::string out;
+    char buf[4096];
+    int n = 0;
+    while ((n = gzread(g, buf, sizeof(buf))) > 0) out.append(buf, n);
+    const int close_rc = gzclose(g);  // close first so asserts can't leak g
+    REQUIRE(n == 0);                  // gzread returns -1 on a read error
+    REQUIRE(close_rc == Z_OK);        // Z_BUF_ERROR: file ended mid-stream
+    return out;
+}
+
+}  // namespace
+
+TEST_CASE("PaddedStripedWriter - gzip -t passes and payload round-trips") {
+    std::string path = tmp_path("test_padded_basic.json.gz");
+    std::remove(path.c_str());
+
+    const std::string hdr_text = "HDR\n";
+    const std::string w0_text = R"({"ev":0})"
+                                "\n";
+    const std::string w1_text = R"({"ev":1})"
+                                "\n";
+    const std::string w2_text = R"({"ev":2})"
+                                "\n";
+    const std::string ftr_text = "]\n";
+
+    const auto hdr_gz = gzip_member(hdr_text);
+    const auto w0_gz = gzip_member(w0_text);
+    const auto w1_gz = gzip_member(w1_text);
+    const auto w2_gz = gzip_member(w2_text);
+    const auto ftr_gz = gzip_member(ftr_text);
+
+    Runtime runtime(2);
+    int status = -1;
+    runtime
+        .scope("padded_basic",
+               [&](CoroScope& scope) -> CoroTask<void> {
+                   auto writer = make_padded_striped_writer(STRIPE);
+                   if (co_await writer->open(path, 3, true, &scope) != 0) {
+                       status = 1;
+                       co_return;
+                   }
+                   if (co_await writer->write_header(as_bv(hdr_gz)) != 0) {
+                       status = 2;
+                       co_return;
+                   }
+                   if (co_await writer->write_chunk(0, as_bv(w0_gz)) != 0) {
+                       status = 3;
+                       co_return;
+                   }
+                   if (co_await writer->write_chunk(1, as_bv(w1_gz)) != 0) {
+                       status = 4;
+                       co_return;
+                   }
+                   if (co_await writer->write_chunk(2, as_bv(w2_gz)) != 0) {
+                       status = 5;
+                       co_return;
+                   }
+                   if (co_await writer->write_footer(as_bv(ftr_gz)) != 0) {
+                       status = 6;
+                       co_return;
+                   }
+                   if (co_await writer->close() != 0) {
+                       status = 7;
+                       co_return;
+                   }
+                   status = 0;
+               })
+        .get();
+    CHECK(status == 0);
+
+    // Coalescing packs the three tiny worker members into one stripe, so the
+    // file is [header stripe][one coalesced stripe][footer bytes].
+    CHECK(fs::file_size(path) == 2 * STRIPE + ftr_gz.size());
+
+    const auto decompressed = gunzip_all(path);
+    // Expect hdr + w0 + w1 + w2 + footer. Worker order follows the atomic
+    // stripe allocation, which is sequential here because the calls above
+    // were serialized.
+    CHECK(decompressed ==
+          hdr_text + w0_text + w1_text + w2_text + ftr_text);
+
+    fs::remove(path);
+}
+
+TEST_CASE("PaddedStripedWriter - oversize chunk is rejected") {
+    std::string path = tmp_path("test_padded_oversize.gz");
+    std::remove(path.c_str());
+
+    // A full stripe of raw bytes: too large for a stripe minus its overhead.
+    const std::vector<std::uint8_t> big(STRIPE, 0xAA);
+
+    Runtime runtime(2);
+    int status = -1;
+    runtime
+        .scope("padded_oversize",
+               [&](CoroScope& scope) -> CoroTask<void> {
+                   auto writer = make_padded_striped_writer(STRIPE);
+                   if (co_await writer->open(path, 1, true, &scope) != 0) {
+                       status = 1;
+                       co_return;
+                   }
+                   auto write_rc = co_await writer->write_chunk(0, as_bv(big));
+                   co_await writer->close();
+                   status = (write_rc != 0) ? 0 : 99;  // the write must fail
+               })
+        .get();
+    CHECK(status == 0);
+    std::remove(path.c_str());
+}
+
+TEST_CASE(
+    "PaddedStripedWriter - file-level gzip integrity (decompress twice)") {
+    std::string path = tmp_path("test_padded_integrity.json.gz");
+    std::remove(path.c_str());
+
+    // A larger payload, so padding members are meaningfully exercised.
+    std::string payload;
+    payload.reserve(200 * 1024);
+    for (int i = 0; i < 2000; ++i) {
+        char row[128];
+        std::snprintf(row, sizeof(row),
+                      R"({"id":%d,"name":"evt_%d"} )"
+                      "\n",
+                      i, i);
+        payload.append(row);
+    }
+    const auto member = gzip_member(payload);
+    REQUIRE(member.size() + 25 < STRIPE);  // fits in one stripe with padding
+
+    Runtime runtime(2);
+    int status = -1;
+    runtime
+        .scope("padded_integrity",
+               [&](CoroScope& scope) -> CoroTask<void> {
+                   auto writer = make_padded_striped_writer(STRIPE);
+                   if (co_await writer->open(path, 1, true, &scope) != 0) {
+                       status = 1;
+                       co_return;
+                   }
+                   // The same member is written twice; the packer may
+                   // coalesce both copies into one stripe if they fit.
+                   if (co_await writer->write_chunk(0, as_bv(member)) != 0) {
+                       status = 2;
+                       co_return;
+                   }
+                   if (co_await writer->write_chunk(0, as_bv(member)) != 0) {
+                       status = 3;
+                       co_return;
+                   }
+                   if (co_await writer->close() != 0) {
+                       status = 4;
+                       co_return;
+                   }
+                   status = 0;
+               })
+        .get();
+    CHECK(status == 0);
+
+    // The two copies total 2*member.size() + 25 (pad overhead). They may share
+    // a stripe or spill into a second one depending on member size relative
+    // to STRIPE, so only stripe alignment of the file size is asserted.
+    CHECK(fs::file_size(path) % STRIPE == 0);
+    const auto restored = gunzip_all(path);
+    CHECK(restored == payload + payload);
+
+    fs::remove(path);
+}
diff --git a/tests/utilities/fileio/parallel/test_sharded_writer.cpp b/tests/utilities/fileio/parallel/test_sharded_writer.cpp
new file mode 100644
index 00000000..d7f656d5
--- /dev/null
+++ b/tests/utilities/fileio/parallel/test_sharded_writer.cpp
@@ -0,0 +1,109 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/byte_view.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/runtime.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <doctest/doctest.h>
+
+#include <cstdio>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::coro;
+using namespace dftracer::utils::utilities::fileio::parallel;
+
+namespace {
+
+std::string tmp_path(const char* name) {
+    return (fs::temp_directory_path() / name).string();
+}
+
+// Read the entire file in binary mode.
+std::string read_all(const std::string& path) {
+    std::ostringstream contents;
+    contents << std::ifstream(path, std::ios::binary).rdbuf();
+    return contents.str();
+}
+
+ByteView sv(const std::string& s) { return ByteView(s.data(), s.size()); }
+
+void cleanup(const std::vector<std::string>& paths) {
+    for (const std::string& stale : paths) std::remove(stale.c_str());
+}
+
+}  // namespace
+
+TEST_CASE("ShardedWriter - one shard per worker with header/footer placement") {
+    std::string base = tmp_path("test_sharded_basic");
+    std::vector<std::string> expected = {base + ".shard_0", base + ".shard_1",
+                                         base + ".shard_2"};
+    cleanup(expected);
+
+    Runtime runtime(2);
+    std::vector<std::string> reported;
+    auto job = [&]() -> CoroTask<int> {
+        auto writer = make_sharded_writer();
+        if (co_await writer->open(base, 3, false, nullptr) != 0) co_return 1;
+        if (co_await writer->write_header(sv("HDR\n")) != 0) co_return 2;
+        if (co_await writer->write_chunk(0, sv("A\n")) != 0) co_return 3;
+        if (co_await writer->write_chunk(1, sv("B\n")) != 0) co_return 4;
+        if (co_await writer->write_chunk(2, sv("C\n")) != 0) co_return 5;
+        if (co_await writer->write_footer(sv("END\n")) != 0) co_return 6;
+        reported = writer->output_paths();
+        if (co_await writer->close() != 0) co_return 7;
+        co_return 0;
+    };
+    CHECK(runtime.submit(job(), "sharded_basic").get() == 0);
+
+    REQUIRE(reported.size() == 3);
+    CHECK(reported == expected);
+
+    CHECK(read_all(reported[0]) == "HDR\nA\n");  // header + worker 0
+    CHECK(read_all(reported[1]) == "B\n");
+    CHECK(read_all(reported[2]) == "C\nEND\n");  // worker 2 + footer
+
+    cleanup(expected);
+}
+
+TEST_CASE("ShardedWriter - gzip_extension appends .gz to shard names") {
+    std::string base = tmp_path("test_sharded_gz");
+    std::vector<std::string> expected = {base + ".shard_0.gz",
+                                         base + ".shard_1.gz"};
+    cleanup(expected);
+
+    Runtime runtime(2);
+    std::vector<std::string> reported;
+    auto job = [&]() -> CoroTask<int> {
+        auto writer = make_sharded_writer();
+        if (co_await writer->open(base, 2, true, nullptr) != 0) co_return 1;
+        if (co_await writer->write_chunk(0, sv("X")) != 0) co_return 2;
+        if (co_await writer->write_chunk(1, sv("Y")) != 0) co_return 3;
+        reported = writer->output_paths();
+        if (co_await writer->close() != 0) co_return 4;
+        co_return 0;
+    };
+    CHECK(runtime.submit(job(), "sharded_gz").get() == 0);
+    CHECK(reported == expected);
+    for (const auto& shard : expected) CHECK(fs::exists(shard));
+    cleanup(expected);
+}
+
+TEST_CASE("ShardedWriter - out-of-range worker_idx fails") {
+    std::string base = tmp_path("test_sharded_oor");
+    cleanup({base + ".shard_0"});
+
+    Runtime runtime(2);
+    auto job = [&]() -> CoroTask<int> {
+        auto writer = make_sharded_writer();
+        if (co_await writer->open(base, 1, false, nullptr) != 0) co_return 1;
+        auto write_rc = co_await writer->write_chunk(5, sv("oops"));
+        co_await writer->close();
+        co_return (write_rc != 0) ? 0 : 99;  // index 5 with 1 worker must fail
+    };
+    CHECK(runtime.submit(job(), "sharded_oor").get() == 0);
+    cleanup({base + ".shard_0"});
+}
diff --git a/tests/utilities/fileio/parallel/test_striped_writer.cpp b/tests/utilities/fileio/parallel/test_striped_writer.cpp
new file mode 100644
index 00000000..dd839c15
--- /dev/null
+++ b/tests/utilities/fileio/parallel/test_striped_writer.cpp
@@ -0,0 +1,109 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/byte_view.h>
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/coro/task.h>
+#include <dftracer/utils/core/runtime.h>
+#include <dftracer/utils/utilities/fileio/parallel/parallel_writer.h>
+#include <doctest/doctest.h>
+
+#include <cstdio>
+#include <fstream>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace dftracer::utils;
+using namespace dftracer::utils::coro;
+using namespace dftracer::utils::utilities::fileio::parallel;
+
+namespace {
+
+std::string tmp_path(const char* name) {
+    return (fs::temp_directory_path() / name).string();
+}
+
+// Read the entire file in binary mode.
+std::string read_all(const std::string& path) {
+    std::ostringstream contents;
+    contents << std::ifstream(path, std::ios::binary).rdbuf();
+    return contents.str();
+}
+
+ByteView sv(const std::string& s) { return ByteView(s.data(), s.size()); }
+
+}  // namespace
+
+TEST_CASE("StripedWriter - header, chunks, footer land in single file") {
+    std::string path = tmp_path("test_striped_basic.txt");
+    std::remove(path.c_str());
+
+    Runtime runtime(2);
+    auto job = [&]() -> CoroTask<int> {
+        auto writer = make_striped_writer();
+        if (co_await writer->open(path, 4, false, nullptr) != 0) co_return 1;
+        if (co_await writer->write_header(sv("HDR\n")) != 0) co_return 2;
+        if (co_await writer->write_chunk(0, sv("worker0\n")) != 0) co_return 3;
+        if (co_await writer->write_chunk(1, sv("worker1\n")) != 0) co_return 4;
+        if (co_await writer->write_chunk(2, sv("worker2\n")) != 0) co_return 5;
+        if (co_await writer->write_footer(sv("END\n")) != 0) co_return 6;
+        if (co_await writer->close() != 0) co_return 7;
+        co_return 0;
+    };
+
+    CHECK(runtime.submit(job(), "striped_basic").get() == 0);
+
+    const auto content = read_all(path);
+    // Every byte must be there, whatever order the pieces were interleaved in.
+    CHECK(content.size() ==
+          std::string("HDR\nworker0\nworker1\nworker2\nEND\n").size());
+
+    // Collect the distinct lines; write order across workers is not asserted.
+    std::set<std::string> seen;
+    std::stringstream stream(content);
+    for (std::string row; std::getline(stream, row);) seen.insert(row);
+    CHECK(seen.count("HDR") == 1);
+    CHECK(seen.count("worker0") == 1);
+    CHECK(seen.count("worker1") == 1);
+    CHECK(seen.count("worker2") == 1);
+    CHECK(seen.count("END") == 1);
+
+    fs::remove(path);
+}
+
+TEST_CASE("StripedWriter - empty chunks are no-ops") {
+    std::string path = tmp_path("test_striped_empty.txt");
+    std::remove(path.c_str());
+
+    Runtime runtime(2);
+    auto job = [&]() -> CoroTask<int> {
+        auto writer = make_striped_writer();
+        if (co_await writer->open(path, 2, false, nullptr) != 0) co_return 1;
+        if (co_await writer->write_chunk(0, ByteView()) != 0) co_return 2;
+        if (co_await writer->write_chunk(1, ByteView()) != 0) co_return 3;
+        if (co_await writer->close() != 0) co_return 4;
+        co_return 0;
+    };
+    CHECK(runtime.submit(job(), "striped_empty").get() == 0);
+    CHECK(fs::exists(path));  // the file is created even with nothing written
+    CHECK(fs::file_size(path) == 0);
+    fs::remove(path);
+}
+
+TEST_CASE("StripedWriter - output_paths returns single entry") {
+    std::string path = tmp_path("test_striped_paths.txt");
+    std::remove(path.c_str());
+
+    Runtime runtime(2);
+    std::vector<std::string> reported;
+    auto job = [&]() -> CoroTask<int> {
+        auto writer = make_striped_writer();
+        if (co_await writer->open(path, 4, false, nullptr) != 0) co_return 1;
+        reported = writer->output_paths();
+        co_await writer->close();
+        co_return 0;
+    };
+    CHECK(runtime.submit(job(), "striped_paths").get() == 0);
+    REQUIRE(reported.size() == 1);
+    CHECK(reported[0] == path);  // the one entry is the path passed to open()
+    fs::remove(path);
+}
diff --git a/tests/utilities/indexer/test_index_builder.cpp b/tests/utilities/indexer/test_index_builder.cpp
index ce0f4432..789acebe 100644
--- a/tests/utilities/indexer/test_index_builder.cpp
+++ b/tests/utilities/indexer/test_index_builder.cpp
@@ -1,18 +1,23 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/rocksdb/db_manager.h>
 #include <dftracer/utils/core/runtime.h>
 #include <dftracer/utils/core/tasks/coro_scope.h>
 #include <dftracer/utils/core/utilities/behaviors/behavior_chain.h>
 #include <dftracer/utils/core/utilities/utility_executor.h>
+#include <dftracer/utils/utilities/composites/dft/event.h>
+#include <dftracer/utils/utilities/composites/dft/visitors/bloom_visitor.h>
 #include <dftracer/utils/utilities/indexer/index_builder_utility.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <dftracer/utils/utilities/indexer/internal/helpers.h>
-#include <dftracer/utils/utilities/indexer/visitors/bloom_visitor.h>
 #include <doctest/doctest.h>
+#include <simdjson.h>
 #include <testing_utilities.h>
 
 using namespace dftracer::utils;
 using namespace dftracer::utils::utilities::indexer;
+using namespace dftracer::utils::utilities::composites::dft::visitors;
 using namespace dftracer::utils::utilities::behaviors;
 using namespace dft_utils_test;
 
@@ -37,9 +42,7 @@ TEST_SUITE("IndexBuilder") {
         TestEnvironment env(1000);
         std::string gz_file = env.create_dft_test_gzip_file(1000);
 
-        auto config =
-            IndexBuildConfig::for_file(gz_file).with_bloom(false).with_manifest(
-                false);
+        auto config = IndexBuildConfig::for_file(gz_file).with_manifest(false);
 
         IndexBuildResult result;
         run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask<void> {
@@ -65,24 +68,35 @@ TEST_SUITE("IndexBuilder") {
 
         std::string json_line =
             R"({"name":"read","cat":"POSIX","pid":1,"tid":1,"ts":100,"dur":50,"ph":"X"})";
-        visitor.on_line(json_line, 0);
+        simdjson::dom::parser parser;
+        auto result = parser.parse(json_line.data(), json_line.size());
+        REQUIRE(!result.error());
+        dftracer::utils::utilities::common::json::JsonValue json(
+            result.value_unsafe());
+        dftracer::utils::utilities::composites::dft::DFTracerEvent ev;
+        REQUIRE(decltype(ev)::parse(json, ev));
+        dftracer::utils::utilities::composites::dft::EventRecord record{
+            ev, json, json_line, 0, 0};
+        visitor.on_event(record);
 
         CHECK(visitor.num_chunks() >= 1);
-        MESSAGE("BloomVisitor chunks after on_line: ", visitor.num_chunks());
+        MESSAGE("BloomVisitor chunks after on_event: ", visitor.num_chunks());
 
         auto db_path = dft_utils_test::make_unique_test_path("bloom_direct");
-        db_path += ".idx";
+        db_path /= ".dftindex";
+        fs::remove_all(db_path);
+        dftracer::utils::rocksdb::RocksDBManager::instance().reset(
+            db_path.string());
         {
             IndexDatabase db(db_path.string());
-            db.init_base_schema();
-            db.init_bloom_schema();
-            int fid = db.get_or_create_file_info("test.pfw.gz", 123);
-            db.begin_transaction();
-            visitor.finalize(db, fid);
-            db.commit_transaction();
+            db.init_schema();
+            auto writer = db.begin_write();
+            int fid = writer->get_or_create_file_info("test.pfw.gz", 123);
+            visitor.finalize(*writer, fid);
+            writer->commit();
             CHECK(db.has_bloom_data(fid));
         }
-        fs::remove(db_path);
+        fs::remove_all(db_path);
     }
 
     TEST_CASE("Build with bloom") {
@@ -90,9 +104,8 @@ TEST_SUITE("IndexBuilder") {
         std::string gz_file = env.create_dft_test_gzip_file(1000);
 
         auto config = IndexBuildConfig::for_file(gz_file)
-                          .with_bloom(true)
-                          .with_manifest(false)
-                          .with_index_threshold(0);
+
+                          .with_manifest(false);
 
         IndexBuildResult result;
         run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask<void> {
@@ -119,9 +132,8 @@ TEST_SUITE("IndexBuilder") {
         std::string gz_file = env.create_dft_test_gzip_file(1000);
 
         auto config = IndexBuildConfig::for_file(gz_file)
-                          .with_bloom(false)
-                          .with_manifest(true)
-                          .with_index_threshold(0);
+
+                          .with_manifest(true);
 
         IndexBuildResult result;
         run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask<void> {
@@ -148,9 +160,8 @@ TEST_SUITE("IndexBuilder") {
         std::string gz_file = env.create_dft_test_gzip_file(1000);
 
         auto config = IndexBuildConfig::for_file(gz_file)
-                          .with_bloom(true)
-                          .with_manifest(true)
-                          .with_index_threshold(0);
+
+                          .with_manifest(true);
 
         IndexBuildResult result;
         run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask<void> {
@@ -178,10 +189,9 @@ TEST_SUITE("IndexBuilder") {
         std::string gz_file = env.create_dft_test_gzip_file(1000);
 
         auto config = IndexBuildConfig::for_file(gz_file)
-                          .with_bloom(false)
+
                           .with_manifest(false)
-                          .with_force_rebuild(false)
-                          .with_index_threshold(0);
+                          .with_force_rebuild(false);
 
         IndexBuildResult first;
         run_coro([&config, &first](CoroScope& scope) -> coro::CoroTask<void> {
@@ -213,10 +223,9 @@ TEST_SUITE("IndexBuilder") {
         std::string gz_file = env.create_dft_test_gzip_file(1000);
 
         auto config_normal = IndexBuildConfig::for_file(gz_file)
-                                 .with_bloom(false)
+
                                  .with_manifest(false)
-                                 .with_force_rebuild(false)
-                                 .with_index_threshold(0);
+                                 .with_force_rebuild(false);
 
         IndexBuildResult first;
         run_coro([&config_normal,
@@ -231,10 +240,9 @@ TEST_SUITE("IndexBuilder") {
         REQUIRE(first.success);
 
         auto config_force = IndexBuildConfig::for_file(gz_file)
-                                .with_bloom(false)
+
                                 .with_manifest(false)
-                                .with_force_rebuild(true)
-                                .with_index_threshold(0);
+                                .with_force_rebuild(true);
 
         IndexBuildResult second;
         run_coro([&config_force,
@@ -254,9 +262,7 @@ TEST_SUITE("IndexBuilder") {
         TestEnvironment env(1000);
         std::string gz_file = env.create_dft_test_gzip_file(1000);
 
-        auto config =
-            IndexBuildConfig::for_file(gz_file).with_bloom(false).with_manifest(
-                false);
+        auto config = IndexBuildConfig::for_file(gz_file).with_manifest(false);
 
         IndexBuildResult result;
         run_coro([&config, &result](CoroScope& scope) -> coro::CoroTask<void> {
@@ -273,72 +279,14 @@ TEST_SUITE("IndexBuilder") {
         CHECK(result.total_lines >= 1000);
     }
 
-    TEST_CASE("Incremental bloom add to existing checkpoint-only index") {
-        TestEnvironment env(1000);
-        std::string gz_file = env.create_dft_test_gzip_file(1000);
-
-        // First build: checkpoint only
-        auto config1 = IndexBuildConfig::for_file(gz_file)
-                           .with_bloom(false)
-                           .with_manifest(false)
-                           .with_index_threshold(0);
-
-        IndexBuildResult r1;
-        run_coro([&config1, &r1](CoroScope& scope) -> coro::CoroTask<void> {
-            auto builder = std::make_shared<IndexBuilderUtility>();
-            UtilityExecutor<IndexBuildConfig, IndexBuildResult,
-                            tags::NeedsContext>
-                exec(builder,
-                     BehaviorChain<IndexBuildConfig, IndexBuildResult>{});
-            r1 = co_await exec.execute_with_context(scope, config1);
-        });
-        REQUIRE(r1.success);
-        CHECK(r1.index_created);
-
-        // Verify no bloom data yet
-        {
-            IndexDatabase db(r1.index_path);
-            int fid = db.get_file_info_id(internal::get_logical_path(gz_file));
-            CHECK(fid >= 0);
-            CHECK_FALSE(db.has_bloom_data(fid));
-        }
-
-        // Second build: add bloom (should NOT rebuild checkpoints)
-        auto config2 = IndexBuildConfig::for_file(gz_file)
-                           .with_bloom(true)
-                           .with_manifest(false)
-                           .with_index_threshold(0);
-
-        IndexBuildResult r2;
-        run_coro([&config2, &r2](CoroScope& scope) -> coro::CoroTask<void> {
-            auto builder = std::make_shared<IndexBuilderUtility>();
-            UtilityExecutor<IndexBuildConfig, IndexBuildResult,
-                            tags::NeedsContext>
-                exec(builder,
-                     BehaviorChain<IndexBuildConfig, IndexBuildResult>{});
-            r2 = co_await exec.execute_with_context(scope, config2);
-        });
-        REQUIRE(r2.success);
-        CHECK_FALSE(r2.was_skipped);
-
-        // Verify bloom data now exists
-        {
-            IndexDatabase db(r2.index_path);
-            int fid = db.get_file_info_id(internal::get_logical_path(gz_file));
-            CHECK(fid >= 0);
-            CHECK(db.has_bloom_data(fid));
-        }
-    }
-
     TEST_CASE("Incremental manifest add to existing index with bloom") {
         TestEnvironment env(1000);
         std::string gz_file = env.create_dft_test_gzip_file(1000);
 
         // First build: checkpoint + bloom
         auto config1 = IndexBuildConfig::for_file(gz_file)
-                           .with_bloom(true)
-                           .with_manifest(false)
-                           .with_index_threshold(0);
+
+                           .with_manifest(false);
 
         IndexBuildResult r1;
         run_coro([&config1, &r1](CoroScope& scope) -> coro::CoroTask<void> {
@@ -360,9 +308,8 @@ TEST_SUITE("IndexBuilder") {
 
         // Second build: add manifest (bloom already exists, skip it)
         auto config2 = IndexBuildConfig::for_file(gz_file)
-                           .with_bloom(false)
-                           .with_manifest(true)
-                           .with_index_threshold(0);
+
+                           .with_manifest(true);
 
         IndexBuildResult r2;
         run_coro([&config2, &r2](CoroScope& scope) -> coro::CoroTask<void> {
@@ -391,9 +338,8 @@ TEST_SUITE("IndexBuilder") {
 
         // Build with bloom + manifest
         auto config1 = IndexBuildConfig::for_file(gz_file)
-                           .with_bloom(true)
-                           .with_manifest(true)
-                           .with_index_threshold(0);
+
+                           .with_manifest(true);
 
         IndexBuildResult r1;
         run_coro([&config1, &r1](CoroScope& scope) -> coro::CoroTask<void> {
diff --git a/tests/utilities/indexer/test_index_database.cpp b/tests/utilities/indexer/test_index_database.cpp
index b8345791..d5aeed17 100644
--- a/tests/utilities/indexer/test_index_database.cpp
+++ b/tests/utilities/indexer/test_index_database.cpp
@@ -1,14 +1,17 @@
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include <dftracer/utils/core/common/filesystem.h>
 #include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
 #include <doctest/doctest.h>
 #include <testing_utilities.h>
 
 #include <string>
 #include <vector>
 
-namespace fs = std::filesystem;
+using dftracer::utils::utilities::indexer::ChunkStatistics;
 using dftracer::utils::utilities::indexer::IndexDatabase;
+using dftracer::utils::utilities::indexer::IndexDatabaseWriterContext;
+using dftracer::utils::utilities::indexer::MergedStatisticsResult;
 
 TEST_SUITE("IndexDatabase") {
     TEST_CASE("normalizes legacy .idx-style input to root-local .dftindex") {
@@ -27,10 +30,23 @@ TEST_SUITE("IndexDatabase") {
         IndexDatabase db1((root / ".dftindex").string());
         IndexDatabase db2((root / "other-name.idx").string());
 
-        db1.init_base_schema();
-        db2.init_base_schema();
+        {
+            auto writer = db1.begin_write();
+            writer->init_schema();
+            writer->commit();
+        }
+        {
+            auto writer = db2.begin_write();
+            writer->init_schema();
+            writer->commit();
+        }
 
-        int id1 = db1.get_or_create_file_info("a.pfw.gz", 0x1111);
+        int id1;
+        {
+            auto writer = db1.begin_write();
+            id1 = writer->get_or_create_file_info("a.pfw.gz", 0x1111);
+            writer->commit();
+        }
         int id2 = db2.get_file_info_id("a.pfw.gz");
 
         CHECK(id1 > 0);
@@ -42,29 +58,41 @@ TEST_SUITE("IndexDatabase") {
         fs::create_directories(root);
 
         IndexDatabase db((root / ".dftindex").string());
-        db.init_base_schema();
-        db.init_bloom_schema();
-        db.init_manifest_schema();
-
-        const int file_id = db.get_or_create_file_info("trace.pfw.gz", 0xAAAA);
-
-        std::vector<unsigned char> blob = {0xDE, 0xAD, 0xBE, 0xEF};
-        db.insert_chunk_bloom_filter(file_id, 0, "name", std::span(blob), 4);
-        db.insert_file_bloom_filter(file_id, "name", std::span(blob), 4);
-        db.insert_index_dimension(file_id, "name");
-        db.insert_hash_resolution(file_id, "fhash", "hashA", "resolvedA");
-        db.insert_event_range(file_id, 0, "POSIX", "read",
-                              std::vector<std::uint32_t>{1, 2, 3});
-        db.insert_metadata_lines(file_id, 0, "HH",
-                                 std::vector<std::uint32_t>{0, 4});
+
+        int file_id;
+        {
+            auto writer = db.begin_write();
+            writer->init_schema();
+
+            file_id = writer->get_or_create_file_info("trace.pfw.gz", 0xAAAA);
+
+            std::vector<unsigned char> blob = {0xDE, 0xAD, 0xBE, 0xEF};
+            writer->insert_chunk_bloom_filter(file_id, 0, "name",
+                                              std::span(blob), 4);
+            writer->insert_file_bloom_filter(file_id, "name", std::span(blob),
+                                             4);
+            writer->insert_index_dimension(file_id, "name");
+            writer->insert_hash_table_entry(0, "hashA", "resolvedA");
+            writer->insert_event_range(file_id, 0, "POSIX", "read",
+                                       std::vector<std::uint32_t>{1, 2, 3});
+            writer->insert_metadata_lines(file_id, 0, "HH",
+                                          std::vector<std::uint32_t>{0, 4});
+            writer->commit();
+        }
 
         CHECK(db.has_bloom_data(file_id));
         CHECK(db.has_manifest_data(file_id));
         CHECK(db.query_file_bloom_filter(file_id, "name").has_value());
-        CHECK(db.query_resolved_by_hash("fhash", "hashA").has_value());
+        CHECK(db.resolve_hash(IndexDatabase::HashType::FILE, "hashA")
+                  .has_value());
 
-        const int rebuilt_id =
-            db.get_or_create_file_info("trace.pfw.gz", 0xBBBB);
+        int rebuilt_id;
+        {
+            auto writer = db.begin_write();
+            rebuilt_id =
+                writer->get_or_create_file_info("trace.pfw.gz", 0xBBBB);
+            writer->commit();
+        }
         CHECK(rebuilt_id == file_id);
 
         CHECK_FALSE(db.has_bloom_data(file_id));
@@ -73,26 +101,209 @@ TEST_SUITE("IndexDatabase") {
         CHECK(db.query_chunk_bloom_filters(file_id, "name").empty());
         CHECK(db.query_event_ranges(file_id).empty());
         CHECK(db.query_metadata_lines(file_id).empty());
-        CHECK_FALSE(db.query_resolved_by_hash("fhash", "hashA").has_value());
+        CHECK(db.resolve_hash(IndexDatabase::HashType::FILE, "hashA")
+                  .has_value());
     }
 
-    TEST_CASE("rollback discards transactional writes") {
-        auto root = dft_utils_test::make_unique_test_path("idx_rollback");
+    TEST_CASE("writer context batches multiple files and all are readable") {
+        auto root = dft_utils_test::make_unique_test_path("idx_writer_ctx");
         fs::create_directories(root);
 
         IndexDatabase db((root / ".dftindex").string());
-        db.init_base_schema();
-        db.init_bloom_schema();
+        db.init_schema();
 
-        const int file_id = db.get_or_create_file_info("trace.pfw.gz", 0xAAAA);
-        std::vector<unsigned char> blob = {0xAB, 0xCD};
+        static constexpr int NUM_FILES = 100;
+        static constexpr int BATCH_SIZE = 10;
 
-        db.begin_transaction();
-        db.insert_file_bloom_filter(file_id, "name", std::span(blob), 2);
-        db.insert_hash_resolution(file_id, "fhash", "hashA", "resolvedA");
-        db.rollback_transaction();
+        // Create file IDs first
+        std::vector<int> file_ids;
+        {
+            auto writer = db.begin_write();
+            for (int i = 0; i < NUM_FILES; ++i) {
+                auto name = "file_" + std::to_string(i) + ".pfw.gz";
+                int fid = writer->get_or_create_file_info(name, i + 1);
+                file_ids.push_back(fid);
+            }
+            writer->commit();
+        }
+        CHECK(file_ids.size() == NUM_FILES);
 
-        CHECK_FALSE(db.query_file_bloom_filter(file_id, "name").has_value());
-        CHECK_FALSE(db.query_resolved_by_hash("fhash", "hashA").has_value());
+        // Write scalar stats in batches
+        for (int batch_start = 0; batch_start < NUM_FILES;
+             batch_start += BATCH_SIZE) {
+            auto writer = db.begin_write();
+            int batch_end = std::min(batch_start + BATCH_SIZE, NUM_FILES);
+            for (int i = batch_start; i < batch_end; ++i) {
+                ChunkStatistics stats;
+                stats.total_events = static_cast<std::uint64_t>(i + 1) * 100;
+                writer->insert_file_scalar_stats(file_ids[i], stats, 1);
+            }
+            writer->commit();
+        }
+
+        // Verify ALL data is readable
+        auto results = db.query_file_scalar_stats_batch(file_ids);
+        CHECK(results.size() == NUM_FILES);
+
+        std::uint64_t total_events = 0;
+        for (int i = 0; i < NUM_FILES; ++i) {
+            auto it = results.find(file_ids[i]);
+            REQUIRE(it != results.end());
+            CHECK(it->second.stats.total_events ==
+                  static_cast<std::uint64_t>(i + 1) * 100);
+            total_events += it->second.stats.total_events;
+        }
+        CHECK(total_events == 505000);  // sum of 100+200+...+10000
+    }
+
+    TEST_CASE("PID manifest - insert and query single file PIDs") {
+        auto dir = dft_utils_test::make_unique_test_path("idx_pid_single");
+        fs::create_directories(dir);
+
+        IndexDatabase db((dir / ".dftindex").string());
+
+        int file_id;
+        {
+            // Single writer scope: init schema, register file, store PIDs.
+            auto txn = db.begin_write();
+            txn->init_schema();
+            file_id = txn->get_or_create_file_info("trace.pfw.gz", 0xAAAA);
+            txn->insert_file_pids(file_id, {1234, 5678, 9012});
+            txn->commit();
+        }
+
+        // Every inserted PID, and nothing else, must come back.
+        auto stored = db.query_file_pids(file_id);
+        CHECK(stored.size() == 3);
+        CHECK(stored.count(1234) == 1);
+        CHECK(stored.count(5678) == 1);
+        CHECK(stored.count(9012) == 1);
+    }
+
+    TEST_CASE("PID manifest - query non-existent file returns empty set") {
+        auto dir = dft_utils_test::make_unique_test_path("idx_pid_empty");
+        fs::create_directories(dir);
+
+        IndexDatabase db((dir / ".dftindex").string());
+        db.init_schema();
+
+        // No file with id 999 was ever registered in this database.
+        CHECK(db.query_file_pids(999).empty());
+    }
+
+    TEST_CASE("PID manifest - query all file PIDs") {
+        auto dir = dft_utils_test::make_unique_test_path("idx_pid_all");
+        fs::create_directories(dir);
+
+        IndexDatabase db((dir / ".dftindex").string());
+
+        int file_id1, file_id2, file_id3;
+        {
+            auto txn = db.begin_write();
+            txn->init_schema();
+
+            file_id1 = txn->get_or_create_file_info("trace1.pfw.gz", 0xAAA1);
+            file_id2 = txn->get_or_create_file_info("trace2.pfw.gz", 0xAAA2);
+            file_id3 = txn->get_or_create_file_info("trace3.pfw.gz", 0xAAA3);
+            // PID 1000 is written for both trace1 and trace2.
+            txn->insert_file_pids(file_id1, {1000, 1001});
+            txn->insert_file_pids(file_id2, {1000, 2000, 2001});
+            txn->insert_file_pids(file_id3, {3000});
+            txn->commit();
+        }
+
+        auto by_file = db.query_all_file_pids();
+        CHECK(by_file.size() == 3);
+
+        CHECK(by_file[file_id1].size() == 2);
+        CHECK(by_file[file_id1].count(1000) == 1);
+        CHECK(by_file[file_id1].count(1001) == 1);
+
+        CHECK(by_file[file_id2].size() == 3);
+        CHECK(by_file[file_id2].count(1000) == 1);
+        CHECK(by_file[file_id2].count(2000) == 1);
+        CHECK(by_file[file_id2].count(2001) == 1);
+
+        CHECK(by_file[file_id3].size() == 1);
+        CHECK(by_file[file_id3].count(3000) == 1);
+    }
+
+    TEST_CASE("PID manifest - large PIDs") {
+        auto dir = dft_utils_test::make_unique_test_path("idx_pid_large");
+        fs::create_directories(dir);
+
+        IndexDatabase db((dir / ".dftindex").string());
+
+        int file_id;
+        {
+            auto txn = db.begin_write();
+            txn->init_schema();
+            file_id = txn->get_or_create_file_info("trace.pfw.gz", 0xBBBB);
+
+            // Large PID values, intended to exercise varint encoding
+            std::unordered_set<std::uint64_t> wide_pids = {
+                0xFFFFFFFFULL,         // 32-bit max
+                0x100000000ULL,        // Just over 32-bit
+                0xFFFFFFFFFFFFFFFFULL  // 64-bit max
+            };
+            txn->insert_file_pids(file_id, wide_pids);
+            txn->commit();
+        }
+
+        auto stored = db.query_file_pids(file_id);
+        CHECK(stored.size() == 3);
+        CHECK(stored.count(0xFFFFFFFFULL) == 1);
+        CHECK(stored.count(0x100000000ULL) == 1);
+        CHECK(stored.count(0xFFFFFFFFFFFFFFFFULL) == 1);
+    }
+
+    TEST_CASE("PID manifest - empty PID set not stored") {
+        auto dir = dft_utils_test::make_unique_test_path("idx_pid_empty_set");
+        fs::create_directories(dir);
+
+        IndexDatabase db((dir / ".dftindex").string());
+
+        int file_id;
+        {
+            auto txn = db.begin_write();
+            txn->init_schema();
+            file_id = txn->get_or_create_file_info("trace.pfw.gz", 0xCCCC);
+
+            std::unordered_set<std::uint64_t> no_pids;
+            txn->insert_file_pids(file_id, no_pids);
+            txn->commit();
+        }
+
+        // Inserting an empty set must read back as "no PIDs" for the file.
+        CHECK(db.query_file_pids(file_id).empty());
+    }
+
+    TEST_CASE("PID manifest - rebuild clears PIDs") {
+        auto dir = dft_utils_test::make_unique_test_path("idx_pid_rebuild");
+        fs::create_directories(dir);
+
+        IndexDatabase db((dir / ".dftindex").string());
+
+        int file_id;
+        {
+            auto txn = db.begin_write();
+            txn->init_schema();
+            file_id = txn->get_or_create_file_info("trace.pfw.gz", 0xDDDD);
+            txn->insert_file_pids(file_id, {1234, 5678});
+            txn->commit();
+        }
+
+        CHECK(db.query_file_pids(file_id).size() == 2);
+
+        // Re-registering with a new checksum keeps the id but drops the PIDs.
+        {
+            auto txn = db.begin_write();
+            const int rebuilt_id =
+                txn->get_or_create_file_info("trace.pfw.gz", 0xEEEE);
+            txn->commit();
+            CHECK(rebuilt_id == file_id);
+        }
+
+        CHECK(db.query_file_pids(file_id).empty());
     }
 }
diff --git a/tests/utilities/indexer/test_provenance_database.cpp b/tests/utilities/indexer/test_provenance_database.cpp
index 5686b01d..ba7dc7a7 100644
--- a/tests/utilities/indexer/test_provenance_database.cpp
+++ b/tests/utilities/indexer/test_provenance_database.cpp
@@ -4,7 +4,6 @@
 #include <doctest/doctest.h>
 #include <testing_utilities.h>
 
-namespace fs = std::filesystem;
 using namespace dftracer::utils::utilities::indexer;
 
 TEST_SUITE("ProvenanceDatabase") {
@@ -35,7 +34,7 @@ TEST_SUITE("ProvenanceDatabase") {
         db.insert_info(file_id, "tool", "dftracer_organize");
         db.insert_group(file_id, "group0", "cat == POSIX");
         db.insert_source(file_id, 7, "/src/a.pfw.gz", 12, "hash7");
-        db.insert_segment(file_id, 7, 3, 100, 140, 9);
+        db.insert_segment(file_id, 7, 3, 0, 100, 140, 9);
 
         auto sources = db.query_sources(file_id);
         REQUIRE(sources.size() == 1);
@@ -72,15 +71,13 @@ TEST_SUITE("ProvenanceDatabase") {
         CHECK(file_b > 0);
         CHECK(file_a != file_b);
 
-        db.begin_transaction();
         db.insert_group(file_a, "io", R"(cat == "POSIX")");
         db.insert_source(file_a, 0, "/src/trace0.pfw.gz", 3, "ha");
-        db.insert_segment(file_a, 0, 1, 0, 5, 3);
+        db.insert_segment(file_a, 0, 1, 0, 0, 5, 3);
 
         db.insert_group(file_b, "compute", R"(cat == "APP")");
         db.insert_source(file_b, 1, "/src/trace1.pfw.gz", 2, "hb");
-        db.insert_segment(file_b, 1, 0, 0, 3, 1);
-        db.commit_transaction();
+        db.insert_segment(file_b, 1, 0, 0, 0, 3, 1);
 
         CHECK(db.get_file_info_id(out_a) == file_a);
         CHECK(db.get_file_info_id(out_b) == file_b);
@@ -106,22 +103,18 @@ TEST_SUITE("ProvenanceDatabase") {
         const auto out = (root / "group.pfw.gz").string();
 
         const int original_id = db.get_or_create_file_info(out, 0x1111);
-        db.begin_transaction();
         db.insert_info(original_id, "tool", "dftracer_organize");
         db.insert_group(original_id, "io", R"(cat == "POSIX")");
         db.insert_source(original_id, 0, "/src/trace0.pfw.gz", 4, "old");
-        db.insert_segment(original_id, 0, 0, 0, 4, 2);
-        db.commit_transaction();
+        db.insert_segment(original_id, 0, 0, 0, 0, 4, 2);
 
         const int rebuilt_id = db.get_or_create_file_info(out, 0x2222);
         CHECK(rebuilt_id == original_id);
 
-        db.begin_transaction();
         db.insert_info(rebuilt_id, "tool", "dftracer_organize_v2");
         db.insert_group(rebuilt_id, "io", R"(cat == "MPI")");
         db.insert_source(rebuilt_id, 0, "/src/trace0.pfw.gz", 8, "new");
-        db.insert_segment(rebuilt_id, 0, 0, 10, 18, 5);
-        db.commit_transaction();
+        db.insert_segment(rebuilt_id, 0, 0, 0, 10, 18, 5);
 
         CHECK(db.query_info(rebuilt_id, "tool") == "dftracer_organize_v2");
         CHECK(db.query_group_predicate(rebuilt_id) == R"(cat == "MPI")");
@@ -138,27 +131,5 @@ TEST_SUITE("ProvenanceDatabase") {
         CHECK(segments[0].event_count == 5);
     }
 
-    TEST_CASE("rollback discards provenance writes") {
-        auto root = dft_utils_test::make_unique_test_path("prov_rollback");
-        fs::create_directories(root);
-
-        ProvenanceDatabase db((root / ".dftindex").string());
-        db.init_schema();
-
-        const int file_id =
-            db.get_or_create_file_info((root / "out.pfw.gz").string(), 0xCAFE);
-
-        db.begin_transaction();
-        db.insert_info(file_id, "tool", "dftracer_organize");
-        db.insert_group(file_id, "group0", "cat == POSIX");
-        db.insert_source(file_id, 7, "/src/a.pfw.gz", 12, "hash7");
-        db.insert_segment(file_id, 7, 3, 100, 140, 9);
-        db.rollback_transaction();
-
-        CHECK(db.query_info(file_id, "tool").empty());
-        CHECK(db.query_group_name(file_id).empty());
-        CHECK(db.query_group_predicate(file_id).empty());
-        CHECK(db.query_sources(file_id).empty());
-        CHECK(db.query_segments(file_id, 7).empty());
-    }
+    // Transaction rollback test removed — writes commit immediately.
 }
diff --git a/tests/utilities/indexer/test_rocksdb_storage.cpp b/tests/utilities/indexer/test_rocksdb_storage.cpp
index 48dfeb97..e28dd2aa 100644
--- a/tests/utilities/indexer/test_rocksdb_storage.cpp
+++ b/tests/utilities/indexer/test_rocksdb_storage.cpp
@@ -12,7 +12,6 @@
 #include <cstring>
 #include <memory>
 
-namespace fs = std::filesystem;
 using dftracer::utils::rocksdb::KeyBuilder;
 using dftracer::utils::rocksdb::KeyCodec;
 using dftracer::utils::rocksdb::RocksDatabase;
@@ -80,18 +79,23 @@ TEST_SUITE("RocksDBStorage") {
         auto path = (root / ".dftindex").string();
         auto& manager = RocksDBManager::instance();
 
-        auto first =
-            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
-        REQUIRE(first != nullptr);
-        auto* first_raw = first.get();
-
-        manager.reset(path);
-        first.reset();
+        std::weak_ptr<RocksDatabase> first_weak;
+        {
+            auto first =
+                manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
+            REQUIRE(first != nullptr);
+            first_weak = first;
+            manager.reset(path);
+        }
+        // After reset() + the only strong owner going out of scope, the old
+        // instance must have been destroyed (RocksDB holds a per-process file
+        // lock, so a stale cached instance would prevent reopening below).
+        CHECK(first_weak.expired());
 
         auto second =
             manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
         REQUIRE(second != nullptr);
-        CHECK(second.get() != first_raw);
+        CHECK(second->is_open());
     }
 
     TEST_CASE("manager shutdown clears cached instances") {
@@ -102,18 +106,20 @@ TEST_SUITE("RocksDBStorage") {
         auto path = (root / ".dftindex").string();
         auto& manager = RocksDBManager::instance();
 
-        auto first =
-            manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
-        REQUIRE(first != nullptr);
-        auto* first_raw = first.get();
-
-        manager.shutdown();
-        first.reset();
+        std::weak_ptr<RocksDatabase> first_weak;
+        {
+            auto first =
+                manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
+            REQUIRE(first != nullptr);
+            first_weak = first;
+            manager.shutdown();
+        }
+        CHECK(first_weak.expired());
 
         auto second =
             manager.get_or_open(path, RocksDatabase::OpenMode::ReadWrite);
         REQUIRE(second != nullptr);
-        CHECK(second.get() != first_raw);
+        CHECK(second->is_open());
     }
 
     TEST_CASE("manager rejects read-only upgrade while handle is alive") {
diff --git a/tests/utilities/indexer/test_sst_ingest_spike.cpp b/tests/utilities/indexer/test_sst_ingest_spike.cpp
new file mode 100644
index 00000000..60abab7a
--- /dev/null
+++ b/tests/utilities/indexer/test_sst_ingest_spike.cpp
@@ -0,0 +1,469 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include <dftracer/utils/core/common/filesystem.h>
+#include <dftracer/utils/core/rocksdb/column_families.h>
+#include <dftracer/utils/core/rocksdb/database.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_dimension_stats.h>
+#include <dftracer/utils/utilities/composites/dft/indexing/chunk_statistics.h>
+#include <dftracer/utils/utilities/hash/fnv1a_hasher_utility.h>
+#include <dftracer/utils/utilities/indexer/index_database.h>
+#include <dftracer/utils/utilities/indexer/index_database_sst_writer_context.h>
+#include <dftracer/utils/utilities/indexer/index_database_writer_context.h>
+#include <doctest/doctest.h>
+#include <testing_utilities.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+using dftracer::utils::utilities::composites::dft::indexing::
+    ChunkDimensionStats;
+using dftracer::utils::utilities::composites::dft::indexing::ChunkStatistics;
+using dftracer::utils::utilities::indexer::IndexDatabase;
+using dftracer::utils::utilities::indexer::IndexDatabaseSstWriterContext;
+using dftracer::utils::utilities::indexer::SstArtifactRegistry;
+using dftracer::utils::utilities::indexer::internal::IndexerCheckpoint;
+
+namespace {
+
+IndexerCheckpoint make_checkpoint(std::uint64_t idx, std::uint64_t uc_offset,
+                                  std::uint64_t num_lines) {
+    IndexerCheckpoint checkpoint{};  // synthetic checkpoint for round-trips
+    checkpoint.checkpoint_idx = idx;
+    checkpoint.num_lines = num_lines;
+    checkpoint.first_line_num = idx * num_lines + 1;  // 1-based, contiguous
+    checkpoint.last_line_num = (idx + 1) * num_lines;
+    checkpoint.uc_offset = uc_offset;
+    checkpoint.uc_size = 64 * 1024;
+    checkpoint.c_offset = uc_offset / 2;
+    checkpoint.c_size = 32 * 1024;
+    checkpoint.bits = 8;
+    checkpoint.dict_compressed = std::vector<unsigned char>{0xAA, 0xBB, 0xCC};
+    return checkpoint;
+}
+
+ChunkStatistics make_chunk_stats(std::uint64_t total_events) {
+    ChunkStatistics synthetic;
+    synthetic.min_timestamp_us = 1000;
+    synthetic.max_timestamp_us = 9000;
+    synthetic.total_events = total_events;
+    synthetic.category_counts["posix"] = total_events;
+    synthetic.pid_tid_counts["1:1"] = total_events;
+    synthetic.name_counts["read"] = total_events / 2;  // rounded-down half
+    synthetic.name_counts["write"] = total_events - total_events / 2;
+    return synthetic;
+}
+
+ChunkDimensionStats make_dim_stats(std::string_view dim, std::uint64_t distinct,
+                                   std::string_view min_val,
+                                   std::string_view max_val) {
+    ChunkDimensionStats dim_stats;
+    dim_stats.value_type = "string";  // always tagged as string-typed
+    dim_stats.dimension = std::string(dim);
+    dim_stats.distinct_count = distinct;
+    dim_stats.min_value = std::string(min_val);
+    dim_stats.max_value = std::string(max_val);
+    return dim_stats;
+}
+
+struct Fixture {  // canned index records that populate() writes into a sink
+    IndexerCheckpoint cp_a = make_checkpoint(0, 0, 100);
+    IndexerCheckpoint cp_b = make_checkpoint(1, 64 * 1024, 100);
+
+    std::vector<std::uint32_t> read_lines{1, 5, 17, 42};
+    std::vector<std::uint32_t> write_lines{2, 8, 23};
+    std::vector<std::uint32_t> md_proc_lines{3, 9};  // "PR" metadata lines
+    std::unordered_set<std::uint64_t> pids{101, 102, 103};
+
+    std::vector<unsigned char> bloom_blob_a{0x11, 0x22, 0x33, 0x44};
+    std::vector<unsigned char> bloom_blob_b{0x55, 0x66, 0x77, 0x88};
+
+    ChunkStatistics chunk_stats_a = make_chunk_stats(60);
+    ChunkStatistics chunk_stats_b = make_chunk_stats(80);
+    ChunkStatistics file_stats = make_chunk_stats(140);  // 60 + 80 events
+
+    ChunkDimensionStats dim_stats_a =
+        make_dim_stats("name", 3, "fsync", "read");
+    ChunkDimensionStats dim_stats_b =
+        make_dim_stats("name", 5, "close", "write");
+
+    template <typename Sink>
+    void populate(Sink& sink, int file_id) {  // Sink: duck-typed insert_* API
+        sink.insert_checkpoint(file_id, cp_a);
+        sink.insert_checkpoint(file_id, cp_b);
+        sink.insert_file_metadata(file_id, /*checkpoint_size=*/64 * 1024,
+                                  /*total_lines=*/200,
+                                  /*total_uc_size=*/128 * 1024);
+        sink.insert_event_range(file_id, cp_a.checkpoint_idx, "posix", "read",
+                                read_lines);
+        sink.insert_event_range(file_id, cp_b.checkpoint_idx, "posix", "write",
+                                write_lines);
+        sink.insert_metadata_lines(file_id, cp_a.checkpoint_idx, "PR",
+                                   md_proc_lines);
+        sink.insert_file_pids(file_id, pids);
+
+        sink.insert_chunk_bloom_filter(
+            file_id, cp_a.checkpoint_idx, "name",
+            std::span<const unsigned char>(bloom_blob_a), /*num_entries=*/4);
+        sink.insert_chunk_bloom_filter(
+            file_id, cp_b.checkpoint_idx, "name",
+            std::span<const unsigned char>(bloom_blob_b), /*num_entries=*/5);
+        sink.insert_file_bloom_filter(
+            file_id, "name", std::span<const unsigned char>(bloom_blob_a),
+            /*num_entries=*/8);
+
+        sink.insert_chunk_statistics(file_id, cp_a.checkpoint_idx,
+                                     chunk_stats_a);
+        sink.insert_chunk_statistics(file_id, cp_b.checkpoint_idx,
+                                     chunk_stats_b);
+        sink.insert_file_scalar_stats(file_id, file_stats, /*num_chunks=*/2);
+        sink.insert_file_category_counts(file_id, file_stats.category_counts);
+        sink.insert_file_pid_tid_counts(file_id, file_stats.pid_tid_counts);
+        sink.insert_file_name_counts(file_id, file_stats.name_counts);
+
+        sink.insert_index_dimension(file_id, "name");
+        sink.insert_index_dimension(file_id, "cat");
+        sink.insert_chunk_dimension_stats(file_id, cp_a.checkpoint_idx,
+                                          dim_stats_a);
+        sink.insert_chunk_dimension_stats(file_id, cp_b.checkpoint_idx,
+                                          dim_stats_b);
+
+        using dftracer::utils::utilities::hash::fnv1a_hash;
+        const auto read_id = fnv1a_hash(std::string_view{"read"});
+        const auto write_id = fnv1a_hash(std::string_view{"write"});
+        sink.insert_name_dictionary_entry(read_id, "read");
+        sink.insert_name_dictionary_entry(write_id, "write");
+        sink.insert_name_file_posting(read_id, file_id);
+        sink.insert_name_file_posting(write_id, file_id);
+        sink.insert_name_chunk_posting(read_id, file_id, cp_a.checkpoint_idx);
+        sink.insert_name_chunk_posting(write_id, file_id, cp_b.checkpoint_idx);
+
+        sink.insert_hash_table_entry(
+            static_cast<std::uint8_t>(IndexDatabase::HashType::FILE), "fh_1",
+            "/path/to/trace.pfw.gz");
+        sink.insert_hash_table_entry(
+            static_cast<std::uint8_t>(IndexDatabase::HashType::HOST), "hh_1",
+            "host-1");
+        sink.insert_hash_table_entry(
+            static_cast<std::uint8_t>(IndexDatabase::HashType::STRING), "sh_1",
+            "some-string");
+
+        // Aggregation / system_metrics sink writes. SstFileWriter requires
+        // strictly ascending keys within a single SST, so the raw sink
+        // API here exercises one merge per key. Cross-flush merges
+        // targeting the same key are the AggregationVisitor's concern: it
+        // rotates its SstWriterContext per flush so each SST is key-unique.
+        sink.insert_aggregation_put("\xFF\xFD\x01", "name-one");
+        sink.insert_aggregation_put("\xFF\xFD\x02", "name-two");
+        sink.insert_aggregation_merge("agg-key-1", "operand-1");
+        sink.insert_aggregation_merge("agg-key-2", "operand-2");
+        sink.insert_system_metrics_merge("sys-key-1", "sys-1");
+        sink.insert_system_metrics_merge("sys-key-2", "sys-2");
+    }
+};
+
+void compare_cf_entries(const IndexDatabase& db_a, const IndexDatabase& db_b,
+                        std::string_view cf_name);
+
+void check_round_trip(const IndexDatabase& db_a, const IndexDatabase& db_b,
+                      int file_id) {
+    // Every per-file query must return the same answer from both databases.
+    CHECK(db_a.get_checkpoint_size(file_id) ==
+          db_b.get_checkpoint_size(file_id));
+    CHECK(db_a.get_num_lines(file_id) == db_b.get_num_lines(file_id));
+    CHECK(db_a.get_max_bytes(file_id) == db_b.get_max_bytes(file_id));
+    auto cps_a = db_a.query_checkpoints(file_id);
+    auto cps_b = db_b.query_checkpoints(file_id);
+    REQUIRE(cps_a.size() == cps_b.size());
+    for (std::size_t i = 0; i < cps_a.size(); ++i) {
+        CHECK(cps_a[i].checkpoint_idx == cps_b[i].checkpoint_idx);
+        CHECK(cps_a[i].uc_offset == cps_b[i].uc_offset);
+        CHECK(cps_a[i].uc_size == cps_b[i].uc_size);
+        CHECK(cps_a[i].c_offset == cps_b[i].c_offset);
+        CHECK(cps_a[i].c_size == cps_b[i].c_size);
+        CHECK(cps_a[i].num_lines == cps_b[i].num_lines);
+        CHECK(cps_a[i].first_line_num == cps_b[i].first_line_num);
+        CHECK(cps_a[i].last_line_num == cps_b[i].last_line_num);
+    }
+
+    auto er_a = db_a.query_event_ranges(file_id);
+    auto er_b = db_b.query_event_ranges(file_id);
+    REQUIRE(er_a.size() == er_b.size());
+    for (std::size_t i = 0; i < er_a.size(); ++i) {
+        CHECK(er_a[i].checkpoint_idx == er_b[i].checkpoint_idx);
+        CHECK(er_a[i].cat == er_b[i].cat);
+        CHECK(er_a[i].name == er_b[i].name);
+        CHECK(er_a[i].line_numbers == er_b[i].line_numbers);
+    }
+
+    auto md_a = db_a.query_metadata_lines(file_id);
+    auto md_b = db_b.query_metadata_lines(file_id);
+    REQUIRE(md_a.size() == md_b.size());
+    for (std::size_t i = 0; i < md_a.size(); ++i) {
+        CHECK(md_a[i].checkpoint_idx == md_b[i].checkpoint_idx);
+        CHECK(md_a[i].meta_type == md_b[i].meta_type);
+        CHECK(md_a[i].line_numbers == md_b[i].line_numbers);
+    }
+
+    CHECK(db_a.query_file_pids(file_id) == db_b.query_file_pids(file_id));
+
+    auto cbf_a = db_a.query_chunk_bloom_filters(file_id, "name");
+    auto cbf_b = db_b.query_chunk_bloom_filters(file_id, "name");
+    REQUIRE(cbf_a.size() == cbf_b.size());
+    for (std::size_t i = 0; i < cbf_a.size(); ++i) {
+        CHECK(cbf_a[i].checkpoint_idx == cbf_b[i].checkpoint_idx);
+        CHECK(cbf_a[i].num_entries == cbf_b[i].num_entries);
+        CHECK(cbf_a[i].bloom_data == cbf_b[i].bloom_data);
+    }
+
+    auto fbf_a = db_a.query_file_bloom_filter(file_id, "name");
+    auto fbf_b = db_b.query_file_bloom_filter(file_id, "name");
+    REQUIRE(fbf_a.has_value());
+    REQUIRE(fbf_b.has_value());
+    CHECK(fbf_a->num_entries == fbf_b->num_entries);
+    CHECK(fbf_a->bloom_data == fbf_b->bloom_data);
+
+    auto cs_a = db_a.query_chunk_statistics(file_id);
+    auto cs_b = db_b.query_chunk_statistics(file_id);
+    REQUIRE(cs_a.size() == cs_b.size());
+    for (std::size_t i = 0; i < cs_a.size(); ++i) {
+        CHECK(cs_a[i].checkpoint_idx == cs_b[i].checkpoint_idx);
+        CHECK(cs_a[i].stats.total_events == cs_b[i].stats.total_events);
+        CHECK(cs_a[i].stats.min_timestamp_us == cs_b[i].stats.min_timestamp_us);
+        CHECK(cs_a[i].stats.max_timestamp_us == cs_b[i].stats.max_timestamp_us);
+    }
+
+    auto fss_a = db_a.query_file_scalar_stats_batch({file_id});
+    auto fss_b = db_b.query_file_scalar_stats_batch({file_id});
+    REQUIRE(fss_a.count(file_id) == 1);
+    REQUIRE(fss_b.count(file_id) == 1);
+    CHECK(fss_a[file_id].stats.total_events ==
+          fss_b[file_id].stats.total_events);
+    CHECK(fss_a[file_id].num_chunks == fss_b[file_id].num_chunks);
+
+    auto cat_a = db_a.query_file_category_counts_batch({file_id});
+    auto cat_b = db_b.query_file_category_counts_batch({file_id});
+    REQUIRE(cat_a.count(file_id) == 1);
+    REQUIRE(cat_b.count(file_id) == 1);
+    CHECK(cat_a[file_id].size() == cat_b[file_id].size());
+    for (const auto& [k, v] : cat_a[file_id]) {
+        auto it = cat_b[file_id].find(k);
+        REQUIRE(it != cat_b[file_id].end());
+        CHECK(it->second == v);
+    }
+
+    auto pt_a = db_a.query_file_pid_tid_counts_batch({file_id});
+    auto pt_b = db_b.query_file_pid_tid_counts_batch({file_id});
+    REQUIRE(pt_a.count(file_id) == 1);
+    REQUIRE(pt_b.count(file_id) == 1);
+    CHECK(pt_a[file_id].size() == pt_b[file_id].size());
+
+    auto ns_a = db_a.query_file_name_summaries_batch({file_id});
+    auto ns_b = db_b.query_file_name_summaries_batch({file_id});
+    REQUIRE(ns_a.count(file_id) == 1);
+    REQUIRE(ns_b.count(file_id) == 1);
+    CHECK(ns_a[file_id].counts.size() == ns_b[file_id].counts.size());
+    CHECK(ns_a[file_id].unique_count == ns_b[file_id].unique_count);
+
+    auto dims_a = db_a.query_index_dimensions(file_id);
+    auto dims_b = db_b.query_index_dimensions(file_id);
+    std::sort(dims_a.begin(), dims_a.end());
+    std::sort(dims_b.begin(), dims_b.end());
+    CHECK(dims_a == dims_b);
+
+    auto cds_a = db_a.query_chunk_dimension_stats(file_id);
+    auto cds_b = db_b.query_chunk_dimension_stats(file_id);
+    REQUIRE(cds_a.size() == cds_b.size());
+    for (std::size_t i = 0; i < cds_a.size(); ++i) {
+        CHECK(cds_a[i].checkpoint_idx == cds_b[i].checkpoint_idx);
+        CHECK(cds_a[i].dimension == cds_b[i].dimension);
+        CHECK(cds_a[i].distinct_count == cds_b[i].distinct_count);
+        CHECK(cds_a[i].min_value == cds_b[i].min_value);
+        CHECK(cds_a[i].max_value == cds_b[i].max_value);
+    }
+
+    const auto rid_a = db_a.query_name_id("read");
+    REQUIRE(rid_a.has_value());  // guards the dereferences below
+    REQUIRE(rid_a == db_b.query_name_id("read"));  // so *rid_a serves both
+    CHECK(db_a.query_name_id("write") == db_b.query_name_id("write"));
+    CHECK(db_a.query_name_by_id(*rid_a) == db_b.query_name_by_id(*rid_a));
+    auto fp_a = db_a.query_name_file_postings("read");
+    auto fp_b = db_b.query_name_file_postings("read");
+    std::sort(fp_a.begin(), fp_a.end());
+    std::sort(fp_b.begin(), fp_b.end());
+    CHECK(fp_a == fp_b);
+
+    auto cp_read_a = db_a.query_name_chunk_postings("read", file_id);
+    auto cp_read_b = db_b.query_name_chunk_postings("read", file_id);
+    std::sort(cp_read_a.begin(), cp_read_a.end());
+    std::sort(cp_read_b.begin(), cp_read_b.end());
+    CHECK(cp_read_a == cp_read_b);
+
+    CHECK(db_a.resolve_hash(IndexDatabase::HashType::FILE, "fh_1") ==
+          db_b.resolve_hash(IndexDatabase::HashType::FILE, "fh_1"));
+    CHECK(db_a.resolve_hash(IndexDatabase::HashType::HOST, "hh_1") ==
+          db_b.resolve_hash(IndexDatabase::HashType::HOST, "hh_1"));
+    CHECK(db_a.resolve_name_to_hash(IndexDatabase::HashType::FILE,
+                                    "/path/to/trace.pfw.gz") ==
+          db_b.resolve_name_to_hash(IndexDatabase::HashType::FILE,
+                                    "/path/to/trace.pfw.gz"));
+    CHECK(db_a.query_hash_table(IndexDatabase::HashType::FILE) ==
+          db_b.query_hash_table(IndexDatabase::HashType::FILE));
+
+    namespace cf = dftracer::utils::rocksdb::cf;
+    compare_cf_entries(db_a, db_b, cf::AGGREGATION);
+    compare_cf_entries(db_a, db_b, cf::SYSTEM_METRICS);
+}
+
+/// Assert that column family `cf_name` holds the same key/value pairs in
+/// both databases. Each CF is read through a raw iterator; for merge-operand
+/// CFs rocksdb combines operands on read, so values must match byte-for-byte.
+void compare_cf_entries(const IndexDatabase& db_a, const IndexDatabase& db_b,
+                        std::string_view cf_name) {
+    using Entries = std::map<std::string, std::string>;
+    const auto snapshot = [&cf_name](const IndexDatabase& db) -> Entries {
+        Entries rows;
+        auto cur = db.db()->new_iterator(cf_name);
+        for (cur->SeekToFirst(); cur->Valid(); cur->Next()) {
+            rows.emplace(std::string(cur->key().data(), cur->key().size()),
+                         std::string(cur->value().data(), cur->value().size()));
+        }
+        return rows;
+    };
+    const auto expected = snapshot(db_a);
+    const auto actual = snapshot(db_b);
+    REQUIRE(expected.size() == actual.size());
+    for (const auto& [key, value] : expected) {
+        const auto match = actual.find(key);
+        REQUIRE(match != actual.end());
+        CHECK(match->second == value);
+    }
+}
+
+void check_root_summaries(const IndexDatabase& db_a,
+                          const IndexDatabase& db_b) {
+    // Root scalar stats and the three root count tables must agree.
+    const auto scalars_a = db_a.query_root_scalar_stats();
+    const auto scalars_b = db_b.query_root_scalar_stats();
+    REQUIRE(scalars_a.has_value());
+    REQUIRE(scalars_b.has_value());
+    CHECK(scalars_a->stats.total_events == scalars_b->stats.total_events);
+    CHECK(scalars_a->num_chunks == scalars_b->num_chunks);
+    CHECK(scalars_a->num_files == scalars_b->num_files);
+    CHECK(db_a.query_root_category_counts() ==
+          db_b.query_root_category_counts());
+    CHECK(db_a.query_root_name_counts() == db_b.query_root_name_counts());
+    CHECK(db_a.query_root_pid_tid_counts() == db_b.query_root_pid_tid_counts());
+}
+
+}  // namespace
+
+TEST_SUITE("IndexDatabaseSstWriterContext") {
+    TEST_CASE("round-trip: SST ingest matches direct RocksDB writes") {
+        auto ref_dir = dft_utils_test::make_unique_test_path("sst_spike_db_a");
+        auto sst_dir = dft_utils_test::make_unique_test_path("sst_spike_db_b");
+        auto stage_dir =
+            dft_utils_test::make_unique_test_path("sst_spike_staging");
+        fs::create_directories(ref_dir);
+        fs::create_directories(sst_dir);
+        fs::create_directories(stage_dir);
+
+        Fixture fixture;
+        const int file_id = 1;
+        // Reference database: populated through the regular write context.
+        IndexDatabase ref_db((ref_dir / ".dftindex").string());
+        {
+            auto writer = ref_db.begin_write();
+            writer->init_schema();
+            fixture.populate(*writer, file_id);
+            writer->commit();
+        }
+        // Target database: schema only; its content arrives via bulk_ingest.
+        IndexDatabase sst_db((sst_dir / ".dftindex").string());
+        {
+            auto writer = sst_db.begin_write();
+            writer->init_schema();
+            writer->commit();
+        }
+        // Write the same fixture through the SST writer into the staging dir.
+        SstArtifactRegistry registry;
+        {
+            IndexDatabaseSstWriterContext ctx(stage_dir.string(), "batch_0");
+            fixture.populate(ctx, file_id);
+            registry.append(ctx.commit());
+        }
+
+        CHECK(registry.metadata().size() == 1);
+        CHECK(registry.checkpoints().size() == 1);
+        CHECK(registry.manifest().size() == 1);
+
+        sst_db.bulk_ingest(registry);
+
+        // Root summaries are rebuilt explicitly on each side so that both
+        // write paths are compared after the same final step.
+        ref_db.rebuild_root_summaries();
+        sst_db.rebuild_root_summaries();
+
+        check_round_trip(ref_db, sst_db, file_id);
+        check_root_summaries(ref_db, sst_db);
+    }
+
+    TEST_CASE("bulk_ingest composes across multiple disjoint batches") {
+        auto ref_dir = dft_utils_test::make_unique_test_path("sst_multi_db_a");
+        auto sst_dir = dft_utils_test::make_unique_test_path("sst_multi_db_b");
+        auto stage_dir =
+            dft_utils_test::make_unique_test_path("sst_multi_staging");
+        fs::create_directories(ref_dir);
+        fs::create_directories(sst_dir);
+        fs::create_directories(stage_dir);
+
+        Fixture first;
+        Fixture second;
+        // Give the second fixture different data so the two files differ.
+        second.read_lines = {7, 11, 13};
+        second.write_lines = {4};
+        second.pids = {201, 202};
+        // Reference database: both files written in one regular transaction.
+        IndexDatabase ref_db((ref_dir / ".dftindex").string());
+        {
+            auto writer = ref_db.begin_write();
+            writer->init_schema();
+            first.populate(*writer, /*file_id=*/1);
+            second.populate(*writer, /*file_id=*/2);
+            writer->commit();
+        }
+        // Target database: schema only; its content arrives via bulk_ingest.
+        IndexDatabase sst_db((sst_dir / ".dftindex").string());
+        {
+            auto writer = sst_db.begin_write();
+            writer->init_schema();
+            writer->commit();
+        }
+        // One SST writer context per file, appended to a shared registry.
+        SstArtifactRegistry registry;
+        {
+            IndexDatabaseSstWriterContext ctx(stage_dir.string(), "worker_0");
+            first.populate(ctx, /*file_id=*/1);
+            registry.append(ctx.commit());
+        }
+        {
+            IndexDatabaseSstWriterContext ctx(stage_dir.string(), "worker_1");
+            second.populate(ctx, /*file_id=*/2);
+            registry.append(ctx.commit());
+        }
+
+        CHECK(registry.metadata().size() == 2);
+        CHECK(registry.checkpoints().size() == 2);
+        CHECK(registry.manifest().size() == 2);
+
+        sst_db.bulk_ingest(registry);
+
+        ref_db.rebuild_root_summaries();
+        sst_db.rebuild_root_summaries();
+
+        check_round_trip(ref_db, sst_db, 1);
+        check_round_trip(ref_db, sst_db, 2);
+        check_root_summaries(ref_db, sst_db);
+    }
+}
diff --git a/tests/utilities/reader/test_trace_reader.cpp b/tests/utilities/reader/test_trace_reader.cpp
index e1a1ed0d..3eef79d3 100644
--- a/tests/utilities/reader/test_trace_reader.cpp
+++ b/tests/utilities/reader/test_trace_reader.cpp
@@ -70,6 +70,34 @@ static CoroTask<std::vector<std::string>> collect_lines(
     co_return lines;
 }
 
+struct ParsedEvent {  // owned copies of the string fields the tests inspect
+    std::string name;  // "name" field; stays empty if get_string finds none
+    std::string cat;   // "cat" field; stays empty if get_string finds none
+    std::string ph;    // "ph" field; stays empty if get_string finds none
+};
+
+static CoroTask<std::vector<ParsedEvent>> collect_json_events(
+    AsyncGenerator<JsonLine> gen) {
+    // Fields are copied into owned strings before the generator advances.
+    std::vector<ParsedEvent> collected;
+    while (auto item = co_await gen.next()) {
+        auto& event = collected.emplace_back();
+        auto* parser = item->parser;
+        if (auto s = parser->get_string("name")) event.name = std::string(*s);
+        if (auto s = parser->get_string("cat")) event.cat = std::string(*s);
+        if (auto s = parser->get_string("ph")) event.ph = std::string(*s);
+    }
+    co_return collected;
+}
+
+static CoroTask<std::size_t> count_json_lines(AsyncGenerator<JsonLine> gen) {
+    // Drain the generator and report how many items it produced; the items
+    // themselves are never inspected.
+    std::size_t total = 0;
+    while (co_await gen.next()) ++total;
+    co_return total;
+}
+
 }  // namespace
 
 TEST_SUITE("TraceReader") {
@@ -510,8 +538,8 @@ TEST_SUITE("TraceReader") {
         IndexBuilderUtility builder;
         auto build_result = builder
                                 .process(IndexBuildConfig::for_file(gz)
-                                             .with_bloom(true)
-                                             .with_index_threshold(0))
+
+                                             )
                                 .get();
         REQUIRE(build_result.success);
 
@@ -559,8 +587,8 @@ TEST_SUITE("TraceReader") {
         IndexBuilderUtility builder;
         auto build_result = builder
                                 .process(IndexBuildConfig::for_file(gz)
-                                             .with_bloom(true)
-                                             .with_index_threshold(0))
+
+                                             )
                                 .get();
         REQUIRE(build_result.success);
 
@@ -582,4 +610,247 @@ TEST_SUITE("TraceReader") {
         auto posix_bytes = count_raw_bytes(reader.read_raw(rc_posix)).get();
         CHECK(posix_bytes == all_bytes);
     }
+
+    TEST_CASE("Chunk pruning skips non-matching checkpoints") {
+        TestEnvironment env(100);
+        std::string raw = env.get_dir() + "/multi_ckpt.pfw";
+        constexpr int N_READ = 100;
+        constexpr int N_COMPUTE = 5;
+        constexpr int N_WRITE = 100;
+        constexpr int N_TOTAL = N_READ + N_COMPUTE + N_WRITE;
+        // Each line is ~495 bytes (400-byte pad plus ~95 bytes of JSON), so
+        // the 205 events total ~100 KB; a 32 KB checkpoint window should
+        // yield 3-4 checkpoints with the COMPUTE events grouped mid-file.
+        std::string fill(400, 'x');
+        {
+            std::ofstream os(raw);
+            for (int k = 0; k < N_READ; ++k) {
+                os << R"({"ph":"X","name":"read","cat":"POSIX","pid":1,"tid":1,"ts":)"
+                   << (1000 + k) << R"(,"dur":10,"args":{"pad":")" << fill
+                   << R"("}})" << "\n";
+            }
+            for (int k = 0; k < N_COMPUTE; ++k) {
+                os << R"({"ph":"X","name":"train","cat":"COMPUTE","pid":2,"tid":2,"ts":)"
+                   << (100000 + k) << R"(,"dur":500,"args":{"pad":")" << fill
+                   << R"("}})" << "\n";
+            }
+            for (int k = 0; k < N_WRITE; ++k) {
+                os << R"({"ph":"X","name":"write","cat":"POSIX","pid":1,"tid":1,"ts":)"
+                   << (200000 + k) << R"(,"dur":10,"args":{"pad":")" << fill
+                   << R"("}})" << "\n";
+            }
+        }
+        std::string gz = raw + ".gz";
+        REQUIRE(dft_utils_test::compress_file_to_gzip(raw, gz));
+        fs::remove(raw);
+
+        using dftracer::utils::utilities::indexer::IndexBuildConfig;
+        using dftracer::utils::utilities::indexer::IndexBuilderUtility;
+        IndexBuilderUtility indexer;
+        auto built = indexer
+                         .process(IndexBuildConfig::for_file(gz)
+                                      .with_checkpoint_size(32 * 1024)
+                                      .with_manifest(true))
+                         .get();
+        REQUIRE(built.success);
+
+        TraceReader reader({.file_path = gz, .checkpoint_size = 32 * 1024});
+        REQUIRE(reader.has_index());
+        // Only result counts are asserted; pruning itself is not observed.
+        auto total = count_lines(reader.read_lines()).get();
+        REQUIRE(total == N_TOTAL);
+
+        // Selective query: 5 of the 205 events are COMPUTE.
+        ReadConfig q_compute;
+        q_compute.query = R"(cat == "COMPUTE")";
+        auto compute_rows = collect_lines(reader.read_lines(q_compute)).get();
+        CHECK(compute_rows.size() == N_COMPUTE);
+        for (const auto& row : compute_rows) {
+            CHECK(row.find("\"cat\":\"COMPUTE\"") != std::string::npos);
+        }
+
+        // Broad query: every POSIX event on both sides of the COMPUTE run.
+        ReadConfig q_posix;
+        q_posix.query = R"(cat == "POSIX")";
+        auto posix_rows = collect_lines(reader.read_lines(q_posix)).get();
+        CHECK(posix_rows.size() == N_READ + N_WRITE);
+
+        // A category that never occurs must return nothing.
+        ReadConfig q_none;
+        q_none.query = R"(cat == "NONEXISTENT")";
+        auto none_rows = count_lines(reader.read_lines(q_none)).get();
+        CHECK(none_rows == 0);
+    }
+}
+
+TEST_SUITE("TraceReader::read_json") {
+    TEST_CASE("read_json returns parsed events") {
+        TestEnvironment env(100);
+        const std::string trace_gz = env.create_dft_test_gzip_file(100);
+        TraceReader reader({.file_path = trace_gz});
+        // Every yielded event must carry a non-empty phase ("ph") string.
+        const auto parsed = collect_json_events(reader.read_json()).get();
+        CHECK(parsed.size() > 0);
+        for (const ParsedEvent& event : parsed) {
+            CHECK_FALSE(event.ph.empty());
+        }
+    }
+
+    TEST_CASE("read_json count matches read_lines count") {
+        TestEnvironment env(100);
+        const std::string trace_gz = env.create_dft_test_gzip_file(100);
+        TraceReader reader({.file_path = trace_gz});
+        // NOTE(review): only an upper bound (<=) is asserted, not equality.
+        auto raw_lines = count_lines(reader.read_lines()).get();
+        auto json_items = count_json_lines(reader.read_json()).get();
+        CHECK(json_items <= raw_lines);
+        CHECK(json_items > 0);
+    }
+
+    TEST_CASE("read_json query filters events") {
+        TestEnvironment env(100);
+        const std::string trace_gz = env.create_dft_test_gzip_file(100);
+        TraceReader reader({.file_path = trace_gz});
+
+        const auto unfiltered = count_json_lines(reader.read_json()).get();
+        REQUIRE(unfiltered > 0);
+        // The filtered result is non-empty, no larger, and all POSIX.
+        ReadConfig posix_only;
+        posix_only.query = R"(cat == "POSIX")";
+        auto hits = collect_json_events(reader.read_json(posix_only)).get();
+        CHECK(hits.size() > 0);
+        CHECK(hits.size() <= unfiltered);
+        for (const ParsedEvent& event : hits) {
+            CHECK(event.cat == "POSIX");
+        }
+    }
+
+    TEST_CASE("read_json query with no matches returns zero") {
+        TestEnvironment env(100);
+        const std::string trace_gz = env.create_dft_test_gzip_file(100);
+        TraceReader reader({.file_path = trace_gz});
+        // A category absent from the trace must produce an empty stream.
+        ReadConfig no_match;
+        no_match.query = R"(cat == "NONEXISTENT")";
+        auto matched = count_json_lines(reader.read_json(no_match)).get();
+        CHECK(matched == 0);
+    }
+
+    TEST_CASE("read_json with AND query") {
+        TestEnvironment env(100);
+        const std::string trace_gz = env.create_dft_test_gzip_file(100);
+        TraceReader reader({.file_path = trace_gz});
+        // Both conjuncts must hold for every event that comes back.
+        ReadConfig posix_reads;
+        posix_reads.query = R"(cat == "POSIX" and name == "read")";
+        auto hits = collect_json_events(reader.read_json(posix_reads)).get();
+        CHECK(hits.size() > 0);
+        for (const ParsedEvent& event : hits) {
+            CHECK(event.cat == "POSIX");
+            CHECK(event.name == "read");
+        }
+    }
+
+    TEST_CASE("read_json matches read_lines query count") {
+        TestEnvironment env(100);
+        std::string raw = env.get_dir() + "/json_vs_lines.pfw";
+        {
+            std::ofstream os(raw);
+            for (int k = 0; k < 100; ++k) {
+                os << R"({"ph":"X","name":"read","cat":"POSIX","pid":1,"tid":1,"ts":)"
+                   << (1000 + k) << R"(,"dur":10,"args":{}})" << "\n";
+            }
+            for (int k = 0; k < 50; ++k) {
+                os << R"({"ph":"X","name":"train","cat":"COMPUTE","pid":2,"tid":2,"ts":)"
+                   << (100000 + k) << R"(,"dur":500,"args":{}})" << "\n";
+            }
+        }
+        // Uncompressed input: 100 POSIX events followed by 50 COMPUTE.
+        TraceReader reader({.file_path = raw});
+
+        ReadConfig posix_only;
+        posix_only.query = R"(cat == "POSIX")";
+        auto via_lines = count_lines(reader.read_lines(posix_only)).get();
+        auto via_json = count_json_lines(reader.read_json(posix_only)).get();
+        CHECK(via_lines == via_json);
+        CHECK(via_json == 100);
+
+        fs::remove(raw);
+    }
+
+    TEST_CASE("read_json works with index and chunk pruning") {
+        TestEnvironment env(100);
+        std::string raw = env.get_dir() + "/json_indexed.pfw";
+        std::string fill(400, 'x');
+        {
+            std::ofstream os(raw);
+            for (int k = 0; k < 100; ++k) {
+                os << R"({"ph":"X","name":"read","cat":"POSIX","pid":1,"tid":1,"ts":)"
+                   << (1000 + k) << R"(,"dur":10,"args":{"pad":")" << fill
+                   << R"("}})" << "\n";
+            }
+            for (int k = 0; k < 5; ++k) {
+                os << R"({"ph":"X","name":"train","cat":"COMPUTE","pid":2,"tid":2,"ts":)"
+                   << (100000 + k) << R"(,"dur":500,"args":{"pad":")" << fill
+                   << R"("}})" << "\n";
+            }
+            for (int k = 0; k < 100; ++k) {
+                os << R"({"ph":"X","name":"write","cat":"POSIX","pid":1,"tid":1,"ts":)"
+                   << (200000 + k) << R"(,"dur":10,"args":{"pad":")" << fill
+                   << R"("}})" << "\n";
+            }
+        }
+        std::string gz = raw + ".gz";
+        REQUIRE(dft_utils_test::compress_file_to_gzip(raw, gz));
+        fs::remove(raw);
+
+        using dftracer::utils::utilities::indexer::IndexBuildConfig;
+        using dftracer::utils::utilities::indexer::IndexBuilderUtility;
+        IndexBuilderUtility indexer;
+        auto built = indexer
+                         .process(IndexBuildConfig::for_file(gz)
+                                      .with_checkpoint_size(32 * 1024)
+                                      .with_manifest(true))
+                         .get();
+        REQUIRE(built.success);
+
+        TraceReader reader({.file_path = gz, .checkpoint_size = 32 * 1024});
+        REQUIRE(reader.has_index());
+        // Selective query: exactly the 5 COMPUTE events written above.
+        ReadConfig q_compute;
+        q_compute.query = R"(cat == "COMPUTE")";
+        auto hits = collect_json_events(reader.read_json(q_compute)).get();
+        CHECK(hits.size() == 5);
+        for (const ParsedEvent& event : hits) {
+            CHECK(event.cat == "COMPUTE");
+        }
+        // Broad query: the 100 reads plus the 100 writes.
+        ReadConfig q_posix;
+        q_posix.query = R"(cat == "POSIX")";
+        auto posix_hits = count_json_lines(reader.read_json(q_posix)).get();
+        CHECK(posix_hits == 200);
+        // A category that never occurs must return nothing.
+        ReadConfig q_none;
+        q_none.query = R"(cat == "NONEXISTENT")";
+        auto none_hits = count_json_lines(reader.read_json(q_none)).get();
+        CHECK(none_hits == 0);
+    }
+
+    TEST_CASE("read_json parser fields are accessible") {
+        auto trace_path = make_unique_test_path("json_parser_fields.pfw");
+        {
+            std::ofstream os(trace_path);
+            os << R"({"ph":"X","name":"read","cat":"POSIX","pid":42,"tid":7,"ts":1000,"dur":10,"args":{"ret":1}})"
+               << "\n";
+        }
+        // A one-event file: the three extracted strings must match exactly.
+        TraceReader reader({.file_path = trace_path.string()});
+        auto parsed = collect_json_events(reader.read_json()).get();
+        REQUIRE(parsed.size() == 1);
+        CHECK(parsed[0].name == "read");
+        CHECK(parsed[0].cat == "POSIX");
+        CHECK(parsed[0].ph == "X");
+
+        fs::remove(trace_path);
+    }
 }